-
Notifications
You must be signed in to change notification settings - Fork 232
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Read pdf image via tesseract #19
Changes from all commits
8729c97
5bc56ae
8c05c20
c4246a8
80f9689
fec59c2
5b66790
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,12 +3,110 @@ package docconv | |
import ( | ||
"fmt" | ||
"io" | ||
"io/ioutil" | ||
"log" | ||
"os" | ||
"os/exec" | ||
"path/filepath" | ||
"strings" | ||
"sync" | ||
"time" | ||
) | ||
|
||
// exts lists the file extensions that are matched (via compareExt) against
// the files found in the pdfimages output directory.
var exts = []string{".jpg", ".tif", ".tiff", ".png"}
|
||
// pdfMutex is a package-level mutex. It is embedded in an anonymous struct,
// so Lock and Unlock are promoted and can be called directly on pdfMutex.
var pdfMutex struct{ sync.Mutex }
|
||
// compareExt reports whether ext is exactly equal (case-sensitive) to one of
// the entries in exts. It returns false when exts is empty or nil.
func compareExt(ext string, exts []string) bool {
	for i := range exts {
		if exts[i] != ext {
			continue
		}
		return true
	}
	return false
}
|
||
func PDFImages(path string) (string, map[string]string, error) { | ||
tmp, err := ioutil.TempDir("/tmp", "tmp-imgs-") | ||
if err != nil { | ||
log.Println(err) | ||
return "", nil, err | ||
} | ||
tmpDir := fmt.Sprintf("%s/", tmp) | ||
defer os.RemoveAll(tmpDir) | ||
|
||
_, err = exec.Command("pdfimages", "-j", path, tmpDir).Output() | ||
if err != nil { | ||
return "", nil, err | ||
} | ||
|
||
files := []string{} | ||
|
||
walkFunc := func(path string, info os.FileInfo, err error) error { | ||
path, err = filepath.Abs(path) | ||
if err != nil { | ||
return err | ||
} | ||
|
||
if compareExt(filepath.Ext(path), exts) == true { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You don't need the |
||
files = append(files, path) | ||
} | ||
return nil | ||
} | ||
filepath.Walk(tmpDir, walkFunc) | ||
|
||
var wg sync.WaitGroup | ||
m := make(map[int]string) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Better to create an anonymous type here with the map and mutex inside, rather than having a global |
||
|
||
wg.Add(len(files)) | ||
for indx, p := range files { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
go func(idx int, pathFile string, m map[int]string, ww *sync.WaitGroup) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
defer ww.Done() | ||
f, err := os.Open(pathFile) | ||
if err != nil { | ||
log.Println(err) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do something about the error instead of logging it? |
||
} | ||
out, _, err := ConvertImage(f) | ||
if err != nil { | ||
log.Println(err) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Probably want to handle this one too. |
||
} | ||
|
||
pdfMutex.Lock() | ||
m[idx] = out | ||
pdfMutex.Unlock() | ||
|
||
f.Close() | ||
}(indx, p, m, &wg) | ||
} | ||
wg.Wait() | ||
|
||
o := make([]string, len(m)) | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Remove this space. |
||
for i := 0; i < len(m); i++ { | ||
o = append(o, m[i]) | ||
} | ||
|
||
return strings.Join(o, " "), nil, nil | ||
} | ||
|
||
// PDFHasImage reports whether the PDF at path should be treated as an
// image-only document. It runs `pdffonts -l 5` on the file and returns true
// when the output lists no fonts, i.e. nothing follows the two header lines.
//
// pdffonts is invoked directly rather than through a shell pipeline, so path
// is never interpreted by a shell and a failing pdffonts is detected. If
// pdffonts cannot be run or exits with an error, the error is logged and
// false is returned.
func PDFHasImage(path string) bool {
	out, err := exec.Command("pdffonts", "-l", "5", path).Output()
	if err != nil {
		log.Println(err)
		return false
	}

	// Skip the two header lines; any remaining text means a font was listed.
	parts := strings.SplitN(string(out), "\n", 3)
	if len(parts) == 3 && parts[2] != "" {
		return false
	}
	return true
}
|
||
// Convert PDF | ||
func ConvertPDF(r io.Reader) (string, map[string]string, error) { | ||
f, err := NewLocalFile(r, "/tmp", "sajari-convert-") | ||
|
@@ -17,6 +115,11 @@ func ConvertPDF(r io.Reader) (string, map[string]string, error) { | |
} | ||
defer f.Done() | ||
|
||
// Verify if pdf has images or is pdf only-text | ||
if PDFHasImage(f.Name()) { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Looks like you missed a comment from an earlier diff: Does this mean that if a PDF has an embedded image then it will completely ignore the text of the document? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, in our case, we use this check for PDFs typically generated by scanners; many of them generate PDFs with each page being the first photo of the original document. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Ok, so you need to move this so that it's only enabled when OCR is enabled (otherwise PDFs which have images and text will be ignored if OCR hasn't been built in). |
||
return PDFImages(f.Name()) | ||
} | ||
|
||
// Meta data | ||
mc := make(chan map[string]string, 1) | ||
go func() { | ||
|
@@ -52,7 +155,6 @@ func ConvertPDF(r io.Reader) (string, map[string]string, error) { | |
|
||
mc <- meta | ||
}() | ||
|
||
// Document body | ||
bc := make(chan string, 1) | ||
go func() { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Return the error instead of logging it.