Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Convert Images in PDFs #40

Merged
merged 7 commits into from
Dec 1, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@

sajari-convert
*tests/
*tests/
2 changes: 1 addition & 1 deletion image_ocr.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import (
"io"
"sync"

"github.com/otiai10/gosseract"
"github.com/otiai10/gosseract/v1/gosseract"
)

var langs = struct {
Expand Down
62 changes: 14 additions & 48 deletions pdf.go
Original file line number Diff line number Diff line change
@@ -1,64 +1,30 @@
// +build !ocr

package docconv

import (
"fmt"
"io"
"log"
"os/exec"
"strings"
"time"
)

// ConvertPDF converts a PDF file to text.
//
// It copies r into a local temporary file, extracts the text and metadata of
// that file, and returns the body text, the metadata key/value pairs and an
// error.
//
// NOTE(review): this span appears to be a rendered diff (the hunk header reads
// "-1,64 +1,30"), so lines from before and after the change seem to be
// interleaved. As shown, the inline pdfinfo/pdftotext goroutines are started,
// but the only statement that reads their channels sits after an unconditional
// return and is unreachable. Confirm against the real file before relying on
// this listing.
func ConvertPDF(r io.Reader) (string, map[string]string, error) {

// Spool the reader to a local file so the command-line tools can open it by
// name; f.Done is deferred to run when ConvertPDF returns.
f, err := NewLocalFile(r, "/tmp", "sajari-convert-")
if err != nil {
return "", nil, fmt.Errorf("error creating local file: %v", err)
}
defer f.Done()

// Meta data: run pdfinfo in a goroutine and deliver the parsed map on mc.
// The channel has capacity 1, so the send does not block.
mc := make(chan map[string]string, 1)
go func() {
meta := make(map[string]string)
metaStr, err := exec.Command("pdfinfo", f.Name()).Output()
if err != nil {
// TODO: Remove this.
// The error is only logged; parsing continues with whatever output exists.
log.Println("pdfinfo:", err)
}

// Parse meta output: each line is split at its first ':' into a key and a
// value, both trimmed of surrounding whitespace. Lines without ':' are
// skipped.
for _, line := range strings.Split(string(metaStr), "\n") {
if parts := strings.SplitN(line, ":", 2); len(parts) > 1 {
meta[strings.TrimSpace(parts[0])] = strings.TrimSpace(parts[1])
}
}

// Convert parsed dates into unix timestamps. Values that do not match the
// time.ANSIC layout are left without a derived entry.
if tmp, ok := meta["ModDate"]; ok {
if t, err := time.Parse(time.ANSIC, tmp); err == nil {
meta["ModifiedDate"] = fmt.Sprintf("%d", t.Unix())
}
}
if tmp, ok := meta["CreationDate"]; ok {
if t, err := time.Parse(time.ANSIC, tmp); err == nil {
meta["CreatedDate"] = fmt.Sprintf("%d", t.Unix())
}
}

mc <- meta
}()

// Document body: run pdftotext in a goroutine, writing the text to stdout
// ("-" as the output file), and deliver it on bc.
bc := make(chan string, 1)
go func() {
body, err := exec.Command("pdftotext", "-q", "-nopgbrk", "-enc", "UTF-8", "-eol", "unix", f.Name(), "-").Output()
if err != nil {
// TODO: Remove this.
// The error is only logged; the (possibly empty) output is still sent.
log.Println("pdftotext:", err)
}
bc <- string(body)
}()
// Delegate to ConvertPDFText and give up on the first error it reports,
// whether returned directly or carried inside the body or metadata result.
bodyResult, metaResult, convertErr := ConvertPDFText(f.Name())
if convertErr != nil {
return "", nil, convertErr
}
if bodyResult.err != nil {
return "", nil, bodyResult.err
}
if metaResult.err != nil {
return "", nil, metaResult.err
}
return bodyResult.body, metaResult.meta, nil

// NOTE(review): unreachable as shown, because the return above is
// unconditional.
return <-bc, <-mc, nil
}
163 changes: 163 additions & 0 deletions pdf_ocr.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
// +build ocr

package docconv

import (
"fmt"
"io"
"io/ioutil"
"log"
"os"
"os/exec"
"path/filepath"
"strings"
"sync"
)

// exts lists the file extensions (including the leading dot) of extracted
// image files that are passed on for conversion.
//
// NOTE(review): pdfimages can also write .ppm files, which are not listed
// here; confirm whether that omission is intentional.
var exts = []string{".jpg", ".tif", ".tiff", ".png", ".pbm"}

// compareExt reports whether ext is exactly equal to one of the entries of
// exts. The comparison is case-sensitive.
func compareExt(ext string, exts []string) bool {
	for i := 0; i < len(exts); i++ {
		if exts[i] == ext {
			return true
		}
	}
	return false
}

// cleanupTemp removes tmpDir together with everything beneath it. A failure is
// logged rather than returned.
func cleanupTemp(tmpDir string) {
	if rmErr := os.RemoveAll(tmpDir); rmErr != nil {
		log.Println(rmErr)
	}
}

// ConvertPDFImages extracts the images embedded in the PDF at path with
// pdfimages, passes every extracted file whose extension is listed in exts to
// ConvertImage, and returns the collected text.
//
// The text of each converted image is followed by a single space. The pieces
// are concatenated in the order filepath.Walk reports the extracted files
// (lexical order), so the result does not depend on goroutine scheduling.
//
// The returned error is non-nil when the temporary directory cannot be
// created, pdfimages fails, or the extracted files cannot be listed. A failure
// to open or convert an individual image does not stop the other images: the
// first such failure, in file order, is stored in the err field of the result
// and the returned error is nil.
func ConvertPDFImages(path string) (BodyResult, error) {
	bodyResult := BodyResult{}

	tmp, err := ioutil.TempDir("/tmp", "tmp-imgs-")
	if err != nil {
		bodyResult.err = err
		return bodyResult, err
	}
	// pdfimages uses its last argument as a file-name prefix; the trailing
	// slash makes it write the images inside the temporary directory.
	tmpDir := fmt.Sprintf("%s/", tmp)
	defer cleanupTemp(tmpDir)

	if _, err := exec.Command("pdfimages", "-j", path, tmpDir).Output(); err != nil {
		return bodyResult, err
	}

	var files []string
	walkErr := filepath.Walk(tmpDir, func(p string, info os.FileInfo, err error) error {
		if err != nil {
			// info may be nil here, so stop before touching it.
			return err
		}
		if info.IsDir() {
			return nil
		}
		abs, err := filepath.Abs(p)
		if err != nil {
			return err
		}
		if compareExt(filepath.Ext(abs), exts) {
			files = append(files, abs)
		}
		return nil
	})
	if walkErr != nil {
		return bodyResult, walkErr
	}
	if len(files) < 1 {
		return bodyResult, nil
	}

	// Each goroutine writes only to its own slot, so no locking is needed and
	// the results keep the order of files.
	type imageResult struct {
		text      string
		converted bool // ConvertImage was reached for this file
		err       error
	}
	results := make([]imageResult, len(files))

	var wg sync.WaitGroup
	wg.Add(len(files))
	for i, p := range files {
		go func(i int, pathFile string) {
			defer wg.Done()

			f, err := os.Open(pathFile)
			if err != nil {
				results[i].err = err
				return
			}
			defer f.Close()

			out, _, err := ConvertImage(f)
			results[i] = imageResult{text: out, converted: true, err: err}
		}(i, p)
	}
	wg.Wait()

	parts := make([]string, 0, len(results))
	for _, r := range results {
		if r.err != nil && bodyResult.err == nil {
			bodyResult.err = r.err
		}
		if r.converted {
			parts = append(parts, r.text)
		}
	}
	if len(parts) > 0 {
		// Equivalent to appending a space after every piece.
		bodyResult.body = strings.Join(parts, " ") + " "
	}

	return bodyResult, nil
}

// PDFHasImage reports whether the PDF at path should be treated as containing
// images.
//
// It runs pdffonts over the first five pages and returns true when no fonts
// are listed for them. If pdffonts cannot be started or exits with an error,
// the error is logged and false is returned.
//
// pdffonts is invoked directly rather than through a shell, so path is always
// passed as one argument regardless of the characters it contains.
func PDFHasImage(path string) bool {
	out, err := exec.Command("pdffonts", "-l", "5", path).Output()
	if err != nil {
		log.Println(err)
		return false
	}

	// The first two lines of the output are the column header and its
	// underline; every further line describes one font.
	lines := strings.Split(strings.TrimRight(string(out), "\n"), "\n")
	return len(lines) <= 2
}

// ConvertPDF converts a PDF file to text, appending text recognised in
// embedded images when PDFHasImage reports that the document has them.
//
// The reader is first copied to a local temporary file. A failure to create
// that file, or any error reported by ConvertPDFText, is returned. Errors from
// the image conversion are only logged; in that case the plain text and
// metadata are returned without the image text.
func ConvertPDF(r io.Reader) (string, map[string]string, error) {
	f, err := NewLocalFile(r, "/tmp", "sajari-convert-")
	if err != nil {
		return "", nil, fmt.Errorf("error creating local file: %v", err)
	}
	defer f.Done()

	text, meta, err := ConvertPDFText(f.Name())
	switch {
	case err != nil:
		return "", nil, err
	case text.err != nil:
		return "", nil, text.err
	case meta.err != nil:
		return "", nil, meta.err
	}

	if PDFHasImage(f.Name()) {
		ocr, ocrErr := ConvertPDFImages(f.Name())
		if ocrErr == nil {
			// No direct failure; fall back to the error carried in the result.
			ocrErr = ocr.err
		}
		if ocrErr == nil {
			return strings.Join([]string{text.body, ocr.body}, " "), meta.meta, nil
		}
		log.Println(ocrErr)
	}

	return text.body, meta.meta, nil
}
73 changes: 73 additions & 0 deletions pdf_text.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
package docconv

import (
"fmt"
"os/exec"
"strings"
"time"
)

type (
	// MetaResult carries the metadata key/value pairs read for a PDF together
	// with any error met while reading them.
	MetaResult struct {
		meta map[string]string
		err  error
	}

	// BodyResult carries the text extracted from a PDF together with any error
	// met while extracting it.
	BodyResult struct {
		body string
		err  error
	}
)

// ConvertPDFText extracts the text and the metadata of the PDF at path by
// running pdftotext and pdfinfo concurrently.
//
// A failure of either command is reported through the err field of the
// corresponding result. When pdfinfo fails the metadata map is returned empty;
// when pdftotext fails the body holds whatever output was captured. The third
// return value is always nil.
func ConvertPDFText(path string) (BodyResult, MetaResult, error) {
	// Both channels have capacity 1 so each goroutine can finish without
	// waiting for the receive below.
	mr := make(chan MetaResult, 1)
	go func() {
		res := MetaResult{meta: make(map[string]string)}
		out, err := exec.Command("pdfinfo", path).Output()
		if err != nil {
			res.err = err
			mr <- res
			return
		}
		parsePDFInfo(string(out), res.meta)
		mr <- res
	}()

	br := make(chan BodyResult, 1)
	go func() {
		// "-" as the output file makes pdftotext write the text to stdout.
		out, err := exec.Command("pdftotext", "-q", "-nopgbrk", "-enc", "UTF-8", "-eol", "unix", path, "-").Output()
		br <- BodyResult{body: string(out), err: err}
	}()

	return <-br, <-mr, nil
}

// parsePDFInfo fills meta from pdfinfo output: every line containing a ':' is
// split at the first one into a trimmed key and value. It then adds
// ModifiedDate and CreatedDate entries derived from ModDate and CreationDate.
func parsePDFInfo(out string, meta map[string]string) {
	for _, line := range strings.Split(out, "\n") {
		if parts := strings.SplitN(line, ":", 2); len(parts) > 1 {
			meta[strings.TrimSpace(parts[0])] = strings.TrimSpace(parts[1])
		}
	}
	setUnixDate(meta, "ModDate", "ModifiedDate")
	setUnixDate(meta, "CreationDate", "CreatedDate")
}

// setUnixDate stores meta[src], parsed with the time.ANSIC layout, under dst as
// a decimal Unix timestamp. It does nothing when src is absent or its value
// does not match the layout.
func setUnixDate(meta map[string]string, src, dst string) {
	raw, ok := meta[src]
	if !ok {
		return
	}
	if t, err := time.Parse(time.ANSIC, raw); err == nil {
		meta[dst] = fmt.Sprintf("%d", t.Unix())
	}
}