From 8729c9736d2a181663df3b7d0eac0ea4a01e506b Mon Sep 17 00:00:00 2001 From: Mario Idival Date: Mon, 28 Sep 2015 15:58:51 -0300 Subject: [PATCH 1/7] recognizing pdf with images --- pdf.go | 85 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 84 insertions(+), 1 deletion(-) diff --git a/pdf.go b/pdf.go index 76270fd..ab0de6f 100644 --- a/pdf.go +++ b/pdf.go @@ -3,12 +3,91 @@ package docconv import ( "fmt" "io" + "io/ioutil" "log" + "os" "os/exec" + "path/filepath" "strings" "time" ) +func compareExt(ext string, exts []string) bool { + for _, e := range exts { + if ext == e { + return true + } + } + return false +} + +func PdfImages(path string) (string, map[string]string, error) { + + tmp, err := ioutil.TempDir("/tmp", "tmp-imgs-") + if err != nil { + log.Println(err) + return "", nil, err + } + tmpDir := fmt.Sprintf("%s/", tmp) + defer os.RemoveAll(tmpDir) + + cmd := "pdfimages -j %s %s" + _, err = exec.Command("bash", "-c", fmt.Sprintf(cmd, path, tmpDir)).Output() + if err != nil { + log.Println(err) + return "", nil, err + } + + files := []string{} + m := make(map[int]string) + + walkFunc := func(path string, info os.FileInfo, err error) error { + path, err = filepath.Abs(path) + if err != nil { + return err + } + + exts := []string{".jpg", ".tif", ".tiff", ".png"} + if compareExt(filepath.Ext(path), exts) == true { + files = append(files, path) + } + return nil + } + + filepath.Walk(tmpDir, walkFunc) + + for indx, p := range files { + f, err := os.Open(p) + if err != nil { + log.Println(err) + continue + } + out, _, _ := ConvertImage(f) + m[indx] = out + f.Close() + } + o := make([]string, len(m)) + for i := 0; i < len(m); i++ { + o = append(o, m[i]) + } + + return strings.Join(o, " "), nil, nil +} + +// PdfHasImage verify if `path` (PDF) has images +func PdfHasImage(path string) bool { + cmd := "pdffonts -l 5 %s | tail -n +3 | cut -d' ' -f1 | sort | uniq" + out, err := exec.Command("bash", "-c", fmt.Sprintf(cmd, path)).Output() + if err != nil { + log.Println(err) + return false + } + if string(out) == "" { + return true + } + return false +} + // Convert PDF func ConvertPDF(r io.Reader) (string, map[string]string, error) { f, err := NewLocalFile(r, "/tmp", "sajari-convert-") @@ -17,6 +96,11 @@ func ConvertPDF(r io.Reader) (string, map[string]string, error) { } defer f.Done() + // Verify if pdf has images or is pdf only-text + if PdfHasImage(f.Name()) == true { + return PdfImages(f.Name()) + } + // Meta data mc := make(chan map[string]string, 1) go func() { @@ -52,7 +136,6 @@ func ConvertPDF(r io.Reader) (string, map[string]string, error) { mc <- meta }() - // Document body bc := make(chan string, 1) go func() { From 5bc56ae7f5a6cf44398ca6b8de94b43b636aedfd Mon Sep 17 00:00:00 2001 From: Mario Idival Date: Wed, 7 Oct 2015 10:35:30 -0300 Subject: [PATCH 2/7] use goroutines to process ConvertImage function --- pdf.go | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/pdf.go b/pdf.go index ab0de6f..c9d1186 100644 --- a/pdf.go +++ b/pdf.go @@ -9,6 +9,7 @@ import ( "os/exec" "path/filepath" "strings" + "sync" "time" ) @@ -22,7 +23,6 @@ func compareExt(ext string, exts []string) bool { } func PdfImages(path string) (string, map[string]string, error) { - tmp, err := ioutil.TempDir("/tmp", "tmp-imgs-") if err != nil { log.Println(err) @@ -31,8 +31,7 @@ func PdfImages(path string) (string, map[string]string, error) { tmpDir := fmt.Sprintf("%s/", tmp) defer os.RemoveAll(tmpDir) - cmd := "pdfimages -j %s %s" - _, err = exec.Command("bash", "-c", fmt.Sprintf(cmd, path, tmpDir)).Output() + _, err = exec.Command("pdfimages", "-j", path, tmpDir).Output() if err != nil { log.Println(err) return "", nil, err @@ -53,24 +52,28 @@ func PdfImages(path string) (string, map[string]string, error) { } return nil } - filepath.Walk(tmpDir, walkFunc) + var wg sync.WaitGroup + wg.Add(len(files)) for indx, p := range files { - f, err := os.Open(p) - if err != nil { - log.Println(err) - continue - } - out, _, _ := ConvertImage(f) - m[indx] = out - f.Close() + go func(idx int, pathFile string, m map[int]string, ww *sync.WaitGroup) { + defer ww.Done() + f, err := os.Open(pathFile) + if err != nil { + log.Println(err) + } + out, _, _ := ConvertImage(f) + m[idx] = out + f.Close() + }(indx, p, m, &wg) } + wg.Wait() + o := make([]string, len(m)) for i := 0; i < len(m); i++ { o = append(o, m[i]) } - return strings.Join(o, " "), nil, nil } From 8c05c208ca9068cd3027da5cc36b5b051dde3781 Mon Sep 17 00:00:00 2001 From: Mario Idival Date: Thu, 8 Oct 2015 10:44:25 -0300 Subject: [PATCH 3/7] fixing code review --- pdf.go | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/pdf.go b/pdf.go index c9d1186..deecd73 100644 --- a/pdf.go +++ b/pdf.go @@ -13,6 +13,10 @@ import ( "time" ) +var ( + exts = []string{".jpg", ".tif", ".tiff", ".png"} +) + func compareExt(ext string, exts []string) bool { for _, e := range exts { if ext == e { @@ -22,7 +26,7 @@ func compareExt(ext string, exts []string) bool { return false } -func PdfImages(path string) (string, map[string]string, error) { +func PDFImages(path string) (string, map[string]string, error) { tmp, err := ioutil.TempDir("/tmp", "tmp-imgs-") if err != nil { log.Println(err) @@ -33,7 +37,6 @@ func PdfImages(path string) (string, map[string]string, error) { _, err = exec.Command("pdfimages", "-j", path, tmpDir).Output() if err != nil { - log.Println(err) return "", nil, err } @@ -46,7 +49,6 @@ func PdfImages(path string) (string, map[string]string, error) { return err } - exts := []string{".jpg", ".tif", ".tiff", ".png"} if compareExt(filepath.Ext(path), exts) == true { files = append(files, path) } @@ -63,7 +65,10 @@ func PdfImages(path string) (string, map[string]string, error) { if err != nil { log.Println(err) } - out, _, _ := ConvertImage(f) + out, _, err := ConvertImage(f) + if err != nil { + log.Println(err) + } m[idx] = out f.Close() }(indx, p, m, &wg) @@ -78,7 +83,7 @@ func PdfImages(path string) (string, map[string]string, error) { } // PdfHasImage verify if `path` (PDF) has images -func PdfHasImage(path string) bool { +func PDFHasImage(path string) bool { cmd := "pdffonts -l 5 %s | tail -n +3 | cut -d' ' -f1 | sort | uniq" out, err := exec.Command("bash", "-c", fmt.Sprintf(cmd, path)).Output() if err != nil { @@ -100,8 +105,8 @@ func ConvertPDF(r io.Reader) (string, map[string]string, error) { defer f.Done() // Verify if pdf has images or is pdf only-text - if PdfHasImage(f.Name()) == true { - return PdfImages(f.Name()) + if PDFHasImage(f.Name()) { + return PDFImages(f.Name()) } // Meta data From c4246a88c151bd8974c7da94e82dff450e27f5d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rio=20Idival?= Date: Tue, 13 Oct 2015 16:56:35 -0300 Subject: [PATCH 4/7] remove map to race condition --- pdf.go | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/pdf.go b/pdf.go index deecd73..72bc368 100644 --- a/pdf.go +++ b/pdf.go @@ -11,12 +11,24 @@ import ( "strings" "sync" "time" + "sort" ) var ( exts = []string{".jpg", ".tif", ".tiff", ".png"} ) +type PagePDF struct { + NumPage int + Page string +} + +type ByPage []PagePDF + +func (a ByPage) Len() int { return len(a) } +func (a ByPage) Swap(i, j int) { a[i], a[j] = a[j], a[i] } +func (a ByPage) Less(i, j int) bool { return a[i].NumPage < a[j].NumPage } + func compareExt(ext string, exts []string) bool { for _, e := range exts { if ext == e { @@ -27,6 +39,7 @@ func compareExt(ext string, exts []string) bool { } func PDFImages(path string) (string, map[string]string, error) { + log.Println("Running with PDFImages") tmp, err := ioutil.TempDir("/tmp", "tmp-imgs-") if err != nil { log.Println(err) @@ -41,7 +54,7 @@ func PDFImages(path string) (string, map[string]string, error) { } files := []string{} - m := make(map[int]string) + m := []PagePDF{} walkFunc := func(path string, info os.FileInfo, err error) error { path, err = filepath.Abs(path) @@ -59,7 +72,7 @@ func PDFImages(path string) (string, map[string]string, error) { var wg sync.WaitGroup wg.Add(len(files)) for indx, p := range files { - go func(idx int, pathFile string, m map[int]string, ww *sync.WaitGroup) { + go func(idx int, pathFile string, m *[]PagePDF, ww *sync.WaitGroup) { defer ww.Done() f, err := os.Open(pathFile) if err != nil { @@ -69,16 +82,23 @@ func PDFImages(path string) (string, map[string]string, error) { if err != nil { log.Println(err) } - m[idx] = out + var p PagePDF + p.NumPage = idx + p.Page = out + *m = append(*m, p) f.Close() - }(indx, p, m, &wg) + }(indx, p, &m, &wg) } wg.Wait() o := make([]string, len(m)) - for i := 0; i < len(m); i++ { - o = append(o, m[i]) + + sort.Sort(ByPage(m)) + + for _, sPdf := range m { + o = append(o, sPdf.Page) } + return strings.Join(o, " "), nil, nil } From 80f9689b27870039dc0058bd3b2fda6f8a4f0423 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rio=20Idival?= Date: Tue, 13 Oct 2015 18:46:30 -0300 Subject: [PATCH 5/7] add Lock on goroutine --- pdf.go | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pdf.go b/pdf.go index 72bc368..424432a 100644 --- a/pdf.go +++ b/pdf.go @@ -39,7 +39,6 @@ func compareExt(ext string, exts []string) bool { } func PDFImages(path string) (string, map[string]string, error) { - log.Println("Running with PDFImages") tmp, err := ioutil.TempDir("/tmp", "tmp-imgs-") if err != nil { log.Println(err) @@ -70,6 +69,8 @@ func PDFImages(path string) (string, map[string]string, error) { filepath.Walk(tmpDir, walkFunc) var wg sync.WaitGroup + mt := &sync.Mutex{} + wg.Add(len(files)) for indx, p := range files { go func(idx int, pathFile string, m *[]PagePDF, ww *sync.WaitGroup) { @@ -85,7 +86,11 @@ func PDFImages(path string) (string, map[string]string, error) { var p PagePDF p.NumPage = idx p.Page = out + + mt.Lock() *m = append(*m, p) + mt.Unlock() + f.Close() }(indx, p, &m, &wg) } From fec59c26a74e8a6f9b7001b294af727b27693c14 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rio=20Idival?= Date: Wed, 14 Oct 2015 15:17:48 -0300 Subject: [PATCH 6/7] add sync.Mutex in aanonymous struct --- pdf.go | 32 ++++++++++---------------------- 1 file changed, 10 insertions(+), 22 deletions(-) diff --git a/pdf.go b/pdf.go index 424432a..f5ebe55 100644 --- a/pdf.go +++ b/pdf.go @@ -18,17 +18,10 @@ var ( exts = []string{".jpg", ".tif", ".tiff", ".png"} ) -type PagePDF struct { - NumPage int - Page string +var pdfMutex struct { + sync.Mutex } -type ByPage []PagePDF - -func (a ByPage) Len() int { return len(a) } -func (a ByPage) Swap(i, j int) { a[i], a[j] = a[j], a[i] } -func (a ByPage) Less(i, j int) bool { return a[i].NumPage < a[j].NumPage } - func compareExt(ext string, exts []string) bool { for _, e := range exts { if ext == e { @@ -69,11 +62,11 @@ func PDFImages(path string) (string, map[string]string, error) { filepath.Walk(tmpDir, walkFunc) var wg sync.WaitGroup - mt := &sync.Mutex{} + m := make(map[int]string) wg.Add(len(files)) for indx, p := range files { - go func(idx int, pathFile string, m *[]PagePDF, ww *sync.WaitGroup) { + go func(idx int, pathFile string, m map[int]string, ww *sync.WaitGroup) { defer ww.Done() f, err := os.Open(pathFile) if err != nil { @@ -83,25 +76,20 @@ func PDFImages(path string) (string, map[string]string, error) { if err != nil { log.Println(err) } - var p PagePDF - p.NumPage = idx - p.Page = out - mt.Lock() - *m = append(*m, p) - mt.Unlock() + pdfMutex.Lock() + m[idx] = out + pdfMutex.Unlock() f.Close() - }(indx, p, &m, &wg) + }(indx, p, m, &wg) } wg.Wait() o := make([]string, len(m)) - sort.Sort(ByPage(m)) - - for _, sPdf := range m { - o = append(o, sPdf.Page) + for i := 0; i < len(m); i++ { + o = append(o, m[i]) } return strings.Join(o, " "), nil, nil From 5b66790c52b640431421539e2deddb05e557723c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rio=20Idival?= Date: Thu, 15 Oct 2015 09:41:34 -0300 Subject: [PATCH 7/7] remove old things --- pdf.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/pdf.go b/pdf.go index f5ebe55..10ff0fa 100644 --- a/pdf.go +++ b/pdf.go @@ -11,7 +11,6 @@ import ( "strings" "sync" "time" - "sort" ) var ( @@ -46,7 +45,6 @@ func PDFImages(path string) (string, map[string]string, error) { } files := []string{} - m := []PagePDF{} walkFunc := func(path string, info os.FileInfo, err error) error { path, err = filepath.Abs(path)