Skip to content

Commit

Permalink
v2: initial commit (#153)
Browse files Browse the repository at this point in the history
- Some methods have been changed to return an error as their last
  return value
- Log calls inside various functions have been removed
- Use a v1 tag if you need the previous signature
  • Loading branch information
jonathaningram authored Oct 31, 2023
1 parent e9e59ef commit 93312f4
Show file tree
Hide file tree
Showing 27 changed files with 88 additions and 104 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/docd.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ jobs:
with:
images: sajari/docd
labels: |
org.opencontainers.image.description=A tool which exposes code.sajari.com/docconv as a service
org.opencontainers.image.description=A tool which exposes code.sajari.com/docconv/v2 as a service
org.opencontainers.image.title=docd
tags: |
type=semver,pattern={{version}}
Expand Down
14 changes: 7 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
# docconv

[![Go reference](https://pkg.go.dev/badge/code.sajari.com/docconv.svg)](https://pkg.go.dev/code.sajari.com/docconv)
[![Go reference](https://pkg.go.dev/badge/code.sajari.com/docconv/v2.svg)](https://pkg.go.dev/code.sajari.com/docconv/v2)
[![Build status](https://github.com/sajari/docconv/workflows/Go/badge.svg?branch=master)](https://github.com/sajari/docconv/actions)
[![Report card](https://goreportcard.com/badge/code.sajari.com/docconv)](https://goreportcard.com/report/code.sajari.com/docconv)
[![Sourcegraph](https://sourcegraph.com/github.com/sajari/docconv/-/badge.svg)](https://sourcegraph.com/github.com/sajari/docconv)
[![Report card](https://goreportcard.com/badge/code.sajari.com/docconv/v2)](https://goreportcard.com/report/code.sajari.com/docconv/v2)
[![Sourcegraph](https://sourcegraph.com/github.com/sajari/docconv/v2/-/badge.svg)](https://sourcegraph.com/github.com/sajari/docconv/v2)

A Go wrapper library to convert PDF, DOC, DOCX, XML, HTML, RTF, ODT, Pages documents and images (see optional dependencies below) to plain text.

Expand All @@ -14,7 +14,7 @@ If you haven't setup Go before, you first need to [install Go](https://golang.or
To fetch and build the code:

```console
$ go install code.sajari.com/docconv/docd@latest
$ go install code.sajari.com/docconv/v2/docd@latest
```

See `go help install` for details on the installation location of the installed `docd` executable. Make sure that the full path to the executable is in your `PATH` environment variable.
Expand Down Expand Up @@ -48,7 +48,7 @@ To add image support to the `docconv` library you first need to [install and bui
Now you can add `-tags ocr` to any `go` command when building/fetching/testing `docconv` to include support for processing images:

```console
$ go get -tags ocr code.sajari.com/docconv/...
$ go get -tags ocr code.sajari.com/docconv/v2/...
```

This may complain on macOS, which you can fix by installing [tesseract](https://tesseract-ocr.github.io) via brew:
Expand Down Expand Up @@ -119,7 +119,7 @@ package main
import (
"fmt"

"code.sajari.com/docconv"
"code.sajari.com/docconv/v2"
)

func main() {
Expand All @@ -139,7 +139,7 @@ package main
import (
"fmt"

"code.sajari.com/docconv/client"
"code.sajari.com/docconv/v2/client"
)

func main() {
Expand Down
2 changes: 1 addition & 1 deletion client/cmd/docconv-client/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ import (
"fmt"
"os"

"code.sajari.com/docconv/client"
"code.sajari.com/docconv/v2/client"
)

var (
Expand Down
18 changes: 6 additions & 12 deletions doc.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ import (
"bytes"
"fmt"
"io"
"log"
"os"
"os/exec"
"time"
Expand All @@ -26,24 +25,24 @@ func ConvertDoc(r io.Reader) (string, map[string]string, error) {
go func() {
defer func() {
if e := recover(); e != nil {
log.Printf("panic when reading doc format: %v", e)
// TODO: Propagate error.
}
}()

meta := make(map[string]string)

doc, err := mscfb.New(f)
if err != nil {
log.Printf("ConvertDoc: could not read doc: %v", err)
// TODO: Propagate error.
mc <- meta
return
}

props := msoleps.New()
for entry, err := doc.Next(); err == nil; entry, err = doc.Next() {
if msoleps.IsMSOLEPS(entry.Initial) {
if oerr := props.Reset(doc); oerr != nil {
log.Printf("ConvertDoc: could not reset props: %v", oerr)
if err := props.Reset(doc); err != nil {
// TODO: Propagate error.
break
}

Expand Down Expand Up @@ -73,28 +72,23 @@ func ConvertDoc(r io.Reader) (string, map[string]string, error) {
// Document body
bc := make(chan string, 1)
go func() {

// Save output to a file
var buf bytes.Buffer
outputFile, err := os.CreateTemp("/tmp", "sajari-convert-")
if err != nil {
// TODO: Remove this.
log.Println("TempFile Out:", err)
bc <- buf.String()
return
}
defer os.Remove(outputFile.Name())

err = exec.Command("wvText", f.Name(), outputFile.Name()).Run()
if err != nil {
// TODO: Remove this.
log.Println("wvText:", err)
// TODO: Propagate error.
}

_, err = buf.ReadFrom(outputFile)
if err != nil {
// TODO: Remove this.
log.Println("wvText:", err)
// TODO: Propagate error.
}

bc <- buf.String()
Expand Down
2 changes: 1 addition & 1 deletion docconv.go
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package docconv // import "code.sajari.com/docconv"
package docconv // import "code.sajari.com/docconv/v2"

import (
"encoding/json"
Expand Down
4 changes: 2 additions & 2 deletions docd/convert.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@ import (

"cloud.google.com/go/errorreporting"

"code.sajari.com/docconv"
"code.sajari.com/docconv/docd/internal"
"code.sajari.com/docconv/v2"
"code.sajari.com/docconv/v2/docd/internal"
)

type convertServer struct {
Expand Down
8 changes: 4 additions & 4 deletions docd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,10 @@ import (

"github.com/gorilla/mux"

"code.sajari.com/docconv"
"code.sajari.com/docconv/docd/internal"
"code.sajari.com/docconv/docd/internal/cloudtrace"
"code.sajari.com/docconv/docd/internal/debug"
"code.sajari.com/docconv/v2"
"code.sajari.com/docconv/v2/docd/internal"
"code.sajari.com/docconv/v2/docd/internal/cloudtrace"
"code.sajari.com/docconv/v2/docd/internal/debug"
)

var (
Expand Down
2 changes: 1 addition & 1 deletion docx_test/docx_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ import (
"strings"
"testing"

"code.sajari.com/docconv"
"code.sajari.com/docconv/v2"
)

func TestConvertDocx(t *testing.T) {
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
module code.sajari.com/docconv
module code.sajari.com/docconv/v2

go 1.21

Expand Down
30 changes: 16 additions & 14 deletions html.go
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
// +build !appengine
//go:build !appengine

package docconv

import (
"bytes"
"io"
"log"
"strings"

"golang.org/x/net/html"
Expand All @@ -25,18 +24,23 @@ func ConvertHTML(r io.Reader, readability bool) (string, map[string]string, erro

cleanXML, err := Tidy(buf, false)
if err != nil {
log.Println("Tidy:", err)
// Tidy failed, so we now manually tokenize instead
clean := cleanHTML(buf, true)
cleanXML = []byte(clean)
// TODO: remove this log
log.Println("Cleaned HTML using Golang tokenizer")
}

if readability {
cleanXML = HTMLReadability(bytes.NewReader(cleanXML))
var err error
cleanXML, err = HTMLReadability(bytes.NewReader(cleanXML))
if err != nil {
return "", nil, err
}
}
text, err := HTMLToText(bytes.NewReader(cleanXML))
if err != nil {
return "", nil, err
}
return HTMLToText(bytes.NewReader(cleanXML)), meta, nil
return text, meta, nil
}

var acceptedHTMLTags = [...]string{
Expand Down Expand Up @@ -127,7 +131,7 @@ type HTMLReadabilityOptions struct {
var HTMLReadabilityOptionsValues HTMLReadabilityOptions

// HTMLReadability extracts the readable text in an HTML document
func HTMLReadability(r io.Reader) []byte {
func HTMLReadability(r io.Reader) ([]byte, error) {
jr := justext.NewReader(r)

// TODO: Improve this!
Expand All @@ -141,8 +145,7 @@ func HTMLReadability(r io.Reader) []byte {

paragraphSet, err := jr.ReadAll()
if err != nil {
log.Println("Justext:", err)
return nil
return nil, err
}

useClasses := strings.SplitN(HTMLReadabilityOptionsValues.ReadabilityUseClasses, ",", 10)
Expand All @@ -156,13 +159,12 @@ func HTMLReadability(r io.Reader) []byte {
}
}

return []byte(output)
return []byte(output), nil
}

// HTMLToText converts HTML to plain text.
func HTMLToText(input io.Reader) string {
text, _ := XMLToText(input, []string{"br", "p", "h1", "h2", "h3", "h4"}, []string{}, false)
return text
func HTMLToText(input io.Reader) (string, error) {
return XMLToText(input, []string{"br", "p", "h1", "h2", "h3", "h4"}, []string{}, false)
}

var readabilityStopList = map[string]bool{"and": true, "the": true, "a": true, "about": true, "above": true, "across": true, "after": true, "afterwards": true, "again": true, "against": true, "all": true, "almost": true, "alone": true,
Expand Down
11 changes: 2 additions & 9 deletions html_appengine.go
Original file line number Diff line number Diff line change
@@ -1,18 +1,11 @@
//go:build appengine
// +build appengine

package docconv

import (
"io"
"log"
)

func HTMLReadability(r io.Reader) []byte {
b, err := io.ReadAll(r)
if err != nil {
log.Printf("HTMLReadability: %v", err)
return nil
}
return b
func HTMLReadability(r io.Reader) ([]byte, error) {
return io.ReadAll(r)
}
2 changes: 1 addition & 1 deletion html_test/html_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ import (

"github.com/google/go-cmp/cmp"

"code.sajari.com/docconv"
"code.sajari.com/docconv/v2"
)

func TestConvertHTML_readabilityUseClasses(t *testing.T) {
Expand Down
10 changes: 5 additions & 5 deletions iWork/TSPArchiveMessages.pb.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

10 changes: 5 additions & 5 deletions iWork/TSPDatabaseMessages.pb.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 5 additions & 4 deletions iWork/TSPMessages.pb.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion iWork/pb-schema/TSPArchiveMessages.proto
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
syntax = "proto2";

option go_package = "code.sajari.com/docconv/tsp";
option go_package = "code.sajari.com/docconv/v2/tsp";

package TSP;

Expand Down
3 changes: 1 addition & 2 deletions iWork/pb-schema/TSPDatabaseMessages.proto
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
syntax = "proto2";

option go_package = "code.sajari.com/docconv/tsp";
option go_package = "code.sajari.com/docconv/v2/tsp";

import "TSPMessages.proto";
package TSP;
Expand All @@ -27,4 +27,3 @@ message DatabaseImageDataArchive {
required .TSP.DatabaseDataArchive super = 1;
required .TSP.DatabaseImageDataArchive.ImageType type = 2;
}

3 changes: 1 addition & 2 deletions iWork/pb-schema/TSPMessages.proto
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
syntax = "proto2";

option go_package = "code.sajari.com/docconv/tsp";
option go_package = "code.sajari.com/docconv/v2/tsp";

package TSP;

Expand Down Expand Up @@ -94,4 +94,3 @@ message ObjectContainer {
optional uint32 identifier = 1;
repeated .TSP.Reference objects = 2;
}

2 changes: 1 addition & 1 deletion image.go
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// +build !ocr
//go:build !ocr

package docconv

Expand Down
Loading

0 comments on commit 93312f4

Please sign in to comment.