Skip to content

Commit

Permalink
revert to daulet tokenizers, version upgrades
Browse files Browse the repository at this point in the history
  • Loading branch information
RJKeevil committed Aug 13, 2024
1 parent 2ff025f commit 5421f62
Show file tree
Hide file tree
Showing 10 changed files with 54 additions and 74 deletions.
39 changes: 14 additions & 25 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,46 +1,35 @@
#--- dockerfile with hugot dependencies and cli (cpu only) ---

ARG GO_VERSION=1.22.5
ARG RUST_VERSION=1.79
ARG GO_VERSION=1.22.6
ARG ONNXRUNTIME_VERSION=1.18.0
ARG BUILD_PLATFORM=linux/amd64

#--- rust build of tokenizer ---

FROM --platform=$BUILD_PLATFORM rust:$RUST_VERSION AS tokenizer

COPY ./go.mod .

RUN tokenizer_version=$(grep 'github.com/knights-analytics/tokenizers' go.mod | awk '{print $2}') && \
tokenizer_version=$(echo $tokenizer_version | awk -F'-' '{print $NF}') && \
echo "tokenizer_version: $tokenizer_version" && \
git clone https://github.com/knights-analytics/tokenizers && \
cd tokenizers && \
git checkout $tokenizer_version && \
cargo build --release

#--- build layer ---

FROM --platform=$BUILD_PLATFORM public.ecr.aws/amazonlinux/amazonlinux:2023 AS hugot-build
ARG GO_VERSION
ARG ONNXRUNTIME_VERSION

RUN dnf -y install gcc jq bash tar xz gzip glibc-static libstdc++ wget zip git && \
COPY ./go.mod .

RUN dnf -y install gcc jq bash tar xz gzip glibc-static libstdc++ wget zip git which && \
ln -s /usr/lib64/libstdc++.so.6 /usr/lib64/libstdc++.so && \
dnf install -y 'dnf-command(config-manager)' && \
dnf config-manager --add-repo https://download.fedoraproject.org/pub/fedora/linux/releases/39/Everything/x86_64/os/ && \
dnf clean all

RUN tokenizer_version=$(grep 'github.com/daulet/tokenizers' go.mod | awk '{print $2}') && \
tokenizer_version=$(echo $tokenizer_version | awk -F'-' '{print $NF}') && \
echo "tokenizer_version: $tokenizer_version" && \
curl -LO https://github.com/daulet/tokenizers/releases/download/${tokenizer_version}/libtokenizers.linux-amd64.tar.gz && \
tar -C /usr/lib -xzf libtokenizers.linux-amd64.tar.gz && \
rm libtokenizers.linux-amd64.tar.gz

# go
RUN curl -LO https://golang.org/dl/go${GO_VERSION}.linux-amd64.tar.gz && \
tar -C /usr/local -xzf go${GO_VERSION}.linux-amd64.tar.gz && \
rm go${GO_VERSION}.linux-amd64.tar.gz
ENV PATH="$PATH:/usr/local/go/bin"

# tokenizer
COPY --from=tokenizer /tokenizers/target/release/libtokenizers.a /usr/lib/libtokenizers.a

# onnxruntime cpu and gpu
# onnxruntime cpu
RUN curl -LO https://github.com/microsoft/onnxruntime/releases/download/v${ONNXRUNTIME_VERSION}/onnxruntime-linux-x64-${ONNXRUNTIME_VERSION}.tgz && \
tar -xzf onnxruntime-linux-x64-${ONNXRUNTIME_VERSION}.tgz && \
mv ./onnxruntime-linux-x64-${ONNXRUNTIME_VERSION}/lib/libonnxruntime.so.${ONNXRUNTIME_VERSION} /usr/lib64/onnxruntime.so
Expand All @@ -53,6 +42,6 @@ RUN cd ./cmd && CGO_ENABLED=1 CGO_LDFLAGS="-L/usr/lib/" GOOS=linux GOARCH=amd64
#--- final layer ---
FROM --platform=$BUILD_PLATFORM public.ecr.aws/amazonlinux/amazonlinux:2023 AS final

COPY --from=tokenizer /tokenizers/target/release/libtokenizers.a /usr/lib/libtokenizers.a
COPY --from=hugot-build /usr/lib/libtokenizers.a /usr/lib/libtokenizers.a
COPY --from=hugot-build /build/cmd/target /hugot-cli
COPY --from=hugot-build /usr/lib64/onnxruntime.so /usr/lib64/onnxruntime.so
COPY --from=hugot-build /usr/lib64/onnxruntime.so /usr/lib64/onnxruntime.so
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ Hugot can be used in two ways: as a library in your go application, or as a comm

To use Hugot as a library in your application, you will need the following two dependencies on your system:

- the tokenizers.a file obtained from the releases section of this page (if you want to use alternative architecture from `linux/amd64` you will have to build the tokenizers.a yourself, see [here](https://github.com/knights-analytics/tokenizers). This file should be at /usr/lib/tokenizers.a so that hugot can load it. Alternatively, you can explicitly specify the path to the folder with the `libtokenizers.a` file using the `CGO_LDFLAGS` env variable, see the [dockerfile](./Dockerfile).
- the libtokenizers.a file obtained from the releases section of this page (if you want to use an architecture other than `linux/amd64` you will have to build libtokenizers.a yourself, see [here](https://github.com/daulet/tokenizers)). This file should be at /usr/lib/libtokenizers.a so that hugot can load it. Alternatively, you can explicitly specify the path to the folder with the `libtokenizers.a` file using the `CGO_LDFLAGS` env variable, see the [dockerfile](./Dockerfile).
- the onnxruntime.so file obtained from the releases section of this page (if you want to use an architecture other than `linux/amd64` you will have to download it from [the onnxruntime releases page](https://github.com/microsoft/onnxruntime/releases/), see the [dockerfile](./Dockerfile) as an example). Hugot looks for this file at /usr/lib/onnxruntime.so or /usr/lib64/onnxruntime.so by default. A different location can be specified by passing the `WithOnnxLibraryPath()` option to `NewSession()`, e.g.:

```
Expand Down
2 changes: 1 addition & 1 deletion compose-dev.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ services:
platform: linux/amd64
build:
context: .
dockerfile: ./Dockerfile.test
dockerfile: ./test.Dockerfile
target: hugot-build
volumes:
- $src_dir:/home/testuser/repositories/hugot
Expand Down
2 changes: 1 addition & 1 deletion compose-test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ services:
container_name: hugot
build:
context: .
dockerfile: ./Dockerfile.test
dockerfile: ./test.Dockerfile
target: hugot-build
volumes:
- $test_folder:/test
Expand Down
14 changes: 7 additions & 7 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -4,19 +4,19 @@ go 1.22

require (
github.com/bodaay/HuggingFaceModelDownloader v0.0.0-20240307153905-2f38356a6d6c
github.com/daulet/tokenizers v0.9.0
github.com/json-iterator/go v1.1.12
github.com/knights-analytics/tokenizers v0.14.0
github.com/mattn/go-isatty v0.0.20
github.com/stretchr/testify v1.9.0
github.com/urfave/cli/v2 v2.27.3
github.com/urfave/cli/v2 v2.27.4
github.com/viant/afs v1.25.1
github.com/viant/afsc v1.9.3-0.20240719110652-735d07c594f3
github.com/yalue/onnxruntime_go v1.11.0
golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56
golang.org/x/exp v0.0.0-20240808152545-0cdaa3abc0fa
)

require (
github.com/aws/aws-sdk-go v1.55.4 // indirect
github.com/aws/aws-sdk-go v1.55.5 // indirect
github.com/cpuguy83/go-md2man/v2 v2.0.4 // indirect
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/fatih/color v1.17.0 // indirect
Expand All @@ -30,8 +30,8 @@ require (
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/russross/blackfriday/v2 v2.1.0 // indirect
github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1 // indirect
golang.org/x/crypto v0.25.0 // indirect
golang.org/x/oauth2 v0.21.0 // indirect
golang.org/x/sys v0.22.0 // indirect
golang.org/x/crypto v0.26.0 // indirect
golang.org/x/oauth2 v0.22.0 // indirect
golang.org/x/sys v0.24.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
)
32 changes: 16 additions & 16 deletions go.sum
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
github.com/aws/aws-sdk-go v1.55.4 h1:u7sFWQQs5ivGuYvCxi7gJI8nN/P9Dq04huLaw39a4lg=
github.com/aws/aws-sdk-go v1.55.4/go.mod h1:eRwEWoyTWFMVYVQzKMNHWP5/RV4xIUGMQfXQHfHkpNU=
github.com/aws/aws-sdk-go v1.55.5 h1:KKUZBfBoyqy5d3swXyiC7Q76ic40rYcbqH7qjh59kzU=
github.com/aws/aws-sdk-go v1.55.5/go.mod h1:eRwEWoyTWFMVYVQzKMNHWP5/RV4xIUGMQfXQHfHkpNU=
github.com/bodaay/HuggingFaceModelDownloader v0.0.0-20240307153905-2f38356a6d6c h1:3TPq2BhzOquTGmbS53KeGcM1yalBUb/4zQM1wmaINrE=
github.com/bodaay/HuggingFaceModelDownloader v0.0.0-20240307153905-2f38356a6d6c/go.mod h1:p6JQ7mJjWx82F+SrFfj9RkoHlKEGXR4959uX/vkMbzE=
github.com/cpuguy83/go-md2man/v2 v2.0.4 h1:wfIWP927BUkWJb2NmU/kNDYIBTh/ziUX91+lVfRxZq4=
github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
github.com/daulet/tokenizers v0.9.0 h1:PSjFUGeuhqb3C0GKP9hdvtHvJ6L1AZceV+0nYGACtCk=
github.com/daulet/tokenizers v0.9.0/go.mod h1:tGnMdZthXdcWY6DGD07IygpwJqiPvG85FQUnhs/wSCs=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
Expand All @@ -19,8 +21,6 @@ github.com/jmespath/go-jmespath/internal/testify v1.5.1 h1:shLQSRRSCCPj3f2gpwzGw
github.com/jmespath/go-jmespath/internal/testify v1.5.1/go.mod h1:L3OGu8Wl2/fWfCI6z80xFu9LTZmf1ZRjMHUOPmWr69U=
github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
github.com/knights-analytics/tokenizers v0.14.0 h1:Ww8dBnlhfCcchVFQxoIJp+kl9N+6/e5Vg4OkQqozXEw=
github.com/knights-analytics/tokenizers v0.14.0/go.mod h1:QCmtYGTdiEQYNFOf+MiRSNYZvDnFjJONcs9ZUmfKK6g=
github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI=
github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
Expand All @@ -45,8 +45,8 @@ github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
github.com/urfave/cli/v2 v2.27.3 h1:/POWahRmdh7uztQ3CYnaDddk0Rm90PyOgIxgW2rr41M=
github.com/urfave/cli/v2 v2.27.3/go.mod h1:m4QzxcD2qpra4z7WhzEGn74WZLViBnMpb1ToCAKdGRQ=
github.com/urfave/cli/v2 v2.27.4 h1:o1owoI+02Eb+K107p27wEX9Bb8eqIoZCfLXloLUSWJ8=
github.com/urfave/cli/v2 v2.27.4/go.mod h1:m4QzxcD2qpra4z7WhzEGn74WZLViBnMpb1ToCAKdGRQ=
github.com/viant/afs v1.25.1 h1:IPcqwzsPUaWqsSkQXoM1vXwQuRI6u7ZgqQHKQZ8Wxyg=
github.com/viant/afs v1.25.1/go.mod h1:rScbFd9LJPGTM8HOI8Kjwee0AZ+MZMupAvFpPg+Qdj4=
github.com/viant/afsc v1.9.3-0.20240719110652-735d07c594f3 h1:0ynp8+Pq3zuFPFoRGAYyaNc9xi7eBPXwCqYZpzMVywI=
Expand All @@ -55,18 +55,18 @@ github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1 h1:gEOO8jv9F4OT7lGC
github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1/go.mod h1:Ohn+xnUBiLI6FVj/9LpzZWtj1/D6lUovWYBkxHVV3aM=
github.com/yalue/onnxruntime_go v1.11.0 h1:aKH4yPIbqfcB3SfnQWq/WxzLelkyolntHnffL3eMBHY=
github.com/yalue/onnxruntime_go v1.11.0/go.mod h1:b4X26A8pekNb1ACJ58wAXgNKeUCGEAQ9dmACut9Sm/4=
golang.org/x/crypto v0.25.0 h1:ypSNr+bnYL2YhwoMt2zPxHFmbAN1KZs/njMG3hxUp30=
golang.org/x/crypto v0.25.0/go.mod h1:T+wALwcMOSE0kXgUAnPAHqTLW+XHgcELELW8VaDgm/M=
golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 h1:2dVuKD2vS7b0QIHQbpyTISPd0LeHDbnYEryqj5Q1ug8=
golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56/go.mod h1:M4RDyNAINzryxdtnbRXRL/OHtkFuWGRjvuhBJpk2IlY=
golang.org/x/oauth2 v0.21.0 h1:tsimM75w1tF/uws5rbeHzIWxEqElMehnc+iW793zsZs=
golang.org/x/oauth2 v0.21.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI=
golang.org/x/crypto v0.26.0 h1:RrRspgV4mU+YwB4FYnuBoKsUapNIL5cohGAmSH3azsw=
golang.org/x/crypto v0.26.0/go.mod h1:GY7jblb9wI+FOo5y8/S2oY4zWP07AkOJ4+jxCqdqn54=
golang.org/x/exp v0.0.0-20240808152545-0cdaa3abc0fa h1:ELnwvuAXPNtPk1TJRuGkI9fDTwym6AYBu0qzT8AcHdI=
golang.org/x/exp v0.0.0-20240808152545-0cdaa3abc0fa/go.mod h1:akd2r19cwCdwSwWeIdzYQGa/EZZyqcOdwWiwj5L5eKQ=
golang.org/x/oauth2 v0.22.0 h1:BzDx2FehcG7jJwgWLELCdmLuxk2i+x9UDpSiss2u0ZA=
golang.org/x/oauth2 v0.22.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI=
golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.22.0 h1:RI27ohtqKCnwULzJLqkv897zojh5/DwS/ENaMzUOaWI=
golang.org/x/sys v0.22.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/term v0.22.0 h1:BbsgPEJULsl2fV/AT3v15Mjva5yXKQDyKf+TbDz7QJk=
golang.org/x/term v0.22.0/go.mod h1:F3qCibpT5AMpCRfhfT53vVJwhLtIVHhB9XDjfFvnMI4=
golang.org/x/sys v0.24.0 h1:Twjiwq9dn6R1fQcyiK+wQyHWfaz/BJB+YIpzU/Cv3Xg=
golang.org/x/sys v0.24.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/term v0.23.0 h1:F6D4vR+EHoL9/sWAWgAR1H2DcHr4PareCbAaCo1RpuU=
golang.org/x/term v0.23.0/go.mod h1:DgV24QBUrK6jhZXl+20l6UWznPlwAHm1Q1mGHtydmSk=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 h1:qIbj1fsPNlZgppZ+VLlY7N33q108Sa+fhmuc+sWQYwY=
gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
Expand Down
2 changes: 1 addition & 1 deletion pipelines/pipeline.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ import (
"os"
"strings"

"github.com/knights-analytics/tokenizers"
"github.com/daulet/tokenizers"
ort "github.com/yalue/onnxruntime_go"

util "github.com/knights-analytics/hugot/utils"
Expand Down
2 changes: 1 addition & 1 deletion pipelines/tokenClassification.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ import (
"sync/atomic"
"time"

"github.com/knights-analytics/tokenizers"
"github.com/daulet/tokenizers"
ort "github.com/yalue/onnxruntime_go"

util "github.com/knights-analytics/hugot/utils"
Expand Down
4 changes: 2 additions & 2 deletions scripts/run-unit-tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,5 +23,5 @@ docker compose -f "$src_dir/compose-test.yaml" logs --no-color >& "$test_folder/
docker compose -f "$src_dir/compose-test.yaml" rm -fsv

echo "Extracting lib artifacts"
docker build -f ./Dockerfile.test . --output "$src_dir/artifacts" --target artifacts
echo "lib artifacts extracted"
docker build -f ./test.Dockerfile . --output "$src_dir/artifacts" --target artifacts
echo "lib artifacts extracted"
29 changes: 10 additions & 19 deletions Dockerfile.test → test.Dockerfile
Original file line number Diff line number Diff line change
@@ -1,30 +1,17 @@
#--- dockerfile to test hugot ---

ARG GO_VERSION=1.22.5
ARG RUST_VERSION=1.79
ARG GO_VERSION=1.22.6
ARG ONNXRUNTIME_VERSION=1.18.0
ARG BUILD_PLATFORM=linux/amd64

#--- rust build of tokenizer ---

FROM --platform=$BUILD_PLATFORM rust:$RUST_VERSION AS tokenizer

COPY ./go.mod .

RUN tokenizer_version=$(grep 'github.com/knights-analytics/tokenizers' go.mod | awk '{print $2}') && \
tokenizer_version=$(echo $tokenizer_version | awk -F'-' '{print $NF}') && \
echo "tokenizer_version: $tokenizer_version" && \
git clone https://github.com/knights-analytics/tokenizers && \
cd tokenizers && \
git checkout $tokenizer_version && \
cargo build --release

#--- build and test layer ---

FROM --platform=$BUILD_PLATFORM public.ecr.aws/amazonlinux/amazonlinux:2023 AS hugot-build
ARG GO_VERSION
ARG ONNXRUNTIME_VERSION

COPY ./go.mod .

RUN dnf -y install gcc jq bash tar xz gzip glibc-static libstdc++ wget zip git && \
ln -s /usr/lib64/libstdc++.so.6 /usr/lib64/libstdc++.so && \
dnf install -y 'dnf-command(config-manager)' && \
Expand All @@ -37,15 +24,19 @@ RUN dnf -y install gcc jq bash tar xz gzip glibc-static libstdc++ wget zip git &
dnf install -y libcudnn8 && \
dnf clean all

RUN tokenizer_version=$(grep 'github.com/daulet/tokenizers' go.mod | awk '{print $2}') && \
tokenizer_version=$(echo $tokenizer_version | awk -F'-' '{print $NF}') && \
echo "tokenizer_version: $tokenizer_version" && \
curl -LO https://github.com/daulet/tokenizers/releases/download/${tokenizer_version}/libtokenizers.linux-amd64.tar.gz && \
tar -C /usr/lib -xzf libtokenizers.linux-amd64.tar.gz && \
rm libtokenizers.linux-amd64.tar.gz

# go
RUN curl -LO https://golang.org/dl/go${GO_VERSION}.linux-amd64.tar.gz && \
tar -C /usr/local -xzf go${GO_VERSION}.linux-amd64.tar.gz && \
rm go${GO_VERSION}.linux-amd64.tar.gz
ENV PATH="$PATH:/usr/local/go/bin"

# tokenizer
COPY --from=tokenizer /tokenizers/target/release/libtokenizers.a /usr/lib/libtokenizers.a

# onnxruntime cpu and gpu
RUN curl -LO https://github.com/microsoft/onnxruntime/releases/download/v${ONNXRUNTIME_VERSION}/onnxruntime-linux-x64-${ONNXRUNTIME_VERSION}.tgz && \
tar -xzf onnxruntime-linux-x64-${ONNXRUNTIME_VERSION}.tgz && \
Expand Down

0 comments on commit 5421f62

Please sign in to comment.