diff --git a/Dockerfile b/Dockerfile index 346b448..544df23 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,46 +1,35 @@ #--- dockerfile with hugot dependencies and cli (cpu only) --- -ARG GO_VERSION=1.22.5 -ARG RUST_VERSION=1.79 +ARG GO_VERSION=1.22.6 ARG ONNXRUNTIME_VERSION=1.18.0 ARG BUILD_PLATFORM=linux/amd64 -#--- rust build of tokenizer --- - -FROM --platform=$BUILD_PLATFORM rust:$RUST_VERSION AS tokenizer - -COPY ./go.mod . - -RUN tokenizer_version=$(grep 'github.com/knights-analytics/tokenizers' go.mod | awk '{print $2}') && \ - tokenizer_version=$(echo $tokenizer_version | awk -F'-' '{print $NF}') && \ - echo "tokenizer_version: $tokenizer_version" && \ - git clone https://github.com/knights-analytics/tokenizers && \ - cd tokenizers && \ - git checkout $tokenizer_version && \ - cargo build --release - #--- build layer --- FROM --platform=$BUILD_PLATFORM public.ecr.aws/amazonlinux/amazonlinux:2023 AS hugot-build ARG GO_VERSION ARG ONNXRUNTIME_VERSION -RUN dnf -y install gcc jq bash tar xz gzip glibc-static libstdc++ wget zip git && \ +COPY ./go.mod . 
+ +RUN dnf -y install gcc jq bash tar xz gzip glibc-static libstdc++ wget zip git which && \ ln -s /usr/lib64/libstdc++.so.6 /usr/lib64/libstdc++.so && \ - dnf install -y 'dnf-command(config-manager)' && \ - dnf config-manager --add-repo https://download.fedoraproject.org/pub/fedora/linux/releases/39/Everything/x86_64/os/ && \ dnf clean all +RUN tokenizer_version=$(grep 'github.com/daulet/tokenizers' go.mod | awk '{print $2}') && \ + tokenizer_version=$(echo $tokenizer_version | awk -F'-' '{print $NF}') && \ + echo "tokenizer_version: $tokenizer_version" && \ + curl -LO https://github.com/daulet/tokenizers/releases/download/${tokenizer_version}/libtokenizers.linux-amd64.tar.gz && \ + tar -C /usr/lib -xzf libtokenizers.linux-amd64.tar.gz && \ + rm libtokenizers.linux-amd64.tar.gz + # go RUN curl -LO https://golang.org/dl/go${GO_VERSION}.linux-amd64.tar.gz && \ tar -C /usr/local -xzf go${GO_VERSION}.linux-amd64.tar.gz && \ rm go${GO_VERSION}.linux-amd64.tar.gz ENV PATH="$PATH:/usr/local/go/bin" -# tokenizer -COPY --from=tokenizer /tokenizers/target/release/libtokenizers.a /usr/lib/libtokenizers.a - -# onnxruntime cpu and gpu +# onnxruntime cpu RUN curl -LO https://github.com/microsoft/onnxruntime/releases/download/v${ONNXRUNTIME_VERSION}/onnxruntime-linux-x64-${ONNXRUNTIME_VERSION}.tgz && \ tar -xzf onnxruntime-linux-x64-${ONNXRUNTIME_VERSION}.tgz && \ mv ./onnxruntime-linux-x64-${ONNXRUNTIME_VERSION}/lib/libonnxruntime.so.${ONNXRUNTIME_VERSION} /usr/lib64/onnxruntime.so @@ -53,6 +42,6 @@ RUN cd ./cmd && CGO_ENABLED=1 CGO_LDFLAGS="-L/usr/lib/" GOOS=linux GOARCH=amd64 #--- final layer --- FROM --platform=$BUILD_PLATFORM public.ecr.aws/amazonlinux/amazonlinux:2023 AS final -COPY --from=tokenizer /tokenizers/target/release/libtokenizers.a /usr/lib/libtokenizers.a +COPY --from=hugot-build /usr/lib/libtokenizers.a /usr/lib/libtokenizers.a COPY --from=hugot-build /build/cmd/target /hugot-cli -COPY --from=hugot-build /usr/lib64/onnxruntime.so /usr/lib64/onnxruntime.so \ No 
newline at end of file +COPY --from=hugot-build /usr/lib64/onnxruntime.so /usr/lib64/onnxruntime.so diff --git a/README.md b/README.md index bd52337..7c9d413 100644 --- a/README.md +++ b/README.md @@ -45,7 +45,7 @@ Hugot can be used in two ways: as a library in your go application, or as a comm To use Hugot as a library in your application, you will need the following two dependencies on your system: -- the tokenizers.a file obtained from the releases section of this page (if you want to use alternative architecture from `linux/amd64` you will have to build the tokenizers.a yourself, see [here](https://github.com/knights-analytics/tokenizers). This file should be at /usr/lib/tokenizers.a so that hugot can load it. Alternatively, you can explicitly specify the path to the folder with the `libtokenizers.a` file using the `CGO_LDFLAGS` env variable, see the [dockerfile](./Dockerfile). +- the libtokenizers.a file obtained from the releases section of this page (if you want to use an architecture other than `linux/amd64` you will have to build libtokenizers.a yourself, see [here](https://github.com/daulet/tokenizers)). This file should be at /usr/lib/libtokenizers.a so that hugot can load it. Alternatively, you can explicitly specify the path to the folder with the `libtokenizers.a` file using the `CGO_LDFLAGS` env variable, see the [dockerfile](./Dockerfile). - the onnxruntime.go file obtained from the releases section of this page (if you want to use alternative architectures from `linux/amd64` you will have to download it from [the onnxruntime releases page](https://github.com/microsoft/onnxruntime/releases/), see the [dockerfile](./Dockerfile) as an example). Hugot looks for this file at /usr/lib/onnxruntime.so or /usr/lib64/onnxruntime.so by default.
A different location can be specified by passing the `WithOnnxLibraryPath()` option to `NewSession()`, e.g: ``` diff --git a/compose-dev.yaml b/compose-dev.yaml index 10cbd4b..12b39ae 100644 --- a/compose-dev.yaml +++ b/compose-dev.yaml @@ -4,7 +4,7 @@ services: platform: linux/amd64 build: context: . - dockerfile: ./Dockerfile.test + dockerfile: ./test.Dockerfile target: hugot-build volumes: - $src_dir:/home/testuser/repositories/hugot diff --git a/compose-test.yaml b/compose-test.yaml index 7e56abe..bd3b6fa 100644 --- a/compose-test.yaml +++ b/compose-test.yaml @@ -5,7 +5,7 @@ services: container_name: hugot build: context: . - dockerfile: ./Dockerfile.test + dockerfile: ./test.Dockerfile target: hugot-build volumes: - $test_folder:/test diff --git a/go.mod b/go.mod index 81b2502..3fd839f 100644 --- a/go.mod +++ b/go.mod @@ -4,19 +4,19 @@ go 1.22 require ( github.com/bodaay/HuggingFaceModelDownloader v0.0.0-20240307153905-2f38356a6d6c + github.com/daulet/tokenizers v0.9.0 github.com/json-iterator/go v1.1.12 - github.com/knights-analytics/tokenizers v0.14.0 github.com/mattn/go-isatty v0.0.20 github.com/stretchr/testify v1.9.0 github.com/urfave/cli/v2 v2.27.3 github.com/viant/afs v1.25.1 github.com/viant/afsc v1.9.3-0.20240719110652-735d07c594f3 github.com/yalue/onnxruntime_go v1.11.0 - golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 + golang.org/x/exp v0.0.0-20240808152545-0cdaa3abc0fa ) require ( - github.com/aws/aws-sdk-go v1.55.4 // indirect + github.com/aws/aws-sdk-go v1.55.5 // indirect github.com/cpuguy83/go-md2man/v2 v2.0.4 // indirect github.com/davecgh/go-spew v1.1.1 // indirect github.com/fatih/color v1.17.0 // indirect @@ -30,8 +30,8 @@ require ( github.com/pmezard/go-difflib v1.0.0 // indirect github.com/russross/blackfriday/v2 v2.1.0 // indirect github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1 // indirect - golang.org/x/crypto v0.25.0 // indirect - golang.org/x/oauth2 v0.21.0 // indirect - golang.org/x/sys v0.22.0 // indirect + 
golang.org/x/crypto v0.26.0 // indirect + golang.org/x/oauth2 v0.22.0 // indirect + golang.org/x/sys v0.24.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect ) diff --git a/go.sum b/go.sum index 46d4bb3..2946d4a 100644 --- a/go.sum +++ b/go.sum @@ -1,10 +1,12 @@ -github.com/aws/aws-sdk-go v1.55.4 h1:u7sFWQQs5ivGuYvCxi7gJI8nN/P9Dq04huLaw39a4lg= -github.com/aws/aws-sdk-go v1.55.4/go.mod h1:eRwEWoyTWFMVYVQzKMNHWP5/RV4xIUGMQfXQHfHkpNU= +github.com/aws/aws-sdk-go v1.55.5 h1:KKUZBfBoyqy5d3swXyiC7Q76ic40rYcbqH7qjh59kzU= +github.com/aws/aws-sdk-go v1.55.5/go.mod h1:eRwEWoyTWFMVYVQzKMNHWP5/RV4xIUGMQfXQHfHkpNU= github.com/bodaay/HuggingFaceModelDownloader v0.0.0-20240307153905-2f38356a6d6c h1:3TPq2BhzOquTGmbS53KeGcM1yalBUb/4zQM1wmaINrE= github.com/bodaay/HuggingFaceModelDownloader v0.0.0-20240307153905-2f38356a6d6c/go.mod h1:p6JQ7mJjWx82F+SrFfj9RkoHlKEGXR4959uX/vkMbzE= github.com/cpuguy83/go-md2man/v2 v2.0.4 h1:wfIWP927BUkWJb2NmU/kNDYIBTh/ziUX91+lVfRxZq4= github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= +github.com/daulet/tokenizers v0.9.0 h1:PSjFUGeuhqb3C0GKP9hdvtHvJ6L1AZceV+0nYGACtCk= +github.com/daulet/tokenizers v0.9.0/go.mod h1:tGnMdZthXdcWY6DGD07IygpwJqiPvG85FQUnhs/wSCs= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= @@ -19,8 +21,6 @@ github.com/jmespath/go-jmespath/internal/testify v1.5.1 h1:shLQSRRSCCPj3f2gpwzGw github.com/jmespath/go-jmespath/internal/testify v1.5.1/go.mod h1:L3OGu8Wl2/fWfCI6z80xFu9LTZmf1ZRjMHUOPmWr69U= github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= 
-github.com/knights-analytics/tokenizers v0.14.0 h1:Ww8dBnlhfCcchVFQxoIJp+kl9N+6/e5Vg4OkQqozXEw= -github.com/knights-analytics/tokenizers v0.14.0/go.mod h1:QCmtYGTdiEQYNFOf+MiRSNYZvDnFjJONcs9ZUmfKK6g= github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI= github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= @@ -55,18 +55,18 @@ github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1 h1:gEOO8jv9F4OT7lGC github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1/go.mod h1:Ohn+xnUBiLI6FVj/9LpzZWtj1/D6lUovWYBkxHVV3aM= github.com/yalue/onnxruntime_go v1.11.0 h1:aKH4yPIbqfcB3SfnQWq/WxzLelkyolntHnffL3eMBHY= github.com/yalue/onnxruntime_go v1.11.0/go.mod h1:b4X26A8pekNb1ACJ58wAXgNKeUCGEAQ9dmACut9Sm/4= -golang.org/x/crypto v0.25.0 h1:ypSNr+bnYL2YhwoMt2zPxHFmbAN1KZs/njMG3hxUp30= -golang.org/x/crypto v0.25.0/go.mod h1:T+wALwcMOSE0kXgUAnPAHqTLW+XHgcELELW8VaDgm/M= -golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 h1:2dVuKD2vS7b0QIHQbpyTISPd0LeHDbnYEryqj5Q1ug8= -golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56/go.mod h1:M4RDyNAINzryxdtnbRXRL/OHtkFuWGRjvuhBJpk2IlY= -golang.org/x/oauth2 v0.21.0 h1:tsimM75w1tF/uws5rbeHzIWxEqElMehnc+iW793zsZs= -golang.org/x/oauth2 v0.21.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI= +golang.org/x/crypto v0.26.0 h1:RrRspgV4mU+YwB4FYnuBoKsUapNIL5cohGAmSH3azsw= +golang.org/x/crypto v0.26.0/go.mod h1:GY7jblb9wI+FOo5y8/S2oY4zWP07AkOJ4+jxCqdqn54= +golang.org/x/exp v0.0.0-20240808152545-0cdaa3abc0fa h1:ELnwvuAXPNtPk1TJRuGkI9fDTwym6AYBu0qzT8AcHdI= +golang.org/x/exp v0.0.0-20240808152545-0cdaa3abc0fa/go.mod h1:akd2r19cwCdwSwWeIdzYQGa/EZZyqcOdwWiwj5L5eKQ= +golang.org/x/oauth2 v0.22.0 h1:BzDx2FehcG7jJwgWLELCdmLuxk2i+x9UDpSiss2u0ZA= +golang.org/x/oauth2 v0.22.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI= golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod 
h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.22.0 h1:RI27ohtqKCnwULzJLqkv897zojh5/DwS/ENaMzUOaWI= -golang.org/x/sys v0.22.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/term v0.22.0 h1:BbsgPEJULsl2fV/AT3v15Mjva5yXKQDyKf+TbDz7QJk= -golang.org/x/term v0.22.0/go.mod h1:F3qCibpT5AMpCRfhfT53vVJwhLtIVHhB9XDjfFvnMI4= +golang.org/x/sys v0.24.0 h1:Twjiwq9dn6R1fQcyiK+wQyHWfaz/BJB+YIpzU/Cv3Xg= +golang.org/x/sys v0.24.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/term v0.23.0 h1:F6D4vR+EHoL9/sWAWgAR1H2DcHr4PareCbAaCo1RpuU= +golang.org/x/term v0.23.0/go.mod h1:DgV24QBUrK6jhZXl+20l6UWznPlwAHm1Q1mGHtydmSk= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 h1:qIbj1fsPNlZgppZ+VLlY7N33q108Sa+fhmuc+sWQYwY= gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= diff --git a/pipelines/pipeline.go b/pipelines/pipeline.go index 6886d62..a3ff679 100644 --- a/pipelines/pipeline.go +++ b/pipelines/pipeline.go @@ -8,7 +8,7 @@ import ( "os" "strings" - "github.com/knights-analytics/tokenizers" + "github.com/daulet/tokenizers" ort "github.com/yalue/onnxruntime_go" util "github.com/knights-analytics/hugot/utils" diff --git a/pipelines/tokenClassification.go b/pipelines/tokenClassification.go index 99806e3..cb57b72 100644 --- a/pipelines/tokenClassification.go +++ b/pipelines/tokenClassification.go @@ -9,7 +9,7 @@ import ( "sync/atomic" "time" - "github.com/knights-analytics/tokenizers" + "github.com/daulet/tokenizers" ort "github.com/yalue/onnxruntime_go" util "github.com/knights-analytics/hugot/utils" diff --git a/scripts/run-unit-tests.sh b/scripts/run-unit-tests.sh index e8ea146..029ef12 100755 --- a/scripts/run-unit-tests.sh +++ b/scripts/run-unit-tests.sh @@ -23,5 +23,5 @@ docker 
compose -f "$src_dir/compose-test.yaml" logs --no-color >& "$test_folder/ docker compose -f "$src_dir/compose-test.yaml" rm -fsv echo "Extracting lib artifacts" -docker build -f ./Dockerfile.test . --output "$src_dir/artifacts" --target artifacts -echo "lib artifacts extracted" \ No newline at end of file +docker build -f ./test.Dockerfile . --output "$src_dir/artifacts" --target artifacts +echo "lib artifacts extracted" diff --git a/Dockerfile.test b/test.Dockerfile similarity index 87% rename from Dockerfile.test rename to test.Dockerfile index 31e2dc4..81ae5df 100644 --- a/Dockerfile.test +++ b/test.Dockerfile @@ -1,30 +1,17 @@ #--- dockerfile to test hugot --- -ARG GO_VERSION=1.22.5 -ARG RUST_VERSION=1.79 +ARG GO_VERSION=1.22.6 ARG ONNXRUNTIME_VERSION=1.18.0 ARG BUILD_PLATFORM=linux/amd64 -#--- rust build of tokenizer --- - -FROM --platform=$BUILD_PLATFORM rust:$RUST_VERSION AS tokenizer - -COPY ./go.mod . - -RUN tokenizer_version=$(grep 'github.com/knights-analytics/tokenizers' go.mod | awk '{print $2}') && \ - tokenizer_version=$(echo $tokenizer_version | awk -F'-' '{print $NF}') && \ - echo "tokenizer_version: $tokenizer_version" && \ - git clone https://github.com/knights-analytics/tokenizers && \ - cd tokenizers && \ - git checkout $tokenizer_version && \ - cargo build --release - #--- build and test layer --- FROM --platform=$BUILD_PLATFORM public.ecr.aws/amazonlinux/amazonlinux:2023 AS hugot-build ARG GO_VERSION ARG ONNXRUNTIME_VERSION +COPY ./go.mod . 
+ RUN dnf -y install gcc jq bash tar xz gzip glibc-static libstdc++ wget zip git && \ ln -s /usr/lib64/libstdc++.so.6 /usr/lib64/libstdc++.so && \ dnf install -y 'dnf-command(config-manager)' && \ @@ -37,15 +24,19 @@ RUN dnf -y install gcc jq bash tar xz gzip glibc-static libstdc++ wget zip git & dnf install -y libcudnn8 && \ dnf clean all +RUN tokenizer_version=$(grep 'github.com/daulet/tokenizers' go.mod | awk '{print $2}') && \ + tokenizer_version=$(echo $tokenizer_version | awk -F'-' '{print $NF}') && \ + echo "tokenizer_version: $tokenizer_version" && \ + curl -LO https://github.com/daulet/tokenizers/releases/download/${tokenizer_version}/libtokenizers.linux-amd64.tar.gz && \ + tar -C /usr/lib -xzf libtokenizers.linux-amd64.tar.gz && \ + rm libtokenizers.linux-amd64.tar.gz + # go RUN curl -LO https://golang.org/dl/go${GO_VERSION}.linux-amd64.tar.gz && \ tar -C /usr/local -xzf go${GO_VERSION}.linux-amd64.tar.gz && \ rm go${GO_VERSION}.linux-amd64.tar.gz ENV PATH="$PATH:/usr/local/go/bin" -# tokenizer -COPY --from=tokenizer /tokenizers/target/release/libtokenizers.a /usr/lib/libtokenizers.a - # onnxruntime cpu and gpu RUN curl -LO https://github.com/microsoft/onnxruntime/releases/download/v${ONNXRUNTIME_VERSION}/onnxruntime-linux-x64-${ONNXRUNTIME_VERSION}.tgz && \ tar -xzf onnxruntime-linux-x64-${ONNXRUNTIME_VERSION}.tgz && \