Skip to content

Commit

Permalink
Merge branch 'master' into feature/dropwizard2
Browse files Browse the repository at this point in the history
  • Loading branch information
lfoppiano committed Dec 15, 2023
2 parents bd30ba2 + 29da579 commit 8105411
Show file tree
Hide file tree
Showing 9 changed files with 295 additions and 107 deletions.
22 changes: 22 additions & 0 deletions .github/workflows/ci-build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,3 +28,25 @@ jobs:
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
format: jacoco

# Builds the Docker image from Dockerfile.local once the `build` job has
# succeeded. The image is pushed to Docker Hub for every event except pull
# requests, where it is only built.
docker-build:
  needs: [ build ]
  runs-on: ubuntu-latest

  steps:
    # Remove preinstalled toolchains from the runner to free disk space
    # before the image build.
    - name: Create more disk space
      run: sudo rm -rf /usr/share/dotnet && sudo rm -rf /opt/ghc && sudo rm -rf "/usr/local/share/boost" && sudo rm -rf "$AGENT_TOOLSDIRECTORY"
    # checkout@v2 is a deprecated major version; use the maintained one.
    - uses: actions/checkout@v4
    - name: Build and push
      id: docker_build
      uses: mr-smithers-excellent/docker-build-push@v5
      with:
        dockerfile: Dockerfile.local
        username: ${{ secrets.DOCKERHUB_USERNAME }}
        password: ${{ secrets.DOCKERHUB_TOKEN }}
        image: lfoppiano/grobid-quantities
        registry: docker.io
        # Build only (no push) when the workflow runs for a pull request.
        pushImage: ${{ github.event_name != 'pull_request' }}
        tags: latest-develop
    # NOTE(review): docker-build-push documents the outputs imageFullName,
    # imageName and tags; it does not appear to expose `digest`, so the
    # previous `outputs.digest` reference printed an empty line - confirm
    # against the action's README.
    - name: Image name
      run: echo "${{ steps.docker_build.outputs.imageFullName }}"
14 changes: 14 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,20 @@ All notable changes to this project will be documented in this file.

The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).

## [0.8.0]

### Added

+ Docker image snapshots are built and pushed to Docker Hub on each commit
+ New `Dockerfile.local` that builds from the local checkout instead of cloning from GitHub

### Changed

+ Updated to Grobid version 0.8.0
+ Updated to Dropwizard version 4.x (from version 1.x)



## [0.7.3] – 2023-06-26

### Added
Expand Down
121 changes: 121 additions & 0 deletions Dockerfile.local
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
## Docker GROBID-quantities image using deep learning models and/or CRF models, and various python modules
## Borrowed from https://github.com/kermitt2/grobid/blob/master/Dockerfile.delft
## See https://grobid.readthedocs.io/en/latest/Grobid-docker/

## usage example with grobid: https://github.com/kermitt2/grobid/blob/master/Dockerfile.delft

## docker build -t lfoppiano/grobid-quantities:0.7.1 --build-arg GROBID_VERSION=0.7.1 --file Dockerfile.local .

## no GPU:
## docker run -t --rm --init -p 8060:8060 -p 8061:8061 -v $(pwd)/config.yml:/opt/grobid/grobid-quantities/resources/config/config.yml:ro lfoppiano/grobid-quantities:0.7.1

## allocate all available GPUs (only Linux with proper nvidia driver installed on host machine):
## docker run --rm --gpus all --init -p 8060:8060 -p 8061:8061 -v $(pwd)/grobid.yaml:/opt/grobid/grobid-home/config/grobid.yaml:ro lfoppiano/grobid-quantities:0.7.1

# -------------------
# build builder image
# -------------------

# Builder stage: compiles grobid-quantities from the local build context
# (no git clone), fetches the models and unpacks the distribution zip so the
# runtime stage can copy the result.
FROM openjdk:17-jdk-slim AS builder

USER root

# Tools needed only while building. The apt package lists are removed in the
# same layer so they are not carried along in the layer cache.
RUN apt-get update && \
    apt-get -y --no-install-recommends install apt-utils libxml2 git unzip && \
    rm -rf /var/lib/apt/lists/*

WORKDIR /opt/grobid

RUN mkdir -p grobid-quantities-source grobid-home/models
COPY src grobid-quantities-source/src
COPY settings.gradle grobid-quantities-source/
# The Docker-specific configuration is installed under the default name
# config.yml, which is the file build.gradle reads to locate grobid-home.
COPY resources/config/config-docker.yml grobid-quantities-source/resources/config/config.yml
COPY resources/models grobid-quantities-source/resources/models
COPY resources/clearnlp/models/* grobid-quantities-source/resources/clearnlp/models/
COPY build.gradle grobid-quantities-source/
COPY gradle.properties grobid-quantities-source/
COPY gradle grobid-quantities-source/gradle/
COPY gradlew grobid-quantities-source/
# NOTE(review): .git is presumably needed by the Gradle build (e.g. for
# version metadata) - confirm, since it enlarges the build context.
COPY .git grobid-quantities-source/.git
COPY localLibs grobid-quantities-source/localLibs

# Preparing models
WORKDIR /opt/grobid/grobid-quantities-source
# Make sure the models directory starts empty before Gradle populates it.
RUN rm -rf /opt/grobid/grobid-home/models/*
RUN ./gradlew clean assemble -x shadowJar --no-daemon --stacktrace --info
#RUN ./gradlew copyModels --info --no-daemon
# downloadTransformers depends on copyModels (see build.gradle), so a single
# invocation runs both tasks; the downloaded archive is deleted once unpacked.
RUN ./gradlew downloadTransformers --no-daemon --info --stacktrace && rm -f /opt/grobid/grobid-home/models/*.zip

# Preparing distribution
WORKDIR /opt/grobid
# Unpack the versioned distribution zip and rename it to a version-independent
# directory so the runtime stage can copy a fixed path.
RUN unzip -o /opt/grobid/grobid-quantities-source/build/distributions/grobid-quantities-*.zip -d grobid-quantities_distribution && mv grobid-quantities_distribution/grobid-quantities-* grobid-quantities

WORKDIR /opt

# -------------------
# build runtime image
# -------------------

# Runtime stage: starts from the GROBID image and adds the models, the
# unpacked grobid-quantities distribution and its configuration produced by
# the builder stage.
# NOTE(review): CHANGELOG 0.8.0 states "Updated to Grobid version 0.8.0" while
# this base image is still 0.7.3 - confirm the intended tag.
FROM grobid/grobid:0.7.3 AS runtime

# setting locale is likely useless but to be sure
ENV LANG=C.UTF-8

# The apt package lists are removed in the same layer to keep the final image small.
RUN apt-get update && \
    apt-get -y --no-install-recommends install git wget && \
    rm -rf /var/lib/apt/lists/*

WORKDIR /opt/grobid

RUN mkdir -p /opt/grobid/grobid-quantities/resources/clearnlp/models /opt/grobid/grobid-quantities/resources/clearnlp/config
COPY --from=builder /opt/grobid/grobid-home/models ./grobid-home/models
COPY --from=builder /opt/grobid/grobid-quantities ./grobid-quantities/
COPY --from=builder /opt/grobid/grobid-quantities-source/resources/config/config.yml ./grobid-quantities/resources/config/
COPY --from=builder /opt/grobid/grobid-quantities-source/resources/clearnlp/models/* ./grobid-quantities/resources/clearnlp/models/

VOLUME ["/opt/grobid/grobid-home/tmp"]

# Expose the service resources under /opt/grobid/resources as well.
RUN ln -s /opt/grobid/grobid-quantities/resources /opt/grobid/resources

# JProfiler
#RUN wget https://download-gcdn.ej-technologies.com/jprofiler/jprofiler_linux_12_0_2.tar.gz -P /tmp/ && \
#    tar -xzf /tmp/jprofiler_linux_12_0_2.tar.gz -C /usr/local &&\
#    rm /tmp/jprofiler_linux_12_0_2.tar.gz

WORKDIR /opt/grobid
# GROBID_VERSION can be supplied with --build-arg; it falls back to "latest".
ARG GROBID_VERSION
ENV GROBID_VERSION=${GROBID_VERSION:-latest}
# NOTE(review): the python3.8 path presumably matches the Python shipped in
# the base image - re-check it whenever the base image tag changes.
ENV GROBID_QUANTITIES_OPTS="-Djava.library.path=/opt/grobid/grobid-home/lib/lin-64:/usr/local/lib/python3.8/dist-packages/jep --add-opens java.base/java.lang=ALL-UNNAMED"

# This code removes the fixed seed in DeLFT to increase the uncertainty
#RUN sed -i '/seed(7)/d' /usr/local/lib/python3.8/dist-packages/delft/utilities/Utilities.py
#RUN sed -i '/from numpy\.random import seed/d' /usr/local/lib/python3.8/dist-packages/delft/utilities/Utilities.py

# 5005 is the JDWP debug port used by the commented-out debug CMD below.
EXPOSE 8060 8061 5005

#CMD ["java", "-agentpath:/usr/local/jprofiler12.0.2/bin/linux-x64/libjprofilerti.so=port=8849", "-jar", "grobid-quantities/grobid-quantities-${GROBID_VERSION}-onejar.jar", "server", "grobid-quantities/config.yml"]
#CMD ["sh", "-c", "java -agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=0.0.0.0:5005 -jar grobid-quantities/grobid-quantities-${GROBID_VERSION}-onejar.jar server grobid-quantities/config.yml"]
#CMD ["sh", "-c", "java -jar grobid-quantities/grobid-quantities-${GROBID_VERSION}-onejar.jar server grobid-quantities/config.yml"]
CMD ["./grobid-quantities/bin/grobid-quantities", "server", "grobid-quantities/resources/config/config.yml"]


LABEL \
    authors="Luca Foppiano, Patrice Lopez" \
    org.label-schema.name="grobid-quantities" \
    org.label-schema.description="Docker image for grobid-quantities service" \
    org.label-schema.url="https://github.com/kermitt2/grobid-quantities" \
    org.label-schema.version="${GROBID_VERSION}"


## Docker tricks:

# - remove all stopped containers
# > docker rm $(docker ps -a -q)

# - remove all unused images
# > docker rmi $(docker images --filter "dangling=true" -q --no-trunc)

# - remove all untagged images
# > docker rmi $(docker images | grep "^<none>" | awk '{print $3}')

# - "Cannot connect to the Docker daemon. Is the docker daemon running on this host?"
# > docker-machine restart

101 changes: 0 additions & 101 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,107 +75,6 @@ results especially for what concern the unit normalisation.

You can find the latest documentation [here](http://grobid-quantities.readthedocs.io).

## Evaluation

The results (Precision, Recall, F-score) for all the models have been obtained using a holdout set.
For DL models we provide the average over 5 runs.
Updated on 27/10/2022.

#### Quantities

| Labels | CRF | | | **BidLSTM_CRF** | | | **BidLSTM_CRF_FEATURES** | | | **BERT_CRF** | | |
|------------------|---------------|------------|--------------|-----------------|------------|--------------|--------------------------|------------|--------------|---------------|------------|--------------|
| Metrics | **Precision** | **Recall** | **F1-Score** | **Precision** | **Recall** | **F1-Score** | **Precision** | **Recall** | **F1-Score** | **Precision** | **Recall** | **F1-Score** |
| `<unitLeft>` | 88.74 | 83.19 | 85.87 | 88.56 | 92.07 | 90.28 | 88.91 | 92.20 | 90.53 | 93.99 | 90.30 | 92.11 |
| `<unitRight>` | 30.77 | 30.77 | 30.77 | 24.75 | 30.77 | 27.42 | 21.73 | 30.77 | 25.41 | 21.84 | 36.92 | 27.44 |
| `<valueAtomic>` | 76.29 | 78.66 | 77.46 | 78.14 | 86.06 | 81.90 | 78.21 | 86.20 | 82.01 | 84.50 | 88.19 | 86.31 |
| `<valueBase>` | 84.62 | 62.86 | 72.13 | 83.51 | 94.86 | 88.61 | 83.36 | 97.14 | 89.72 | 100.00 | 90.86 | 95.20 |
| `<valueLeast>` | 77.68 | 69.05 | 73.11 | 82.14 | 60.63 | 69.67 | 80.73 | 60.63 | 69.12 | 81.09 | 71.59 | 76.04 |
| `<valueList>` | 45.45 | 18.87 | 26.67 | 62.15 | 10.19 | 17.34 | 73.33 | 8.68 | 15.33 | 64.12 | 43.78 | 51.64 |
| `<valueMost>` | 71.62 | 54.64 | 61.99 | 77.64 | 68.25 | 72.61 | 77.25 | 70.31 | 73.58 | 81.52 | 67.42 | 73.71 |
| `<valueRange>` | 100 | 97.14 | 98.55 | 96.72 | 100.00 | 98.32 | 94.05 | 98.86 | 96.38 | 99.39 | 91.43 | 95.24 |
| -- | | | | | | | | | | | | |
| All (micro avg) | 80.08 | 75 | 77.45 | 81.81 | 81.73 | 81.76 | 81.76 | 81.94 | 81.85 | 86.24 | 83.96 | 85.08 |

#### Units

| | **CRF** | | | **BidLSTM_CRF** | | | **BidLSTM_CRF_FEATURES** | | | **BERT_CRF** | | |
|-----------------|---------------|------------|--------------|-----------------|------------|--------------|--------------------------|------------|--------------|---------------|------------|--------------|
| Labels | **Precision** | **Recall** | **F1-Score** | **Precision** | **Recall** | **F1-Score** | **Precision** | **Recall** | **F1-Score** | **Precision** | **Recall** | **F1-Score** |
| `<base>` | 80.57 | 82.34 | 81.45 | 56.01 | 50.34 | 53.02 | 59.98 | 56.33 | 58.09 | 61.41 | 57.08 | 59.16 |
| `<pow>` | 72.65 | 74.45 | 73.54 | 93.70 | 62.38 | 74.88 | 93.71 | 68.40 | 78.94 | 91.24 | 64.60 | 75.60 |
| `<prefix>` | 93.8 | 84.69 | 89.02 | 80.31 | 85.25 | 82.54 | 83.21 | 83.58 | 83.35 | 82.10 | 85.30 | 83.62 |
| -- | | | | | | | | | | | | |
| All (micro avg) | 80.73 | 80.6 | 80.66 | 70.19 | 60.88 | 65.20 | 73.03 | 65.31 | 68.94 | 73.02 | 64.97 | 68.76 |

#### Values

| | **CRF** | | | **BidLSTM_CRF** | | | **BidLSTM_CRF_FEATURES** | | | **BERT_CRF** | | |
|-----------------|---------------|------------|--------------|-----------------|------------|----------|--------------------------|------------|--------------|-----------------|------------|--------------|
| Labels | **Precision** | **Recall** | **F1-Score** | **Precision** | **Recall** | F1-Score | **Precision** | **Recall** | **F1-Score** | **Precision** | **Recall** | **F1-Score** |
| `<alpha>` | 98.06 | 96.03 | 92.02 | 97.67 | 99.53 | 98.58 | 97.82 | 99.53 | 98.66 | 98.59 | 99.53 | 99.05 |
| `<base>` | 99.91 | 92.31 | 96 | 96.92 | 92.31 | 94.52 | 96.92 | 93.85 | 95.32 | 90.40 | 98.46 | 92.88 |
| `<number>` | 97.5 | 99.88 | 98.36 | 99.24 | 99.34 | 99.29 | 99.21 | 99.38 | 99.30 | 99.48 | 99.31 | 99.40 |
| `<pow>` | 100 | 100 | 100 | 92.92 | 92.31 | 92.47 | 90.28 | 93.85 | 91.90 | 100.00 | 100.00 | 100.00 |
| -- | | | | | | | | | | | | |
| All (micro avg) | 95.79 | 99.27 | 97.5 | 98.90 | 99.17 | 99.03 | 98.86 | 99.25 | 99.05 | 99.13 | 99.33 | 99.23 |

<details>
<summary>Previous evaluations</summary>

Previous evaluations were obtained using 10-fold cross-validation (with metrics averaged over the 10 folds).

The `CRF` model was evaluated on the 30/04/2020.
The `BidLSTM_CRF_FEATURES` model was evaluated on the 28/11/2021

#### Quantities

| | CRF | | | BidLSTM_CRF_FEATURES | | |
|-----------------|---------------|------------|--------------|----------------------|------------|----------|
| Labels | **Precision** | **Recall** | **F1-Score** | **Precision** | **Recall** | F1-Score |
| `<unitLeft>` | 96.45 | 95.06 | 95.74 | 95.17 | 96.67 | 95.91 |
| `<unitRight>` | 88.96 | 68.65 | 75.43 | 92.52 | 83.64 | 87.69 |
| `<valueAtomic>` | 85.75 | 85.35 | 85.49 | 81.74 | 89.21 | 85.30 |
| `<valueBase>` | 73.06 | 66.43 | 68.92 | 100.00 | 75.00 | 85.71 |
| `<valueLeast>` | 85.68 | 79.03 | 82.07 | 89.24 | 82.25 | 85.55 |
| `<valueList>` | 68.38 | 53.31 | 58.94 | 75.27 | 75.33 | 75.12 |
| `<valueMost>` | 83.67 | 75.82 | 79.42 | 89.02 | 81.56 | 85.10 |
| `<valueRange>` | 90.25 | 88.58 | 88.86 | 100.00 | 96.25 | 97.90 |
| -- | | | | | | |
| All (micro avg) | 88.96 | 85.4 | 87.14 | 87.23 | 89.00 | 88.10 |

#### Units

The CRF model was updated on 10/02/2021.

| | CRF | | | BidLSTM_CRF_FEATURES | | |
|-----------------|---------------|------------|--------------|----------------------|------------|----------|
| Labels | **Precision** | **Recall** | **F1-Score** | **Precision** | **Recall** | F1-Score |
| `<base>` | 98.82 | 99.14 | 98.98 | 98.26 | 98.52 | 98.39 |
| `<pow>` | 97.62 | 98.56 | 98.08 | 100.00 | 98.57 | 99.28 |
| `<prefix>` | 99.5 | 98.76 | 99.13 | 98.89 | 97.75 | 98.30 |
| -- | | | | | | |
| All (micro avg) | 98.85 | 99.01 | 98.93 | 98.51 | 98.39 | 98.45 |

#### Values

| | CRF | | | BidLSTM_CRF_FEATURES | | |
|-----------------|---------------|------------|--------------|----------------------|------------|----------|
| Labels | **Precision** | **Recall** | **F1-Score** | **Precision** | **Recall** | F1-Score |
| `<alpha>` | 96.9 | 98.84 | 97.85 | 99.41 | 99.55 | 99.48 |
| `<base>` | 85.14 | 74.48 | 79 | 96.67 | 100.00 | 98.00 |
| `<number>` | 98.07 | 99.05 | 98.55 | 99.55 | 98.68 | 99.11 |
| `<pow>` | 80.05 | 76.33 | 77.54 | 72.50 | 75.00 | 73.50 |
| `<time>` | 73.07 | 86.82 | 79.26 | 80.84 | 100.00 | 89.28 |
| -- | | | | | | |
| All (micro avg) | 96.15 | 97.95 | 97.4 | 98.49 | 98.66 | 98.57 |

</details>

The current average results have been calculated using the micro average, which provides more realistic results by
weighting labels according to their frequency.
The [paper](https://hal.inria.fr/hal-02294424) "Automatic Identification and Normalisation of Physical Measurements in
Scientific Literature", published in September 2019, reported average evaluation based on the macro average.

## Acknowledgement

Expand Down
11 changes: 7 additions & 4 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -342,6 +342,9 @@ publishing {

// Load the service configuration to find out where grobid-home lives.
// withInputStream closes the stream once the YAML has been parsed (the
// previous newInputStream() call left it open).
def conf = new File("resources/config/config.yml").withInputStream { stream ->
    new org.yaml.snakeyaml.Yaml().load(stream)
}
// The value may be written as "${GROBID_HOME:- <path>}": strip the placeholder
// syntax and keep only the default path.
def grobidHome = conf.grobidHome.replace("\$", "").replace('{', "").replace("GROBID_HOME:- ", "").replace("}", "")
// Anchor every relative path (not only those starting with "../") to the root
// project directory, so the model tasks do not depend on the working
// directory Gradle was started from. Absolute paths are used unchanged.
if (!new File(grobidHome).isAbsolute()) {
    grobidHome = "${rootProject.rootDir}/${grobidHome}"
}

/** Model management **/

Expand All @@ -355,7 +358,7 @@ task copyModels(type: Copy) {
include "**/preprocessor.json"
exclude "**/features-engineering/**"
exclude "**/result-logs/**"
into "${rootDir}/${grobidHome}/models/"
into "${grobidHome}/models/"

doLast {
print "Copy models under grobid-home: ${grobidHome}"
Expand All @@ -366,11 +369,11 @@ task downloadTransformers(dependsOn: copyModels) {
doLast {
download {
src "https://transformers-data.s3.eu-central-1.amazonaws.com/quantities-transformers.zip"
dest "${rootDir}/${grobidHome}/models/quantities-transformers.zip"
dest "${grobidHome}/models/quantities-transformers.zip"
overwrite false
print "Download bulky transformers files under grobid-home: ${grobidHome}"
}
ant.unzip(src: "${rootDir}/${grobidHome}/models/quantities-transformers.zip", dest: "${rootDir}/${grobidHome}/models/")
ant.unzip(src: "${grobidHome}/models/quantities-transformers.zip", dest: "${grobidHome}/models/")
}
}

Expand All @@ -397,4 +400,4 @@ release {
git {
requireBranch.set('test')
}
}
}
Loading

0 comments on commit 8105411

Please sign in to comment.