diff --git a/.github/workflows/ci-build.yml b/.github/workflows/ci-build.yml index a9e858cd..62312dce 100644 --- a/.github/workflows/ci-build.yml +++ b/.github/workflows/ci-build.yml @@ -28,3 +28,25 @@ jobs: with: github-token: ${{ secrets.GITHUB_TOKEN }} format: jacoco + + docker-build: + needs: [ build ] + runs-on: ubuntu-latest + + steps: + - name: Free up disk space + run: sudo rm -rf /usr/share/dotnet && sudo rm -rf /opt/ghc && sudo rm -rf "/usr/local/share/boost" && sudo rm -rf "$AGENT_TOOLSDIRECTORY" + - uses: actions/checkout@v2 + - name: Build and push + id: docker_build + uses: mr-smithers-excellent/docker-build-push@v5 + with: + dockerfile: Dockerfile.local + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + image: lfoppiano/grobid-quantities + registry: docker.io + pushImage: ${{ github.event_name != 'pull_request' }} + tags: latest-develop + - name: Image digest + run: echo ${{ steps.docker_build.outputs.digest }} \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index e643b476..26fa4e6f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,55 +4,104 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). -## [Unreleased] +## [0.8.0] -## [0.7.1] – 2021-09-06 +### Added + ++ Docker image snapshots are built and pushed to Docker Hub at each commit ++ New Dockerfile.local that does not clone from GitHub + +### Changed + ++ Updated to Grobid version 0.8.0 ++ Updated to Dropwizard version 4.x (from version 1.x) + +## [0.7.3] – 2023-06-26 + +### Added + ++ Added additional units in the lexicon ++ Added missing log when exceptions are raised ++ Introduced Kotlin for new development + +### Changed + ++ Upgrade to Grobid 0.7.3 and support for JDK > 11 ++ Updated Docker image to support JDK 17 and use the Gradle distribution script instead of the JAR directly ++ Transitioned from CircleCI to GitHub Actions + +### Fixed + ++ Fix notation lexicon #97 ++ Fix list and labelled sequence extraction with DL BERT models #153 ++ Improve recognition of composed units using sentence segmentation #155 #87 + +## [0.7.2] – 2023-01-20 + +### Added + ++ Create holdout set by @lfoppiano in #145 ++ Add additional DL and transformers models by @lfoppiano in #146 + +### Changed + ++ Update to Grobid 0.7.2 + +### Fixed + ++ Fix value parser's incorrect recognition by @lfoppiano in #141 + +## [0.7.1] – 2022-09-02 ### Added + + New BidLSTM_CRF models for quantities, values and units parsing #129 -+ Add docker image on hub.docker.com #142 -+ Update to Grobid 0.7.1 #137 ++ Add docker image on hub.docker.com #142 ++ Update to Grobid 0.7.1 #137 ### Changed + + Use the grobid sentence segmentation for the quantified object sentence splitting #138 ### Fixed -+ Fixes incorrect boxes colors #125 -+ Fixed lexicon #134 ++ Fixes incorrect boxes colors #125 ++ Fixed lexicon #134 ## [0.7.0] – 2021-08-06 ### Added + + Docker image #128 -+ Configurable number of parallel request ++ Configurable number of parallel requests + Various improvement in the unit normalisation and update of library Unit of measurement to version 2.x #95 ### Changed + + Retrained models with CRF + Grobid 0.7.0 #123 ### Fixed + + Coveralls build #127 + Fixed command line parameters #119 - - ## [0.6.0] – 2020-04-30 ### Added + + First official release -+ Extraction of quantities, units and values using CRF -+ Support for Text and PDF ++ Extraction of quantities, units and values using CRF ++ Support for
Text and PDF ### Changed -+ Added evaluation measurement and models ++ Added evaluation measurement and models ### Fixed - [Unreleased]: https://github.com/kermitt2/grobid/compare/0.6.0...HEAD + [0.6.0]: https://github.com/kermitt2/grobid/compare/0.6.0 diff --git a/Dockerfile.local b/Dockerfile.local new file mode 100644 index 00000000..28d82958 --- /dev/null +++ b/Dockerfile.local @@ -0,0 +1,121 @@ +## Docker GROBID-quantities image using deep learning models and/or CRF models, and various Python modules +## Borrowed from https://github.com/kermitt2/grobid/blob/master/Dockerfile.delft +## See https://grobid.readthedocs.io/en/latest/Grobid-docker/ + +## usage example with grobid: https://github.com/kermitt2/grobid/blob/master/Dockerfile.delft + +## docker build -t lfoppiano/grobid-quantities:0.7.0 --build-arg GROBID_VERSION=0.7.0 --file Dockerfile.local . + +## no GPU: +## docker run -t --rm --init -p 8060:8060 -p 8061:8061 -v config.yml:/opt/grobid/grobid-quantities/resources/config/config.yml:ro lfoppiano/grobid-quantities:0.7.1 + +## allocate all available GPUs (only Linux with proper nvidia driver installed on host machine): +## docker run --rm --gpus all --init -p 8060:8060 -p 8061:8061 -v config.yml:/opt/grobid/grobid-quantities/resources/config/config.yml:ro lfoppiano/grobid-quantities:0.7.1 + +# ------------------- +# build builder image +# ------------------- + +FROM openjdk:17-jdk-slim as builder + +USER root + +RUN apt-get update && \ + apt-get -y --no-install-recommends install apt-utils libxml2 git unzip + +WORKDIR /opt/grobid + +RUN mkdir -p grobid-quantities-source grobid-home/models +COPY src grobid-quantities-source/src +COPY settings.gradle grobid-quantities-source/ +COPY resources/config/config-docker.yml grobid-quantities-source/resources/config/config.yml +COPY resources/models grobid-quantities-source/resources/models +COPY resources/clearnlp/models/* grobid-quantities-source/resources/clearnlp/models/ +COPY build.gradle grobid-quantities-source/ +COPY gradle.properties grobid-quantities-source/ +COPY gradle grobid-quantities-source/gradle/ +COPY gradlew grobid-quantities-source/ +COPY .git grobid-quantities-source/.git +COPY localLibs grobid-quantities-source/localLibs + +# Preparing models +WORKDIR /opt/grobid/grobid-quantities-source +RUN rm -rf /opt/grobid/grobid-home/models/* +RUN ./gradlew clean assemble -x shadowJar --no-daemon --stacktrace --info +#RUN ./gradlew copyModels --info --no-daemon +RUN ./gradlew downloadTransformers --no-daemon --info --stacktrace && rm -f /opt/grobid/grobid-home/models/*.zip + +# Preparing distribution +WORKDIR /opt/grobid +RUN unzip -o /opt/grobid/grobid-quantities-source/build/distributions/grobid-quantities-*.zip -d grobid-quantities_distribution && mv grobid-quantities_distribution/grobid-quantities-* grobid-quantities + +WORKDIR /opt + +# ------------------- +# build runtime image +# ------------------- + +FROM grobid/grobid:0.7.3 as runtime + +# setting the locale is likely unnecessary, but set it to be safe +ENV LANG C.UTF-8 + +RUN apt-get update && \ + apt-get -y --no-install-recommends install git wget + +WORKDIR /opt/grobid + +RUN mkdir -p /opt/grobid/grobid-quantities/resources/clearnlp/models /opt/grobid/grobid-quantities/resources/clearnlp/config +COPY --from=builder /opt/grobid/grobid-home/models ./grobid-home/models +COPY --from=builder /opt/grobid/grobid-quantities ./grobid-quantities/ +COPY --from=builder /opt/grobid/grobid-quantities-source/resources/config/config.yml ./grobid-quantities/resources/config/ +COPY --from=builder
/opt/grobid/grobid-quantities-source/resources/clearnlp/models/* ./grobid-quantities/resources/clearnlp/models/ + +VOLUME ["/opt/grobid/grobid-home/tmp"] + +RUN ln -s /opt/grobid/grobid-quantities/resources /opt/grobid/resources + +# JProfiler +#RUN wget https://download-gcdn.ej-technologies.com/jprofiler/jprofiler_linux_12_0_2.tar.gz -P /tmp/ && \ +# tar -xzf /tmp/jprofiler_linux_12_0_2.tar.gz -C /usr/local &&\ +# rm /tmp/jprofiler_linux_12_0_2.tar.gz + +WORKDIR /opt/grobid +ARG GROBID_VERSION +ENV GROBID_VERSION=${GROBID_VERSION:-latest} +ENV GROBID_QUANTITIES_OPTS "-Djava.library.path=/opt/grobid/grobid-home/lib/lin-64:/usr/local/lib/python3.8/dist-packages/jep --add-opens java.base/java.lang=ALL-UNNAMED" + +# This removes the fixed seed in DeLFT to increase the uncertainty +#RUN sed -i '/seed(7)/d' /usr/local/lib/python3.8/dist-packages/delft/utilities/Utilities.py +#RUN sed -i '/from numpy\.random import seed/d' /usr/local/lib/python3.8/dist-packages/delft/utilities/Utilities.py + +EXPOSE 8060 8061 5005 + +#CMD ["java", "-agentpath:/usr/local/jprofiler12.0.2/bin/linux-x64/libjprofilerti.so=port=8849", "-jar", "grobid-quantities/grobid-quantities-${GROBID_VERSION}-onejar.jar", "server", "grobid-quantities/config.yml"] +#CMD ["sh", "-c", "java -agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=0.0.0.0:5005 -jar grobid-quantities/grobid-quantities-${GROBID_VERSION}-onejar.jar server grobid-quantities/config.yml"] +#CMD ["sh", "-c", "java -jar grobid-quantities/grobid-quantities-${GROBID_VERSION}-onejar.jar server grobid-quantities/config.yml"] +CMD ["./grobid-quantities/bin/grobid-quantities", "server", "grobid-quantities/resources/config/config.yml"] + + +LABEL \ + authors="Luca Foppiano, Patrice Lopez" \ + org.label-schema.name="grobid-quantities" \ + org.label-schema.description="Docker image for grobid-quantities service" \ + org.label-schema.url="https://github.com/kermitt2/grobid-quantities" \ + org.label-schema.version=${GROBID_VERSION} + + +## Docker tricks: + +# - remove all stopped containers +# > docker rm $(docker ps -a -q) + +# - remove all unused images +# > docker rmi $(docker images --filter "dangling=true" -q --no-trunc) + +# - remove all untagged images +# > docker rmi $(docker images | grep "^<none>" | awk "{print $3}") + +# - "Cannot connect to the Docker daemon. Is the docker daemon running on this host?"
+# > docker-machine restart diff --git a/build.gradle b/build.gradle index 98f676b7..70ecd88f 100644 --- a/build.gradle +++ b/build.gradle @@ -341,6 +341,9 @@ publishing { def conf = new org.yaml.snakeyaml.Yaml().load(new File("resources/config/config.yml").newInputStream()) def grobidHome = conf.grobidHome.replace("\$", "").replace('{', "").replace("GROBID_HOME:- ", "").replace("}", "") +if (grobidHome.startsWith("../")) { + grobidHome = "${rootProject.rootDir}/${grobidHome}" +} /** Model management **/ @@ -354,7 +357,7 @@ task copyModels(type: Copy) { include "**/preprocessor.json" exclude "**/features-engineering/**" exclude "**/result-logs/**" - into "${rootDir}/${grobidHome}/models/" + into "${grobidHome}/models/" doLast { print "Copy models under grobid-home: ${grobidHome}" @@ -365,11 +368,11 @@ task downloadTransformers(dependsOn: copyModels) { doLast { download { src "https://transformers-data.s3.eu-central-1.amazonaws.com/quantities-transformers.zip" - dest "${rootDir}/${grobidHome}/models/quantities-transformers.zip" + dest "${grobidHome}/models/quantities-transformers.zip" overwrite false print "Download bulky transformers files under grobid-home: ${grobidHome}" } - ant.unzip(src: "${rootDir}/${grobidHome}/models/quantities-transformers.zip", dest: "${rootDir}/${grobidHome}/models/") + ant.unzip(src: "${grobidHome}/models/quantities-transformers.zip", dest: "${grobidHome}/models/") } } @@ -396,4 +399,4 @@ release { git { requireBranch.set('test') } -} \ No newline at end of file +} diff --git a/scripts/dataset_analysis_quantities.py b/scripts/dataset_analysis_quantities.py index 9580dd85..f9ceaf27 100644 --- a/scripts/dataset_analysis_quantities.py +++ b/scripts/dataset_analysis_quantities.py @@ -6,9 +6,8 @@ from pathlib import Path from bs4 import BeautifulSoup, NavigableString, Tag - -from grobid_superconductors.commons.grobid_tokenizer import tokenizeSimple -from grobid_superconductors.commons.quantities_tei_parser import get_children_list +from supermat.grobid_tokenizer import tokenizeSimple +from supermat.supermat_tei_parser import get_children_list def process_dir(input): @@ -53,7 +52,7 @@ def process_file(input): document_statistics['batch'] = batch - children = get_children_list(soup) + children = get_children_list(soup, use_paragraphs=True) for paragraph in children: for item in paragraph: diff --git a/scripts/quantities_tei_parser.py b/scripts/quantities_tei_parser.py index 5056feb1..21df6deb 100644 --- a/scripts/quantities_tei_parser.py +++ b/scripts/quantities_tei_parser.py @@ -1,18 +1,118 @@ import re +from collections import OrderedDict +from pathlib import Path +from typing import List from bs4 import BeautifulSoup, Tag, NavigableString +from supermat.supermat_tei_parser import tokenise -from .grobid_tokenizer import tokenizeSimple +ENTITY_TYPES = ['value', 'interval', 'range', 'list'] +def process_file_to_json(input_file_path): + with open(input_file_path, encoding='utf-8') as fp: + doc = fp.read() -def tokenise(string): - return tokenizeSimple(string) + mod_tags = re.finditer(r'(</\w+>) ', doc) + for mod in mod_tags: + doc = doc.replace(mod.group(), ' ' + mod.group(1)) + soup = BeautifulSoup(doc, 'xml') + output_document = OrderedDict() + output_document['doc_key'] = Path(str(input_file_path)).name + output_document['dataset'] = 'Quantities' + output_document['lang'] = 'en' -def get_children_list(soup, verbose=False): + output_document['level'] = 'paragraph' + paragraph_nodes = get_nodes(soup) + passages, relations = process_paragraphs(paragraph_nodes) + +
output_document['passages'] = passages + output_document['relations'] = relations + + return output_document + + +def get_nodes(soup, verbose=False): + children = soup.find_all("p") + if verbose: + print(str(children)) + return children + + +def process_paragraphs(paragraph_list: list) -> [List, List]: + """ + Process XML annotated with <p> and <s>, used both as paragraphs
and as sentences. + + Return two list passage (sentence or paragraph,spans and link) and relations (links at document-level) + """ + token_offset_sentence = 0 + ient = 1 + + passages = [] + relations = [] + + i = 0 + for paragraph_id, paragraph in enumerate(paragraph_list): + passage = OrderedDict() + + j = 0 + offset = 0 + tokens = [] + text_paragraph = '' + spans = [] + + passage['text'] = text_paragraph + passage['tokens'] = tokens + passage['type'] = 'paragraph' + passage['spans'] = spans + passage['id'] = paragraph_id + + for idx, item in enumerate(paragraph.contents): + if type(item) is NavigableString: + local_text = str(item).replace("\n", " ") + # We preserve spaces that are in the middle + if idx == 0 or idx == len(paragraph.contents) - 1: + local_text = local_text.strip() + text_paragraph += local_text + token_list = tokenise(local_text) + tokens.extend(token_list) + token_offset_sentence += len(token_list) + offset += len(local_text) + elif type(item) is Tag and item.name == 'measure' and 'type' in item.attrs and item.attrs['type'] in ENTITY_TYPES: + local_text = item.text + text_paragraph += local_text + span = OrderedDict() + front_offset = 0 + if local_text.startswith(" "): + front_offset = len(local_text) - len(local_text.lstrip(" ")) + + span['text'] = local_text.strip(" ") + span['offset_start'] = offset + front_offset + span['offset_end'] = offset + len(span['text']) + front_offset + spans.append(span) + + offset += len(local_text) + + assert text_paragraph[span['offset_start']:span['offset_end']] == span['text'] + + if 'type' not in item.attrs: + raise Exception("RS without type is invalid. Stopping") + token_list = tokenise(local_text) + tokens.extend(token_list) + + entity_class = item.attrs['type'] + span['type'] = entity_class + + span['token_start'] = token_offset_sentence + span['token_end'] = token_offset_sentence + len(token_list) - 1 + + j += 1 + + ient += 1 # entity No. 
+ + passage['text'] = text_paragraph + passages.append(passage) + i += 1 + return passages, relations diff --git a/scripts/requirements.txt b/scripts/requirements.txt index 3a6469d5..456dba20 100644 --- a/scripts/requirements.txt +++ b/scripts/requirements.txt @@ -1,6 +1,6 @@ beautifulsoup4 -blingfire lxml tqdm pyyaml -sklearn \ No newline at end of file +sklearn +supermat \ No newline at end of file diff --git a/scripts/xml2csv_entities.py b/scripts/xml2csv_entities.py new file mode 100644 index 00000000..4c32ce07 --- /dev/null +++ b/scripts/xml2csv_entities.py @@ -0,0 +1,139 @@ +import argparse +import csv +import os +from pathlib import Path + +from supermat.utils import get_in_paths_from_directory + +from quantities_tei_parser import process_file_to_json, ENTITY_TYPES + +paragraph_id = 'paragraph_id' + + +def write_output(output_path, data, header, format="csv"): + delimiter = '\t' if format == 'tsv' else ',' + fw = csv.writer(open(output_path, encoding='utf-8', mode='w'), delimiter=delimiter, quotechar='"') + fw.writerow(header) + fw.writerows(data) + + +def get_entity_data(data_sorted, remove_dups=False): + entities = [] + record_id = 0 + for passage in data_sorted['passages']: + text = passage['text'] + spans = [span['text'] for span in filter(lambda s: s['type'] in ENTITY_TYPES, passage['spans'])] + if remove_dups: + ents = list(set(spans)) + else: + ents = list(spans) + for ent in ents: + entities.append( + [ + record_id, + data_sorted['doc_key'], + passage['id'], + ent + ] + ) + record_id += 1 + + # entities.append( + # { + # "text": text, + # "entities": ents + # } + # ) + + return entities + + +def get_texts(data_sorted): + text_data = [[idx, data_sorted['doc_key'], data_sorted['passages'][idx]['id'], data_sorted['passages'][idx]['text']] + for idx in + range(0, len(data_sorted['passages']))] + + return text_data + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description="Converter XML (Supermat) to CSV for entity extraction (no relation information are used)") + + parser.add_argument("--input", + help="Input file or directory", + required=True) + parser.add_argument("--output", + help="Output directory", + required=True) + parser.add_argument("--recursive", + action="store_true", + default=False, + help="Process input directory recursively. 
If input is a file, this parameter is ignored.") + parser.add_argument("--entity-type", + default="quantity", + required=False, + help="Select which entity type to extract.") + + args = parser.parse_args() + + input = args.input + output = args.output + recursive = args.recursive + ent_type = args.entity_type + + if os.path.isdir(input): + path_list = get_in_paths_from_directory(input, "xml", recursive=recursive) + + entities_data = [] + texts_data = [] + for path in path_list: + print("Processing: ", path) + file_data = process_file_to_json(path) + # data = sorted(file_data, key=lambda k: k[paragraph_id]) + entity_data = get_entity_data(file_data, ent_type) + entities_data.extend(entity_data) + + text_data = get_texts(file_data) + texts_data.extend(text_data) + + if os.path.isdir(str(output)): + output_path_text = os.path.join(output, "output-text") + ".csv" + output_path_expected = os.path.join(output, "output-" + ent_type) + ".csv" + else: + parent_dir = Path(output).parent + output_path_text = os.path.join(parent_dir, "output-text" + ".csv") + output_path_expected = os.path.join(parent_dir, "output-" + ent_type + ".csv") + + header = ["id", "filename", "pid", ent_type] + + for idx, data in enumerate(entities_data): + data[0] = idx + + write_output(output_path_expected, entities_data, header) + + header = ["id", "filename", "pid", "text"] + for idx, data in enumerate(texts_data): + data[0] = idx + write_output(output_path_text, texts_data, header) + + elif os.path.isfile(input): + input_path = Path(input) + file_data = process_file_to_json(input_path) + output_filename = input_path.stem + + output_path_text = os.path.join(output, str(output_filename) + "-text" + ".csv") + texts_data = get_texts(file_data) + for idx, data in enumerate(texts_data): + data[0] = idx + + header = ["id", "filename", "pid", "text"] + write_output(output_path_text, texts_data, header) + + output_path_expected = os.path.join(output, str(output_filename) + "-" + ent_type + ".csv") + ent_data_no_duplicates = get_entity_data(file_data, ent_type) + for idx, data in enumerate(ent_data_no_duplicates): + data[0] = idx + + header = ["id", "filename", "pid", ent_type] + write_output(output_path_expected, ent_data_no_duplicates, header)
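For reference, a minimal usage sketch of the new parser module added in scripts/quantities_tei_parser.py, mirroring what scripts/xml2csv_entities.py does internally. Assumptions: the supermat package is installed, the snippet is run from the scripts/ directory, and corpus/example.tei.xml is a placeholder for any annotated Supermat/TEI XML file.

```python
# Minimal sketch (hypothetical input path); not part of the patch itself.
from pathlib import Path

from quantities_tei_parser import process_file_to_json, ENTITY_TYPES

# Parse one annotated XML file into the intermediate document structure
doc = process_file_to_json(Path("corpus/example.tei.xml"))

print(doc["doc_key"], doc["level"])  # source file name, 'paragraph'
for passage in doc["passages"]:
    # Each passage carries the reconstructed text plus character offsets
    # for every annotated <measure> span.
    for span in passage["spans"]:
        assert span["type"] in ENTITY_TYPES  # 'value', 'interval', 'range' or 'list'
        print(passage["id"], span["type"], repr(span["text"]),
              span["offset_start"], span["offset_end"])
```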