From 9bc4740ef372c5bc39066631540255b319c6e684 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Mon, 19 Jun 2023 13:36:11 +0900 Subject: [PATCH 1/8] Migration to dropwizard 4.0.0: building again --- build.gradle | 38 +++++----- grobid-home/config/grobid.yaml | 2 +- .../org/grobid/service/GrobidRestService.java | 4 +- .../service/GrobidServiceConfiguration.java | 2 +- .../GrobidServicePropConfiguration.java | 2 +- .../exceptions/GrobidServiceException.java | 2 +- .../GrobidServicePropertyException.java | 2 +- .../mapper/GrobidExceptionMapper.java | 12 ++-- .../GrobidExceptionsTranslationUtility.java | 8 +-- .../mapper/GrobidServiceExceptionMapper.java | 12 ++-- .../GrobidStatusToHttpStatusMapper.java | 2 +- .../mapper/WebApplicationExceptionMapper.java | 8 +-- .../main/GrobidServiceApplication.java | 26 ++++--- .../service/modules/GrobidServiceModule.java | 69 +++++++++---------- .../process/GrobidRestProcessFiles.java | 8 +-- .../process/GrobidRestProcessGeneric.java | 8 +-- .../process/GrobidRestProcessString.java | 10 +-- .../process/GrobidRestProcessTraining.java | 14 ++-- .../service/resources/HealthResource.java | 12 ++-- .../module/GrobidServiceModuleTest.java | 17 +++-- .../process/GrobidRestProcessFilesTest.java | 12 +--- .../service/tests/GrobidRestServiceTest.java | 23 +++---- 22 files changed, 144 insertions(+), 149 deletions(-) diff --git a/build.gradle b/build.gradle index d694849a64..519eaed201 100644 --- a/build.gradle +++ b/build.gradle @@ -99,14 +99,14 @@ subprojects { implementation "org.apache.commons:commons-collections4:4.1" implementation 'org.apache.commons:commons-text:1.8' implementation "commons-dbutils:commons-dbutils:1.7" - implementation "com.google.guava:guava:28.2-jre" + implementation "com.google.guava:guava:31.0.1-jre" implementation "org.apache.httpcomponents:httpclient:4.5.3" implementation "black.ninia:jep:4.0.2" - implementation "com.fasterxml.jackson.core:jackson-core:2.10.1" - implementation 
"com.fasterxml.jackson.core:jackson-databind:2.10.1" - implementation "com.fasterxml.jackson.module:jackson-module-afterburner:2.10.1" - implementation "com.fasterxml.jackson.dataformat:jackson-dataformat-yaml:2.10.1" + implementation "com.fasterxml.jackson.core:jackson-core:2.12.2" + implementation "com.fasterxml.jackson.core:jackson-databind:2.12.2" + implementation "com.fasterxml.jackson.module:jackson-module-afterburner:2.12.2" + implementation "com.fasterxml.jackson.dataformat:jackson-dataformat-yaml:2.12.2" } task sourceJar(type: Jar) { @@ -346,19 +346,25 @@ project(":grobid-service") { dependencies { implementation project(':grobid-core') implementation project(':grobid-trainer') - implementation "io.dropwizard:dropwizard-core:1.3.23" - implementation "io.dropwizard:dropwizard-assets:1.3.23" - implementation "com.hubspot.dropwizard:dropwizard-guicier:1.3.5.0" - implementation "io.dropwizard:dropwizard-testing:1.3.23" - implementation "io.dropwizard:dropwizard-forms:1.3.23" - implementation "io.dropwizard:dropwizard-client:1.3.23" - implementation "io.dropwizard:dropwizard-auth:1.3.23" + + //Dropwizard + implementation 'ru.vyarus:dropwizard-guicey:7.0.0' + + implementation 'io.dropwizard:dropwizard-bom:4.0.0' + implementation 'io.dropwizard:dropwizard-core:4.0.0' + implementation 'io.dropwizard:dropwizard-assets:4.0.0' + implementation 'io.dropwizard:dropwizard-testing:4.0.0' + implementation 'io.dropwizard:dropwizard-forms:4.0.0' + implementation 'io.dropwizard:dropwizard-client:4.0.0' + implementation 'io.dropwizard:dropwizard-auth:4.0.0' + implementation 'io.dropwizard.metrics:metrics-core:4.2.19' + implementation 'io.dropwizard.metrics:metrics-servlets:4.2.19' + implementation 'jakarta.servlet:jakarta.servlet-api:6.0.0' + implementation "org.apache.pdfbox:pdfbox:2.0.3" implementation "javax.activation:activation:1.1.1" - implementation "io.prometheus:simpleclient_dropwizard:0.11.0" - implementation "io.prometheus:simpleclient_servlet:0.11.0" - - 
testImplementation "io.dropwizard:dropwizard-testing:1.3.17" + implementation "io.prometheus:simpleclient_dropwizard:0.16.0" + implementation "io.prometheus:simpleclient_servlet:0.16.0" } shadowJar { diff --git a/grobid-home/config/grobid.yaml b/grobid-home/config/grobid.yaml index 92ec845712..6adb4da4c5 100644 --- a/grobid-home/config/grobid.yaml +++ b/grobid-home/config/grobid.yaml @@ -66,7 +66,7 @@ grobid: delft: # DeLFT global parameters # delft installation path if Deep Learning architectures are used to implement one of the sequence labeling model, - # embeddings are usually compiled as lmdb under delft/data (this paramter is ignored if only featured-engineered CRF are used) + # embeddings are usually compiled as lmdb under delft/data (this parameter is ignored if only featured-engineered CRF are used) install: "../delft" pythonVirtualEnv: diff --git a/grobid-service/src/main/java/org/grobid/service/GrobidRestService.java b/grobid-service/src/main/java/org/grobid/service/GrobidRestService.java index 85e78ded9c..ead15cbbc3 100755 --- a/grobid-service/src/main/java/org/grobid/service/GrobidRestService.java +++ b/grobid-service/src/main/java/org/grobid/service/GrobidRestService.java @@ -22,8 +22,8 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import javax.ws.rs.*; -import javax.ws.rs.core.*; +import jakarta.ws.rs.*; +import jakarta.ws.rs.core.*; import java.io.File; import java.io.InputStream; import java.util.ArrayList; diff --git a/grobid-service/src/main/java/org/grobid/service/GrobidServiceConfiguration.java b/grobid-service/src/main/java/org/grobid/service/GrobidServiceConfiguration.java index fa344ad842..3eb81ed85f 100644 --- a/grobid-service/src/main/java/org/grobid/service/GrobidServiceConfiguration.java +++ b/grobid-service/src/main/java/org/grobid/service/GrobidServiceConfiguration.java @@ -1,7 +1,7 @@ package org.grobid.service; import com.google.inject.Singleton; -import io.dropwizard.Configuration; +import 
io.dropwizard.core.Configuration; @Singleton public class GrobidServiceConfiguration extends Configuration { diff --git a/grobid-service/src/main/java/org/grobid/service/GrobidServicePropConfiguration.java b/grobid-service/src/main/java/org/grobid/service/GrobidServicePropConfiguration.java index 9d52122638..202129fb3f 100644 --- a/grobid-service/src/main/java/org/grobid/service/GrobidServicePropConfiguration.java +++ b/grobid-service/src/main/java/org/grobid/service/GrobidServicePropConfiguration.java @@ -1,7 +1,7 @@ package org.grobid.service; import com.fasterxml.jackson.annotation.JsonProperty; -import org.hibernate.validator.constraints.NotEmpty; +import jakarta.validation.constraints.NotEmpty; public class GrobidServicePropConfiguration { @NotEmpty diff --git a/grobid-service/src/main/java/org/grobid/service/exceptions/GrobidServiceException.java b/grobid-service/src/main/java/org/grobid/service/exceptions/GrobidServiceException.java index c680600498..39ea7bdc00 100755 --- a/grobid-service/src/main/java/org/grobid/service/exceptions/GrobidServiceException.java +++ b/grobid-service/src/main/java/org/grobid/service/exceptions/GrobidServiceException.java @@ -1,7 +1,7 @@ package org.grobid.service.exceptions; import org.grobid.core.exceptions.GrobidException; -import javax.ws.rs.core.Response; +import jakarta.ws.rs.core.Response; public class GrobidServiceException extends GrobidException { diff --git a/grobid-service/src/main/java/org/grobid/service/exceptions/GrobidServicePropertyException.java b/grobid-service/src/main/java/org/grobid/service/exceptions/GrobidServicePropertyException.java index 0617e594a9..b07b22b5bd 100755 --- a/grobid-service/src/main/java/org/grobid/service/exceptions/GrobidServicePropertyException.java +++ b/grobid-service/src/main/java/org/grobid/service/exceptions/GrobidServicePropertyException.java @@ -1,6 +1,6 @@ package org.grobid.service.exceptions; -import javax.ws.rs.core.Response; +import jakarta.ws.rs.core.Response; public class 
GrobidServicePropertyException extends GrobidServiceException { diff --git a/grobid-service/src/main/java/org/grobid/service/exceptions/mapper/GrobidExceptionMapper.java b/grobid-service/src/main/java/org/grobid/service/exceptions/mapper/GrobidExceptionMapper.java index 9bc49e52a1..5b22530b19 100644 --- a/grobid-service/src/main/java/org/grobid/service/exceptions/mapper/GrobidExceptionMapper.java +++ b/grobid-service/src/main/java/org/grobid/service/exceptions/mapper/GrobidExceptionMapper.java @@ -3,12 +3,12 @@ import com.google.inject.Inject; import org.grobid.core.exceptions.GrobidException; -import javax.ws.rs.core.Context; -import javax.ws.rs.core.HttpHeaders; -import javax.ws.rs.core.Response; -import javax.ws.rs.core.UriInfo; -import javax.ws.rs.ext.ExceptionMapper; -import javax.ws.rs.ext.Provider; +import jakarta.ws.rs.core.Context; +import jakarta.ws.rs.core.HttpHeaders; +import jakarta.ws.rs.core.Response; +import jakarta.ws.rs.core.UriInfo; +import jakarta.ws.rs.ext.ExceptionMapper; +import jakarta.ws.rs.ext.Provider; @Provider public class GrobidExceptionMapper implements ExceptionMapper { diff --git a/grobid-service/src/main/java/org/grobid/service/exceptions/mapper/GrobidExceptionsTranslationUtility.java b/grobid-service/src/main/java/org/grobid/service/exceptions/mapper/GrobidExceptionsTranslationUtility.java index fb1fda15c8..da7810aed9 100644 --- a/grobid-service/src/main/java/org/grobid/service/exceptions/mapper/GrobidExceptionsTranslationUtility.java +++ b/grobid-service/src/main/java/org/grobid/service/exceptions/mapper/GrobidExceptionsTranslationUtility.java @@ -7,10 +7,10 @@ import org.grobid.core.exceptions.GrobidExceptionStatus; import org.slf4j.MDC; -import javax.inject.Inject; -import javax.ws.rs.core.MediaType; -import javax.ws.rs.core.Response; -import javax.ws.rs.ext.Provider; +import jakarta.inject.Inject; +import jakarta.ws.rs.core.MediaType; +import jakarta.ws.rs.core.Response; +import jakarta.ws.rs.ext.Provider; import 
java.io.IOException; import java.util.ArrayList; import java.util.List; diff --git a/grobid-service/src/main/java/org/grobid/service/exceptions/mapper/GrobidServiceExceptionMapper.java b/grobid-service/src/main/java/org/grobid/service/exceptions/mapper/GrobidServiceExceptionMapper.java index ebe26ffb2f..7e92c1fedb 100644 --- a/grobid-service/src/main/java/org/grobid/service/exceptions/mapper/GrobidServiceExceptionMapper.java +++ b/grobid-service/src/main/java/org/grobid/service/exceptions/mapper/GrobidServiceExceptionMapper.java @@ -5,12 +5,12 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import javax.ws.rs.core.Context; -import javax.ws.rs.core.HttpHeaders; -import javax.ws.rs.core.Response; -import javax.ws.rs.core.UriInfo; -import javax.ws.rs.ext.ExceptionMapper; -import javax.ws.rs.ext.Provider; +import jakarta.ws.rs.core.Context; +import jakarta.ws.rs.core.HttpHeaders; +import jakarta.ws.rs.core.Response; +import jakarta.ws.rs.core.UriInfo; +import jakarta.ws.rs.ext.ExceptionMapper; +import jakarta.ws.rs.ext.Provider; @Provider public class GrobidServiceExceptionMapper implements ExceptionMapper { diff --git a/grobid-service/src/main/java/org/grobid/service/exceptions/mapper/GrobidStatusToHttpStatusMapper.java b/grobid-service/src/main/java/org/grobid/service/exceptions/mapper/GrobidStatusToHttpStatusMapper.java index 9642d13a61..54a0a869fa 100644 --- a/grobid-service/src/main/java/org/grobid/service/exceptions/mapper/GrobidStatusToHttpStatusMapper.java +++ b/grobid-service/src/main/java/org/grobid/service/exceptions/mapper/GrobidStatusToHttpStatusMapper.java @@ -2,7 +2,7 @@ import org.grobid.core.exceptions.GrobidExceptionStatus; -import javax.ws.rs.core.Response; +import jakarta.ws.rs.core.Response; public class GrobidStatusToHttpStatusMapper { public static Response.Status getStatusCode(GrobidExceptionStatus status) { diff --git a/grobid-service/src/main/java/org/grobid/service/exceptions/mapper/WebApplicationExceptionMapper.java 
b/grobid-service/src/main/java/org/grobid/service/exceptions/mapper/WebApplicationExceptionMapper.java index 7d72e026bc..7489b42715 100644 --- a/grobid-service/src/main/java/org/grobid/service/exceptions/mapper/WebApplicationExceptionMapper.java +++ b/grobid-service/src/main/java/org/grobid/service/exceptions/mapper/WebApplicationExceptionMapper.java @@ -2,10 +2,10 @@ import com.google.inject.Inject; -import javax.ws.rs.WebApplicationException; -import javax.ws.rs.core.Response; -import javax.ws.rs.ext.ExceptionMapper; -import javax.ws.rs.ext.Provider; +import jakarta.ws.rs.WebApplicationException; +import jakarta.ws.rs.core.Response; +import jakarta.ws.rs.ext.ExceptionMapper; +import jakarta.ws.rs.ext.Provider; @Provider public class WebApplicationExceptionMapper implements ExceptionMapper { diff --git a/grobid-service/src/main/java/org/grobid/service/main/GrobidServiceApplication.java b/grobid-service/src/main/java/org/grobid/service/main/GrobidServiceApplication.java index 9a4ff4606b..f7a5a752c2 100644 --- a/grobid-service/src/main/java/org/grobid/service/main/GrobidServiceApplication.java +++ b/grobid-service/src/main/java/org/grobid/service/main/GrobidServiceApplication.java @@ -1,30 +1,28 @@ package org.grobid.service.main; -import com.google.common.collect.Lists; -import com.google.inject.Module; -import com.hubspot.dropwizard.guicier.GuiceBundle; -import io.dropwizard.Application; +import com.google.inject.AbstractModule; import io.dropwizard.assets.AssetsBundle; +import io.dropwizard.core.Application; +import io.dropwizard.core.setup.Bootstrap; +import io.dropwizard.core.setup.Environment; import io.dropwizard.forms.MultiPartBundle; -import io.dropwizard.setup.Bootstrap; -import io.dropwizard.setup.Environment; +import io.dropwizard.metrics.servlets.MetricsServlet; import io.prometheus.client.dropwizard.DropwizardExports; -import io.prometheus.client.exporter.MetricsServlet; +import jakarta.servlet.DispatcherType; +import 
jakarta.servlet.FilterRegistration; +import jakarta.servlet.ServletRegistration; import org.apache.commons.lang3.ArrayUtils; import org.eclipse.jetty.servlets.CrossOriginFilter; import org.grobid.service.GrobidServiceConfiguration; import org.grobid.service.modules.GrobidServiceModule; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import ru.vyarus.dropwizard.guice.GuiceBundle; -import javax.servlet.DispatcherType; -import javax.servlet.FilterRegistration; -import javax.servlet.ServletRegistration; import java.io.File; import java.util.Arrays; import java.util.EnumSet; -import java.util.List; public final class GrobidServiceApplication extends Application { @@ -43,7 +41,7 @@ public String getName() { @Override public void initialize(Bootstrap bootstrap) { - GuiceBundle guiceBundle = GuiceBundle.defaultBuilder(GrobidServiceConfiguration.class) + GuiceBundle guiceBundle = GuiceBundle.builder() .modules(getGuiceModules()) .build(); bootstrap.addBundle(guiceBundle); @@ -51,8 +49,8 @@ public void initialize(Bootstrap bootstrap) { bootstrap.addBundle(new AssetsBundle("/web", "/", "index.html", "grobidAssets")); } - private List getGuiceModules() { - return Lists.newArrayList(new GrobidServiceModule()); + private AbstractModule getGuiceModules() { + return new GrobidServiceModule(); } @Override diff --git a/grobid-service/src/main/java/org/grobid/service/modules/GrobidServiceModule.java b/grobid-service/src/main/java/org/grobid/service/modules/GrobidServiceModule.java index d7c0039e93..7909eda991 100644 --- a/grobid-service/src/main/java/org/grobid/service/modules/GrobidServiceModule.java +++ b/grobid-service/src/main/java/org/grobid/service/modules/GrobidServiceModule.java @@ -2,9 +2,10 @@ import com.codahale.metrics.MetricRegistry; import com.fasterxml.jackson.databind.ObjectMapper; -import com.google.inject.Binder; +import com.google.inject.AbstractModule; import com.google.inject.Provides; -import com.hubspot.dropwizard.guicier.DropwizardAwareModule; 
+import jakarta.ws.rs.client.Client; +import jakarta.ws.rs.client.ClientBuilder; import org.grobid.service.GrobidRestService; import org.grobid.service.GrobidServiceConfiguration; import org.grobid.service.exceptions.mapper.GrobidExceptionMapper; @@ -16,49 +17,47 @@ import org.grobid.service.process.GrobidRestProcessString; import org.grobid.service.process.GrobidRestProcessTraining; import org.grobid.service.resources.HealthResource; - -import javax.ws.rs.client.Client; -import javax.ws.rs.client.ClientBuilder; +import ru.vyarus.dropwizard.guice.module.support.DropwizardAwareModule; public class GrobidServiceModule extends DropwizardAwareModule { @Override - public void configure(Binder binder) { - binder.bind(HealthResource.class); + public void configure() { + bind(HealthResource.class); //REST - binder.bind(GrobidRestService.class); - binder.bind(GrobidRestProcessFiles.class); - binder.bind(GrobidRestProcessGeneric.class); - binder.bind(GrobidRestProcessString.class); - binder.bind(GrobidRestProcessTraining.class); + bind(GrobidRestService.class); + bind(GrobidRestProcessFiles.class); + bind(GrobidRestProcessGeneric.class); + bind(GrobidRestProcessString.class); + bind(GrobidRestProcessTraining.class); //Exception Mappers - binder.bind(GrobidServiceExceptionMapper.class); - binder.bind(GrobidExceptionsTranslationUtility.class); - binder.bind(GrobidExceptionMapper.class); - binder.bind(WebApplicationExceptionMapper.class); - } - - @Provides - protected ObjectMapper getObjectMapper() { - return getEnvironment().getObjectMapper(); - } - - @Provides - protected MetricRegistry provideMetricRegistry() { - return getMetricRegistry(); - } - - //for unit tests - protected MetricRegistry getMetricRegistry() { - return getEnvironment().metrics(); + bind(GrobidServiceExceptionMapper.class); + bind(GrobidExceptionsTranslationUtility.class); + bind(GrobidExceptionMapper.class); + bind(WebApplicationExceptionMapper.class); } - @Provides - Client provideClient() { - return 
ClientBuilder.newClient(); - } +// @Provides +// protected ObjectMapper getObjectMapper() { +// return getEnvironment().getObjectMapper(); +// } +// +// @Provides +// protected MetricRegistry provideMetricRegistry() { +// return getMetricRegistry(); +// } +// +// //for unit tests +// protected MetricRegistry getMetricRegistry() { +// return getEnvironment().metrics(); +// } +// +// @Provides +// Client provideClient() { +// return ClientBuilder.newClient(); +// } } diff --git a/grobid-service/src/main/java/org/grobid/service/process/GrobidRestProcessFiles.java b/grobid-service/src/main/java/org/grobid/service/process/GrobidRestProcessFiles.java index 957be63031..4060c875da 100644 --- a/grobid-service/src/main/java/org/grobid/service/process/GrobidRestProcessFiles.java +++ b/grobid-service/src/main/java/org/grobid/service/process/GrobidRestProcessFiles.java @@ -24,10 +24,10 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import javax.ws.rs.core.HttpHeaders; -import javax.ws.rs.core.MediaType; -import javax.ws.rs.core.Response; -import javax.ws.rs.core.Response.Status; +import jakarta.ws.rs.core.HttpHeaders; +import jakarta.ws.rs.core.MediaType; +import jakarta.ws.rs.core.Response; +import jakarta.ws.rs.core.Response.Status; import java.io.*; import java.nio.charset.Charset; import java.util.ArrayList; diff --git a/grobid-service/src/main/java/org/grobid/service/process/GrobidRestProcessGeneric.java b/grobid-service/src/main/java/org/grobid/service/process/GrobidRestProcessGeneric.java index 0c141d64a0..20717e198a 100644 --- a/grobid-service/src/main/java/org/grobid/service/process/GrobidRestProcessGeneric.java +++ b/grobid-service/src/main/java/org/grobid/service/process/GrobidRestProcessGeneric.java @@ -6,10 +6,10 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import javax.ws.rs.core.MediaType; -import javax.ws.rs.core.Response; -import javax.ws.rs.core.Response.Status; -import javax.ws.rs.core.UriInfo; +import 
jakarta.ws.rs.core.MediaType; +import jakarta.ws.rs.core.Response; +import jakarta.ws.rs.core.Response.Status; +import jakarta.ws.rs.core.UriInfo; @Singleton public class GrobidRestProcessGeneric { diff --git a/grobid-service/src/main/java/org/grobid/service/process/GrobidRestProcessString.java b/grobid-service/src/main/java/org/grobid/service/process/GrobidRestProcessString.java index dbd78cf5b8..82bfd52a67 100644 --- a/grobid-service/src/main/java/org/grobid/service/process/GrobidRestProcessString.java +++ b/grobid-service/src/main/java/org/grobid/service/process/GrobidRestProcessString.java @@ -4,11 +4,11 @@ import java.util.ArrayList; import java.util.NoSuchElementException; -import javax.ws.rs.core.HttpHeaders; -import javax.inject.Inject; -import javax.ws.rs.core.MediaType; -import javax.ws.rs.core.Response; -import javax.ws.rs.core.Response.Status; +import jakarta.ws.rs.core.HttpHeaders; +import jakarta.inject.Inject; +import jakarta.ws.rs.core.MediaType; +import jakarta.ws.rs.core.Response; +import jakarta.ws.rs.core.Response.Status; import com.google.inject.Singleton; import org.grobid.core.data.Affiliation; diff --git a/grobid-service/src/main/java/org/grobid/service/process/GrobidRestProcessTraining.java b/grobid-service/src/main/java/org/grobid/service/process/GrobidRestProcessTraining.java index cf03ccbde1..9438ea1384 100644 --- a/grobid-service/src/main/java/org/grobid/service/process/GrobidRestProcessTraining.java +++ b/grobid-service/src/main/java/org/grobid/service/process/GrobidRestProcessTraining.java @@ -14,13 +14,13 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import javax.ws.rs.core.MediaType; -import javax.ws.rs.core.Response; -import javax.ws.rs.core.Response.Status; -import javax.ws.rs.core.UriInfo; -import javax.ws.rs.WebApplicationException; -import javax.ws.rs.core.HttpHeaders; -import javax.ws.rs.core.StreamingOutput; +import jakarta.ws.rs.core.MediaType; +import jakarta.ws.rs.core.Response; +import 
jakarta.ws.rs.core.Response.Status; +import jakarta.ws.rs.core.UriInfo; +import jakarta.ws.rs.WebApplicationException; +import jakarta.ws.rs.core.HttpHeaders; +import jakarta.ws.rs.core.StreamingOutput; import java.util.zip.ZipEntry; import java.util.zip.ZipOutputStream; diff --git a/grobid-service/src/main/java/org/grobid/service/resources/HealthResource.java b/grobid-service/src/main/java/org/grobid/service/resources/HealthResource.java index a0e2b8c7cd..1065db6946 100644 --- a/grobid-service/src/main/java/org/grobid/service/resources/HealthResource.java +++ b/grobid-service/src/main/java/org/grobid/service/resources/HealthResource.java @@ -3,12 +3,12 @@ import com.codahale.metrics.health.HealthCheck; import org.grobid.service.GrobidServiceConfiguration; -import javax.inject.Inject; -import javax.inject.Singleton; -import javax.ws.rs.GET; -import javax.ws.rs.Path; -import javax.ws.rs.Produces; -import javax.ws.rs.core.Response; +import jakarta.inject.Inject; +import jakarta.inject.Singleton; +import jakarta.ws.rs.GET; +import jakarta.ws.rs.Path; +import jakarta.ws.rs.Produces; +import jakarta.ws.rs.core.Response; @Path("health") @Singleton diff --git a/grobid-service/src/test/java/org/grobid/service/module/GrobidServiceModuleTest.java b/grobid-service/src/test/java/org/grobid/service/module/GrobidServiceModuleTest.java index b033c051e7..f6f744448b 100644 --- a/grobid-service/src/test/java/org/grobid/service/module/GrobidServiceModuleTest.java +++ b/grobid-service/src/test/java/org/grobid/service/module/GrobidServiceModuleTest.java @@ -2,20 +2,19 @@ import com.codahale.metrics.MetricRegistry; import com.fasterxml.jackson.databind.ObjectMapper; -import com.google.inject.Binder; import com.google.inject.Provides; import com.google.inject.Singleton; import io.dropwizard.configuration.ConfigurationFactory; import io.dropwizard.configuration.DefaultConfigurationFactoryFactory; import io.dropwizard.configuration.FileConfigurationSourceProvider; +import 
io.dropwizard.core.setup.Environment; import io.dropwizard.jackson.Jackson; -import io.dropwizard.setup.Environment; +import jakarta.validation.Validation; +import jakarta.validation.ValidatorFactory; import org.grobid.service.GrobidServiceConfiguration; import org.grobid.service.modules.GrobidServiceModule; import org.hibernate.validator.HibernateValidator; -import javax.validation.Validation; -import javax.validation.ValidatorFactory; public class GrobidServiceModuleTest extends GrobidServiceModule { @@ -26,15 +25,15 @@ public GrobidServiceModuleTest() { } @Override - public void configure(Binder binder) { - super.configure(binder); + public void configure() { + super.configure(); } @Provides @Singleton @Override - public GrobidServiceConfiguration getConfiguration() { + public GrobidServiceConfiguration configuration() { ObjectMapper objectMapper = Jackson.newObjectMapper(); ValidatorFactory validatorFactory = Validation @@ -58,9 +57,9 @@ public GrobidServiceConfiguration getConfiguration() { @Override @Provides - protected Environment getEnvironment() { + protected Environment environment() { return new Environment("test-grobid-service-env", new ObjectMapper(), null, new MetricRegistry(), - this.getClass().getClassLoader()); + this.getClass().getClassLoader(), null, configuration()); } diff --git a/grobid-service/src/test/java/org/grobid/service/process/GrobidRestProcessFilesTest.java b/grobid-service/src/test/java/org/grobid/service/process/GrobidRestProcessFilesTest.java index b810392049..b0513c3d07 100644 --- a/grobid-service/src/test/java/org/grobid/service/process/GrobidRestProcessFilesTest.java +++ b/grobid-service/src/test/java/org/grobid/service/process/GrobidRestProcessFilesTest.java @@ -1,6 +1,5 @@ package org.grobid.service.process; -import com.squarespace.jersey2.guice.JerseyGuiceUtils; import org.apache.pdfbox.pdmodel.PDDocument; import org.easymock.EasyMock; import org.grobid.core.document.Document; @@ -13,26 +12,21 @@ import org.junit.Test; 
import org.junit.runner.RunWith; import org.powermock.api.easymock.PowerMock; -import org.powermock.core.classloader.ClassloaderWrapper; import org.powermock.core.classloader.annotations.PrepareForTest; import org.powermock.modules.junit4.PowerMockRunner; -import java.io.ByteArrayInputStream; import java.io.File; -import java.io.FileInputStream; -import java.io.InputStream; import java.util.List; import static org.easymock.EasyMock.*; -import static org.junit.Assert.assertEquals; @RunWith(PowerMockRunner.class) @PrepareForTest({CitationsVisualizer.class, BlockVisualizer.class, FigureTableVisualizer.class}) public class GrobidRestProcessFilesTest { - static { - JerseyGuiceUtils.install((s, serviceLocator) -> null); - } +// static { +// JerseyGuiceUtils.install((s, serviceLocator) -> null); +// } DocumentSource documentSourceMock; GrobidRestProcessFiles target; diff --git a/grobid-service/src/test/java/org/grobid/service/tests/GrobidRestServiceTest.java b/grobid-service/src/test/java/org/grobid/service/tests/GrobidRestServiceTest.java index 99454a2dff..e5030c5741 100755 --- a/grobid-service/src/test/java/org/grobid/service/tests/GrobidRestServiceTest.java +++ b/grobid-service/src/test/java/org/grobid/service/tests/GrobidRestServiceTest.java @@ -14,8 +14,7 @@ package org.grobid.service.tests; import com.google.inject.Guice; -import com.squarespace.jersey2.guice.JerseyGuiceUtils; -import io.dropwizard.testing.junit.DropwizardAppRule; +import io.dropwizard.testing.junit5.DropwizardAppExtension; import org.apache.commons.io.FileUtils; import org.glassfish.jersey.client.JerseyClientBuilder; import org.glassfish.jersey.media.multipart.FormDataMultiPart; @@ -32,13 +31,13 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import javax.ws.rs.client.Client; -import javax.ws.rs.client.Entity; -import javax.ws.rs.core.Form; -import javax.ws.rs.core.MediaType; -import javax.ws.rs.core.MultivaluedHashMap; -import javax.ws.rs.core.MultivaluedMap; -import 
javax.ws.rs.core.Response; +import jakarta.ws.rs.client.Client; +import jakarta.ws.rs.client.Entity; +import jakarta.ws.rs.core.Form; +import jakarta.ws.rs.core.MediaType; +import jakarta.ws.rs.core.MultivaluedHashMap; +import jakarta.ws.rs.core.MultivaluedMap; +import jakarta.ws.rs.core.Response; import java.io.File; import java.io.IOException; @@ -63,8 +62,8 @@ public static void destroyInitialContext() throws Exception { } @ClassRule - public static DropwizardAppRule APP = - new DropwizardAppRule<>(GrobidServiceApplication.class, GrobidServiceModuleTest.TEST_CONFIG_FILE); + public static DropwizardAppExtension APP = + new DropwizardAppExtension<>(GrobidServiceApplication.class, GrobidServiceModuleTest.TEST_CONFIG_FILE); private String baseUrl() { @@ -73,7 +72,7 @@ private String baseUrl() { @Before public void setUp() throws IOException { - JerseyGuiceUtils.reset(); +// JerseyGuiceUtils.reset(); GrobidServiceModuleTest testWorkerModule = new GrobidServiceModuleTest() { // redefine methods that are needed: From 11fe10a9efb29e25ec731dcab84f5357e8e46d5a Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Mon, 19 Jun 2023 19:06:51 +0900 Subject: [PATCH 2/8] Fix config format --- grobid-home/config/grobid.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/grobid-home/config/grobid.yaml b/grobid-home/config/grobid.yaml index 6adb4da4c5..25dc787b83 100644 --- a/grobid-home/config/grobid.yaml +++ b/grobid-home/config/grobid.yaml @@ -255,17 +255,17 @@ server: registerDefaultExceptionMappers: false logging: + level: INFO loggers: org.apache.pdfbox.pdmodel.font.PDSimpleFont: "OFF" org.glassfish.jersey.internal: "OFF" appenders: - type: console - level: INFO threshold: WARN timeZone: UTC - type: file currentLogFilename: logs/grobid-service.log - threshold: ALL + threshold: INFO archive: true archivedLogFilenamePattern: logs/grobid-service-%d.log archivedFileCount: 5 From 955c5d3594b699a94106e7c147e23fdb96770781 Mon Sep 17 00:00:00 2001 From: Luca 
Foppiano Date: Mon, 19 Jun 2023 19:07:06 +0900 Subject: [PATCH 3/8] update JDK source compatibility --- build.gradle | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/build.gradle b/build.gradle index 519eaed201..15228b4782 100644 --- a/build.gradle +++ b/build.gradle @@ -53,8 +53,8 @@ subprojects { } } - sourceCompatibility = 1.8 - targetCompatibility = 1.8 + sourceCompatibility = 1.11 + targetCompatibility = 1.11 repositories { mavenCentral() @@ -359,7 +359,6 @@ project(":grobid-service") { implementation 'io.dropwizard:dropwizard-auth:4.0.0' implementation 'io.dropwizard.metrics:metrics-core:4.2.19' implementation 'io.dropwizard.metrics:metrics-servlets:4.2.19' - implementation 'jakarta.servlet:jakarta.servlet-api:6.0.0' implementation "org.apache.pdfbox:pdfbox:2.0.3" implementation "javax.activation:activation:1.1.1" From e3ef1dc9279d98a249733f1d4cba8aea2fe248b8 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Mon, 19 Jun 2023 19:09:28 +0900 Subject: [PATCH 4/8] Fix load of properties not in the yaml --- .../GrobidServicePropConfiguration.java | 3 ++ .../main/GrobidServiceApplication.java | 1 + .../service/modules/GrobidServiceModule.java | 38 +++++++++---------- 3 files changed, 23 insertions(+), 19 deletions(-) diff --git a/grobid-service/src/main/java/org/grobid/service/GrobidServicePropConfiguration.java b/grobid-service/src/main/java/org/grobid/service/GrobidServicePropConfiguration.java index 202129fb3f..096d0d9efc 100644 --- a/grobid-service/src/main/java/org/grobid/service/GrobidServicePropConfiguration.java +++ b/grobid-service/src/main/java/org/grobid/service/GrobidServicePropConfiguration.java @@ -1,8 +1,11 @@ package org.grobid.service; +import com.fasterxml.jackson.annotation.JsonIgnoreProperties; import com.fasterxml.jackson.annotation.JsonProperty; import jakarta.validation.constraints.NotEmpty; + +@JsonIgnoreProperties(ignoreUnknown = true) public class GrobidServicePropConfiguration { @NotEmpty @JsonProperty diff --git 
a/grobid-service/src/main/java/org/grobid/service/main/GrobidServiceApplication.java b/grobid-service/src/main/java/org/grobid/service/main/GrobidServiceApplication.java index f7a5a752c2..e42e90134c 100644 --- a/grobid-service/src/main/java/org/grobid/service/main/GrobidServiceApplication.java +++ b/grobid-service/src/main/java/org/grobid/service/main/GrobidServiceApplication.java @@ -1,6 +1,7 @@ package org.grobid.service.main; +import com.fasterxml.jackson.databind.DeserializationFeature; import com.google.inject.AbstractModule; import io.dropwizard.assets.AssetsBundle; import io.dropwizard.core.Application; diff --git a/grobid-service/src/main/java/org/grobid/service/modules/GrobidServiceModule.java b/grobid-service/src/main/java/org/grobid/service/modules/GrobidServiceModule.java index 7909eda991..ac418127d9 100644 --- a/grobid-service/src/main/java/org/grobid/service/modules/GrobidServiceModule.java +++ b/grobid-service/src/main/java/org/grobid/service/modules/GrobidServiceModule.java @@ -40,24 +40,24 @@ public void configure() { bind(WebApplicationExceptionMapper.class); } -// @Provides -// protected ObjectMapper getObjectMapper() { -// return getEnvironment().getObjectMapper(); -// } -// -// @Provides -// protected MetricRegistry provideMetricRegistry() { -// return getMetricRegistry(); -// } -// -// //for unit tests -// protected MetricRegistry getMetricRegistry() { -// return getEnvironment().metrics(); -// } -// -// @Provides -// Client provideClient() { -// return ClientBuilder.newClient(); -// } + @Provides + protected ObjectMapper getObjectMapper() { + return environment().getObjectMapper(); + } + + @Provides + protected MetricRegistry provideMetricRegistry() { + return getMetricRegistry(); + } + + //for unit tests + protected MetricRegistry getMetricRegistry() { + return environment().metrics(); + } + + @Provides + Client provideClient() { + return ClientBuilder.newClient(); + } } From 1f45c3f671dd6752deb341aced48e6353bcc8320 Mon Sep 17 00:00:00 2001 
From: lfoppiano Date: Tue, 20 Jun 2023 05:28:28 +0900 Subject: [PATCH 5/8] fix healthcheck error --- .../java/org/grobid/service/main/GrobidServiceApplication.java | 3 +++ .../main/java/org/grobid/service/resources/HealthResource.java | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/grobid-service/src/main/java/org/grobid/service/main/GrobidServiceApplication.java b/grobid-service/src/main/java/org/grobid/service/main/GrobidServiceApplication.java index e42e90134c..bbe8e7d889 100644 --- a/grobid-service/src/main/java/org/grobid/service/main/GrobidServiceApplication.java +++ b/grobid-service/src/main/java/org/grobid/service/main/GrobidServiceApplication.java @@ -17,6 +17,7 @@ import org.eclipse.jetty.servlets.CrossOriginFilter; import org.grobid.service.GrobidServiceConfiguration; import org.grobid.service.modules.GrobidServiceModule; +import org.grobid.service.resources.HealthResource; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import ru.vyarus.dropwizard.guice.GuiceBundle; @@ -56,6 +57,8 @@ private AbstractModule getGuiceModules() { @Override public void run(GrobidServiceConfiguration configuration, Environment environment) { + environment.healthChecks().register("health-check", new HealthResource(configuration)); + LOGGER.info("Service config={}", configuration); new DropwizardExports(environment.metrics()).register(); ServletRegistration.Dynamic registration = environment.admin().addServlet("Prometheus", new MetricsServlet()); diff --git a/grobid-service/src/main/java/org/grobid/service/resources/HealthResource.java b/grobid-service/src/main/java/org/grobid/service/resources/HealthResource.java index 1065db6946..e743209998 100644 --- a/grobid-service/src/main/java/org/grobid/service/resources/HealthResource.java +++ b/grobid-service/src/main/java/org/grobid/service/resources/HealthResource.java @@ -19,7 +19,8 @@ public class HealthResource extends HealthCheck { private GrobidServiceConfiguration configuration; @Inject - public 
HealthResource() { + public HealthResource(GrobidServiceConfiguration configuration) { + this.configuration = configuration; } @GET From 0ce86ef52f08d5ca6ecdd9bed6a184158086e6f6 Mon Sep 17 00:00:00 2001 From: lfoppiano Date: Tue, 20 Jun 2023 06:51:08 +0900 Subject: [PATCH 6/8] fix tests that requires junit4 --- build.gradle | 9 +++++++-- .../service/tests/GrobidRestServiceTest.java | 20 +++++++++---------- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/build.gradle b/build.gradle index 15228b4782..dce930585b 100644 --- a/build.gradle +++ b/build.gradle @@ -84,8 +84,10 @@ subprojects { // packaging local libs inside grobid-core.jar implementation fileTree(dir: new File(rootProject.rootDir, 'grobid-core/localLibs'), include: localLibs) - testImplementation "junit:junit:4.12" - testImplementation "org.easymock:easymock:3.4" + testRuntimeOnly 'org.junit.vintage:junit-vintage-engine:5.9.3' + testImplementation(platform('org.junit:junit-bom:5.9.3')) + testImplementation('org.junit.jupiter:junit-jupiter') + testImplementation 'org.easymock:easymock:5.1.0' testImplementation "org.powermock:powermock-api-easymock:2.0.7" testImplementation "org.powermock:powermock-module-junit4:2.0.7" testImplementation "xmlunit:xmlunit:1.6" @@ -147,6 +149,8 @@ subprojects { // } test { + useJUnitPlatform() + testLogging.showStandardStreams = true // enable for having separate test executor for different tests forkEvery = 1 @@ -354,6 +358,7 @@ project(":grobid-service") { implementation 'io.dropwizard:dropwizard-core:4.0.0' implementation 'io.dropwizard:dropwizard-assets:4.0.0' implementation 'io.dropwizard:dropwizard-testing:4.0.0' + implementation 'io.dropwizard.modules:dropwizard-testing-junit4:4.0.0' implementation 'io.dropwizard:dropwizard-forms:4.0.0' implementation 'io.dropwizard:dropwizard-client:4.0.0' implementation 'io.dropwizard:dropwizard-auth:4.0.0' diff --git a/grobid-service/src/test/java/org/grobid/service/tests/GrobidRestServiceTest.java 
b/grobid-service/src/test/java/org/grobid/service/tests/GrobidRestServiceTest.java index e5030c5741..08ab9949e8 100755 --- a/grobid-service/src/test/java/org/grobid/service/tests/GrobidRestServiceTest.java +++ b/grobid-service/src/test/java/org/grobid/service/tests/GrobidRestServiceTest.java @@ -14,7 +14,11 @@ package org.grobid.service.tests; import com.google.inject.Guice; -import io.dropwizard.testing.junit5.DropwizardAppExtension; +import io.dropwizard.testing.junit.DropwizardAppRule; +import io.dropwizard.testing.junit5.DropwizardExtensionsSupport; +import jakarta.ws.rs.client.Client; +import jakarta.ws.rs.client.Entity; +import jakarta.ws.rs.core.*; import org.apache.commons.io.FileUtils; import org.glassfish.jersey.client.JerseyClientBuilder; import org.glassfish.jersey.media.multipart.FormDataMultiPart; @@ -28,16 +32,10 @@ import org.grobid.service.module.GrobidServiceModuleTest; import org.grobid.service.util.BibTexMediaType; import org.junit.*; +import org.junit.jupiter.api.extension.ExtendWith; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import jakarta.ws.rs.client.Client; -import jakarta.ws.rs.client.Entity; -import jakarta.ws.rs.core.Form; -import jakarta.ws.rs.core.MediaType; -import jakarta.ws.rs.core.MultivaluedHashMap; -import jakarta.ws.rs.core.MultivaluedMap; -import jakarta.ws.rs.core.Response; import java.io.File; import java.io.IOException; @@ -50,6 +48,8 @@ * * @author Florian Zipser */ + +@ExtendWith(DropwizardExtensionsSupport.class) public class GrobidRestServiceTest { private static final Logger LOGGER = LoggerFactory.getLogger(GrobidRestServiceTest.class); @@ -62,8 +62,8 @@ public static void destroyInitialContext() throws Exception { } @ClassRule - public static DropwizardAppExtension APP = - new DropwizardAppExtension<>(GrobidServiceApplication.class, GrobidServiceModuleTest.TEST_CONFIG_FILE); + public static DropwizardAppRule APP = + new DropwizardAppRule<>(GrobidServiceApplication.class, 
GrobidServiceModuleTest.TEST_CONFIG_FILE); private String baseUrl() { From 7346d1c3b599a426254f9d37d7a36544d08bfa3f Mon Sep 17 00:00:00 2001 From: lopez Date: Fri, 17 Nov 2023 18:01:49 +0100 Subject: [PATCH 7/8] unsuccessfully tried to manage hibernate warning --- build.gradle | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/build.gradle b/build.gradle index b37021c91a..d7c160b3f0 100644 --- a/build.gradle +++ b/build.gradle @@ -34,6 +34,8 @@ allprojects { tasks.withType(JavaCompile) { options.encoding = 'UTF-8' + // note: the following is not working + options.compilerArgs << '-parameters' } } @@ -357,8 +359,8 @@ project(":grobid-service") { implementation 'io.dropwizard:dropwizard-forms:4.0.0' implementation 'io.dropwizard:dropwizard-client:4.0.0' implementation 'io.dropwizard:dropwizard-auth:4.0.0' - implementation 'io.dropwizard.metrics:metrics-core:4.2.19' - implementation 'io.dropwizard.metrics:metrics-servlets:4.2.19' + implementation 'io.dropwizard.metrics:metrics-core:4.2.22' + implementation 'io.dropwizard.metrics:metrics-servlets:4.2.22' implementation "org.apache.pdfbox:pdfbox:2.0.3" implementation "javax.activation:activation:1.1.1" From 8df47ec2bc0a9313ea3146a775bd39d4099fee98 Mon Sep 17 00:00:00 2001 From: lopez Date: Fri, 17 Nov 2023 22:40:37 +0100 Subject: [PATCH 8/8] update documentation for java 11 min requirement, and more generally for next version --- Readme.md | 3 ++- doc/Configuration.md | 12 ++++++++++-- doc/Grobid-service.md | 27 ++++++++++++++++----------- doc/Install-Grobid.md | 13 +++++++------ doc/Introduction.md | 7 ++++--- doc/Run-Grobid.md | 24 ++++++++++++++++++++++++ doc/Training-the-models-of-Grobid.md | 2 ++ doc/index.md | 14 +++++++++++--- mkdocs.yml | 5 +++-- 9 files changed, 79 insertions(+), 28 deletions(-) create mode 100644 doc/Run-Grobid.md diff --git a/Readme.md b/Readme.md index b631b8d448..b6b74e46cf 100644 --- a/Readme.md +++ b/Readme.md @@ -24,7 +24,7 @@ The following functionalities are 
available: - __Header extraction and parsing__ from article in PDF format. The extraction here covers the usual bibliographical information (e.g. title, abstract, authors, affiliations, keywords, etc.). - __References extraction and parsing__ from articles in PDF format, around .87 F1-score against on an independent PubMed Central set of 1943 PDF containing 90,125 references, and around .90 on a similar bioRxiv set of 2000 PDF (using the Deep Learning citation model). All the usual publication metadata are covered (including DOI, PMID, etc.). - __Citation contexts recognition and resolution__ of the full bibliographical references of the article. The accuracy of citation contexts resolution is between .76 and .91 F1-score depending on the evaluation collection (this corresponds to both the correct identification of the citation callout and its correct association with a full bibliographical reference). -- __Full text extraction and structuring__ from PDF articles, including a model for the overall document segmentation and models for the structuring of the text body (paragraph, section titles, reference and footnote callouts, figures, tables, etc.). +- __Full text extraction and structuring__ from PDF articles, including a model for the overall document segmentation and models for the structuring of the text body (paragraph, section titles, reference and footnote callouts, figures, tables, data availability statements, etc.). - __PDF coordinates__ for extracted information, allowing to create "augmented" interactive PDF based on bounding boxes of the identified structures. - Parsing of __references in isolation__ (above .90 F1-score at instance-level, .95 F1-score at field level, using the Deep Learning model). - __Parsing of names__ (e.g. person title, forenames, middle name, etc.), in particular author names in header, and author names in references (two distinct models). 
@@ -32,6 +32,7 @@ The following functionalities are available: - __Parsing of dates__, ISO normalized day, month, year. - __Consolidation/resolution of the extracted bibliographical references__ using the [biblio-glutton](https://github.com/kermitt2/biblio-glutton) service or the [CrossRef REST API](https://github.com/CrossRef/rest-api-doc). In both cases, DOI/PMID resolution performance is higher than 0.95 F1-score from PDF extraction. - __Extraction and parsing of patent and non-patent references in patent__ publications. +- __Extraction of Funders and funding information__ with optional matching of extracted funders with the CrossRef Funder Registry. In a complete PDF processing, GROBID manages 55 final labels used to build relatively fine-grained structures, from traditional publication metadata (title, author first/last/middle names, affiliation types, detailed address, journal, volume, issue, pages, DOI, PMID, etc.) to full text structures (section title, paragraph, reference markers, head/foot notes, figure captions, etc.). 
diff --git a/doc/Configuration.md b/doc/Configuration.md index 9c778ffeae..8ea092594b 100644 --- a/doc/Configuration.md +++ b/doc/Configuration.md @@ -207,15 +207,23 @@ logging: level: INFO loggers: org.apache.pdfbox.pdmodel.font.PDSimpleFont: "OFF" + org.glassfish.jersey.internal: "OFF" + com.squarespace.jersey2.guice.JerseyGuiceUtils: "OFF" appenders: - type: console - threshold: ALL + threshold: WARN timeZone: UTC + # uncomment to have the logs in json format + #layout: + # type: json - type: file currentLogFilename: logs/grobid-service.log - threshold: ALL + threshold: INFO archive: true archivedLogFilenamePattern: logs/grobid-service-%d.log archivedFileCount: 5 timeZone: UTC + # uncomment to have the logs in json format + #layout: + # type: json ``` diff --git a/doc/Grobid-service.md b/doc/Grobid-service.md index fb5e125795..b9d2916b9e 100644 --- a/doc/Grobid-service.md +++ b/doc/Grobid-service.md @@ -2,7 +2,12 @@ The GROBID Web API provides a simple and efficient way to use the tool. A service console is available to test GROBID in a human friendly manner. For production and benchmarking, we strongly recommand to use this web service mode on a multi-core machine and to avoid running GROBID in the batch mode. -## Start the server with Gradle +## Start the server with Docker + +This is the recommended and standard way to run the Grobid web services. + + +## Start a development server with Gradle Go under the `grobid/` main directory. Be sure that the GROBID project is built, see [Install GROBID](Install-Grobid.md). 
@@ -16,7 +21,7 @@ The following command will start the server on the default port __8070__: ## Install and run the service as standalone application -You could also build and install the service as a standalone service (let's supposed the destination directory is grobid-installation) +From a development installation, you can also build and install the service as a standalone service - here let's suppose the destination directory is grobid-installation: ```console ./gradlew clean assemble @@ -57,16 +62,16 @@ If required, modify the file under `grobid/grobid-home/config/grobid.yaml` for s You can choose to load all the models at the start of the service or lazily when a model is used the first time, the latter being the default. Loading all models at service startup will slow down the start of the server and will use more memories than the lazy mode in case only a few services will be used. -For preloading all the models, set the following config parameter to `true`: +Preloading all the models at server start is the default setting, but you can choose lazy loading of the models instead: ```yaml grobid: # for **service only**: how to load the models, - # false -> models are loaded when needed (default), avoiding putting in memory useless models but slow down significantly - # the service at first call - # true -> all the models are loaded into memory at the server startup, slow the start of the services and models not - # used will take some memory, but server is immediatly warm and ready - modelPreload: false + # false -> models are loaded when needed, avoiding putting in memory useless models (only in case of CRF) but slow down + # significantly the service at first call + # true -> all the models are loaded into memory at the server startup (default), slow the start of the services + # and models not used will take some more memory (only in case of CRF), but server is immediately warm and ready + modelPreload: true ``` ## CORS (Cross-Origin Resource Share) @@ -89,13 +94,13 @@
We provide clients written in Python, Java, node.js using the GROBID PDF-to-TEI * Java GROBID client * Node.js GROBID client -All these clients will take advantage of the multi-threading for scaling PDF batch processing. As a consequence, they will be much more efficient than the [batch command lines](Grobid-batch.md) (which use only one thread) and should be prefered. +All these clients will take advantage of the multi-threading for scaling PDF batch processing. As a consequence, they will be much more efficient than the [batch command lines](Grobid-batch.md) (which use only one thread) and should be preferred. The Python client is the most up-to-date and complete and can be adapted to your needs. ## Use GROBID test console -On your browser, the welcome page of the Service console is available at the URL . +On your browser, the welcome page of the service console is available at the URL http://localhost:8070. -On the console, the RESTful API can be tested under the `TEI` tab for service returning a TEI document, under the `PDF` tab for services returning annotations relative to PDF or an annotated PDF and under the `Patent` tab for patent-related services: +On the service console, the RESTful API can be tested under the `TEI` tab for service returning a TEI document, under the `PDF` tab for services returning annotations relative to PDF or an annotated PDF and under the `Patent` tab for patent-related services: ![Example of GROBID Service console usage](img/grobid-rest-example.png) diff --git a/doc/Install-Grobid.md b/doc/Install-Grobid.md index 333a7375a0..f207aff4c2 100644 --- a/doc/Install-Grobid.md +++ b/doc/Install-Grobid.md @@ -1,8 +1,10 @@ -

Install GROBID

> +

Install a GROBID development environment

> -## Getting GROBID +## Getting the GROBID project source -GROBID requires a JVM installed on your machine, we tested the tool successfully up version **JVM 17**. Other recent JVM versions should work correctly. +For building GROBID yourself, a JDK must be installed on your machine. We tested the tool successfully from **JDK 11** up to version **JDK 17**. Other recent JDK versions should work correctly. + +Note: Java/JDK 8 is no longer supported as of Grobid version `0.8.0`; the minimum requirement for Java is JDK 11. ### Latest stable release @@ -29,7 +31,7 @@ Or download directly the zip file: > unzip master ``` -## Build GROBID +## Build GROBID from the source **Please make sure that Grobid is installed in a path with no parent directories containing spaces.** @@ -59,9 +61,8 @@ systemProp.https.proxyUser=username systemProp.https.proxyPassword=password ``` -## Use GROBID +## Use a built GROBID project From there, the easiest and most efficient way to use GROBID is the [web service mode](Grobid-service.md). You can also use the tool in [batch mode](Grobid-batch.md) or integrate it in your Java project via the [Java API](Grobid-java-library.md). - diff --git a/doc/Introduction.md b/doc/Introduction.md index fe76df7b3f..c30b0bc945 100644 --- a/doc/Introduction.md +++ b/doc/Introduction.md @@ -22,7 +22,7 @@ The following functionalities are available: - __Header extraction and parsing__ from article in PDF format. The extraction here covers the usual bibliographical information (e.g. title, abstract, authors, affiliations, keywords, etc.). - __References extraction and parsing__ from articles in PDF format, around .87 F1-score against on an independent PubMed Central set of 1943 PDF containing 90,125 references, and around .90 on a similar bioRxiv set of 2000 PDF (using the Deep Learning citation model). All the usual publication metadata are covered (including DOI, PMID, etc.).
- __Citation contexts recognition and resolution__ of the full bibliographical references of the article. The accuracy of citation contexts resolution is between .76 and .91 F1-score depending on the evaluation collection (this corresponds to both the correct identification of the citation callout and its correct association with a full bibliographical reference). -- __Full text extraction and structuring__ from PDF articles, including a model for the overall document segmentation and models for the structuring of the text body (paragraph, section titles, reference and footnote callouts, figures, tables, etc.). +- __Full text extraction and structuring__ from PDF articles, including a model for the overall document segmentation and models for the structuring of the text body (paragraph, section titles, reference and footnote callouts, figures, tables, data availability statements, etc.). - __PDF coordinates__ for extracted information, allowing to create "augmented" interactive PDF based on bounding boxes of the identified structures. - Parsing of __references in isolation__ (above .90 F1-score at instance-level, .95 F1-score at field level, using the Deep Learning model). - __Parsing of names__ (e.g. person title, forenames, middle name, etc.), in particular author names in header, and author names in references (two distinct models). @@ -30,8 +30,9 @@ The following functionalities are available: - __Parsing of dates__, ISO normalized day, month, year. - __Consolidation/resolution of the extracted bibliographical references__ using the [biblio-glutton](https://github.com/kermitt2/biblio-glutton) service or the [CrossRef REST API](https://github.com/CrossRef/rest-api-doc). In both cases, DOI/PMID resolution performance is higher than 0.95 F1-score from PDF extraction. - __Extraction and parsing of patent and non-patent references in patent__ publications. 
+- __Extraction of Funders and funding information__ with optional matching of extracted funders with the CrossRef Funder Registry. -In a complete PDF processing, GROBID manages 55 final labels used to build relatively fine-grained structures, from traditional publication metadata (title, author first/last/middle names, affiliation types, detailed address, journal, volume, issue, pages, DOI, PMID, etc.) to full text structures (section title, paragraph, reference markers, head/foot notes, figure captions, etc.). +In a complete PDF processing, GROBID manages more than 55 final labels used to build relatively fine-grained structures, from traditional publication metadata (title, author first/last/middle names, affiliation types, detailed address, journal, volume, issue, pages, DOI, PMID, etc.) to full text structures (section title, paragraph, reference markers, head/foot notes, figure captions, etc.). GROBID includes a comprehensive [web service API](https://grobid.readthedocs.io/en/latest/Grobid-service/), [Docker images](https://grobid.readthedocs.io/en/latest/Grobid-docker/), [batch processing](https://grobid.readthedocs.io/en/latest/Grobid-batch/), a JAVA API, a generic [training and evaluation framework](https://grobid.readthedocs.io/en/latest/Training-the-models-of-Grobid/) (precision, recall, etc., n-fold cross-evaluation), systematic [end-to-end benchmarking](https://grobid.readthedocs.io/en/latest/Benchmarking/) on thousand documents and the semi-automatic generation of training data. @@ -42,7 +43,7 @@ The key aspects of GROBID are the following ones: + Written in Java, with JNI call to native CRF libraries and/or Deep Learning libraries via Python JNI bridge. + Speed - on low profile Linux machine (8 threads): header extraction from 4000 PDF in 2 minutes (36 PDF per second with the RESTful API), parsing of 3500 references in 4 seconds, full processing of 4000 PDF (full body, header and reference, structured) in 26 minutes (around 2.5 PDF per second). 
+ Scalability and robustness: We have been able recently to run the complete fulltext processing at around 10.6 PDF per second (around 915,000 PDF per day, around 20M pages per day) during one week on one 16 CPU machine (16 threads, 32GB RAM, no SDD, articles from mainstream publishers), see [here](https://github.com/kermitt2/grobid/issues/443#issuecomment-505208132) (11.3M PDF were processed in 6 days by 2 servers without crash). -+ Lazy loading of models and resources. Depending on the selected process, only the required data are loaded in memory. For instance, extracting only metadata header from a PDF requires less than 2 GB memory in a multithreading usage, extracting citations uses around 3GB and extracting all the PDF structures around 4GB. ++ Optional lazy loading of models and resources. Depending on the selected process, only the required data are loaded in memory. For instance, extracting only metadata header from a PDF requires less than 2 GB memory in a multithreading usage, extracting citations uses around 3GB and extracting all the PDF structures around 4GB. + Robust and fast PDF processing with [pdfalto](https://github.com/kermitt2/pdfalto), based on xpdf, and dedicated post-processing. + Modular and reusable machine learning models for sequence labelling. The default extractions are based on Linear Chain Conditional Random Fields, with the possibility to use various Deep Learning architectures for sequence labelling (including ELMo and BERT-CRF) for improving accuracy. The specialized sequence labelling models are cascaded to build a complete (hierarchical) document structure. + Full encoding in [__TEI__](http://www.tei-c.org/Guidelines/P5/index.xml), both for the training corpus and the parsed results. diff --git a/doc/Run-Grobid.md b/doc/Run-Grobid.md new file mode 100644 index 0000000000..127229a6d1 --- /dev/null +++ b/doc/Run-Grobid.md @@ -0,0 +1,24 @@ +

Run GROBID

> + +The standard way to run Grobid is to use Docker for starting a Grobid server. + +For installing Docker on your system, see [here](https://docs.docker.com/engine/understanding-docker/). + +For convenience, we provide two docker images: + +- the **full** image provides the best accuracy, because it includes all the required python and TensorFlow libraries, GPU support and all Deep Learning model resources. However it requires more resources, ideally a GPU (it will be automatically detected). If you have a limited amount of PDF, a good machine, and prioritize accuracy, use this Grobid flavor. To run this version of Grobid, the command is: + +```console +docker run --rm --gpus all --init --ulimit core=0 -p 8070:8070 grobid/grobid:0.7.3 +``` + +- the **lightweight** one offers best performance in terms of runtime, memory usage and Docker image size. However, it does not use some of the best performing models in terms of accuracy. If you have a lot of PDF to process, a low resource system, and accuracy is not so important, use this flavor: + +```console +docker run --rm --init --ulimit core=0 -p 8070:8070 lfoppiano/grobid:0.7.3 +``` + +More documentation on the Docker images can be found [here](Grobid-docker.md). + +From there, you can check on your browser if the service works fine by accessing the welcome page of the service console, available at the URL http://localhost:8070. The GROBID server can be used via the [web service](Grobid-service.md). + diff --git a/doc/Training-the-models-of-Grobid.md b/doc/Training-the-models-of-Grobid.md index aad2f0eebc..f6d84a8f5d 100644 --- a/doc/Training-the-models-of-Grobid.md +++ b/doc/Training-the-models-of-Grobid.md @@ -28,6 +28,8 @@ Grobid uses different sequence labelling models depending on the labeling task t * table +* funding-acknowledgement + The models are located under `grobid/grobid-home/models`. Each of these models can be retrained using amended or additional training data.
For production, a model is trained with all the available training data to maximize the performance. For development purposes, it is also possible to evaluate a model with part of the training data as frozen set (e.g. holdout set), automatic random split or apply 10-fold cross-evaluation. ## Train and evaluate diff --git a/doc/index.md b/doc/index.md index 66bd7e0810..5af27ef658 100644 --- a/doc/index.md +++ b/doc/index.md @@ -13,12 +13,14 @@

User manual

-* [Install GROBID](Install-Grobid.md) - -* [Use GROBID with containers (Docker)](Grobid-docker.md) +* [Run GROBID](Run-Grobid.md) * [Use GROBID as a service](Grobid-service.md) +* [Build a GROBID development environment](Install-Grobid.md) + +* [Manage GROBID with containers (Docker)](Grobid-docker.md) + * [Use GROBID in batch mode](Grobid-batch.md) * [GROBID configuration](Configuration.md) @@ -42,9 +44,13 @@

Benchmarking

* [Description](Benchmarking.md) + * [Evaluation PubMed Central](Benchmarking-pmc.md) + * [Evaluation bioRxiv](Benchmarking-biorxiv.md) + * [Evaluation PLOS](Benchmarking-plos.md) + * [Evaluation eLife](Benchmarking-elife.md)

Annotation guidelines

@@ -66,7 +72,9 @@

Developer notes

* [Notes for the Grobid Developers](Notes-grobid-developers.md) + * [Using Deep Learning models instead of default CRF](Deep-Learning-models.md) + * [Recompiling and integrating CRF libraries into GROBID](Recompiling-and-integrating-CRF-libraries.md) diff --git a/mkdocs.yml b/mkdocs.yml index f0e256fb10..d6b9b08d51 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -16,9 +16,10 @@ nav: - 'References': 'References.md' - 'Licence': 'License.md' - User manual: - - 'Install GROBID': 'Install-Grobid.md' - - 'GROBID with containers': 'Grobid-docker.md' + - 'Run GROBID': 'Run-Grobid.md' - 'GROBID service': 'Grobid-service.md' + - 'Build GROBID from source': 'Install-Grobid.md' + - 'GROBID with containers': 'Grobid-docker.md' - 'GROBID batch mode': 'Grobid-batch.md' - 'GROBID configuration': 'Configuration.md' - 'Troubleshooting and known issues': 'Troubleshooting.md'