diff --git a/.github/workflows/ci-build-manual.yml b/.github/workflows/ci-build-manual.yml index 21a52548..606b0ce3 100644 --- a/.github/workflows/ci-build-manual.yml +++ b/.github/workflows/ci-build-manual.yml @@ -39,9 +39,9 @@ jobs: dockerfile: Dockerfile username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_TOKEN }} - image: lfoppiano/Pub2TEI + image: lfoppiano/pub2tei registry: docker.io pushImage: true tags: latest-develop - name: Image digest - run: echo ${{ steps.docker_build.outputs.digest }} \ No newline at end of file + run: echo ${{ steps.docker_build.outputs.digest }} diff --git a/Readme.md b/Readme.md index 3b929e84..5776335c 100644 --- a/Readme.md +++ b/Readme.md @@ -72,9 +72,8 @@ git clone https://github.com/kermitt2/Pub2TEI cd client python3 pub2tei_client.py --help -usage: pub2tei_client.py [-h] [--input INPUT] [--output OUTPUT] [--config CONFIG] [--n N] - [--consolidate_references] [--segment_sentences] [--grobid_refine] [--force] - [--verbose] +usage: pub2tei_client.py [-h] --input INPUT [--output OUTPUT] [--config CONFIG] [--n N] [--consolidate_references] [--segment_sentences] + [--generate_ids] [--grobid_refine] [--force] [--verbose] Client for Pub2TEI services @@ -86,10 +85,9 @@ optional arguments: --n N concurrency for service usage --consolidate_references use GROBID for consolidation of the bibliographical references - --segment_sentences segment sentences in the text content of the document with additional - elements - --grobid_refine use Grobid to structure/enhance raw fields: affiliations, references, person, - dates + --segment_sentences segment sentences in the text content of the document with additional elements + --generate_ids Generate idenfifier for each text item + --grobid_refine use Grobid to structure/enhance raw fields: affiliations, references, person, dates --force force re-processing pdf input files when tei output files already exist --verbose print information about processed files in the console 
``` @@ -112,12 +110,13 @@ Note that the consolidation is realized with the consolidation service indicated Tranform a publisher XML into TEI XML format, with optional enhancements. -| method | request type | response type | parameters | requirement | description | -|--- |--- |--- |--- |--- |--- | -| POST | `multipart/form-data` | `application/xml` | `input` | required | publisher XML file to be processed | -| | | | `segmentSentences` | optional | Boolean, if true the paragraphs structures in the resulting TEI will be further segmented into sentence elements | -| | | | `grobidRefine` | optional | Boolean, if true the raw affiliations and raw biblographical reference strings will be parsed with Grobid and the resulting structured information added in the transformed TEI XML | +| method | request type | response type | parameters | requirement | description | +|--- |--- |--- |-------------------------|--- |----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| POST | `multipart/form-data` | `application/xml` | `input` | required | publisher XML file to be processed | +| | | | `segmentSentences` | optional | Boolean, if true the paragraphs structures in the resulting TEI will be further segmented into sentence elements | +| | | | `grobidRefine` | optional | Boolean, if true the raw affiliations and raw biblographical reference strings will be parsed with Grobid and the resulting structured information added in the transformed TEI XML | | | | | `consolidateReferences` | optional | Consolidate all the biblographical references, `consolidateReferences` is a string of value `0` (no consolidation, default value) or `1` (consolidate and inject all extra metadata), or `2` (consolidate the citation and inject DOI only). 
| +| | | | `generateIDs` | optional | Inject the attribute `xml:id` in the textual elements (`title`, `note`, `term`, `keywords`, `p`, `s`) | Response status codes: diff --git a/client/pub2tei_client.py b/client/pub2tei_client.py index ff2d62c8..ee3e3cba 100644 --- a/client/pub2tei_client.py +++ b/client/pub2tei_client.py @@ -17,11 +17,11 @@ class ServerUnavailableException(Exception): class Pub2TEIClient(ApiClient): - def __init__(self, pub2tei_server='localhost', - batch_size=1000, + def __init__(self, pub2tei_server='localhost', + batch_size=1000, sleep_time=1, timeout=60, - config_path=None, + config_path=None, check_server=True): self.config = { 'pub2tei_server': pub2tei_server, @@ -81,6 +81,7 @@ def process( consolidate_references=True, segment_sentences=False, grobid_refine=False, + generate_ids=False, force=True, verbose=False, ): @@ -107,6 +108,7 @@ def process( consolidate_references, segment_sentences, grobid_refine, + generate_ids, force, verbose, ) @@ -122,6 +124,7 @@ def process( consolidate_references, segment_sentences, grobid_refine, + generate_ids, force, verbose, ) @@ -135,6 +138,7 @@ def process_batch( consolidate_references, segment_sentences, grobid_refine, + generate_ids, force, verbose=False, ): @@ -150,13 +154,14 @@ def process_batch( if not force and os.path.isfile(filename): print(filename, "already exist, skipping... 
(use --force to reprocess pdf input files)") continue - + r = executor.submit( self.process_xml, input_file, consolidate_references, segment_sentences, - grobid_refine) + grobid_refine, + generate_ids) results.append(r) @@ -190,7 +195,8 @@ def process_xml( xml_file, consolidate_references, segment_sentences, - grobid_refine + grobid_refine, + generate_ids ): xml_handle = open(xml_file, "rb") files = { @@ -201,7 +207,7 @@ def process_xml( {"Expires": "0"}, ) } - + the_url = self.get_server_url("processXML") # set the Pub2TEI parameters @@ -212,6 +218,8 @@ def process_xml( the_data["segmentSentences"] = "1" if grobid_refine: the_data["grobidRefine"] = "1" + if generate_ids: + the_data["generateIDs"] = "1" try: res, status = self.post( @@ -224,7 +232,8 @@ def process_xml( xml_file, consolidate_references, segment_sentences, - grobid_refine + grobid_refine, + generate_ids ) except requests.exceptions.ReadTimeout: xml_handle.close() @@ -240,7 +249,9 @@ def main(): parser = argparse.ArgumentParser(description="Client for Pub2TEI services") parser.add_argument( - "--input", default=None, help="path to the directory containing XML files to process: .xml" + "--input", + required=True, + help="path to the directory containing XML files to process: .xml" ) parser.add_argument( "--output", @@ -252,7 +263,11 @@ def main(): default="./config.json", help="path to the config file, default is ./config.json", ) - parser.add_argument("--n", default=10, help="concurrency for service usage") + parser.add_argument( + "--n", + default=10, + help="concurrency for service usage" + ) parser.add_argument( "--consolidate_references", action="store_true", @@ -261,16 +276,25 @@ def main(): parser.add_argument( "--segment_sentences", action="store_true", + default=False, help="segment sentences in the text content of the document with additional elements", ) + parser.add_argument( + "--generate_ids", + action="store_true", + default=False, + help="Generate idenfifier for each text item", + ) 
parser.add_argument( "--grobid_refine", action="store_true", + default=False, help="use Grobid to structure/enhance raw fields: affiliations, references, person, dates", ) parser.add_argument( "--force", action="store_true", + default=False, help="force re-processing pdf input files when tei output files already exist", ) parser.add_argument( @@ -305,7 +329,8 @@ def main(): consolidate_references = args.consolidate_references segment_sentences = args.segment_sentences grobid_refine = args.grobid_refine - + generate_ids = args.generate_ids + force = args.force verbose = args.verbose @@ -323,6 +348,7 @@ def main(): consolidate_references=consolidate_references, segment_sentences=segment_sentences, grobid_refine=grobid_refine, + generate_ids=generate_ids, force=force, verbose=verbose, ) diff --git a/src/main/java/org/pub2tei/document/DocumentProcessor.java b/src/main/java/org/pub2tei/document/DocumentProcessor.java index e63bcfc4..3418c361 100644 --- a/src/main/java/org/pub2tei/document/DocumentProcessor.java +++ b/src/main/java/org/pub2tei/document/DocumentProcessor.java @@ -99,6 +99,10 @@ public String processTEI(File file, boolean segmentSentences, boolean refine, in * Process a TEI XML format */ public String processTEI(String tei, boolean segmentSentences, boolean refine, int consolidateReferences) throws IOException { + return processTEI(tei, segmentSentences, refine, consolidateReferences, false); + } + + public String processTEI(String tei, boolean segmentSentences, boolean refine, int consolidateReferences, boolean generateIDs) throws IOException { if (tei == null || tei.length() == 0) return null; try { @@ -120,6 +124,11 @@ public String processTEI(String tei, boolean segmentSentences, boolean refine, i XMLUtilities.fixSegmentedFigureTableList(document); } + if (generateIDs) { + org.w3c.dom.Element root = document.getDocumentElement(); + XMLUtilities.generateIDs(document, root); + } + if (refine) { // in case we have raw fields that can be further refined 
(like raw affiliation string, // raw reference string, etc.), use Grobid to add some parsed sub-structures together @@ -159,7 +168,7 @@ public String processTEI(String tei, boolean segmentSentences, boolean refine, i * @return TEI string */ - public String processXML(File file, boolean segmentSentences, boolean refine, int consolidateReferences) throws Exception { + public String processXML(File file, boolean segmentSentences, boolean refine, int consolidateReferences, boolean generateIDs) throws Exception { InputStream inputStream = null; try { @@ -168,17 +177,17 @@ public String processXML(File file, boolean segmentSentences, boolean refine, in LOGGER.error("Invalid input file: " + file.getAbsolutePath(), e); } - return processXML(inputStream, segmentSentences, refine, consolidateReferences); + return processXML(inputStream, segmentSentences, refine, consolidateReferences, generateIDs); } - public String processXML(InputStream inputStream, boolean segmentSentences, boolean refine, int consolidateReferences) throws Exception { + public String processXML(InputStream inputStream, boolean segmentSentences, boolean refine, int consolidateReferences, boolean generateIDs) throws Exception { if (inputStream == null) return null; String tei = null; try { tei = this.pub2TEIProcessor.transform(inputStream); - tei = processTEI(tei, segmentSentences, refine, consolidateReferences); + tei = processTEI(tei, segmentSentences, refine, consolidateReferences, generateIDs); } catch (final Exception exp) { LOGGER.error("An error occured while processing the XML input stream", exp); } diff --git a/src/main/java/org/pub2tei/document/XMLUtilities.java b/src/main/java/org/pub2tei/document/XMLUtilities.java index 253c9ec2..1156b78b 100644 --- a/src/main/java/org/pub2tei/document/XMLUtilities.java +++ b/src/main/java/org/pub2tei/document/XMLUtilities.java @@ -4,6 +4,7 @@ import org.apache.commons.collections.CollectionUtils; import org.apache.commons.io.FileUtils; import 
org.apache.commons.lang3.StringUtils; +import org.grobid.core.utilities.KeyGen; import org.grobid.core.utilities.OffsetPosition; import org.grobid.core.utilities.SentenceUtilities; import org.slf4j.Logger; @@ -40,6 +41,7 @@ public class XMLUtilities { private static List textualElements = Arrays.asList("p", "figDesc"); private static List noSegmentationElements = Arrays.asList("listBibl", "table"); + private static List elementsWithIds = Arrays.asList("s", "p", "title", "note", "term", "keywords"); private static DocumentBuilderFactory factory = getReasonableDocumentBuilderFactory(); @@ -243,7 +245,6 @@ public static void segment(org.w3c.dom.Document doc, Node node) { textBuffer.append(" "); } } - textBuffer.append(serializedString); } String text = textBuffer.toString(); @@ -556,6 +557,32 @@ public static String reformatTEI(String tei) { return tei; } + public static void generateIDs(org.w3c.dom.Document doc, Node node) { + final NodeList children = node.getChildNodes(); + final int nbChildren = children.getLength(); + + List newChildren = new ArrayList<>(); + for (int i = 0; i < nbChildren; i++) { + newChildren.add(children.item(i)); + } + + factory.setNamespaceAware(true); + + for (int i = 0; i < nbChildren; i++) { + final Node n = newChildren.get(i); + if (n.getNodeType() == Node.ELEMENT_NODE + && elementsWithIds.contains(n.getNodeName())) { + Element nodeAsElement = ((Element) n); + if (!nodeAsElement.hasAttribute("xml:id")) { + String divID = "_" + KeyGen.getKey().substring(0, 7); + ((Element) n).setAttribute("xml:id", divID); + } + XMLUtilities.generateIDs(doc, n); + } else if (n.getNodeType() == Node.ELEMENT_NODE) { + XMLUtilities.generateIDs(doc, n); + } + } + } /** * This method is similar to the usual Element.getTextContent() (get all text under the element diff --git a/src/main/java/org/pub2tei/service/ProcessFile.java b/src/main/java/org/pub2tei/service/ProcessFile.java index cf4bea16..4e9214d8 100644 --- 
a/src/main/java/org/pub2tei/service/ProcessFile.java +++ b/src/main/java/org/pub2tei/service/ProcessFile.java @@ -41,16 +41,17 @@ public ProcessFile() { * @param segmentSentences if true, return results with segmented sentences * @return a response object containing the converted/refined TEI XML */ - public static Response processXML(final InputStream inputStream, - final boolean segmentSentences, - final boolean refine, - final int consolidateReferences, - ServiceConfiguration serviceConfiguration) { + public static Response processXML(final InputStream inputStream, + final boolean segmentSentences, + final boolean refine, + final int consolidateReferences, + ServiceConfiguration serviceConfiguration, + final boolean generateIDs) { LOGGER.debug(methodLogIn()); Response response = null; try { DocumentProcessor documentProcessor = new DocumentProcessor(serviceConfiguration); - String result = documentProcessor.processXML(inputStream, segmentSentences, refine, consolidateReferences); + String result = documentProcessor.processXML(inputStream, segmentSentences, refine, consolidateReferences, generateIDs); if (result == null || result.length() == 0) { response = Response.status(Response.Status.NO_CONTENT).build(); diff --git a/src/main/java/org/pub2tei/service/ProcessString.java b/src/main/java/org/pub2tei/service/ProcessString.java index df128a32..91128989 100644 --- a/src/main/java/org/pub2tei/service/ProcessString.java +++ b/src/main/java/org/pub2tei/service/ProcessString.java @@ -39,7 +39,8 @@ public static Response processText(String text, final boolean segmentSentences, final boolean refine, final int consolidateReferences, - ServiceConfiguration serviceConfiguration) { + ServiceConfiguration serviceConfiguration, + final boolean generateIDs) { LOGGER.debug(methodLogIn()); Response response = null; @@ -55,7 +56,7 @@ public static Response processText(String text, DocumentProcessor documentProcessor = new DocumentProcessor(serviceConfiguration); InputStream inputStream 
= new ByteArrayInputStream(text.getBytes()); - String retValString = documentProcessor.processXML(inputStream, segmentSentences, refine, consolidateReferences); + String retValString = documentProcessor.processXML(inputStream, segmentSentences, refine, consolidateReferences, generateIDs); if (!isResultOK(retValString)) { response = Response.status(Response.Status.NO_CONTENT).build(); diff --git a/src/main/java/org/pub2tei/service/ServiceController.java b/src/main/java/org/pub2tei/service/ServiceController.java index c05bb3f9..7c8f786d 100644 --- a/src/main/java/org/pub2tei/service/ServiceController.java +++ b/src/main/java/org/pub2tei/service/ServiceController.java @@ -40,6 +40,7 @@ public class ServiceController implements Pub2TEIPaths { private static final String REFINE_FUNDERS = "funderRefine"; private static final String CONSOLIDATE_REFERENCES = "consolidateReferences"; + private static final String GENERATE_IDS = "generateIDs"; private static final String CONSOLIDATE_HEADER = "consolidateHeader"; private static final String CONSOLIDATE_FUNDERS = "consolidateFunders"; @@ -81,11 +82,13 @@ public Response processText_post( @FormParam(TEXT) String text, @DefaultValue("0") @FormParam(SEGMENT) String segmentSentences, @DefaultValue("0") @FormParam(REFINE_GROBID) String refineGrobid, + @DefaultValue("0") @FormParam(GENERATE_IDS) String generateIds, @DefaultValue("0") @FormParam(CONSOLIDATE_REFERENCES) int consolidateReferences ) { - boolean segment = validateGenerateIdParam(segmentSentences); - boolean refine = validateGenerateIdParam(refineGrobid); - return ProcessString.processText(text, segment, refine, consolidateReferences, this.configuration); + boolean segment = validateBooleanParam(segmentSentences); + boolean refine = validateBooleanParam(refineGrobid); + boolean generateIDs = validateBooleanParam(generateIds); + return ProcessString.processText(text, segment, refine, consolidateReferences, this.configuration, generateIDs); } @Path(PATH_TEXT) @@ -95,10 
+98,12 @@ public Response processText_get( @QueryParam(TEXT) String text, @DefaultValue("0") @QueryParam(SEGMENT) String segmentSentences, @DefaultValue("0") @QueryParam(REFINE_GROBID) String refineGrobid, + @DefaultValue("0") @QueryParam(GENERATE_IDS) String generateIds, @DefaultValue("0") @QueryParam(CONSOLIDATE_REFERENCES) int consolidateReferences) { - boolean segment = validateGenerateIdParam(segmentSentences); - boolean refine = validateGenerateIdParam(refineGrobid); - return ProcessString.processText(text, segment, refine, consolidateReferences, this.configuration); + boolean segment = validateBooleanParam(segmentSentences); + boolean refine = validateBooleanParam(refineGrobid); + boolean generateIDs = validateBooleanParam(generateIds); + return ProcessString.processText(text, segment, refine, consolidateReferences, this.configuration, generateIDs); } @Path(PATH_XML) @@ -109,19 +114,21 @@ public Response processXML( @FormDataParam(INPUT) InputStream inputStream, @DefaultValue("0") @FormDataParam(SEGMENT) String segmentSentences, @DefaultValue("0") @FormDataParam(REFINE_GROBID) String refineGrobid, + @DefaultValue("0") @FormDataParam(GENERATE_IDS) String generateIds, @DefaultValue("0") @FormDataParam(CONSOLIDATE_REFERENCES) int consolidateReferences ) { - boolean segment = validateGenerateIdParam(segmentSentences); - boolean refine = validateGenerateIdParam(refineGrobid); - return ProcessFile.processXML(inputStream, segment, refine, consolidateReferences, this.configuration); + boolean segment = validateBooleanParam(segmentSentences); + boolean refine = validateBooleanParam(refineGrobid); + boolean generateIDs = validateBooleanParam(generateIds); + return ProcessFile.processXML(inputStream, segment, refine, consolidateReferences, this.configuration, generateIDs); } - private static boolean validateGenerateIdParam(String generateIDs) { - boolean generate = false; - if ((generateIDs != null) && (generateIDs.equals("1") || generateIDs.equals("true") || 
generateIDs.equals("True"))) { - generate = true; + private static boolean validateBooleanParam(String param) { + boolean result = false; + if ((param != null) && (param.equals("1") || param.equals("true") || param.equals("True"))) { + result = true; } - return generate; + return result; } } diff --git a/src/test/java/org/pub2tei/document/XMLUtilitiesIntegrationTest.java b/src/test/java/org/pub2tei/document/XMLUtilitiesIntegrationTest.java index 637cdb4e..a961b19a 100644 --- a/src/test/java/org/pub2tei/document/XMLUtilitiesIntegrationTest.java +++ b/src/test/java/org/pub2tei/document/XMLUtilitiesIntegrationTest.java @@ -11,7 +11,6 @@ import javax.xml.parsers.DocumentBuilderFactory; import java.io.InputStream; import java.io.StringReader; - import java.util.Arrays; import static org.hamcrest.MatcherAssert.assertThat; @@ -57,7 +56,6 @@ public void testSegment_chunk_shouldInjectSegmentCorrectly() throws Exception { assertThat(XMLUtilities.serialize(document, document.getDocumentElement()), CompareMatcher.isIdenticalTo(expected.replace("\t"," "))); } - @Test public void testSegment_document1_shouldInjectSegmentCorrectly() throws Exception { DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); diff --git a/src/test/resources/org/pub2tei/document/document1.segmented.tei.xml b/src/test/resources/org/pub2tei/document/document1.segmented.tei.xml index aade8755..43038f72 100644 --- a/src/test/resources/org/pub2tei/document/document1.segmented.tei.xml +++ b/src/test/resources/org/pub2tei/document/document1.segmented.tei.xml @@ -8,7 +8,7 @@ NASA - Commercialisation of Rice Farming in the Lower Mekong Basin (Palgrave Macmillan + Commercialisation of Rice Farming in the Lower Mekong Basin (Palgrave Macmillan NASA Carbon Monitoring System @@ -163,9 +163,9 @@ Shifting cultivation is an agricultural practice where farmers routinely move from one plot to another for cultivation. 
It begins with the practice of 'slash-andburn' , where trees and woody plants are cut down and burnt to prepare an ash-fertilized plot for temporary cultivation. After short-term cultivation, the plot is abandoned, which allows the vegetation to recover. - Shifting cultivation is the predominant land use and a major cause of forest degradation and deforestation in some tropical countries (Heinimann et al 2017 + Shifting cultivation is the predominant land use and a major cause of forest degradation and deforestation in some tropical countries (Heinimann et al 2017 , Curtis et al 2018 - , Jiang et al 2022), such as Laos (Chen et al 2023), and the Democratic Republic of Congo (Molinario et al 2015). + , Jiang et al 2022), such as Laos (Chen et al 2023), and the Democratic Republic of Congo (Molinario et al 2015). Monitoring shifting cultivation is complicated, because it is highly dynamic, and the area affected by each slash-and-burn event is small. Due to the difficulty of monitoring shifting cultivation, spatially and temporally explicit information on shifting cultivation is scarce.

@@ -178,10 +178,10 @@ Due to the complexity of monitoring shifting cultivation and tracking the associated carbon dynamics, estimates of carbon emissions or sequestration from shifting cultivation are usually unavailable in REDD+ (Reducing Emissions from Deforestation and Forest Degradation) reporting.

- In Laos, officially the Lao People's Democratic Republic (Lao PDR), shifting cultivation is an important agricultural system (Roder 2001 + In Laos, officially the Lao People's Democratic Republic (Lao PDR), shifting cultivation is an important agricultural system (Roder 2001 , Douangsavanh et al 2006 - , Epprecht et al 2018 - , Manivong and Cramb 2020) and the major driver of forest dynamics (Curtis et al 2018 + , Epprecht et al 2018 + , Manivong and Cramb 2020) and the major driver of forest dynamics (Curtis et al 2018 , Chen et al 2023). It is estimated that shifting cultivation affected 32.9 ± 1.9% of Laos from 1991 to 2020, and the shifting cultivation activities increased in the most recent 5 years (Chen et al 2023). Laos' population has been increasing steadily from 4.314 million in 1990 to 7.319 million in 2020 (World Bank 2023), whereas upland rice yields did not distinctly improve between 1990 and 2020.