
Commit

Merge pull request #12 from lfoppiano/add-generated-id
Generate IDs on the different text components
kermitt2 authored Sep 8, 2024
2 parents ad0dd9b + 9441e99 commit d1a2429
Showing 10 changed files with 128 additions and 60 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/ci-build-manual.yml
@@ -39,9 +39,9 @@ jobs:
dockerfile: Dockerfile
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_TOKEN }}
image: lfoppiano/Pub2TEI
image: lfoppiano/pub2tei
registry: docker.io
pushImage: true
tags: latest-develop
- name: Image digest
run: echo ${{ steps.docker_build.outputs.digest }}
run: echo ${{ steps.docker_build.outputs.digest }}
23 changes: 11 additions & 12 deletions Readme.md
@@ -72,9 +72,8 @@ git clone https://github.com/kermitt2/Pub2TEI
cd client
python3 pub2tei_client.py --help

usage: pub2tei_client.py [-h] [--input INPUT] [--output OUTPUT] [--config CONFIG] [--n N]
[--consolidate_references] [--segment_sentences] [--grobid_refine] [--force]
[--verbose]
usage: pub2tei_client.py [-h] --input INPUT [--output OUTPUT] [--config CONFIG] [--n N] [--consolidate_references] [--segment_sentences]
[--generate_ids] [--grobid_refine] [--force] [--verbose]

Client for Pub2TEI services

@@ -86,10 +85,9 @@ optional arguments:
--n N concurrency for service usage
--consolidate_references
use GROBID for consolidation of the bibliographical references
--segment_sentences segment sentences in the text content of the document with additional <s>
elements
--grobid_refine use Grobid to structure/enhance raw fields: affiliations, references, person,
dates
--segment_sentences segment sentences in the text content of the document with additional <s> elements
  --generate_ids       Generate an identifier for each text item
--grobid_refine use Grobid to structure/enhance raw fields: affiliations, references, person, dates
  --force              force re-processing XML input files when TEI output files already exist
--verbose print information about processed files in the console
```
@@ -112,12 +110,13 @@ Note that the consolidation is realized with the consolidation service indicated

Transform a publisher XML document into TEI XML format, with optional enhancements.

| method | request type | response type | parameters | requirement | description |
|--- |--- |--- |--- |--- |--- |
| POST | `multipart/form-data` | `application/xml` | `input` | required | publisher XML file to be processed |
| | | | `segmentSentences` | optional | Boolean, if true the paragraph structures in the resulting TEI will be further segmented into sentence elements <s> |
| | | | `grobidRefine` | optional | Boolean, if true the raw affiliations and raw bibliographical reference strings will be parsed with Grobid and the resulting structured information added in the transformed TEI XML |
| method | request type | response type | parameters | requirement | description |
|--- |--- |--- |-------------------------|--- |----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| POST | `multipart/form-data` | `application/xml` | `input` | required | publisher XML file to be processed |
| | | | `segmentSentences` | optional | Boolean, if true the paragraph structures in the resulting TEI will be further segmented into sentence elements <s> |
| | | | `grobidRefine` | optional | Boolean, if true the raw affiliations and raw bibliographical reference strings will be parsed with Grobid and the resulting structured information added in the transformed TEI XML |
| | | | `consolidateReferences` | optional | Consolidate all the bibliographical references, `consolidateReferences` is a string of value `0` (no consolidation, default value) or `1` (consolidate and inject all extra metadata), or `2` (consolidate the citation and inject DOI only). |
| | | | `generateIDs` | optional | Inject the attribute `xml:id` in the textual elements (`title`, `note`, `term`, `keywords`, `p`, `s`) |
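
For quick testing outside the Python client, here is a minimal sketch of calling the service with the new parameter (illustration only: the host, port and path below are assumptions to adapt to your deployment; the parameter names are the ones listed in the table above):

```python
# Minimal sketch: post a publisher XML file and request xml:id generation.
# The URL is an assumption; the multipart field "input" and the parameter
# names follow the table above and the Python client code.
import requests

url = "http://localhost:8060/service/processXML"  # assumed host/port/path
with open("article.publisher.xml", "rb") as f:     # hypothetical input file
    response = requests.post(
        url,
        files={"input": f},
        data={
            "segmentSentences": "1",       # optional sentence segmentation
            "consolidateReferences": "0",  # 0 = no consolidation (default)
            "generateIDs": "1",            # inject xml:id on textual elements
        },
    )

print(response.status_code)
print(response.text[:300])  # beginning of the returned TEI XML
```

When `generateIDs` is set to `1`, the returned TEI carries an `xml:id` attribute on the `title`, `note`, `term`, `keywords`, `p` and `s` elements.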

Response status codes:

48 changes: 37 additions & 11 deletions client/pub2tei_client.py
@@ -17,11 +17,11 @@ class ServerUnavailableException(Exception):

class Pub2TEIClient(ApiClient):

def __init__(self, pub2tei_server='localhost',
batch_size=1000,
def __init__(self, pub2tei_server='localhost',
batch_size=1000,
sleep_time=1,
timeout=60,
config_path=None,
config_path=None,
check_server=True):
self.config = {
'pub2tei_server': pub2tei_server,
@@ -81,6 +81,7 @@ def process(
consolidate_references=True,
segment_sentences=False,
grobid_refine=False,
generate_ids=False,
force=True,
verbose=False,
):
@@ -107,6 +108,7 @@ consolidate_references,
consolidate_references,
segment_sentences,
grobid_refine,
generate_ids,
force,
verbose,
)
@@ -122,6 +124,7 @@ consolidate_references,
consolidate_references,
segment_sentences,
grobid_refine,
generate_ids,
force,
verbose,
)
@@ -135,6 +138,7 @@ def process_batch(
consolidate_references,
segment_sentences,
grobid_refine,
generate_ids,
force,
verbose=False,
):
@@ -150,13 +154,14 @@ def process_batch(
if not force and os.path.isfile(filename):
print(filename, "already exists, skipping... (use --force to reprocess XML input files)")
continue

r = executor.submit(
self.process_xml,
input_file,
consolidate_references,
segment_sentences,
grobid_refine)
grobid_refine,
generate_ids)

results.append(r)

@@ -190,7 +195,8 @@ def process_xml(
xml_file,
consolidate_references,
segment_sentences,
grobid_refine
grobid_refine,
generate_ids
):
xml_handle = open(xml_file, "rb")
files = {
@@ -201,7 +207,7 @@
{"Expires": "0"},
)
}

the_url = self.get_server_url("processXML")

# set the Pub2TEI parameters
@@ -212,6 +218,8 @@
the_data["segmentSentences"] = "1"
if grobid_refine:
the_data["grobidRefine"] = "1"
if generate_ids:
the_data["generateIDs"] = "1"

try:
res, status = self.post(
@@ -224,7 +232,8 @@
xml_file,
consolidate_references,
segment_sentences,
grobid_refine
grobid_refine,
generate_ids
)
except requests.exceptions.ReadTimeout:
xml_handle.close()
@@ -240,7 +249,9 @@ def main():
parser = argparse.ArgumentParser(description="Client for Pub2TEI services")

parser.add_argument(
"--input", default=None, help="path to the directory containing XML files to process: .xml"
"--input",
required=True,
help="path to the directory containing XML files to process: .xml"
)
parser.add_argument(
"--output",
@@ -252,7 +263,11 @@ def main():
default="./config.json",
help="path to the config file, default is ./config.json",
)
parser.add_argument("--n", default=10, help="concurrency for service usage")
parser.add_argument(
"--n",
default=10,
help="concurrency for service usage"
)
parser.add_argument(
"--consolidate_references",
action="store_true",
@@ -261,16 +276,25 @@
parser.add_argument(
"--segment_sentences",
action="store_true",
default=False,
help="segment sentences in the text content of the document with additional <s> elements",
)
parser.add_argument(
"--generate_ids",
action="store_true",
default=False,
help="Generate an identifier for each text item",
)
parser.add_argument(
"--grobid_refine",
action="store_true",
default=False,
help="use Grobid to structure/enhance raw fields: affiliations, references, person, dates",
)
parser.add_argument(
"--force",
action="store_true",
default=False,
help="force re-processing XML input files when TEI output files already exist",
)
parser.add_argument(
@@ -305,7 +329,8 @@ def main():
consolidate_references = args.consolidate_references
segment_sentences = args.segment_sentences
grobid_refine = args.grobid_refine

generate_ids = args.generate_ids

force = args.force
verbose = args.verbose

Expand All @@ -323,6 +348,7 @@ def main():
consolidate_references=consolidate_references,
segment_sentences=segment_sentences,
grobid_refine=grobid_refine,
generate_ids=generate_ids,
force=force,
verbose=verbose,
)
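As a reading aid only (not part of this commit), a minimal sketch of driving the new flag through the client API directly; the input path is hypothetical and the handling of the returned value is outside this excerpt:

```python
# Illustration only: call process_xml() with the new generate_ids flag, which
# adds generateIDs=1 to the multipart request sent to the processXML service.
# Assumes the script is run from the client/ directory with ./config.json present.
from pub2tei_client import Pub2TEIClient

client = Pub2TEIClient(config_path="./config.json")
result = client.process_xml(
    "input/article.xml",        # hypothetical publisher XML file
    consolidate_references=False,
    segment_sentences=True,
    grobid_refine=False,
    generate_ids=True,
)
# What process_xml() returns is outside this excerpt; see the full client source.
```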
17 changes: 13 additions & 4 deletions src/main/java/org/pub2tei/document/DocumentProcessor.java
@@ -99,6 +99,10 @@ public String processTEI(File file, boolean segmentSentences, boolean refine, in
* Process a TEI XML format
*/
public String processTEI(String tei, boolean segmentSentences, boolean refine, int consolidateReferences) throws IOException {
return processTEI(tei, segmentSentences, refine, consolidateReferences, false);
}

public String processTEI(String tei, boolean segmentSentences, boolean refine, int consolidateReferences, boolean generateIDs) throws IOException {
if (tei == null || tei.length() == 0)
return null;
try {
@@ -120,6 +124,11 @@ public String processTEI(String tei, boolean segmentSentences, boolean refine, i
XMLUtilities.fixSegmentedFigureTableList(document);
}

if (generateIDs) {
org.w3c.dom.Element root = document.getDocumentElement();
XMLUtilities.generateIDs(document, root);
}

if (refine) {
// in case we have raw fields that can be further refined (like raw affiliation string,
// raw reference string, etc.), use Grobid to add some parsed sub-structures together
@@ -159,7 +168,7 @@ public String processTEI(String tei, boolean segmentSentences, boolean refine, i
* @return TEI string
*/

public String processXML(File file, boolean segmentSentences, boolean refine, int consolidateReferences) throws Exception {
public String processXML(File file, boolean segmentSentences, boolean refine, int consolidateReferences, boolean generateIDs) throws Exception {
InputStream inputStream = null;

try {
@@ -168,17 +177,17 @@ public String processXML(File file, boolean segmentSentences, boolean refine, in
LOGGER.error("Invalid input file: " + file.getAbsolutePath(), e);
}

return processXML(inputStream, segmentSentences, refine, consolidateReferences);
return processXML(inputStream, segmentSentences, refine, consolidateReferences, generateIDs);
}

public String processXML(InputStream inputStream, boolean segmentSentences, boolean refine, int consolidateReferences) throws Exception {
public String processXML(InputStream inputStream, boolean segmentSentences, boolean refine, int consolidateReferences, boolean generateIDs) throws Exception {
if (inputStream == null)
return null;

String tei = null;
try {
tei = this.pub2TEIProcessor.transform(inputStream);
tei = processTEI(tei, segmentSentences, refine, consolidateReferences);
tei = processTEI(tei, segmentSentences, refine, consolidateReferences, generateIDs);
} catch (final Exception exp) {
LOGGER.error("An error occurred while processing the XML input stream", exp);
}
29 changes: 28 additions & 1 deletion src/main/java/org/pub2tei/document/XMLUtilities.java
@@ -4,6 +4,7 @@
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
import org.grobid.core.utilities.KeyGen;
import org.grobid.core.utilities.OffsetPosition;
import org.grobid.core.utilities.SentenceUtilities;
import org.slf4j.Logger;
@@ -40,6 +41,7 @@ public class XMLUtilities {

private static List<String> textualElements = Arrays.asList("p", "figDesc");
private static List<String> noSegmentationElements = Arrays.asList("listBibl", "table");
private static List<String> elementsWithIds = Arrays.asList("s", "p", "title", "note", "term", "keywords");

private static DocumentBuilderFactory factory = getReasonableDocumentBuilderFactory();

@@ -243,7 +245,6 @@ public static void segment(org.w3c.dom.Document doc, Node node) {
textBuffer.append(" ");
}
}

textBuffer.append(serializedString);
}
String text = textBuffer.toString();
@@ -556,6 +557,32 @@ public static String reformatTEI(String tei) {
return tei;
}

public static void generateIDs(org.w3c.dom.Document doc, Node node) {
final NodeList children = node.getChildNodes();
final int nbChildren = children.getLength();

List<Node> newChildren = new ArrayList<>();
for (int i = 0; i < nbChildren; i++) {
newChildren.add(children.item(i));
}

factory.setNamespaceAware(true);

for (int i = 0; i < nbChildren; i++) {
final Node n = newChildren.get(i);
if (n.getNodeType() == Node.ELEMENT_NODE
&& elementsWithIds.contains(n.getNodeName())) {
Element nodeAsElement = ((Element) n);
if (!nodeAsElement.hasAttribute("xml:id")) {
String divID = "_" + KeyGen.getKey().substring(0, 7);
((Element) n).setAttribute("xml:id", divID);
}
XMLUtilities.generateIDs(doc, n);
} else if (n.getNodeType() == Node.ELEMENT_NODE) {
XMLUtilities.generateIDs(doc, n);
}
}
}

/**
* This method is similar to the usual Element.getTextContent() (get all text under the element
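As a reading aid only (not part of the commit), a rough Python analogue of the `generateIDs` logic above, assuming `lxml`: recursively walk the tree and attach an `xml:id` to the listed elements (`s`, `p`, `title`, `note`, `term`, `keywords`) when they do not already carry one.

```python
# Rough analogue of XMLUtilities.generateIDs, for illustration (assumes lxml).
# The Java code prefixes a 7-character key from KeyGen with "_"; a UUID
# fragment plays the same role here.
import uuid
from lxml import etree

XML_ID = "{http://www.w3.org/XML/1998/namespace}id"
ELEMENTS_WITH_IDS = {"s", "p", "title", "note", "term", "keywords"}

def generate_ids(element):
    for child in element:
        if not isinstance(child.tag, str):
            continue  # skip comments and processing instructions
        if etree.QName(child).localname in ELEMENTS_WITH_IDS and XML_ID not in child.attrib:
            child.set(XML_ID, "_" + uuid.uuid4().hex[:7])
        generate_ids(child)  # recurse, so nested <s> inside <p> are covered too
```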
13 changes: 7 additions & 6 deletions src/main/java/org/pub2tei/service/ProcessFile.java
@@ -41,16 +41,17 @@ public ProcessFile() {
* @param segmentSentences if true, return results with segmented sentences
* @return a response object containing the converted/refined TEI XML
*/
public static Response processXML(final InputStream inputStream,
final boolean segmentSentences,
final boolean refine,
final int consolidateReferences,
ServiceConfiguration serviceConfiguration) {
public static Response processXML(final InputStream inputStream,
final boolean segmentSentences,
final boolean refine,
final int consolidateReferences,
ServiceConfiguration serviceConfiguration,
final boolean generateIDs) {
LOGGER.debug(methodLogIn());
Response response = null;
try {
DocumentProcessor documentProcessor = new DocumentProcessor(serviceConfiguration);
String result = documentProcessor.processXML(inputStream, segmentSentences, refine, consolidateReferences);
String result = documentProcessor.processXML(inputStream, segmentSentences, refine, consolidateReferences, generateIDs);

if (result == null || result.length() == 0) {
response = Response.status(Response.Status.NO_CONTENT).build();
5 changes: 3 additions & 2 deletions src/main/java/org/pub2tei/service/ProcessString.java
@@ -39,7 +39,8 @@ public static Response processText(String text,
final boolean segmentSentences,
final boolean refine,
final int consolidateReferences,
ServiceConfiguration serviceConfiguration) {
ServiceConfiguration serviceConfiguration,
Boolean generateIDs) {
LOGGER.debug(methodLogIn());
Response response = null;

@@ -55,7 +56,7 @@

DocumentProcessor documentProcessor = new DocumentProcessor(serviceConfiguration);
InputStream inputStream = new ByteArrayInputStream(text.getBytes());
String retValString = documentProcessor.processXML(inputStream, segmentSentences, refine, consolidateReferences);
String retValString = documentProcessor.processXML(inputStream, segmentSentences, refine, consolidateReferences, generateIDs);

if (!isResultOK(retValString)) {
response = Response.status(Response.Status.NO_CONTENT).build();