diff --git a/.github/workflows/ci-build-manual.yml b/.github/workflows/ci-build-manual.yml
index 21a52548..606b0ce3 100644
--- a/.github/workflows/ci-build-manual.yml
+++ b/.github/workflows/ci-build-manual.yml
@@ -39,9 +39,9 @@ jobs:
dockerfile: Dockerfile
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_TOKEN }}
- image: lfoppiano/Pub2TEI
+ image: lfoppiano/pub2tei
registry: docker.io
pushImage: true
tags: latest-develop
- name: Image digest
- run: echo ${{ steps.docker_build.outputs.digest }}
\ No newline at end of file
+ run: echo ${{ steps.docker_build.outputs.digest }}
diff --git a/Readme.md b/Readme.md
index 3b929e84..5776335c 100644
--- a/Readme.md
+++ b/Readme.md
@@ -72,9 +72,8 @@ git clone https://github.com/kermitt2/Pub2TEI
cd client
python3 pub2tei_client.py --help
-usage: pub2tei_client.py [-h] [--input INPUT] [--output OUTPUT] [--config CONFIG] [--n N]
- [--consolidate_references] [--segment_sentences] [--grobid_refine] [--force]
- [--verbose]
+usage: pub2tei_client.py [-h] --input INPUT [--output OUTPUT] [--config CONFIG] [--n N] [--consolidate_references] [--segment_sentences]
+ [--generate_ids] [--grobid_refine] [--force] [--verbose]
Client for Pub2TEI services
@@ -86,10 +85,9 @@ optional arguments:
--n N concurrency for service usage
--consolidate_references
use GROBID for consolidation of the bibliographical references
- --segment_sentences segment sentences in the text content of the document with additional
- elements
- --grobid_refine use Grobid to structure/enhance raw fields: affiliations, references, person,
- dates
+ --segment_sentences segment sentences in the text content of the document with additional elements
+ --generate_ids Generate identifier for each text item
+ --grobid_refine use Grobid to structure/enhance raw fields: affiliations, references, person, dates
--force force re-processing pdf input files when tei output files already exist
--verbose print information about processed files in the console
```
@@ -112,12 +110,13 @@ Note that the consolidation is realized with the consolidation service indicated
Tranform a publisher XML into TEI XML format, with optional enhancements.
-| method | request type | response type | parameters | requirement | description |
-|--- |--- |--- |--- |--- |--- |
-| POST | `multipart/form-data` | `application/xml` | `input` | required | publisher XML file to be processed |
-| | | | `segmentSentences` | optional | Boolean, if true the paragraphs structures in the resulting TEI will be further segmented into sentence elements |
-| | | | `grobidRefine` | optional | Boolean, if true the raw affiliations and raw biblographical reference strings will be parsed with Grobid and the resulting structured information added in the transformed TEI XML |
+| method | request type | response type | parameters | requirement | description |
+|--- |--- |--- |-------------------------|--- |----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| POST | `multipart/form-data` | `application/xml` | `input` | required | publisher XML file to be processed |
+| | | | `segmentSentences` | optional | Boolean, if true the paragraphs structures in the resulting TEI will be further segmented into sentence elements |
+| | | | `grobidRefine` | optional | Boolean, if true the raw affiliations and raw bibliographical reference strings will be parsed with Grobid and the resulting structured information added in the transformed TEI XML |
| | | | `consolidateReferences` | optional | Consolidate all the biblographical references, `consolidateReferences` is a string of value `0` (no consolidation, default value) or `1` (consolidate and inject all extra metadata), or `2` (consolidate the citation and inject DOI only). |
+| | | | `generateIDs` | optional | Inject the attribute `xml:id` in the textual elements (`title`, `note`, `term`, `keywords`, `p`, `s`) |
Response status codes:
diff --git a/client/pub2tei_client.py b/client/pub2tei_client.py
index ff2d62c8..ee3e3cba 100644
--- a/client/pub2tei_client.py
+++ b/client/pub2tei_client.py
@@ -17,11 +17,11 @@ class ServerUnavailableException(Exception):
class Pub2TEIClient(ApiClient):
- def __init__(self, pub2tei_server='localhost',
- batch_size=1000,
+ def __init__(self, pub2tei_server='localhost',
+ batch_size=1000,
sleep_time=1,
timeout=60,
- config_path=None,
+ config_path=None,
check_server=True):
self.config = {
'pub2tei_server': pub2tei_server,
@@ -81,6 +81,7 @@ def process(
consolidate_references=True,
segment_sentences=False,
grobid_refine=False,
+ generate_ids=False,
force=True,
verbose=False,
):
@@ -107,6 +108,7 @@ def process(
consolidate_references,
segment_sentences,
grobid_refine,
+ generate_ids,
force,
verbose,
)
@@ -122,6 +124,7 @@ def process(
consolidate_references,
segment_sentences,
grobid_refine,
+ generate_ids,
force,
verbose,
)
@@ -135,6 +138,7 @@ def process_batch(
consolidate_references,
segment_sentences,
grobid_refine,
+ generate_ids,
force,
verbose=False,
):
@@ -150,13 +154,14 @@ def process_batch(
if not force and os.path.isfile(filename):
print(filename, "already exist, skipping... (use --force to reprocess pdf input files)")
continue
-
+
r = executor.submit(
self.process_xml,
input_file,
consolidate_references,
segment_sentences,
- grobid_refine)
+ grobid_refine,
+ generate_ids)
results.append(r)
@@ -190,7 +195,8 @@ def process_xml(
xml_file,
consolidate_references,
segment_sentences,
- grobid_refine
+ grobid_refine,
+ generate_ids
):
xml_handle = open(xml_file, "rb")
files = {
@@ -201,7 +207,7 @@ def process_xml(
{"Expires": "0"},
)
}
-
+
the_url = self.get_server_url("processXML")
# set the Pub2TEI parameters
@@ -212,6 +218,8 @@ def process_xml(
the_data["segmentSentences"] = "1"
if grobid_refine:
the_data["grobidRefine"] = "1"
+ if generate_ids:
+ the_data["generateIDs"] = "1"
try:
res, status = self.post(
@@ -224,7 +232,8 @@ def process_xml(
xml_file,
consolidate_references,
segment_sentences,
- grobid_refine
+ grobid_refine,
+ generate_ids
)
except requests.exceptions.ReadTimeout:
xml_handle.close()
@@ -240,7 +249,9 @@ def main():
parser = argparse.ArgumentParser(description="Client for Pub2TEI services")
parser.add_argument(
- "--input", default=None, help="path to the directory containing XML files to process: .xml"
+ "--input",
+ required=True,
+ help="path to the directory containing XML files to process: .xml"
)
parser.add_argument(
"--output",
@@ -252,7 +263,11 @@ def main():
default="./config.json",
help="path to the config file, default is ./config.json",
)
- parser.add_argument("--n", default=10, help="concurrency for service usage")
+ parser.add_argument(
+ "--n",
+ default=10,
+ help="concurrency for service usage"
+ )
parser.add_argument(
"--consolidate_references",
action="store_true",
@@ -261,16 +276,25 @@ def main():
parser.add_argument(
"--segment_sentences",
action="store_true",
+ default=False,
help="segment sentences in the text content of the document with additional elements",
)
+ parser.add_argument(
+ "--generate_ids",
+ action="store_true",
+ default=False,
+ help="Generate idenfifier for each text item",
+ )
parser.add_argument(
"--grobid_refine",
action="store_true",
+ default=False,
help="use Grobid to structure/enhance raw fields: affiliations, references, person, dates",
)
parser.add_argument(
"--force",
action="store_true",
+ default=False,
help="force re-processing pdf input files when tei output files already exist",
)
parser.add_argument(
@@ -305,7 +329,8 @@ def main():
consolidate_references = args.consolidate_references
segment_sentences = args.segment_sentences
grobid_refine = args.grobid_refine
-
+ generate_ids = args.generate_ids
+
force = args.force
verbose = args.verbose
@@ -323,6 +348,7 @@ def main():
consolidate_references=consolidate_references,
segment_sentences=segment_sentences,
grobid_refine=grobid_refine,
+ generate_ids=generate_ids,
force=force,
verbose=verbose,
)
diff --git a/src/main/java/org/pub2tei/document/DocumentProcessor.java b/src/main/java/org/pub2tei/document/DocumentProcessor.java
index e63bcfc4..3418c361 100644
--- a/src/main/java/org/pub2tei/document/DocumentProcessor.java
+++ b/src/main/java/org/pub2tei/document/DocumentProcessor.java
@@ -99,6 +99,10 @@ public String processTEI(File file, boolean segmentSentences, boolean refine, in
* Process a TEI XML format
*/
public String processTEI(String tei, boolean segmentSentences, boolean refine, int consolidateReferences) throws IOException {
+ return processTEI(tei, segmentSentences, refine, consolidateReferences, false);
+ }
+
+ public String processTEI(String tei, boolean segmentSentences, boolean refine, int consolidateReferences, boolean generateIDs) throws IOException {
if (tei == null || tei.length() == 0)
return null;
try {
@@ -120,6 +124,11 @@ public String processTEI(String tei, boolean segmentSentences, boolean refine, i
XMLUtilities.fixSegmentedFigureTableList(document);
}
+ if (generateIDs) {
+ org.w3c.dom.Element root = document.getDocumentElement();
+ XMLUtilities.generateIDs(document, root);
+ }
+
if (refine) {
// in case we have raw fields that can be further refined (like raw affiliation string,
// raw reference string, etc.), use Grobid to add some parsed sub-structures together
@@ -159,7 +168,7 @@ public String processTEI(String tei, boolean segmentSentences, boolean refine, i
* @return TEI string
*/
- public String processXML(File file, boolean segmentSentences, boolean refine, int consolidateReferences) throws Exception {
+ public String processXML(File file, boolean segmentSentences, boolean refine, int consolidateReferences, boolean generateIDs) throws Exception {
InputStream inputStream = null;
try {
@@ -168,17 +177,17 @@ public String processXML(File file, boolean segmentSentences, boolean refine, in
LOGGER.error("Invalid input file: " + file.getAbsolutePath(), e);
}
- return processXML(inputStream, segmentSentences, refine, consolidateReferences);
+ return processXML(inputStream, segmentSentences, refine, consolidateReferences, generateIDs);
}
- public String processXML(InputStream inputStream, boolean segmentSentences, boolean refine, int consolidateReferences) throws Exception {
+ public String processXML(InputStream inputStream, boolean segmentSentences, boolean refine, int consolidateReferences, boolean generateIDs) throws Exception {
if (inputStream == null)
return null;
String tei = null;
try {
tei = this.pub2TEIProcessor.transform(inputStream);
- tei = processTEI(tei, segmentSentences, refine, consolidateReferences);
+ tei = processTEI(tei, segmentSentences, refine, consolidateReferences, generateIDs);
} catch (final Exception exp) {
LOGGER.error("An error occured while processing the XML input stream", exp);
}
diff --git a/src/main/java/org/pub2tei/document/XMLUtilities.java b/src/main/java/org/pub2tei/document/XMLUtilities.java
index 253c9ec2..1156b78b 100644
--- a/src/main/java/org/pub2tei/document/XMLUtilities.java
+++ b/src/main/java/org/pub2tei/document/XMLUtilities.java
@@ -4,6 +4,7 @@
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
+import org.grobid.core.utilities.KeyGen;
import org.grobid.core.utilities.OffsetPosition;
import org.grobid.core.utilities.SentenceUtilities;
import org.slf4j.Logger;
@@ -40,6 +41,7 @@ public class XMLUtilities {
private static ListShifting cultivation is an agricultural practice where farmers routinely move from one plot to another for cultivation.
It begins with the practice of 'slash-andburn' , where trees and woody plants are cut down and burnt to prepare an ash-fertilized plot for temporary cultivation.
After short-term cultivation, the plot is abandoned, which allows the vegetation to recover.
- Shifting cultivation is the predominant land use and a major cause of forest degradation and deforestation in some tropical countries (Heinimann et al 2017
+
Shifting cultivation is the predominant land use and a major cause of forest degradation and deforestation in some tropical countries (Heinimann et al 2017
, Curtis et al 2018
- , Jiang et al 2022), such as Laos (Chen et al 2023), and the Democratic Republic of Congo (Molinario et al 2015).
+ , Jiang et al 2022), such as Laos (Chen et al 2023), and the Democratic Republic of Congo (Molinario et al 2015).Monitoring shifting cultivation is complicated, because it is highly dynamic, and the area affected by each slash-and-burn event is small.
Due to the difficulty of monitoring shifting cultivation, spatially and temporally explicit information on shifting cultivation is scarce.
- In Laos, officially the Lao People's Democratic Republic (Lao PDR), shifting cultivation is an important agricultural system (Roder 2001
+ In Laos, officially the Lao People's Democratic Republic (Lao PDR), shifting cultivation is an important agricultural system (Roder 2001
, Douangsavanh et al 2006
- , Epprecht et al 2018
- , Manivong and Cramb 2020) and the major driver of forest dynamics (Curtis et al 2018
+ , Epprecht et al 2018
+ , Manivong and Cramb 2020) and the major driver of forest dynamics (Curtis et al 2018
, Chen et al 2023).
It is estimated that shifting cultivation affected 32.9 ± 1.9% of Laos from 1991 to 2020, and the shifting cultivation activities increased in the most recent 5 years (Chen et al 2023).
Laos' population has been increasing steadily from 4.314 million in 1990 to 7.319 million in 2020 (World Bank 2023), whereas upland rice yields did not distinctly improve between 1990 and 2020.