
Commit

Merge pull request #12 from lfoppiano/add-generated-id
Generate IDs on the different text components
kermitt2 authored Sep 8, 2024
2 parents ad0dd9b + 9441e99 commit d1a2429
Showing 10 changed files with 128 additions and 60 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/ci-build-manual.yml
@@ -39,9 +39,9 @@ jobs:
dockerfile: Dockerfile
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_TOKEN }}
image: lfoppiano/Pub2TEI
image: lfoppiano/pub2tei
registry: docker.io
pushImage: true
tags: latest-develop
- name: Image digest
run: echo ${{ steps.docker_build.outputs.digest }}
run: echo ${{ steps.docker_build.outputs.digest }}
23 changes: 11 additions & 12 deletions Readme.md
@@ -72,9 +72,8 @@ git clone https://github.com/kermitt2/Pub2TEI
cd client
python3 pub2tei_client.py --help

usage: pub2tei_client.py [-h] [--input INPUT] [--output OUTPUT] [--config CONFIG] [--n N]
[--consolidate_references] [--segment_sentences] [--grobid_refine] [--force]
[--verbose]
usage: pub2tei_client.py [-h] --input INPUT [--output OUTPUT] [--config CONFIG] [--n N] [--consolidate_references] [--segment_sentences]
[--generate_ids] [--grobid_refine] [--force] [--verbose]

Client for Pub2TEI services

@@ -86,10 +85,9 @@ optional arguments:
--n N concurrency for service usage
--consolidate_references
use GROBID for consolidation of the bibliographical references
--segment_sentences segment sentences in the text content of the document with additional <s>
elements
--grobid_refine use Grobid to structure/enhance raw fields: affiliations, references, person,
dates
--segment_sentences segment sentences in the text content of the document with additional <s> elements
  --generate_ids       Generate an identifier for each text item
--grobid_refine use Grobid to structure/enhance raw fields: affiliations, references, person, dates
  --force              force re-processing XML input files when TEI output files already exist
--verbose print information about processed files in the console
```
@@ -112,12 +110,13 @@ Note that the consolidation is realized with the consolidation service indicated

Transform a publisher XML document into TEI XML format, with optional enhancements.

| method | request type | response type | parameters | requirement | description |
|--- |--- |--- |--- |--- |--- |
| POST | `multipart/form-data` | `application/xml` | `input` | required | publisher XML file to be processed |
| | | | `segmentSentences` | optional | Boolean, if true the paragraph structures in the resulting TEI will be further segmented into sentence elements <s> |
| | | | `grobidRefine` | optional | Boolean, if true the raw affiliations and raw bibliographical reference strings will be parsed with Grobid and the resulting structured information added in the transformed TEI XML |
| method | request type | response type | parameters | requirement | description |
|--- |--- |--- |-------------------------|--- |----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| POST | `multipart/form-data` | `application/xml` | `input` | required | publisher XML file to be processed |
| | | | `segmentSentences` | optional | Boolean, if true the paragraph structures in the resulting TEI will be further segmented into sentence elements <s> |
| | | | `grobidRefine` | optional | Boolean, if true the raw affiliations and raw bibliographical reference strings will be parsed with Grobid and the resulting structured information added in the transformed TEI XML |
| | | | `consolidateReferences` | optional | Consolidate all the bibliographical references, `consolidateReferences` is a string of value `0` (no consolidation, default value) or `1` (consolidate and inject all extra metadata), or `2` (consolidate the citation and inject DOI only). |
| | | | `generateIDs` | optional | Inject the attribute `xml:id` in the textual elements (`title`, `note`, `term`, `keywords`, `p`, `s`) |
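
For quick testing outside the Python client, here is a minimal sketch of calling the service with the new parameter (illustration only: the host, port and path below are assumptions to adapt to your deployment; the parameter names are the ones listed in the table above):

```python
# Minimal sketch: post a publisher XML file and request xml:id generation.
# The URL is an assumption; the multipart field "input" and the parameter
# names follow the table above and the Python client code.
import requests

url = "http://localhost:8060/service/processXML"  # assumed host/port/path
with open("article.publisher.xml", "rb") as f:     # hypothetical input file
    response = requests.post(
        url,
        files={"input": f},
        data={
            "segmentSentences": "1",       # optional sentence segmentation
            "consolidateReferences": "0",  # 0 = no consolidation (default)
            "generateIDs": "1",            # inject xml:id on textual elements
        },
    )

print(response.status_code)
print(response.text[:300])  # beginning of the returned TEI XML
```

When `generateIDs` is set to `1`, the returned TEI carries an `xml:id` attribute on the `title`, `note`, `term`, `keywords`, `p` and `s` elements.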

Response status codes:

48 changes: 37 additions & 11 deletions client/pub2tei_client.py
@@ -17,11 +17,11 @@ class ServerUnavailableException(Exception):

class Pub2TEIClient(ApiClient):

def __init__(self, pub2tei_server='localhost',
batch_size=1000,
def __init__(self, pub2tei_server='localhost',
batch_size=1000,
sleep_time=1,
timeout=60,
config_path=None,
config_path=None,
check_server=True):
self.config = {
'pub2tei_server': pub2tei_server,
@@ -81,6 +81,7 @@ def process(
consolidate_references=True,
segment_sentences=False,
grobid_refine=False,
generate_ids=False,
force=True,
verbose=False,
):
@@ -107,6 +108,7 @@ consolidate_references,
consolidate_references,
segment_sentences,
grobid_refine,
generate_ids,
force,
verbose,
)
@@ -122,6 +124,7 @@ consolidate_references,
consolidate_references,
segment_sentences,
grobid_refine,
generate_ids,
force,
verbose,
)
@@ -135,6 +138,7 @@ def process_batch(
consolidate_references,
segment_sentences,
grobid_refine,
generate_ids,
force,
verbose=False,
):
@@ -150,13 +154,14 @@ def process_batch(
if not force and os.path.isfile(filename):
print(filename, "already exists, skipping... (use --force to reprocess XML input files)")
continue

r = executor.submit(
self.process_xml,
input_file,
consolidate_references,
segment_sentences,
grobid_refine)
grobid_refine,
generate_ids)

results.append(r)

@@ -190,7 +195,8 @@ def process_xml(
xml_file,
consolidate_references,
segment_sentences,
grobid_refine
grobid_refine,
generate_ids
):
xml_handle = open(xml_file, "rb")
files = {
@@ -201,7 +207,7 @@
{"Expires": "0"},
)
}

the_url = self.get_server_url("processXML")

# set the Pub2TEI parameters
@@ -212,6 +218,8 @@
the_data["segmentSentences"] = "1"
if grobid_refine:
the_data["grobidRefine"] = "1"
if generate_ids:
the_data["generateIDs"] = "1"

try:
res, status = self.post(
@@ -224,7 +232,8 @@
xml_file,
consolidate_references,
segment_sentences,
grobid_refine
grobid_refine,
generate_ids
)
except requests.exceptions.ReadTimeout:
xml_handle.close()
@@ -240,7 +249,9 @@ def main():
parser = argparse.ArgumentParser(description="Client for Pub2TEI services")

parser.add_argument(
"--input", default=None, help="path to the directory containing XML files to process: .xml"
"--input",
required=True,
help="path to the directory containing XML files to process: .xml"
)
parser.add_argument(
"--output",
@@ -252,7 +263,11 @@ def main():
default="./config.json",
help="path to the config file, default is ./config.json",
)
parser.add_argument("--n", default=10, help="concurrency for service usage")
parser.add_argument(
"--n",
default=10,
help="concurrency for service usage"
)
parser.add_argument(
"--consolidate_references",
action="store_true",
@@ -261,16 +276,25 @@
parser.add_argument(
"--segment_sentences",
action="store_true",
default=False,
help="segment sentences in the text content of the document with additional <s> elements",
)
parser.add_argument(
"--generate_ids",
action="store_true",
default=False,
help="Generate an identifier for each text item",
)
parser.add_argument(
"--grobid_refine",
action="store_true",
default=False,
help="use Grobid to structure/enhance raw fields: affiliations, references, person, dates",
)
parser.add_argument(
"--force",
action="store_true",
default=False,
help="force re-processing XML input files when TEI output files already exist",
)
parser.add_argument(
@@ -305,7 +329,8 @@ def main():
consolidate_references = args.consolidate_references
segment_sentences = args.segment_sentences
grobid_refine = args.grobid_refine

generate_ids = args.generate_ids

force = args.force
verbose = args.verbose

Expand All @@ -323,6 +348,7 @@ def main():
consolidate_references=consolidate_references,
segment_sentences=segment_sentences,
grobid_refine=grobid_refine,
generate_ids=generate_ids,
force=force,
verbose=verbose,
)
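As a reading aid only (not part of this commit), a minimal sketch of driving the new flag through the client API directly; the input path is hypothetical and the handling of the returned value is outside this excerpt:

```python
# Illustration only: call process_xml() with the new generate_ids flag, which
# adds generateIDs=1 to the multipart request sent to the processXML service.
# Assumes the script is run from the client/ directory with ./config.json present.
from pub2tei_client import Pub2TEIClient

client = Pub2TEIClient(config_path="./config.json")
result = client.process_xml(
    "input/article.xml",        # hypothetical publisher XML file
    consolidate_references=False,
    segment_sentences=True,
    grobid_refine=False,
    generate_ids=True,
)
# What process_xml() returns is outside this excerpt; see the full client source.
```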
17 changes: 13 additions & 4 deletions src/main/java/org/pub2tei/document/DocumentProcessor.java
@@ -99,6 +99,10 @@ public String processTEI(File file, boolean segmentSentences, boolean refine, in
* Process a TEI XML format
*/
public String processTEI(String tei, boolean segmentSentences, boolean refine, int consolidateReferences) throws IOException {
return processTEI(tei, segmentSentences, refine, consolidateReferences, false);
}

public String processTEI(String tei, boolean segmentSentences, boolean refine, int consolidateReferences, boolean generateIDs) throws IOException {
if (tei == null || tei.length() == 0)
return null;
try {
@@ -120,6 +124,11 @@ public String processTEI(String tei, boolean segmentSentences, boolean refine, i
XMLUtilities.fixSegmentedFigureTableList(document);
}

if (generateIDs) {
org.w3c.dom.Element root = document.getDocumentElement();
XMLUtilities.generateIDs(document, root);
}

if (refine) {
// in case we have raw fields that can be further refined (like raw affiliation string,
// raw reference string, etc.), use Grobid to add some parsed sub-structures together
@@ -159,7 +168,7 @@ public String processTEI(String tei, boolean segmentSentences, boolean refine, i
* @return TEI string
*/

public String processXML(File file, boolean segmentSentences, boolean refine, int consolidateReferences) throws Exception {
public String processXML(File file, boolean segmentSentences, boolean refine, int consolidateReferences, boolean generateIDs) throws Exception {
InputStream inputStream = null;

try {
@@ -168,17 +177,17 @@ public String processXML(File file, boolean segmentSentences, boolean refine, in
LOGGER.error("Invalid input file: " + file.getAbsolutePath(), e);
}

return processXML(inputStream, segmentSentences, refine, consolidateReferences);
return processXML(inputStream, segmentSentences, refine, consolidateReferences, generateIDs);
}

public String processXML(InputStream inputStream, boolean segmentSentences, boolean refine, int consolidateReferences) throws Exception {
public String processXML(InputStream inputStream, boolean segmentSentences, boolean refine, int consolidateReferences, boolean generateIDs) throws Exception {
if (inputStream == null)
return null;

String tei = null;
try {
tei = this.pub2TEIProcessor.transform(inputStream);
tei = processTEI(tei, segmentSentences, refine, consolidateReferences);
tei = processTEI(tei, segmentSentences, refine, consolidateReferences, generateIDs);
} catch (final Exception exp) {
LOGGER.error("An error occurred while processing the XML input stream", exp);
}
29 changes: 28 additions & 1 deletion src/main/java/org/pub2tei/document/XMLUtilities.java
@@ -4,6 +4,7 @@
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
import org.grobid.core.utilities.KeyGen;
import org.grobid.core.utilities.OffsetPosition;
import org.grobid.core.utilities.SentenceUtilities;
import org.slf4j.Logger;
@@ -40,6 +41,7 @@ public class XMLUtilities {

private static List<String> textualElements = Arrays.asList("p", "figDesc");
private static List<String> noSegmentationElements = Arrays.asList("listBibl", "table");
private static List<String> elementsWithIds = Arrays.asList("s", "p", "title", "note", "term", "keywords");

private static DocumentBuilderFactory factory = getReasonableDocumentBuilderFactory();

@@ -243,7 +245,6 @@ public static void segment(org.w3c.dom.Document doc, Node node) {
textBuffer.append(" ");
}
}

textBuffer.append(serializedString);
}
String text = textBuffer.toString();
@@ -556,6 +557,32 @@ public static String reformatTEI(String tei) {
return tei;
}

public static void generateIDs(org.w3c.dom.Document doc, Node node) {
final NodeList children = node.getChildNodes();
final int nbChildren = children.getLength();

List<Node> newChildren = new ArrayList<>();
for (int i = 0; i < nbChildren; i++) {
newChildren.add(children.item(i));
}

factory.setNamespaceAware(true);

for (int i = 0; i < nbChildren; i++) {
final Node n = newChildren.get(i);
if (n.getNodeType() == Node.ELEMENT_NODE
&& elementsWithIds.contains(n.getNodeName())) {
Element nodeAsElement = ((Element) n);
if (!nodeAsElement.hasAttribute("xml:id")) {
String divID = "_" + KeyGen.getKey().substring(0, 7);
((Element) n).setAttribute("xml:id", divID);
}
XMLUtilities.generateIDs(doc, n);
} else if (n.getNodeType() == Node.ELEMENT_NODE) {
XMLUtilities.generateIDs(doc, n);
}
}
}

/**
* This method is similar to the usual Element.getTextContent() (get all text under the element
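As a reading aid only (not part of the commit), a rough Python analogue of the `generateIDs` logic above, assuming `lxml`: recursively walk the tree and attach an `xml:id` to the listed elements (`s`, `p`, `title`, `note`, `term`, `keywords`) when they do not already carry one.

```python
# Rough analogue of XMLUtilities.generateIDs, for illustration (assumes lxml).
# The Java code prefixes a 7-character key from KeyGen with "_"; a UUID
# fragment plays the same role here.
import uuid
from lxml import etree

XML_ID = "{http://www.w3.org/XML/1998/namespace}id"
ELEMENTS_WITH_IDS = {"s", "p", "title", "note", "term", "keywords"}

def generate_ids(element):
    for child in element:
        if not isinstance(child.tag, str):
            continue  # skip comments and processing instructions
        if etree.QName(child).localname in ELEMENTS_WITH_IDS and XML_ID not in child.attrib:
            child.set(XML_ID, "_" + uuid.uuid4().hex[:7])
        generate_ids(child)  # recurse, so nested <s> inside <p> are covered too
```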
13 changes: 7 additions & 6 deletions src/main/java/org/pub2tei/service/ProcessFile.java
@@ -41,16 +41,17 @@ public ProcessFile() {
* @param segmentSentences if true, return results with segmented sentences
* @return a response object containing the converted/refined TEI XML
*/
public static Response processXML(final InputStream inputStream,
final boolean segmentSentences,
final boolean refine,
final int consolidateReferences,
ServiceConfiguration serviceConfiguration) {
public static Response processXML(final InputStream inputStream,
final boolean segmentSentences,
final boolean refine,
final int consolidateReferences,
ServiceConfiguration serviceConfiguration,
final boolean generateIDs) {
LOGGER.debug(methodLogIn());
Response response = null;
try {
DocumentProcessor documentProcessor = new DocumentProcessor(serviceConfiguration);
String result = documentProcessor.processXML(inputStream, segmentSentences, refine, consolidateReferences);
String result = documentProcessor.processXML(inputStream, segmentSentences, refine, consolidateReferences, generateIDs);

if (result == null || result.length() == 0) {
response = Response.status(Response.Status.NO_CONTENT).build();
5 changes: 3 additions & 2 deletions src/main/java/org/pub2tei/service/ProcessString.java
@@ -39,7 +39,8 @@ public static Response processText(String text,
final boolean segmentSentences,
final boolean refine,
final int consolidateReferences,
ServiceConfiguration serviceConfiguration) {
ServiceConfiguration serviceConfiguration,
Boolean generateIDs) {
LOGGER.debug(methodLogIn());
Response response = null;

@@ -55,7 +56,7 @@

DocumentProcessor documentProcessor = new DocumentProcessor(serviceConfiguration);
InputStream inputStream = new ByteArrayInputStream(text.getBytes());
String retValString = documentProcessor.processXML(inputStream, segmentSentences, refine, consolidateReferences);
String retValString = documentProcessor.processXML(inputStream, segmentSentences, refine, consolidateReferences, generateIDs);

if (!isResultOK(retValString)) {
response = Response.status(Response.Status.NO_CONTENT).build();