Skip to content

Commit

Permalink
Merge pull request #1207 from kermitt2/bugfix/fix-dropped-misclassifi…
Browse files Browse the repository at this point in the history
…ed-text

Handle incomplete/misclassified tables and figures
  • Loading branch information
lfoppiano authored Dec 26, 2024
2 parents 09b28cd + 09c824d commit 4c85ab0
Show file tree
Hide file tree
Showing 10 changed files with 1,067 additions and 408 deletions.
7 changes: 6 additions & 1 deletion grobid-core/src/main/java/org/grobid/core/data/Figure.java
Original file line number Diff line number Diff line change
Expand Up @@ -323,8 +323,12 @@ public String getTeiId() {
return "fig_" + this.id;
}

/**
 * Tells whether this figure holds enough information to be serialized by
 * {@code toTEI}, which returns {@code null} when this check fails.
 *
 * @return {@code true} if the header is not blank, or the caption is not blank,
 *         or at least one graphic object is attached; {@code false} otherwise
 */
public boolean isCompleteForTEI() {
    if (StringUtils.isNotBlank(header)) {
        return true;
    }
    if (StringUtils.isNotBlank(caption)) {
        return true;
    }
    return CollectionUtils.isNotEmpty(graphicObjects);
}

public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter formatter, List<MarkerType> markerTypes) {
if (StringUtils.isEmpty(header) && StringUtils.isEmpty(caption) && CollectionUtils.isEmpty(graphicObjects)) {
if (!isCompleteForTEI()) {
return null;
}
Element figureElement = XmlBuilderUtils.teiElement("figure");
Expand Down Expand Up @@ -568,4 +572,5 @@ public void setLabel(StringBuilder label) {
/**
 * Sets the URI stored on this object.
 *
 * @param uri the URI to store; kept as-is, no validation or copy is performed
 */
public void setUri(URI uri) {
this.uri = uri;
}

}
27 changes: 19 additions & 8 deletions grobid-core/src/main/java/org/grobid/core/data/Table.java
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@
import nu.xom.Attribute;
import nu.xom.Element;
import nu.xom.Node;
import nu.xom.Text;

import static org.grobid.core.document.xml.XmlBuilderUtils.teiElement;
import static org.grobid.core.document.xml.XmlBuilderUtils.addXmlId;
Expand All @@ -43,6 +42,7 @@
public class Table extends Figure {
private List<LayoutToken> contentTokens = new ArrayList<>();
private List<LayoutToken> fullDescriptionTokens = new ArrayList<>();

private boolean goodTable = true;

private StringBuilder note = null;
Expand All @@ -62,9 +62,13 @@ public Table() {
note = new StringBuilder();
}

/**
 * Tells whether this table holds enough information to be serialized by
 * {@code toTEI}, which returns {@code null} when this check fails.
 *
 * <p>Unlike the more permissive check inherited from {@code Figure}, a table
 * requires BOTH a header and a caption. Whitespace-only values are treated as
 * missing, in line with the blank-aware checks used elsewhere in this class.
 *
 * @return {@code true} if both the header and the caption contain
 *         non-whitespace text; {@code false} otherwise
 */
@Override
public boolean isCompleteForTEI() {
    return StringUtils.isNotBlank(header) && StringUtils.isNotBlank(caption);
}

@Override
public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter formatter, List<MarkerType> markerTypes) {
if (StringUtils.isEmpty(header) && StringUtils.isEmpty(caption)) {
if (!isCompleteForTEI()) {
return null;
}

Expand Down Expand Up @@ -104,7 +108,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
addXmlId(desc, "_" + divID);
}

if ( (labeledCaption != null) && (labeledCaption.length() > 0) ) {
if (StringUtils.isNotBlank(labeledCaption)) {
TaggingTokenClusteror clusteror = new TaggingTokenClusteror(GrobidModels.FULLTEXT, labeledCaption, captionLayoutTokens);
List<TaggingTokenCluster> clusters = clusteror.cluster();
for (TaggingTokenCluster cluster : clusters) {
Expand Down Expand Up @@ -169,15 +173,15 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
}

Element noteNode = null;
if (note != null && note.toString().trim().length()>0) {
if (StringUtils.isNotBlank(note)) {

noteNode = XmlBuilderUtils.teiElement("note");
if (config.isGenerateTeiIds()) {
String divID = KeyGen.getKey().substring(0, 7);
addXmlId(noteNode, "_" + divID);
}

if ( (labeledNote != null) && (labeledNote.length() > 0) ) {
if (StringUtils.isNotBlank(labeledNote)) {
TaggingTokenClusteror clusteror = new TaggingTokenClusteror(GrobidModels.FULLTEXT, labeledNote, noteLayoutTokens);
List<TaggingTokenCluster> clusters = clusteror.cluster();
for (TaggingTokenCluster cluster : clusters) {
Expand Down Expand Up @@ -346,9 +350,14 @@ public String getLabeledNote() {
return this.labeledNote;
}

private boolean validateTable() {
/** Check if the table:
* - has label, header and content
* - header starts with "tab"
* - label can be parsed
*/
public boolean validateTable() {
CntManager cnt = Engine.getCntManager();
if (StringUtils.isEmpty(label) || StringUtils.isEmpty(header) || StringUtils.isEmpty(content)) {
if (StringUtils.isAnyBlank(label, header, content)) {
cnt.i(TableRejectionCounters.EMPTY_LABEL_OR_HEADER_OR_CONTENT);
return false;
}
Expand All @@ -359,7 +368,8 @@ private boolean validateTable() {
cnt.i(TableRejectionCounters.CANNOT_PARSE_LABEL_TO_INT);
return false;
}
if (!getHeader().toLowerCase().startsWith("table")) {
// "tab" covers: table, tabelle, tableau, tabella, etc.
if (!StringUtils.startsWithIgnoreCase(getHeader(), "tab")) {
cnt.i(TableRejectionCounters.HEADER_NOT_STARTS_WITH_TABLE_WORD);
return false;
}
Expand Down Expand Up @@ -423,4 +433,5 @@ public boolean isGoodTable() {
public String getTeiId() {
// TEI identifier of a table: the "tab_" prefix followed by this object's id.
return "tab_" + this.id;
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -874,6 +874,7 @@ public static List<GraphicObject> getConnectedGraphics(Block block, Document doc
public void postProcessTables() {
for (Table table : tables) {
if (!table.firstCheck()) {
table.setGoodTable(false);
continue;
}

Expand Down Expand Up @@ -919,7 +920,7 @@ public void postProcessTables() {
table.getContentTokens().clear();
table.getContentTokens().addAll(contentResult);

table.secondCheck();
table.setGoodTable(table.secondCheck());
}
}

Expand Down
Loading

0 comments on commit 4c85ab0

Please sign in to comment.