Skip to content

Commit

Permalink
Avoid sentence segmentation in the middle of entities
Browse files: browse the repository at this point in the history
  • Loading branch information
lfoppiano committed Feb 20, 2024
1 parent 721b0f0 commit ab7d988
Show file tree
Hide file tree
Showing 3 changed files with 10 additions and 5 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,11 @@ public List<Measurement> process(List<LayoutToken> tokens, List<Measurement> mea
try {
String text = LayoutTokensUtil.toText(tokens);
TextParser textParser = TextParser.getInstance();
List<Sentence> parsedSentences = textParser.parseText(text);
int firstOffset = Iterables.getFirst(tokens, new LayoutToken()).getOffset();
List<OffsetPosition> measurementsOffsets = measurements.stream()
.map(m -> new OffsetPosition(m.getRawOffsets().start - firstOffset, m.getRawOffsets().end - firstOffset))
.collect(Collectors.toList());
List<Sentence> parsedSentences = textParser.parseText(text, measurementsOffsets);
int firstTokenOffsetStart = tokens.get(0).getOffset();
parsedSentences.stream().forEach(s -> {
s.getOffset().start = s.getOffsetStart() + firstTokenOffsetStart;
Expand Down
2 changes: 1 addition & 1 deletion src/main/java/org/grobid/core/engines/QuantityParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -812,7 +812,7 @@ private OffsetPosition findSentenceOffset(List<OffsetPosition> sentences, Offset
OffsetPosition defaultValue = new OffsetPosition(0, 0);
return new OffsetPosition(Iterables.getFirst(sentencesCurrentMeasure, defaultValue).start, Iterables.getLast(sentencesCurrentMeasure).end);
} else {
LOGGER.warn("Cannot find sentence. The entity might be inconsistent: " + currentMeasureOffset.toString());
LOGGER.warn("Cannot find sentence. The entity might be outside the sentence: " + currentMeasureOffset.toString());
OffsetPosition defaultValue = new OffsetPosition(0, 0);
return new OffsetPosition(Iterables.getFirst(sentences, defaultValue).start, Iterables.getLast(sentences).end);
}
Expand Down
7 changes: 4 additions & 3 deletions src/main/java/org/grobid/core/utilities/TextParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import com.googlecode.clearnlp.util.UTInput;
import com.googlecode.clearnlp.util.pair.Pair;
import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.grobid.core.data.Sentence;
import org.grobid.core.data.SentenceParse;
import org.grobid.core.exceptions.GrobidException;
Expand Down Expand Up @@ -168,16 +169,16 @@ private List<SentenceParse> getSentenceParses(String sentence) {
* semantic role labeling) as the n-best list of Parse objects. If the CLEAR_PARSER is
* selected, only the best parse is provided in the list.
*/
public synchronized List<Sentence> parseText(String text) throws GrobidException {
public synchronized List<Sentence> parseText(String text, List<OffsetPosition> measurementOffsets) throws GrobidException {
if (text == null) {
throw new GrobidException("Cannot parse the sentence, because it is null.");
} else if (text.length() == 0) {
} else if (StringUtils.isEmpty(text)) {
LOGGER.error("The length of the text to be parsed is 0.");
return null;
}

List<Sentence> results = new ArrayList<>();
List<OffsetPosition> sentences = this.segmenter.runSentenceDetection(text);
List<OffsetPosition> sentences = this.segmenter.runSentenceDetection(text, measurementOffsets);

if (CollectionUtils.isEmpty(sentences)) {
// there is some text but not in a state so that a sentence at least can be
Expand Down

0 comments on commit ab7d988

Please sign in to comment.