Skip to content

Commit

Permalink
fix other corner cases
Browse files Browse the repository at this point in the history
  • Loading branch information
lfoppiano committed Mar 29, 2024
1 parent d4f9e79 commit d9cfcef
Show file tree
Hide file tree
Showing 6 changed files with 94 additions and 12 deletions.
10 changes: 7 additions & 3 deletions src/main/java/org/grobid/core/data/Unit.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,14 @@
import com.fasterxml.jackson.core.util.BufferRecyclers;
import org.grobid.core.layout.LayoutToken;
import org.grobid.core.utilities.OffsetPosition;
import org.jetbrains.annotations.NotNull;

import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;

import static org.grobid.core.utilities.TextParser.handleRawData;

/**
* Class for managing normalized Unit representation.
Expand Down Expand Up @@ -104,13 +109,12 @@ public String toString() {
}*/

public String toJson() {
JsonStringEncoder encoder = JsonStringEncoder.getInstance();
StringBuilder json = new StringBuilder();
boolean started = false;
json.append("{ ");
if (rawName != null) {
byte[] encodedRawName = encoder.quoteAsUTF8(rawName);
String outputRawName = new String(encodedRawName);
String outputRawName = handleRawData(rawName);

started = true;
json.append("\"name\" : \"").append(outputRawName).append("\"");
}
Expand Down
4 changes: 2 additions & 2 deletions src/main/java/org/grobid/core/engines/ValueParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -171,14 +171,14 @@ protected BigDecimal parseValueBlock(ValueBlock block, Locale locale) {
return null;
}

private String removeTilde(String text) {
public static String removeTilde(String text) {
if (StringUtils.startsWithAny(text, "∼", "~")) {
return RegExUtils.replaceAll(text, TILDE_PATTERNS, "");
}
return text;
}

private String removeSpacesTabsAndBl(String block) {
public static String removeSpacesTabsAndBl(String block) {
return UnicodeUtil.normaliseText(block)
.replaceAll("\n", " ")
.replaceAll("\t", " ")
Expand Down
22 changes: 22 additions & 0 deletions src/main/java/org/grobid/core/utilities/TextParser.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package org.grobid.core.utilities;

import com.fasterxml.jackson.core.io.JsonStringEncoder;
import com.googlecode.clearnlp.component.srl.CRolesetClassifier;
import com.googlecode.clearnlp.dependency.AbstractDEPParser;
import com.googlecode.clearnlp.dependency.DEPTree;
Expand All @@ -22,12 +23,14 @@
import org.grobid.core.exceptions.GrobidException;
import org.grobid.core.lang.SentenceDetector;
import org.grobid.core.lang.impl.OpenNLPSentenceDetector;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.net.URLDecoder;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
Expand Down Expand Up @@ -246,4 +249,23 @@ public List<Sentence> parse(BufferedReader reader) throws GrobidException {
public List<Sentence> parseFile(String inputFile) throws GrobidException {
return parse(UTInput.createBufferedFileReader(inputFile));
}

/**
 * Escapes a raw string so that it can be embedded between double quotes in a JSON document.
 *
 * <p>The value is first passed as-is to Jackson's {@link JsonStringEncoder}. If that fails
 * (presumably because of unpaired surrogate characters in the input -- TODO confirm), the
 * value is URL-decoded on a best-effort basis, every surrogate code unit is dropped, and the
 * cleaned text is encoded again.
 *
 * @param rawValue the text to escape; {@code null} is treated as an empty string
 * @return the JSON-escaped text without surrounding quotes, never {@code null}
 */
@NotNull
public static String handleRawData(String rawValue) {
    if (rawValue == null) {
        // Honour the @NotNull contract instead of failing with a NullPointerException.
        return "";
    }

    JsonStringEncoder encoder = JsonStringEncoder.getInstance();
    try {
        // quoteAsUTF8 produces UTF-8 bytes: decode them explicitly as UTF-8 rather than
        // with the platform default charset, which would corrupt non-ASCII characters.
        return new String(encoder.quoteAsUTF8(rawValue), java.nio.charset.StandardCharsets.UTF_8);
    } catch (Exception e) {
        String decoded;
        try {
            decoded = URLDecoder.decode(rawValue, "UTF-8");
        } catch (IllegalArgumentException | java.io.UnsupportedEncodingException decodeFailure) {
            // A stray '%' (e.g. in a percentage) is not a valid escape sequence;
            // keep the text untouched rather than aborting the fallback.
            decoded = rawValue;
        }

        // Drop every UTF-16 surrogate code unit, paired or not.
        StringBuilder cleaned = new StringBuilder(decoded.length());
        for (int i = 0; i < decoded.length(); i++) {
            char current = decoded.charAt(i);
            if (!Character.isSurrogate(current)) {
                cleaned.append(current);
            }
        }

        return new String(encoder.quoteAsUTF8(cleaned.toString()), java.nio.charset.StandardCharsets.UTF_8);
    }
}
}
37 changes: 32 additions & 5 deletions src/main/java/org/grobid/core/utilities/WordsToNumber.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import org.apache.commons.lang3.StringUtils;
import org.checkerframework.checker.units.qual.N;
import org.grobid.core.data.normalization.NormalizationException;
import org.grobid.core.engines.ValueParser;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
Expand Down Expand Up @@ -181,7 +182,8 @@ public Set<String> getTokenSet() {
return numberTokens;
}

public BigDecimal normalize(String text, Locale local) throws NormalizationException {
public BigDecimal normalize(String text, Locale locale) throws NormalizationException {
NumberFormat formatter = NumberFormat.getInstance(locale);
text = StringUtils.lowerCase(text);

String numericPart = "";
Expand All @@ -201,11 +203,36 @@ public BigDecimal normalize(String text, Locale local) throws NormalizationExcep
String denominator = m.group(m.groupCount());

BigDecimal division = null;
BigDecimal numeratorAsBigDecimal = null;
BigDecimal denominatorAsBigDecimal = null;

try {
division = new BigDecimal(numerator).divide(new BigDecimal(denominator));
} catch (ArithmeticException ae) {
division = new BigDecimal(numerator).divide(new BigDecimal(denominator), 10, BigDecimal.ROUND_HALF_UP);
String numeratorAsString = ValueParser.removeTilde(ValueParser.removeSpacesTabsAndBl(numerator));
numeratorAsBigDecimal = new BigDecimal(formatter.parse(numeratorAsString).toString());
String denominatorAsString = ValueParser.removeTilde(ValueParser.removeSpacesTabsAndBl(denominator));
denominatorAsBigDecimal = new BigDecimal(formatter.parse(denominatorAsString).toString());
try {
division = numeratorAsBigDecimal.divide(denominatorAsBigDecimal);
} catch (ArithmeticException ae) {
division = numeratorAsBigDecimal.divide(denominatorAsBigDecimal, 10, BigDecimal.ROUND_HALF_UP);
}
} catch (Exception e) {
throw new NormalizationException("Cannot process the values '" + text + "'. The conversion is failing. Skipping them.");
}

// catch (NumberFormatException nfe) {
//
//
// String cleanedNumerator = formatter.parse(numerator.);
// String cleanedDenominator = StringUtils.replaceChars(denominator, ",.", "");
// try {
// division = new BigDecimal(cleanedNumerator).divide(new BigDecimal(cleanedDenominator));
// } catch (ArithmeticException ae) {
// division = new BigDecimal(cleanedNumerator).divide(new BigDecimal(cleanedDenominator), 10, BigDecimal.ROUND_HALF_UP);
// } catch (Exception e) {
// throw new NormalizationException("Cannot process the values '" + text + "'. The conversion is failing. Skipping them.");
// }
// }
return division;
} else if (OUT_OF_PATTERN_ALPHABETIC.matcher(text).find()) {
Matcher m = OUT_OF_PATTERN_ALPHABETIC.matcher(text);
Expand Down Expand Up @@ -250,7 +277,7 @@ public BigDecimal normalize(String text, Locale local) throws NormalizationExcep
}

// decimal part
BigDecimal decimalResult = convertDecimalPart(decimalPart, local);
BigDecimal decimalResult = convertDecimalPart(decimalPart, locale);

return result.add(decimalResult);
} else if (fractions.keySet().stream().filter(text::contains).findFirst().isPresent()) {
Expand Down
22 changes: 22 additions & 0 deletions src/test/kotlin/org/grobid/core/utilities/TextParserTest.kt
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
package org.grobid.core.utilities

import org.hamcrest.MatcherAssert
import org.hamcrest.core.Is
import org.junit.Ignore
import org.junit.Test
import java.math.BigDecimal
import java.nio.charset.StandardCharsets
import java.util.*

/**
 * Tests for the static helpers of [TextParser].
 */
class TextParserTest {

    /**
     * Feeds [TextParser.handleRawData] a string decoded from raw UTF-16LE bytes and expects
     * only the middle dot and the "g/ml" suffix to remain. Currently disabled.
     */
    @Ignore("Cannot reproduce the surrogate character")
    @Test
    @Throws(Exception::class)
    fun testConvertFractions6Numeric() {
        // UTF-16LE code units: U+FFFD, U+00B7 (middle dot), 'g', '/', 'm', 'l'
        val rawBytes = byteArrayOf(-3, -1, -73, 0, 103, 0, 47, 0, 109, 0, 108, 0)
        val rawText = String(rawBytes, StandardCharsets.UTF_16LE)

        MatcherAssert.assertThat(TextParser.handleRawData(rawText), Is.`is`("·g/ml"))
    }
}
11 changes: 9 additions & 2 deletions src/test/kotlin/org/grobid/core/utilities/WordsToNumberTest.kt
Original file line number Diff line number Diff line change
Expand Up @@ -204,12 +204,19 @@ class WordsToNumberTest {
}


@Test(expected = NormalizationException::class)
@Throws(Exception::class)
fun testErrorCase_1() {
val input = "six, 12"
val output = target.normalize(input, Locale.ENGLISH)

target.normalize(input, Locale.ENGLISH)
}

/**
 * "X out of the Y" where Y is written with a comma ("14,759") under the English locale:
 * the expected ratio corresponds to 912 / 14759 at ten decimal places.
 */
@Test
@Throws(Exception::class)
fun testErrorCase_2() {
    val expected = BigDecimal("0.0617928044")

    val actual = target.normalize("912 out of the 14,759", Locale.ENGLISH)

    MatcherAssert.assertThat(actual, Is.`is`(expected))
}

}

0 comments on commit d9cfcef

Please sign in to comment.