Skip to content

Commit

Permalink
avoid adjusting the labelling when a <valueRange> is involved, as the…
Browse files Browse the repository at this point in the history
…y are not produced with I- starting element
  • Loading branch information
lfoppiano committed Mar 28, 2024
1 parent 895690f commit cab5b4c
Show file tree
Hide file tree
Showing 4 changed files with 88 additions and 13 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,15 @@ private QuantitiesTaggingLabels() {
super();
}

private static final String QUANTITY_VALUE_ATOMIC_LABEL = "<valueAtomic>";
private static final String QUANTITY_VALUE_LEAST_LABEL = "<valueLeast>";
private static final String QUANTITY_VALUE_MOST_LABEL = "<valueMost>";
private static final String QUANTITY_VALUE_LIST_LABEL = "<valueList>";
private static final String QUANTITY_UNIT_LEFT_LABEL = "<unitLeft>";
private static final String QUANTITY_UNIT_RIGHT_LABEL = "<unitRight>";
private static final String QUANTITY_VALUE_BASE_LABEL = "<valueBase>";
private static final String QUANTITY_VALUE_RANGE_LABEL = "<valueRange>";
private static final String QUANTITY_OTHER_LABEL = "<other>";
public static final String QUANTITY_VALUE_ATOMIC_LABEL = "<valueAtomic>";
public static final String QUANTITY_VALUE_LEAST_LABEL = "<valueLeast>";
public static final String QUANTITY_VALUE_MOST_LABEL = "<valueMost>";
public static final String QUANTITY_VALUE_LIST_LABEL = "<valueList>";
public static final String QUANTITY_UNIT_LEFT_LABEL = "<unitLeft>";
public static final String QUANTITY_UNIT_RIGHT_LABEL = "<unitRight>";
public static final String QUANTITY_VALUE_BASE_LABEL = "<valueBase>";
public static final String QUANTITY_VALUE_RANGE_LABEL = "<valueRange>";
public static final String QUANTITY_OTHER_LABEL = "<other>";

private static final String UNIT_VALUE_BASE_LABEL = "<base>";
private static final String UNIT_VALUE_POW_LABEL = "<pow>";
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
package org.grobid.core.engines.utilities

import org.grobid.core.engines.label.QuantitiesTaggingLabels
import org.grobid.core.engines.label.TaggingLabels
import java.util.*
import java.util.stream.Collectors

Expand All @@ -14,7 +16,9 @@ class LabellingUtils {
fun correctLabelling(resultLabelling: String): String? {
val resultAsList: MutableList<MutableList<String>> = Arrays
.stream(resultLabelling.split("\n".toRegex()).dropLastWhile { it.isEmpty() }.toTypedArray())
.map { i: String -> Arrays.asList(*i.split("\t".toRegex()).dropLastWhile { it.isEmpty() }.toTypedArray()) }
.map { i: String ->
Arrays.asList(*i.split("\t".toRegex()).dropLastWhile { it.isEmpty() }.toTypedArray())
}
.collect(Collectors.toList())

var previousLabel: String? = null;
Expand All @@ -28,7 +32,8 @@ class LabellingUtils {

if (fixNext
&& (currentLabel.startsWith("I-")
|| currentLabel.equals("<other>"))
|| currentLabel.equals(TaggingLabels.OTHER_LABEL)
|| currentLabel.equals(QuantitiesTaggingLabels.QUANTITY_VALUE_RANGE_LABEL))
) {
fixNext = false
fixValue = null
Expand All @@ -37,7 +42,9 @@ class LabellingUtils {
if (!fixNext
&& previousLabel != null
// && previousLabel.startsWith("I-")
&& currentLabel != "<other>"
&& currentLabel != TaggingLabels.OTHER_LABEL
&& !(currentLabel == QuantitiesTaggingLabels.QUANTITY_VALUE_RANGE_LABEL
|| previousLabel == QuantitiesTaggingLabels.QUANTITY_VALUE_RANGE_LABEL)
&& !currentLabel.startsWith("I-")
&& !previousLabel.replace("I-", "").equals(currentLabel)
) {
Expand All @@ -58,7 +65,7 @@ class LabellingUtils {
val fixedListAsString = resultAsList
.map { i -> i.joinToString(separator = "\t") }
.toList().joinToString(separator = "\n")

return fixedListAsString
}
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
package org.grobid.trainer.sax;

import static org.junit.jupiter.api.Assertions.*;

/**
 * Placeholder test class: it currently declares no test methods.
 *
 * NOTE(review): presumably intended to cover MeasureAnnotationSaxHandler — confirm and
 * add test cases, or remove this empty class.
 */
class MeasureAnnotationSaxHandlerTest {

}
Original file line number Diff line number Diff line change
Expand Up @@ -93,8 +93,69 @@ class LabellingUtilsTest {
"billion\t0\t0\tNOPUNCT\t<alpha>"

assertEquals(expected, output)
}

/**
 * Checks that [LabellingUtils.correctLabelling] leaves a labelled sequence untouched when it
 * contains `<valueRange>` tokens that are not introduced by an `I-` prefixed label.
 *
 * In the fixture below every `<valueRange>` token follows an `<other>` token ("±") and is
 * itself followed by `<unitLeft>` tokens; the expected outcome is that the output is
 * identical to the input.
 */
@Test
fun testCorrectingRangeValues() {
    // One token per line: tab-separated features, with the label as the last column.
    val input = "70\t70\t7\t70\t70\t70\t0\t70\t70\t70\tNOCAPS\tALLDIGIT\t0\tNOPUNCT\tdd\td\t0\t0\tI-<valueBase>\n" +
        "±\t±\t±\t±\t±\t±\t±\t±\t±\t±\tALLCAPS\tNODIGIT\t1\tNOPUNCT\t±\t±\t0\t0\t<other>\n" +
        "9\t9\t9\t9\t9\t9\t9\t9\t9\t9\tNOCAPS\tALLDIGIT\t1\tNOPUNCT\td\td\t0\t0\t<valueRange>\n" +
        "kg\tkg\tk\tkg\tkg\tkg\tg\tkg\tkg\tkg\tNOCAPS\tNODIGIT\t0\tNOPUNCT\txx\tx\t1\t0\t<unitLeft>\n" +
        ",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tALLCAPS\tNODIGIT\t1\tCOMMA\t,\t,\t0\t0\t<other>\n" +
        "15\t15\t1\t15\t15\t15\t5\t15\t15\t15\tNOCAPS\tALLDIGIT\t0\tNOPUNCT\tdd\td\t0\t0\tI-<valueBase>\n" +
        "±\t±\t±\t±\t±\t±\t±\t±\t±\t±\tALLCAPS\tNODIGIT\t1\tNOPUNCT\t±\t±\t0\t0\t<other>\n" +
        "5\t5\t5\t5\t5\t5\t5\t5\t5\t5\tNOCAPS\tALLDIGIT\t1\tNOPUNCT\td\td\t0\t0\t<valueRange>\n" +
        "%\t%\t%\t%\t%\t%\t%\t%\t%\t%\tALLCAPS\tNODIGIT\t1\tNOPUNCT\t%\t%\t0\t0\t<unitLeft>\n" +
        "of\tof\to\tof\tof\tof\tf\tof\tof\tof\tNOCAPS\tNODIGIT\t0\tNOPUNCT\txx\tx\t1\t0\t<other>\n" +
        "fat\tfat\tf\tfa\tfat\tfat\tt\tat\tfat\tfat\tNOCAPS\tNODIGIT\t0\tNOPUNCT\txxx\tx\t1\t0\t<other>\n" +
        "mass\tmass\tm\tma\tmas\tmass\ts\tss\tass\tmass\tNOCAPS\tNODIGIT\t0\tNOPUNCT\txxxx\tx\t1\t0\t<other>\n" +
        ",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tALLCAPS\tNODIGIT\t1\tCOMMA\t,\t,\t0\t0\t<other>\n" +
        "VO\tvo\tV\tVO\tVO\tVO\tO\tVO\tVO\tVO\tALLCAPS\tNODIGIT\t0\tNOPUNCT\tXX\tX\t0\t0\t<other>\n" +
        "2\t2\t2\t2\t2\t2\t2\t2\t2\t2\tNOCAPS\tALLDIGIT\t1\tNOPUNCT\td\td\t0\t0\t<other>\n" +
        "max\tmax\tm\tma\tmax\tmax\tx\tax\tmax\tmax\tNOCAPS\tNODIGIT\t0\tNOPUNCT\txxx\tx\t0\t0\t<other>\n" +
        ":\t:\t:\t:\t:\t:\t:\t:\t:\t:\tALLCAPS\tNODIGIT\t1\tPUNCT\t:\t:\t0\t0\t<other>\n" +
        "50\t50\t5\t50\t50\t50\t0\t50\t50\t50\tNOCAPS\tALLDIGIT\t0\tNOPUNCT\tdd\td\t0\t0\tI-<valueBase>\n" +
        "±\t±\t±\t±\t±\t±\t±\t±\t±\t±\tALLCAPS\tNODIGIT\t1\tNOPUNCT\t±\t±\t0\t0\t<other>\n" +
        "8\t8\t8\t8\t8\t8\t8\t8\t8\t8\tNOCAPS\tALLDIGIT\t1\tNOPUNCT\td\td\t0\t0\t<valueRange>\n" +
        "ml\tml\tm\tml\tml\tml\tl\tml\tml\tml\tNOCAPS\tNODIGIT\t0\tNOPUNCT\txx\tx\t1\t0\t<unitLeft>\n" +
        "\t\t\t\t\t\t\t\t\t\tALLCAPS\tNODIGIT\t1\tNOPUNCT\t\t\t0\t0\t<unitLeft>\n" +
        "kg\tkg\tk\tkg\tkg\tkg\tg\tkg\tkg\tkg\tNOCAPS\tNODIGIT\t0\tNOPUNCT\txx\tx\t1\t0\t<unitLeft>\n" +
        "-\t-\t-\t-\t-\t-\t-\t-\t-\t-\tALLCAPS\tNODIGIT\t1\tHYPHEN\t-\t-\t0\t0\t<unitLeft>\n" +
        "1\t1\t1\t1\t1\t1\t1\t1\t1\t1\tNOCAPS\tALLDIGIT\t1\tNOPUNCT\td\td\t0\t0\t<unitLeft>\n" +
        "\t\t\t\t\t\t\t\t\t\tALLCAPS\tNODIGIT\t1\tNOPUNCT\t\t\t0\t0\t<unitLeft>\n" +
        "min\tmin\tm\tmi\tmin\tmin\tn\tin\tmin\tmin\tNOCAPS\tNODIGIT\t0\tNOPUNCT\txxx\tx\t1\t0\t<unitLeft>\n" +
        "-\t-\t-\t-\t-\t-\t-\t-\t-\t-\tALLCAPS\tNODIGIT\t1\tHYPHEN\t-\t-\t0\t0\t<unitLeft>\n" +
        "1\t1\t1\t1\t1\t1\t1\t1\t1\t1\tNOCAPS\tALLDIGIT\t1\tNOPUNCT\td\td\t0\t0\t<unitLeft>\n" +
        "and\tand\ta\tan\tand\tand\td\tnd\tand\tand\tNOCAPS\tNODIGIT\t0\tNOPUNCT\txxx\tx\t0\t0\t<other>\n" +
        "21\t21\t2\t21\t21\t21\t1\t21\t21\t21\tNOCAPS\tALLDIGIT\t0\tNOPUNCT\tdd\td\t0\t0\tI-<valueAtomic>\n" +
        "of\tof\to\tof\tof\tof\tf\tof\tof\tof\tNOCAPS\tNODIGIT\t0\tNOPUNCT\txx\tx\t1\t0\t<other>\n" +
        "race\trace\tr\tra\trac\trace\te\tce\tace\trace\tNOCAPS\tNODIGIT\t0\tNOPUNCT\txxxx\tx\t0\t0\t<other>\n" +
        "A\ta\tA\tA\tA\tA\tA\tA\tA\tA\tALLCAPS\tNODIGIT\t1\tNOPUNCT\tX\tX\t1\t0\t<other>\n" +
        "(\t(\t(\t(\t(\t(\t(\t(\t(\t(\tALLCAPS\tNODIGIT\t1\tOPENBRACKET\t(\t(\t0\t0\t<other>\n" +
        "6\t6\t6\t6\t6\t6\t6\t6\t6\t6\tNOCAPS\tALLDIGIT\t1\tNOPUNCT\td\td\t0\t0\tI-<valueAtomic>\n" +
        "women\twomen\tw\two\twom\twome\tn\ten\tmen\tomen\tNOCAPS\tNODIGIT\t0\tNOPUNCT\txxxx\tx\t0\t0\t<other>\n" +
        "and\tand\ta\tan\tand\tand\td\tnd\tand\tand\tNOCAPS\tNODIGIT\t0\tNOPUNCT\txxx\tx\t0\t0\t<other>\n" +
        "15\t15\t1\t15\t15\t15\t5\t15\t15\t15\tNOCAPS\tALLDIGIT\t0\tNOPUNCT\tdd\td\t0\t0\tI-<valueAtomic>\n" +
        "men\tmen\tm\tme\tmen\tmen\tn\ten\tmen\tmen\tNOCAPS\tNODIGIT\t0\tNOPUNCT\txxx\tx\t0\t0\t<other>\n" +
        ",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tALLCAPS\tNODIGIT\t1\tCOMMA\t,\t,\t0\t0\t<other>\n" +
        "40\t40\t4\t40\t40\t40\t0\t40\t40\t40\tNOCAPS\tALLDIGIT\t0\tNOPUNCT\tdd\td\t0\t0\tI-<valueBase>\n" +
        "±\t±\t±\t±\t±\t±\t±\t±\t±\t±\tALLCAPS\tNODIGIT\t1\tNOPUNCT\t±\t±\t0\t0\t<other>\n" +
        "7\t7\t7\t7\t7\t7\t7\t7\t7\t7\tNOCAPS\tALLDIGIT\t1\tNOPUNCT\td\td\t0\t0\t<valueRange>\n" +
        "years\tyears\ty\tye\tyea\tyear\ts\trs\tars\tears\tNOCAPS\tNODIGIT\t0\tNOPUNCT\txxxx\tx\t1\t0\t<unitLeft>\n" +
        ",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tALLCAPS\tNODIGIT\t1\tCOMMA\t,\t,\t0\t0\t<other>\n" +
        "176\t176\t1\t17\t176\t176\t6\t76\t176\t176\tNOCAPS\tALLDIGIT\t0\tNOPUNCT\tddd\td\t0\t0\tI-<valueBase>\n" +
        "±\t±\t±\t±\t±\t±\t±\t±\t±\t±\tALLCAPS\tNODIGIT\t1\tNOPUNCT\t±\t±\t0\t0\t<other>\n" +
        "7\t7\t7\t7\t7\t7\t7\t7\t7\t7\tNOCAPS\tALLDIGIT\t1\tNOPUNCT\td\td\t0\t0\t<valueRange>\n" +
        "cm\tcm\tc\tcm\tcm\tcm\tm\tcm\tcm\tcm\tNOCAPS\tNODIGIT\t0\tNOPUNCT\txx\tx\t1\t0\t<unitLeft>\n" +
        ",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tALLCAPS\tNODIGIT\t1\tCOMMA\t,\t,\t0\t0\t<other>\n" +
        "72\t72\t7\t72\t72\t72\t2\t72\t72\t72\tNOCAPS\tALLDIGIT\t0\tNOPUNCT\tdd\td\t0\t0\tI-<valueBase>\n" +
        "±\t±\t±\t±\t±\t±\t±\t±\t±\t±\tALLCAPS\tNODIGIT\t1\tNOPUNCT\t±\t±\t0\t0\t<other>\n" +
        "10\t10\t1\t10\t10\t10\t0\t10\t10\t10\tNOCAPS\tALLDIGIT\t0\tNOPUNCT\tdd\td\t0\t0\t<valueRange>\n" +
        "kg\tkg\tk\tkg\tkg\tkg\tg\tkg\tkg\tkg\tNOCAPS\tNODIGIT\t0\tNOPUNCT\txx\tx\t1\t0\t<unitLeft>"

    val output = LabellingUtils.correctLabelling(input)

    // The labelling must come back unchanged; the input is the expected value, so it goes
    // first to keep failure messages in (expected, actual) order.
    assertEquals(input, output)
}

}

0 comments on commit cab5b4c

Please sign in to comment.