diff --git a/include/kiwi/Types.h b/include/kiwi/Types.h index 856e471c..a0094dc3 100644 --- a/include/kiwi/Types.h +++ b/include/kiwi/Types.h @@ -322,6 +322,8 @@ namespace kiwi { return !operator==(o); } + + /* Returns position + length; presumably the offset just past the end of this token's span (confirm the units against the field declarations). */ uint32_t endPos() const { return position + length; } }; struct BasicToken diff --git a/src/Kiwi.cpp b/src/Kiwi.cpp index 72c4c9d5..8bef2f9f 100644 --- a/src/Kiwi.cpp +++ b/src/Kiwi.cpp @@ -482,15 +482,32 @@ namespace kiwi auto& t = tokens[i]; if ((i >= nestedEnd) && sp.next(t, nlPos, nestedSentEnd && i == nestedSentEnd)) { + /* includePrevToken is true when the token before i is tagged so, sw, sp or se, ends exactly where token i starts, and starts strictly after token i - 2 ends. The i > 1 guard keeps the tokens[i - 2] access in range. */ bool includePrevToken = i > 1 && + (tokens[i - 1].tag == POSTag::so + || tokens[i - 1].tag == POSTag::sw + || tokens[i - 1].tag == POSTag::sp + || tokens[i - 1].tag == POSTag::se) + && tokens[i - 1].endPos() == tokens[i].position + && tokens[i - 1].position > tokens[i - 2].endPos(); if (nestedSentEnd) { subSentPos++; accumSubSent++; + if (includePrevToken) + { + /* Stamp the preceding token with the already-incremented subSentPos. */ tokens[i - 1].subSentPosition = subSentPos; + } } else { sentPos++; accumSubSent = 1; + if (includePrevToken) + { + /* Stamp the preceding token with the already-incremented sentPos, set its wordPosition to 0 and reset accumWordPos to 0; presumably so word numbering restarts at that token (confirm against the code that later uses accumWordPos). */ tokens[i - 1].sentPosition = sentPos; + tokens[i - 1].wordPosition = 0; + accumWordPos = 0; + } } }