diff --git a/.github/workflows/pull_request_test.yml b/.github/workflows/pull_request_test.yml index 6cef0ae..533057a 100644 --- a/.github/workflows/pull_request_test.yml +++ b/.github/workflows/pull_request_test.yml @@ -18,11 +18,13 @@ jobs: # # steps: # - uses: actions/checkout@v1 -# - name: Build & Test +# - name: Build # run: | -# /opt/python/${{ matrix.cp }}/bin/python -m pip install pytest konlpy # /opt/python/${{ matrix.cp }}/bin/python setup.py build install -# /opt/python/${{ matrix.cp }}/bin/python -m pytest test.py +# - name: Test +# run: | +# /opt/python/${{ matrix.cp }}/bin/python -m pip install pytest konlpy +# /opt/python/${{ matrix.cp }}/bin/python -m pytest --verbose test.py build_macos: name: Build for macOS @@ -44,7 +46,7 @@ jobs: - name: Test run: | python -m pip install pytest konlpy - python -m pytest test.py + python -m pytest --verbose test.py build_windows: name: Build for Windows @@ -68,4 +70,4 @@ jobs: - name: Test run: | python -m pip install pytest konlpy - python -m pytest test.py + python -m pytest --verbose test.py diff --git a/README.md b/README.md index 3a1a437..f090fd0 100644 --- a/README.md +++ b/README.md @@ -197,13 +197,17 @@ reader와 receiver를 사용한 예시는 다음과 같습니다. self.input = open(input, encoding='utf-8') self.output = open(output, 'w', encoding='utf-8') - def read(self, id): - if id == 0: + def read(self, sent_id): + if sent_id == 0: self.input.seek(0) - return self.input.readline() - - def write(self, id, res): - print('Analyzed %dth row' % id) + self.iter = iter(self.input) + try: + return next(self.iter) + except StopIteration: + return None + + def write(self, sent_id, res): + print('Analyzed %dth row' % sent_id) self.output.write(' '.join(map(lambda x:x[0]+'/'+x[1], res[0][0])) + '\n') def __del__(self): diff --git a/kiwipiepy.vcxproj b/kiwipiepy.vcxproj index 422ba1b..0402d5a 100644 --- a/kiwipiepy.vcxproj +++ b/kiwipiepy.vcxproj @@ -35,6 +35,8 @@ + + diff --git a/kiwipiepy.vcxproj.filters b/kiwipiepy.vcxproj.filters index 2346edd..8aaf8e8 100644 --- a/kiwipiepy.vcxproj.filters +++ b/kiwipiepy.vcxproj.filters @@ -13,6 +13,12 @@ {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms + + {7797df98-8827-48a4-bba9-d27a0b3aa861} + + + {006c5263-f679-408c-9729-996ce9cc7008} + @@ -51,6 +57,12 @@ + + Release + + + Debug + diff --git a/setup.py b/setup.py index 5476665..87bc427 100644 --- a/setup.py +++ b/setup.py @@ -33,7 +33,7 @@ setup( name='kiwipiepy', - version='0.7.3', + version='0.7.4', description='Kiwi, the Korean Tokenizer for Python', long_description=long_description, diff --git a/src/KiwiPy.cpp b/src/KiwiPy.cpp index f9be524..a42959f 100644 --- a/src/KiwiPy.cpp +++ b/src/KiwiPy.cpp @@ -13,6 +13,45 @@ using namespace std; using namespace kiwi; +struct UniquePyObj +{ + PyObject* obj; + UniquePyObj(PyObject* _obj = nullptr) : obj(_obj) {} + ~UniquePyObj() + { + Py_XDECREF(obj); + } + + UniquePyObj(const UniquePyObj&) = delete; + UniquePyObj& operator=(const UniquePyObj&) = delete; + + UniquePyObj(UniquePyObj&& o) + { + std::swap(obj, o.obj); + } + + UniquePyObj& operator=(UniquePyObj&& o) + { + std::swap(obj, o.obj); + return *this; + } + + PyObject* get() const + { + return obj; + } + + operator bool() const + { + return !!obj; + } + + operator PyObject*() const + { + return obj; + } + }; + #if PY_MAJOR_VERSION < 3 string PyUnicode_AsUTF8(PyObject* obj) { @@ -208,18 +247,14 @@ static PyObject* kiwi__extractWords(KiwiObject* self, PyObject* args, PyObject * { auto res = self->inst->extractWords([argReader](size_t id) -> u16string { - PyObject* argList = Py_BuildValue("(n)", id); - PyObject* retVal = PyEval_CallObject(argReader, argList); - Py_DECREF(argList); + UniquePyObj argList = Py_BuildValue("(n)", id); + UniquePyObj retVal = PyEval_CallObject(argReader, argList); if (!retVal) throw bad_exception(); if (PyObject_Not(retVal)) { - Py_DECREF(retVal); return {}; } - auto p = Kiwi::toU16(PyUnicode_AsUTF8(retVal)); - Py_DECREF(retVal); - return p; + return Kiwi::toU16(PyUnicode_AsUTF8(retVal)); }, minCnt, maxWordLen, minScore); PyObject* retList = PyList_New(res.size()); @@ -256,18 +291,14 @@ static PyObject* kiwi__extractFilterWords(KiwiObject* self, PyObject* args, PyOb { auto res = self->inst->extractWords([argReader](size_t id) -> u16string { - PyObject* argList = Py_BuildValue("(n)", id); - PyObject* retVal = PyEval_CallObject(argReader, argList); - Py_DECREF(argList); + UniquePyObj argList = Py_BuildValue("(n)", id); + UniquePyObj retVal = PyEval_CallObject(argReader, argList); if (!retVal) throw bad_exception(); if (PyObject_Not(retVal)) { - Py_DECREF(retVal); return {}; } - auto p = Kiwi::toU16(PyUnicode_AsUTF8(retVal)); - Py_DECREF(retVal); - return p; + return Kiwi::toU16(PyUnicode_AsUTF8(retVal)); }, minCnt, maxWordLen, minScore); res = self->inst->filterExtractedWords(move(res), posScore); @@ -305,18 +336,14 @@ static PyObject* kiwi__extractAddWords(KiwiObject* self, PyObject* args, PyObjec { auto res = self->inst->extractAddWords([argReader](size_t id) -> u16string { - PyObject* argList = Py_BuildValue("(n)", id); - PyObject* retVal = PyEval_CallObject(argReader, argList); - Py_DECREF(argList); + UniquePyObj argList = Py_BuildValue("(n)", id); + UniquePyObj retVal = PyEval_CallObject(argReader, argList); if (!retVal) throw bad_exception(); if (PyObject_Not(retVal)) { - Py_DECREF(retVal); return {}; } - auto p = Kiwi::toU16(PyUnicode_AsUTF8(retVal)); - Py_DECREF(retVal); - return p; + return Kiwi::toU16(PyUnicode_AsUTF8(retVal)); }, minCnt, maxWordLen, minScore, posScore); PyObject* retList = PyList_New(res.size()); @@ -421,7 +448,7 @@ PyObject* resToPyList(const vector& res) { PyList_SetItem(rList, jdx++, Py_BuildValue("(ssnn)", Kiwi::toU8(q.str()).c_str(), tagToString(q.tag()), (size_t)q.pos(), (size_t)q.len())); } - PyList_SetItem(retList, idx++, Py_BuildValue("(Of)", rList, p.second)); + PyList_SetItem(retList, idx++, Py_BuildValue("(Nf)", rList, p.second)); } return retList; } @@ -434,16 +461,16 @@ static PyObject* kiwi__analyze(KiwiObject* self, PyObject* args, PyObject *kwarg static const char* kwlist[] = { "text", "top_n", nullptr }; if (PyArg_ParseTupleAndKeywords(args, kwargs, "s|n", (char**)kwlist, &text, &topN)) { - try + //try { auto res = self->inst->analyze(text, topN); return resToPyList(res); } - catch (const exception& e) + /*catch (const exception& e) { PyErr_SetString(PyExc_Exception, e.what()); return nullptr; - } + }*/ } PyErr_Clear(); } @@ -458,26 +485,19 @@ static PyObject* kiwi__analyze(KiwiObject* self, PyObject* args, PyObject *kwarg if (!PyCallable_Check(receiver)) return PyErr_SetString(PyExc_TypeError, "'analyze' requires 2nd parameter as callable"), nullptr; self->inst->analyze(topN, [&reader](size_t id)->u16string { - PyObject* argList = Py_BuildValue("(n)", id); - PyObject* retVal = PyEval_CallObject(reader, argList); - Py_DECREF(argList); + UniquePyObj argList = Py_BuildValue("(n)", id); + UniquePyObj retVal = PyEval_CallObject(reader, argList); if (!retVal) throw bad_exception(); if (PyObject_Not(retVal)) { - Py_DECREF(retVal); return {}; } - auto p = Kiwi::toU16(PyUnicode_AsUTF8(retVal)); - Py_DECREF(retVal); - return p; + return Kiwi::toU16(PyUnicode_AsUTF8(retVal)); }, [&receiver](size_t id, vector&& res) { - PyObject* l = resToPyList(res); - PyObject* argList = Py_BuildValue("(nO)", id, l); - PyObject* ret = PyEval_CallObject(receiver, argList); + UniquePyObj argList = Py_BuildValue("(nN)", id, resToPyList(res)); + UniquePyObj ret = PyEval_CallObject(receiver, argList); if(!ret) throw bad_exception(); - Py_DECREF(ret); - Py_DECREF(argList); }); Py_INCREF(Py_None); return Py_None; @@ -513,26 +533,19 @@ static PyObject* kiwi__perform(KiwiObject* self, PyObject* args, PyObject *kwarg self->inst->perform(topN, [&reader](size_t id)->u16string { - PyObject* argList = Py_BuildValue("(n)", id); - PyObject* retVal = PyEval_CallObject(reader, argList); - Py_DECREF(argList); + UniquePyObj argList = Py_BuildValue("(n)", id); + UniquePyObj retVal = PyEval_CallObject(reader, argList); if (!retVal) throw bad_exception(); if (PyObject_Not(retVal)) { - Py_DECREF(retVal); return {}; } - auto p = Kiwi::toU16(PyUnicode_AsUTF8(retVal)); - Py_DECREF(retVal); - return p; + return Kiwi::toU16(PyUnicode_AsUTF8(retVal)); }, [&receiver](size_t id, vector&& res) { - PyObject* l = resToPyList(res); - PyObject* argList = Py_BuildValue("(nO)", id, l); - PyObject* ret = PyEval_CallObject(receiver, argList); + UniquePyObj argList = Py_BuildValue("(nN)", id, resToPyList(res)); + UniquePyObj ret = PyEval_CallObject(receiver, argList); if (!ret) throw bad_exception(); - Py_DECREF(ret); - Py_DECREF(argList); }, minCnt, maxWordLen, minScore, posScore); Py_INCREF(Py_None); return Py_None; diff --git a/src/core/KMemory.h b/src/core/KMemory.h index 19c7ac5..2359a89 100644 --- a/src/core/KMemory.h +++ b/src/core/KMemory.h @@ -17,7 +17,7 @@ namespace kiwi void* allocate() { - assert(_CrtCheckMemory()); + //assert(_CrtCheckMemory()); //std::lock_guard lg(lock); if (!freeList) { @@ -37,7 +37,7 @@ namespace kiwi //fprintf(stderr, "deallocate %p\n", p); *((void**)p) = freeList; freeList = (void**)p; - assert(_CrtCheckMemory()); + //assert(_CrtCheckMemory()); } private: diff --git a/src/core/KModelMgr.cpp b/src/core/KModelMgr.cpp index 346f1bf..845503c 100644 --- a/src/core/KModelMgr.cpp +++ b/src/core/KModelMgr.cpp @@ -468,12 +468,13 @@ void KModelMgr::solidify() trieRoot[i].val = &forms[i - 1]; } + bool once = false; for (size_t i = (size_t)KPOSTag::SN; i < forms.size(); ++i) { auto& f = forms[i]; if (f.candidate.empty()) continue; size_t realSize = f.form.size(); - if (f.form.find(u'\x2665') != k_string::npos) + if (!once && f.form.find(u'\x2665') != k_string::npos) { realSize = f.form.find(u'\x2665') + 1; } @@ -482,9 +483,10 @@ void KModelMgr::solidify() trieRoot.emplace_back(); return &trieRoot.back(); }); - if (f.form.find(u'\x2665') != k_string::npos) + if (!once && f.form.find(u'\x2665') != k_string::npos) { f.form = f.form.substr(f.form.find(u'\x2665') + 1); + once = true; } } trieRoot[0].fillFail(); diff --git a/src/core/KTrie.cpp b/src/core/KTrie.cpp index 0475978..09f7296 100644 --- a/src/core/KTrie.cpp +++ b/src/core/KTrie.cpp @@ -35,10 +35,10 @@ vector KTrie::split(const k_string& str) const return nBegin == g.lastPos && lastSpecialEndPos == g.lastPos - (g.uform.empty() ? g.form->form.size() : g.uform.size()); }); - // inserting unknown form + // insert unknown form if (nBegin > lastSpecialEndPos && !longestMatched && !(0x11A8 <= cand->form[0] && cand->form[0] < (0x11A7 + 28)) - && str[nBegin - space -1] != 0x11BB) // cannot ends with + && str[nBegin - space -1] != 0x11BB) // cannot end with { auto it2 = spacePos.find(lastSpecialEndPos - 1); int space2 = it2 == spacePos.end() ? 0 : it2->second; diff --git a/src/core/Kiwi.cpp b/src/core/Kiwi.cpp index 1bab1dd..3ad536d 100644 --- a/src/core/Kiwi.cpp +++ b/src/core/Kiwi.cpp @@ -764,12 +764,12 @@ vector Kiwi::analyze(const u16string & str, size_t topN) const sents.emplace_back(chunk); } } - sents.emplace_back(str.end()); + if(sents.back() != str.end()) sents.emplace_back(str.end()); + if (sents.size() <= 1) return vector(1); vector ret = analyzeSent(sents[0], sents[1], topN); if (ret.empty()) { - ret.emplace_back(); - return ret; + return vector(1); } while (ret.size() < topN) ret.emplace_back(ret.back()); for (size_t i = 2; i < sents.size(); ++i) diff --git a/test.py b/test.py index e230ca8..26239b8 100644 --- a/test.py +++ b/test.py @@ -6,13 +6,16 @@ def __init__(self, input, output=None): self.input = input self.output = output - def read(self, id): - if id == 0: + def read(self, sent_id): + if sent_id == 0: self.input.seek(0) - return self.input.readline() + self.iter = iter(self.input) + try: + return next(self.iter) + except StopIteration: + return None - def write(self, id, res): - print('Analyzed %dth row' % id) + def write(self, sent_id, res): self.output.write(' '.join(map(lambda x:x[0]+'/'+x[1], res[0][0])) + '\n') def __del__(self): @@ -33,6 +36,6 @@ def test_analyze_multi(): def test_extract_words(): kiwi = Kiwi() + kiwi.prepare() handle = IOHandler(kolaw.open('constitution.txt')) - - + kiwi.extract_words(handle.read)