diff --git a/.github/workflows/pull_request_test.yml b/.github/workflows/pull_request_test.yml
index 6cef0ae..533057a 100644
--- a/.github/workflows/pull_request_test.yml
+++ b/.github/workflows/pull_request_test.yml
@@ -18,11 +18,13 @@ jobs:
#
# steps:
# - uses: actions/checkout@v1
-# - name: Build & Test
+# - name: Build
# run: |
-# /opt/python/${{ matrix.cp }}/bin/python -m pip install pytest konlpy
# /opt/python/${{ matrix.cp }}/bin/python setup.py build install
-# /opt/python/${{ matrix.cp }}/bin/python -m pytest test.py
+# - name: Test
+# run: |
+# /opt/python/${{ matrix.cp }}/bin/python -m pip install pytest konlpy
+# /opt/python/${{ matrix.cp }}/bin/python -m pytest --verbose test.py
build_macos:
name: Build for macOS
@@ -44,7 +46,7 @@ jobs:
- name: Test
run: |
python -m pip install pytest konlpy
- python -m pytest test.py
+ python -m pytest --verbose test.py
build_windows:
name: Build for Windows
@@ -68,4 +70,4 @@ jobs:
- name: Test
run: |
python -m pip install pytest konlpy
- python -m pytest test.py
+ python -m pytest --verbose test.py
diff --git a/README.md b/README.md
index 3a1a437..f090fd0 100644
--- a/README.md
+++ b/README.md
@@ -197,13 +197,17 @@ reader와 receiver를 사용한 예시는 다음과 같습니다.
self.input = open(input, encoding='utf-8')
self.output = open(output, 'w', encoding='utf-8')
- def read(self, id):
- if id == 0:
+ def read(self, sent_id):
+ if sent_id == 0:
self.input.seek(0)
- return self.input.readline()
-
- def write(self, id, res):
- print('Analyzed %dth row' % id)
+ self.iter = iter(self.input)
+ try:
+ return next(self.iter)
+ except StopIteration:
+ return None
+
+ def write(self, sent_id, res):
+ print('Analyzed %dth row' % sent_id)
self.output.write(' '.join(map(lambda x:x[0]+'/'+x[1], res[0][0])) + '\n')
def __del__(self):
diff --git a/kiwipiepy.vcxproj b/kiwipiepy.vcxproj
index 422ba1b..0402d5a 100644
--- a/kiwipiepy.vcxproj
+++ b/kiwipiepy.vcxproj
@@ -35,6 +35,8 @@
+
+
diff --git a/kiwipiepy.vcxproj.filters b/kiwipiepy.vcxproj.filters
index 2346edd..8aaf8e8 100644
--- a/kiwipiepy.vcxproj.filters
+++ b/kiwipiepy.vcxproj.filters
@@ -13,6 +13,12 @@
{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}
rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms
+
+ {7797df98-8827-48a4-bba9-d27a0b3aa861}
+
+
+ {006c5263-f679-408c-9729-996ce9cc7008}
+
@@ -51,6 +57,12 @@
+
+ Release
+
+
+ Debug
+
diff --git a/setup.py b/setup.py
index 5476665..87bc427 100644
--- a/setup.py
+++ b/setup.py
@@ -33,7 +33,7 @@
setup(
name='kiwipiepy',
- version='0.7.3',
+ version='0.7.4',
description='Kiwi, the Korean Tokenizer for Python',
long_description=long_description,
diff --git a/src/KiwiPy.cpp b/src/KiwiPy.cpp
index f9be524..a42959f 100644
--- a/src/KiwiPy.cpp
+++ b/src/KiwiPy.cpp
@@ -13,6 +13,45 @@
using namespace std;
using namespace kiwi;
+/// Owning holder for a single PyObject reference.
+/// The held reference is released with Py_XDECREF when the holder is destroyed.
+struct UniquePyObj
+{
+	PyObject* obj = nullptr; // held reference, or nullptr when the holder is empty
+
+	/// Adopts `_obj` as-is; the reference count is not incremented here.
+	UniquePyObj(PyObject* _obj = nullptr) : obj(_obj) {}
+
+	/// Releases the held reference; Py_XDECREF accepts a null pointer.
+	~UniquePyObj() { Py_XDECREF(obj); }
+
+	// Copying is disabled so the held reference is released exactly once.
+	UniquePyObj(const UniquePyObj&) = delete;
+	UniquePyObj& operator=(const UniquePyObj&) = delete;
+
+	/// Takes over the reference held by `o` and leaves `o` empty.
+	/// The member is initialized directly: swapping with a not-yet-initialized
+	/// `obj` would hand `o` an indeterminate pointer to release in its destructor.
+	UniquePyObj(UniquePyObj&& o) noexcept : obj(o.obj) { o.obj = nullptr; }
+
+	/// Exchanges the held references with `o`.
+	/// The reference previously held here is released when `o` is destroyed.
+	UniquePyObj& operator=(UniquePyObj&& o) noexcept
+	{
+		std::swap(obj, o.obj);
+		return *this;
+	}
+
+	/// Returns the held pointer; ownership stays with this holder.
+	PyObject* get() const { return obj; }
+
+	/// True when a non-null object is held.
+	operator bool() const { return !!obj; }
+
+	/// Implicit access to the held pointer so the holder can be passed to C API calls.
+	operator PyObject*() const { return obj; }
+};
+
#if PY_MAJOR_VERSION < 3
string PyUnicode_AsUTF8(PyObject* obj)
{
@@ -208,18 +247,14 @@ static PyObject* kiwi__extractWords(KiwiObject* self, PyObject* args, PyObject *
{
auto res = self->inst->extractWords([argReader](size_t id) -> u16string
{
- PyObject* argList = Py_BuildValue("(n)", id);
- PyObject* retVal = PyEval_CallObject(argReader, argList);
- Py_DECREF(argList);
+ UniquePyObj argList = Py_BuildValue("(n)", id);
+ UniquePyObj retVal = PyEval_CallObject(argReader, argList);
if (!retVal) throw bad_exception();
if (PyObject_Not(retVal))
{
- Py_DECREF(retVal);
return {};
}
- auto p = Kiwi::toU16(PyUnicode_AsUTF8(retVal));
- Py_DECREF(retVal);
- return p;
+ return Kiwi::toU16(PyUnicode_AsUTF8(retVal));
}, minCnt, maxWordLen, minScore);
PyObject* retList = PyList_New(res.size());
@@ -256,18 +291,14 @@ static PyObject* kiwi__extractFilterWords(KiwiObject* self, PyObject* args, PyOb
{
auto res = self->inst->extractWords([argReader](size_t id) -> u16string
{
- PyObject* argList = Py_BuildValue("(n)", id);
- PyObject* retVal = PyEval_CallObject(argReader, argList);
- Py_DECREF(argList);
+ UniquePyObj argList = Py_BuildValue("(n)", id);
+ UniquePyObj retVal = PyEval_CallObject(argReader, argList);
if (!retVal) throw bad_exception();
if (PyObject_Not(retVal))
{
- Py_DECREF(retVal);
return {};
}
- auto p = Kiwi::toU16(PyUnicode_AsUTF8(retVal));
- Py_DECREF(retVal);
- return p;
+ return Kiwi::toU16(PyUnicode_AsUTF8(retVal));
}, minCnt, maxWordLen, minScore);
res = self->inst->filterExtractedWords(move(res), posScore);
@@ -305,18 +336,14 @@ static PyObject* kiwi__extractAddWords(KiwiObject* self, PyObject* args, PyObjec
{
auto res = self->inst->extractAddWords([argReader](size_t id) -> u16string
{
- PyObject* argList = Py_BuildValue("(n)", id);
- PyObject* retVal = PyEval_CallObject(argReader, argList);
- Py_DECREF(argList);
+ UniquePyObj argList = Py_BuildValue("(n)", id);
+ UniquePyObj retVal = PyEval_CallObject(argReader, argList);
if (!retVal) throw bad_exception();
if (PyObject_Not(retVal))
{
- Py_DECREF(retVal);
return {};
}
- auto p = Kiwi::toU16(PyUnicode_AsUTF8(retVal));
- Py_DECREF(retVal);
- return p;
+ return Kiwi::toU16(PyUnicode_AsUTF8(retVal));
}, minCnt, maxWordLen, minScore, posScore);
PyObject* retList = PyList_New(res.size());
@@ -421,7 +448,7 @@ PyObject* resToPyList(const vector& res)
{
PyList_SetItem(rList, jdx++, Py_BuildValue("(ssnn)", Kiwi::toU8(q.str()).c_str(), tagToString(q.tag()), (size_t)q.pos(), (size_t)q.len()));
}
- PyList_SetItem(retList, idx++, Py_BuildValue("(Of)", rList, p.second));
+ PyList_SetItem(retList, idx++, Py_BuildValue("(Nf)", rList, p.second));
}
return retList;
}
@@ -434,16 +461,16 @@ static PyObject* kiwi__analyze(KiwiObject* self, PyObject* args, PyObject *kwarg
static const char* kwlist[] = { "text", "top_n", nullptr };
if (PyArg_ParseTupleAndKeywords(args, kwargs, "s|n", (char**)kwlist, &text, &topN))
{
- try
+ //try
{
auto res = self->inst->analyze(text, topN);
return resToPyList(res);
}
- catch (const exception& e)
+ /*catch (const exception& e)
{
PyErr_SetString(PyExc_Exception, e.what());
return nullptr;
- }
+ }*/
}
PyErr_Clear();
}
@@ -458,26 +485,19 @@ static PyObject* kiwi__analyze(KiwiObject* self, PyObject* args, PyObject *kwarg
if (!PyCallable_Check(receiver)) return PyErr_SetString(PyExc_TypeError, "'analyze' requires 2nd parameter as callable"), nullptr;
self->inst->analyze(topN, [&reader](size_t id)->u16string
{
- PyObject* argList = Py_BuildValue("(n)", id);
- PyObject* retVal = PyEval_CallObject(reader, argList);
- Py_DECREF(argList);
+ UniquePyObj argList = Py_BuildValue("(n)", id);
+ UniquePyObj retVal = PyEval_CallObject(reader, argList);
if (!retVal) throw bad_exception();
if (PyObject_Not(retVal))
{
- Py_DECREF(retVal);
return {};
}
- auto p = Kiwi::toU16(PyUnicode_AsUTF8(retVal));
- Py_DECREF(retVal);
- return p;
+ return Kiwi::toU16(PyUnicode_AsUTF8(retVal));
}, [&receiver](size_t id, vector&& res)
{
- PyObject* l = resToPyList(res);
- PyObject* argList = Py_BuildValue("(nO)", id, l);
- PyObject* ret = PyEval_CallObject(receiver, argList);
+ UniquePyObj argList = Py_BuildValue("(nN)", id, resToPyList(res));
+ UniquePyObj ret = PyEval_CallObject(receiver, argList);
if(!ret) throw bad_exception();
- Py_DECREF(ret);
- Py_DECREF(argList);
});
Py_INCREF(Py_None);
return Py_None;
@@ -513,26 +533,19 @@ static PyObject* kiwi__perform(KiwiObject* self, PyObject* args, PyObject *kwarg
self->inst->perform(topN, [&reader](size_t id)->u16string
{
- PyObject* argList = Py_BuildValue("(n)", id);
- PyObject* retVal = PyEval_CallObject(reader, argList);
- Py_DECREF(argList);
+ UniquePyObj argList = Py_BuildValue("(n)", id);
+ UniquePyObj retVal = PyEval_CallObject(reader, argList);
if (!retVal) throw bad_exception();
if (PyObject_Not(retVal))
{
- Py_DECREF(retVal);
return {};
}
- auto p = Kiwi::toU16(PyUnicode_AsUTF8(retVal));
- Py_DECREF(retVal);
- return p;
+ return Kiwi::toU16(PyUnicode_AsUTF8(retVal));
}, [&receiver](size_t id, vector&& res)
{
- PyObject* l = resToPyList(res);
- PyObject* argList = Py_BuildValue("(nO)", id, l);
- PyObject* ret = PyEval_CallObject(receiver, argList);
+ UniquePyObj argList = Py_BuildValue("(nN)", id, resToPyList(res));
+ UniquePyObj ret = PyEval_CallObject(receiver, argList);
if (!ret) throw bad_exception();
- Py_DECREF(ret);
- Py_DECREF(argList);
}, minCnt, maxWordLen, minScore, posScore);
Py_INCREF(Py_None);
return Py_None;
diff --git a/src/core/KMemory.h b/src/core/KMemory.h
index 19c7ac5..2359a89 100644
--- a/src/core/KMemory.h
+++ b/src/core/KMemory.h
@@ -17,7 +17,7 @@ namespace kiwi
void* allocate()
{
- assert(_CrtCheckMemory());
+ //assert(_CrtCheckMemory());
//std::lock_guard lg(lock);
if (!freeList)
{
@@ -37,7 +37,7 @@ namespace kiwi
//fprintf(stderr, "deallocate %p\n", p);
*((void**)p) = freeList;
freeList = (void**)p;
- assert(_CrtCheckMemory());
+ //assert(_CrtCheckMemory());
}
private:
diff --git a/src/core/KModelMgr.cpp b/src/core/KModelMgr.cpp
index 346f1bf..845503c 100644
--- a/src/core/KModelMgr.cpp
+++ b/src/core/KModelMgr.cpp
@@ -468,12 +468,13 @@ void KModelMgr::solidify()
trieRoot[i].val = &forms[i - 1];
}
+ bool once = false;
for (size_t i = (size_t)KPOSTag::SN; i < forms.size(); ++i)
{
auto& f = forms[i];
if (f.candidate.empty()) continue;
size_t realSize = f.form.size();
- if (f.form.find(u'\x2665') != k_string::npos)
+ if (!once && f.form.find(u'\x2665') != k_string::npos)
{
realSize = f.form.find(u'\x2665') + 1;
}
@@ -482,9 +483,10 @@ void KModelMgr::solidify()
trieRoot.emplace_back();
return &trieRoot.back();
});
- if (f.form.find(u'\x2665') != k_string::npos)
+ if (!once && f.form.find(u'\x2665') != k_string::npos)
{
f.form = f.form.substr(f.form.find(u'\x2665') + 1);
+ once = true;
}
}
trieRoot[0].fillFail();
diff --git a/src/core/KTrie.cpp b/src/core/KTrie.cpp
index 0475978..09f7296 100644
--- a/src/core/KTrie.cpp
+++ b/src/core/KTrie.cpp
@@ -35,10 +35,10 @@ vector KTrie::split(const k_string& str) const
return nBegin == g.lastPos && lastSpecialEndPos == g.lastPos - (g.uform.empty() ? g.form->form.size() : g.uform.size());
});
- // inserting unknown form
+ // insert unknown form
if (nBegin > lastSpecialEndPos && !longestMatched
&& !(0x11A8 <= cand->form[0] && cand->form[0] < (0x11A7 + 28))
- && str[nBegin - space -1] != 0x11BB) // cannot ends with
+ && str[nBegin - space -1] != 0x11BB) // cannot end with
{
auto it2 = spacePos.find(lastSpecialEndPos - 1);
int space2 = it2 == spacePos.end() ? 0 : it2->second;
diff --git a/src/core/Kiwi.cpp b/src/core/Kiwi.cpp
index 1bab1dd..3ad536d 100644
--- a/src/core/Kiwi.cpp
+++ b/src/core/Kiwi.cpp
@@ -764,12 +764,12 @@ vector Kiwi::analyze(const u16string & str, size_t topN) const
sents.emplace_back(chunk);
}
}
- sents.emplace_back(str.end());
+ if(sents.back() != str.end()) sents.emplace_back(str.end());
+ if (sents.size() <= 1) return vector(1);
vector ret = analyzeSent(sents[0], sents[1], topN);
if (ret.empty())
{
- ret.emplace_back();
- return ret;
+ return vector(1);
}
while (ret.size() < topN) ret.emplace_back(ret.back());
for (size_t i = 2; i < sents.size(); ++i)
diff --git a/test.py b/test.py
index e230ca8..26239b8 100644
--- a/test.py
+++ b/test.py
@@ -6,13 +6,16 @@ def __init__(self, input, output=None):
self.input = input
self.output = output
- def read(self, id):
- if id == 0:
+ def read(self, sent_id):
+ if sent_id == 0:
self.input.seek(0)
- return self.input.readline()
+ self.iter = iter(self.input)
+ try:
+ return next(self.iter)
+ except StopIteration:
+ return None
- def write(self, id, res):
- print('Analyzed %dth row' % id)
+ def write(self, sent_id, res):
self.output.write(' '.join(map(lambda x:x[0]+'/'+x[1], res[0][0])) + '\n')
def __del__(self):
@@ -33,6 +36,6 @@ def test_analyze_multi():
def test_extract_words():
kiwi = Kiwi()
+ kiwi.prepare()
handle = IOHandler(kolaw.open('constitution.txt'))
-
-
+ kiwi.extract_words(handle.read)