Merge pull request #219 from bab2min/dev_improve_utils
add more utilities
bab2min authored Jul 31, 2024
2 parents 235f85c + 65f914a commit 5b480b4
Showing 10 changed files with 199 additions and 22 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/deploy.yml
@@ -107,8 +107,8 @@ jobs:
         TWINE_USERNAME=${{ secrets.PYPI_USERNAME }} TWINE_PASSWORD=${{ secrets.PYPI_PASSWORD }} /opt/python/${{ matrix.cp }}/bin/python -m twine upload wheelhouse/*.whl
   build_macos_11:
-    name: Build for macOS 11
-    runs-on: macOS-11
+    name: Build for macOS 13
+    runs-on: macOS-13
     strategy:
       max-parallel: 4
       matrix:
@@ -141,7 +141,7 @@ jobs:
        TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
      run: |
        export AUDITWHEEL_PLAT=many
-        export MACOSX_DEPLOYMENT_TARGET=11.7
+        export MACOSX_DEPLOYMENT_TARGET=10.14
        python -m pip install twine wheel numpy==`python .github/workflows/numpy_version.py`
        TOMOTOPY_CPU_ARCH=${{ matrix.cpu-arch }} python setup.py bdist_wheel
        twine upload dist/*
6 changes: 3 additions & 3 deletions .github/workflows/deploy_test.yml
@@ -106,8 +106,8 @@ jobs:
         TWINE_USERNAME=${{ secrets.TEST_PYPI_USERNAME }} TWINE_PASSWORD=${{ secrets.TEST_PYPI_PASSWORD }} /opt/python/${{ matrix.cp }}/bin/python -m twine upload --repository testpypi wheelhouse/*.whl
   build_macos_11:
-    name: Build for macOS 11
-    runs-on: macOS-11
+    name: Build for macOS 13
+    runs-on: macOS-13
     strategy:
       max-parallel: 4
       matrix:
@@ -140,7 +140,7 @@ jobs:
        TWINE_PASSWORD: ${{ secrets.TEST_PYPI_PASSWORD }}
      run: |
        export AUDITWHEEL_PLAT=many
-        export MACOSX_DEPLOYMENT_TARGET=11.7
+        export MACOSX_DEPLOYMENT_TARGET=10.14
        python -m pip install twine wheel numpy==`python .github/workflows/numpy_version.py`
        TOMOTOPY_CPU_ARCH=${{ matrix.cpu-arch }} python setup.py bdist_wheel
        twine upload --repository testpypi dist/*
6 changes: 3 additions & 3 deletions .github/workflows/pull_request_test.yml
@@ -103,8 +103,8 @@ jobs:
          path: artifacts/build.tgz

   build_macos_11:
-    name: Build for macOS 11
-    runs-on: macOS-11
+    name: Build for macOS 13
+    runs-on: macOS-13
     strategy:
       max-parallel: 4
       matrix:
@@ -132,7 +132,7 @@ jobs:
        mv variant-1.1.3/include/mapbox include/
    - name: Build
      run: |
-        export MACOSX_DEPLOYMENT_TARGET=11.7
+        export MACOSX_DEPLOYMENT_TARGET=10.14
        python -m pip install numpy==`python .github/workflows/numpy_version.py`
        TOMOTOPY_CPU_ARCH=${{ matrix.cpu-arch }} python setup.py build install
    - name: Archive binary
1 change: 1 addition & 0 deletions src/TopicModel/LDAModel.hpp
@@ -1066,6 +1066,7 @@ namespace tomoto
 void prepare(bool initDocs = true, size_t minWordCnt = 0, size_t minWordDf = 0, size_t removeTopN = 0, bool updateStopwords = true) override
 {
     if (initDocs && updateStopwords) this->removeStopwords(minWordCnt, minWordDf, removeTopN);
+    static_cast<DerivedClass*>(this)->updateWordFormCnts();
     static_cast<DerivedClass*>(this)->updateWeakArray();
     static_cast<DerivedClass*>(this)->initGlobalState(initDocs);
     static_cast<DerivedClass*>(this)->prepareWordPriors();
63 changes: 62 additions & 1 deletion src/TopicModel/TopicModel.hpp
@@ -1,4 +1,4 @@
-#pragma once
+#pragma once
 #include <numeric>
 #include <unordered_set>
 #include "../Utils/Utils.hpp"
@@ -251,6 +251,7 @@ namespace tomoto
 virtual const std::vector<uint64_t>& getVocabCf() const = 0;
 virtual std::vector<double> getVocabWeightedCf() const = 0;
 virtual const std::vector<uint64_t>& getVocabDf() const = 0;
+virtual const std::vector<std::vector<std::pair<std::string, size_t>>>& getWordFormCnts() const = 0;

 virtual int train(size_t iteration, size_t numWorkers, ParallelScheme ps = ParallelScheme::default_, bool freeze_topics = false) = 0;
 virtual size_t getGlobalStep() const = 0;
@@ -260,6 +261,7 @@
 virtual size_t getNumTopicsForPrior() const = 0;
 virtual std::vector<Float> getWidsByTopic(size_t tid, bool normalize = true) const = 0;
 virtual std::vector<std::pair<std::string, Float>> getWordsByTopicSorted(size_t tid, size_t topN) const = 0;
+virtual std::vector<std::tuple<std::string, Vid, Float>> getWordIdsByTopicSorted(size_t tid, size_t topN) const = 0;

 virtual std::vector<std::pair<std::string, Float>> getWordsByDocSorted(const DocumentBase* doc, size_t topN) const = 0;

@@ -319,6 +321,7 @@ namespace tomoto
 size_t globalStep = 0;
 _ModelState globalState, tState;
 Dictionary dict;
+std::vector<std::vector<std::pair<std::string, size_t>>> wordFormCnts;
 uint64_t realV = 0; // vocab size after removing stopwords
 uint64_t realN = 0; // total word size after removing stopwords
 double weightedN = 0;
@@ -565,6 +568,44 @@ namespace tomoto
     }
 }

+void updateWordFormCnts()
+{
+    wordFormCnts.clear();
+    wordFormCnts.resize(realV);
+    std::vector<std::unordered_map<std::string, size_t>> cnts(realV);
+    for (auto& doc : docs)
+    {
+        for (size_t i = 0; i < doc.words.size(); ++i)
+        {
+            auto w = doc.words[i];
+            if (w >= realV) continue;
+            auto& cnt = cnts[w];
+            std::string word;
+            if (!doc.rawStr.empty() && i < doc.origWordPos.size())
+            {
+                word = doc.rawStr.substr(doc.origWordPos[i], doc.origWordLen[i]);
+            }
+            else
+            {
+                word = dict.toWord(w);
+            }
+            ++cnt[word];
+        }
+    }
+
+    for (size_t i = 0; i < realV; ++i)
+    {
+        auto& cnt = cnts[i];
+        std::vector<std::pair<std::string, size_t>> v{ std::make_move_iterator(cnt.begin()), std::make_move_iterator(cnt.end()) };
+        std::sort(v.begin(), v.end(), [](const std::pair<std::string, size_t>& a, const std::pair<std::string, size_t>& b)
+        {
+            return a.second > b.second;
+        });
+        wordFormCnts[i] = move(v);
+        cnt.clear();
+    }
+}
+
 int restoreFromTrainingError(const exc::TrainingError& e, ThreadPool& pool, _ModelState* localData, _RandGen* rgs)
 {
     throw e;
@@ -751,11 +792,26 @@ namespace tomoto
     return ret;
 }

+std::vector<std::tuple<std::string, Vid, Float>> vid2StringVid(const std::vector<std::pair<Vid, Float>>& vids) const
+{
+    std::vector<std::tuple<std::string, Vid, Float>> ret(vids.size());
+    for (size_t i = 0; i < vids.size(); ++i)
+    {
+        ret[i] = std::make_tuple(dict.toWord(vids[i].first), vids[i].first, vids[i].second);
+    }
+    return ret;
+}
+
 std::vector<std::pair<std::string, Float>> getWordsByTopicSorted(size_t tid, size_t topN) const override
 {
     return vid2String(getWidsByTopicSorted(tid, topN));
 }

+std::vector<std::tuple<std::string, Vid, Float>> getWordIdsByTopicSorted(size_t tid, size_t topN) const override
+{
+    return vid2StringVid(getWidsByTopicSorted(tid, topN));
+}
+
 std::vector<std::pair<Vid, Float>> getWidsByDocSorted(const DocumentBase* doc, size_t topN) const
 {
     std::vector<Float> cnt(dict.size());
@@ -872,6 +928,11 @@ namespace tomoto
     return vocabDf;
 }

+const std::vector<std::vector<std::pair<std::string, size_t>>>& getWordFormCnts() const override
+{
+    return wordFormCnts;
+}
+
 void saveModel(std::ostream& writer, bool fullModel, const std::vector<uint8_t>* extra_data) const override
 {
     static_cast<const _Derived*>(this)->_saveModel(writer, fullModel, extra_data);
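The new updateWordFormCnts() walks every document after stopword removal, maps each surviving vocab id back to the surface form it actually appeared as (taken from the raw string span when the document keeps one, otherwise the normalized dictionary entry), and stores one frequency-sorted list of (form, count) pairs per vocab id. A rough Python sketch of the same bookkeeping, purely for illustration — the names docs, vocab, real_v and the per-document attributes mirror the C++ fields and are not part of tomotopy's Python API:

from collections import Counter

def update_word_form_cnts(docs, vocab, real_v):
    # Sketch of the counting done in updateWordFormCnts(); `docs`, `vocab`
    # and `real_v` stand in for the model's documents, used vocabulary and
    # number of non-removed vocab items (illustrative names, not the API).
    counters = [Counter() for _ in range(real_v)]
    for doc in docs:
        for i, w in enumerate(doc.words):
            if w >= real_v:
                continue  # removed vocab / stopword ids are skipped
            if doc.raw_str and i < len(doc.orig_word_pos):
                # recover the surface form from the original text span
                start = doc.orig_word_pos[i]
                form = doc.raw_str[start:start + doc.orig_word_len[i]]
            else:
                form = vocab[w]  # fall back to the normalized dictionary entry
            counters[w][form] += 1
    # one list of (form, count) pairs per vocab id, most frequent first
    return [c.most_common() for c in counters]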
3 changes: 2 additions & 1 deletion src/Utils/Dictionary.h
@@ -12,8 +12,9 @@ namespace tomoto
 {
     using Vid = uint32_t;
     static constexpr Vid non_vocab_id = (Vid)-1;
+    static constexpr Vid rm_vocab_id = (Vid)-2;
     using Tid = uint16_t;
-    static constexpr Vid non_topic_id = (Tid)-1;
+    static constexpr Tid non_topic_id = (Tid)-1;
     using Float = float;

     struct VidPair : public std::pair<Vid, Vid>
25 changes: 25 additions & 0 deletions src/python/PyUtils.h
@@ -748,6 +748,31 @@ namespace py
     }
 };

+template<typename _Ty1, typename _Ty2, typename _Ty3>
+struct ValueBuilder<std::tuple<_Ty1, _Ty2, _Ty3>>
+{
+    PyObject* operator()(const std::tuple<_Ty1, _Ty2, _Ty3>& v)
+    {
+        PyObject* ret = PyTuple_New(3);
+        size_t id = 0;
+        PyTuple_SetItem(ret, id++, buildPyValue(std::get<0>(v)));
+        PyTuple_SetItem(ret, id++, buildPyValue(std::get<1>(v)));
+        PyTuple_SetItem(ret, id++, buildPyValue(std::get<2>(v)));
+        return ret;
+    }
+
+    template<typename _FailMsg>
+    std::tuple<_Ty1, _Ty2, _Ty3> _toCpp(PyObject* obj, _FailMsg&&)
+    {
+        if (PyTuple_Size(obj) != 3) throw ConversionFail{ "input is not tuple with len=3" };
+        return std::make_tuple(
+            toCpp<_Ty1>(PyTuple_GetItem(obj, 0)),
+            toCpp<_Ty2>(PyTuple_GetItem(obj, 1)),
+            toCpp<_Ty3>(PyTuple_GetItem(obj, 2))
+        );
+    }
+};
+
 template<typename _Ty1, typename _Ty2>
 struct ValueBuilder<std::unordered_map<_Ty1, _Ty2>>
 {
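This ValueBuilder specialization is what lets the new three-element results cross the C++/Python boundary as ordinary Python tuples (and lets 3-tuples passed from Python be parsed back). On the Python side nothing special is needed; assuming a trained tomotopy model mdl, the triples unpack like any tuple:

# assuming a trained tomotopy model `mdl`
word, word_id, prob = mdl.get_topic_words(0, top_n=1, return_id=True)[0]
print(word, word_id, prob)   # e.g. model 12 0.0412 (illustrative values)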
19 changes: 17 additions & 2 deletions src/python/docs.h
@@ -531,14 +531,19 @@ show_progress : bool
 )"");

 DOC_SIGNATURE_EN_KO(LDA_get_topic_words__doc__,
-    "get_topic_words(self, topic_id, top_n=10)",
+    "get_topic_words(self, topic_id, top_n=10, return_id=False)",
     u8R""(Return the `top_n` words and its probability in the topic `topic_id`.
-The return type is a `list` of (word:`str`, probability:`float`).
+The return type is a `list` of (word:`str`, probability:`float`) tuples if return_id is False,
+otherwise a `list` of (word:`str`, word_id:`int`, probability:`float`) tuples.

 Parameters
 ----------
 topic_id : int
     an integer in range [0, `k`), indicating the topic
+top_n : int
+    the number of words to be returned
+return_id : bool
+    If `True`, it returns the word IDs too.
 )"",
 u8R""(토픽 `topic_id`에 속하는 상위 `top_n`개의 단어와 각각의 확률을 반환합니다.
 반환 타입은 (단어:`str`, 확률:`float`) 튜플의 `list`형입니다.
@@ -547,6 +552,10 @@
 ----------
 topic_id : int
     토픽을 가리키는 [0, `k`) 범위의 정수
+top_n : int
+    반환할 단어의 개수
+return_id : bool
+    참일 경우 단어 ID도 함께 반환합니다.
 )"");

 DOC_SIGNATURE_EN_KO(LDA_get_topic_word_dist__doc__,
@@ -766,6 +775,12 @@ flush : bool
     출력 스트림의 강제 flush 여부
 )"");

+DOC_SIGNATURE_EN_KO(LDA_get_word_forms__doc__,
+    "get_word_forms(self)",
+    u8R""()"",
+    u8R""()"");
+
+

 DOC_VARIABLE_EN_KO(LDA_tw__doc__,
 u8R""(the term weighting scheme (read-only))"",
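The updated docstring can be summarized with a short usage sketch. The corpus handling below is only illustrative (a whitespace-tokenized text file is assumed) and the printed values are made up, but the tuple shapes follow the docstring above:

import tomotopy as tp

mdl = tp.LDAModel(k=10)
with open('corpus.txt', encoding='utf-8') as f:   # one document per line (assumed)
    for line in f:
        mdl.add_doc(line.split())
mdl.train(500)

# default: a list of (word, probability) pairs
print(mdl.get_topic_words(0, top_n=3))
# e.g. [('topic', 0.041), ('model', 0.035), ('word', 0.030)]

# return_id=True: a list of (word, word_id, probability) triples
print(mdl.get_topic_words(0, top_n=3, return_id=True))
# e.g. [('topic', 7, 0.041), ('model', 12, 0.035), ('word', 3, 0.030)]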
39 changes: 35 additions & 4 deletions src/python/py_LDA.cpp
@@ -250,16 +250,23 @@ static PyObject* LDA_train(TopicModelObject* self, PyObject* args, PyObject *kwargs)

 PyObject* LDA_getTopicWords(TopicModelObject* self, PyObject* args, PyObject *kwargs)
 {
-    size_t topicId, topN = 10;
-    static const char* kwlist[] = { "topic_id", "top_n", nullptr };
-    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "n|n", (char**)kwlist, &topicId, &topN)) return nullptr;
+    size_t topicId, topN = 10, returnId = 0;
+    static const char* kwlist[] = { "topic_id", "top_n", "return_id", nullptr};
+    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "n|np", (char**)kwlist, &topicId, &topN, &returnId)) return nullptr;
     return py::handleExc([&]()
     {
         if (!self->inst) throw py::RuntimeError{ "inst is null" };
         auto* inst = static_cast<tomoto::ILDAModel*>(self->inst);
         if (topicId >= inst->getK()) throw py::ValueError{ "must topic_id < K" };

-        return py::buildPyValue(inst->getWordsByTopicSorted(topicId, topN));
+        if (returnId)
+        {
+            return py::buildPyValue(inst->getWordIdsByTopicSorted(topicId, topN));
+        }
+        else
+        {
+            return py::buildPyValue(inst->getWordsByTopicSorted(topicId, topN));
+        }
     });
 }

@@ -582,6 +589,29 @@ static PyObject* LDA_summary(TopicModelObject* self, PyObject* args, PyObject* kwargs)
     });
 }

+static PyObject* LDA_get_word_forms(TopicModelObject* self, PyObject* args, PyObject* kwargs)
+{
+    size_t idx = -1;
+    static const char* kwlist[] = { "idx", nullptr};
+    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|n", (char**)kwlist, &idx)) return nullptr;
+    return py::handleExc([&]()
+    {
+        if (!self->inst) throw py::RuntimeError{ "inst is null" };
+        if (idx == (size_t)-1)
+        {
+            return py::buildPyValue(self->inst->getWordFormCnts());
+        }
+        else
+        {
+            if (idx >= self->inst->getWordFormCnts().size())
+            {
+                throw py::ValueError{ "`idx` must be less than the `len(used_vocabs)`." };
+            }
+            return py::buildPyValue(self->inst->getWordFormCnts()[idx]);
+        }
+    });
+}
+
 static PyObject* LDA_copy(TopicModelObject* self)
 {
     return py::handleExc([&]()
@@ -782,6 +812,7 @@ static PyMethodDef LDA_methods[] =
     { "copy", (PyCFunction)LDA_copy, METH_NOARGS, LDA_copy__doc__},
     { "_update_vocab", (PyCFunction)LDA_update_vocab, METH_VARARGS | METH_KEYWORDS, ""},
     { "summary", (PyCFunction)LDA_summary, METH_VARARGS | METH_KEYWORDS, LDA_summary__doc__},
+    { "get_word_forms", (PyCFunction)LDA_get_word_forms, METH_VARARGS | METH_KEYWORDS, LDA_get_word_forms__doc__},
     { nullptr }
 };
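LDA_get_word_forms backs the get_word_forms() method registered above. Read from the binding code: with no argument it returns one frequency-sorted list of (surface_form, count) pairs per used vocab item, and an optional idx (not listed in the docstring's get_word_forms(self) signature) selects a single entry, raising ValueError when out of range. A hedged sketch, assuming the trained model mdl from the earlier example; the surface forms only differ from the vocab entries when documents carry raw text (e.g. added via tomotopy.utils.Corpus), otherwise each list collapses to a single (word, count) pair:

forms = mdl.get_word_forms()                    # one entry per item of mdl.used_vocabs
for vocab_word, form_counts in list(zip(mdl.used_vocabs, forms))[:5]:
    print(vocab_word, form_counts)              # e.g. apple [('Apple', 12), ('apples', 7)]

print(mdl.get_word_forms(3))                    # single entry, as accepted by the C++ binding
# mdl.get_word_forms(len(mdl.used_vocabs))      # would raise ValueError (idx out of range)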