Merge pull request #219 from bab2min/dev_improve_utils
add more utilities
bab2min authored Jul 31, 2024
2 parents 235f85c + 65f914a commit 5b480b4
Showing 10 changed files with 199 additions and 22 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/deploy.yml
@@ -107,8 +107,8 @@ jobs:
         TWINE_USERNAME=${{ secrets.PYPI_USERNAME }} TWINE_PASSWORD=${{ secrets.PYPI_PASSWORD }} /opt/python/${{ matrix.cp }}/bin/python -m twine upload wheelhouse/*.whl
   build_macos_11:
-    name: Build for macOS 11
-    runs-on: macOS-11
+    name: Build for macOS 13
+    runs-on: macOS-13
     strategy:
       max-parallel: 4
       matrix:
@@ -141,7 +141,7 @@ jobs:
        TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
      run: |
        export AUDITWHEEL_PLAT=many
-        export MACOSX_DEPLOYMENT_TARGET=11.7
+        export MACOSX_DEPLOYMENT_TARGET=10.14
        python -m pip install twine wheel numpy==`python .github/workflows/numpy_version.py`
        TOMOTOPY_CPU_ARCH=${{ matrix.cpu-arch }} python setup.py bdist_wheel
        twine upload dist/*
6 changes: 3 additions & 3 deletions .github/workflows/deploy_test.yml
@@ -106,8 +106,8 @@ jobs:
         TWINE_USERNAME=${{ secrets.TEST_PYPI_USERNAME }} TWINE_PASSWORD=${{ secrets.TEST_PYPI_PASSWORD }} /opt/python/${{ matrix.cp }}/bin/python -m twine upload --repository testpypi wheelhouse/*.whl
   build_macos_11:
-    name: Build for macOS 11
-    runs-on: macOS-11
+    name: Build for macOS 13
+    runs-on: macOS-13
     strategy:
       max-parallel: 4
       matrix:
@@ -140,7 +140,7 @@ jobs:
        TWINE_PASSWORD: ${{ secrets.TEST_PYPI_PASSWORD }}
      run: |
        export AUDITWHEEL_PLAT=many
-        export MACOSX_DEPLOYMENT_TARGET=11.7
+        export MACOSX_DEPLOYMENT_TARGET=10.14
        python -m pip install twine wheel numpy==`python .github/workflows/numpy_version.py`
        TOMOTOPY_CPU_ARCH=${{ matrix.cpu-arch }} python setup.py bdist_wheel
        twine upload --repository testpypi dist/*
6 changes: 3 additions & 3 deletions .github/workflows/pull_request_test.yml
@@ -103,8 +103,8 @@ jobs:
          path: artifacts/build.tgz

   build_macos_11:
-    name: Build for macOS 11
-    runs-on: macOS-11
+    name: Build for macOS 13
+    runs-on: macOS-13
     strategy:
       max-parallel: 4
       matrix:
@@ -132,7 +132,7 @@ jobs:
        mv variant-1.1.3/include/mapbox include/
    - name: Build
      run: |
-        export MACOSX_DEPLOYMENT_TARGET=11.7
+        export MACOSX_DEPLOYMENT_TARGET=10.14
        python -m pip install numpy==`python .github/workflows/numpy_version.py`
        TOMOTOPY_CPU_ARCH=${{ matrix.cpu-arch }} python setup.py build install
    - name: Archive binary
1 change: 1 addition & 0 deletions src/TopicModel/LDAModel.hpp
@@ -1066,6 +1066,7 @@ namespace tomoto
 void prepare(bool initDocs = true, size_t minWordCnt = 0, size_t minWordDf = 0, size_t removeTopN = 0, bool updateStopwords = true) override
 {
     if (initDocs && updateStopwords) this->removeStopwords(minWordCnt, minWordDf, removeTopN);
+    static_cast<DerivedClass*>(this)->updateWordFormCnts();
     static_cast<DerivedClass*>(this)->updateWeakArray();
     static_cast<DerivedClass*>(this)->initGlobalState(initDocs);
     static_cast<DerivedClass*>(this)->prepareWordPriors();
63 changes: 62 additions & 1 deletion src/TopicModel/TopicModel.hpp
@@ -1,4 +1,4 @@
-#pragma once
+#pragma once
 #include <numeric>
 #include <unordered_set>
 #include "../Utils/Utils.hpp"
@@ -251,6 +251,7 @@ namespace tomoto
 virtual const std::vector<uint64_t>& getVocabCf() const = 0;
 virtual std::vector<double> getVocabWeightedCf() const = 0;
 virtual const std::vector<uint64_t>& getVocabDf() const = 0;
+virtual const std::vector<std::vector<std::pair<std::string, size_t>>>& getWordFormCnts() const = 0;

 virtual int train(size_t iteration, size_t numWorkers, ParallelScheme ps = ParallelScheme::default_, bool freeze_topics = false) = 0;
 virtual size_t getGlobalStep() const = 0;
@@ -260,6 +261,7 @@
 virtual size_t getNumTopicsForPrior() const = 0;
 virtual std::vector<Float> getWidsByTopic(size_t tid, bool normalize = true) const = 0;
 virtual std::vector<std::pair<std::string, Float>> getWordsByTopicSorted(size_t tid, size_t topN) const = 0;
+virtual std::vector<std::tuple<std::string, Vid, Float>> getWordIdsByTopicSorted(size_t tid, size_t topN) const = 0;

 virtual std::vector<std::pair<std::string, Float>> getWordsByDocSorted(const DocumentBase* doc, size_t topN) const = 0;

@@ -319,6 +321,7 @@ namespace tomoto
 size_t globalStep = 0;
 _ModelState globalState, tState;
 Dictionary dict;
+std::vector<std::vector<std::pair<std::string, size_t>>> wordFormCnts;
 uint64_t realV = 0; // vocab size after removing stopwords
 uint64_t realN = 0; // total word size after removing stopwords
 double weightedN = 0;
@@ -565,6 +568,44 @@ namespace tomoto
     }
 }

+void updateWordFormCnts()
+{
+    wordFormCnts.clear();
+    wordFormCnts.resize(realV);
+    std::vector<std::unordered_map<std::string, size_t>> cnts(realV);
+    for (auto& doc : docs)
+    {
+        for (size_t i = 0; i < doc.words.size(); ++i)
+        {
+            auto w = doc.words[i];
+            if (w >= realV) continue;
+            auto& cnt = cnts[w];
+            std::string word;
+            if (!doc.rawStr.empty() && i < doc.origWordPos.size())
+            {
+                word = doc.rawStr.substr(doc.origWordPos[i], doc.origWordLen[i]);
+            }
+            else
+            {
+                word = dict.toWord(w);
+            }
+            ++cnt[word];
+        }
+    }
+
+    for (size_t i = 0; i < realV; ++i)
+    {
+        auto& cnt = cnts[i];
+        std::vector<std::pair<std::string, size_t>> v{ std::make_move_iterator(cnt.begin()), std::make_move_iterator(cnt.end()) };
+        std::sort(v.begin(), v.end(), [](const std::pair<std::string, size_t>& a, const std::pair<std::string, size_t>& b)
+        {
+            return a.second > b.second;
+        });
+        wordFormCnts[i] = move(v);
+        cnt.clear();
+    }
+}
+
 int restoreFromTrainingError(const exc::TrainingError& e, ThreadPool& pool, _ModelState* localData, _RandGen* rgs)
 {
     throw e;
@@ -751,11 +792,26 @@ namespace tomoto
     return ret;
 }

+std::vector<std::tuple<std::string, Vid, Float>> vid2StringVid(const std::vector<std::pair<Vid, Float>>& vids) const
+{
+    std::vector<std::tuple<std::string, Vid, Float>> ret(vids.size());
+    for (size_t i = 0; i < vids.size(); ++i)
+    {
+        ret[i] = std::make_tuple(dict.toWord(vids[i].first), vids[i].first, vids[i].second);
+    }
+    return ret;
+}
+
 std::vector<std::pair<std::string, Float>> getWordsByTopicSorted(size_t tid, size_t topN) const override
 {
     return vid2String(getWidsByTopicSorted(tid, topN));
 }

+std::vector<std::tuple<std::string, Vid, Float>> getWordIdsByTopicSorted(size_t tid, size_t topN) const override
+{
+    return vid2StringVid(getWidsByTopicSorted(tid, topN));
+}
+
 std::vector<std::pair<Vid, Float>> getWidsByDocSorted(const DocumentBase* doc, size_t topN) const
 {
     std::vector<Float> cnt(dict.size());
@@ -872,6 +928,11 @@ namespace tomoto
     return vocabDf;
 }

+const std::vector<std::vector<std::pair<std::string, size_t>>>& getWordFormCnts() const override
+{
+    return wordFormCnts;
+}
+
 void saveModel(std::ostream& writer, bool fullModel, const std::vector<uint8_t>* extra_data) const override
 {
     static_cast<const _Derived*>(this)->_saveModel(writer, fullModel, extra_data);
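The new updateWordFormCnts() walks every document after stopword removal, maps each surviving vocab id back to the surface form it actually appeared as (taken from the raw string span when the document keeps one, otherwise the normalized dictionary entry), and stores one frequency-sorted list of (form, count) pairs per vocab id. A rough Python sketch of the same bookkeeping, purely for illustration — the names docs, vocab, real_v and the per-document attributes mirror the C++ fields and are not part of tomotopy's Python API:

from collections import Counter

def update_word_form_cnts(docs, vocab, real_v):
    # Sketch of the counting done in updateWordFormCnts(); `docs`, `vocab`
    # and `real_v` stand in for the model's documents, used vocabulary and
    # number of non-removed vocab items (illustrative names, not the API).
    counters = [Counter() for _ in range(real_v)]
    for doc in docs:
        for i, w in enumerate(doc.words):
            if w >= real_v:
                continue  # removed vocab / stopword ids are skipped
            if doc.raw_str and i < len(doc.orig_word_pos):
                # recover the surface form from the original text span
                start = doc.orig_word_pos[i]
                form = doc.raw_str[start:start + doc.orig_word_len[i]]
            else:
                form = vocab[w]  # fall back to the normalized dictionary entry
            counters[w][form] += 1
    # one list of (form, count) pairs per vocab id, most frequent first
    return [c.most_common() for c in counters]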
3 changes: 2 additions & 1 deletion src/Utils/Dictionary.h
@@ -12,8 +12,9 @@ namespace tomoto
 {
     using Vid = uint32_t;
     static constexpr Vid non_vocab_id = (Vid)-1;
+    static constexpr Vid rm_vocab_id = (Vid)-2;
     using Tid = uint16_t;
-    static constexpr Vid non_topic_id = (Tid)-1;
+    static constexpr Tid non_topic_id = (Tid)-1;
     using Float = float;

     struct VidPair : public std::pair<Vid, Vid>
25 changes: 25 additions & 0 deletions src/python/PyUtils.h
@@ -748,6 +748,31 @@ namespace py
     }
 };

+template<typename _Ty1, typename _Ty2, typename _Ty3>
+struct ValueBuilder<std::tuple<_Ty1, _Ty2, _Ty3>>
+{
+    PyObject* operator()(const std::tuple<_Ty1, _Ty2, _Ty3>& v)
+    {
+        PyObject* ret = PyTuple_New(3);
+        size_t id = 0;
+        PyTuple_SetItem(ret, id++, buildPyValue(std::get<0>(v)));
+        PyTuple_SetItem(ret, id++, buildPyValue(std::get<1>(v)));
+        PyTuple_SetItem(ret, id++, buildPyValue(std::get<2>(v)));
+        return ret;
+    }
+
+    template<typename _FailMsg>
+    std::tuple<_Ty1, _Ty2, _Ty3> _toCpp(PyObject* obj, _FailMsg&&)
+    {
+        if (PyTuple_Size(obj) != 3) throw ConversionFail{ "input is not tuple with len=3" };
+        return std::make_tuple(
+            toCpp<_Ty1>(PyTuple_GetItem(obj, 0)),
+            toCpp<_Ty2>(PyTuple_GetItem(obj, 1)),
+            toCpp<_Ty3>(PyTuple_GetItem(obj, 2))
+        );
+    }
+};
+
 template<typename _Ty1, typename _Ty2>
 struct ValueBuilder<std::unordered_map<_Ty1, _Ty2>>
 {
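This ValueBuilder specialization is what lets the new three-element results cross the C++/Python boundary as ordinary Python tuples (and lets 3-tuples passed from Python be parsed back). On the Python side nothing special is needed; assuming a trained tomotopy model mdl, the triples unpack like any tuple:

# assuming a trained tomotopy model `mdl`
word, word_id, prob = mdl.get_topic_words(0, top_n=1, return_id=True)[0]
print(word, word_id, prob)   # e.g. model 12 0.0412 (illustrative values)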
19 changes: 17 additions & 2 deletions src/python/docs.h
@@ -531,14 +531,19 @@ show_progress : bool
 )"");

 DOC_SIGNATURE_EN_KO(LDA_get_topic_words__doc__,
-    "get_topic_words(self, topic_id, top_n=10)",
+    "get_topic_words(self, topic_id, top_n=10, return_id=False)",
     u8R""(Return the `top_n` words and its probability in the topic `topic_id`.
-The return type is a `list` of (word:`str`, probability:`float`).
+The return type is a `list` of (word:`str`, probability:`float`) tuples if return_id is False,
+otherwise a `list` of (word:`str`, word_id:`int`, probability:`float`) tuples.

 Parameters
 ----------
 topic_id : int
     an integer in range [0, `k`), indicating the topic
+top_n : int
+    the number of words to be returned
+return_id : bool
+    If `True`, it returns the word IDs too.
 )"",
 u8R""(토픽 `topic_id`에 속하는 상위 `top_n`개의 단어와 각각의 확률을 반환합니다.
 반환 타입은 (단어:`str`, 확률:`float`) 튜플의 `list`형입니다.
@@ -547,6 +552,10 @@
 ----------
 topic_id : int
     토픽을 가리키는 [0, `k`) 범위의 정수
+top_n : int
+    반환할 단어의 개수
+return_id : bool
+    참일 경우 단어 ID도 함께 반환합니다.
 )"");

 DOC_SIGNATURE_EN_KO(LDA_get_topic_word_dist__doc__,
@@ -766,6 +775,12 @@ flush : bool
     출력 스트림의 강제 flush 여부
 )"");

+DOC_SIGNATURE_EN_KO(LDA_get_word_forms__doc__,
+    "get_word_forms(self)",
+    u8R""()"",
+    u8R""()"");
+
+

 DOC_VARIABLE_EN_KO(LDA_tw__doc__,
 u8R""(the term weighting scheme (read-only))"",
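The updated docstring can be summarized with a short usage sketch. The corpus handling below is only illustrative (a whitespace-tokenized text file is assumed) and the printed values are made up, but the tuple shapes follow the docstring above:

import tomotopy as tp

mdl = tp.LDAModel(k=10)
with open('corpus.txt', encoding='utf-8') as f:   # one document per line (assumed)
    for line in f:
        mdl.add_doc(line.split())
mdl.train(500)

# default: a list of (word, probability) pairs
print(mdl.get_topic_words(0, top_n=3))
# e.g. [('topic', 0.041), ('model', 0.035), ('word', 0.030)]

# return_id=True: a list of (word, word_id, probability) triples
print(mdl.get_topic_words(0, top_n=3, return_id=True))
# e.g. [('topic', 7, 0.041), ('model', 12, 0.035), ('word', 3, 0.030)]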
39 changes: 35 additions & 4 deletions src/python/py_LDA.cpp
@@ -250,16 +250,23 @@ static PyObject* LDA_train(TopicModelObject* self, PyObject* args, PyObject *kwargs)

 PyObject* LDA_getTopicWords(TopicModelObject* self, PyObject* args, PyObject *kwargs)
 {
-    size_t topicId, topN = 10;
-    static const char* kwlist[] = { "topic_id", "top_n", nullptr };
-    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "n|n", (char**)kwlist, &topicId, &topN)) return nullptr;
+    size_t topicId, topN = 10, returnId = 0;
+    static const char* kwlist[] = { "topic_id", "top_n", "return_id", nullptr};
+    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "n|np", (char**)kwlist, &topicId, &topN, &returnId)) return nullptr;
     return py::handleExc([&]()
     {
         if (!self->inst) throw py::RuntimeError{ "inst is null" };
         auto* inst = static_cast<tomoto::ILDAModel*>(self->inst);
         if (topicId >= inst->getK()) throw py::ValueError{ "must topic_id < K" };

-        return py::buildPyValue(inst->getWordsByTopicSorted(topicId, topN));
+        if (returnId)
+        {
+            return py::buildPyValue(inst->getWordIdsByTopicSorted(topicId, topN));
+        }
+        else
+        {
+            return py::buildPyValue(inst->getWordsByTopicSorted(topicId, topN));
+        }
     });
 }

@@ -582,6 +589,29 @@ static PyObject* LDA_summary(TopicModelObject* self, PyObject* args, PyObject* kwargs)
     });
 }

+static PyObject* LDA_get_word_forms(TopicModelObject* self, PyObject* args, PyObject* kwargs)
+{
+    size_t idx = -1;
+    static const char* kwlist[] = { "idx", nullptr};
+    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|n", (char**)kwlist, &idx)) return nullptr;
+    return py::handleExc([&]()
+    {
+        if (!self->inst) throw py::RuntimeError{ "inst is null" };
+        if (idx == (size_t)-1)
+        {
+            return py::buildPyValue(self->inst->getWordFormCnts());
+        }
+        else
+        {
+            if (idx >= self->inst->getWordFormCnts().size())
+            {
+                throw py::ValueError{ "`idx` must be less than the `len(used_vocabs)`." };
+            }
+            return py::buildPyValue(self->inst->getWordFormCnts()[idx]);
+        }
+    });
+}
+
 static PyObject* LDA_copy(TopicModelObject* self)
 {
     return py::handleExc([&]()
@@ -782,6 +812,7 @@ static PyMethodDef LDA_methods[] =
     { "copy", (PyCFunction)LDA_copy, METH_NOARGS, LDA_copy__doc__},
     { "_update_vocab", (PyCFunction)LDA_update_vocab, METH_VARARGS | METH_KEYWORDS, ""},
     { "summary", (PyCFunction)LDA_summary, METH_VARARGS | METH_KEYWORDS, LDA_summary__doc__},
+    { "get_word_forms", (PyCFunction)LDA_get_word_forms, METH_VARARGS | METH_KEYWORDS, LDA_get_word_forms__doc__},
     { nullptr }
 };
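LDA_get_word_forms backs the get_word_forms() method registered above. Read from the binding code: with no argument it returns one frequency-sorted list of (surface_form, count) pairs per used vocab item, and an optional idx (not listed in the docstring's get_word_forms(self) signature) selects a single entry, raising ValueError when out of range. A hedged sketch, assuming the trained model mdl from the earlier example; the surface forms only differ from the vocab entries when documents carry raw text (e.g. added via tomotopy.utils.Corpus), otherwise each list collapses to a single (word, count) pair:

forms = mdl.get_word_forms()                    # one entry per item of mdl.used_vocabs
for vocab_word, form_counts in list(zip(mdl.used_vocabs, forms))[:5]:
    print(vocab_word, form_counts)              # e.g. apple [('Apple', 12), ('apples', 7)]

print(mdl.get_word_forms(3))                    # single entry, as accepted by the C++ binding
# mdl.get_word_forms(len(mdl.used_vocabs))      # would raise ValueError (idx out of range)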