Skip to content

Commit

Permalink
Change minCnt of KnLM from size_t type to vector<size_t>
Browse files Browse the repository at this point in the history
  • Loading branch information
bab2min committed Oct 12, 2024
1 parent 01a22a8 commit 7b2f0f6
Show file tree
Hide file tree
Showing 3 changed files with 51 additions and 8 deletions.
3 changes: 1 addition & 2 deletions include/kiwi/Kiwi.h
Original file line number Diff line number Diff line change
Expand Up @@ -614,8 +614,7 @@ namespace kiwi
std::vector<std::string> corpora;
size_t minMorphCnt = 10;
size_t lmOrder = 4;
size_t lmMinCnt = 1;
size_t lmLastOrderMinCnt = 2;
std::vector<size_t> lmMinCnts = { 1 };
size_t numWorkers = 1;
size_t sbgSize = 1000000;
bool useLmTagHistory = true;
Expand Down
15 changes: 12 additions & 3 deletions src/KiwiBuilder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -872,7 +872,8 @@ KiwiBuilder::KiwiBuilder(const ModelBuildArgs& args)
pool.~ThreadPool();
new (&pool) utils::ThreadPool{ args.numWorkers };
}
auto cntNodes = utils::count(sents.begin(), sents.end(), args.lmMinCnt, 1, args.lmOrder, (args.numWorkers > 1 ? &pool : nullptr), &bigramList, args.useLmTagHistory ? &historyTx : nullptr);
size_t lmMinCnt = *std::min(args.lmMinCnts.begin(), args.lmMinCnts.end());
auto cntNodes = utils::count(sents.begin(), sents.end(), lmMinCnt, 1, args.lmOrder, (args.numWorkers > 1 ? &pool : nullptr), &bigramList, args.useLmTagHistory ? &historyTx : nullptr);
// discount for bos node cnt
if (args.useLmTagHistory)
{
Expand All @@ -882,8 +883,16 @@ KiwiBuilder::KiwiBuilder(const ModelBuildArgs& args)
{
cntNodes.root().getNext(0)->val /= 2;
}
std::vector<size_t> minCnts(args.lmOrder, args.lmMinCnt);
minCnts.back() = args.lmLastOrderMinCnt;
std::vector<size_t> minCnts;
if (args.lmMinCnts.size() == 1)
{
minCnts.clear();
minCnts.resize(args.lmOrder, args.lmMinCnts[0]);
}
else if (args.lmMinCnts.size() == args.lmOrder)
{
minCnts = args.lmMinCnts;
}
langMdl.knlm = lm::KnLangModelBase::create(lm::KnLangModelBase::build(
cntNodes,
args.lmOrder, minCnts,
Expand Down
41 changes: 38 additions & 3 deletions tools/model_builder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,27 @@
using namespace std;
using namespace kiwi;

/**
 * Split a delimiter-separated list of non-negative integers, e.g. "1,2,3".
 *
 * Each token is parsed independently and never read past its own boundary:
 * leading whitespace and an optional sign are skipped, then as many decimal
 * digits as possible are consumed (trailing garbage inside a token is ignored).
 *
 * @param s     text to split
 * @param delim character that separates the tokens (',' by default)
 * @return one value per token, in input order.
 *         - an empty `s` yields an empty vector, so that callers can detect
 *           "nothing given" and apply their own default;
 *         - a token without digits (including an empty token as in "1,,2")
 *           yields 0;
 *         - a negative token yields 0 instead of wrapping around to a huge
 *           unsigned value;
 *         - a token too large for size_t saturates at the maximum size_t.
 */
std::vector<std::size_t> splitMultipleInts(const std::string& s, const char delim = ',')
{
    std::vector<std::size_t> ret;
    if (s.empty()) return ret;

    const std::size_t maxValue = ~static_cast<std::size_t>(0);
    std::size_t begin = 0;
    while (true)
    {
        std::size_t end = s.find(delim, begin);
        const bool isLast = (end == std::string::npos);
        if (isLast) end = s.size();

        // skip leading whitespace of the token
        std::size_t i = begin;
        while (i < end && (s[i] == ' ' || (s[i] >= '\t' && s[i] <= '\r'))) ++i;

        // optional sign; counts cannot be negative, so '-' clamps the result to 0
        bool negative = false;
        if (i < end && (s[i] == '+' || s[i] == '-'))
        {
            negative = (s[i] == '-');
            ++i;
        }

        std::size_t value = 0;
        for (; i < end && s[i] >= '0' && s[i] <= '9'; ++i)
        {
            const std::size_t digit = static_cast<std::size_t>(s[i] - '0');
            // saturate rather than overflow
            if (value > (maxValue - digit) / 10) value = maxValue;
            else value = value * 10 + digit;
        }
        ret.emplace_back(negative ? 0 : value);

        if (isLast) return ret;
        begin = end + 1;
    }
}

int run(const KiwiBuilder::ModelBuildArgs& args, const string& output, bool skipBigram)
{
try
Expand Down Expand Up @@ -49,7 +70,7 @@ int main(int argc, const char* argv[])
ValueArg<size_t> workers{ "w", "workers", "number of workers", false, 1, "int" };
ValueArg<size_t> morMinCnt{ "", "morpheme_min_cnt", "min count of morpheme", false, 10, "int" };
ValueArg<size_t> lmOrder{ "", "order", "order of LM", false, 4, "int" };
ValueArg<size_t> lmMinCnt{ "", "min_cnt", "min count of LM", false, 1, "int" };
ValueArg<string> lmMinCnt{ "", "min_cnt", "min count of LM", false, "1", "multiple ints with comma"};
ValueArg<size_t> lmLastOrderMinCnt{ "", "last_min_cnt", "min count of the last order of LM", false, 2, "int" };
ValueArg<string> output{ "o", "output", "output model path", true, "", "string" };
ValueArg<size_t> sbgSize{ "", "sbg_size", "sbg size", false, 1000000, "int" };
Expand Down Expand Up @@ -86,10 +107,24 @@ int main(int argc, const char* argv[])
args.useLmTagHistory = tagHistory;
args.minMorphCnt = morMinCnt;
args.lmOrder = lmOrder;
args.lmMinCnt = lmMinCnt;
args.lmLastOrderMinCnt = lmLastOrderMinCnt;
args.numWorkers = workers;
args.sbgSize = sbgSize;

auto v = splitMultipleInts(lmMinCnt.getValue());

if (v.empty())
{
args.lmMinCnts.resize(1, 1);
}
else if (v.size() == 1 || v.size() == lmOrder)
{
args.lmMinCnts = v;
}
else
{
cerr << "error: min_cnt size should be 1 or equal to order" << endl;
return -1;
}
return run(args, output, skipBigram);
}

0 comments on commit 7b2f0f6

Please sign in to comment.