From 074b3bb1014af928a1a012b5f43628f52e001dd4 Mon Sep 17 00:00:00 2001 From: Trondtr Date: Fri, 15 Nov 2024 11:41:40 +0000 Subject: [PATCH] deploy: 871de5bf2afc58e04703b9e49660207faec21a19 --- index-header.md | 2 +- index.md | 2 +- lemmacount.json | 2 +- test-diary.md | 34 ++++++++++++++++++++++++++++++++++ 4 files changed, 37 insertions(+), 3 deletions(-) create mode 100644 test-diary.md diff --git a/index-header.md b/index-header.md index bf38b4a..d671b99 100644 --- a/index-header.md +++ b/index-header.md @@ -10,7 +10,7 @@ This page documents the work on the **Tokelauan language model**. ## Project documentation -* Add links to project specific documentation here as needed. Keep the documentation in the `docs/` directory. +* [Test diary](test-diary.md) ## In-source documentation diff --git a/index.md b/index.md index 0e0673f..fb5e400 100644 --- a/index.md +++ b/index.md @@ -10,7 +10,7 @@ This page documents the work on the **Tokelauan language model**. ## Project documentation -* Add links to project specific documentation here as needed. Keep the documentation in the `docs/` directory. 
+* [Test diary](test-diary.md) ## In-source documentation diff --git a/lemmacount.json b/lemmacount.json index e7db5b9..9aaf6d2 100644 --- a/lemmacount.json +++ b/lemmacount.json @@ -1 +1 @@ -{ "schemaVersion": 1, "label": "Lemmas", "message": "4.7 K", "color": "red" } +{ "schemaVersion": 1, "label": "Lemmas", "message": "5.4 K", "color": "red" } diff --git a/test-diary.md b/test-diary.md new file mode 100644 index 0000000..6c8b5ed --- /dev/null +++ b/test-diary.md @@ -0,0 +1,34 @@ +Test diary +========== + + +## Lexical coverage + +Number of unknown words (standing in `lang-tkl`): + +``` +cat misc/nt_tkl.txt |hfst-tokenise -mgW tools/tokenisers/tokeniser-disamb-gt-desc.pmhfst |grep ' ?'|wc -l +``` + +Number of words: + + +``` +cat misc/nt_tkl.txt |hfst-tokenise -m tools/tokenisers/tokeniser-disamb-gt-desc.pmhfst |wc -l +``` + +### Lexical coverage nt + + +Coverage: + +``` +241115: 1-(34193/351080) = 0.9026 + +``` + + + + + +