diff --git a/tests/cli/test_cli.py b/tests/cli/test_cli.py
index cfa24fb8..ace6451c 100644
--- a/tests/cli/test_cli.py
+++ b/tests/cli/test_cli.py
@@ -97,7 +97,7 @@ def test_text_input():
         # ("EOD", "EOD\nEOD\n"),  # TODO
         ("おはよう", "おはよう\nEOD\n"),
         ("おはよう.", "おはよう.\nEOD\n"),
-        ("おはよう #今日も一日", "おはよう␣#今日も一日\nEOD\n"),
+        ("おはよう #今日も一日", "おはよう #今日も一日\nEOD\n"),
         ("おはよう。\nこんにちは。\nこんばんわ。\n", "おはよう。こんにちは。こんばんわ。\nEOD\n"),
         ("おはよう。EOD", "おはよう。EOD\nEOD\n"),
     ],
diff --git a/tests/data/datasets/typo_files/0.jsonl b/tests/data/datasets/typo_files/0.jsonl
index 569ca2f5..28a6a72c 100644
--- a/tests/data/datasets/typo_files/0.jsonl
+++ b/tests/data/datasets/typo_files/0.jsonl
@@ -1,2 +1,2 @@
-{"pre_text": "待つの木が枯れる", "post_text": "松の木が枯れる", "kdrs": ["R:松", "D", "K", "K", "K", "K", "K", "K", "K"], "inss": ["_", "_", "_", "_", "_", "_", "_", "_", "_"]}
-{"pre_text": "紹介ことなかった", "post_text": "紹介することがなかった", "kdrs": ["K", "K", "K", "K", "K", "K", "K", "K", "K"], "inss": ["_", "_", "I:する", "_", "I:が", "_", "_", "_", "_"]}
+{"pre_text": "待つの木が枯れる", "post_text": "松の木が枯れる", "kdr_tags": ["R:松", "D", "K", "K", "K", "K", "K", "K", "K"], "ins_tags": ["_", "_", "_", "_", "_", "_", "_", "_", "_"]}
+{"pre_text": "紹介ことなかった", "post_text": "紹介することがなかった", "kdr_tags": ["K", "K", "K", "K", "K", "K", "K", "K", "K"], "ins_tags": ["_", "_", "I:する", "_", "I:が", "_", "_", "_", "_"]}
diff --git a/tests/datamodule/datasets/test_word_dataset.py b/tests/datamodule/datasets/test_word_dataset.py
index fac5b160..defe2f9f 100644
--- a/tests/datamodule/datasets/test_word_dataset.py
+++ b/tests/datamodule/datasets/test_word_dataset.py
@@ -78,7 +78,6 @@ def test_encode(data_dir: Path, word_tokenizer: PreTrainedTokenizerBase, dataset
     max_seq_length = 64
     document_split_stride = 1
     dataset = WordDataset(str(path), word_tokenizer, max_seq_length, document_split_stride, **dataset_kwargs)
-    assert dataset.tokenizer_input_format == "text"
     dataset.examples[1].load_discourse_document(Document.from_knp(path.joinpath("1.knp").read_text()))
 
     num_examples = len(dataset)
@@ -368,7 +367,6 @@ def test_split_into_words_encode(
     dataset = WordDataset(
         str(path), split_into_words_word_tokenizer, max_seq_length, document_split_stride, **dataset_kwargs
     )
-    assert dataset.tokenizer_input_format == "words"
     dataset.examples[1].load_discourse_document(Document.from_knp(path.joinpath("1.knp").read_text()))
 
     num_examples = len(dataset)
diff --git a/tests/utils/test_reading_prediction.py b/tests/utils/test_reading_prediction.py
index c6eadd63..be30214a 100644
--- a/tests/utils/test_reading_prediction.py
+++ b/tests/utils/test_reading_prediction.py
@@ -33,7 +33,7 @@
             [False, False, False, False, True],
             [False, False, False, False, False],
         ],
-        ["ふせい", "␣", "な", "にゅうりょく"],
+        ["ふせい", "_", "な", "にゅうりょく"],
     ),
     (
         ["[UNK]", "ふせい", "[ID]", "[ID]", "にゅうりょく"],
diff --git a/tests/utils/test_word_normalization.py b/tests/utils/test_word_normalization.py
index 8ce07a46..0a7e5555 100644
--- a/tests/utils/test_word_normalization.py
+++ b/tests/utils/test_word_normalization.py
@@ -4,7 +4,7 @@
 from kwja.utils.word_normalization import (
     MorphemeDenormalizer,
     MorphemeNormalizer,
-    get_normalized,
+    get_normalized_surf,
     get_word_norm_op_tags,
 )
 
@@ -73,7 +73,7 @@
 
 @pytest.mark.parametrize(("surf", "ops", "expected"), wellformed_list)
 def test_gen_normalized_surf(surf, ops, expected):
-    assert get_normalized(surf, ops, strict=True) == expected
+    assert get_normalized_surf(surf, ops, strict=True) == expected
 
 
 @pytest.mark.parametrize(("surf", "expected", "normalized"), wellformed_list)
@@ -93,12 +93,12 @@ def test_get_normalization_opns(surf, expected, normalized):
 @pytest.mark.parametrize(("surf", "ops", "expected"), malformed_list)
 def test_gen_normalized_surf_malformed(surf, ops, expected):
     with pytest.raises(ValueError):
-        get_normalized(surf, ops, strict=True)
+        get_normalized_surf(surf, ops, strict=True)
 
 
 @pytest.mark.parametrize(("surf", "ops", "expected"), malformed_list)
 def test_gen_normalized_surf_malformed_loose(surf, ops, expected):
-    assert get_normalized(surf, ops, strict=False) == expected
+    assert get_normalized_surf(surf, ops, strict=False) == expected
 
 
 def test_morpheme_normalizer():
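Note on the rename in tests/utils/test_word_normalization.py: get_normalized is now get_normalized_surf, with the same (surf, ops, strict) signature. Per the updated tests, strict=True raises ValueError on malformed op sequences, while strict=False returns a best-effort result for the same inputs. A minimal sketch of that contract follows; the surf and ops values are hypothetical placeholders, and only the call shape and the strict semantics are taken from this diff:

    from kwja.utils.word_normalization import get_normalized_surf

    surf = "おはよう"        # hypothetical input surface
    ops = ["K"] * len(surf)  # hypothetical op tags, one per character

    try:
        # Well-formed op sequences return the normalized surface; malformed
        # ones raise ValueError when strict=True
        # (see test_gen_normalized_surf_malformed above).
        print(get_normalized_surf(surf, ops, strict=True))
    except ValueError:
        # With strict=False the same inputs yield a best-effort surface
        # instead of raising (see test_gen_normalized_surf_malformed_loose).
        print(get_normalized_surf(surf, ops, strict=False))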