# -*- coding: UTF-8 -*-
from __future__ import print_function
import os
from kaznlp.normalization.ininorm import Normalizer
from kaznlp.tokenization.tokrex import TokenizeRex
from kaznlp.tokenization.tokhmm import TokenizerHMM
from kaznlp.lid.lidnb import LidNB
from kaznlp.morphology.analyzers import AnalyzerDD
from kaznlp.morphology.taggers import TaggerHMM
# =======================
# INITIAL NORMALIZATION =
# =======================
print()
# basic example:
# by default normalize() performs cleaning and script fixing,
# and returns the normalized text along with normalization stats;
# the sample below deliberately mixes Latin/Cyrillic homoglyphs,
# repeated and spaced-out letters, and emoji
txt = 'Қайыpлы таӊ! Əнші бaлааапaн ☺️☺️☺️ L O V E 🇰🇿'
ininormer = Normalizer()
print(ininormer.normalize(txt))
# desegment and deduplicate on top of that
print(ininormer.normalize(txt, desegment=2, dedupe=2))
# transliterate and resolve emojis
print(ininormer.normalize(txt, translit=True, emojiresolve=True))
# do everything without returning stats
print(ininormer.normalize(txt, translit=True, desegment=2,
                          dedupe=2, emojiresolve=True, stats=False))
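# If, as noted above, normalize() returns the normalized text together
# with the stats, the pair can be unpacked and used separately
# (a sketch; the exact return shape is an assumption):
norm_txt, norm_stats = ininormer.normalize(txt)
print(f'text:  {norm_txt}')
print(f'stats: {norm_stats}')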
# ==============
# TOKENIZATION =
# ==============
print()
txt = u'Көш жүре түзеледі. Ақсақ қой түстен кейін маңырайды.'
# rule-based (regular expression) tokenizer
tokrex = TokenizeRex()
sents_toks = tokrex.tokenize(txt)
print(sents_toks)
# HMM-based tokenizer, loaded from a pre-trained model file
mdl = os.path.join('kaznlp', 'tokenization', 'tokhmm.mdl')
tokhmm = TokenizerHMM(model=mdl)
sents_toks = tokhmm.tokenize(txt)
print(sents_toks)
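# Judging by the indexing used later in this tutorial
# (tokenize(...)[0] is a list of token strings), both tokenizers
# appear to return one token list per sentence; if so, the result
# can be walked like this (a sketch under that assumption):
for n, sent in enumerate(sents_toks, 1):
    print(f'sentence {n}: {" | ".join(sent)}')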
# =========================
# LANGUAGE IDENTIFICATION =
# =========================
txt_kaz = u'Еңбек етсең ерінбей, тояды қарның тіленбей.'
txt_rus = u'Нет, нет, нет, нет! Мы хотим сегодня! Мы хотим сейчас!'
# naive Bayes language detector with a character-level model
landetector = LidNB(char_mdl=os.path.join('kaznlp', 'lid', 'char.mdl'))
print()
# predict() returns the single most likely language for a token list
doclan = landetector.predict(tokrex.tokenize(txt_kaz, lower=True)[0])
print(f'Document "{txt_kaz}" is written in {doclan}.')
print()
# predict_wp() returns per-language probabilities instead
doclan = landetector.predict_wp(tokrex.tokenize(txt_rus, lower=True)[0])
print(f'Document "{txt_rus}" has the following language probabilities: {doclan}.')
print()
print(f'Input document is mixed:\n"{txt_kaz} {txt_rus}".')
print('\nPer-word language detection:')
# tokenize the concatenation with a separating space (to match the
# document printed above) and flatten the resulting sentences, so
# that the Russian part is covered as well
mixed_toks = [w for s in tokrex.tokenize(txt_kaz + ' ' + txt_rus) for w in s]
for i, wrd in enumerate(mixed_toks):
    wrdlan = landetector.predict(wrd.lower())
    print(f'{str(i+1).rjust(2)}) {wrd.ljust(15)}{wrdlan}')
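# One possible way to turn the per-word predictions into a single
# document label is a majority vote; this aggregation is our own
# sketch, not part of kaznlp:
from collections import Counter
votes = Counter(landetector.predict(w.lower()) for w in mixed_toks)
print(f'\nMajority-vote language of the mixed document: {votes.most_common(1)[0][0]}')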
# ============
# MORPHOLOGY =
# ============
# create a morphological analyzer instance
analyzer = AnalyzerDD()
analyzer.load_model(os.path.join('kaznlp', 'morphology', 'mdl'))
# try analysis
print()
wrd = 'алмасын'
iscovered, alist = analyzer.analyze(wrd)
print(f'"{wrd}" is {"" if iscovered else "not "}covered by the analyzer.')
print('Analyses are:')
for i, a in enumerate(alist):
print(f'{str(i+1).rjust(2)}) {a}')
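# A quick coverage check over a few more words (the word list is our
# own, chosen arbitrarily), reusing analyze() exactly as above:
for w in ['алма', 'кітаптар', 'түзеледі']:
    covered, analyses = analyzer.analyze(w)
    print(f'{w}: covered={covered}, {len(analyses)} analyses')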
# create a morphological tagger instance
print()
tagger = TaggerHMM(lyzer=analyzer)
tagger.load_model(os.path.join('kaznlp', 'morphology', 'mdl'))
txt = u'Еңбек етсең ерінбей, тояды қарның тіленбей.'
# reuse the HMM tokenizer model path from the tokenization section
tokenizer = TokenizerHMM(model=mdl)
for sentence in tokenizer.tokenize(txt):
print(f'input sentence:\n{sentence}\n')
print('tagged sentence:')
    lower_sentence = [w.lower() for w in sentence]
for i, a in enumerate(tagger.tag_sentence(lower_sentence)):
print(f'{str(i+1).rjust(2)}) {sentence[i].ljust(15)}{a}')
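# Finally, the tokenize-lowercase-tag steps above can be wrapped into a
# small convenience function; pos_tag() is our own helper, composed
# only of the kaznlp calls demonstrated in this tutorial:
def pos_tag(text, tokenizer=tokenizer, tagger=tagger):
    tagged = []
    for sent in tokenizer.tokenize(text):
        analyses = tagger.tag_sentence([w.lower() for w in sent])
        tagged.append(list(zip(sent, analyses)))
    return tagged

print(f'\nfull pipeline output:\n{pos_tag(txt)}')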