ords_demo_tfidf-nltk.py
#!/usr/bin/env python3
import polars as pl
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk
nltk.download("punkt")
nltk.download("wordnet")
nltk.download("stopwords")
from funcs import *
class LemmaTokenizer:
    """Tokenize with NLTK, lemmatize each token, and drop any token that
    contains a non-letter character (digits, punctuation or underscores)."""

    def __init__(self):
        self.wnl = WordNetLemmatizer()
        self.rx = re.compile(r"[\W\d_]")

    def __call__(self, doc):
        return [
            self.wnl.lemmatize(t)
            for t in word_tokenize(doc)
            if not self.rx.search(t)
        ]
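# Rough illustration (not executed): LemmaTokenizer()("The brushes were
# clogged with 2 hairs") should yield something like
# ['The', 'brush', 'were', 'clogged', 'with', 'hair'] -- the digit token is
# dropped and plural nouns are lemmatized to their singular form.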
def log_vectors(title):
    # Uses the module-level `vectorizer` assigned in the __main__ block below.
    logger.debug(f"**** {title} ****")
    logger.debug("** STOP WORDS **")
    logger.debug(vectorizer.get_stop_words())
    logger.debug("** VOCABULARY **")
    logger.debug(vectorizer.vocabulary_)
    logger.debug("** FEATURE NAMES **")
    logger.debug(vectorizer.get_feature_names_out())


# Calculate similarity
# cosine_similarities = linear_kernel(
#     docvects[0:1], docvects).flatten()
# docscores = [item.item() for item in cosine_similarities[1:]]
# logger.debug("** SIMILARITY **")
# logger.debug(docscores)
# logger.debug("** BAG OF WORDS **")
# logger.debug(doc_vectors.toarray())
if __name__ == "__main__":
    logger = cfg.init_logger(__file__)

    # Problem text for USA records from the DustUp vacuums dataset.
    docs = list(
        pl.read_csv(f"{cfg.DATA_DIR}/quests/vacuums/dustup.csv")
        .filter(pl.col("country") == pl.lit("USA"))
        .select(
            pl.col("id_ords"),
            pl.col("fault_type"),
            pl.col("problem"),
        )["problem"]
    )

    # Pass 1: default TfidfVectorizer (built-in tokenization, no stop words).
    vectorizer = TfidfVectorizer()
    docvects = vectorizer.fit_transform(docs)
    log_vectors("#1 DEFAULT")

    # Pass 2: lemmatizing tokenizer plus NLTK English stop words. The stop
    # words are run through the same tokenizer so they match the lemmatized
    # vocabulary.
    stop_words = set(stopwords.words("english"))
    tokenizer = LemmaTokenizer()
    token_stop = tokenizer(" ".join(stop_words))
    vectorizer = TfidfVectorizer(stop_words=token_stop, tokenizer=tokenizer)
    docvects = vectorizer.fit_transform(docs)
    log_vectors("#2 WITH STOPWORDS")