-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathords_regex_category_compiler.py
41 lines (33 loc) · 1.3 KB
/
ords_regex_category_compiler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
#!/usr/bin/env python3
# Given lists of "terms" (actually mini-regexes), one list per category, build a whole regular expression string for each category and write them to a CSV file.
import polars as pl
import re
from funcs import *
if __name__ == "__main__":
    # Script entry point: build one regex string per category column, log how
    # each regex fares against that category's test products, then write the
    # category/regex table out as CSV.
    logger = cfg.init_logger(__file__)

    # Input tables. `cfg` and `textfuncs` arrive via the star import from `funcs`.
    test_terms = pl.read_csv(f"{cfg.DATA_DIR }/ords_testdata_common_products.csv")
    regex_elements = pl.read_csv(f"{cfg.DATA_DIR }/product_category_regex_elements.csv")

    # Every column of the elements table is treated as one product category.
    categories = regex_elements.columns

    category_regexes = []
    for category in categories:
        print(category)
        logger.debug(category)

        # Null cells are dropped before the column is handed to the builder.
        elements = regex_elements.select(pl.col(category)).drop_nulls()[category]
        logger.debug(elements)

        # NOTE(review): build_regex_string is defined outside this file; it is
        # presumed to combine the elements into one pattern string - confirm.
        pattern_text = textfuncs.build_regex_string(elements)
        category_regexes.append(pattern_text)

        # Compiling here also surfaces an invalid pattern before anything is written.
        pattern = re.compile(pattern_text)

        # Search each test product of this category; the outcome is only
        # logged at debug level, nothing is asserted or collected.
        category_tests = test_terms.filter(pl.col("product_category") == category)
        # The two-name unpacking relies on the test table having exactly two columns.
        for _category, product in category_tests.iter_rows():
            logger.debug(product)
            logger.debug(pattern.search(product))

    # One output row per category; the language column is the constant "any".
    results = pl.DataFrame(
        data={
            "product_category": categories,
            "lang": ["any"] * len(categories),
            "regex": category_regexes,
        },
        schema={"product_category": pl.String, "lang": pl.String, "regex": pl.String},
    )
    logger.debug(results)
    results.write_csv(f"{cfg.OUT_DIR}/product_category_regexes.csv")