-
Notifications
You must be signed in to change notification settings - Fork 0
/
pyproject.toml
78 lines (72 loc) · 2.19 KB
/
pyproject.toml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
# Core distribution metadata for the dom-tokenizers package.
[project]
name = "dom-tokenizers"
version = "0.0.18"
description = "DOM-aware tokenization for 🤗 Hugging Face language models"
readme = "README.md"
# NOTE(review): the email value below reads like a redaction placeholder
# rather than a deliverable address -- confirm it before publishing.
authors = [{ name = "Gary Benson", email = "[email protected]" }]
# Floor is 3.10; the original annotation gives match..case as the reason.
requires-python = ">=3.10"
# Trove classifiers; the listed Python versions are 3.10 through 3.12.
classifiers = [
    "Development Status :: 4 - Beta",
    "Intended Audience :: Developers",
    "Intended Audience :: Education",
    "Intended Audience :: Science/Research",
    "License :: OSI Approved :: Apache Software License",
    "Operating System :: OS Independent",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Topic :: Internet :: WWW/HTTP",
    "Topic :: Scientific/Engineering :: Artificial Intelligence",
    "Topic :: Scientific/Engineering :: Information Analysis",
    "Topic :: Text Processing :: Markup :: HTML",
]
# Runtime requirements. Entries tagged "XXX review" were flagged by the
# author for a second look.
dependencies = [
    "python-magic",  # XXX review
    "tokenizers",
    "unidecode",  # XXX review
    "vec64>0.0.5",
]
# Project links; both labels currently point at the same GitHub repository.
[project.urls]
Homepage = "https://github.com/gbenson/dom-tokenizers"
Source = "https://github.com/gbenson/dom-tokenizers"
# Optional extras, requested as dom-tokenizers[train] or dom-tokenizers[dev].
[project.optional-dependencies]
# Packages needed only when training a tokenizer.
train = [
    "datasets",
    "pillow",
    "transformers",
]
# Development environment: packaging, linting and testing tools, plus
# every package that the "train" extra lists.
dev = [
    "build",
    "datasets",
    "flake8",
    "flake8-custom-import-rules",
    "flake8-quotes",
    "pillow",
    "pytest",
    "pytest-cov",
    "transformers",
]
# Console commands created on install; each value is a "module:function"
# entry point.
[project.scripts]
train-tokenizer = "dom_tokenizers.train:main"
profile-tokenizer = "dom_tokenizers.scripts.profile:main"
dump-tokenizations = "dom_tokenizers.scripts.dump:main"
dump-breaking-inputs = "dom_tokenizers.scripts.dump_breaking_inputs:main"
# Two command names bound to the same entry point.
diff-tokenizer = "dom_tokenizers.scripts.diff:main"
tokenizer-diff = "dom_tokenizers.scripts.diff:main"
# Build backend declaration. setuptools 61.0 is the first release that
# reads [project] metadata from pyproject.toml, hence the lower bound.
[build-system]
build-backend = "setuptools.build_meta"
requires = ["setuptools>=61.0"]
# pytest configuration.
[tool.pytest.ini_options]
# Every warning raised during a test run is promoted to an error, except
# FutureWarnings whose message begins with "`resume_download` is
# deprecated". When several filters match, the last one listed wins, so
# the exemption must stay after "error".
filterwarnings = [
    "error",
    "ignore:`resume_download` is deprecated:FutureWarning",
]
# Always measure coverage of the dom_tokenizers package; the --cov option
# comes from the pytest-cov plugin.
addopts = "--cov=dom_tokenizers"
# coverage.py measurement settings: files matching these patterns are
# excluded from coverage measurement.
[tool.coverage.run]
omit = [
    # Anything installed inside a local virtual environment.
    "*/.venv/*",
    # Command-line scripts. The console entry points in [project.scripts]
    # reference dom_tokenizers.scripts.dump and dom_tokenizers.scripts.diff,
    # so the scripts/ locations are listed here. The older top-level paths
    # are kept so nothing that was excluded before becomes measured.
    # NOTE(review): drop the top-level entries once confirmed that no
    # dump.py / diff.py remain directly under src/dom_tokenizers/.
    "src/dom_tokenizers/dump.py",
    "src/dom_tokenizers/diff.py",
    "src/dom_tokenizers/scripts/dump.py",
    "src/dom_tokenizers/scripts/diff.py",
    # Excluded by the original configuration; reason not stated there.
    "src/dom_tokenizers/pre_tokenizers/compat_itertools.py",
]