-
-
Notifications
You must be signed in to change notification settings - Fork 268
/
pyproject.toml
116 lines (108 loc) · 3.5 KB
/
pyproject.toml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
# Build requirements and build backend used to produce sdists and wheels.
# https://pip.pypa.io/en/stable/reference/build-system/pyproject-toml/
[build-system]
requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"
# Core package metadata.
[project]
name = "trafilatura"
description = "Python & Command-line tool to gather text and metadata on the Web: Crawling, scraping, extraction, output as CSV, JSON, HTML, MD, TXT, XML."
readme = "README.md"
license = { text = "Apache 2.0" }
# The version is not hard-coded here; it is resolved via [tool.setuptools.dynamic].
dynamic = ["version"]
requires-python = ">=3.8"
# NOTE(review): the email value below looks like a redaction placeholder
# rather than a real address; confirm it before publishing.
authors = [
    { name = "Adrien Barbaresi", email = "[email protected]" },
]
keywords = [
    "corpus",
    "html2text",
    "news-crawler",
    "natural-language-processing",
    "scraper",
    "tei-xml",
    "text-extraction",
    "webscraping",
    "web-scraping",
]
classifiers = [
    # https://pypi.python.org/pypi?%3Aaction=list_classifiers
    "Development Status :: 5 - Production/Stable",
    # "Development Status :: 6 - Mature",
    "Environment :: Console",
    "Intended Audience :: Developers",
    "Intended Audience :: Education",
    "Intended Audience :: Information Technology",
    "Intended Audience :: Science/Research",
    "License :: OSI Approved :: Apache Software License",
    "Operating System :: MacOS",
    "Operating System :: Microsoft",
    "Operating System :: POSIX",
    "Programming Language :: Python",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.8",
    "Programming Language :: Python :: 3.9",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Programming Language :: Python :: 3.13",
    "Topic :: Internet :: WWW/HTTP",
    "Topic :: Scientific/Engineering :: Information Analysis",
    "Topic :: Security",
    "Topic :: Text Editors :: Text Processing",
    "Topic :: Text Processing :: Linguistic",
    "Topic :: Text Processing :: Markup :: HTML",
    "Topic :: Text Processing :: Markup :: Markdown",
    "Topic :: Text Processing :: Markup :: XML",
    "Topic :: Utilities",
]
dependencies = [
    "certifi",
    "charset_normalizer >= 3.4.0",
    "courlan >= 1.3.2",
    "htmldate >= 1.9.2",
    "justext >= 3.0.1",
    # The two lxml entries carry complementary environment markers: an exact
    # pin when platform_system is 'Darwin' and python_version <= '3.8', and a
    # minimum version everywhere else (see tests on GitHub Actions).
    "lxml == 4.9.2 ; platform_system == 'Darwin' and python_version <= '3.8'",
    "lxml >= 5.3.0 ; platform_system != 'Darwin' or python_version > '3.8'",
    "urllib3 >= 1.26, < 3",
]
# setuptools-specific configuration: only the `trafilatura` package is listed.
# https://setuptools.pypa.io/en/latest/userguide/pyproject_config.html
[tool.setuptools]
packages = ["trafilatura"]

# The package version is taken from the `trafilatura.__version__` attribute.
# https://packaging.python.org/en/latest/guides/single-sourcing-package-version/
[tool.setuptools.dynamic]
version = { attr = "trafilatura.__version__" }

# Non-Python files to include with the `trafilatura` package.
# https://setuptools.pypa.io/en/stable/userguide/datafiles.html
[tool.setuptools.package-data]
trafilatura = [
    "data/tei_corpus.dtd",
    "settings.cfg",
]
# Command-line entry point: the `trafilatura` command maps to `main` in `trafilatura.cli`.
[project.scripts]
trafilatura = "trafilatura.cli:main"
# Project links published with the package metadata.
[project.urls]
Homepage = "https://trafilatura.readthedocs.io"
Source = "https://github.com/adbar/trafilatura"
Blog = "https://adrien.barbaresi.eu/blog/tag/trafilatura.html"
Tracker = "https://github.com/adbar/trafilatura/issues"
# Optional dependency groups ("extras").
[project.optional-dependencies]
# Development extras: linting, type checking and testing tools.
dev = [
    "flake8",
    "mypy",
    "pytest",
    "pytest-cov",
    "types-lxml",
    "types-urllib3",
]
# Full set of optional runtime packages.
all = [
    "brotli",
    # cchardet is limited to Python < 3.11 (build issue); faust-cchardet is
    # used from Python 3.11 onwards.
    "cchardet >= 2.1.7; python_version < '3.11'",
    "faust-cchardet >= 2.1.19; python_version >= '3.11'",
    "htmldate[speed] >= 1.9.2",
    "py3langid >= 0.3.0",
    "pycurl >= 7.45.3",
    "urllib3[socks]",
    "zstandard >= 0.23.0",
]
# pytest configuration: collect tests from files matching this glob under tests/.
[tool.pytest.ini_options]
testpaths = ["tests/*test*.py"]