-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsporklib.py
executable file
·286 lines (266 loc) · 11.6 KB
/
sporklib.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Spork
-----
A "little language" designed to facilitate the extraction and processing of
data held in XML or HTML files.
:copyright: © 2015-2022 Oracle Corporation
:copyright: © 2022 Sioned Arrowsmith
:author: S Arrowsmith <[email protected]>
:licence: MIT, see LICENSE for more details
"""
from __future__ import print_function
import sys
import os
import json
if sys.version_info[0] > 2:
from io import StringIO
else:
from StringIO import StringIO
from string import Template
from collections import defaultdict
try:
from importlib import import_module
except ImportError:
import_module = __import__
try:
from argparse import ArgumentParser
except ImportError:
ArgumentParser = object
import tinycss
import cssselect
from lxml import etree
class AttributeMapping(dict):
def __init__(self, element):
self.element = element
self.parent = None
def __missing__(self, key):
if self.element is None:
return "''"
value = None
if key.startswith("_"):
if key == '_TEXT': value = " ".join(self.element.itertext())
elif key == '_TAG': value = self.element.tag
elif key == '_XML': value = etree.tostring(self.element)
else:
value = self.element.get(key, None)
if value is None:
if self.parent is None:
self.parent = AttributeMapping(self.element.getparent())
value = self.parent[key]
else:
value = repr(value)
self[key] = value
return value
class Spork:
"""A Spork object represents one spork script"""
class FlowControl(Exception):
pass
class DEFAULT(FlowControl, StopIteration):
pass
class PROGRAM(FlowControl):
pass
class ELEMENT(FlowControl):
pass
class SELECTOR(FlowControl):
pass
def __init__(self, source, warn=False, debug=False, formats=["XML", "HTML"]):
"""Initialise this object from the script given by the file-like object "source"."""
if not source.readline().startswith("#!"):
source.seek(0)
self.warn = warn
self.debug = debug
self.translator = cssselect.GenericTranslator()
self.namespace = globals().copy()
self.namespace["self"] = self
self.xmlns = dict()
self.parse(source)
self.formats = formats
self.doccument = None
def parse(self, source):
self.rules = list()
parsed = tinycss.make_parser().parse_stylesheet_file(source)
if self.warn:
if parsed.errors:
for error in parsed.errors:
print(error, file=sys.stderr)
return
for rule in parsed.rules:
if rule.at_keyword:
if rule.at_keyword != '@import':
if self.warn:
print("Unknown at-rule: %s" % rule.at_keyword, file=sys.stderr)
return
continue
self.namespace[rule.uri] = import_module(rule.uri)
else:
css = rule.selector.as_css()
if css == '_':
xpath = None
else:
xpath = self.translator.css_to_xpath(css)
self.rules.append((xpath, rule.declarations))
def eval(self, name, value, namespace, attribs):
result = None
copy = False
if value in namespace:
return namespace[value][:], True
value = value.replace("[]", "[-1]")
if name != "_":
if name in namespace and "#" in value:
value = "[ (%s) for __%s__ in %s ]" % (value.replace("#", "__%s__"%name), name, name)
copy = True
elif value.endswith("[:]"):
copy = True
if attribs.element is not None:
value = Template(value).safe_substitute(attribs)
if self.debug:
print("++ %s <- %s" % (name, value))
return eval(value, self.namespace, namespace), copy
def process(self, element, declarations, namespace):
namespace['_'] = element
attribs = AttributeMapping(element)
if self.debug:
print("+expressions")
for declaration in declarations:
name = declaration.name
if element is not None and name == "_":
name = element.tag
if name.startswith("{"):
name = name[name.find("}")+1:]
name = name.replace("-", "_")
value = declaration.value.as_css().strip(";").strip()
result, copy = self.eval(name, value, namespace, attribs)
if name != "_":
if copy:
namespace[name] = result
else:
namespace.setdefault(name, []).append(result)
def run_element(self, document):
namespace = {"[]": [], "__": document}
for xpath, declarations in self.rules:
try:
if xpath is None:
self.process(None, declarations, namespace)
else:
if self.debug:
print("+xpath\n++", xpath)
for element in document.xpath(xpath, namespaces=self.xmlns):
self.process(element, declarations, namespace)
except Spork.SELECTOR:
pass
if self.debug:
print("+namespace")
for n in sorted(namespace):
if not n.startswith("__"):
print("++ %s\t=> %s" % (n, namespace[n]))
if "_" in namespace: del namespace["_"]
del namespace["[]"]; del namespace["__"]
return namespace
def run(self, document=None):
"""Run this script over the given document root element
If document is None, use the last document returned by get_root().
Returns the final state of the spork variables."""
if document is None:
document = self.document
try:
return self.run_element(document)
except Spork.FlowControl:
pass
return {}
def selector(self, selector, document=None):
"""Generator to run this script over all elements in the given document identified by selector.
If document is None, use the last document returned by get_root().
Yields tuples of (selected_element, spork_variables)."""
if document is None:
document = self.document
xpath = self.translator.css_to_xpath(selector)
for element in document.xpath(xpath):
try:
yield ((element, self.run_element(element)))
except Spork.ELEMENT:
pass
except Spork.PROGRAM:
break
def select(self, selector, document=None):
"""Convenience function to return result of selector() as a list"""
return list(self.selector(selector, document))
def get_root(self, document, parserargs={}):
"""Return the root element of the given document, and remember it for subsequent calls to select()."""
for name in self.formats:
parser = getattr(etree, name+"Parser")(**parserargs)
try:
root = etree.parse(document, parser).getroot()
if root is None:
continue
if name == "HTML":
self.translator = cssselect.HTMLTranslator()
self.xmlns = dict()
else:
self.xmlns = root.nsmap
self.document = root
return root
except etree.ParseError as e:
document.seek(0)
pass
# *Theoretically* we can get here with no exception if a document parses but getroot() returns None
raise
def exit(self, how=DEFAULT):
if not issubclass(how, Spork.FlowControl):
return
raise how()
def parse_args(args):
"""Parse arguments passed when sporklib is used as a script"""
parser = ArgumentParser(prog="spork", description="Spork is a mark-up scanning and processing language")
source = parser.add_mutually_exclusive_group(required=True)
source.add_argument("-f", "--file", metavar="progfile", dest="source", type=file, help="filename of Spork program")
source.add_argument("-e", "--source", metavar="program-text", dest="source", type=StringIO, help="text of Spork program")
parser.add_argument("-w", "--warn", action='store_true', help="warn of program parsing errors")
parser.add_argument("file", nargs='?', type=file, default=sys.stdin, help="document to be processed (default stdin)")
parser.add_argument("-X", "--xml", action='store_true', help="attempt XML parsing of document")
parser.add_argument("-H", "--html", action='store_true', help="attempt HTML parsing of document")
parser.add_argument("-s", "--select", metavar="tag", help="process selected elements from document")
parser.add_argument("-p", "--print", action='store_true', help="print result(s)")
parser.add_argument("-d", "--debug", action='store_true', help="display debugging information")
parseropts = parser.add_argument_group(title="Document parser options", description="These options only apply if -X/--xml or -H/--html are given.")
parseropts.add_argument("--attribute-defaults", action='store_true', help="read the DTD (if referenced by the document) and add the default attributes from it")
parseropts.add_argument("--dtd-validation", action='store_true', help="validate while parsing (if a DTD was referenced)")
parseropts.add_argument("--allow-network", action='store_true', help="allow network access when looking up external documents")
parseropts.add_argument("--recover", action='store_true', help="try hard to parse through broken XML")
parseropts.add_argument("--remove-blank-text", action='store_true', help="discard blank text nodes (ignorable whitespace) between tags")
parseropts.add_argument("--leave-cdata", action='store_true', help="don't replace CDATA sections by normal text content")
parseropts.add_argument("--leave-entities", action='store_true', help="don't replace entities by their text value")
parseropts.add_argument("--forget-ids", action='store_true', help="don't collect XML IDs (can speed up parsing if IDs not used)")
parseropts.add_argument("--huge-tree", action='store_true', help="support deep trees and very long text content. WARNING: disables security restrictions")
return parser.parse_args(args)
def get_parserargs(args):
"""Convert command line arguments to lxml parser arguments"""
parserlist = list()
kwargs = dict()
va = vars(args)
for arg in ('attribute_defaults', 'dtd_validation', 'recover', 'remove_blank_text', 'huge_tree'):
if va[arg]:
kwargs[arg] = True
for (arg, kwarg) in (('allow_network', 'no_network'), ('leave_cdata', 'strip_cdata'), ('leave_entities', 'resolve_entities'), ('forget_ids', 'collect_ids')):
if va[arg]:
kwargs[kwarg] = False
return kwargs
def main(args=sys.argv[1:]):
"""Entry point for using sporklib as a script"""
args = parse_args(args)
prog = Spork(args.source, args.warn, args.debug)
if args.xml:
prog.formats = ["XML"]
if args.html:
prog.formats = ["HTML"]
root = prog.get_root(args.file, get_parserargs(args))
if args.select:
results = [ r[1] for r in prog.selector(args.select, root) ]
else:
results = [ prog.run(root) ]
if args.print:
json.dump(results, sys.stdout, indent=2)
print()
if __name__ == '__main__':
main()