-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Initial commit after moving from ankidict repo.
- Loading branch information
Showing
5 changed files
with
787 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
from . import html | ||
from .html import (Node, StrictNode, Text, ShallowText, | ||
Html, StrictHtml, ThisClass) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
#!/usr/bin/env python | ||
# -*- coding: utf-8 -*- | ||
|
||
import six | ||
|
||
from pagemodel.html import BaseNode, BaseLeaf, Base | ||
|
||
import bs4 | ||
|
||
|
||
class PageModelMetaClass(type):
    """Metaclass that validates page-model classes when they are created.

    Classes named ``PageModel`` or ``BasePageModel`` are built without any
    checks.  Every other class must define both ``model_class`` and
    ``page_tree`` directly in its own body (only the class namespace is
    inspected, so inherited values do not count).  ``page_tree`` must be a
    ``BaseNode`` and must pass its own ``validate()``.
    """

    def __new__(cls, name, bases, attrs):
        """Create the class, enforcing the declarations described above.

        Raises:
            TypeError: a required attribute is missing, or ``page_tree``
                is not a ``BaseNode`` instance.
        """
        make_class = super(PageModelMetaClass, cls).__new__

        # The framework's own base classes carry no tree to validate.
        if name in ('PageModel', 'BasePageModel'):
            return make_class(cls, name, bases, attrs)

        required = (
            ('model_class', "Subclasses of PageModel must declare "
                            "'model_class' attribute."),
            ('page_tree', "Subclasses of PageModel must declare "
                          "'page_tree' attribute."),
        )
        for key, message in required:
            if key not in attrs:
                raise TypeError(message)

        tree = attrs['page_tree']
        if not isinstance(tree, BaseNode):
            raise TypeError("Invalid type of 'page_tree' attribute.")
        tree.validate()

        new_class = make_class(cls, name, bases, attrs)
        # Give the tree's nodes a reference to the freshly created class.
        tree.fill_thisclass_attr(new_class)
        return new_class
|
||
|
||
class BaseBasePageModel(object):
    """Empty root class with no behaviour of its own.

    It is the plain base passed to ``six.with_metaclass`` when
    ``BasePageModel`` is declared.
    """
|
||
|
||
class BasePageModel(six.with_metaclass(PageModelMetaClass, BaseBasePageModel)):
    """Attach ``PageModelMetaClass`` in a Python 2/3 compatible way.

    The class adds nothing itself; its name is one of the two that the
    metaclass creates without validation.
    """
|
||
|
||
class PageModel(BasePageModel, BaseLeaf):
    """Base class for declarative page models.

    Subclasses declare ``model_class`` (called with the extracted fields as
    keyword arguments) and ``page_tree`` (the node tree that performs the
    extraction).  Calling a subclass with page text returns a
    ``model_class`` instance; calling it with no argument returns a
    leaf-like instance of the subclass itself.
    """

    @classmethod
    def extract_unboxed(cls, selector):
        """Run ``page_tree`` on ``selector`` and build a ``model_class``.

        Args:
            selector: object passed to ``cls.page_tree.extract``.

        Returns:
            ``cls.model_class(**fields)`` where ``fields`` is the extracted
            dict after ``postproc``.

        Raises:
            ValueError: re-raised with the class name prefixed to the
                original message.
        """
        try:
            res = cls.page_tree.extract(selector)
            # Honour the hook's return value: the default implementation
            # returns the dict, so a subclass returning a *new* dict must
            # not be ignored.  Hooks that mutate in place and return None
            # keep working exactly as before.
            processed = cls.postproc(res)
            if processed is not None:
                res = processed
            return cls.model_class(**res)
        except ValueError as err:
            raise ValueError(cls.__name__ + ": " + str(err))

    def extract(self, selector):
        """Extract the model and return it under this leaf's field label."""
        res = self.extract_unboxed(selector)
        return {self.fieldlabel: res}

    def __new__(cls, page_text=None):
        """Return a leaf instance, or the extracted model for ``page_text``.

        With ``page_text`` given, the result is whatever
        ``extract_unboxed`` returns rather than an instance of ``cls``.
        """
        if page_text is None:
            return super(PageModel, cls).__new__(cls)
        return cls.extract_unboxed(Selector(page_text))

    @classmethod
    def postproc(cls, dic):
        """Hook for adjusting the extracted field dict; returns it as is.

        Overrides may mutate ``dic`` in place or return a replacement dict.
        """
        return dic
|
||
|
||
try:
    _STRING_TYPES = (basestring,)  # Python 2: covers str and unicode
except NameError:
    # Python 3 has no ``basestring`` builtin; markup may be str or bytes.
    _STRING_TYPES = (str, bytes)


class Selector(object):
    """Thin wrapper around a BeautifulSoup object used by the page tree.

    Attributes:
        sel: the wrapped object; built with ``bs4.BeautifulSoup(arg,
            "html.parser")`` when a string is given, otherwise ``arg``
            itself.
    """

    def __init__(self, arg):
        """Wrap ``arg``, parsing it first if it is raw markup text."""
        if isinstance(arg, _STRING_TYPES):
            self.sel = bs4.BeautifulSoup(arg, "html.parser")
        else:
            self.sel = arg

    def css(self, *paths):
        """Return a list of nodes that satisfy any of provided
        css paths.

        Results are ordered path by path, in the order the paths were
        given; each match is wrapped in a new ``Selector``.
        """
        # Each path must be queried separately, one after another, because
        # according to BS4 a comma binds more tightly than a space in CSS
        # selectors.
        matches = []
        for path in paths:
            matches.extend(self.sel.select(path.strip()))
        return [Selector(match) for match in matches]

    def text(self):
        """Return all the text contained in a node as a string."""
        return self.sel.get_text()

    def textlist(self):
        """Return the wrapped node's ``strings`` as a list."""
        return list(self.sel.strings)

    def get_attr(self, attr_name):
        """Return the wrapped node's value for attribute ``attr_name``."""
        return self.sel[attr_name]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,267 @@ | ||
from collections import Counter | ||
|
||
|
||
# TODO: | ||
# - flatten_lists switch in PageModel subclasses - when lists are nested, they | ||
# are flattened | ||
# - If(Attr("class") == "myclass") (...) | ||
# - ParentNode node-like thing to access parent node space from child nodes | ||
# - Node.nth(num)("<css-selector>"), Node.first, Node.second, Node.third | ||
# - more accurate, informative error handling | ||
|
||
|
||
class Base(object):
    """Interface shared by every element of a page tree.

    All four methods are placeholders that raise ``NotImplementedError``;
    concrete node and leaf classes override them.
    """

    def extract(self, selector):
        """Always return a dict with (possibly partial) results."""
        raise NotImplementedError

    def set_fieldlabel(self, lab):
        """Assign the field label ``lab`` to this element."""
        raise NotImplementedError

    def get_fieldlabels(self):
        """Report the field labels used by this element."""
        raise NotImplementedError

    def fill_thisclass_attr(self, cls):
        """Receive the model class ``cls`` this element belongs to."""
        raise NotImplementedError
|
||
|
||
class BaseNode(Base):
    """Inner element of a page tree: holds children and merges their output.

    Attributes:
        child_nodes: positional children followed by keyword children.
    """

    def __init__(self, *args, **kwargs):
        """Register children; keyword names become the children's labels.

        Args:
            *args: unlabeled ``Base`` children.
            **kwargs: ``Base`` children, each labeled with its keyword via
                ``set_fieldlabel``.

        Raises:
            TypeError: any argument is not a ``Base`` instance.
        """
        self.child_nodes = []
        # ``kwargs.values()`` is a view on Python 3 and cannot be added to
        # a list directly, so it is materialised first.
        for child in list(args) + list(kwargs.values()):
            if not isinstance(child, Base):
                raise TypeError(
                    "Invalid argument of type: '%s'" % str(type(child)))
            self.child_nodes.append(child)
        for label, child in kwargs.items():
            child.set_fieldlabel(label)
        for child in self.child_nodes:
            child.parent_node = self
        super(BaseNode, self).__init__()

    def set_fieldlabel(self, name):
        """Reject labeling: node-like objects cannot be stored in a field.

        Raises:
            TypeError: always.
        """
        raise TypeError(
            "You cannot store a node-like object '{}' in model's field. "
            "You can only store leaf-like things such as Text, ShallowText "
            "and instances of other models.".format(type(self).__name__))

    def get_fieldlabels(self):
        """Return a Counter of field labels. This method is only for validation
        that no fieldlabel is written twice in page_tree.
        """
        res = Counter()
        for node in self.child_nodes:
            res.update(node.get_fieldlabels())
        return res

    def fill_thisclass_attr(self, cls):
        """Needed for the recursive ThisClass leaf nodes."""
        for node in self.child_nodes:
            node.fill_thisclass_attr(cls)

    def validate(self):
        """Check that no field label occurs more than once in the subtree.

        Raises:
            NameError: a label is used two or more times.
        """
        for label, count in self.get_fieldlabels().items():
            if count >= 2:
                raise NameError("Duplicate field label: '{}'.".format(label))

    def extract(self, selector):
        """Run every child on ``selector`` and merge the dicts they return.

        Children are processed in order, so a later child's key overwrites
        an earlier one.
        """
        res = {}
        for node in self.child_nodes:
            res.update(node.extract(selector))
        return res
|
||
|
||
class BaseLeaf(Base):
    """Terminal element of a page tree that fills exactly one model field.

    Attributes:
        fieldlabel: name of the field this leaf fills; ``None`` until
            ``set_fieldlabel`` is called.
    """

    def __init__(self):
        """Start out unlabeled."""
        self.fieldlabel = None
        super(BaseLeaf, self).__init__()

    def set_fieldlabel(self, name):
        """Label the leaf once; a second labeling raises ``NameError``."""
        if self.fieldlabel is not None:
            raise NameError("Conflict of field labels in page_tree.")
        self.fieldlabel = name

    def get_fieldlabels(self):
        """Return a Counter holding this leaf's single label.

        Raises:
            NameError: the leaf has not been given a label.
        """
        label = self.fieldlabel
        if label is None:
            raise NameError("A leaf-like node without field label exists.")
        return Counter((label,))

    def fill_thisclass_attr(self, cls):
        """Needed for the recursive ThisClass leaf nodes."""
        # Ordinary leaves have nothing to record.
        return None
|
||
|
||
|
||
class Html(BaseNode):
    """Node that adds nothing to ``BaseNode``.

    It groups child elements and inherits all behaviour unchanged.
    """
|
||
|
||
class FullNode(BaseNode):
    """A ``Node`` bound to its children.

    Instances are created by ``Node.__call__``, which also assigns the
    originating ``Node`` to ``self.node``.  Extraction selects the elements
    matching the node's css alternatives and runs the children on each.
    """

    @classmethod
    def reduce_dict_list(cls, dlist):
        """Turn a list of dicts into one dict of lists.

        Each key maps to the values it had across ``dlist``, in order;
        dicts lacking the key contribute nothing.
        """
        # Single pass; yields the same keys, order and values as collecting
        # the keys first and rescanning every dict per key.
        res = {}
        for dic in dlist:
            for key, value in dic.items():
                res.setdefault(key, []).append(value)
        return res

    @classmethod
    def concat_dict_list(cls, dlist, sep):
        """Like ``reduce_dict_list`` but join each value list with ``sep``."""
        res = cls.reduce_dict_list(dlist)
        for k in res:
            res[k] = sep.join(res[k])
        return res

    @classmethod
    def takefirst_dict_list(cls, dlist):
        """Like ``reduce_dict_list`` but keep only each key's first value.

        NOTE(review): every list built by ``reduce_dict_list`` has at least
        one element, so the ``IndexError`` branch looks unreachable and an
        empty ``dlist`` yields ``{}`` -- confirm whether an empty match
        list was meant to raise.
        """
        res = cls.reduce_dict_list(dlist)
        try:
            for k in res:
                res[k] = res[k][0]
        except IndexError:
            raise ValueError("take_first applied to an empty list!")
        return res

    def extract(self, selector):
        """Extract the children's fields from every matching element.

        Returns:
            For list nodes: a dict of joined strings (concat), of first
            values (take_first), or of lists.  For non-list nodes: the
            single match's dict, or ``{}`` when nothing matched.

        Raises:
            ValueError: from ``validate_sel_list_len`` when the number of
                matches is not allowed for this node.
        """
        sel_list = selector.css(*self.node.alts)
        self.node.validate_sel_list_len(len(sel_list))
        res_list = [super(FullNode, self).extract(sel) for sel in sel_list]
        if not self.node.is_list:
            # An explicit emptiness test replaces a bare ``except:`` around
            # ``res_list[0]``, which would also have hidden unrelated errors.
            return res_list[0] if res_list else {}
        if self.node.concat_sep is not None:
            return self.concat_dict_list(res_list, self.node.concat_sep)
        if self.node.is_take_first:
            return self.takefirst_dict_list(res_list)
        return self.reduce_dict_list(res_list)
|
||
|
||
try:
    _STRING_TYPES = (basestring,)  # Python 2: covers str and unicode
except NameError:
    _STRING_TYPES = (str,)  # Python 3 has no ``basestring`` builtin


class Node(BaseNode):
    """Css selection step of a page tree.

    ``Node("css path", ...)`` describes which elements to select; calling
    the instance with children returns a ``FullNode`` bound to it.

    Attributes:
        alts: alternative css paths; an element matching any is selected.
        is_opt: when true, zero matches are allowed for a non-list node.
        is_list: when true, any number of matches is allowed.
        concat_sep: separator used to join list results, or ``None``.
        is_take_first: when true, only the first list result is kept.
    """

    def __init__(self, *args):
        """Store the css paths.

        Raises:
            TypeError: an argument is not a string.
        """
        self.alts = []
        self.is_opt = False
        self.is_list = False
        self.concat_sep = None
        self.is_take_first = False
        for path in args:
            if not isinstance(path, _STRING_TYPES):
                raise TypeError("Invalid argument '%s' of type: '%s'. "
                                "Expected a string with a css path here." % (
                                    str(path), type(path).__name__))
            self.alts.append(path)
        super(Node, self).__init__()

    def __call__(self, *args, **kwargs):
        """Return a ``FullNode`` holding the given children and this node."""
        res = FullNode(*args, **kwargs)
        res.node = self
        return res

    @classmethod
    def list(cls, *args):
        """Alternate constructor: a node that accepts any number of matches."""
        res = cls(*args)
        res.is_list = True
        return res

    @classmethod
    def optional(cls, *args):
        """Alternate constructor: a node that may have no match."""
        res = cls(*args)
        res.is_opt = True
        return res

    def concat(self, s):
        """Join list results with separator ``s``; returns ``self``.

        Raises:
            TypeError: ``take_first`` was already requested, or this is
                not a list node.
        """
        if self.is_take_first:
            raise TypeError("take_first and concat are mutually exclusive!")
        if not self.is_list:
            raise TypeError("You can only concat a list of strings")
        self.concat_sep = s
        return self

    def take_first(self):
        """Keep only the first list result; returns ``self``.

        Raises:
            TypeError: ``concat`` was already requested, or this is not a
                list node.
        """
        if self.concat_sep is not None:
            raise TypeError("take_first and concat are mutually exclusive!")
        if not self.is_list:
            raise TypeError("You can only take_first from a list")
        self.is_take_first = True
        return self

    def validate_sel_list_len(self, size):
        """Check that ``size`` matches are acceptable for this node.

        Raises:
            ValueError: a non-list node matched more than one element, or
                a non-list, non-optional node matched none.
        """
        if self.is_list:
            return
        if size > 1:
            raise ValueError("Multiple html tags for a non-list node "
                             "'{}'.".format(" | ".join(self.alts)))
        if size == 0 and not self.is_opt:
            raise ValueError("Missing html tag for a non-optional node "
                             "'{}'.".format(" | ".join(self.alts)))

    def extract(self, selector):
        """Only check if the data is correct."""
        size = len(selector.css(*self.alts))
        self.validate_sel_list_len(size)
        return {}
|
||
|
||
class Text(BaseLeaf):
    """Leaf yielding the selected node's text.

    Whitespace at the beginning and the end of the text is automatically
    stripped.
    """

    def __init__(self):
        """Create an unlabeled text leaf."""
        super(Text, self).__init__()

    def extract(self, selector):
        """Return ``{fieldlabel: stripped text of selector}``."""
        return {self.fieldlabel: selector.text().strip()}
|
||
# TODO | ||
# Text.replace("$", "").lower() | ||
# Text.not_strip (or Text.with_whitespace or Text.retain_spaces) | ||
|
||
|
||
class Attr(BaseLeaf):
    """Leaf yielding the value of one attribute of the selected node.

    Attributes:
        attr: name of the attribute to read.
    """

    def __init__(self, attr):
        """Remember which attribute to read."""
        super(Attr, self).__init__()
        self.attr = attr

    def extract(self, selector):
        """Return ``{fieldlabel: stripped attribute value}``.

        BeautifulSoup returns multi-valued attributes such as ``class`` as
        a list of tokens; those are joined with single spaces so the result
        is always a string and ``strip`` cannot fail on a list.
        """
        value = selector.get_attr(self.attr)
        if isinstance(value, (list, tuple)):
            value = " ".join(value)
        return {self.fieldlabel: value.strip()}
|
||
|
||
class Constant(BaseLeaf):
    """Leaf that ignores the page and always yields a fixed value.

    Attributes:
        val: the value placed under this leaf's field label.
    """

    def __init__(self, val):
        """Store the fixed value ``val``."""
        super(Constant, self).__init__()
        self.val = val

    def extract(self, selector):
        """Return ``{fieldlabel: val}``; ``selector`` is not used."""
        fixed = self.val
        return {self.fieldlabel: fixed}
|
||
|
||
class ThisClass(BaseLeaf):
    """Leaf that extracts a nested instance of the enclosing model class.

    Attributes:
        this_class: class whose ``extract_unboxed`` is used; ``None`` until
            ``fill_thisclass_attr`` is called.
    """

    def __init__(self):
        """Create the leaf with no class attached yet."""
        super(ThisClass, self).__init__()
        self.this_class = None

    def extract(self, selector):
        """Return ``{fieldlabel: this_class.extract_unboxed(selector)}``."""
        nested = self.this_class.extract_unboxed(selector)
        return {self.fieldlabel: nested}

    def fill_thisclass_attr(self, cls):
        """Needed for the recursive ThisClass leaf nodes."""
        self.this_class = cls
|
||
|
||
|
||
# The strict / shallow variants are not implemented yet; for now each name
# is simply an alias of its regular counterpart.
ShallowText = Text
StrictHtml = Html
StrictNode = Node
Empty file.
Oops, something went wrong.