Skip to content

Commit

Permalink
Initial commit after moving from ankidict repo.
Browse files Browse the repository at this point in the history
  • Loading branch information
franekp committed Mar 6, 2016
1 parent dba87ef commit 7998074
Show file tree
Hide file tree
Showing 5 changed files with 787 additions and 0 deletions.
3 changes: 3 additions & 0 deletions pagemodel/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from . import html
from .html import (Node, StrictNode, Text, ShallowText,
Html, StrictHtml, ThisClass)
90 changes: 90 additions & 0 deletions pagemodel/bsoup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import six

from pagemodel.html import BaseNode, BaseLeaf, Base

import bs4


class PageModelMetaClass(type):
    """Metaclass that checks a page-model declaration when the class is built.

    Classes named ``PageModel`` or ``BasePageModel`` are created without any
    checks. Every other class must define ``model_class`` and ``page_tree``
    directly in its own body, and ``page_tree`` must be a ``BaseNode``
    instance.
    """

    def __new__(cls, name, bases, attrs):
        """Build the class, validating its declaration first.

        Raises:
            TypeError: if ``model_class`` or ``page_tree`` is missing from
                the class body, or ``page_tree`` is not a ``BaseNode``.
        """
        make_class = super(PageModelMetaClass, cls).__new__
        # The framework base classes carry no tree of their own.
        if name in ('PageModel', 'BasePageModel'):
            return make_class(cls, name, bases, attrs)
        for required in ('model_class', 'page_tree'):
            if required not in attrs:
                raise TypeError("Subclasses of PageModel must declare "
                                "'%s' attribute." % required)
        tree = attrs['page_tree']
        if not isinstance(tree, BaseNode):
            raise TypeError("Invalid type of 'page_tree' attribute.")
        tree.validate()
        new_class = make_class(cls, name, bases, attrs)
        # Hand the finished class to the tree so ThisClass leaves can
        # refer back to it.
        tree.fill_thisclass_attr(new_class)
        return new_class


class BaseBasePageModel(object):
    """Plain, empty root class used as the base of ``BasePageModel``."""


class BasePageModel(six.with_metaclass(PageModelMetaClass, BaseBasePageModel)):
    """Attach ``PageModelMetaClass`` to the hierarchy in a 2/3-compatible way."""


class PageModel(BasePageModel, BaseLeaf):
    """Declarative page model that can also act as a leaf in another tree.

    Subclasses declare ``page_tree`` (the extraction tree) and
    ``model_class`` (a callable that receives the extracted fields as
    keyword arguments).
    """

    @classmethod
    def extract_unboxed(cls, selector):
        """Extract the fields from ``selector`` and build the model object.

        Runs ``page_tree.extract``, passes the result through ``postproc``
        and returns ``model_class(**fields)``.

        Raises:
            ValueError: re-raised with the class name prefixed to the
                message, so nested models report where extraction failed.
        """
        try:
            res = cls.page_tree.extract(selector)
            # Honour the hook's return value: the default implementation
            # returns the dict, so a hook that builds a new dict must not be
            # ignored. Hooks that mutate in place and return None still work.
            processed = cls.postproc(res)
            if processed is not None:
                res = processed
            return cls.model_class(**res)
        except ValueError as a:
            raise ValueError(cls.__name__ + ": " + str(a))

    def extract(self, selector):
        """Return ``{fieldlabel: model_object}`` for use inside a parent tree."""
        res = self.extract_unboxed(selector)
        return {self.fieldlabel: res}

    def __new__(cls, page_text=None):
        """Create a leaf instance, or parse ``page_text`` straight away.

        With no argument a regular instance is created. With ``page_text``
        the text is wrapped in a ``Selector`` and the result of
        ``extract_unboxed`` is returned instead of a ``PageModel`` instance.
        """
        if page_text is None:
            return super(PageModel, cls).__new__(cls)
        return cls.extract_unboxed(Selector(page_text))

    @classmethod
    def postproc(cls, dic):
        """Hook for adjusting the extracted dict; the default returns it as is."""
        return dic


try:
    _SELECTOR_STRING_TYPES = (basestring,)  # Python 2: str and unicode
except NameError:
    _SELECTOR_STRING_TYPES = (str,)  # Python 3: basestring is gone


class Selector(object):
    """Thin wrapper around a BeautifulSoup node exposing the query API
    used by the page-tree classes.
    """

    def __init__(self, arg):
        """Wrap ``arg``.

        A string is parsed with BeautifulSoup's ``html.parser``; any other
        object is stored unchanged as the underlying node.
        """
        if isinstance(arg, _SELECTOR_STRING_TYPES):
            self.sel = bs4.BeautifulSoup(arg, "html.parser")
        else:
            self.sel = arg

    def css(self, *paths):
        """Return a list of nodes that satisfy any of provided
        css paths.
        """
        # Each path has to be queried separately, one after another,
        # because according to BS4 the comma binds more tightly than the
        # space in css selectors.
        found = []
        for path in paths:
            found.extend(self.sel.select(path.strip()))
        return [Selector(node) for node in found]

    def text(self):
        """Return all the text contained in a node as a string."""
        return self.sel.get_text()

    def textlist(self):
        """Return the node's ``strings`` as a list."""
        return list(self.sel.strings)

    def get_attr(self, attr_name):
        """Return the value stored under ``attr_name`` on the wrapped node."""
        return self.sel[attr_name]
267 changes: 267 additions & 0 deletions pagemodel/html.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,267 @@
from collections import Counter


# TODO:
# - flatten_lists switch in PageModel subclasses - when lists are nested, they
# are flattened
# - If(Attr("class") == "myclass") (...)
# - ParentNode node-like thing to access parent node space from child nodes
# - Node.nth(num)("<css-selector>"), Node.first, Node.second, Node.third
# - more accurate, informative error handling


class Base(object):
    """Interface shared by every element of a page tree.

    All four methods are abstract here and raise ``NotImplementedError``.
    """

    def extract(self, selector):
        """Return a dict holding the (possibly partial) extraction results."""
        raise NotImplementedError()

    def set_fieldlabel(self, lab):
        """Assign the field label ``lab`` to this element."""
        raise NotImplementedError()

    def get_fieldlabels(self):
        """Report the field labels used by this element."""
        raise NotImplementedError()

    def fill_thisclass_attr(self, cls):
        """Receive the model class that owns the tree."""
        raise NotImplementedError()


class BaseNode(Base):
    """Inner element of a page tree: holds child elements and merges
    their extraction results.
    """

    def __init__(self, *args, **kwargs):
        """Collect the children.

        Positional arguments are unlabeled children; each keyword argument
        is a child whose field label is the keyword name.

        Raises:
            TypeError: if any argument is not a ``Base`` instance.
        """
        # list() around the values keeps this working on Python 3, where
        # dict.values() is a view that cannot be added to a list.
        children = list(args) + list(kwargs.values())
        for child in children:
            if not isinstance(child, Base):
                raise TypeError(
                    "Invalid argument of type: '%s'" % str(type(child)))
        self.child_nodes = children
        for label in kwargs:
            kwargs[label].set_fieldlabel(label)
        for child in self.child_nodes:
            child.parent_node = self
        super(BaseNode, self).__init__()

    def set_fieldlabel(self, name):
        """Always fail: a node cannot be stored in a model field.

        Raises:
            TypeError: unconditionally.
        """
        raise TypeError(
            "You cannot store a node-like object '{}' in model's field. "
            "You can only store leaf-like things such as Text, ShallowText "
            "and instances of other models.".format(type(self).__name__))

    def get_fieldlabels(self):
        """Return a Counter of field labels. This method is only for validation
        that no fieldlabel is written twice in page_tree.
        """
        res = Counter()
        for node in self.child_nodes:
            res.update(node.get_fieldlabels())
        return res

    def fill_thisclass_attr(self, cls):
        """Needed for the recursive ThisClass leaf nodes."""
        for node in self.child_nodes:
            node.fill_thisclass_attr(cls)

    def validate(self):
        """Check that no field label occurs more than once below this node.

        Raises:
            NameError: if a label is used twice or more.
        """
        for label, count in self.get_fieldlabels().items():
            if count >= 2:
                raise NameError("Duplicate field label: '{}'.".format(label))

    def extract(self, selector):
        """Run every child against ``selector`` and merge the result dicts."""
        res = {}
        for node in self.child_nodes:
            res.update(node.extract(selector))
        return res


class BaseLeaf(Base):
    """Terminal element of a page tree; carries exactly one field label."""

    def __init__(self):
        self.fieldlabel = None  # stays None until set_fieldlabel() is called
        super(BaseLeaf, self).__init__()

    def set_fieldlabel(self, name):
        """Store ``name`` as the label; a label may be assigned only once.

        Raises:
            NameError: if a label has already been assigned.
        """
        if self.fieldlabel is not None:
            raise NameError("Conflict of field labels in page_tree.")
        self.fieldlabel = name

    def get_fieldlabels(self):
        """Return a Counter containing this leaf's label once.

        Raises:
            NameError: if no label has been assigned.
        """
        label = self.fieldlabel
        if label is None:
            raise NameError("A leaf-like node without field label exists.")
        counts = Counter()
        counts[label] += 1
        return counts

    def fill_thisclass_attr(self, cls):
        """Needed for the recursive ThisClass leaf nodes; nothing to do here."""



class Html(BaseNode):
    """Node that adds nothing to ``BaseNode``: its children are extracted
    from the selector it is given.
    """


class FullNode(BaseNode):
    """A ``Node`` together with its children.

    Instances are produced by ``Node.__call__``, which stores the
    originating ``Node`` in the ``node`` attribute.
    """

    @classmethod
    def reduce_dict_list(cls, dlist):
        """Turn a list of dicts into a dict of lists.

        Each key maps to the values found under it, in ``dlist`` order;
        dicts that lack the key contribute nothing.
        """
        res = {}
        # Single pass instead of rescanning every dict for every key.
        for dic in dlist:
            for key, value in dic.items():
                res.setdefault(key, []).append(value)
        return res

    @classmethod
    def concat_dict_list(cls, dlist, sep):
        """Like ``reduce_dict_list``, but join each value list with ``sep``."""
        res = cls.reduce_dict_list(dlist)
        for k in res:
            res[k] = sep.join(res[k])
        return res

    @classmethod
    def takefirst_dict_list(cls, dlist):
        """Like ``reduce_dict_list``, but keep only the first value per key.

        Raises:
            ValueError: if a value list turns out to be empty.
        """
        res = cls.reduce_dict_list(dlist)
        try:
            for k in res:
                res[k] = res[k][0]
        except IndexError:
            raise ValueError("take_first applied to an empty list!")
        return res

    def extract(self, selector):
        """Select the matching tags and extract the children from each.

        The match count is validated by the originating ``Node``. For list
        nodes the per-tag results are concatenated, reduced to the first
        value, or collected into lists, depending on the node's settings.
        For non-list nodes the single result is returned, or an empty dict
        when nothing matched.
        """
        sel_list = selector.css(*self.node.alts)
        self.node.validate_sel_list_len(len(sel_list))
        res_list = [super(FullNode, self).extract(sel) for sel in sel_list]
        if not self.node.is_list:
            # Explicit emptiness check instead of a bare ``except:`` that
            # would also hide unrelated errors.
            return res_list[0] if res_list else {}
        if self.node.concat_sep is not None:
            return self.concat_dict_list(res_list, self.node.concat_sep)
        if self.node.is_take_first:
            return self.takefirst_dict_list(res_list)
        return self.reduce_dict_list(res_list)


try:
    _NODE_STRING_TYPES = (basestring,)  # Python 2: str and unicode
except NameError:
    _NODE_STRING_TYPES = (str,)  # Python 3: basestring is gone


class Node(BaseNode):
    """CSS-selected tag description.

    Calling an instance with children yields a ``FullNode``; used on its
    own in a tree it only checks that the expected tags are present.
    """

    def __init__(self, *args):
        """Store the alternative css paths.

        Raises:
            TypeError: if any argument is not a string.
        """
        self.alts = []
        self.is_opt = False
        self.is_list = False
        self.concat_sep = None
        self.is_take_first = False
        for i in args:
            if not isinstance(i, _NODE_STRING_TYPES):
                raise TypeError("Invalid argument '%s' of type: '%s'. "
                                "Expected a string with a css path here." % (
                                    str(i), type(i).__name__
                                ))
            self.alts.append(i)
        super(Node, self).__init__()

    def __call__(self, *args, **kwargs):
        """Return a ``FullNode`` holding the given children, with this
        node attached as its ``node`` attribute.
        """
        res = FullNode(*args, **kwargs)
        res.node = self
        return res

    @classmethod
    def list(cls, *args):
        """Create a node that accepts any number of matching tags."""
        res = cls(*args)
        res.is_list = True
        return res

    @classmethod
    def optional(cls, *args):
        """Create a node that is allowed to match no tag at all."""
        res = cls(*args)
        res.is_opt = True
        return res

    def concat(self, s):
        """Join the list results with separator ``s``; returns ``self``.

        Raises:
            TypeError: if ``take_first`` was already chosen or the node is
                not a list node.
        """
        if self.is_take_first:
            raise TypeError("take_first and concat are mutually exclusive!")
        if not self.is_list:
            raise TypeError("You can only concat a list of strings")
        self.concat_sep = s
        return self

    def take_first(self):
        """Keep only the first list result; returns ``self``.

        Raises:
            TypeError: if ``concat`` was already chosen or the node is not
                a list node.
        """
        if self.concat_sep is not None:
            raise TypeError("take_first and concat are mutually exclusive!")
        if not self.is_list:
            raise TypeError("You can only take_first from a list")
        self.is_take_first = True
        return self

    def validate_sel_list_len(self, size):
        """Check the number of matched tags against this node's settings.

        List nodes accept any count. Other nodes reject more than one
        match, and reject zero matches unless the node is optional.

        Raises:
            ValueError: if the count is not acceptable.
        """
        if self.is_list:
            return
        if size > 1:
            raise ValueError("Multiple html tags for a non-list node "
                             "'{}'.".format(" | ".join(self.alts)))
        if size == 0 and (not self.is_opt):
            raise ValueError("Missing html tag for a non-optional node "
                             "'{}'.".format(" | ".join(self.alts)))

    def extract(self, selector):
        """Only check if the data is correct."""
        size = len(selector.css(*self.alts))
        self.validate_sel_list_len(size)
        return {}


class Text(BaseLeaf):
    """Leaf that yields the node's text with surrounding whitespace removed."""

    def __init__(self):
        super(Text, self).__init__()

    def extract(self, selector):
        """Return ``{fieldlabel: stripped_text}`` for ``selector``."""
        stripped = selector.text().strip()
        return {self.fieldlabel: stripped}

# TODO
# Text.replace("$", "").lower()
# Text.not_strip (or Text.with_whitespace or Text.retain_spaces)


class Attr(BaseLeaf):
    """Leaf that yields the value of one attribute of the selected tag."""

    def __init__(self, attr):
        """Remember ``attr``, the name of the attribute to read."""
        super(Attr, self).__init__()
        self.attr = attr

    def extract(self, selector):
        """Return ``{fieldlabel: attribute_value}`` with whitespace stripped.

        BeautifulSoup reports multi-valued attributes such as ``class`` as
        a list of strings; such a value is joined with single spaces so
        that stripping does not fail on a list.
        """
        value = selector.get_attr(self.attr)
        if isinstance(value, (list, tuple)):
            value = " ".join(value)
        return {self.fieldlabel: value.strip()}


class Constant(BaseLeaf):
    """Leaf that ignores the page and always yields a fixed value."""

    def __init__(self, val):
        self.val = val  # the value reported by every extract() call
        super(Constant, self).__init__()

    def extract(self, selector):
        """Return ``{fieldlabel: val}``; ``selector`` is not consulted."""
        result = {}
        result[self.fieldlabel] = self.val
        return result


class ThisClass(BaseLeaf):
    """Leaf that extracts an instance of the model class owning the tree,
    which makes recursive models possible.
    """

    def __init__(self):
        self.this_class = None  # filled in later by fill_thisclass_attr()
        super(ThisClass, self).__init__()

    def extract(self, selector):
        """Return ``{fieldlabel: this_class.extract_unboxed(selector)}``."""
        nested = self.this_class.extract_unboxed(selector)
        return {self.fieldlabel: nested}

    def fill_thisclass_attr(self, cls):
        """Needed for the recursive ThisClass leaf nodes: remember ``cls``."""
        self.this_class = cls



# Not implemented yet: for now each of these names is a plain alias of the
# class on the right-hand side.
StrictNode = Node
StrictHtml = Html
ShallowText = Text
Empty file added pagemodel/tests/__init__.py
Empty file.
Loading

0 comments on commit 7998074

Please sign in to comment.