Initial commit after moving from ankidict repo.

franekp · Mar 6, 2016 · 7998074 · 7998074
1 parent dba87ef
commit 7998074
Show file tree

Hide file tree

Showing 5 changed files with 787 additions and 0 deletions.
diff --git a/pagemodel/__init__.py b/pagemodel/__init__.py
@@ -0,0 +1,3 @@
+from . import html
+from .html import (Node, StrictNode, Text, ShallowText,
+                       Html, StrictHtml, ThisClass)
diff --git a/pagemodel/bsoup.py b/pagemodel/bsoup.py
@@ -0,0 +1,90 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import six
+
+from pagemodel.html import BaseNode, BaseLeaf, Base
+
+import bs4
+
+
+class PageModelMetaClass(type):
+    """Metaclass that checks PageModel subclasses when they are defined.
+
+    Classes named 'PageModel' or 'BasePageModel' are created without any
+    checks. Every other class must declare 'model_class' and a 'page_tree'
+    that is a BaseNode instance; the tree is validated and then informed
+    about the freshly created class through fill_thisclass_attr().
+    """
+
+    def __new__(cls, name, bases, attrs):
+        """Create the class, raising TypeError on a malformed declaration."""
+        create = super(PageModelMetaClass, cls).__new__
+        # The framework's own base classes carry no page_tree to check.
+        if name == 'PageModel' or name == 'BasePageModel':
+            return create(cls, name, bases, attrs)
+        for required in ('model_class', 'page_tree'):
+            if required not in attrs:
+                raise TypeError("Subclasses of PageModel must declare "
+                    "'%s' attribute." % required)
+        tree = attrs['page_tree']
+        if not isinstance(tree, BaseNode):
+            raise TypeError("Invalid type of 'page_tree' attribute.")
+        tree.validate()
+        new_class = create(cls, name, bases, attrs)
+        # Leaves such as ThisClass need a reference to the finished class.
+        tree.fill_thisclass_attr(new_class)
+        return new_class
+
+
+class BaseBasePageModel(object):
+    """Empty plain base class passed to six.with_metaclass for BasePageModel."""
+    pass
+
+
+class BasePageModel(six.with_metaclass(PageModelMetaClass, BaseBasePageModel)):
+    """Empty class that attaches PageModelMetaClass via six.with_metaclass.
+
+    The metaclass skips its checks for this class name.
+    """
+    pass
+
+
+class PageModel(BasePageModel, BaseLeaf):
+    """Base class for page models; also usable as a leaf in another tree.
+
+    Subclasses declare 'page_tree' and 'model_class' (enforced by
+    PageModelMetaClass). Calling a subclass with page text parses it and
+    returns a 'model_class' instance; calling it without arguments creates
+    an instance that can be labelled and placed in another page_tree.
+    """
+
+    @classmethod
+    def extract_unboxed(cls, selector):
+        """Extract this model from 'selector' and build a model_class object.
+
+        The dict produced by page_tree.extract() goes through postproc()
+        and is then used as keyword arguments for model_class. A ValueError
+        raised on the way is re-raised as a ValueError whose message is
+        prefixed with this class's name.
+        """
+        try:
+            res = cls.page_tree.extract(selector)
+            # Honour a dict returned by postproc(); a hook that only mutates
+            # its argument and returns None keeps working as before.
+            processed = cls.postproc(res)
+            if processed is not None:
+                res = processed
+            return cls.model_class(**res)
+        except ValueError as a:
+            raise ValueError(cls.__name__ + ": " + str(a))
+
+    def extract(self, selector):
+        """Return {fieldlabel: extracted model} for use as a leaf."""
+        res = self.extract_unboxed(selector)
+        return {self.fieldlabel: res}
+
+    def __new__(cls, page_text=None):
+        """Create a leaf instance, or parse 'page_text' when it is given.
+
+        With 'page_text' the result is whatever extract_unboxed() returns
+        (a model_class object), not an instance of this class.
+        """
+        if page_text is None:
+            res = super(PageModel, cls).__new__(cls)
+            return res
+        else:
+            return cls.extract_unboxed(Selector(page_text))
+
+    @classmethod
+    def postproc(cls, dic):
+        """Hook for adjusting the extracted dict; returns it unchanged here."""
+        return dic
+
+
+class Selector(object):
+    """Thin wrapper around a BeautifulSoup object used by page_tree nodes."""
+
+    def __init__(self, arg):
+        """Wrap 'arg'; a string is first parsed with bs4's "html.parser"."""
+        # six.string_types is basestring on Python 2 and str on Python 3,
+        # where the bare name 'basestring' no longer exists.
+        if isinstance(arg, six.string_types):
+            self.sel = bs4.BeautifulSoup(arg, "html.parser")
+        else:
+            self.sel = arg
+
+    def css(self, *paths):
+        """Return a list of nodes that satisfy any of provided
+        css paths.
+        """
+        # The paths have to be queried one after another, because
+        # according to BS4 the comma binds more tightly than the space
+        # in css selectors.
+        sel_list = [self.sel.select(path.strip()) for path in paths]
+        sel_list = [el for chunk in sel_list for el in chunk]
+        return [Selector(sel) for sel in sel_list]
+
+    def text(self):
+        """Return all the text contained in a node as a string."""
+        return self.sel.get_text()
+
+    def textlist(self):
+        """Return the node's strings (self.sel.strings) as a list."""
+        return list(self.sel.strings)
+
+    def get_attr(self, attr_name):
+        """Return the value of attribute 'attr_name' of the wrapped node."""
+        return self.sel[attr_name]
diff --git a/pagemodel/html.py b/pagemodel/html.py
@@ -0,0 +1,267 @@
+from collections import Counter
+
+
+# TODO:
+# - flatten_lists switch in PageModel subclasses - when lists are nested, they
+#    are flattened
+# - If(Attr("class") == "myclass") (...)
+# - ParentNode node-like thing to access parent node space from child nodes
+# - Node.nth(num)("<css-selector>"), Node.first, Node.second, Node.third
+# - more accurate, informative error handling
+
+
+class Base(object):
+    """Common interface of every page_tree element.
+
+    All methods raise NotImplementedError here and are meant to be
+    overridden by subclasses.
+    """
+
+    def extract(self, selector):
+        """Always return a dict with (possibly partial) results."""
+        raise NotImplementedError
+
+    def set_fieldlabel(self, lab):
+        """Assign the field label 'lab' to this element."""
+        raise NotImplementedError
+
+    def get_fieldlabels(self):
+        """Return the field labels used by this element."""
+        raise NotImplementedError
+
+    def fill_thisclass_attr(self, cls):
+        """Receive the class 'cls' that owns the page_tree."""
+        raise NotImplementedError
+
+
+class BaseNode(Base):
+    """Node-like element of a page_tree: a container for child elements.
+
+    Children are passed positionally or as keyword arguments; a keyword
+    name becomes the field label of that child.
+    """
+
+    def __init__(self, *args, **kwargs):
+        """Collect the children and label the keyword ones.
+
+        Raises TypeError when an argument is not a Base instance.
+        """
+        self.child_nodes = []
+        # list() around values() is needed on Python 3, where values()
+        # returns a view that cannot be concatenated to a list.
+        for i in list(args) + list(kwargs.values()):
+            if isinstance(i, Base):
+                self.child_nodes.append(i)
+            else:
+                raise TypeError(
+                    "Invalid argument of type: '%s'" % str(type(i)))
+        for i in kwargs:
+            kwargs[i].set_fieldlabel(i)
+        for i in self.child_nodes:
+            i.parent_node = self
+        super(BaseNode, self).__init__()
+
+    def set_fieldlabel(self, name):
+        """Always raise TypeError: node-like objects cannot be labelled."""
+        # Each fragment ends with a space so that the implicitly
+        # concatenated literals form readable sentences.
+        raise TypeError("You cannot store a node-like "
+            "object '{}' in model's field. You can only store leaf-like things "
+            "such as Text, ShallowText and instances of other models.".format(
+                type(self).__name__))
+
+    def get_fieldlabels(self):
+        """Return a Counter of field labels. This method is only for validation
+        that no fieldlabel is written twice in page_tree.
+        """
+        res = Counter()
+        for node in self.child_nodes:
+            res.update(node.get_fieldlabels())
+        return res
+
+    def fill_thisclass_attr(self, cls):
+        """Needed for the recursive ThisClass leaf nodes."""
+        for node in self.child_nodes:
+            node.fill_thisclass_attr(cls)
+
+    def validate(self):
+        """Raise NameError when a field label occurs twice or more."""
+        res = self.get_fieldlabels()
+        for label in res:
+            if res[label] >= 2:
+                raise NameError("Duplicate field label: '{}'.".format(label))
+
+    def extract(self, selector):
+        """Merge the dicts extracted by all children from 'selector'."""
+        res = {}
+        for node in self.child_nodes:
+            res.update(node.extract(selector))
+        return res
+
+
+class BaseLeaf(Base):
+    """Leaf-like element of a page_tree: carries exactly one field label."""
+
+    def __init__(self):
+        # No label yet; set_fieldlabel() assigns it later.
+        self.fieldlabel = None
+        super(BaseLeaf, self).__init__()
+
+    def set_fieldlabel(self, name):
+        """Store 'name' as the label; raise NameError if one is already set."""
+        if self.fieldlabel is not None:
+            raise NameError("Conflict of field labels in page_tree.")
+        self.fieldlabel = name
+
+    def get_fieldlabels(self):
+        """Return a Counter holding this leaf's label once.
+
+        Raises NameError when the leaf has no label.
+        """
+        label = self.fieldlabel
+        if label is None:
+            raise NameError("A leaf-like node without field label exists.")
+        return Counter((label,))
+
+    def fill_thisclass_attr(self, cls):
+        """Needed for the recursive ThisClass leaf nodes."""
+        pass
+
+
+
+class Html(BaseNode):
+    """Node-like element that adds nothing to BaseNode.
+
+    NOTE(review): presumably meant as the root of a page_tree - confirm.
+    """
+    pass
+
+
+class FullNode(BaseNode):
+    """A Node together with its children, as created by Node.__call__().
+
+    The 'node' attribute (assigned by Node.__call__) holds the css paths
+    and the list/optional/concat/take_first settings used by extract().
+    """
+
+    @classmethod
+    def reduce_dict_list(cls, dlist):
+        """Turn a list of dicts into one dict of lists.
+
+        Each key maps to the values found under it, in 'dlist' order;
+        dicts lacking the key are skipped.
+        """
+        res = {}
+        for dic in dlist:
+            res.update(dic)
+        for k in res:
+            res[k] = [dic[k] for dic in dlist if k in dic]
+        return res
+
+    @classmethod
+    def concat_dict_list(cls, dlist, sep):
+        """Like reduce_dict_list(), but join each list of values with 'sep'."""
+        res = cls.reduce_dict_list(dlist)
+        for k in res:
+            res[k] = sep.join(res[k])
+        return res
+
+    @classmethod
+    def takefirst_dict_list(cls, dlist):
+        """Like reduce_dict_list(), but keep only the first value per key.
+
+        Raises ValueError when a list of values turns out to be empty.
+        """
+        res = cls.reduce_dict_list(dlist)
+        try:
+            for k in res:
+                res[k] = res[k][0]
+        except IndexError:
+            raise ValueError("take_first applied to an empty list!")
+        return res
+
+    def extract(self, selector):
+        """Extract the children from every tag matched by the node's paths.
+
+        For a list node the per-tag dicts are combined (concatenated,
+        first-taken or collected into lists, depending on the node's
+        settings). Otherwise the single dict is returned, or {} when
+        nothing matched.
+        """
+        sel_list = selector.css(*self.node.alts)
+        self.node.validate_sel_list_len(len(sel_list))
+        res_list = [super(FullNode, self).extract(sel) for sel in sel_list]
+        if self.node.is_list:
+            if self.node.concat_sep is not None:
+                return self.concat_dict_list(res_list, self.node.concat_sep)
+            elif self.node.is_take_first:
+                return self.takefirst_dict_list(res_list)
+            else:
+                return self.reduce_dict_list(res_list)
+        else:
+            try:
+                return res_list[0]
+            except IndexError:
+                # No tag matched (validation let this pass): nothing to
+                # report. Only IndexError is expected here, so other
+                # errors are no longer swallowed.
+                return {}
+
+
+class Node(BaseNode):
+    """A css-selected node of the page_tree, without children.
+
+    Calling the instance with children, Node("css")(child, label=leaf),
+    returns a FullNode that refers back to this node.
+    """
+
+    def __init__(self, *args):
+        """Store the alternative css paths given as string arguments.
+
+        Raises TypeError when an argument is not a string.
+        """
+        # 'basestring' exists only on Python 2; fall back to 'str' so that
+        # css paths are also accepted on Python 3.
+        try:
+            string_types = basestring
+        except NameError:
+            string_types = str
+        self.alts = []
+        self.is_opt = False
+        self.is_list = False
+        self.concat_sep = None
+        self.is_take_first = False
+        for i in args:
+            if isinstance(i, string_types):
+                self.alts.append(i)
+            else:
+                raise TypeError("Invalid argument '%s' of type: '%s'. "
+                    "Expected a string with a css path here." % (
+                        str(i), type(i).__name__
+                    )
+                )
+        super(Node, self).__init__()
+
+    def __call__(self, *args, **kwargs):
+        """Return a FullNode holding the given children and this node."""
+        res = FullNode(*args, **kwargs)
+        res.node = self
+        return res
+
+    @classmethod
+    def list(cls, *args):
+        """Create a node that may match any number of tags."""
+        res = cls(*args)
+        res.is_list = True
+        return res
+
+    @classmethod
+    def optional(cls, *args):
+        """Create a node that is allowed to match no tag at all."""
+        res = cls(*args)
+        res.is_opt = True
+        return res
+
+    def concat(self, s):
+        """Join the values of a list node with separator 's'; return self.
+
+        Raises TypeError for a non-list node or after take_first().
+        """
+        if self.is_take_first:
+            raise TypeError("take_first and concat are mutually exclusive!")
+        if not self.is_list:
+            raise TypeError("You can only concat a list of strings")
+        self.concat_sep = s
+        return self
+
+    def take_first(self):
+        """Keep only the first value of a list node; return self.
+
+        Raises TypeError for a non-list node or after concat().
+        """
+        if self.concat_sep is not None:
+            raise TypeError("take_first and concat are mutually exclusive!")
+        if not self.is_list:
+            raise TypeError("You can only take_first from a list")
+        self.is_take_first = True
+        return self
+
+    def validate_sel_list_len(self, size):
+        """Check the number of matched tags against this node's settings.
+
+        A list node accepts any size. Otherwise ValueError is raised for
+        more than one match, and for zero matches unless the node is
+        optional.
+        """
+        if self.is_list:
+            pass
+        else:
+            if size > 1:
+                raise ValueError("Multiple html tags for a non-list node "
+                    "'{}'.".format(" | ".join(self.alts)))
+            if size == 0 and (not self.is_opt):
+                raise ValueError("Missing html tag for a non-optional node "
+                    "'{}'.".format(" | ".join(self.alts)))
+
+    def extract(self, selector):
+        """Only check if the data is correct."""
+        size = len(selector.css(*self.alts))
+        self.validate_sel_list_len(size)
+        return {}
+
+
+class Text(BaseLeaf):
+    """Leaf that stores the text of the selected node.
+
+    Whitespace at the beginning and the end of the text is automatically
+    stripped.
+    """
+
+    def __init__(self):
+        super(Text, self).__init__()
+
+    def extract(self, selector):
+        """Return {fieldlabel: stripped text of 'selector'}."""
+        return {self.fieldlabel: selector.text().strip()}
+
+    # TODO
+    # Text.replace("$", "").lower()
+    # Text.not_strip (or Text.with_whitespace or Text.retain_spaces)
+
+
+class Attr(BaseLeaf):
+    """Leaf that stores the stripped value of one attribute of the node."""
+
+    def __init__(self, attr):
+        super(Attr, self).__init__()
+        # Name of the attribute to read in extract().
+        self.attr = attr
+
+    def extract(self, selector):
+        """Return {fieldlabel: stripped value of the attribute}."""
+        raw_value = selector.get_attr(self.attr)
+        return {self.fieldlabel: raw_value.strip()}
+
+
+class Constant(BaseLeaf):
+    """Leaf that always yields the value given at construction time."""
+
+    def __init__(self, val):
+        super(Constant, self).__init__()
+        self.val = val
+
+    def extract(self, selector):
+        """Return {fieldlabel: val}; 'selector' is not consulted."""
+        constant = self.val
+        return {self.fieldlabel: constant}
+
+
+class ThisClass(BaseLeaf):
+    """Leaf that extracts the class handed to fill_thisclass_attr()."""
+
+    def __init__(self):
+        super(ThisClass, self).__init__()
+        # Filled in later by fill_thisclass_attr().
+        self.this_class = None
+
+    def extract(self, selector):
+        """Return {fieldlabel: this_class.extract_unboxed(selector)}."""
+        return {self.fieldlabel: self.this_class.extract_unboxed(selector)}
+
+    def fill_thisclass_attr(self, cls):
+        """Needed for the recursive ThisClass leaf nodes."""
+        self.this_class = cls
+
+
+
+# Not implemented yet: these names are currently plain aliases of the
+# classes on the right-hand side.
+StrictNode = Node
+StrictHtml = Html
+ShallowText = Text
diff --git a/pagemodel/tests/__init__.py b/pagemodel/tests/__init__.py