Merge pull request #1 from wojtex/master

Added more Python 2/3 portability and XPath support.
franekp · Mar 9, 2016 · c52a582 · c52a582
2 parents 7998074 + d1f2eea
commit c52a582
Show file tree

Hide file tree

Showing 5 changed files with 226 additions and 55 deletions.
diff --git a/pagemodel/__init__.py b/pagemodel/__init__.py
@@ -1,3 +1,4 @@
 from . import html
 from .html import (Node, StrictNode, Text, ShallowText,
                        Html, StrictHtml, ThisClass)
+
diff --git a/pagemodel/bsoup.py b/pagemodel/bsoup.py
@@ -4,33 +4,11 @@
 import six
 
 from pagemodel.html import BaseNode, BaseLeaf, Base
+from pagemodel.pagemodel import PageModelMetaClass, BaseBasePageModel
 
 import bs4
 
 
-class PageModelMetaClass(type):
-    def __new__(cls, name, bases, attrs):
-        if name in ['PageModel', 'BasePageModel']:
-            return super(PageModelMetaClass, cls).__new__(cls, name, bases, attrs)
-        if 'model_class' not in attrs:
-            raise TypeError("Subclasses of PageModel must declare "
-                "'model_class' attribute.")
-        if 'page_tree' not in attrs:
-            raise TypeError("Subclasses of PageModel must declare "
-                "'page_tree' attribute.")
-        page_tree = attrs['page_tree']
-        if not isinstance(page_tree, BaseNode):
-            raise TypeError("Invalid type of 'page_tree' attribute.")
-        page_tree.validate()
-        res = super(PageModelMetaClass, cls).__new__(cls, name, bases, attrs)
-        page_tree.fill_thisclass_attr(res)
-        return res
-
-
-class BaseBasePageModel(object):
-    pass
-
-
 class BasePageModel(six.with_metaclass(PageModelMetaClass, BaseBasePageModel)):
     pass
 
@@ -63,7 +41,7 @@ def postproc(cls, dic):
 
 class Selector(object):
     def __init__(self, arg):
-        if isinstance(arg, basestring):
+        if isinstance(arg, six.string_types):
             self.sel = bs4.BeautifulSoup(arg, "html.parser")
         else:
             self.sel = arg
@@ -80,11 +58,22 @@ def css(self, *paths):
         return [Selector(sel) for sel in sel_list]
 
     def text(self):
-        """Return all the text contained in a node as a string."""
+        """
+        Return all the text contained in a node as a string.
+        :return: String containing all the text inside the node, without tags.
+        """
         return self.sel.get_text()
 
     def textlist(self):
         return list(self.sel.strings)
 
     def get_attr(self, attr_name):
-        return self.sel[attr_name]
+        """
+        Returns value of specified attribute of this tag.
+        :param attr_name: Name of the attribute.
+        :return: Value of the attribute.
+        """
+        attr_value = self.sel[attr_name]
+        if isinstance(attr_value, list):
+            attr_value = ' '.join(attr_value)
+        return attr_value
diff --git a/pagemodel/html.py b/pagemodel/html.py
@@ -1,14 +1,12 @@
 from collections import Counter
+import six
 
 
 # TODO:
 # - flatten_lists switch in PageModel subclasses - when lists are nested, they
 #    are flattened
 # - If(Attr("class") == "myclass") (...)
 # - ParentNode node-like thing to access parent node space from child nodes
-# - Node.nth(num)("<css-selector>"), Node.first, Node.second, Node.third
-# - more accurate, informative error handling
-
 
 class Base(object):
     def extract(self, selector):
@@ -28,7 +26,7 @@ def fill_thisclass_attr(self, cls):
 class BaseNode(Base):
     def __init__(self, *args, **kwargs):
         self.child_nodes = []
-        for i in list(args) + kwargs.values():
+        for i in list(args) + list(kwargs.values()):
             if isinstance(i, Base):
                 self.child_nodes.append(i)
             else:
@@ -42,9 +40,9 @@ def __init__(self, *args, **kwargs):
 
     def set_fieldlabel(self, name):
         raise TypeError("You cannot store a node-like"
-            "object '{}' in model's field. You can only store leaf-like things"
-            "such as Text, ShallowText and instances of other models.".format(
-                type(self).__name__))
+                        " object '{}' in model's field. You can only store leaf-like things"
+                        " such as Text, ShallowText and instances of other models.".format(
+            type(self).__name__))  # each fragment starts with a space so the joined message keeps its word breaks
 
     def get_fieldlabels(self):
         """Return a Counter of field labels. This method is only for validation
@@ -94,12 +92,11 @@ def fill_thisclass_attr(self, cls):
         pass
 
 
-
 class Html(BaseNode):
     pass
 
 
-class FullNode(BaseNode):
+class BaseSelectorFullNode(BaseNode):
     @classmethod
     def reduce_dict_list(cls, dlist):
         res = {}
@@ -126,10 +123,13 @@ def takefirst_dict_list(cls, dlist):
             raise ValueError("take_first applied to an empty list!")
         return res
 
+    def _make_select(self, selector, *alts):
+        raise NotImplementedError
+
     def extract(self, selector):
-        sel_list = selector.css(*self.node.alts)
+        sel_list = self._make_select(selector, *self.node.alts)
         self.node.validate_sel_list_len(len(sel_list))
-        res_list = [super(FullNode, self).extract(sel) for sel in sel_list]
+        res_list = [super(BaseSelectorFullNode, self).extract(sel) for sel in sel_list]
         if self.node.is_list:
             if self.node.concat_sep is not None:
                 return self.concat_dict_list(res_list, self.node.concat_sep)
@@ -144,26 +144,29 @@ def extract(self, selector):
                 return {}
 
 
-class Node(BaseNode):
+class BaseSelectorNode(BaseNode):
     def __init__(self, *args):
         self.alts = []
         self.is_opt = False
         self.is_list = False
         self.concat_sep = None
         self.is_take_first = False
         for i in args:
-            if isinstance(i, basestring):
+            if isinstance(i, six.string_types):
                 self.alts.append(i)
             else:
                 raise TypeError("Invalid argument '%s' of type: '%s'. "
-                    "Expected a string with a css path here." % (
-                        str(i), type(i).__name__
-                    )
-                )
-        super(Node, self).__init__()
+                                "Expected a string with a selector path here." % (
+                                    str(i), type(i).__name__
+                                )
+                                )
+        super(BaseSelectorNode, self).__init__()
+
+    def _make_full(self, *args, **kwargs):
+        raise NotImplementedError
 
     def __call__(self, *args, **kwargs):
-        res = FullNode(*args, **kwargs)
+        res = self._make_full(*args, **kwargs)
         res.node = self
         return res
 
@@ -201,31 +204,71 @@ def validate_sel_list_len(self, size):
         else:
             if size > 1:
                 raise ValueError("Multiple html tags for a non-list node "
-                    "'{}'.".format(" | ".join(self.alts)))
+                                 "'{}'.".format(" | ".join(self.alts)))
             if size == 0 and (not self.is_opt):
                 raise ValueError("Missing html tag for a non-optional node "
-                    "'{}'.".format(" | ".join(self.alts)))
+                                 "'{}'.".format(" | ".join(self.alts)))
+
+    def _make_select(self, selector, *alts):
+        raise NotImplementedError
 
     def extract(self, selector):
         """Only check if the data is correct."""
-        size = len(selector.css(*self.alts))
+        size = len(self._make_select(selector, *self.alts))  # unpack: _make_select takes each path as its own *alts argument
         self.validate_sel_list_len(size)
         return {}
 
 
+class FullNode(BaseSelectorFullNode):
+    def _make_select(self, selector, *alts):
+        return selector.css(*alts)
+
+
+class Node(BaseSelectorNode):
+    def _make_full(self, *args, **kwargs):
+        return FullNode(*args, **kwargs)
+
+    def _make_select(self, selector, *alts):
+        return selector.css(*alts)
+
+
+class FullXPath(BaseSelectorFullNode):
+    def _make_select(self, selector, *alts):
+        return selector.xpath(*alts)
+
+
+class XPath(BaseSelectorNode):
+    def _make_full(self, *args, **kwargs):
+        return FullXPath(*args, **kwargs)
+
+    def _make_select(self, selector, *alts):
+        return selector.xpath(*alts)
+
+
 class Text(BaseLeaf):
     """Whitespace at the beginning and the end of the text is automatically stripped."""
+
     def __init__(self):
         super(Text, self).__init__()
 
     def extract(self, selector):
-        res = selector.text()
-        res = res.strip()
+        res = selector.text().strip()
         return {self.fieldlabel: res}
 
-    # TODO
-    # Text.replace("$", "").lower()
-    # Text.not_strip (or Text.with_whitespace or Text.retain_spaces)
+        # TODO
+        # Text.replace("$", "").lower()
+        # Text.not_strip (or Text.with_whitespace or Text.retain_spaces)
+
+
+class Fragment(BaseLeaf):
+    """Leaf that stores the result of ``selector.fragment()`` under its field label; no stripping is applied."""
+
+    def __init__(self):
+        super(Fragment, self).__init__()
+
+    def extract(self, selector):
+        res = selector.fragment()
+        return {self.fieldlabel: res}
 
 
 class Attr(BaseLeaf):
@@ -260,8 +303,7 @@ def fill_thisclass_attr(self, cls):
         self.this_class = cls
 
 
-
 # not implemented:
 StrictNode = Node
 StrictHtml = Html
-ShallowText = Text
+ShallowText = Text
diff --git a/pagemodel/lxml.py b/pagemodel/lxml.py
@@ -0,0 +1,112 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from __future__ import absolute_import
+
+import six
+
+from pagemodel.html import BaseLeaf
+from pagemodel.pagemodel import PageModelMetaClass, BaseBasePageModel
+
+from lxml import etree
+from lxml.etree import XPath
+from lxml.cssselect import CSSSelector
+
+import html5lib
+
+
+class BasePageModel(six.with_metaclass(PageModelMetaClass, BaseBasePageModel)):
+    pass
+
+
+class PageModel(BasePageModel, BaseLeaf):
+    @classmethod
+    def extract_unboxed(cls, selector):
+        try:
+            res = cls.page_tree.extract(selector)
+            cls.postproc(res)
+            return cls.model_class(**res)
+        except ValueError as a:
+            raise ValueError(cls.__name__ + ": " + str(a))
+
+    def extract(self, selector):
+        res = self.extract_unboxed(selector)
+        return {self.fieldlabel: res}
+
+    def __new__(cls, page_text=None):
+        if page_text is None:
+            res = super(PageModel, cls).__new__(cls)
+            return res
+        else:
+            return cls.extract_unboxed(Selector(page_text))
+
+    @classmethod
+    def postproc(cls, dic):
+        return dic
+
+
+class Selector(object):
+    @classmethod
+    def _simple_html5_parser(cls, s):
+        """
+        Parses HTML code into tree (html namespace is converted to void namespace)
+        :param s: HTML code to parse
+        :return: Lxml tree representing the code
+        """
+        parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("lxml"), namespaceHTMLElements=False)
+        return parser.parse(s)
+
+    def __init__(self, arg, nsmap=None, css_translator='html', html_parser=None):
+        if not nsmap:
+            nsmap = {}
+        if not html_parser:
+            html_parser = Selector._simple_html5_parser
+        self.nsmap = nsmap
+        self.css_translator = css_translator
+        self.html_parser = html_parser
+        if isinstance(arg, six.string_types):
+            self.sel = self.html_parser(arg)
+        else:
+            self.sel = arg
+
+    def css(self, *paths):
+        """Return a list of nodes that satisfy any of provided
+        css paths.
+        """
+        sel_list = [CSSSelector(path, translator=self.css_translator, namespaces=self.nsmap)(self.sel) for path in paths]
+        sel_list = [el for chunk in sel_list for el in chunk]
+        sel_list = set(sel_list)  # NOTE(review): set() removes duplicates but does not keep the match order
+        return [Selector(sel, nsmap=self.nsmap, css_translator=self.css_translator, html_parser=self.html_parser) for sel in sel_list]
+
+    def xpath(self, *paths):
+        """Return a list of nodes that satisfy any of provided
+        xpath paths.
+        """
+        sel_list = [XPath(path, namespaces=self.nsmap)(self.sel) for path in paths]
+        sel_list = [el for chunk in sel_list for el in chunk]
+        sel_list = set(sel_list)  # NOTE(review): set() removes duplicates but does not keep the match order
+        return [Selector(sel, nsmap=self.nsmap, css_translator=self.css_translator, html_parser=self.html_parser) for sel in sel_list]
+
+    def name(self):
+        """Returns tag name or attribute name."""
+        return self.sel.tag
+
+    def text(self):
+        """Return all the text contained in a node as a string."""
+        res = ''.join([s for s in self.sel.itertext()])
+        return res
+
+    def fragment(self):
+        """Return ``etree.tostring(self.sel)``: the serialized markup of the current node itself."""
+        return etree.tostring(self.sel)
+
+    def textlist(self):
+        raise NotImplementedError()  # TODO: port from the bs4 backend, where it returns list(self.sel.strings)
+
+    def get_attr(self, attr_name):
+        """
+        Returns value of specified attribute of this tag.
+        :param attr_name: Name of the attribute.
+        :return: Value of the attribute.
+        """
+        return self.sel.get(attr_name)