Skip to content

Commit

Permalink
Merge pull request #1 from wojtex/master
Browse files Browse the repository at this point in the history
Added more Python 2/3 portability and XPath support.
  • Loading branch information
franekp committed Mar 9, 2016
2 parents 7998074 + d1f2eea commit c52a582
Show file tree
Hide file tree
Showing 5 changed files with 226 additions and 55 deletions.
1 change: 1 addition & 0 deletions pagemodel/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from . import html
from .html import (Node, StrictNode, Text, ShallowText,
Html, StrictHtml, ThisClass)

41 changes: 15 additions & 26 deletions pagemodel/bsoup.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,33 +4,11 @@
import six

from pagemodel.html import BaseNode, BaseLeaf, Base
from pagemodel.pagemodel import PageModelMetaClass, BaseBasePageModel

import bs4


class PageModelMetaClass(type):
    """Metaclass that checks PageModel subclasses at class-creation time.

    Classes named 'PageModel' or 'BasePageModel' are created without any
    checks. Every other class must define both 'model_class' and
    'page_tree' in its own body, and 'page_tree' must be a BaseNode that
    passes its own ``validate()``.
    """

    def __new__(cls, name, bases, attrs):
        make = super(PageModelMetaClass, cls).__new__
        # The framework's own base classes carry no page description.
        if name == 'PageModel' or name == 'BasePageModel':
            return make(cls, name, bases, attrs)
        for required in ('model_class', 'page_tree'):
            if required not in attrs:
                raise TypeError("Subclasses of PageModel must declare "
                                "'%s' attribute." % required)
        tree = attrs['page_tree']
        if not isinstance(tree, BaseNode):
            raise TypeError("Invalid type of 'page_tree' attribute.")
        tree.validate()
        new_class = make(cls, name, bases, attrs)
        # Hand the finished class to the tree once it exists.
        tree.fill_thisclass_attr(new_class)
        return new_class


class BaseBasePageModel(object):
    """Empty root class that BasePageModel pairs with PageModelMetaClass."""


class BasePageModel(six.with_metaclass(PageModelMetaClass, BaseBasePageModel)):
    """BaseBasePageModel with PageModelMetaClass applied via six (Python 2/3)."""

Expand Down Expand Up @@ -63,7 +41,7 @@ def postproc(cls, dic):

class Selector(object):
def __init__(self, arg):
if isinstance(arg, basestring):
if isinstance(arg, six.string_types):
self.sel = bs4.BeautifulSoup(arg, "html.parser")
else:
self.sel = arg
Expand All @@ -80,11 +58,22 @@ def css(self, *paths):
return [Selector(sel) for sel in sel_list]

def text(self):
    """
    Return all the text contained in a node as a string.
    :return: String containing all the text inside the node, w/o tags.
    """
    node_text = self.sel.get_text()
    return node_text

def textlist(self):
    """Return the items of the wrapped node's ``strings`` iterable as a list."""
    return [piece for piece in self.sel.strings]

def get_attr(self, attr_name):
    """
    Returns value of specified attribute of this tag.
    :param attr_name: Name of the attribute.
    :return: Value of the attribute; a list value is joined with single spaces.
    """
    value = self.sel[attr_name]
    # A list-valued attribute is collapsed into one space-separated string.
    return ' '.join(value) if isinstance(value, list) else value
100 changes: 71 additions & 29 deletions pagemodel/html.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,12 @@
from collections import Counter
import six


# TODO:
# - flatten_lists switch in PageModel subclasses - when lists are nested, they
# are flattened
# - If(Attr("class") == "myclass") (...)
# - ParentNode node-like thing to access parent node space from child nodes
# - Node.nth(num)("<css-selector>"), Node.first, Node.second, Node.third
# - more accurate, informative error handling


class Base(object):
def extract(self, selector):
Expand All @@ -28,7 +26,7 @@ def fill_thisclass_attr(self, cls):
class BaseNode(Base):
def __init__(self, *args, **kwargs):
self.child_nodes = []
for i in list(args) + kwargs.values():
for i in list(args) + list(kwargs.values()):
if isinstance(i, Base):
self.child_nodes.append(i)
else:
Expand All @@ -42,9 +40,9 @@ def __init__(self, *args, **kwargs):

def set_fieldlabel(self, name):
    """Reject an attempt to bind this node-like object to a model field.

    :param name: Field label that was about to be assigned (unused).
    :raises TypeError: Always; the message names this object's type.
    """
    # Adjacent string literals are joined verbatim, so every fragment must
    # end with its own space or the words run together in the message.
    raise TypeError(
        "You cannot store a node-like "
        "object '{}' in model's field. You can only store leaf-like things "
        "such as Text, ShallowText and instances of other models.".format(
            type(self).__name__))

def get_fieldlabels(self):
"""Return a Counter of field labels. This method is only for validation
Expand Down Expand Up @@ -94,12 +92,11 @@ def fill_thisclass_attr(self, cls):
pass



class Html(BaseNode):
pass


class FullNode(BaseNode):
class BaseSelectorFullNode(BaseNode):
@classmethod
def reduce_dict_list(cls, dlist):
res = {}
Expand All @@ -126,10 +123,13 @@ def takefirst_dict_list(cls, dlist):
raise ValueError("take_first applied to an empty list!")
return res

def _make_select(self, selector, *alts):
raise NotImplementedError

def extract(self, selector):
sel_list = selector.css(*self.node.alts)
sel_list = self._make_select(selector, *self.node.alts)
self.node.validate_sel_list_len(len(sel_list))
res_list = [super(FullNode, self).extract(sel) for sel in sel_list]
res_list = [super(BaseSelectorFullNode, self).extract(sel) for sel in sel_list]
if self.node.is_list:
if self.node.concat_sep is not None:
return self.concat_dict_list(res_list, self.node.concat_sep)
Expand All @@ -144,26 +144,29 @@ def extract(self, selector):
return {}


class Node(BaseNode):
class BaseSelectorNode(BaseNode):
def __init__(self, *args):
self.alts = []
self.is_opt = False
self.is_list = False
self.concat_sep = None
self.is_take_first = False
for i in args:
if isinstance(i, basestring):
if isinstance(i, six.string_types):
self.alts.append(i)
else:
raise TypeError("Invalid argument '%s' of type: '%s'. "
"Expected a string with a css path here." % (
str(i), type(i).__name__
)
)
super(Node, self).__init__()
"Expected a string with a selector path here." % (
str(i), type(i).__name__
)
)
super(BaseSelectorNode, self).__init__()

def _make_full(self, *args, **kwargs):
raise NotImplementedError

def __call__(self, *args, **kwargs):
res = FullNode(*args, **kwargs)
res = self._make_full(*args, **kwargs)
res.node = self
return res

Expand Down Expand Up @@ -201,31 +204,71 @@ def validate_sel_list_len(self, size):
else:
if size > 1:
raise ValueError("Multiple html tags for a non-list node "
"'{}'.".format(" | ".join(self.alts)))
"'{}'.".format(" | ".join(self.alts)))
if size == 0 and (not self.is_opt):
raise ValueError("Missing html tag for a non-optional node "
"'{}'.".format(" | ".join(self.alts)))
"'{}'.".format(" | ".join(self.alts)))

def _make_select(self, selector, *alts):
raise NotImplementedError

def extract(self, selector):
    """Only check if the data is correct.

    Runs this node's alternative paths against ``selector`` and validates
    the number of matches. A node used this way contributes no fields.

    :param selector: Selector the paths are run against.
    :return: Always an empty dict.
    :raises ValueError: From ``validate_sel_list_len`` when the number of
        matches is not acceptable for this node.
    """
    # _make_select takes the paths as *alts; passing the list itself would
    # hand it a single list-valued "path" instead of each alternative.
    size = len(self._make_select(selector, *self.alts))
    self.validate_sel_list_len(size)
    return {}


class FullNode(BaseSelectorFullNode):
    """BaseSelectorFullNode whose paths are resolved with ``selector.css``."""

    def _make_select(self, selector, *alts):
        matches = selector.css(*alts)
        return matches


class Node(BaseSelectorNode):
    """BaseSelectorNode for CSS paths; its full-node form is FullNode."""

    def _make_full(self, *args, **kwargs):
        full = FullNode(*args, **kwargs)
        return full

    def _make_select(self, selector, *alts):
        matches = selector.css(*alts)
        return matches


class FullXPath(BaseSelectorFullNode):
    """BaseSelectorFullNode whose paths are resolved with ``selector.xpath``."""

    def _make_select(self, selector, *alts):
        matches = selector.xpath(*alts)
        return matches


class XPath(BaseSelectorNode):
    """BaseSelectorNode for XPath paths; its full-node form is FullXPath."""

    def _make_full(self, *args, **kwargs):
        full = FullXPath(*args, **kwargs)
        return full

    def _make_select(self, selector, *alts):
        matches = selector.xpath(*alts)
        return matches


class Text(BaseLeaf):
    """Leaf that stores a node's text under its field label.

    Whitespace at the beginning and the end of the text is stripped.
    """

    def __init__(self):
        super(Text, self).__init__()

    def extract(self, selector):
        """Return ``{fieldlabel: stripped text of selector}``."""
        stripped = selector.text().strip()
        return {self.fieldlabel: stripped}

# TODO
# Text.replace("$", "").lower()
# Text.not_strip (or Text.with_whitespace or Text.retain_spaces)
# TODO
# Text.replace("$", "").lower()
# Text.not_strip (or Text.with_whitespace or Text.retain_spaces)


class Fragment(BaseLeaf):
    """Leaf that stores the result of ``selector.fragment()`` under its field label.

    Unlike Text, the value is stored exactly as the selector returns it;
    no whitespace stripping is applied.
    """

    def __init__(self):
        super(Fragment, self).__init__()

    def extract(self, selector):
        """Return ``{fieldlabel: selector.fragment()}``.

        :param selector: Selector wrapping the node to serialize; it must
            provide a ``fragment()`` method.
        :return: Single-entry dict keyed by this leaf's field label.
        """
        res = selector.fragment()
        return {self.fieldlabel: res}


class Attr(BaseLeaf):
Expand Down Expand Up @@ -260,8 +303,7 @@ def fill_thisclass_attr(self, cls):
self.this_class = cls



# not implemented:
StrictNode = Node
StrictHtml = Html
ShallowText = Text
ShallowText = Text
112 changes: 112 additions & 0 deletions pagemodel/lxml.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import absolute_import

import six

from pagemodel.html import BaseLeaf
from pagemodel.pagemodel import PageModelMetaClass, BaseBasePageModel

from lxml import etree
from lxml.etree import XPath
from lxml.cssselect import CSSSelector

import html5lib


class BasePageModel(six.with_metaclass(PageModelMetaClass, BaseBasePageModel)):
    """BaseBasePageModel with PageModelMetaClass applied via six (Python 2/3)."""


class PageModel(BasePageModel, BaseLeaf):
    """Page model that parses its input with this module's Selector.

    Calling the class with page text returns an extracted ``model_class``
    instance. Calling it with no argument creates an ordinary PageModel
    instance.
    """

    @classmethod
    def extract_unboxed(cls, selector):
        """Extract fields from ``selector`` and build ``model_class`` from them.

        ``postproc`` is called on the extracted dict but its return value is
        not used, so any post-processing has to modify the dict in place.

        :raises ValueError: Re-raised with the class name prefixed.
        """
        try:
            fields = cls.page_tree.extract(selector)
            cls.postproc(fields)
            return cls.model_class(**fields)
        except ValueError as err:
            raise ValueError("%s: %s" % (cls.__name__, str(err)))

    def extract(self, selector):
        """Return ``{fieldlabel: extracted model instance}``."""
        return {self.fieldlabel: self.extract_unboxed(selector)}

    def __new__(cls, page_text=None):
        # With page text the "constructor" acts as a parser and returns the
        # extracted model object instead of a PageModel instance.
        if page_text is not None:
            return cls.extract_unboxed(Selector(page_text))
        return super(PageModel, cls).__new__(cls)

    @classmethod
    def postproc(cls, dic):
        """Hook called on the extracted dict; the default returns it untouched."""
        return dic


class Selector(object):
    """Wrapper around an lxml node offering CSS and XPath lookups.

    Lookups return new Selector objects that share this one's namespace
    map, CSS translator and HTML parser.
    """

    @classmethod
    def _simple_html5_parser(cls, s):
        """
        Parses HTML code into tree (html namespace is converted to void namespace)
        :param s: HTML code to parse
        :return: Lxml tree representing the code
        """
        parser = html5lib.HTMLParser(
            tree=html5lib.treebuilders.getTreeBuilder("lxml"),
            namespaceHTMLElements=False)
        return parser.parse(s)

    @staticmethod
    def _unique_in_order(items):
        """Return ``items`` as a list without duplicates, first occurrence kept.

        A plain ``set()`` would also drop duplicates but discards the order
        in which matches were found, making list extraction order arbitrary.
        """
        seen = set()
        unique = []
        for item in items:
            if item not in seen:
                seen.add(item)
                unique.append(item)
        return unique

    def __init__(self, arg, nsmap=None, css_translator='html', html_parser=None):
        """
        :param arg: Page text to parse, or an already parsed node to wrap.
        :param nsmap: Namespace prefix map passed to CSS/XPath lookups.
        :param css_translator: Translator passed to ``CSSSelector``.
        :param html_parser: Callable turning page text into a tree; defaults
            to the html5lib-based parser of this class.
        """
        if not nsmap:
            nsmap = {}
        if not html_parser:
            html_parser = Selector._simple_html5_parser
        self.nsmap = nsmap
        self.css_translator = css_translator
        self.html_parser = html_parser
        if isinstance(arg, six.string_types):
            self.sel = self.html_parser(arg)
        else:
            self.sel = arg

    def _wrap(self, node):
        """Wrap ``node`` in a Selector that inherits this one's settings."""
        return Selector(node, nsmap=self.nsmap,
                        css_translator=self.css_translator,
                        html_parser=self.html_parser)

    def css(self, *paths):
        """Return a list of nodes that satisfy any of provided
        css paths.

        Each node appears once, ordered by first match (paths in the order
        given, matches in the order each path returns them).
        """
        matches = []
        for path in paths:
            find = CSSSelector(path, translator=self.css_translator,
                               namespaces=self.nsmap)
            matches.extend(find(self.sel))
        return [self._wrap(node) for node in self._unique_in_order(matches)]

    def xpath(self, *paths):
        """Return a list of nodes that satisfy any of provided
        xpath paths.

        Each node appears once, ordered by first match (paths in the order
        given, matches in the order each path returns them).
        """
        matches = []
        for path in paths:
            find = XPath(path, namespaces=self.nsmap)
            matches.extend(find(self.sel))
        return [self._wrap(node) for node in self._unique_in_order(matches)]

    def name(self):
        """Returns tag name or attribute name."""
        return self.sel.tag

    def text(self):
        """Return all the text contained in a node as a string."""
        return ''.join(self.sel.itertext())

    def fragment(self):
        """Return the current element serialized with ``etree.tostring``."""
        return etree.tostring(self.sel)

    def textlist(self):
        """Not supported by this selector; always raises NotImplementedError."""
        raise NotImplementedError()  # do not know what it does

    def get_attr(self, attr_name):
        """
        Returns value of specified attribute of this tag.
        :param attr_name: Name of the attribute.
        :return: Value of the attribute.
        """
        return self.sel.get(attr_name)
Loading

0 comments on commit c52a582

Please sign in to comment.