Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Issue #105 - Modify ebooklib.utils#parse_string and ebooklib.utils#parse_html_stri… #115

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 28 additions & 14 deletions ebooklib/epub.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,20 @@ def __init__(self, code, msg):
def __str__(self):
return repr(self.msg)


# Parser


class EpubParser(object):
    """
    Mixin exposing the module-level parse helpers as methods that strip
    comments from the parsed document by default.
    """

    def parse_string(self, s, *args, **kwargs):
        """
        Parse an XML string, dropping XML comments unless told otherwise.

        Extra positional and keyword arguments are forwarded unchanged to
        the module-level parse_string helper. When the caller supplies no
        'parser' keyword, an etree.XMLParser(remove_comments=True) is used.
        """

        # Only fill in the comment-stripping parser as a default; an explicit
        # parser passed by the caller used to be silently overwritten here.
        if 'parser' not in kwargs:
            kwargs['parser'] = etree.XMLParser(remove_comments=True)

        return parse_string(s, *args, **kwargs)

    def parse_html_string(self, s, **kwargs):
        """
        Parse an HTML string, dropping comments unless told otherwise.

        Keyword arguments are forwarded unchanged to the module-level
        parse_html_string helper. 'remove_comments' defaults to True but an
        explicit value from the caller is respected.
        """

        # Default rather than force, so remove_comments=False is honoured.
        kwargs.setdefault('remove_comments', True)

        return parse_html_string(s, **kwargs)


# Items

class EpubItem(object):
Expand Down Expand Up @@ -226,7 +240,7 @@ def __str__(self):
return '<EpubCover:%s:%s>' % (self.id, self.file_name)


class EpubHtml(EpubItem):
class EpubHtml(EpubItem, EpubParser):
"""
Represents HTML document in the EPUB file.
"""
Expand Down Expand Up @@ -326,7 +340,7 @@ def get_body_content(self):
"""

try:
html_tree = parse_html_string(self.content)
html_tree = self.parse_html_string(self.content)
except:
return ''

Expand Down Expand Up @@ -358,7 +372,7 @@ def get_content(self, default=None):
Returns content of this document.
"""

tree = parse_string(self.book.get_template(self._template_name))
tree = self.parse_string(self.book.get_template(self._template_name))
tree_root = tree.getroot()

tree_root.set('lang', self.lang or self.book.language)
Expand All @@ -368,7 +382,7 @@ def get_content(self, default=None):
# <meta charset="utf-8" />

try:
html_tree = parse_html_string(self.content)
html_tree = self.parse_html_string(self.content)
except:
return ''

Expand Down Expand Up @@ -417,7 +431,7 @@ def __str__(self):
return '<EpubHtml:%s:%s>' % (self.id, self.file_name)


class EpubCoverHtml(EpubHtml):
class EpubCoverHtml(EpubHtml, EpubParser):
"""
Represents Cover page in the EPUB file.
"""
Expand Down Expand Up @@ -448,7 +462,7 @@ def get_content(self):

self.content = self.book.get_template('cover')

tree = parse_string(super(EpubCoverHtml, self).get_content())
tree = self.parse_string(super(EpubCoverHtml, self).get_content())
tree_root = tree.getroot()

images = tree_root.xpath('//xhtml:img', namespaces={'xhtml': NAMESPACES['XHTML']})
Expand Down Expand Up @@ -812,7 +826,7 @@ def add_prefix(self, name, uri):
self.prefixes.append('%s: %s' % (name, uri))


class EpubWriter(object):
class EpubWriter(EpubParser):
DEFAULT_OPTIONS = {
'epub2_guide': True,
'epub3_landmark': True,
Expand Down Expand Up @@ -1018,7 +1032,7 @@ def _write_opf_file(self):

def _get_nav(self, item):
# just a basic navigation for now
nav_xml = parse_string(self.book.get_template('nav'))
nav_xml = self.parse_string(self.book.get_template('nav'))
root = nav_xml.getroot()

root.set('lang', self.book.language)
Expand Down Expand Up @@ -1109,7 +1123,7 @@ def _create_section(itm, items):
def _get_ncx(self):

# we should be able to setup language for NCX as also
ncx = parse_string(self.book.get_template('ncx'))
ncx = self.parse_string(self.book.get_template('ncx'))
root = ncx.getroot()

head = etree.SubElement(root, 'head')
Expand Down Expand Up @@ -1213,7 +1227,7 @@ def write(self):
self.out.close()


class EpubReader(object):
class EpubReader(EpubParser):
DEFAULT_OPTIONS = {}

def __init__(self, epub_file_name, options=None):
Expand Down Expand Up @@ -1251,7 +1265,7 @@ def read_file(self, name):

def _load_container(self):
meta_inf = self.read_file('META-INF/container.xml')
tree = parse_string(meta_inf)
tree = self.parse_string(meta_inf)

for root_file in tree.findall('//xmlns:rootfile[@media-type]', namespaces={'xmlns': NAMESPACES['CONTAINERNS']}):
if root_file.get('media-type') == "application/oebps-package+xml":
Expand Down Expand Up @@ -1385,7 +1399,7 @@ def _load_manifest(self):
self.book.add_item(ei)

def _parse_ncx(self, data):
tree = parse_string(data)
tree = self.parse_string(data)
tree_root = tree.getroot()

nav_map = tree_root.find('{%s}navMap' % NAMESPACES['DAISY'])
Expand Down Expand Up @@ -1414,7 +1428,7 @@ def _get_children(elems, n, nid):
self.book.toc = _get_children(nav_map, 0, '')

def _parse_nav(self, data, base_path):
html_node = parse_html_string(data)
html_node = self.parse_html_string(data)
nav_node = html_node.xpath("//nav[@*='toc']")[0]

def parse_list(list_node):
Expand Down Expand Up @@ -1472,7 +1486,7 @@ def _load_opf_file(self):
except KeyError:
raise EpubException(-1, 'Can not find container file')

self.container = parse_string(s)
self.container = self.parse_string(s)

self._load_metadata()
self._load_manifest()
Expand Down
10 changes: 5 additions & 5 deletions ebooklib/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,19 +30,19 @@ def debug(obj):
pp.pprint(obj)


def parse_string(s, *args, **kwargs):
    """
    Parse an XML document held in memory and return the parsed tree.

    s may be text (it is encoded as UTF-8 before parsing) or an already
    encoded byte string (it is parsed as is). Any extra positional and
    keyword arguments are forwarded unchanged to etree.parse.
    """

    # Guard only the text -> bytes conversion. Previously a bare except
    # wrapped the parse as well, so a genuine parse error on text input
    # triggered a retry with io.BytesIO(s) that failed with an unrelated
    # error and hid the real cause.
    try:
        data = s.encode('utf-8')
    except (AttributeError, UnicodeError):
        # No usable .encode(): treat s as already-encoded bytes.
        data = s

    return etree.parse(io.BytesIO(data), *args, **kwargs)


def parse_html_string(s):
def parse_html_string(s, **kwargs):
from lxml import html

utf8_parser = html.HTMLParser(encoding='utf-8')
utf8_parser = html.HTMLParser(encoding='utf-8', **kwargs)

html_tree = html.document_fromstring(s, parser=utf8_parser)

Expand Down