diff --git a/microdata.py b/microdata.py index 9cdb83c..2e03a30 100755 --- a/microdata.py +++ b/microdata.py @@ -4,6 +4,7 @@ import html5lib from collections import defaultdict +from urlparse import urlparse try: @@ -17,13 +18,23 @@ def get_items(location, encoding=None): Pass in a string or file-like object and get a list of Items present in the HTML document. """ + try: + from urllib.request import urlopen + except ImportError: + from urllib import urlopen + dom_builder = html5lib.treebuilders.getTreeBuilder("dom") parser = html5lib.HTMLParser(tree=dom_builder) - if encoding: - tree = parser.parse(location, transport_encoding=encoding) - else: - tree = parser.parse(location) - return _find_items(tree) + + try: + if encoding: + tree = parser.parse(location, transport_encoding=encoding) + else: + tree = parser.parse(location) + except ValueError: + # Try opening it as a local file + tree = parser.parse(open(location), encoding=encoding) + return _find_items(tree, URI.get_domain(location)) class Item(object): @@ -33,15 +44,15 @@ class Item(object): or another Item. """ - def __init__(self, itemtype=None, itemid=None): + def __init__(self, itemtype=None, itemid=None, domain=""): """Create an Item, with an optional itemptype and/or itemid. """ # itemtype can be a space delimited list if itemtype: - self.itemtype = [URI(i) for i in itemtype.split(" ")] + self.itemtype = [URI(i, domain=domain) for i in itemtype.split(" ")] if itemid: - self.itemid = URI(itemid) + self.itemid = URI(itemid, domain=domain) self.props = {} @@ -108,8 +119,11 @@ def json_dict(self): class URI(object): - def __init__(self, string): - self.string = string + def __init__(self, string, domain=""): + if string.startswith("http://") or string.startswith("https://"): + self.string = string + else: + self.string = "/".join(("http:", "", domain, string)) def __eq__(self, other): if isinstance(other, URI): @@ -119,6 +133,16 @@ def __eq__(self, other): def __repr__(self): return self.string + @staticmethod + def get_domain(url_string): + """ + Get the domain _including_ the protocol specified, if any. + """ + parsed = urlparse(url_string) + if parsed.scheme: + return "/".join((parsed.scheme, "", parsed.netloc)) + else: + return url_string.split("/")[0] # what follows are the guts of extracting the Items from a DOM @@ -138,23 +162,23 @@ def __repr__(self): } -def _find_items(e): +def _find_items(e, domain=""): items = [] unlinked = [] if _is_element(e) and _is_itemscope(e): - item = _make_item(e) - unlinked = _extract(e, item) + item = _make_item(e, domain=domain) + unlinked = _extract(e, item, domain=domain) items.append(item) for unlinked_element in unlinked: - items.extend(_find_items(unlinked_element)) + items.extend(_find_items(unlinked_element, domain=domain)) else: for child in e.childNodes: - items.extend(_find_items(child)) + items.extend(_find_items(child, domain=domain)) return items -def _extract(e, item): +def _extract(e, item, domain=""): # looks in a DOM element for microdata to assign to an Item # _extract returns a list of elements which appeared to have microdata # but which were not directly related to the Item that was passed in @@ -165,19 +189,19 @@ def _extract(e, item): itemscope = _is_itemscope(child) if itemprop and itemscope: for i in itemprop.split(" "): - nested_item = _make_item(child) - unlinked.extend(_extract(child, nested_item)) + nested_item = _make_item(child, domain=domain) + unlinked.extend(_extract(child, nested_item, domain=domain)) item.set(i, nested_item) elif itemprop: - value = _property_value(child) + value = _property_value(child, domain=domain) # itemprops may also be in a space delimited list for i in itemprop.split(" "): item.set(i, value) - unlinked.extend(_extract(child, item)) + unlinked.extend(_extract(child, item, domain=domain)) elif itemscope: unlinked.append(child) else: - unlinked.extend(_extract(child, item)) + unlinked.extend(_extract(child, item, domain=domain)) return unlinked @@ -198,11 +222,11 @@ def _is_itemscope(e): return _attr(e, "itemscope") is not None -def _property_value(e): +def _property_value(e, domain=""): value = None attrib = property_values.get(e.tagName, None) if attrib in ["href", "src"]: - value = URI(e.getAttribute(attrib)) + value = URI(e.getAttribute(attrib), domain) elif attrib: value = e.getAttribute(attrib) else: @@ -221,20 +245,15 @@ def _text(e): return ''.join(chunks) -def _make_item(e): +def _make_item(e, domain=""): if not _is_itemscope(e): raise Exception("element is not an Item") itemtype = _attr(e, "itemtype") itemid = _attr(e, "itemid") - return Item(itemtype, itemid) + return Item(itemtype, itemid, domain=domain) if __name__ == "__main__": - try: - from urllib.request import urlopen - except ImportError: - from urllib import urlopen - if len(sys.argv) < 2: print("Usage: %s URL [...]" % sys.argv[0]) sys.exit(1) @@ -245,7 +264,7 @@ def _make_item(e): microdata = {} microdata['items'] = items = [] - for item in get_items(urlopen(url)): + for item in get_items(url): items.append(item.json_dict()) print(json.dumps(microdata, indent=2)) diff --git a/test.py b/test.py index 00bcae9..bd62a85 100644 --- a/test.py +++ b/test.py @@ -12,7 +12,7 @@ class MicrodataParserTest(unittest.TestCase): def test_parse(self): # parse the html for microdata - items = get_items(open("test-data/example.html")) + items = get_items("test-data/example.html") # this html should have just one main item self.assertTrue(len(items), 1) @@ -55,7 +55,7 @@ def test_parse(self): def test_parse_nested(self): # parse the html for microdata - items = get_items(open("test-data/example-nested.html")) + items = get_items("test-data/example-nested.html") # this html should have just one main item self.assertTrue(len(items), 1) @@ -71,7 +71,7 @@ def test_parse_nested(self): # test case of a nested itemscope self.assertTrue(isinstance(item.location, Item)) self.assertEqual(item.location.itemtype, [URI("http://schema.org/Place")]) - self.assertEqual(item.location.url, URI("wells-fargo-center.html")) + self.assertEqual(item.location.url, URI("wells-fargo-center.html", domain="test-data")) # address should be a nested item self.assertTrue(isinstance(item.location.address, Item)) @@ -82,14 +82,14 @@ def test_parse_nested(self): i = json.loads(item.json()) self.assertEqual(i["properties"]["name"][0].strip(), "Miami Heat at Philadelphia 76ers - Game 3 (Home Game 1)") self.assertEqual(i["type"], ["http://schema.org/Event"]) - self.assertEqual(i["properties"]["url"], ["nba-miami-philidelphia-game3.html"]) + self.assertEqual(i["properties"]["url"], ["http://test-data/nba-miami-philidelphia-game3.html"]) self.assertTrue(isinstance(i["properties"]["location"][0], dict)) - self.assertEqual(i["properties"]["location"][0]["properties"]["url"][0], "wells-fargo-center.html") + self.assertEqual(i["properties"]["location"][0]["properties"]["url"][0], "http://test-data/wells-fargo-center.html") self.assertTrue(isinstance(i["properties"]["location"][0]["properties"]["address"][0], dict)) self.assertEqual(i["properties"]["location"][0]["properties"]["address"][0]["properties"]["addressLocality"][0], "Philadelphia") def test_parse_unlinked(self): - items = get_items(open("test-data/unlinked.html")) + items = get_items("test-data/unlinked.html") self.assertEqual(len(items), 2) i = items[0] @@ -108,7 +108,7 @@ def test_parse_unlinked(self): self.assertTrue('Whitworth' in i.streetAddress) def test_skip_level(self): - items = get_items(open("test-data/skip-level.html")) + items = get_items("test-data/skip-level.html") self.assertEqual(len(items), 1) self.assertEqual(items[0].name, "Jane Doe") @@ -137,5 +137,19 @@ def test_parse_multiple_props(self): self.assertTrue(i["properties"]["favorite-color"][0], "orange") self.assertTrue(i["properties"]["favorite-fruit"][0], "orange") + +class URITest(unittest.TestCase): + + def test_get_domain(self): + https_start = "https://github.com/edsu/microdata" + self.assertEqual("https://github.com", URI.get_domain(https_start)) + + no_https = "github.com/edsu/microdata" + self.assertEqual("github.com", URI.get_domain(no_https)) + + plain = "github.com" + self.assertEqual(plain, URI.get_domain(plain)) + + if __name__ == "__main__": unittest.main()