# Reconstructed from patch e03ef53dd5b684b08ff7c948a09a0f56a63cb965
# ("Use urlparse in get_domain.", Chad Estioco, 2019-07-23) against
# microdata.py, with the defects in the patched function corrected.
def get_domain(url_string):
    """
    Get the domain _including_ the protocol specified, if any.

    When ``url_string`` has the form ``scheme://authority...`` the result is
    ``scheme://authority`` (the scheme is lower-cased by ``urlparse``; the
    authority keeps any port or userinfo).  Otherwise the result is
    everything before the first ``/`` of ``url_string``.

    :param url_string: absolute or scheme-less URL text.
    :return: the protocol-qualified domain, or the leading path segment when
        no ``scheme://`` prefix is present.
    """
    # Imported here so the function is self-contained: Python 3 exposes
    # urlparse under urllib.parse, Python 2 as a top-level module.
    try:
        from urllib.parse import urlparse
    except ImportError:
        from urlparse import urlparse

    parsed = urlparse(url_string)

    # A bare scheme is not enough: urlparse also reports one for inputs such
    # as "example.com:8080/path" or "mailto:user@example.com".  Only treat
    # the URL as protocol-qualified when "//" directly follows the first
    # colon, which is what the original '"://" in url_string' test meant.
    after_colon = url_string.partition(":")[2]
    if parsed.scheme and after_colon.startswith("//"):
        # str.join takes a single iterable and would drop the colon anyway,
        # so the prefix is formatted explicitly.
        return "%s://%s" % (parsed.scheme, parsed.netloc)

    return url_string.split("/")[0]