From 43fabcea3b6c251d57a190792d05811341d7fa31 Mon Sep 17 00:00:00 2001 From: Daniel Daphron Kaczmarek Date: Fri, 21 Aug 2015 21:42:19 -0500 Subject: [PATCH] Fixed url cleaning for blogspot archives --- rsstory/scrapers/siteRules/blogspot.py | 7 ++++++- rsstory/scrapers/tools.py | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/rsstory/scrapers/siteRules/blogspot.py b/rsstory/scrapers/siteRules/blogspot.py index 906ee56..a54cd06 100644 --- a/rsstory/scrapers/siteRules/blogspot.py +++ b/rsstory/scrapers/siteRules/blogspot.py @@ -4,6 +4,9 @@ from tld import get_tld import re import rsstory.scrapers.tools as tools +import logging + +log = logging.getLogger(__name__) http = urllib3.PoolManager( cert_reqs='CERT_REQUIRED', @@ -40,6 +43,7 @@ def get_monthly_archive_urls(links, page_url): def get_post_from_month(month_url): '''Returns the individual posts from the input monthly archive url''' + log.debug("Getting posts from month url: {}".format(month_url)) try: r = http.request('GET', month_url) except urllib3.exceptions.SSLError as e: @@ -54,7 +58,7 @@ def get_post_from_month(month_url): post_links = [] for link in links: try: - url = link.attrs['href'] + url = tools.clean_url(link.attrs['href']) match = url.startswith(tools.clean_url(month_url_base) + year + "/" + month + "/") if match: post_links.append(link) @@ -63,6 +67,7 @@ def get_post_from_month(month_url): pass post_links.reverse() # Reverse the order to get oldest posts first + log.debug("Posts from month url: {} are \n {}".format(month_url, post_links)) return post_links def remove_duplicates(links): diff --git a/rsstory/scrapers/tools.py b/rsstory/scrapers/tools.py index ac3cd04..3b9c479 100644 --- a/rsstory/scrapers/tools.py +++ b/rsstory/scrapers/tools.py @@ -23,7 +23,7 @@ def clean_url(url): '''Removes protocol from url and ensures a trailing slash exists''' url = url.replace("http://", "").replace("https://", "") if url[-1] != "/": - url.append("/") + url += "/" return url