From b4800882625c3cf0e48a13344d5cfaa4a4f43d1c Mon Sep 17 00:00:00 2001
From: pyup-bot
Date: Thu, 19 Mar 2020 21:24:18 +1300
Subject: [PATCH 1/6] Update pyyaml from 5.1.1 to 5.3.1

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index a9f3e3a..2379eec 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,2 @@
-PyYaml==5.1.1
+PyYaml==5.3.1
 requests==2.22.0

From f5e0ec4405ccf67bf027be22d1873defd60af56e Mon Sep 17 00:00:00 2001
From: eAlasdair
Date: Mon, 11 May 2020 15:55:05 +1200
Subject: [PATCH 2/6] Add logic to cope with 429 too many requests errors

---
 linkie/linkie.py | 62 ++++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 56 insertions(+), 6 deletions(-)

diff --git a/linkie/linkie.py b/linkie/linkie.py
index 4041a44..3857a89 100644
--- a/linkie/linkie.py
+++ b/linkie/linkie.py
@@ -6,7 +6,9 @@
 import yaml
 import logging
 import requests
+import time
 from multiprocessing.dummy import Pool as ThreadPool
+from multiprocessing.dummy import Lock
 
 # This isn't a perfect URL matcher, but should catch the large majority of URLs.
 # This now matches URLs presented in the format defined in the CSU Writing Guide
@@ -18,6 +20,7 @@
     'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36'
 }
 THREADS = 12
+TIMEOUT = 60  # seconds
 
 
 class Linkie:
@@ -26,9 +29,12 @@ def __init__(self, config=None, config_file_path=None):
         self.file_count = 0
         self.status_counts = {}
         self.urls = dict()
+        self.domains = dict()
         self.unchecked_urls = set()
+        self.delayed_urls = []
         self.directory = '.'
         self.pool = ThreadPool(THREADS)
+        self.lock = Lock()
         if not config and config_file_path:
             logging.info('Using Linkie configuration file {}'.format(config_file_path))
             config = self.read_config(config_file_path)
@@ -90,6 +96,13 @@ def process_config(self, config):
                 file_types[i] = '.' + file_types[i]
             config['file-types'] = tuple(file_types)
         return config
+
+    def get_domain(self, url):
+        # Return everything before the third '/'
+        # e.g. https://example.com/subpage/?hello-there&general-kenobi
+        # becomes https://example.com
+        url_parts = url.split('/')
+        return '/'.join(url_parts[:3])
 
     def count_broken_links(self):
         count = 0
@@ -122,6 +135,19 @@ def traverse_directory(self):
                     self.search_file(os.path.join(directory_root, filename))
         self.pool.map(self.check_link, self.unchecked_urls)
 
+        repeat_count = 1
+        max_repeats = 1000000
+        while len(self.delayed_urls) > 0 and repeat_count <= max_repeats:
+            # Many iterations are expected because the timeout may still be going each time this repeats itself, so thepool map will end immediately
+            # Only uncomment this line if debugging locally
+            # print('Retrying delayed urls **MANY ITERATIONS ARE EXPECTED** #{}'.format(repeat_count), end='\r')
+            repeat_urls = self.delayed_urls[:]
+            self.delayed_urls = []
+            self.pool.map(self.check_link, repeat_urls)
+            repeat_count += 1
+        if repeat_count > max_repeats:
+            logging.critical("Infinite loop in retrying delayed urls. The timeout period can't have ended!")
+
     def traverse_connection_errors(self):
         connect_errors = []
         for url, url_data in self.urls.items():
@@ -151,12 +177,26 @@ def search_file(self, file_path):
                 # [Wikipedia link](http://foo.com/blah_blah_(wikipedia))
                 if url.count('('):
                     url += url.count('(') * ')'
+                self.domains[self.get_domain(url)] = -1
                 self.unchecked_urls.add(url)
 
     def check_link(self, url):
-        message = ' - Checking URL {} '.format(url)
+        domain = self.get_domain(url)
+        self.lock.acquire()
+        time_at_429 = self.domains[domain]
+        is_ready = time_at_429 < 0 or time.perf_counter() - time_at_429 > TIMEOUT
+        if is_ready:
+            self.domains[domain] = -1
+        self.lock.release()
+        if not is_ready:
+            # Put the url back to be checked later
+            self.lock.acquire()
+            self.delayed_urls.append(url)
+            self.lock.release()
+            return
+        message = ' - '
         if url in self.config['skip-urls']:
-            message += '= skipping URL (as defined in config file)'
+            message += 'Skipping URL (as defined in config file)'
         elif url not in self.urls:
             try:
                 status_code = requests.head(url, headers=HEADERS).status_code
@@ -168,16 +208,26 @@
                 status_code = str(type(e).__name__)
 
             if type(status_code) == str:
-                message += '= {}'.format(status_code)
+                message += '{}'.format(status_code)
             else:
-                message += '= {} status'.format(status_code)
+                message += 'Status {}'.format(status_code)
 
             if type(status_code) == str or status_code >= 400:
-                self.save_url(url, status_code, True)
+                if status_code == 429: # Too many requests
+                    message += " => Delaying requests to the domain {} for {} seconds".format(domain, TIMEOUT)
+                    self.lock.acquire()
+                    # Save the time the request was made
+                    self.domains[domain] = time.perf_counter()
+                    # Put the url back to be checked again later
+                    self.delayed_urls.append(url)
+                    self.lock.release()
+                else:
+                    self.save_url(url, status_code, True)
             else:
                 self.save_url(url, status_code, False)
         else:
-            message += '= {} (already checked)'.format(self.urls[url]['status'])
+            message += '{} (already checked)'.format(self.urls[url]['status'])
+        message += ' = {}'.format(url)
         logging.info(message)
 
     def save_url(self, url, status_code, broken):

From a4fd686b2d3f2dcabd67ce67fa01c39525f9902b Mon Sep 17 00:00:00 2001
From: eAlasdair
Date: Mon, 11 May 2020 16:00:08 +1200
Subject: [PATCH 3/6] update README

---
 README.rst       | 6 ++++++
 linkie/linkie.py | 2 +-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/README.rst b/README.rst
index 823b3ee..138a2ab 100644
--- a/README.rst
+++ b/README.rst
@@ -81,6 +81,12 @@ more details.
 Changelog
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
+1.4.1
+------------------------------------------------------------------------------
+
+- Add logic to delay Linkie requesting from a domain if it responds with code 429
+- Reorganise logging output so that the URL is printed last
+
 1.4.0
 ------------------------------------------------------------------------------
 

diff --git a/linkie/linkie.py b/linkie/linkie.py
index 3857a89..ec7b5c5 100644
--- a/linkie/linkie.py
+++ b/linkie/linkie.py
@@ -138,7 +138,7 @@ def traverse_directory(self):
         repeat_count = 1
         max_repeats = 1000000
         while len(self.delayed_urls) > 0 and repeat_count <= max_repeats:
-            # Many iterations are expected because the timeout may still be going each time this repeats itself, so thepool map will end immediately
+            # Many iterations are expected because the timeout may still be going each time this repeats itself, so the pool map will end immediately
             # Only uncomment this line if debugging locally
             # print('Retrying delayed urls **MANY ITERATIONS ARE EXPECTED** #{}'.format(repeat_count), end='\r')
             repeat_urls = self.delayed_urls[:]

From 6b904200e6e2d71143a83fc344b836ab8ae81789 Mon Sep 17 00:00:00 2001
From: eAlasdair
Date: Mon, 11 May 2020 16:15:07 +1200
Subject: [PATCH 4/6] add description of the different URL lists

---
 linkie/linkie.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/linkie/linkie.py b/linkie/linkie.py
index ec7b5c5..5d170df 100644
--- a/linkie/linkie.py
+++ b/linkie/linkie.py
@@ -28,10 +28,10 @@ class Linkie:
     def __init__(self, config=None, config_file_path=None):
         self.file_count = 0
         self.status_counts = {}
-        self.urls = dict()
-        self.domains = dict()
-        self.unchecked_urls = set()
-        self.delayed_urls = []
+        self.urls = dict()  # Dictionary of URLs that have been checked, with their broken status and status code
+        self.domains = dict()  # Dictionary of URL domains and the time each last returned a 429 (-1 if ready)
+        self.unchecked_urls = set()  # Initial set of URLs to be checked
+        self.delayed_urls = []  # List of URLs to retry later, after a 429 response
         self.directory = '.'
         self.pool = ThreadPool(THREADS)
         self.lock = Lock()

From 6b08caeeece843c84649c803ef84be98bc28edcf Mon Sep 17 00:00:00 2001
From: eAlasdair
Date: Tue, 12 May 2020 12:07:46 +1200
Subject: [PATCH 5/6] update readme

---
 .vscode/settings.json | 3 +++
 README.rst            | 5 +++--
 2 files changed, 6 insertions(+), 2 deletions(-)
 create mode 100644 .vscode/settings.json

diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000..c022e41
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,3 @@
+{
+    "restructuredtext.confPath": ""
+}
\ No newline at end of file
diff --git a/README.rst b/README.rst
index 138a2ab..5dc287b 100644
--- a/README.rst
+++ b/README.rst
@@ -84,8 +84,9 @@ Changelog
 1.4.1
 ------------------------------------------------------------------------------
 
-- Add logic to delay Linkie requesting from a domain if it responds with code 429
-- Reorganise logging output so that the URL is printed last
+- Add logic to delay Linkie requesting from a domain if it responds with code 429.
+- Reorganise logging output so that the URL is printed last.
+- Update dependencies.
 
1.4.0 ------------------------------------------------------------------------------ From e3dfd5b1a6728c269fb62fc0ad7dafe7e797cffd Mon Sep 17 00:00:00 2001 From: eAlasdair Date: Tue, 12 May 2020 12:11:43 +1200 Subject: [PATCH 6/6] remove thing that wasn't meant to be --- .vscode/settings.json | 3 --- 1 file changed, 3 deletions(-) delete mode 100644 .vscode/settings.json diff --git a/.vscode/settings.json b/.vscode/settings.json deleted file mode 100644 index c022e41..0000000 --- a/.vscode/settings.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "restructuredtext.confPath": "" -} \ No newline at end of file
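
For reference, the 429 handling introduced in PATCH 2/6 reduces to one pattern: a
lock-guarded map from domain to the time a 429 was last received, consulted before
each request and refreshed whenever a 429 comes back. Below is a minimal standalone
sketch of that pattern only; the names here (BACKOFF_SECONDS, last_429, domain_of,
fetch) are invented for the illustration and are not part of Linkie.

import time
import threading

import requests

BACKOFF_SECONDS = 60
last_429 = {}  # domain -> perf_counter() time a 429 was last received
lock = threading.Lock()

def domain_of(url):
    # Scheme plus host, mirroring get_domain() above:
    # 'https://example.com/a/b' -> 'https://example.com'
    return '/'.join(url.split('/')[:3])

def fetch(url):
    domain = domain_of(url)
    with lock:
        hit = last_429.get(domain)
        if hit is not None and time.perf_counter() - hit <= BACKOFF_SECONDS:
            return None  # Domain is still backing off; caller should retry later
        last_429.pop(domain, None)  # Backoff expired (or never set); clear it
    response = requests.head(url)
    if response.status_code == 429:
        with lock:
            last_429[domain] = time.perf_counter()  # Start the backoff window
        return None  # Caller should re-queue this URL
    return response

Storing a timestamp per domain, rather than sleeping the worker thread, is what
lets the thread pool keep draining URLs from other domains while one domain is
rate limited; the busy retry loop in traverse_directory() is the price paid for
that choice.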