diff --git a/README.rst b/README.rst
index 1891b03..823b3ee 100644
--- a/README.rst
+++ b/README.rst
@@ -81,6 +81,14 @@ more details.
 Changelog
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
+1.4.0
+------------------------------------------------------------------------------
+
+- Update to only check links prefixed by one of [=", (, <, ' '(a space)].
+- Linkie now finds all unique links at once, then uses multithreading to check them all.
+- Linkie now rechecks links that had a ConnectionError, as these are often valid.
+- Broken links in the SUMMARY are now also displayed with their status code.
+
 1.3.1
 ------------------------------------------------------------------------------
 
diff --git a/linkie/__init__.py b/linkie/__init__.py
index c674980..76f31a8 100644
--- a/linkie/__init__.py
+++ b/linkie/__init__.py
@@ -28,4 +28,4 @@
     }
 })
 
-__version__ = '1.3.1'
+__version__ = '1.4.0'
diff --git a/linkie/linkie.py b/linkie/linkie.py
index eb70e5d..4041a44 100644
--- a/linkie/linkie.py
+++ b/linkie/linkie.py
@@ -6,12 +6,18 @@
 import yaml
 import logging
 import requests
+from multiprocessing.dummy import Pool as ThreadPool
 
 # This isn't a perfect URL matcher, but should catch the large majority of URLs.
-URL_REGEX = r'(?:https?|ftp)://[^\s`\'"\]\)>}]+'
+# This now matches URLs presented in the format defined in the CSU Writing Guide
+# as of 13/11/2018 (https://cs-unplugged.readthedocs.io/en/latest/author/writing_guide.html)
+# as well as formats used in the code for the guide itself.
+# As such, URLs will be matched if and only if they have any one of the following prefixes: [=", (, <, ' '(a space)]
+URL_REGEX = r'(?:\=\"|\(|\<| )(?:https?|ftp)://[^\s`\'\"\]\)>}]+'
 
 HEADERS = {
     'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36'
 }
+THREADS = 12
 
 class Linkie:
@@ -20,7 +26,9 @@ def __init__(self, config=None, config_file_path=None):
         self.file_count = 0
         self.status_counts = {}
         self.urls = dict()
+        self.unchecked_urls = set()
         self.directory = '.'
+        self.pool = ThreadPool(THREADS)
         if not config and config_file_path:
             logging.info('Using Linkie configuration file {}'.format(config_file_path))
             config = self.read_config(config_file_path)
@@ -92,6 +100,7 @@ def count_broken_links(self):
 
     def run(self):
         self.traverse_directory()
+        self.traverse_connection_errors()
         self.print_summary()
         if self.count_broken_links():
             return 1
@@ -110,9 +119,23 @@ def traverse_directory(self):
             for filename in files:
                 if filename.endswith(self.config['file-types']):
-                    self.check_file(os.path.join(directory_root, filename))
+                    self.search_file(os.path.join(directory_root, filename))
+        self.pool.map(self.check_link, self.unchecked_urls)
 
-    def check_file(self, file_path):
+    def traverse_connection_errors(self):
+        connect_errors = []
+        for url, url_data in self.urls.items():
+            if str(url_data['status']) == 'ConnectionError':
+                connect_errors.append(url)
+        for url in connect_errors:
+            self.urls.__delitem__(url)
+        if len(connect_errors):
+            logging.info('Rechecking {} link{} that returned ConnectionError... '.format(len(connect_errors), 's' if len(connect_errors) != 1 else ''))
+            self.pool = ThreadPool(min(THREADS, len(connect_errors)))
+            self.pool.map(self.check_link, connect_errors)
+
+
+    def search_file(self, file_path):
         self.file_count += 1
         file_message = 'Checking file {} for URLs... '.format(file_path)
         file_object = open(file_path, 'r')
@@ -121,40 +144,41 @@ def check_file(self, file_path):
         urls = re.findall(URL_REGEX, file_contents)
         logging.info('{}{} URL{} found'.format(file_message, len(urls), 's' if len(urls) != 1 else ''))
         for url in urls:
+            # Remove trailing characters
+            url = url.rstrip('> !"#$%&\'*+,-./@:;=^_`|~').lstrip(' <(=\"')
             # Remove extra trailing bracket if link containing brackets
-            # Within Markdown link syntax.
+            # within Markdown link syntax.
             # [Wikipedia link](http://foo.com/blah_blah_(wikipedia))
             if url.count('('):
                 url += url.count('(') * ')'
-            # Remove trailing characters
-            url = url.rstrip('!"#$%&\'*+,-./@:;=^_`|~')
-            message = ' - Checking URL {} '.format(url)
-            if url in self.config['skip-urls']:
-                message += '= skipping URL (as defined in config file)'
-            elif url not in self.urls:
-                try:
-                    status_code = requests.head(url, headers=HEADERS).status_code
-                    # If response doesn't allow HEAD request, try GET request
-                    if status_code >= 400:
-                        status_code = requests.get(url, headers=HEADERS).status_code
-                # If connection error
-                except Exception as e:
-                    status_code = str(type(e).__name__)
-
-                if type(status_code) == str:
-                    message += '= {}'.format(status_code)
-                else:
-                    message += '= {} status'.format(status_code)
-
-                if type(status_code) == str or status_code >= 400:
-                    self.save_url(url, status_code, True)
-                else:
-                    self.save_url(url, status_code, False)
-                status_code = str(status_code)
-                self.status_counts[status_code] = self.status_counts.get(status_code, 0) + 1
+            self.unchecked_urls.add(url)
+
+    def check_link(self, url):
+        message = ' - Checking URL {} '.format(url)
+        if url in self.config['skip-urls']:
+            message += '= skipping URL (as defined in config file)'
+        elif url not in self.urls:
+            try:
+                status_code = requests.head(url, headers=HEADERS).status_code
+                # If response doesn't allow HEAD request, try GET request
+                if status_code >= 400:
+                    status_code = requests.get(url, headers=HEADERS).status_code
+            # If connection error
+            except Exception as e:
+                status_code = str(type(e).__name__)
+
+            if type(status_code) == str:
+                message += '= {}'.format(status_code)
             else:
-                message += '= {} (already checked)'.format(self.urls[url]['status'])
-            logging.info(message)
+                message += '= {} status'.format(status_code)
+
+            if type(status_code) == str or status_code >= 400:
+                self.save_url(url, status_code, True)
+            else:
+                self.save_url(url, status_code, False)
+        else:
+            message += '= {} (already checked)'.format(self.urls[url]['status'])
+        logging.info(message)
 
     def save_url(self, url, status_code, broken):
         self.urls[url] = {
@@ -162,8 +186,14 @@ def save_url(self, url, status_code, broken):
             'status': status_code,
         }
 
+    def collect_status_counts(self):
+        for _, url_data in self.urls.items():
+            status_code = str(url_data['status'])
+            self.status_counts[status_code] = self.status_counts.get(status_code, 0) + 1
+
     def print_summary(self):
         number_broken_links = self.count_broken_links()
+        self.collect_status_counts()
 
         logging.info('=============================================')
         logging.info('SUMMARY')
@@ -181,12 +211,12 @@ def print_summary(self):
             logging.info('Status 999 refers to a connection error.')
         logging.info('---------------------------------------------')
-        logging.info('Broken links:')
+        logging.info('Broken links')
         logging.info('---------------------------------------------')
-        if self.count_broken_links():
+        if number_broken_links:
             for url, url_data in self.urls.items():
                 if url_data['broken']:
-                    logging.info(url)
+                    logging.info('{}: {}'.format(url_data['status'], url))
         else:
             logging.info('No broken links found!')
diff --git a/linkie/tests/assets/basic/file.txt b/linkie/tests/assets/basic/file.txt
index 022dc3a..1207586 100644
--- a/linkie/tests/assets/basic/file.txt
+++ b/linkie/tests/assets/basic/file.txt
@@ -1,2 +1,2 @@
-https://www.google.com
-https://www.github.com
+="https://www.google.com"
+(https://www.github.com)
diff --git a/linkie/tests/assets/broken/file.txt b/linkie/tests/assets/broken/file.txt
index 7902a85..779b9fe 100644
--- a/linkie/tests/assets/broken/file.txt
+++ b/linkie/tests/assets/broken/file.txt
@@ -1 +1,2 @@
-https://www.googoogllelce.com
+(https://www.googoogllelce.com)
+[https://github.com]
diff --git a/linkie/tests/assets/excluded_directories/docs/build/file.txt b/linkie/tests/assets/excluded_directories/docs/build/file.txt
index 9e526d3..9a2f223 100644
--- a/linkie/tests/assets/excluded_directories/docs/build/file.txt
+++ b/linkie/tests/assets/excluded_directories/docs/build/file.txt
@@ -1 +1 @@
-https://www
+="https://www
diff --git a/linkie/tests/assets/excluded_directories_custom/ignore/file.txt b/linkie/tests/assets/excluded_directories_custom/ignore/file.txt
index 9e526d3..9a2f223 100644
--- a/linkie/tests/assets/excluded_directories_custom/ignore/file.txt
+++ b/linkie/tests/assets/excluded_directories_custom/ignore/file.txt
@@ -1 +1 @@
-https://www
+="https://www
diff --git a/linkie/tests/assets/file_types/file.html b/linkie/tests/assets/file_types/file.html
index 78cb3e5..afadb9e 100644
--- a/linkie/tests/assets/file_types/file.html
+++ b/linkie/tests/assets/file_types/file.html
@@ -1 +1 @@
-https://www.google.com
+="https://www.google.com
diff --git a/linkie/tests/assets/file_types/file.md b/linkie/tests/assets/file_types/file.md
index 78cb3e5..afadb9e 100644
--- a/linkie/tests/assets/file_types/file.md
+++ b/linkie/tests/assets/file_types/file.md
@@ -1 +1 @@
-https://www.google.com
+="https://www.google.com
diff --git a/linkie/tests/assets/file_types/file.rst b/linkie/tests/assets/file_types/file.rst
index 78cb3e5..afadb9e 100644
--- a/linkie/tests/assets/file_types/file.rst
+++ b/linkie/tests/assets/file_types/file.rst
@@ -1 +1 @@
-https://www.google.com
+="https://www.google.com
diff --git a/linkie/tests/assets/file_types/file.skipped b/linkie/tests/assets/file_types/file.skipped
index 9e526d3..9a2f223 100644
--- a/linkie/tests/assets/file_types/file.skipped
+++ b/linkie/tests/assets/file_types/file.skipped
@@ -1 +1 @@
-https://www
+="https://www
diff --git a/linkie/tests/assets/file_types/file.txt b/linkie/tests/assets/file_types/file.txt
index 78cb3e5..afadb9e 100644
--- a/linkie/tests/assets/file_types/file.txt
+++ b/linkie/tests/assets/file_types/file.txt
@@ -1 +1 @@
-https://www.google.com
+="https://www.google.com
diff --git a/linkie/tests/assets/file_types_custom/file.special b/linkie/tests/assets/file_types_custom/file.special
index 9e526d3..9a2f223 100644
--- a/linkie/tests/assets/file_types_custom/file.special
+++ b/linkie/tests/assets/file_types_custom/file.special
@@ -1 +1 @@
-https://www
+="https://www
diff --git a/linkie/tests/assets/multiple/file1.txt b/linkie/tests/assets/multiple/file1.txt
index 78cb3e5..afadb9e 100644
--- a/linkie/tests/assets/multiple/file1.txt
+++ b/linkie/tests/assets/multiple/file1.txt
@@ -1 +1 @@
-https://www.google.com
+="https://www.google.com
diff --git a/linkie/tests/assets/multiple/file2.txt b/linkie/tests/assets/multiple/file2.txt
index ad50f64..89a147d 100644
--- a/linkie/tests/assets/multiple/file2.txt
+++ b/linkie/tests/assets/multiple/file2.txt
@@ -1 +1 @@
-https://www.github.com
+="https://www.github.com
diff --git a/linkie/tests/assets/skip_urls/file.txt b/linkie/tests/assets/skip_urls/file.txt
index 022dc3a..15cfa7e 100644
--- a/linkie/tests/assets/skip_urls/file.txt
+++ b/linkie/tests/assets/skip_urls/file.txt
@@ -1,2 +1,2 @@
-https://www.google.com
-https://www.github.com
+="https://www.google.com"
+
diff --git a/linkie/tests/assets/skip_urls_custom/file.txt b/linkie/tests/assets/skip_urls_custom/file.txt
index 022dc3a..5c72e4b 100644
--- a/linkie/tests/assets/skip_urls_custom/file.txt
+++ b/linkie/tests/assets/skip_urls_custom/file.txt
@@ -1,2 +1,2 @@
-https://www.google.com
-https://www.github.com
+(https://www.google.com)
+="https://www.github.com"
diff --git a/requirements.txt b/requirements.txt
index 244bbf0..a9f3e3a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,2 @@
-# Core requirements that also requires updating in setup.py
-requests==2.19.1
-PyYaml==4.2b4
-
-# Developer requirements
+PyYaml==5.1.1
+requests==2.22.0
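
For illustration, a minimal sketch (not part of the diff above) of how the new URL_REGEX and the rstrip()/lstrip() clean-up in search_file are intended to behave together. The sample string and variable names below are invented for this example.

import re

# Same pattern as the new URL_REGEX in linkie.py: a URL is only matched when it
# is immediately preceded by =", (, < or a space.
URL_REGEX = r'(?:\=\"|\(|\<| )(?:https?|ftp)://[^\s`\'\"\]\)>}]+'

sample = 'See ="https://www.google.com" or (https://www.github.com) but skip [https://github.com]'

found = []
for url in re.findall(URL_REGEX, sample):
    # Mirror search_file: strip the matched prefix and any trailing punctuation.
    url = url.rstrip('> !"#$%&\'*+,-./@:;=^_`|~').lstrip(' <(="')
    found.append(url)

print(found)  # ['https://www.google.com', 'https://www.github.com']

The bracketed URL is not picked up because '[' is not one of the accepted prefixes, which appears to be what the updated broken/file.txt asset exercises.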