Merge branch 'release/1.4.0'
JackMorganNZ committed Jul 21, 2019
2 parents dad3ed1 + 573264c commit a48e5fc
Showing 18 changed files with 94 additions and 58 deletions.
8 changes: 8 additions & 0 deletions README.rst
@@ -81,6 +81,14 @@ more details.
Changelog
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

1.4.0
------------------------------------------------------------------------------

- Update to only check links prefixed by one of [=", (, <, ' ' (a space)].
- Linkie now finds all unique links at once, then uses multithreading to check them all.
- Linkie now rechecks links that had a ConnectionError, as these are often valid.
- Broken links in the SUMMARY are now also displayed with their status code.

1.3.1
------------------------------------------------------------------------------

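A quick sketch of what the first changelog item means in practice (illustrative only, not part of the commit; the pattern is copied verbatim from linkie/linkie.py below, and the sample text is hypothetical):

    import re

    # Pattern from linkie 1.4.0: a URL is matched only when preceded by
    # =", (, <, or a space; the prefix character is part of the match.
    URL_REGEX = r'(?:\=\"|\(|\<| )(?:https?|ftp)://[^\s`\'\"\]\)>}]+'

    text = '="https://www.google.com" and (https://www.github.com) but foo:https://example.com'
    print(re.findall(URL_REGEX, text))
    # ['="https://www.google.com', '(https://www.github.com']
    # The third URL carries none of the accepted prefixes, so it is ignored;
    # the matched prefixes are stripped out later by search_file().
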
2 changes: 1 addition & 1 deletion linkie/__init__.py
@@ -28,4 +28,4 @@
}
})

__version__ = '1.3.1'
__version__ = '1.4.0'
100 changes: 65 additions & 35 deletions linkie/linkie.py
@@ -6,12 +6,18 @@
import yaml
import logging
import requests
from multiprocessing.dummy import Pool as ThreadPool

# This isn't a perfect URL matcher, but should catch the large majority of URLs.
URL_REGEX = r'(?:https?|ftp)://[^\s`\'"\]\)>}]+'
# This now matches URLs presented in the format defined in the CSU Writing Guide
# as of 13/11/2018 (https://cs-unplugged.readthedocs.io/en/latest/author/writing_guide.html)
# as well as formats used in the code for the guide itself.
# As such, URLs will be matched if and only if they have any one of the following prefixes: [=", (, <, ' ' (a space)]
URL_REGEX = r'(?:\=\"|\(|\<| )(?:https?|ftp)://[^\s`\'\"\]\)>}]+'
HEADERS = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36'
}
THREADS = 12


class Linkie:
@@ -20,7 +26,9 @@ def __init__(self, config=None, config_file_path=None):
self.file_count = 0
self.status_counts = {}
self.urls = dict()
self.unchecked_urls = set()
self.directory = '.'
self.pool = ThreadPool(THREADS)
if not config and config_file_path:
logging.info('Using Linkie configuration file {}'.format(config_file_path))
config = self.read_config(config_file_path)
@@ -92,6 +100,7 @@ def count_broken_links(self):

def run(self):
self.traverse_directory()
self.traverse_connection_errors()
self.print_summary()
if self.count_broken_links():
return 1
@@ -110,9 +119,23 @@ def traverse_directory(self):

for filename in files:
if filename.endswith(self.config['file-types']):
self.check_file(os.path.join(directory_root, filename))
self.search_file(os.path.join(directory_root, filename))
self.pool.map(self.check_link, self.unchecked_urls)

def check_file(self, file_path):
def traverse_connection_errors(self):
connect_errors = []
for url, url_data in self.urls.items():
if str(url_data['status']) == 'ConnectionError':
connect_errors.append(url)
for url in connect_errors:
del self.urls[url]
if len(connect_errors):
logging.info('Rechecking {} link{} that returned ConnectionError... '.format(len(connect_errors), 's' if len(connect_errors) != 1 else ''))
self.pool = ThreadPool(min(THREADS, len(connect_errors)))
self.pool.map(self.check_link, connect_errors)


def search_file(self, file_path):
self.file_count += 1
file_message = 'Checking file {} for URLs... '.format(file_path)
file_object = open(file_path, 'r')
@@ -121,49 +144,56 @@ def check_file(self, file_path):
urls = re.findall(URL_REGEX, file_contents)
logging.info('{}{} URL{} found'.format(file_message, len(urls), 's' if len(urls) != 1 else ''))
for url in urls:
# Remove trailing characters
url = url.rstrip('> !"#$%&\'*+,-./@:;=^_`|~').lstrip(' <(=\"')
# Remove extra trailing bracket if the link contains brackets
# Within Markdown link syntax.
# within Markdown link syntax.
# [Wikipedia link](http://foo.com/blah_blah_(wikipedia))
if url.count('('):
url += url.count('(') * ')'
# Remove trailing characters
url = url.rstrip('!"#$%&\'*+,-./@:;=^_`|~')
message = ' - Checking URL {} '.format(url)
if url in self.config['skip-urls']:
message += '= skipping URL (as defined in config file)'
elif url not in self.urls:
try:
status_code = requests.head(url, headers=HEADERS).status_code
# If response doesn't allow HEAD request, try GET request
if status_code >= 400:
status_code = requests.get(url, headers=HEADERS).status_code
# If connection error
except Exception as e:
status_code = str(type(e).__name__)

if type(status_code) == str:
message += '= {}'.format(status_code)
else:
message += '= {} status'.format(status_code)

if type(status_code) == str or status_code >= 400:
self.save_url(url, status_code, True)
else:
self.save_url(url, status_code, False)
status_code = str(status_code)
self.status_counts[status_code] = self.status_counts.get(status_code, 0) + 1
self.unchecked_urls.add(url)

def check_link(self, url):
message = ' - Checking URL {} '.format(url)
if url in self.config['skip-urls']:
message += '= skipping URL (as defined in config file)'
elif url not in self.urls:
try:
status_code = requests.head(url, headers=HEADERS).status_code
# If response doesn't allow HEAD request, try GET request
if status_code >= 400:
status_code = requests.get(url, headers=HEADERS).status_code
# If connection error
except Exception as e:
status_code = str(type(e).__name__)

if type(status_code) == str:
message += '= {}'.format(status_code)
else:
message += '= {} (already checked)'.format(self.urls[url]['status'])
logging.info(message)
message += '= {} status'.format(status_code)

if type(status_code) == str or status_code >= 400:
self.save_url(url, status_code, True)
else:
self.save_url(url, status_code, False)
else:
message += '= {} (already checked)'.format(self.urls[url]['status'])
logging.info(message)

def save_url(self, url, status_code, broken):
self.urls[url] = {
'broken': broken,
'status': status_code,
}

def collect_status_counts(self):
for _, url_data in self.urls.items():
status_code = str(url_data['status'])
self.status_counts[status_code] = self.status_counts.get(status_code, 0) + 1

def print_summary(self):
number_broken_links = self.count_broken_links()
self.collect_status_counts()

logging.info('=============================================')
logging.info('SUMMARY')
@@ -181,12 +211,12 @@ def print_summary(self):
logging.info('Status 999 refers to a connection error.')

logging.info('---------------------------------------------')
logging.info('Broken links:')
logging.info('Broken links')
logging.info('---------------------------------------------')
if self.count_broken_links():
if number_broken_links:
for url, url_data in self.urls.items():
if url_data['broken']:
logging.info(url)
logging.info('{}: {}'.format(url_data['status'], url))
else:
logging.info('No broken links found!')

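The threaded checking above relies on multiprocessing.dummy, which provides the multiprocessing Pool API backed by threads rather than processes. A minimal standalone sketch of the same pattern, including the ConnectionError recheck that traverse_connection_errors performs (the URLs and the timeout are assumptions for the sketch, not taken from the commit):

    import requests
    from multiprocessing.dummy import Pool as ThreadPool

    THREADS = 12

    def check(url):
        try:
            # timeout added for the sketch; linkie calls requests.head without one
            return url, requests.head(url, timeout=10).status_code
        except Exception as e:
            # As in linkie, a failed request is recorded under the exception name
            return url, type(e).__name__

    urls = {'https://www.google.com', 'https://www.github.com'}  # hypothetical
    results = dict(ThreadPool(THREADS).map(check, urls))

    # Connection errors are often transient, so recheck them once
    retry = [u for u, s in results.items() if s == 'ConnectionError']
    if retry:
        results.update(ThreadPool(min(THREADS, len(retry))).map(check, retry))
    print(results)
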
4 changes: 2 additions & 2 deletions linkie/tests/assets/basic/file.txt
@@ -1,2 +1,2 @@
https://www.google.com
https://www.github.com
="https://www.google.com"
(https://www.github.com)
3 changes: 2 additions & 1 deletion linkie/tests/assets/broken/file.txt
@@ -1 +1,2 @@
https://www.googoogllelce.com
(https://www.googoogllelce.com)
[https://github.com]
@@ -1 +1 @@
https://www
="https://www
@@ -1 +1 @@
https://www
="https://www
2 changes: 1 addition & 1 deletion linkie/tests/assets/file_types/file.html
@@ -1 +1 @@
https://www.google.com
="https://www.google.com
2 changes: 1 addition & 1 deletion linkie/tests/assets/file_types/file.md
@@ -1 +1 @@
https://www.google.com
="https://www.google.com
2 changes: 1 addition & 1 deletion linkie/tests/assets/file_types/file.rst
@@ -1 +1 @@
https://www.google.com
="https://www.google.com
2 changes: 1 addition & 1 deletion linkie/tests/assets/file_types/file.skipped
@@ -1 +1 @@
https://www
="https://www
2 changes: 1 addition & 1 deletion linkie/tests/assets/file_types/file.txt
@@ -1 +1 @@
https://www.google.com
="https://www.google.com
2 changes: 1 addition & 1 deletion linkie/tests/assets/file_types_custom/file.special
@@ -1 +1 @@
https://www
="https://www
2 changes: 1 addition & 1 deletion linkie/tests/assets/multiple/file1.txt
@@ -1 +1 @@
https://www.google.com
="https://www.google.com
2 changes: 1 addition & 1 deletion linkie/tests/assets/multiple/file2.txt
@@ -1 +1 @@
https://www.github.com
="https://www.github.com
4 changes: 2 additions & 2 deletions linkie/tests/assets/skip_urls/file.txt
@@ -1,2 +1,2 @@
https://www.google.com
https://www.github.com
="https://www.google.com"
<https://www.github.com>
4 changes: 2 additions & 2 deletions linkie/tests/assets/skip_urls_custom/file.txt
@@ -1,2 +1,2 @@
https://www.google.com
https://www.github.com
(https://www.google.com)
="https://www.github.com"
7 changes: 2 additions & 5 deletions requirements.txt
@@ -1,5 +1,2 @@
# Core requirements that also require updating in setup.py
requests==2.19.1
PyYaml==4.2b4

# Developer requirements
PyYaml==5.1.1
requests==2.22.0
