
Commit

Merge branch 'release/1.3.0'
JackMorganNZ committed Oct 2, 2018
2 parents eb987ba + 2aa6c52 commit 56ba415
Showing 4 changed files with 52 additions and 34 deletions.
6 changes: 6 additions & 0 deletions README.rst
@@ -81,6 +81,12 @@ more details.
Changelog
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

1.3.0
------------------------------------------------------------------------------

- Set User-Agent to emulate browser viewing.
- Use Python logging module.

1.2.0
------------------------------------------------------------------------------

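The two changes listed in the 1.3.0 changelog entry above are small but practical. The User-Agent change matters because requests identifies itself as python-requests/<version> by default, and some servers answer that client with 403 or similar errors even for pages that load fine in a browser, so a link checker can report working links as broken. A minimal sketch of the default behaviour (the URL is only a placeholder):

```python
import requests

response = requests.head('https://example.com/')  # placeholder URL

# requests identifies itself as 'python-requests/<version>' unless told
# otherwise; some servers reject that client outright, which is why linkie
# 1.3.0 switches to a browser-style User-Agent (see linkie.py below).
print(response.request.headers['User-Agent'])
print(response.status_code)
```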
6 changes: 5 additions & 1 deletion linkie/__init__.py
@@ -1,4 +1,8 @@
# flake8: noqa
import logging
from .linkie import Linkie

__version__ = '1.2.0'

logging.getLogger(__name__).addHandler(logging.NullHandler())

__version__ = '1.3.0'
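The NullHandler line above is the standard pattern for libraries: it keeps linkie from producing unexpected output, or handler warnings, in applications that never configure logging, while leaving applications that do configure logging in control of where messages go. Note that the messages themselves (see linkie.py below) are emitted with the module-level logging.info() helpers, that is via the root logger, whose default level is WARNING, so the INFO-level progress output is silent unless the caller opts in. A minimal sketch, assuming linkie's default configuration and an arbitrary text file to scan:

```python
import logging

import linkie

# The root logger defaults to WARNING; raise it to INFO to see linkie's
# progress messages and summary.
logging.basicConfig(level=logging.INFO, format='%(message)s')

checker = linkie.Linkie()           # default configuration, no config file
checker.check_file('README.rst')    # placeholder path; any text file works
checker.print_summary()
```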
59 changes: 32 additions & 27 deletions linkie/linkie.py
@@ -4,10 +4,14 @@
import re
import sys
import yaml
import logging
import requests

# This isn't a perfect URL matcher, but should catch the large majority of URLs.
URL_REGEX = r'(?:https?|ftp)://[^\s`\'"\]\)>}]+'
HEADERS = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36'
}


class Linkie:
@@ -18,12 +22,12 @@ def __init__(self, config=None, config_file_path=None):
self.urls = dict()
self.directory = '.'
if not config and config_file_path:
print('Using Linkie configuration file {}'.format(config_file_path))
logging.info('Using Linkie configuration file {}'.format(config_file_path))
config = self.read_config(config_file_path)
elif config:
print('Using custom Linkie settings via Python constructor')
logging.info('Using custom Linkie settings via Python constructor')
elif not config and not config_file_path:
print('Using default Linkie configuration')
logging.info('Using default Linkie configuration')
config = self.check_config(config)
self.config = self.process_config(config)

@@ -110,12 +114,12 @@ def traverse_directory(self):

def check_file(self, file_path):
self.file_count += 1
print('\nChecking file {} for URLs... '.format(file_path), end='')
logging.info('\nChecking file {} for URLs... '.format(file_path))
file_object = open(file_path, 'r')
file_contents = file_object.read()
file_object.close()
urls = re.findall(URL_REGEX, file_contents)
print('{} URL{} found'.format(len(urls), 's' if len(urls) != 1 else ''))
logging.info('{} URL{} found'.format(len(urls), 's' if len(urls) != 1 else ''))
for url in urls:
# Remove extra trailing bracket if the link contains brackets
# within Markdown link syntax.
@@ -124,23 +128,23 @@ def check_file(self, file_path):
url += url.count('(') * ')'
# Remove trailing characters
url = url.rstrip('!"#$%&\'*+,-./@:;=^_`|~')
print(' - Checking URL {} '.format(url), end='')
logging.info(' - Checking URL {} '.format(url))
if url in self.config['skip-urls']:
print('= skipping URL (as defined in config file)')
logging.info('= skipping URL (as defined in config file)')
elif url not in self.urls:
try:
status_code = requests.head(url).status_code
status_code = requests.head(url, headers=HEADERS).status_code
# If response doesn't allow HEAD request, try GET request
if status_code >= 400:
status_code = requests.get(url).status_code
status_code = requests.get(url, headers=HEADERS).status_code
# If connection error
except Exception as e:
status_code = str(type(e).__name__)

if type(status_code) == str:
print('= {}'.format(status_code))
logging.info('= {}'.format(status_code))
else:
print('= {} status'.format(status_code))
logging.info('= {} status'.format(status_code))

if type(status_code) == str or status_code >= 400:
self.save_url(url, status_code, True)
@@ -149,7 +153,7 @@ def check_file(self, file_path):
status_code = str(status_code)
self.status_counts[status_code] = self.status_counts.get(status_code, 0) + 1
else:
print('= {} (already checked)'.format(self.urls[url]['status']))
logging.info('= {} (already checked)'.format(self.urls[url]['status']))

def save_url(self, url, status_code, broken):
self.urls[url] = {
@@ -160,29 +164,30 @@ def save_url(self, url, status_code, broken):
def print_summary(self):
number_broken_links = self.count_broken_links()

print('\n=============================================')
print('SUMMARY')
print('=============================================')
print('{} file{} checked'.format(self.file_count, 's' if self.file_count != 1 else ''))
print('{} unique URL{} found'.format(len(self.urls), 's' if len(self.urls) != 1 else ''))
print('{} broken link{} found'.format(number_broken_links, 's' if number_broken_links != 1 else ''))
logging.info('=============================================')
logging.info('SUMMARY')
logging.info('=============================================')
logging.info('{} file{} checked'.format(self.file_count, 's' if self.file_count != 1 else ''))
logging.info('{} unique URL{} found'.format(len(self.urls), 's' if len(self.urls) != 1 else ''))
logging.info('{} broken link{} found'.format(number_broken_links, 's' if number_broken_links != 1 else ''))

print('\nStatus code counts')
print('---------------------------------------------')
logging.info('---------------------------------------------')
logging.info('Status code counts')
logging.info('---------------------------------------------')
for status in sorted(self.status_counts.keys()):
print('{}: {}'.format(status, self.status_counts[status]))
logging.info('{}: {}'.format(status, self.status_counts[status]))
if 999 in self.status_counts:
print('Status 999 refers to a connection error.')
logging.info('Status 999 refers to a connection error.')

print('\nBroken links:')
print('---------------------------------------------')
logging.info('---------------------------------------------')
logging.info('Broken links:')
logging.info('---------------------------------------------')
if self.count_broken_links():
for url, url_data in self.urls.items():
if url_data['broken']:
print(url)
logging.info(url)
else:
print('No broken links found!')
print()
logging.info('No broken links found!')


def main():
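Condensed into a standalone sketch, the URL-checking flow in check_file above is: try a cheap HEAD request first, fall back to a full GET when the server rejects HEAD (status 400 or above), and convert connection failures into the exception's class name so they can still be counted and reported alongside numeric status codes. Roughly (the URL is a placeholder):

```python
import requests

from linkie.linkie import HEADERS  # browser-style User-Agent added in 1.3.0


def check_url(url):
    """Return an HTTP status code, or an exception class name on connection errors."""
    try:
        status = requests.head(url, headers=HEADERS).status_code
        # Some servers refuse HEAD requests; retry those with GET.
        if status >= 400:
            status = requests.get(url, headers=HEADERS).status_code
    except Exception as exception:
        status = type(exception).__name__
    return status


status = check_url('https://example.com/')
broken = isinstance(status, str) or status >= 400
```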
15 changes: 9 additions & 6 deletions setup.py
@@ -5,11 +5,17 @@
if not sys.version_info[0] == 3:
sys.exit('Sorry, currently only Python 3 is supported.')

with open('requirements.txt') as f:
requirements = f.read().splitlines()

with open('README.rst') as f:
long_description = f.read()

setup(
name='linkie',
version=__version__,
description='Linkie looks through files for broken links using Python 3.',
long_description=open('README.rst').read(),
long_description=long_description,
url='https://github.com/uccser/linkie',
author=('University of Canterbury Computer '
'Science Education Research Group'),
@@ -28,14 +34,11 @@
keywords='link url checker',
packages=find_packages(),
include_package_data=True,
install_requires=[
'requests==2.18.4',
'PyYaml==4.2b4',
],
install_requires=requirements,
python_requires='~=3.4',
entry_points={
'console_scripts': [
'linkie = linkie.linkie:main',
],
}
)
)
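One packaging note on the setup.py change above: reading requirements.txt at setup time only works if that file is shipped in the source distribution (for example via MANIFEST.in); otherwise building or installing from the sdist fails on the open() call. A defensive variant, shown purely as an assumption rather than what this commit does, falls back to the pins that were previously hard-coded:

```python
import os

if os.path.exists('requirements.txt'):
    with open('requirements.txt') as f:
        requirements = f.read().splitlines()
else:
    # Fall back to the pins that used to live directly in setup.py.
    requirements = ['requests==2.18.4', 'PyYaml==4.2b4']
```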
