diff --git a/README.rst b/README.rst index 37ba035..7bda0c9 100644 --- a/README.rst +++ b/README.rst @@ -81,6 +81,12 @@ more details. Changelog ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +1.3.0 +------------------------------------------------------------------------------ + +- Set User-Agent to emulate browser viewing. +- Use Python logging module. + 1.2.0 ------------------------------------------------------------------------------ diff --git a/linkie/__init__.py b/linkie/__init__.py index 8389618..923ab25 100644 --- a/linkie/__init__.py +++ b/linkie/__init__.py @@ -1,4 +1,8 @@ # flake8: noqa +import logging from .linkie import Linkie -__version__ = '1.2.0' + +logging.getLogger(__name__).addHandler(logging.NullHandler()) + +__version__ = '1.3.0' diff --git a/linkie/linkie.py b/linkie/linkie.py index 254467d..28b0c59 100644 --- a/linkie/linkie.py +++ b/linkie/linkie.py @@ -4,10 +4,14 @@ import re import sys import yaml +import logging import requests # This isn't a perfect URL matcher, but should catch the large majority of URLs. URL_REGEX = r'(?:https?|ftp)://[^\s`\'"\]\)>}]+' +HEADERS = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36' +} class Linkie: @@ -18,12 +22,12 @@ def __init__(self, config=None, config_file_path=None): self.urls = dict() self.directory = '.' 
if not config and config_file_path: - print('Using Linkie configuration file {}'.format(config_file_path)) + logging.info('Using Linkie configuration file {}'.format(config_file_path)) config = self.read_config(config_file_path) elif config: - print('Using custom Linkie settings via Python constructor') + logging.info('Using custom Linkie settings via Python constructor') elif not config and not config_file_path: - print('Using default Linkie configuation') + logging.info('Using default Linkie configuration') config = self.check_config(config) self.config = self.process_config(config) @@ -110,12 +114,12 @@ def traverse_directory(self): def check_file(self, file_path): self.file_count += 1 - print('\nChecking file {} for URLs... '.format(file_path), end='') + logging.info('\nChecking file {} for URLs... '.format(file_path)) file_object = open(file_path, 'r') file_contents = file_object.read() file_object.close() urls = re.findall(URL_REGEX, file_contents) - print('{} URL{} found'.format(len(urls), 's' if len(urls) != 1 else '')) + logging.info('{} URL{} found'.format(len(urls), 's' if len(urls) != 1 else '')) for url in urls: # Remove extra trailing bracket if link containing brackets # Within Markdown link syntax. 
@@ -124,23 +128,23 @@ def check_file(self, file_path): url += url.count('(') * ')' # Remove trailing characters url = url.rstrip('!"#$%&\'*+,-./@:;=^_`|~') - print(' - Checking URL {} '.format(url), end='') + logging.info(' - Checking URL {} '.format(url)) if url in self.config['skip-urls']: - print('= skipping URL (as defined in config file)') + logging.info('= skipping URL (as defined in config file)') elif url not in self.urls: try: - status_code = requests.head(url).status_code + status_code = requests.head(url, headers=HEADERS).status_code # If response doesn't allow HEAD request, try GET request if status_code >= 400: - status_code = requests.get(url).status_code + status_code = requests.get(url, headers=HEADERS).status_code # If connection error except Exception as e: status_code = str(type(e).__name__) if type(status_code) == str: - print('= {}'.format(status_code)) + logging.info('= {}'.format(status_code)) else: - print('= {} status'.format(status_code)) + logging.info('= {} status'.format(status_code)) if type(status_code) == str or status_code >= 400: self.save_url(url, status_code, True) @@ -149,7 +153,7 @@ def check_file(self, file_path): status_code = str(status_code) self.status_counts[status_code] = self.status_counts.get(status_code, 0) + 1 else: - print('= {} (already checked)'.format(self.urls[url]['status'])) + logging.info('= {} (already checked)'.format(self.urls[url]['status'])) def save_url(self, url, status_code, broken): self.urls[url] = { @@ -160,29 +164,30 @@ def save_url(self, url, status_code, broken): def print_summary(self): number_broken_links = self.count_broken_links() - print('\n=============================================') - print('SUMMARY') - print('=============================================') - print('{} file{} checked'.format(self.file_count, 's' if self.file_count != 1 else '')) - print('{} unique URL{} found'.format(len(self.urls), 's' if len(self.urls) != 1 else '')) - print('{} broken link{} 
found'.format(number_broken_links, 's' if number_broken_links != 1 else '')) + logging.info('=============================================') + logging.info('SUMMARY') + logging.info('=============================================') + logging.info('{} file{} checked'.format(self.file_count, 's' if self.file_count != 1 else '')) + logging.info('{} unique URL{} found'.format(len(self.urls), 's' if len(self.urls) != 1 else '')) + logging.info('{} broken link{} found'.format(number_broken_links, 's' if number_broken_links != 1 else '')) - print('\nStatus code counts') - print('---------------------------------------------') + logging.info('---------------------------------------------') + logging.info('Status code counts') + logging.info('---------------------------------------------') for status in sorted(self.status_counts.keys()): - print('{}: {}'.format(status, self.status_counts[status])) + logging.info('{}: {}'.format(status, self.status_counts[status])) if 999 in self.status_counts: - print('Status 999 refers to a connection error.') + logging.info('Status 999 refers to a connection error.') - print('\nBroken links:') - print('---------------------------------------------') + logging.info('---------------------------------------------') + logging.info('Broken links:') + logging.info('---------------------------------------------') if self.count_broken_links(): for url, url_data in self.urls.items(): if url_data['broken']: - print(url) + logging.info(url) else: - print('No broken links found!') - print() + logging.info('No broken links found!') def main(): diff --git a/setup.py b/setup.py index 657098a..dbc9d47 100644 --- a/setup.py +++ b/setup.py @@ -5,11 +5,17 @@ if not sys.version_info[0] == 3: sys.exit('Sorry, currently only Python 3 is supported.') +with open('requirements.txt') as f: + requirements = f.read().splitlines() + +with open('README.rst') as f: + long_description = f.read() + setup( name='linkie', version=__version__, description='Linkie looks 
through files for broken links using Python 3.', - long_description=open('README.rst').read(), + long_description=long_description, url='https://github.com/uccser/linkie', author=('University of Canterbury Computer' 'Science Education Research Group'), @@ -28,14 +34,11 @@ keywords='link url checker', packages=find_packages(), include_package_data=True, - install_requires=[ - 'requests==2.18.4', - 'PyYaml==4.2b4', - ], + install_requires=requirements, python_requires='~=3.4', entry_points={ 'console_scripts': [ 'linkie = linkie.linkie:main', ], } - ) +)