Merge branch 'release/1.4.0'
JackMorganNZ committed Jul 21, 2019
2 parents dad3ed1 + 573264c commit a48e5fc
Showing 18 changed files with 94 additions and 58 deletions.
8 changes: 8 additions & 0 deletions README.rst
@@ -81,6 +81,14 @@ more details.
Changelog
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

1.4.0
------------------------------------------------------------------------------

- Update to only check links prefixed by one of [=", (, <, ' ' (a space)].
- Linkie now finds all unique links at once, then uses multithreading to check them all.
- Linkie now rechecks links that had a ConnectionError, as these are often valid.
- Broken links in the SUMMARY are now also displayed with their status code.

1.3.1
------------------------------------------------------------------------------

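A quick sketch of what the first changelog item means in practice (illustrative only, not part of the commit; the pattern is copied verbatim from linkie/linkie.py below, and the sample text is hypothetical):

    import re

    # Pattern from linkie 1.4.0: a URL is matched only when preceded by
    # =", (, <, or a space; the prefix character is part of the match.
    URL_REGEX = r'(?:\=\"|\(|\<| )(?:https?|ftp)://[^\s`\'\"\]\)>}]+'

    text = '="https://www.google.com" and (https://www.github.com) but foo:https://example.com'
    print(re.findall(URL_REGEX, text))
    # ['="https://www.google.com', '(https://www.github.com']
    # The third URL carries none of the accepted prefixes, so it is ignored;
    # the matched prefixes are stripped out later by search_file().
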
2 changes: 1 addition & 1 deletion linkie/__init__.py
@@ -28,4 +28,4 @@
}
})

__version__ = '1.3.1'
__version__ = '1.4.0'
100 changes: 65 additions & 35 deletions linkie/linkie.py
@@ -6,12 +6,18 @@
import yaml
import logging
import requests
from multiprocessing.dummy import Pool as ThreadPool

# This isn't a perfect URL matcher, but should catch the large majority of URLs.
URL_REGEX = r'(?:https?|ftp)://[^\s`\'"\]\)>}]+'
# This now matches URLs presented in the format defined in the CSU Writing Guide
# as of 13/11/2018 (https://cs-unplugged.readthedocs.io/en/latest/author/writing_guide.html)
# as well as formats used in the code for the guide itself.
# As such, URLs will be matched if and only if they have any one of the following prefixes: [=", (, <, ' ' (a space)]
URL_REGEX = r'(?:\=\"|\(|\<| )(?:https?|ftp)://[^\s`\'\"\]\)>}]+'
HEADERS = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36'
}
THREADS = 12


class Linkie:
@@ -20,7 +26,9 @@ def __init__(self, config=None, config_file_path=None):
self.file_count = 0
self.status_counts = {}
self.urls = dict()
self.unchecked_urls = set()
self.directory = '.'
self.pool = ThreadPool(THREADS)
if not config and config_file_path:
logging.info('Using Linkie configuration file {}'.format(config_file_path))
config = self.read_config(config_file_path)
@@ -92,6 +100,7 @@ def count_broken_links(self):

def run(self):
self.traverse_directory()
self.traverse_connection_errors()
self.print_summary()
if self.count_broken_links():
return 1
@@ -110,9 +119,23 @@ def traverse_directory(self):

for filename in files:
if filename.endswith(self.config['file-types']):
self.check_file(os.path.join(directory_root, filename))
self.search_file(os.path.join(directory_root, filename))
self.pool.map(self.check_link, self.unchecked_urls)

def check_file(self, file_path):
def traverse_connection_errors(self):
connect_errors = []
for url, url_data in self.urls.items():
if str(url_data['status']) == 'ConnectionError':
connect_errors.append(url)
for url in connect_errors:
del self.urls[url]
if len(connect_errors):
logging.info('Rechecking {} link{} that returned ConnectionError... '.format(len(connect_errors), 's' if len(connect_errors) != 1 else ''))
self.pool = ThreadPool(min(THREADS, len(connect_errors)))
self.pool.map(self.check_link, connect_errors)


def search_file(self, file_path):
self.file_count += 1
file_message = 'Checking file {} for URLs... '.format(file_path)
file_object = open(file_path, 'r')
@@ -121,49 +144,56 @@ def check_file(self, file_path):
urls = re.findall(URL_REGEX, file_contents)
logging.info('{}{} URL{} found'.format(file_message, len(urls), 's' if len(urls) != 1 else ''))
for url in urls:
# Remove trailing characters
url = url.rstrip('> !"#$%&\'*+,-./@:;=^_`|~').lstrip(' <(=\"')
# Remove extra trailing bracket if the link contains brackets
# Within Markdown link syntax.
# within Markdown link syntax.
# [Wikipedia link](http://foo.com/blah_blah_(wikipedia))
if url.count('('):
url += url.count('(') * ')'
# Remove trailing characters
url = url.rstrip('!"#$%&\'*+,-./@:;=^_`|~')
message = ' - Checking URL {} '.format(url)
if url in self.config['skip-urls']:
message += '= skipping URL (as defined in config file)'
elif url not in self.urls:
try:
status_code = requests.head(url, headers=HEADERS).status_code
# If response doesn't allow HEAD request, try GET request
if status_code >= 400:
status_code = requests.get(url, headers=HEADERS).status_code
# If connection error
except Exception as e:
status_code = str(type(e).__name__)

if type(status_code) == str:
message += '= {}'.format(status_code)
else:
message += '= {} status'.format(status_code)

if type(status_code) == str or status_code >= 400:
self.save_url(url, status_code, True)
else:
self.save_url(url, status_code, False)
status_code = str(status_code)
self.status_counts[status_code] = self.status_counts.get(status_code, 0) + 1
self.unchecked_urls.add(url)

def check_link(self, url):
message = ' - Checking URL {} '.format(url)
if url in self.config['skip-urls']:
message += '= skipping URL (as defined in config file)'
elif url not in self.urls:
try:
status_code = requests.head(url, headers=HEADERS).status_code
# If response doesn't allow HEAD request, try GET request
if status_code >= 400:
status_code = requests.get(url, headers=HEADERS).status_code
# If connection error
except Exception as e:
status_code = str(type(e).__name__)

if type(status_code) == str:
message += '= {}'.format(status_code)
else:
message += '= {} (already checked)'.format(self.urls[url]['status'])
logging.info(message)
message += '= {} status'.format(status_code)

if type(status_code) == str or status_code >= 400:
self.save_url(url, status_code, True)
else:
self.save_url(url, status_code, False)
else:
message += '= {} (already checked)'.format(self.urls[url]['status'])
logging.info(message)

def save_url(self, url, status_code, broken):
self.urls[url] = {
'broken': broken,
'status': status_code,
}

def collect_status_counts(self):
for _, url_data in self.urls.items():
status_code = str(url_data['status'])
self.status_counts[status_code] = self.status_counts.get(status_code, 0) + 1

def print_summary(self):
number_broken_links = self.count_broken_links()
self.collect_status_counts()

logging.info('=============================================')
logging.info('SUMMARY')
@@ -181,12 +211,12 @@ def print_summary(self):
logging.info('Status 999 refers to a connection error.')

logging.info('---------------------------------------------')
logging.info('Broken links:')
logging.info('Broken links')
logging.info('---------------------------------------------')
if self.count_broken_links():
if number_broken_links:
for url, url_data in self.urls.items():
if url_data['broken']:
logging.info(url)
logging.info('{}: {}'.format(url_data['status'], url))
else:
logging.info('No broken links found!')

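The threaded checking above relies on multiprocessing.dummy, which provides the multiprocessing Pool API backed by threads rather than processes. A minimal standalone sketch of the same pattern, including the ConnectionError recheck that traverse_connection_errors performs (the URLs and the timeout are assumptions for the sketch, not taken from the commit):

    import requests
    from multiprocessing.dummy import Pool as ThreadPool

    THREADS = 12

    def check(url):
        try:
            # timeout added for the sketch; linkie calls requests.head without one
            return url, requests.head(url, timeout=10).status_code
        except Exception as e:
            # As in linkie, a failed request is recorded under the exception name
            return url, type(e).__name__

    urls = {'https://www.google.com', 'https://www.github.com'}  # hypothetical
    results = dict(ThreadPool(THREADS).map(check, urls))

    # Connection errors are often transient, so recheck them once
    retry = [u for u, s in results.items() if s == 'ConnectionError']
    if retry:
        results.update(ThreadPool(min(THREADS, len(retry))).map(check, retry))
    print(results)
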
4 changes: 2 additions & 2 deletions linkie/tests/assets/basic/file.txt
@@ -1,2 +1,2 @@
https://www.google.com
https://www.github.com
="https://www.google.com"
(https://www.github.com)
3 changes: 2 additions & 1 deletion linkie/tests/assets/broken/file.txt
@@ -1 +1,2 @@
https://www.googoogllelce.com
(https://www.googoogllelce.com)
[https://github.com]
@@ -1 +1 @@
https://www
="https://www
@@ -1 +1 @@
https://www
="https://www
2 changes: 1 addition & 1 deletion linkie/tests/assets/file_types/file.html
@@ -1 +1 @@
https://www.google.com
="https://www.google.com
2 changes: 1 addition & 1 deletion linkie/tests/assets/file_types/file.md
@@ -1 +1 @@
https://www.google.com
="https://www.google.com
2 changes: 1 addition & 1 deletion linkie/tests/assets/file_types/file.rst
@@ -1 +1 @@
https://www.google.com
="https://www.google.com
2 changes: 1 addition & 1 deletion linkie/tests/assets/file_types/file.skipped
@@ -1 +1 @@
https://www
="https://www
2 changes: 1 addition & 1 deletion linkie/tests/assets/file_types/file.txt
@@ -1 +1 @@
https://www.google.com
="https://www.google.com
2 changes: 1 addition & 1 deletion linkie/tests/assets/file_types_custom/file.special
@@ -1 +1 @@
https://www
="https://www
2 changes: 1 addition & 1 deletion linkie/tests/assets/multiple/file1.txt
@@ -1 +1 @@
https://www.google.com
="https://www.google.com
2 changes: 1 addition & 1 deletion linkie/tests/assets/multiple/file2.txt
@@ -1 +1 @@
https://www.github.com
="https://www.github.com
4 changes: 2 additions & 2 deletions linkie/tests/assets/skip_urls/file.txt
@@ -1,2 +1,2 @@
https://www.google.com
https://www.github.com
="https://www.google.com"
<https://www.github.com>
4 changes: 2 additions & 2 deletions linkie/tests/assets/skip_urls_custom/file.txt
@@ -1,2 +1,2 @@
https://www.google.com
https://www.github.com
(https://www.google.com)
="https://www.github.com"
7 changes: 2 additions & 5 deletions requirements.txt
@@ -1,5 +1,2 @@
# Core requirements that also require updating in setup.py
requests==2.19.1
PyYaml==4.2b4

# Developer requirements
PyYaml==5.1.1
requests==2.22.0
