Skip to content

Commit

Permalink
add broken links script with github action cron job
Browse files Browse the repository at this point in the history
  • Loading branch information
Fbasham committed Nov 16, 2023
1 parent 214ae69 commit 3889bdd
Showing 1 changed file with 62 additions and 0 deletions.
62 changes: 62 additions & 0 deletions .github/scripts/broken_links.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import os
import requests
from bs4 import BeautifulSoup

# Crawl the site starting at BASE_URL and email any broken links via GC Notify.
#
# Strategy: synchronous depth-first crawl over a stack of (url, follow) pairs.
# Internal pages (follow=True) are fetched AND parsed for further links;
# external links (follow=False) are only status-checked, never followed.
BASE_URL = 'https://alpha.service.canada.ca'
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'}
# Per-request timeout in seconds; without it requests.get can hang the
# cron job indefinitely on an unresponsive host.
REQUEST_TIMEOUT = 30

try:
    stack = [(BASE_URL, True)]  # (url, follow) pairs still to visit
    seen = set()                # URLs already fetched (avoid re-visits / cycles)
    out = set()                 # URLs that came back broken

    while stack:
        url, follow = stack.pop()

        if url in seen:
            continue

        seen.add(url)

        try:
            r = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
        except requests.RequestException:
            # Connection failures / timeouts count as broken links and must
            # not abort the rest of the crawl.
            out.add(url)
            continue

        if not r.ok:
            out.add(url)

        if not follow:
            continue

        soup = BeautifulSoup(r.text, 'html.parser')
        body = soup.find('body')
        if body is None:
            # Non-HTML (or bodyless) response: nothing to extract links from.
            continue

        for anchor in body.find_all('a'):
            href = anchor.get('href')
            if not href:
                # <a> tags without an href (or href="") would crash .startswith
                continue
            if href.startswith('http') and BASE_URL not in href:
                stack.append((href, False))   # external: check only
            elif BASE_URL in href:
                stack.append((href, True))    # absolute internal: follow
            elif href.startswith('/'):
                stack.append((f'{BASE_URL}{href}', True))  # relative internal

    # If broken links were found, send an email via GC Notify.
    if out:
        r = requests.post(
            f"{os.environ.get('NOTIFY_BASE_API_URL')}/v2/notifications/email",
            headers={
                **HEADERS,
                'Content-Type': 'application/json',
                'Authorization': f"ApiKey-v1 {os.environ.get('BROKEN_LINKS_API_KEY')}"
            },
            json={
                'email_address': os.environ.get('BROKEN_LINKS_EMAIL'),
                'template_id': os.environ.get('BROKEN_LINKS_TEMPLATE_ID'),
                'personalisation': {
                    'links': list(out)
                }
            },
            timeout=REQUEST_TIMEOUT
        )

except Exception as e:
    print('something went wrong...')
    print(e)
    # Re-raise so the process exits non-zero and the GitHub Action run is
    # marked failed — otherwise failures are silently swallowed and the cron
    # job always looks green.
    raise

0 comments on commit 3889bdd

Please sign in to comment.