#! /usr/bin/env python
from __future__ import unicode_literals
import mwclient
import mwparserfromhell
import urlparse
import datetime
from theobot import bot
from theobot import password
from bs4 import BeautifulSoup
import requests
import MySQLdb
import re
import difflib
import urllib
import sys
import time
# CC-BY-SA Theopolisme
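#
# Overview of what this script does:
#   * Queries the enwiki database replica for mainspace pages whose external
#     links carry Google Analytics "utm_*" tracking parameters.
#   * For each such link, the target URL is fetched; if it resolves, the link
#     is rewritten to the page's declared canonical URL or, failing that, the
#     utm_* parameters are stripped from its query string.
#   * Links that do not resolve are tagged with {{Dead link}} instead.
#   * Edits are saved as Theo's Little Bot, subject to the task's disable page
#     and per-page exclusion checks from theobot.bot.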
MONTHYEAR = datetime.date.today().strftime("%B %Y")


def process(page):
    contents = page.edit()
    contents_compare = contents
    links = site.api('parse', text=contents, prop="externallinks")['parse']['externallinks']
    for link in links:
        if link.find("utm") != -1:
            try:
                req = requests.get(link)
            except requests.exceptions.RequestException:
                # Connection error; it is probably a dead link, but we can't verify that for sure.
                # At this point it's out of scope to add advanced dead link handling/tagging.
                continue
            if req.status_code == requests.codes.ok:
                # The URL resolves: prefer the page's declared canonical URL,
                # otherwise strip the utm_* parameters from the query string.
                html_doc = req.text
                soup = BeautifulSoup(html_doc)
                canonical = soup.find("link", rel="canonical")
                if canonical is not None and canonical.has_attr('href'):
                    origurl = urlparse.urlsplit(link)
                    base_url = urlparse.urlunsplit((origurl[0], origurl[1], '', '', ''))
                    newurl = urlparse.urljoin(base_url, canonical['href'])
                else:
                    parsed_url = list(urlparse.urlparse(link))
                    parsed_url[4] = '&'.join([x for x in parsed_url[4].split('&') if not x.startswith('utm_')])
                    newurl = urlparse.urlunparse(parsed_url)
                if contents.find(link) != -1:
                    matched_link = link
                else:
                    # The API sometimes normalizes the link text, so re-encode the
                    # query string the same way before searching the wikitext.
                    parsed = urlparse.urlsplit(link)
                    matched_link = urlparse.urlunsplit([parsed[0], parsed[1], parsed[2], urllib.urlencode(urlparse.parse_qsl(parsed[3])), parsed[4]])
                contents = contents.replace(matched_link, newurl)
            else:
                # Skip links that have already been tagged or explicitly marked as skipped.
                if contents.find("<!-- Remove this comment when fixing the dead link: "+link+" -->") == -1 and contents.find("<!-- Theo's Little Bot skip this link: "+link+" -->") == -1:
                    wikicode = mwparserfromhell.parse(contents)
                    templated = False
                    # If the link is inside a template, then add {{dead link}} immediately after the template.
                    for template in wikicode.filter_templates(recursive=True):
                        if link in template:
                            templated = True
                            wikicode.insert_after(template, " <!-- Remove this comment when fixing the dead link: "+link+" -->{{Dead link|date="+MONTHYEAR+"|bot=Theo's Little Bot}}")
                    if templated == True:
                        contents = unicode(wikicode)
                    else:
                        # Otherwise, just add {{dead link}} right after the link and hope for the best.
                        contents = re.sub('('+re.escape(link)+r"""(?:.*])?)""",
                                          r"\1 <!-- Remove this comment when fixing the dead link: "+link+" -->{{Dead link|date="+MONTHYEAR+"|bot=Theo's Little Bot}}",
                                          contents,
                                          flags=re.UNICODE|re.DOTALL)
                else:
                    print "The dead link was already tagged."
    if contents == contents_compare:
        return False
    #diff = difflib.unified_diff(contents_compare.splitlines(), contents.splitlines(), lineterm='')
    #print '\n'.join(list(diff))
    #print "---------"
    try:
        page.save(contents, "[[WP:BOT|Bot]]: Removing Google Analytics tracking codes ([[User:Theo's Little Bot/disable/tracking|disable]])")
    except mwclient.errors.MaximumRetriesExceeded:
        # Patiently wait and hope for the best once again; if it doesn't work this time, just give up.
        time.sleep(600)
        page.save(contents, "[[WP:BOT|Bot]]: Removing Google Analytics tracking codes ([[User:Theo's Little Bot/disable/tracking|disable]])")
    return True
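

# Illustrative sketch of the query-string fallback used in process() above;
# the URL is made up, and this snippet is not executed by the bot itself:
#
#   >>> link = 'http://example.com/article?id=7&utm_source=wiki&utm_medium=link'
#   >>> parsed_url = list(urlparse.urlparse(link))
#   >>> parsed_url[4] = '&'.join([x for x in parsed_url[4].split('&')
#   ...                           if not x.startswith('utm_')])
#   >>> urlparse.urlunparse(parsed_url)
#   'http://example.com/article?id=7'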


def main():
    global site
    site = mwclient.Site('en.wikipedia.org')
    site.login(password.username, password.password)
    print "And we're live."
    connection = MySQLdb.connect(
        host='enwiki.labsdb',
        db='enwiki_p',
        read_default_file='~/replica.my.cnf'
    )
    # Each run pulls up to 5,000 candidate articles from the replica and works
    # through them in order, re-checking the on-wiki disable page as it goes.
    # !todo figure out how long a run takes vs replag
    # and then optimize crontab
    cursor = connection.cursor()
    query = """\
    SELECT page_title
    FROM externallinks
    JOIN page
    ON page_id = el_from
    WHERE el_to LIKE "%&utm_%=%"
    AND page_namespace = 0
    LIMIT 5000;
    """
    cursor.execute(query)
    donenow = 0
    for title in cursor.fetchall():
        title = title[0].decode("utf-8")  # since tuples are returned
        if bot.donenow("User:Theo's Little Bot/disable/tracking", donenow=donenow, donenow_div=5) == True:
            if bot.nobots(page=title, task='tracking') == True:
                if process(site.Pages[title]) == True:
                    donenow += 1
                else:
                    print "No changes to make."
            else:
                print "Bot was denied, boo hoo."
        else:
            print "Bot was disabled...shutting down..."
            sys.exit()


if __name__ == '__main__':
    main()
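

# A minimal sketch of driving process() by hand for a single page, assuming the
# theobot credentials in password.py are valid; the article title is just a
# placeholder, and a real call will save an edit if anything needed fixing:
#
#   >>> import mwclient, nomoretracking
#   >>> from theobot import password
#   >>> nomoretracking.site = mwclient.Site('en.wikipedia.org')
#   >>> nomoretracking.site.login(password.username, password.password)
#   >>> nomoretracking.process(nomoretracking.site.Pages['Some article'])
#   True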