-
Notifications
You must be signed in to change notification settings - Fork 0
/
urlextractor.py
50 lines (35 loc) · 1018 Bytes
/
urlextractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import re
import tldextract
import esm
def extractUrl(text, match):
    """Expand a TLD hit inside *text* into the URL that surrounds it.

    Args:
        text: The string that was searched. Only lowercase ASCII letters,
            digits, ``-`` and ``.`` are treated as host characters.
        match: A ``((start, end), tld)`` pair. ``text[start:end]`` is
            expected to be the matched TLD, which is how ``parseText``
            passes index hits in.

    Returns:
        ``None`` when the character right after the hit is itself a host
        character, i.e. the hit is only the prefix of a longer label
        (``.com`` inside ``.community``, ``.co`` inside ``.co.uk``).
        Otherwise ``((start, end), url)``; provided the incoming span
        covers ``tld``, ``text[start:end] == url``.
    """
    span, tld = match[0], match[1]
    startpt, endpt = span[0], span[1]

    posttld = None
    if len(text) > endpt:
        # A host character directly after the hit means the real label is
        # longer than this TLD, so this hit is rejected outright.
        if re.match(r"[a-z0-9.-]", text[endpt]):
            return None
        # Optional ":port", then a run of path/query characters and an
        # optional closing "]". The "[" is escaped so newer Python versions
        # do not emit a "possible nested set" FutureWarning.
        posttld = re.match(r":?[0-9]*[/\[!#$&-;=?a-z]+\]?", text[endpt:])

    url = tld

    # Host part: the run of host characters that ends exactly where the TLD
    # begins. "\Z" (not "$") is used because "$" also matches just before a
    # trailing newline, which would glue a host from the previous line onto
    # the TLD and corrupt the returned span.
    pretld = re.search(r"[a-z0-9.-]+\Z", text[:startpt])
    if pretld:
        url = pretld.group(0) + url
        startpt -= len(pretld.group(0))

    if posttld:
        url += posttld.group(0)
        endpt += len(posttld.group(0))

    # Trailing sentence punctuation is not part of the URL. Shrink the span
    # by the same amount so that it keeps describing exactly the URL text.
    stripped = url.rstrip(",.")
    endpt -= len(url) - len(stripped)
    return (startpt, endpt), stripped
def parseText(text):
    """Find URL candidates in *text* by locating known TLDs.

    Every TLD reported by tldextract is entered into an ``esm`` index with a
    leading dot, the index is queried against *text*, and each hit is handed
    to ``extractUrl``. Hits that ``extractUrl`` rejects are dropped.

    Args:
        text: The string to scan.

    Returns:
        A list of the truthy values returned by ``extractUrl``, in the
        order the index reported the hits.
    """
    # NOTE(review): the index is rebuilt from the full TLD list on every
    # call; nothing here caches it.
    suffixes = tldextract.TLDExtract()._get_tld_extractor().tlds

    index = esm.Index()
    for suffix in suffixes:
        # NOTE(review): if `suffix` is a Python 3 str, encode() yields bytes
        # and this concatenation raises TypeError -- confirm the target
        # interpreter before porting.
        index.enter("." + suffix.encode("idna"))
    index.fix()

    hits = index.query(text)
    candidates = [extractUrl(text, hit) for hit in hits]
    # Keep only the hits that were expanded into a URL.
    return [candidate for candidate in candidates if candidate]