-
Notifications
You must be signed in to change notification settings - Fork 0
/
tbc-scraper.py
executable file
·91 lines (72 loc) · 2.56 KB
/
tbc-scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import os
import pandas as pd
from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.firefox.options import Options
from bs4 import BeautifulSoup as bs
def getDiff(string, page=None):
    """Return the difficulty text found in the span with CSS class *string*.

    The difficulty colors map to CSS classes as follows:
        r1 = orange
        r2 = yellow
        r3 = green
        r4 = gray

    Args:
        string: CSS class of the ``<span>`` to look up (e.g. ``"r1"``).
        page: Parsed document to search; any object offering a
            BeautifulSoup-style ``find(name, class_=...)`` method. When
            omitted, the module-level ``soup`` is used, which is how the
            function has always been called.

    Returns:
        The text of the first matching span, or the integer 0 when the
        document contains no such span.
    """
    if page is None:
        # Backward-compatible path: fall back to the global that the
        # scraping loop assigns before calling this function.
        page = soup
    color = page.find("span", class_=string)
    if color is None:
        return 0
    return color.get_text()
# Professions to process. Each name is used as the file stem of both the
# spell-ID input file and the exported CSV; the trailing number is the
# entry's index in this list.
profNames = [
    "first-aid",       # 0
    "blacksmithing",   # 1
    "engineering",     # 2
    "enchanting",      # 3
    "jewelcrafting",   # 4
    "leatherworking",  # 5
    "tailoring",       # 6
    "mining",          # 7
    "cooking",         # 8
    "alchemy",         # 9
    "poisons",         # 10
]
# Get list of spell IDs for all professions.
# profSpellList[k] holds the IDs read from spell-ids/<profNames[k]>.txt,
# so both lists share the same ordering.
profSpellList = []
for prof in profNames:
    fileName = 'spell-ids/' + prof + '.txt'
    # The context manager closes each file as soon as it has been read,
    # instead of leaving one open handle per profession.
    with open(fileName, "r") as file:
        # One spell ID per line. Strip surrounding whitespace and drop
        # blank lines so they do not turn into empty IDs (and bogus URLs).
        stripped = (line.strip() for line in file)
        spellList = [spellID for spellID in stripped if spellID]
    profSpellList.append(spellList)
# Scrape wowhead for remaining data.
#
# For every profession, each spell page is loaded in headless Firefox, the
# spell name and the four difficulty values are collected, and the rows are
# exported to spells-csv/<profession>.csv as tab-separated values.
options = Options()
# Pass the flag explicitly: the `options.headless` setter was deprecated in
# Selenium 4 and dropped in later releases, whereas the argument works on all.
options.add_argument("-headless")
s = Service(executable_path='geckodriver', log_path='/dev/null')  # Prevent geckodriver logging

# to_csv() does not create missing directories, so make sure the target exists.
os.makedirs('spells-csv', exist_ok=True)

# A single browser serves every page. Launching Firefox once per spell was the
# slowest step by far, and the process leaked whenever a page failed mid-scrape.
browser = webdriver.Firefox(options=options, service=s)
try:
    for i, prof in enumerate(profSpellList):
        print("* Getting %s spells... (%d out of %d professions)" % (profNames[i], i+1, len(profSpellList)))
        print("-----------------------------------------------------------")
        profSpells = []

        for j, spellID in enumerate(prof):
            URL = 'https://tbc.wowhead.com/spell=' + spellID
            browser.get(URL)
            # getDiff() reads this module-level name, so it must be (re)bound
            # here before the difficulty lookups below.
            soup = bs(browser.page_source, 'lxml')

            # Get data related to the tradeSkill spell.
            heading = soup.find('h1', class_='heading-size-1')
            if heading is None:
                # Unknown ID or unexpected page layout: skip this spell rather
                # than crash and lose everything scraped so far.
                print("Skipped %s \t(%d/%d) - no spell heading found" % (URL, j+1, len(prof)))
                continue
            spellName = heading.get_text()

            orange = getDiff("r1")
            yellow = getDiff("r2")
            green = getDiff("r3")
            gray = getDiff("r4")

            # Store in main spell list; order matches the DataFrame columns below.
            profSpells.append([spellID, orange, yellow, green, gray, spellName])
            print("Scraped %s \t(%d/%d)" % (URL, j+1, len(prof)))

        spellDF = pd.DataFrame(profSpells, columns=['spellID', 'orange', 'yellow', 'green', 'gray', 'spellName'])

        # Export profession to CSV
        spellDF.to_csv('spells-csv/%s.csv' % (profNames[i]), header=True, index=False, sep='\t')
        print("Exported %s.csv" % (profNames[i]))
finally:
    # Always shut Firefox/geckodriver down, even when a page load raises.
    browser.quit()
print("Done")