-
Notifications
You must be signed in to change notification settings - Fork 0
/
mtg scraper.py
142 lines (104 loc) · 4.02 KB
/
mtg scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
# -*- coding: utf-8 -*-
"""
Created on Fri Sep 10 15:28:33 2021
@author: Brandon Imstepf
"""
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import os
import re
#------------------------------------------------------------------------------#
# #
# What is the most expensive color to play? What is the least? Let's find out. #
# #
#------------------------------------------------------------------------------#
# Every relative path used from here on is resolved inside the case-study folder.
os.chdir("Case Study 2 - MTG")
# Single-file variant, kept for reference:
#location = '2 GW Company.htm'
#locn = location.replace('.htm', '') #for export later

# One independent, initially empty list per output column.
names, namelen, colors, prices, cmcs, rarities = ([] for _ in range(6))
#marker text searched for in the mana-cost cell, paired with the color letter
#it contributes; the order here fixes the letter order of the color string
COLOR_MARKERS = (
    ('ms-u', 'U'),
    ('ms-r', 'R'),
    ('ms-b', 'B'),
    ('ms-g', 'G'),
    ('ms-w', 'W'),
)


def parse_mana_cost(cost_html):
    """Return ``(color, cmc)`` derived from the text of a mana-cost cell.

    Parameters:
        cost_html: the mana-cost ``<td>`` converted to a string. When the
            cell was not found this is ``'None'``, which yields ``('V', 0)``.

    Returns:
        color: the letters U, R, B, G, W (in that order) for every
            ``ms-<letter>`` marker present in the text, or ``'V'`` when no
            marker is present.
        cmc: the first number that follows ``ms-`` (0 when there is none)
            plus 1 for each color that is present.

    NOTE(review): a color adds 1 no matter how many times its marker occurs,
    only the first numeric marker is used, and the markers are matched as
    plain substrings -- confirm this is the intended cost calculation.
    """
    generic = re.findall(r'ms-(\d+)', cost_html)
    #no numeric marker means no generic mana
    cmc = int(generic[0]) if generic else 0
    color = ''
    for marker, letter in COLOR_MARKERS:
        if marker in cost_html:
            color += letter
            cmc += 1
    return color or 'V', cmc


#this is to loop through the whole folder and scrape each .htm file
for filename in os.listdir():
    if not filename.endswith(".htm"):
        continue
    print(filename)
    #parse inside a with-block so the file handle is closed once the
    #document has been read, instead of being left open for every file
    with open(filename, encoding="utf8") as page:
        soup = BeautifulSoup(page, "html.parser")
    #every <tr class="cardItem"> row is treated as one card entry
    for container in soup.find_all('tr', class_='cardItem'):
        #Name (text of the row's first link) and its length
        name = container.a.text
        names.append(name)
        namelen.append(len(name))
        #Price, with the newlines stripped out
        price = container.find('span', class_='paper option').text
        prices.append(price.replace('\n', ''))
        #Rarity: what is left of the <span> markup inside the "number" cell
        #after removing the fixed opening and closing text
        rarity = str(container.find('td', class_='number').span)
        rarity = rarity.replace('<span class="rarity ', '')
        rarity = rarity.replace('"></span>', '')
        rarities.append(rarity)
        #Color and CMC, both read from the mana-cost cell's markup
        cost_cell = container.find('td', class_='manaCost', style="")
        color, cmc = parse_mana_cost(str(cost_cell))
        colors.append(color)
        cmcs.append(cmc)
#gather the scraped columns into a single table, one row per scraped entry
cards = pd.DataFrame(dict(
    name=names,
    namelength=namelen,
    price=prices,
    color=colors,
    cmc=cmcs,
    rarity=rarities,
))
#the lengths come from len() and are already integers; the cast only pins
#the column to an integer dtype
cards = cards.astype({'namelength': int})
#how often each identical row occurs before de-duplication
tally_before = cards.value_counts()
print(tally_before)
cards = cards.drop_duplicates()
print("---------HERE WE REMOVE EXTRA CARDS----------")
#same tally again, now that repeated rows are gone
tally_after = cards.value_counts()
print(tally_after)
print(cards)
#export the de-duplicated table
cards.to_csv('mtgData.csv')
#cards.to_csv(locn + ' mtg.csv')