forked from natpoor/GADS-WoWAH-parser
-
Notifications
You must be signed in to change notification settings - Fork 3
/
wowah-parser.py
143 lines (116 loc) · 6.03 KB
/
wowah-parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
# Forked and updated by Myles O'Neill (May 2016) to collect more information from the original files.
# Nov 2013 (originally Sept 2013).
# This script will (hopefully) read all of the little WoW AH files and combine them into one CSV.
# One file to unite them all! (It is about 3.5 GB of text.)
# What it needs to do:
#   - Read the directory and file name structure so nothing has to be typed in manually.
#   - Turn that into a simple data structure that the program can scan through.
#   - Write the results to a CSV.
# Data: http://mmnet.iis.sinica.edu.tw/dl/wowah/
# PACKAGES
import csv
import re
import os # This is for os.listdir
import os.path # This is for the other dir stuff.
import string # Maybe for directory name cycling etc.
import time # For timing how long it takes.
# VARIABLES
max_char = 0  # Tracks char ID for error testing.
max_guild = 0  # Tracks guild ID for error testing.
mfc = 0  # Tracks how many little files were counted.

# open() and os.listdir() do not expand '~' themselves, so the home-relative
# defaults are expanded here; otherwise they would refer to a literal '~' folder.
write_data_loc = os.path.expanduser('~/wowah/output/')  # Adjust to your dir as needed.
write_data_file = 'wowah_data.csv'
write_data_filename = write_data_loc + write_data_file
# This is where the WoWAH folders are located, adjust as needed.
# Have them in their own subdir.
the_dir = os.path.expanduser('~/wowah/WoWAH/')

# REGEX
# Matches one quoted record line of a WoWAH log file.
line_re = re.compile(r'^.*"[\d+],\s(.*),\s?(\d*),\s?(\d*),\s?(\d*),\s?(\d*),\s?([A-Z].*),\s?([A-Z].*),\s?([A-Z].*),\s?([A-Z].*)".*$')
# REGEX NOTES
# Capture groups:
#   1 = query time (timestamp), 2 = query sequence number, 3 = avatar (char) ID,
#   4 = guild (may be empty), 5 = level, 6 = race, 7 = charclass, 8 = zone,
#   9 = everything after the zone up to the last double quote (the trailing dummies).
# The leading dummy field is matched by [\d+] and is not captured.
# NOTE(review): [\d+] is a character class matching exactly ONE character
# (a digit or a literal '+'); \d+ was probably intended. Left unchanged here.
# NOTE(review): the pattern requires four comma-separated fields that start
# with a capital letter, so the older format shown first below (empty second
# to last field) does not match it -- confirm whether that is intended.
# Sample lines:
# [1] = "0, 03/30/06 23:59:49, 1,10772, , 1, Orc, Warrior, Orgrimmar, , 0",
# "0, 01/10/09 00:03:50, 1,55517, , 3, Orc, Warlock, Orgrimmar, WARLOCK, 0", -- [1]
# "0, 01/10/09 00:04:10, 5,4002,1, 75, Orc, Hunter, Zul'Gurub, HUNTER, 0", -- [26]
# "0, 01/10/09 00:04:10, 5,78122,342, 80, Orc, Hunter, The Storm Peaks, HUNTER, 0", -- [32]
# "0, 01/10/09 00:08:04, 51,64635,161, 80, Blood Elf, Paladin, The Obsidian Sanctum, PALADIN, 0", -- [447]
# Field order in the raw data:
# dummy, query time, query sequence number, avatar ID, guild, level, race, class, zone, dummy, dummy
# FUNCTIONS
def get_subdirs(the_folder):
    """Return the names of the non-hidden entries found in the_folder.

    The raw directory listing is printed first (for error control), then
    every name starting with '.' (for example '.DS_Store') is dropped.
    Despite the name, no directory check is made: any non-hidden entry,
    file or folder, is returned.

    Args:
        the_folder: Path of the directory to list.

    Returns:
        A new list of entry names, in the order os.listdir() produced them.

    Raises:
        OSError: If the_folder cannot be listed.
    """
    entries = os.listdir(the_folder)
    # Printing for error control.
    print('From get_subdirs, a list is:  %s' % (entries,))
    # Build a new list instead of removing items from the list being
    # iterated: removing in place skips the entry that follows each removed
    # one, so consecutive hidden names used to slip through.
    return [name for name in entries if not name.startswith('.')]
# End of get_subdirs
# '.DS_Store'
def get_file_list(the_folder):
    """Return the names of the non-hidden entries found in the_folder.

    Same filtering as get_subdirs (names starting with '.' are dropped),
    but without printing the listing.

    Args:
        the_folder: Path of the directory to list.

    Returns:
        A new list of entry names, in the order os.listdir() produced them.

    Raises:
        OSError: If the_folder cannot be listed.
    """
    # Filter into a new list: removing items from a list while iterating
    # over it skips the element after each removal, which let some hidden
    # files through.
    return [name for name in os.listdir(the_folder) if not name.startswith('.')]
# End of get_file_list
def parse_and_write(file, output_file):
    """Parse raw WoWAH log lines and append one CSV row per matching line.

    Each line is matched against the module-level line_re pattern. For a
    match, a row "char,level,race,charclass,zone,guild,timestamp" is written
    to output_file and the timestamp is printed as a progress indicator.
    Lines that do not match are reported on stdout and skipped.

    Args:
        file: Iterable of text lines (a list of lines or an open file).
        output_file: Object with a write() method that receives the rows.
    """
    for line in file:  # Non-record lines (e.g. a bare hard return) just fail to match.
        data = line_re.match(line)
        if data is None:
            print("Didn't match the regex.")
            continue

        timestamp = data.group(1)
        # An empty guild field means the character has no guild; record it as
        # '-1'. Compare by value (truthiness), not identity: "is not ''" only
        # works when the interpreter happens to share empty-string objects.
        # Note there are some missing values, i.e. errant -1.
        guild = data.group(4) or '-1'

        # This is so you can keep track of where it is. Max Jan 2009 IIRC.
        print(timestamp)

        row = (
            data.group(3),  # char
            data.group(5),  # level
            data.group(6),  # race
            data.group(7),  # charclass
            data.group(8),  # zone
            guild,
            timestamp,
        )
        output_file.write(','.join(row) + '\n')
# End of parse_and_write
# Note the two diff formats they use in the files, it changes partway through:
# [1] = "0, 03/30/06 23:59:49, 1,10772, , 1, Orc, Warrior, Orgrimmar, , 0",
# "0, 01/10/09 00:03:50, 1,55517, , 3, Orc, Warlock, Orgrimmar, WARLOCK, 0", -- [1]
# dummy, query time, query sequence number, avatar ID, guild, level, race, class, zone, dummy, dummy
def read_tree(output_file):
    """Walk the_dir (month folders -> day folders -> files) and parse every file.

    Every non-hidden file found two levels below the_dir is read and handed
    to parse_and_write together with output_file. Entries at the month or
    day level that are not directories are skipped. A file that cannot be
    opened or read is reported on stdout and skipped; the walk continues.

    Args:
        output_file: Writable object passed through to parse_and_write.
    """
    # The subdirs should be in their own location, set via the_dir in the
    # variables section up top.
    for month_name in get_subdirs(the_dir):
        # os.path.join works whether or not the_dir ends with a separator.
        month_path = os.path.join(the_dir, month_name)
        if not os.path.isdir(month_path):
            continue  # Stray file next to the month folders; nothing to list.

        for day_name in get_subdirs(month_path):
            day_path = os.path.join(month_path, day_name)
            if not os.path.isdir(day_path):
                continue

            for file_name in get_file_list(day_path):
                file_path = os.path.join(day_path, file_name)
                # Keep the try body to the read itself so that problems while
                # writing the output are not misreported as input errors.
                try:
                    with open(file_path, 'r') as f:
                        lines = f.readlines()
                except IOError as err:
                    # Report the caught error instance so the actual reason
                    # is shown, not just the exception class.
                    print('Error opening hoped for data-text file, %s, reason: %s'
                          % (file_path, err))
                    continue
                parse_and_write(lines, output_file)
# End of read_tree
def main():
    """Build the combined CSV: write the header, parse the tree, report the time.

    The output file (write_data_filename) is opened in append mode. The
    header row is written only when the file is new or empty, so running the
    script again does not put a second header line in the middle of the data.
    Elapsed time of the parse is printed at the end.
    """
    # Decide about the header before opening: opening in 'a' mode creates the file.
    needs_header = (not os.path.exists(write_data_filename)
                    or os.path.getsize(write_data_filename) == 0)

    # 'a' is very important, it appends the new data to the big file.
    # The with-block closes the file even if parsing fails part-way.
    with open(write_data_filename, 'a') as output_file:
        if needs_header:
            # NOTE: the header has a space after each comma while the data
            # rows do not; kept as-is so existing consumers see the same names.
            fieldnames = ('char, level, race, charclass, zone, guild, timestamp\n')
            output_file.write(fieldnames)
        start_time = time.time()
        read_tree(output_file)

    spent_time = time.time() - start_time
    mins_spent = int(spent_time / 60)
    secs_remainder = int(spent_time % 60)
    # 13m:42s on iMac. Also 14m:39s another time.
    print('Time of process:  %d : %d' % (mins_spent, secs_remainder))
    # Reference figures from earlier runs (counters are not updated by this code):
    #   Files scanned (or tried): 138,084
    #   Max chars: they say 91,065 ">= 1" but it starts at 0; count says 91064 + 1 = 91,065.
    #   Max guilds: they say "an integer within [1, 513]" but they start at 0; 512 + 1 = 513.
# End of main


# Main call -- guarded so importing this module does not start a full parse.
if __name__ == '__main__':
    main()