-
Notifications
You must be signed in to change notification settings - Fork 0
/
trim_data.py
63 lines (52 loc) · 1.71 KB
/
trim_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import csv, json
from html.parser import HTMLParser
from const import TAGS_OF_INTEREST
class MyHTMLParser(HTMLParser):
    """Collect the text of an HTML fragment, masking ``<code>`` elements.

    Text outside ``<code>`` elements is kept in document order.  Every
    ``</code>`` that closes an open ``<code>`` contributes the placeholder
    ``'\\n<code>\\n'`` in place of the element's content.

    Typical use::

        parser = MyHTMLParser()
        parser.feed(html)
        text = parser.get_clean_text()
    """

    def __init__(self):
        super().__init__()
        self.text_list = []    # finished output pieces, in document order
        self.tag_stack = []    # one entry per currently open <code> element
        self.data_buffer = []  # text received since the last flush

    def _flush_buffer(self):
        """Move any buffered text onto ``text_list`` and empty the buffer."""
        if self.data_buffer:
            self.text_list.append(''.join(self.data_buffer))
            self.data_buffer = []

    def handle_starttag(self, tag, attrs):
        """Track an opening ``<code>`` tag; every other start tag is ignored."""
        if tag == 'code':
            # Emit the text that precedes the code element now, so it stays
            # ahead of the placeholder instead of being dropped.
            self._flush_buffer()
            self.tag_stack.append(tag)

    def handle_endtag(self, tag):
        """Emit the code placeholder for ``</code>``, buffered text otherwise."""
        if tag != 'code':
            self._flush_buffer()
            return
        if not self.tag_stack:
            # Unbalanced </code> with no matching <code>: nothing was
            # masked, so skip it rather than popping an empty stack.
            return
        self.tag_stack.pop()
        self.text_list.append('\n<code>\n')

    def handle_data(self, data):
        """Buffer text, skipping anything inside an open ``<code>`` element."""
        if not self.tag_stack:
            self.data_buffer.append(data)

    def get_clean_text(self):
        """Return the collected text and reset the collector for reuse.

        Text that was not followed by an end tag is included as well.
        """
        self._flush_buffer()
        data = ''.join(self.text_list)
        self.text_list = []
        self.tag_stack = []
        self.data_buffer = []
        return data
# Build 'tagged_questions_of_interest.json': a mapping from question id to
# its title, its body text (HTML cleaned by MyHTMLParser) and its sorted
# list of tags, restricted to questions with a tag in TAGS_OF_INTEREST.
id_to_tags = {}
id_to_question = {}

# Pass 1: collect, per question id, the tags we care about.
# newline='' is what the csv module asks for so that line breaks inside
# quoted fields reach the reader untouched.
with open('Tags.csv', encoding='utf-8', newline='') as tags_file:
    tag_rows = csv.reader(tags_file)
    next(tag_rows)  # skip the header row
    for question_id, tag in tag_rows:
        if tag in TAGS_OF_INTEREST:
            id_to_tags.setdefault(question_id, []).append(tag)

# Sort each tag list in place so the output order is deterministic.
for tags in id_to_tags.values():
    tags.sort()

# Pass 2: keep only the questions that received at least one tag above.
with open('Questions.csv', encoding='Latin-1', newline='') as questions_file:
    question_rows = csv.reader(questions_file)
    next(question_rows)  # skip the header row
    # Each row is unpacked as Id, OwnerUserId, CreationDate, Score, Title,
    # Body; only Id, Title and Body are used.
    for question_id, _owner, _created, _score, title, html_body in question_rows:
        if question_id not in id_to_tags:
            continue
        parser = MyHTMLParser()
        parser.feed(html_body)
        id_to_question[question_id] = {
            'title': title,
            'body': parser.get_clean_text(),
            'tags': id_to_tags[question_id],
        }

with open('tagged_questions_of_interest.json', 'w', encoding='utf-8') as outfile:
    json.dump(id_to_question, outfile)