-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprocess.py
82 lines (72 loc) · 2.92 KB
/
process.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
from alphabet_detector import AlphabetDetector
import os
def fixLine(str):
    """Collapse a comment body to one line, wrapping quoted lines in markers.

    A line is treated as a quotation when it starts with ``&gt;`` (the
    HTML-escaped form of ``>``; presumably how the comment bodies arrive --
    verify against the data source) or with a literal ``>``.  Each run of
    consecutive quoted lines is emitted as ``<<You said "...">>`` with the
    leading marker removed from every line.

    Args:
        str: The raw comment body (may span several lines).

    Returns:
        The text with every run of whitespace collapsed to a single space,
        terminated by exactly one newline.
    """
    text = ""
    in_quote = False
    for line in str.splitlines(True):
        # Length of the quote marker at the start of the line (0 = not a quote).
        if line.startswith("&gt;"):
            marker_len = 4
        elif line.startswith(">"):
            marker_len = 1
        else:
            marker_len = 0

        if marker_len:
            if not in_quote:
                text += '<<You said "'
            text += line[marker_len:]
            in_quote = True
        else:
            if in_quote:
                # Drop the quoted line's trailing newline before closing.
                text = text.rstrip() + '">>'
            text += line
            in_quote = False

    # A quote that runs to the end of the body still has to be closed.
    if in_quote:
        text = text.rstrip() + '">>'

    return " ".join(text.split()) + "\n"
def findNext(text_file, author, notbody, parent, left):
    """Append the rest of a two-person reply chain to ``text_file``.

    Starting from ``parent``, repeatedly look in ``left`` for a comment that
    replies to the current comment, was written by the expected author, and
    whose formatted body differs from ``notbody``.  Each match is appended to
    ``text_file`` as ``"<author>: <body>"`` and becomes the new current
    comment; the expected author then switches to the author of the comment
    that was just answered, so the chain alternates between two people.

    Args:
        text_file: Path of the file to append to.
        author: Author the next reply must have.
        notbody: Formatted body the next reply must NOT equal.
        parent: Comment dict whose replies are searched for.
        left: Candidate comment dicts (may be ``None`` or empty).  The
            caller's list is not modified.

    Returns:
        The number of comments appended to ``text_file``.
    """
    # Work on a copy: list.remove() returns None, so passing its result on
    # (as an earlier version did) ended the chain after a single reply.
    remaining = list(left) if left else []
    appended = 0

    while remaining:
        match_index = None
        for index, comment in enumerate(remaining):
            if (
                # parent_id[3:] drops a 3-character prefix before the id.
                comment["parent_id"][3:] == parent["id"]
                and comment["author"] == author
                and fixLine(comment["body"]) != notbody
            ):
                match_index = index
                break
        if match_index is None:
            break

        comment = remaining.pop(match_index)
        with open(text_file, "a") as file:
            file.write(comment["author"] + ": " + fixLine(comment["body"]))
        appended += 1

        # Next round: look for the other participant replying to this comment.
        author, notbody, parent = parent["author"], fixLine(parent["body"]), comment

    return appended
def process(list, number, dir):
    """Write one text file per (top-level comment, direct reply) pair.

    ``list`` holds the post dict first, followed by its comment dicts.
    Nothing is written when the post title is not purely Latin-alphabet
    text, when there are fewer than two comments, or when no comment replies
    directly to a top-level comment.

    For every direct reply whose body is not ``"[deleted]"`` a file named
    ``reddit<NNNN>.txt`` (``NNNN`` = ``number`` plus the files written so
    far, zero-padded to four digits) is created in ``dir`` containing the
    title, the post URL, the top-level comment and the reply; ``findNext``
    then appends any continuation of the exchange.

    Args:
        list: ``[post, comment, comment, ...]`` as dicts.
        number: Index used for the first output file name.
        dir: Directory the files are written to.

    Returns:
        ``(count, data)`` where ``count`` is the number of files written and
        ``data`` has one row per file:
        ``[title without commas, top-level author, reply author,
        permalink URL, number of comments in the file]``.
    """
    detector = AlphabetDetector()
    post = list[0]
    comments = list[1:]

    if not detector.only_alphabet_chars(post["title"], "LATIN"):
        return (0, [])
    if len(comments) < 2:
        return (0, [])

    all_ids = [c["id"] for c in comments]

    # Top-level comments are those whose parent is not another comment.
    level1 = [c for c in comments if c["parent_id"][3:] not in all_ids]
    notlevel1 = [c for c in comments if c["parent_id"][3:] in all_ids]
    level1_ids = [c["id"] for c in level1]

    # Direct replies to a top-level comment versus everything deeper.
    level2 = [c for c in notlevel1 if c["parent_id"][3:] in level1_ids]
    notlevel2 = [c for c in notlevel1 if c["parent_id"][3:] not in level1_ids]

    if not level2:
        return (0, [])

    written = 0
    rows = []
    for reply in level2:
        top = next((p for p in level1 if reply["parent_id"][3:] == p["id"]), None)
        if top is None or reply["body"] == "[deleted]":
            continue

        filename = "reddit" + "{:0>4d}".format(number + written) + ".txt"
        path = os.path.join(dir, filename)
        top_body = fixLine(top["body"])

        print("Creating file: " + filename)
        with open(path, "w") as out:
            out.write(post["title"].replace('\n', ' ').replace('\r', ' ') + "\n")
            out.write(post["url"] + "\n")
            out.write(top["author"] + ": " + top_body)
            out.write(reply["author"] + ": " + fixLine(reply["body"]))

        # The two comments written above plus whatever findNext appends.
        total = findNext(path, top["author"], top_body, reply, notlevel2) + 2
        rows.append([
            post["title"].replace(",", ""),
            top["author"],
            reply["author"],
            "https://www.reddit.com" + post["permalink"],
            total,
        ])
        written += 1

    return (written, rows)