-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwebscraping.py
116 lines (94 loc) · 3.56 KB
/
webscraping.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import praw
import re
import pandas as pd
# Reddit API client used for the scrape.
# NOTE(review): the client id and secret are hard-coded in source; consider
# rotating them and loading them from configuration instead.
_USER_AGENT = (
    'a script to summarize the workouts used by posters of progresspics v0.0.1 '
    '(by /u/wantthatbod for Insight Health Data Science)'
)
reddit = praw.Reddit(
    client_id='5OIBM9csRhVFWg',
    client_secret="jSgOYMFsmKLp5nn-fQH6OppX_E0",
    user_agent=_USER_AGENT,
)

# Handle on the subreddit whose submissions are scanned below.
progresspics = reddit.subreddit('progresspics')

# Empty result table; one row is added per submission that passes the filters.
_COLUMNS = [
    "id",
    "imageurl",
    "age",
    "height",
    "startweight",
    "finalweight",
    "text",
]
dat = pd.DataFrame(columns=_COLUMNS)
from collections import deque

# Title patterns, compiled once because every submission title is run
# through them.
_DEMOGRAPHICS_RE = re.compile(r'(\w+)\s*/\s*(\d*)\s*/(\d+)\'*\"*(\d+)\'*\"*')
_WEIGHT_RE = re.compile(r'\[(\d+)\s*>\s*(\d+).*\]')
_HEIGHT_CM_RE = re.compile(r'(\d+)\s*cm')
# No leading '.*' here: a greedy prefix swallows all but the last digit of the
# number (e.g. "12 months" would be read as 2).
_MONTHS_RE = re.compile(r'(\d+)\s*month')
_WEEKS_RE = re.compile(r'(\d+)\s*week')
# Free text that follows the stats, keyed by the detected time unit. The 's?'
# lets singular "1 month" / "1 week" titles match as well.
_TEXT_AFTER_RE = {
    'month': re.compile(r'.*/.*/.*\[.*\].*months?(.*)'),
    'week': re.compile(r'.*/.*/.*\[.*\].*weeks?(.*)'),
    None: re.compile(r'.*/.*/.*\[.*\](.*)'),
}
_LB_PER_KG = 2.20462262
_CM_PER_INCH = 2.54
_WEEKS_PER_MONTH = 4.333


def _parse_title(title):
    """Extract the poster's stats from a submission title.

    Returns a dict with the keys ``age``, ``height`` (inches),
    ``startweight`` and ``finalweight`` (converted from kg when the title
    mentions "kg", otherwise taken as written), ``months`` (float, or None
    when no duration is found) and ``text`` (title text after the stats).
    Returns None when the title does not have the expected
    "gender/age/height [start > final]" shape, is not from a female poster,
    or fails a sanity check.
    """
    demographics = _DEMOGRAPHICS_RE.search(title)
    weights = _WEIGHT_RE.search(title)
    if demographics is None or weights is None:
        return None

    gender, age_text, feet_text, inch_text = demographics.groups()
    if gender[0].upper() != 'F':
        return None

    # The age group may match an empty string; int('') would raise.
    if not age_text:
        return None
    age = int(age_text)
    # Was `age < 0 | age > 120`, which Python parses as the chained comparison
    # `age < (0 | age) > 120` and is therefore always False.
    if age < 0 or age > 120:
        return None

    feet = int(feet_text)
    inches = int(inch_text)
    # Zero inches is a valid remainder (e.g. 5'0").
    if 0 < feet <= 9 and 0 <= inches <= 12:
        height = feet * 12 + inches
    else:
        # Not a plausible feet/inches pair: fall back to a centimetre height.
        height_cm = _HEIGHT_CM_RE.search(title)
        if height_cm is None:
            return None
        height = float(height_cm.group(1)) / _CM_PER_INCH

    startweight = float(weights.group(1))
    finalweight = float(weights.group(2))
    if 'kg' in title:
        startweight *= _LB_PER_KG
        finalweight *= _LB_PER_KG
    if startweight <= 0.0 or finalweight <= 0.0:
        return None

    # Duration, normalised to months; months take precedence over weeks.
    months = None
    timeunit = None
    duration = _MONTHS_RE.search(title)
    if duration is not None:
        timeunit = 'month'
        months = float(duration.group(1))
    else:
        duration = _WEEKS_RE.search(title)
        if duration is not None:
            timeunit = 'week'
            months = float(duration.group(1)) / _WEEKS_PER_MONTH

    trailing = _TEXT_AFTER_RE[timeunit].search(title)
    text = trailing.group(1) if trailing is not None else ""

    return {
        "age": age,
        "height": height,
        "startweight": startweight,
        "finalweight": finalweight,
        "months": months,
        "text": text,
    }


def _author_comment_text(submission):
    """Concatenate the bodies of all comments written by the submission's author.

    Expands every "more comments" placeholder first, then walks the comment
    tree breadth-first starting from the top-level comments.
    """
    submission.comments.replace_more(limit=None)
    queue = deque(submission.comments)  # Seed with top-level
    bodies = []
    while queue:
        comment = queue.popleft()
        if comment.author == submission.author:
            bodies.append(comment.body)
        queue.extend(comment.replies)
    return "".join(bodies)


# Rows are collected in a list and turned into a frame once at the end:
# DataFrame.append copied the whole frame on every call and no longer exists
# in pandas 2.x.
rows = []
# real start 1325376000 (currently is only for 1 year)
# NOTE(review): `Subreddit.submissions` appears to have been removed in newer
# PRAW releases - confirm the installed PRAW version still provides it.
for submission in progresspics.submissions(1483185599, 1514764799):
    # check if there is a picture: URLs containing 'comments' are skipped
    url = submission.url
    if 'comments' in url:
        continue

    stats = _parse_title(submission.title)
    if stats is None:
        continue

    # `stats["months"]` is parsed but not stored: the output table has no
    # column for it.
    rows.append({
        "id": submission.id,
        "imageurl": url,
        "age": stats["age"],
        "height": stats["height"],
        "startweight": stats["startweight"],
        "finalweight": stats["finalweight"],
        "text": stats["text"] + _author_comment_text(submission),
    })

# Rebuild the (initially empty) result table with the collected rows, keeping
# its column order.
dat = pd.DataFrame(rows, columns=dat.columns)
import os

# Switch into the project directory and pickle the collected table there.
# NOTE(review): the output directory is a machine-specific absolute path.
_OUTPUT_DIR = '/mnt/c/Users/jhyan/OneDrive/Projects/GitHub/InsightProject/WantThatBod'
_OUTPUT_FILE = "./datafor2017.pkl"

os.chdir(_OUTPUT_DIR)
dat.to_pickle(_OUTPUT_FILE)