-
Notifications
You must be signed in to change notification settings - Fork 2
/
crawler.go
301 lines (259 loc) · 8.51 KB
/
crawler.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
package main
import (
"bytes"
"errors"
"fmt"
"io/ioutil"
"net/http"
"net/url"
"regexp"
"strconv"
"strings"
"time"
"github.com/OlegSchmidt/soup"
"github.com/jehiah/go-strftime"
)
// Selector vocabulary used by the crawler: HTML tag names, attribute names,
// attribute values, CSS class names, jsname values and the review endpoint.
const (
// common html tags
div = "div"
button = "button"
span = "span"
// NOTE: "class" is an attribute name, not a tag; in this file it is passed as
// the second argument of the soup Find/FindAll calls (e.g. FindAll(div, class, ...)).
class = "class"
a = "a"
meta = "meta"
// common html attributes
attributeClass = "class"
attributeProperty = "property"
attributeContent = "content"
attributeRole = "role"
attributeJsname = "jsname"
// attribute values that are matched literally
propertyValueOpengraphUrl = "og:url"
attributeValueImg = "img"
// css classes for selection
// NOTE(review): these look like generated class names of the Play Store page
// and will presumably need updating whenever the page markup changes - confirm.
classContentApp = "LXrl4c"
classMainContentBlock = "W4P4ne"
classReviewAreasContainer = "d15Mdf"
classReviewAuthor = "X43Kjb"
classReviewDate = "p2TkOb"
classReviewStarFilled = "vQHuPe"
classReviewTitle = "IEFhEe"
// jsname values for selection
jsnameReviewContentShort = "bN97Pc"
jsnameReviewContentFull = "fbQN7e"
// endpoint that Crawl posts its form data to
reviewsURL = "https://play.google.com/store/getreviews?authuser=0"
)
// retrieveDoc downloads the page behind url with soup.Get and parses it into
// a soup.Root.
//
// Before parsing, every "<br>" is turned into a newline and the "<b>" and
// "</b>" tags are removed. The second return value is http.StatusOK when the
// page could be downloaded and http.StatusBadRequest otherwise; in the
// failure case the error is printed to stdout and the returned root is the
// zero value.
func retrieveDoc(url string) (soup.Root, int) {
	page, err := soup.Get(url)
	if err != nil {
		var empty soup.Root
		fmt.Println("\tcould not reach", url, "because of the following error:")
		fmt.Println(err)
		return empty, http.StatusBadRequest
	}

	// pre-process html; the substitutions are applied one after another, in
	// exactly this order
	substitutions := [][2]string{
		{"<br>", "\n"},
		{"<b>", ""},
		{"</b>", ""},
	}
	for _, sub := range substitutions {
		page = strings.Replace(page, sub[0], sub[1], -1)
	}

	return soup.HTMLParse(page), http.StatusOK
}
// GetReviewContainer returns the node that holds the reviews of an app page.
//
// The lookup walks a fixed path through document: the last
// <div class=classContentApp>, the <div class=classMainContentBlock> found
// inside it, the second child of that block and finally the third child of
// that child. An error describing the first missing step is returned when
// the path cannot be followed; the returned root is the zero value then.
func GetReviewContainer(document soup.Root) (soup.Root, error) {
	var none soup.Root

	apps := document.FindAll(div, class, classContentApp)
	if len(apps) < 1 {
		return none, errors.New("couldn't find the main container of the app, looking for last <div class=\"" + classContentApp + "\"></div>")
	}

	lastApp := apps[len(apps)-1]
	mainBlock := lastApp.Find(div, class, classMainContentBlock)
	if mainBlock.Error != nil {
		return none, errors.New("couldn't find the main content blocks in the main container, looking for first <div class=\"" + classMainContentBlock + "\"></div>")
	}

	blocks := mainBlock.Children()
	if len(blocks) < 2 {
		return none, errors.New("main container block for reviews (2nd main content block) should contain at least 2 children")
	}

	reviewParts := blocks[1].Children()
	if len(reviewParts) < 3 {
		return none, errors.New("2nd child of main container block for reviews should contain at least 3 children")
	}

	return reviewParts[2], nil
}
// getReviewAreas returns the children of the <div class=classReviewAreasContainer>
// found in document. These children are the areas of a single review:
// headline (stars, name, date), the review itself and the reply from the
// developer.
//
// An error is returned when the container is missing or has fewer than two
// children; the returned slice is nil in that case.
func getReviewAreas(document soup.Root) ([]soup.Root, error) {
	holder := document.Find(div, class, classReviewAreasContainer)
	if holder.Error != nil {
		return nil, errors.New("mode \"html\" : couldn't find container for the 3 areas of the review, looking for <div class=\"" + classReviewAreasContainer + "\"></div>")
	}

	areas := holder.Children()
	if len(areas) < 2 {
		return nil, errors.New("mode \"html\" : <div class=\"" + classReviewAreasContainer + "\"></div> should contain at least 2 children")
	}

	return areas, nil
}
// CrawlHtml downloads the page at link and converts every child of its
// review container into an AppReview.
//
// An error is returned when the page cannot be retrieved, when the package
// name cannot be read from the page, or when the review container cannot be
// located. The returned slice is nil in all of these cases.
func CrawlHtml(link string) ([]AppReview, error) {
	appPage, httpStatus := retrieveDoc(link)
	if httpStatus != http.StatusOK {
		return nil, errors.New("given link couldn't be parsed, please check the online availability")
	}

	packageName, packageNameError := getHtmlReviewPackageName(appPage)
	if packageNameError != nil {
		// This failure used to be dropped, so callers received (nil, nil) and
		// could not tell a broken page from a page without reviews.
		return nil, packageNameError
	}

	reviewBlock, reviewBlockError := GetReviewContainer(appPage)
	if reviewBlockError != nil {
		return nil, reviewBlockError
	}

	var appReviews []AppReview
	for _, reviewElement := range reviewBlock.Children() {
		appReviews = append(appReviews, AppReview{}.fillBySoup(packageName, appPage, reviewElement))
	}
	return appReviews, nil
}
// Crawl requests reviews of the app identified by packageName from
// reviewsURL and returns the reviews parsed from the response.
//
// A limit greater than zero caps the number of returned reviews; any other
// value means "no cap".
//
// Crawl does not return an error. Request failures, the HTTP status codes
// 400, 403, 404, 408 and 429, and a captcha page are reported on stdout and
// end the crawl with whatever has been collected so far.
//
// NOTE(review): although the request carries a page number and the loop
// increments it, the loop is left unconditionally after the first page has
// been processed, so at most one page is ever fetched. That behaviour is
// kept as it was found; confirm whether pagination (together with the
// disabled sleep) is meant to be switched back on.
func Crawl(packageName string, limit int) []AppReview {
	var appReviews []AppReview

	page := 0
	for {
		page++

		// sleep for 6 seconds to not be blocked by Google
		//time.Sleep(6 * time.Second)

		contents, ok := fetchReviewPage(packageName, page)
		if !ok {
			return appReviews
		}

		// pre-process the html and extract data from the reviews in it
		doc := soup.HTMLParse(escapedBytesToString(contents))

		// check if the captcha came up
		captcha := doc.Find("body").GetAttribute("onload")
		if captcha == "e=document.getElementById('captcha');if(e){e.focus();}" {
			fmt.Printf("%s QUIT PROGRAMM: captcha needed\n", packageName)
			return appReviews
		}

		reviewsInPage := doc.FindAll(div, class, "single-review")
		for _, rDoc := range reviewsInPage {
			appReviews = append(appReviews, AppReview{
				PackageName: packageName,
				Body:        getReviewBody(rDoc),
				Date:        getReviewDate(rDoc),
				Author:      getReviewAuthor(rDoc),
				PermaLink:   getReviewPermaLink(rDoc),
				ReviewID:    getReviewID(rDoc),
				Rating:      getReviewRating(rDoc),
			})
			// Returning here ends the whole crawl, not just this page.
			if limit > 0 && len(appReviews) >= limit {
				return appReviews
			}
		}

		if len(reviewsInPage) == 0 { // no more reviews
			break
		}

		// NOTE(review): unconditional stop after the first page, preserved
		// from the original implementation (see the function comment).
		break
	}
	return appReviews
}

// fetchReviewPage posts the review request for one page and returns the raw
// response body. The boolean is false when the crawl has to stop: the
// request failed, the server answered with one of the terminal status codes
// or the body could not be read. The reason is printed to stdout. The
// response body is closed on every path.
func fetchReviewPage(packageName string, page int) ([]byte, bool) {
	formData := url.Values{}
	formData.Add("reviewType", "0")
	formData.Add("pageNum", strconv.Itoa(page))
	formData.Add("id", packageName)
	formData.Add("reviewSortOrder", "0")
	formData.Add("xhr", "1")
	formData.Add("hl", "en")

	resp, err := http.PostForm(reviewsURL, formData)
	if err != nil || resp == nil {
		// %v rather than %s so that a nil error is printed readably
		fmt.Printf("%s ERROR: %v\n", packageName, err)
		return nil, false
	}
	// The Close error is deliberately ignored: once the body has been read
	// completely a failing Close cannot invalidate the data, and there is
	// nothing useful to do about it on the error paths either.
	defer resp.Body.Close()

	// handle exit strategies
	switch resp.StatusCode {
	case http.StatusBadRequest,
		http.StatusForbidden,
		http.StatusNotFound,
		http.StatusRequestTimeout,
		http.StatusTooManyRequests:
		fmt.Printf("%s STATUS %d: no more reviews\n", packageName, resp.StatusCode)
		return nil, false
	}

	contents, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		fmt.Printf("%s ERROR: %s\n", packageName, err)
		return nil, false
	}
	return contents, true
}
// getReviewBody returns the NodeValue of the node that follows the
// <span class="review-title"> element inside doc.
func getReviewBody(doc soup.Root) string {
	title := doc.Find(span, class, "review-title")
	afterTitle := title.FindNextSibling()
	return afterTitle.NodeValue
}
// getReviewDate reads the text of the <span class="review-date"> element in
// doc, parses it with the layout "January 2, 2006" and returns the date as
// the number YYYYMMDD. It returns -1 when the text does not match the layout
// or the formatted date cannot be converted to a number.
func getReviewDate(doc soup.Root) int64 {
	rawDate := doc.Find(span, class, "review-date").Text()

	parsed, parseErr := time.Parse("January 2, 2006", rawDate)
	if parseErr != nil {
		return -1
	}

	compact, convErr := strconv.Atoi(strftime.Format("%Y%m%d", parsed))
	if convErr != nil {
		return -1
	}
	return int64(compact)
}
// getReviewAuthor returns the text of the <span class="author-name"> element
// in doc with surrounding whitespace removed.
func getReviewAuthor(doc soup.Root) string {
	name := doc.Find(span, class, "author-name").Text()
	return strings.TrimSpace(name)
}
// getReviewPermaLink returns the href of the <a class="reviews-permalink">
// element in doc, prefixed with the Play Store host.
func getReviewPermaLink(doc soup.Root) string {
	const host = "https://play.google.com"
	href := doc.Find(a, class, "reviews-permalink").GetAttribute("href")
	return host + href
}
// getReviewID returns the data-reviewid attribute of the
// <div class="review-header"> element in doc.
func getReviewID(doc soup.Root) string {
	header := doc.Find(div, class, "review-header")
	return header.GetAttribute("data-reviewid")
}
// reviewRatingNonDigits matches every run of characters that are not ASCII
// digits. It is compiled once at package level because getReviewRating is
// called for every single review.
var reviewRatingNonDigits = regexp.MustCompile("[^0-9]+")

// getReviewRating derives the star rating of a review from the style
// attribute of its <div class="current-rating"> element.
//
// All non-digit characters are stripped from the attribute and the remaining
// number is mapped to a rating: 20 -> 1, 40 -> 2, 60 -> 3, 80 -> 4, 100 -> 5.
// It returns -1 for every other number, and also when no number is left, in
// which case the conversion error is printed to stdout.
func getReviewRating(doc soup.Root) int {
	ratingRaw := doc.Find(div, class, "current-rating").GetAttribute("style")

	percent, err := strconv.Atoi(reviewRatingNonDigits.ReplaceAllString(ratingRaw, ""))
	if err != nil {
		fmt.Println(err)
		return -1
	}

	switch percent {
	case 20:
		return 1
	case 40:
		return 2
	case 60:
		return 3
	case 80:
		return 4
	case 100:
		return 5
	}
	return -1
}
// escapedBytesToString undoes the escaping found in the review response and
// returns the result as a string. The escape sequences \u003c, \u003e,
// \u0026 and \u003d are replaced by <, >, & and =, and a backslash followed
// by a double quote is replaced by a plain double quote. The replacements
// run one after another in that order; b itself is not modified.
func escapedBytesToString(b []byte) string {
	substitutions := []struct {
		escaped string
		plain   string
	}{
		{"\\u003c", "<"},
		{"\\u003e", ">"},
		{"\\u0026", "&"},
		{"\\u003d", "="},
		{"\\\"", "\""},
	}

	unescaped := b
	for _, sub := range substitutions {
		unescaped = bytes.Replace(unescaped, []byte(sub.escaped), []byte(sub.plain), -1)
	}
	return string(unescaped)
}