-
Notifications
You must be signed in to change notification settings - Fork 0
/
myFilmProfile.r
114 lines (97 loc) · 3.07 KB
/
myFilmProfile.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
#########################################
# What do you know about my film habits #
#########################################
library(RCurl)
library(XML)
# Build the paginated listing URLs for a user's Douban film collection.
# "xxxx" in `url1` looks like a placeholder for the user id -- replace it
# before running (TODO confirm).
# The `start` offset advances in steps of 15 (presumably the page size),
# giving 0, 15, ..., 135, i.e. 10 listing pages.
url1 <- "http://movie.douban.com/people/xxxx/collect?start="
url2 <- "&sort=time&rating=all&filter=all&mode=grid"
# paste0() is vectorised over the offsets, so no loop or incremental c() is
# needed to assemble the character vector of page URLs.
url <- paste0(url1, seq(from = 0, to = 145, by = 15), url2)
# Per-film accumulators filled by the scraping loop; entry i of every
# accumulator is meant to describe the same film. All start out empty
# (NULL, which is exactly what a bare c() evaluates to).
attribute.name         <- NULL  # film title
attribute.release_date <- NULL  # initial release date text
attribute.run_time     <- NULL  # runtime text
attribute.genre        <- NULL  # becomes a list: one genre vector per film
attribute.num_raters   <- NULL  # number of votes, as text
attribute.rating_dist  <- NULL  # becomes a list: rating shares per film
attribute.imdb_link    <- NULL  # link containing "imdb" from the info box
attribute.ctry         <- NULL  # text between "地区:" and "语言" in the info box
# Scrape every film linked from the listing pages in `url` and append one
# entry per film to each attribute.* accumulator, so that entry i of every
# accumulator describes the same film.
#
# Differences from a naive page-by-page append:
#   * A page that cannot be parsed (listing or film) produces a warning and
#     is skipped instead of aborting the run.
#   * Every scalar field, including the name, is reduced to its first match
#     or NA, which keeps all accumulators the same length.
#   * The top-level `url` vector is no longer overwritten while scraping.

# Parse the HTML at `address` into an internal document.
# Returns NULL (after a warning) when parsing raises an error.
.parse_page <- function(address) {
  tryCatch(
    htmlTreeParse(address, useInternalNodes = TRUE),
    error = function(e) {
      warning(
        "Failed to parse ", address, ": ", conditionMessage(e),
        call. = FALSE
      )
      NULL
    }
  )
}

# Text of every node matching `path`, always as a character vector.
# xpathSApply() yields an empty list when nothing matches; unlist() plus
# as.character() normalises that case to character(0).
.xpath_text <- function(doc, path) {
  as.character(unlist(xpathSApply(doc, path, xmlValue)))
}

# `href` attribute of every node matching `path`, as a character vector.
# Nodes without an href contribute nothing.
.xpath_href <- function(doc, path) {
  as.character(unlist(xpathSApply(doc, path, xmlGetAttr, "href")))
}

# First element of `values`, or NA when `values` is empty.
.first_or_na <- function(values) {
  if (length(values) == 0) NA_character_ else values[[1]]
}

# Scrape a single film page.
# Returns a named list with one entry per attribute, or NULL when the page
# could not be parsed.
.scrape_film <- function(film_url) {
  film <- .parse_page(film_url)
  if (is.null(film)) {
    return(NULL)
  }

  # Country/region: the text between "地区:" and "语言" in the info box,
  # with both labels stripped. Surrounding whitespace is kept as scraped.
  info_text <- .xpath_text(film, "//div[@id='info']")
  ctry <- gsub(
    "地区: |语言",
    "",
    regmatches(info_text, regexpr("地区:.+?语言", info_text))
  )

  # Rating distribution: percentage strings such as "12.3%" converted to
  # fractions; a single NA when the page shows no distribution.
  rating_pct <- .xpath_text(
    film,
    "//div[@class='rating_wrap clearbox']/span[@class='rating_per']"
  )
  rating_dist <- if (length(rating_pct) == 0) {
    NA
  } else {
    as.numeric(sub("%", "", rating_pct)) / 100
  }

  info_links <- .xpath_href(film, "//div[@id='info']/a[@target='_blank']")

  list(
    name = .first_or_na(
      .xpath_text(film, "//div[@class='info-area']/strong")
    ),
    ctry = .first_or_na(ctry),
    release_date = .first_or_na(
      .xpath_text(
        film,
        "//div[@id='info']/span[@property='v:initialReleaseDate']"
      )
    ),
    run_time = .first_or_na(
      .xpath_text(film, "//div[@id='info']/span[@property='v:runtime']")
    ),
    # All genres are kept; a film without genres yields character(0).
    genre = .xpath_text(film, "//div[@id='info']/span[@property='v:genre']"),
    num_raters = .first_or_na(
      .xpath_text(
        film,
        "//div[@class='rating_sum']/a/span[@property='v:votes']"
      )
    ),
    rating_dist = rating_dist,
    imdb_link = .first_or_na(info_links[grepl("imdb", info_links)])
  )
}

# Walk the listing pages and collect one record per successfully parsed film.
.film_records <- list()
for (.page_url in url) {
  .listing <- .parse_page(.page_url)
  if (is.null(.listing)) {
    next
  }
  .hrefs <- .xpath_href(.listing, "//div/div/div/a")
  # Only links whose address contains "subject" are followed as film pages.
  .subjects <- .hrefs[grepl("subject", .hrefs)]
  .film_records <- c(.film_records, lapply(.subjects, .scrape_film))
}
# Drop films whose page could not be parsed.
.film_records <- Filter(Negate(is.null), .film_records)

# Extract one scalar character field from every record.
.field_chr <- function(field) {
  vapply(
    .film_records,
    function(rec) rec[[field]],
    character(1),
    USE.NAMES = FALSE
  )
}

# Append to the accumulators once, instead of growing them per film.
attribute.name <- c(attribute.name, .field_chr("name"))
attribute.ctry <- c(attribute.ctry, .field_chr("ctry"))
attribute.release_date <- c(
  attribute.release_date,
  .field_chr("release_date")
)
attribute.run_time <- c(attribute.run_time, .field_chr("run_time"))
attribute.num_raters <- c(attribute.num_raters, .field_chr("num_raters"))
attribute.imdb_link <- c(attribute.imdb_link, .field_chr("imdb_link"))
# Genre and rating distribution hold a vector per film, so they stay lists.
attribute.genre <- c(
  attribute.genre,
  lapply(.film_records, `[[`, "genre")
)
attribute.rating_dist <- c(
  attribute.rating_dist,
  lapply(.film_records, `[[`, "rating_dist")
)