-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraper.R
98 lines (84 loc) · 3.42 KB
/
scraper.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
library(XML)
library(plyr)
library(rvest)
#Setting an empty data frame
output <- {}
#see how many iterations are needed
url <- "http://www.playdb.co.kr/artistdb/list_iframe.asp?Page=1&code=013003&NameSort=Y&Country=Y"
web_data <- readLines(url, warn=FALSE)
web_data <- paste(web_data, collapse = ' ')
pages <- as.numeric(substr(web_data, regexpr("total 1/", web_data)[[1]]+8, regexpr("]</span>", web_data)[[1]]-1))
#main scraper
for (page in 1:pages)
{
url <- paste0("http://www.playdb.co.kr/artistdb/list_iframe.asp?Page=", page, "&code=013003&NameSort=Y&Country=Y")
web_data <- readLines(url, warn=FALSE)
web_data <- paste(web_data, collapse = ' ')
id_starting_indices <- as.vector(gregexpr("./detail.asp?", web_data)[[1]])+19 #even only
id_ending_indices <- as.vector(gregexpr("&part=013003", web_data)[[1]])-1 #even only
name_starting_indices <- as.vector(gregexpr("top\">", web_data)[[1]])+5 #even only
name_ending_indices <- name_starting_indices+10 #even only
for (i in 1: (length(id_starting_indices)/2))
{
#get id
id <- substr(web_data, id_starting_indices[i*2], id_ending_indices[i*2])
#get name
name <- substr(web_data, name_starting_indices[i*2], name_ending_indices[i*2])
name <- substr(name, 1, regexpr("<", name)-1)
#get works
url <- paste0("http://www.playdb.co.kr/artistdb/Detail_Content_new.asp?TabKind=3&ManNo=", id)
history <- iconv(paste0(readLines(url, encoding="CP949"), collapse="\n"), to="UTF-8")
musicals <- xpathSApply(htmlParse(history, isHTML=T, asText=T, encoding="UTF-8"), "//td[@class='ptitle']", xmlValue)
dates <- xpathSApply(htmlParse(history, isHTML=T, asText=T, encoding="UTF-8"), "//td[@class='time']", xmlValue)
#format data
block <- cbind(id, name, musicals, dates)
#append
output <- rbind.fill(as.data.frame(output), as.data.frame(block))
}
}
#scraping playNo (musical id)
codes <- {}
ids <- unique(output$id)
for (persons in 1:length(ids))
{
#get workcodes
scores <- {}
url <- paste0("http://www.playdb.co.kr/artistdb/Detail_Content_new.asp?TabKind=3&ManNo=", ids[persons])
web_data <- readLines(url, warn=FALSE)
web_data <- paste(web_data, collapse = ' ')
code_starting_indices <- as.vector(gregexpr("dbDetail.asp?", web_data)[[1]])+24 #even only
code_ending_indices <- as.vector(gregexpr("target=\"_parent\"><", web_data)[[1]])-3
for (k in 1: (length(code_starting_indices)/2))
{
code <- substr(web_data, code_starting_indices[k*2-1], code_ending_indices[k])
codes <- append(codes, code)
}
}
codes[which(codes=="")] <- NA
#scraping scores (out of 10) by audience
scores <- {}
for (cd in codes)
{
#get scores
url <- paste0("http://www.playdb.co.kr/playdb/playdbDetail.asp?sReqPlayno=", cd)
url_html <- read_html(url, encoding="CP949")
score_node <- html_node(url_html, "dd.score")
score <- as.character(html_text(score_node))
scores <- append(scores, as.character(score))
}
#replace empty cells with NA
scores[which(scores=="")] <- NA
#combine and omit all rows with NA
output_final <- cbind(output, codes, scores)
output_final <- na.omit(output_final)
#strsplit dates column into two (start & end)
time_i <- {}
time_f <- {}
for (date in output_final$dates)
{
time_i <- append(time_i, strsplit(date, " ~ ")[[1]][1])
time_f <- append(time_f, strsplit(date, " ~ ")[[1]][2])
}
output_final <- cbind(output_final, time_i, time_f)
#save the output
write.csv(output_final, "data.csv")