Collecting News by Search Query
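This example collects Naver news articles for a list of search queries. For each query and each date in the period, the script pages through the Naver news search results, downloads the linked articles in parallel with curl, parses them with N2H4, and saves one CSV file per result page under ./data/.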
# install.packages("selectr")
library(curl)
library(rvest)
# install.packages("devtools")
# devtools::install_github("forkonlp/N2H4")
library(N2H4)
options(stringsAsFactors = F)
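# Callbacks for curl's multi interface: successful responses are converted to
# text and appended to `data`; failed requests are only logged.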
success <- function(res){
  cat("Request done! Status:", res$status, "\n")
  #res$content<-iconv(rawToChar(res$content),from="CP949",to="UTF-8")
  res$content<-rawToChar(res$content)
  data <<- c(data, list(res))
}
failure <- function(msg){
  cat("Oh noes! Request failed!", msg, "\n")
}
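# Collection period and timestamps used for progress logging.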
strDate<-as.Date("2017-03-26")
endDate<-as.Date("2017-06-15")
strTime<-Sys.time()
midTime<-Sys.time()
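# Search queries to collect; a separate output directory is created for each one.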
qlist<-c("경기도교육청","청문회","김상조","김이수")
for (i in 1:length(qlist)){
  dir.create("./data",showWarnings=F)
  dir.create(paste0("./data/news_",qlist[i]),showWarnings=F)
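  # Loop over every date in the range; strDate:endDate yields day numbers,
  # which are converted back to Date strings below.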
  for (date in strDate:endDate){
    date<-as.character(as.Date(date,origin = "1970-01-01"))
    dateo<-gsub("-",".",date)
    dated<-gsub("-","",date)
    print(paste0(date," / ",qlist[i], "/ start Time: ", strTime," / spent Time: ", Sys.time()-midTime," / spent Time at first: ", Sys.time()-strTime))
    midTime<-Sys.time()
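    # Build the search URL for this query and date (ds/de take dotted dates,
    # the nso range takes yyyymmdd), then fetch the number of result pages,
    # retrying up to five times on transient errors.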
    pageUrli<-paste0("https://search.naver.com/search.naver?where=news&query=",qlist[i],"&ie=utf8&sm=tab_srt&sort=1&photo=0&field=0&reporter_article=&pd=3&ds=",dateo,"&de=",dateo,"&docid=&nso=so%3Ar%2Cp%3Afrom",dated,"to",dated,"%2Ca%3Aall&mynews=0&mson=0&refresh_start=0&related=0")
    trym<-0
    max<-try(getMaxPageNum(pageUrli, search=T), silent = T)
    while(trym<=5&&class(max)=="try-error"){
      max<-try(getMaxPageNum(pageUrli, search=T), silent = T)
      Sys.sleep(abs(rnorm(1)))
      trym<-trym+1
      print(paste0("try again max num: ",pageUrli))
    }
    if(class(max)=="try-error"||max=="no result"){
      print("no naver news links this time")
      next
    }
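    # Walk through each result page; Naver paginates ten links per page via the start parameter.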
    for (pageNum in 1:max){
      start<-(pageNum-1)*10+1
      print(paste0(date," / ",qlist[i], "/ start Time: ", strTime," / spent Time: ", Sys.time()-midTime," / spent Time at first: ", Sys.time()-strTime))
      midTime<-Sys.time()
      pageUrl<-paste0(pageUrli,"&start=",start)
      tryp<-0
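      # Collect the Naver news links on this result page, retrying up to five times on failure.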
      newsList<-try(getUrlListByQuery(pageUrl), silent = T)
      while(tryp<=5&&class(newsList)=="try-error"){
        newsList<-try(getUrlListByQuery(pageUrl), silent = T)
        Sys.sleep(abs(rnorm(1)))
        tryp<-tryp+1
        print(paste0("try again news list: ",pageUrl))
      }
      if(class(newsList)=="try-error"||newsList$news_links[1]=="no naver news"){
        print("no naver news links this time")
        next
      }
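      # Download all article pages in parallel with curl's multi API; the success
      # callback appends each response to `data`. If nothing arrived, retry the batch once.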
      pool <- new_pool()
      data <- list()
      sapply(newsList$news_links, function(x) curl_fetch_multi(x,success,failure))
      res <- multi_run()
      if( identical(data, list()) ){
        pool <- new_pool()
        data <- list()
        sapply(newsList$news_links, function(x) curl_fetch_multi(x,success,failure))
        res <- multi_run()
      }
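      # Close connections, then keep only responses served from news.naver.com;
      # results pointing at external press sites are dropped.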
      closeAllConnections()
      loc<-sapply(data, function(x) grepl("^http://news.naver",x$url))
      cont<-sapply(data, function(x) x$content)
      cont<-cont[loc]
      if(identical(cont,character(0))){
        print("no naver news links this time")
        next
      }
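      # Extract title, body, press name, and publish/edit timestamps from each article
      # with N2H4's getContent* helpers.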
      titles<-unlist(lapply(cont,function(x) getContentTitle(read_html(x))))
      bodies<-unlist(lapply(cont,function(x) getContentBody(read_html(x))))
      presses<-unlist(lapply(cont,function(x) getContentPress(read_html(x))))
      datetime<-lapply(cont,function(x) getContentDatetime(read_html(x))[1])
      datetime<-sapply(datetime, function(x) (as.character(x)[1]))
      edittime<-lapply(cont,function(x) getContentDatetime(read_html(x))[2])
      edittime<-sapply(edittime, function(x) (as.character(x)[1]))
      urls<-sapply(data, function(x) x$url)
      urls<-urls[loc]
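      # Combine the fields and write one UTF-8 CSV per query, date, and result page.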
      datC<-data.frame(titles,urls,presses,datetime,edittime,bodies)
      write.csv(datC, file=paste0("./data/news_",qlist[i],"/news_",date,"_",pageNum,".csv"),row.names = F, fileEncoding="UTF-8")
    }
  }
}