-
-
Notifications
You must be signed in to change notification settings - Fork 75
사용 예시 with setUrls
# Load the crawler package and httr (setUrls below uses parse_url/build_url).
library(N2H4)
library(httr)

# Pick the first main category and the first two of its sub-categories.
cate <- getMainCategory()
tcate <- cate$sid1[1]
subCate <- cbind(sid1 = tcate, getSubCategory(sid1 = tcate))
tscate <- subCate$sid2[1:2]

# Crawl window, both ends inclusive.
# NOTE(review): the values look like "YYYYMMDD" strings -- confirm.
strDate <- "20160101"
endDate <- "20160101"

# strTime marks the start of the whole run; midTime is reset by the loops
# further down to time each individual step.
strTime <- Sys.time()
midTime <- Sys.time()
#' Build the list-page URLs for every category / date / page combination
#'
#' Expands all combinations of the given main categories, sub-categories,
#' calendar days and page numbers, and attaches to each combination the
#' matching `http://news.naver.com/main/list.nhn` URL.
#'
#' @param sid1_vec Vector of main-category ids (`sid1` query parameter).
#' @param sid2_vec Vector of sub-category ids (`sid2` query parameter).
#' @param strDate First day, as a "YYYYMMDD" string or number.
#' @param endDate Last day (inclusive), as a "YYYYMMDD" string or number.
#' @param page_vec Page numbers to generate. The default `NA` produces URLs
#'   without a `page` query parameter.
#' @return An unnamed list with one element per combination. Each element is
#'   a list with `sid1`, `sid2`, `date` (a "YYYYMMDD" string), `pageNum` and
#'   `pageUrl`.
setUrls <- function(sid1_vec, sid2_vec, strDate, endDate, page_vec = NA) {
  first_day <- as.Date(as.character(strDate), format = "%Y%m%d")
  last_day <- as.Date(as.character(endDate), format = "%Y%m%d")
  if (is.na(first_day) || is.na(last_day)) {
    stop("strDate and endDate must be valid dates in YYYYMMDD form",
         call. = FALSE)
  }

  # Step through real calendar days. A plain integer sequence such as
  # 20160130:20160202 would contain non-existent dates (20160132, ...).
  # A reversed range still yields a descending sequence, as `:` did.
  step <- if (last_day >= first_day) "day" else "-1 day"
  dates <- format(seq(first_day, last_day, by = step), "%Y%m%d")

  grid <- expand.grid(
    sid1 = sid1_vec,
    sid2 = sid2_vec,
    date = dates,
    pageNum = page_vec,
    stringsAsFactors = FALSE
  )

  # Build each record straight from the data frame columns. Going through
  # apply() would coerce the frame to a character matrix via format(), which
  # pads numbers to a common width (" 1", ..., "10") and corrupts the URL.
  lapply(seq_len(nrow(grid)), function(i) {
    x <- list(
      sid1 = grid$sid1[[i]],
      sid2 = grid$sid2[[i]],
      date = grid$date[[i]],
      pageNum = grid$pageNum[[i]]
    )

    query <- list(
      sid1 = x$sid1, sid2 = x$sid2, mid = "shm", mode = "LS2D", date = x$date
    )
    # Only add the page parameter when a page number was requested.
    if (!is.na(x$pageNum)) {
      query$page <- x$pageNum
    }

    pageUrl <- parse_url("http://news.naver.com/main/list.nhn")
    pageUrl$query <- query
    x$pageUrl <- build_url(pageUrl)
    x
  })
}
# One URL per (main category, sub-category, date), without a page number.
urls <- setUrls(tcate, tscate, strDate, endDate)

# For each of those, ask how many list pages exist and build one URL per page.
# Results are collected per input URL and flattened once at the end instead of
# growing the list with c() on every iteration.
sub_urls <- vector("list", length(urls))
for (i in seq_along(urls)) {
  url <- urls[[i]]
  print(paste0(url$date," / ", url$sid1," / ", url$sid2 ," / start Time: ", strTime," / spent Time: ", Sys.time()-midTime," / spent Time at first: ", Sys.time()-strTime))
  midTime <- Sys.time()

  # Named max_page so that base::max is not shadowed.
  max_page <- getMaxPageNum(url$pageUrl)

  # seq_len() yields no pages when max_page is 0; 1:0 would yield c(1, 0).
  sub_urls[[i]] <- setUrls(
    url$sid1, url$sid2, url$date, url$date, seq_len(max_page)
  )
}
# Flatten one level: list of per-URL lists -> flat list of URL records.
sub_urls <- unlist(sub_urls, recursive = FALSE)
# Output directory for the per-page CSV files; created once up front.
dir.create("./data", showWarnings = FALSE)

# Upper bound on download attempts per article, so that a permanently broken
# link cannot stall the whole crawl.
max_tries <- 5L

# Download every article linked from every list page and write one CSV per
# list page.
for (url in sub_urls) {
  print(paste0(url$date," / ",url$sid1," / ",url$sid2," / ",url$pageNum, " / start Time: ", strTime," / spent Time: ", Sys.time()-midTime," / spent Time at first: ", Sys.time()-strTime))
  midTime <- Sys.time()

  newsList <- getUrlListByCategory(url$pageUrl)

  newsData <- lapply(newsList$links, function(x) {
    # Retry the download until it succeeds or max_tries attempts were made.
    tem <- try(getContent(x), silent = TRUE)
    tries <- 1L
    # inherits() stays a single TRUE/FALSE even when the result carries
    # several classes (e.g. a tibble); class(tem) == "try-error" does not.
    while (inherits(tem, "try-error") && tries < max_tries) {
      print(paste0("try again: ",x))
      tem <- try(getContent(x), silent = TRUE)
      tries <- tries + 1L
    }
    if (inherits(tem, "try-error")) {
      warning("giving up on ", x, " after ", max_tries, " attempts",
              call. = FALSE)
      return(NULL)
    }

    # Keep only results whose datetime field is a POSIXct value; anything
    # else becomes NULL, which bind_rows() drops.
    if (inherits(tem$datetime, "POSIXct")) tem else NULL
  })
  newsData <- dplyr::bind_rows(newsData)

  # NOTE(review): the empty "" separators look like "_" separators lost in
  # transcription; the path is kept exactly as found -- confirm.
  write.csv(newsData, file=paste0("./data/news",url$sid1,"",url$sid2,"",url$date,"_",url$pageNum,".csv"), row.names = FALSE)
}