-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathREAD_tweets.R
83 lines (66 loc) · 2.02 KB
/
READ_tweets.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
# read in tweets files
#work with dates
library(lubridate)
#graphing
library(ggplot2)
# DF
library(dplyr)
# text
library(stringr)
# sentiment
library(tidytext)
#library(readr)
#
library(scales)
#reorg df
library(reshape2)
library(data.table)
# Build an inventory of the raw tweet exports, newest first ----
# Directory that holds the downloaded tweet files.
tweets_dir <- "E://R/Twitter/Tweets"
# Kept for backward compatibility with interactive use; the lookups in this
# block use full paths and do not depend on the working directory.
setwd(tweets_dir)
# List the directory once so file names and timestamps cannot drift apart.
ls_fl <- list.files(tweets_dir)
# Last-modified date of every file as "YYYY-MM-DD" (local time zone).
# Note this is mtime, i.e. the modification time, not the creation time.
ls_fl_dt <- format(file.info(file.path(tweets_dir, ls_fl))$mtime, "%Y-%m-%d")
# Plain character columns: column 1 is the file name, column 2 its date.
# (cbind() on a list would yield list-columns instead.)
ls_df <- data.frame(
  ls_fl = ls_fl,
  ls_fl_dt = ls_fl_dt,
  stringsAsFactors = FALSE
)
# Parse the date and order the inventory from newest to oldest.
ls_df <- ls_df %>%
  mutate(dt = ymd(ls_fl_dt)) %>%
  arrange(desc(dt))
# Read every export and keep the first occurrence of each tweet ----
tweets_dir <- "E://R/Twitter/Tweets"

# Read one export file into a data frame with character (not factor) columns.
# NOTE(review): read.csv2() defaults to dec = ","; together with sep = ","
# that may leave "."-decimal columns as text -- confirm this is intended.
read_tweet_file <- function(path) {
  read.csv2(path, sep = ",", stringsAsFactors = FALSE, header = TRUE)
}

# File names in the order given by ls_df (column 1). unlist() also copes with
# a list-column of names.
tweet_files <- as.character(unlist(ls_df[[1]]))
if (length(tweet_files) == 0) {
  stop("No tweet files found in ", tweets_dir, call. = FALSE)
}

# Collect the new rows of each file and bind once at the end instead of
# growing res_df with rbind() inside the loop.
kept_rows <- vector("list", length(tweet_files))
# Keys (id + text) of every row accepted so far.
seen_keys <- character(0)
res_df <- NULL
for (i in seq_along(tweet_files)) {
  step_y <- read_tweet_file(file.path(tweets_dir, tweet_files[i]))
  # The first file read supplies the empty template used when no file
  # contributes any row.
  if (is.null(res_df)) {
    res_df <- step_y[0, , drop = FALSE]
  }
  # Skip files without data rows.
  if (nrow(step_y) > 0) {
    # A separator that cannot plausibly occur in the data keeps distinct
    # id/text pairs from collapsing into the same key.
    keys <- paste(step_y$id, step_y$text, sep = "\037")
    # Rows are only compared against earlier files, so the first file that
    # contains a tweet wins.
    is_new <- !(keys %in% seen_keys)
    kept_rows[[i]] <- step_y[is_new, , drop = FALSE]
    seen_keys <- c(seen_keys, keys[is_new])
  }
}
# Drop the slots of empty files, then bind everything in one call.
kept_rows <- Filter(Negate(is.null), kept_rows)
if (length(kept_rows) > 0) {
  res_df <- do.call(rbind, kept_rows)
}
# Unique tweets: start from the de-duplicated rows ----
unq_text_df <- res_df
# Text flag: the first 15 characters of each tweet.
unq_text_df$te <- substr(unq_text_df$text, 1L, 15L)
# Keep only tweets with content: the flag must be present and must not be
# the literal string "FALSE".
has_content <- unq_text_df$te != "FALSE"
unq_text_df <- unq_text_df[has_content & !is.na(has_content), , drop = FALSE]
# Write the result out for the R Markdown report.
write.table(
  unq_text_df,
  file = "E://R/Twitter/MKDWN/unq_text_df.txt",
  sep = ",",
  row.names = FALSE
)
############
# retweet count
# Raw distribution of the column, including any non-numeric junk values.
table(unq_text_df$retweetCount)
# Work on a copy so unq_text_df keeps its raw columns.
rt_cnt <- unq_text_df
# The column can contain stray text (this report used to filter out the value
# "abbruzzd"), so it may be character. Comparing character values with
# `>= 2` is a string comparison ("10" < "2"), so convert to numeric first.
# Non-numeric entries become NA; that coercion warning is expected.
rt_cnt$retweetCount <- suppressWarnings(as.numeric(rt_cnt$retweetCount))
# Tweets retweeted at least twice, most retweeted first.
rt_cnt %>%
  filter(!is.na(retweetCount), retweetCount >= 2) %>%
  select(text, retweetCount) %>%
  arrange(desc(retweetCount))
# favourite count
# Drop rows whose favoriteCount is missing or the literal string "FALSE".
is_counted <- unq_text_df$favoriteCount != "FALSE"
fav_cnt <- unq_text_df[is_counted & !is.na(is_counted), , drop = FALSE]
# Convert the remaining counts to numbers.
fav_cnt$favoriteCount <- as.numeric(fav_cnt$favoriteCount)
# Distribution of the numeric counts.
table(fav_cnt$favoriteCount)
# Tweets favourited at least seven times, most favourited first.
is_popular <- fav_cnt$favoriteCount >= 7
fav_cnt[is_popular & !is.na(is_popular), , drop = FALSE] %>%
  select(text, favoriteCount) %>%
  arrange(desc(favoriteCount))