-
Notifications
You must be signed in to change notification settings - Fork 2
/
TerrorNotebook.Rmd
593 lines (347 loc) · 16.4 KB
/
TerrorNotebook.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
---
title: "Analyzing global terrorist activities"
output:
  html_document: default
  html_notebook: default
author: Anish Singh Walia
---
The main aim of this analytics project is to uncover hidden trends and previously unknown information about the terrorist attacks that took place around the world from 1970-2016, and to understand *why they happened, where they happened, which countries were most affected, which entities the terrorist groups targeted most often, and which terrorist groups were the most violent and active throughout that period*, among other questions.
We then __drill down__ into specific countries that have been heavily affected by terrorist activity in the past, such as India, the USA and the UK.
```{r,warning=FALSE,message=FALSE}
# Analyzing the global terrorist activities.
#
# Packages are attached with library() rather than require(): require() only
# returns FALSE (with a warning) when a package is missing, so the failure
# would otherwise surface later as a "could not find function" error.
library(data.table)
library(readr)    # to read data faster
library(ggplot2)
# Data preprocessing and transformation packages
library(dplyr)
library(tidyr)

# NOTE(review): absolute, machine-specific path -- point this at wherever the
# CSV export lives before running the notebook.
terror <- read_csv("F:/globalterrorismdb_0617dist.csv")

# Checking the structure of the data (a wide data set with lots of columns):
# str(terror)
# summary(terror)
#---------------------
```
## Starting with Descriptive Analytics
```{r}
# How many terrorist attacks were successful?
table(terror$success)

# Turn the tabulation into a two-column data frame for plotting. The outcome
# code 1 is labelled "Successful"; every other code is "Unsuccessful".
success <- as.data.frame(table(terror$success))
success$Var1 <- ifelse(success$Var1 == 1, "Successful", "Unsuccessful")
names(success) <- c("Attack", "count")

# Almost 90% of all attacks were successful.
# The title now states 1970-2016, matching the period the rest of the notebook
# works with (the US time series below runs through 2016).
ggplot(success, aes(x = Attack, y = count)) +
  geom_col(width = 0.5, color = "black", fill = "purple", alpha = 0.5) +
  labs(
    x = "Attack",
    y = "Count",
    title = "Distribution of Successful and Unsuccessful attacks from 1970-2016"
  )
```
Most of the attacks in the past were successfully executed.
---------------
#### Analyzing the types of attacks that were carried out
Aggregating and summarising data.
```{r}
# Frequency of each attack type.
# Most common terror attack is Bombing/Explosion.
table(terror$attacktype1_txt)

# Cross-tabulate attack type against outcome and convert to a data frame
# (one row per attack type / outcome pair).
attackdf <- data.frame(table(terror$attacktype1_txt, terror$success))

# Spread the outcome codes into columns so each attack type has one row with
# its "0" (unsuccessful) and "1" (successful) counts side by side.
attackdf <- attackdf %>% spread(key = Var2, value = Freq)
colnames(attackdf) <- c("Type_of_Attack", "Unsuccessful", "Successful")

# Percentage of successful and unsuccessful attacks per attack type.
attackdf <- attackdf %>%
  mutate(
    PerSuccessful = round((Successful / (Unsuccessful + Successful)) * 100, 2),
    PerUnsuccessful = round((Unsuccessful / (Unsuccessful + Successful)) * 100, 2)
  )
attackdf

# Stacked single bar that becomes a pie chart once polar coordinates are used.
plot <- ggplot(attackdf, aes(x = "", y = Successful, fill = Type_of_Attack)) +
  geom_bar(width = 1, stat = "identity") +
  theme(
    axis.line = element_blank(),
    plot.title = element_text(hjust = 0.5)
  ) +
  labs(
    title = "Pie Chart of Count of the Types of Successful Attacks",
    x = NULL, y = NULL, fill = 'Type of Attack'
  )

# Final plot: the slices are coloured through the `fill` aesthetic, so the
# brewer palette has to be applied to the fill scale. The previous
# scale_color_brewer() call had no effect because no colour aesthetic is
# mapped.
# NOTE(review): "Set1" offers at most 9 colours -- confirm the number of
# attack types does not exceed that.
plot + coord_polar(theta = "y", start = 0) + scale_fill_brewer(palette = "Set1")

# Bar plot of the success percentage per attack type.
theme_set(theme_grey())
ggplot(attackdf, aes(x = reorder(Type_of_Attack, PerSuccessful), y = PerSuccessful)) +
  geom_col(color = "black", fill = "red", alpha = 0.6) +
  coord_flip() +
  labs(
    title = "Barplot of Types of Attacks and Successful attacks",
    x = "Type of Attack",
    y = "Percentage of Successful Attacks"
  )
```
As the plot shows, the most successful *__terror attacks were Hostage Taking, Kidnapping, Armed Assaults and Bombings__*.
----------------
#### Analyzing the countries where the terrorist attacks took place
```{r}
# Attack counts per country, split by outcome.
countrydf <- data.frame(table(terror$success, terror$country_txt))
# Spread the outcome codes into columns: one row per country.
countrydf <- countrydf %>% spread(key = Var1, value = Freq)
names(countrydf) <- c("Country", "Unsuccessful", "Successful")
countrydf <- countrydf %>% mutate(TotalAttacks = (Successful + Unsuccessful))

# Top 20 countries by number of successfully executed attacks.
# countrydf holds exactly one row per country, so the per-country mean is just
# that country's count; the column name is kept for the plots below.
# top_n() is given an explicit `wt` so it no longer depends on which column
# happens to be last (and no longer prints a "Selecting by" message).
Topcountrydf <- countrydf %>%
  group_by(Country) %>%
  summarise(Mean_Successfull = mean(Successful)) %>%
  top_n(20, Mean_Successfull) %>%
  arrange(desc(Mean_Successfull))

# Top 20 countries by number of unsuccessful attack attempts.
TopFailsAttacks <- countrydf %>%
  group_by(Country) %>%
  summarise(Mean_Unsuccessfull = mean(Unsuccessful)) %>%
  top_n(20, Mean_Unsuccessfull) %>%
  arrange(desc(Mean_Unsuccessfull))

# Countries with few attacks: between 20 and 100 in total, sorted ascending.
# The earlier group_by(Country) %>% top_n(30) step was dropped: with one row
# per country every group had a single row, so it never removed anything.
LeastAttackdf <- countrydf %>%
  filter(TotalAttacks >= 20, TotalAttacks <= 100) %>%
  select(Country, TotalAttacks) %>%
  arrange(TotalAttacks)
# New Zealand, Malaysia, Hong Kong, Cuba, UAE etc. are countries having very
# few terrorist attacks (less than 100 attempts).

# Bar plot: countries with the most successful attacks.
theme_set(theme_classic())
ggplot(Topcountrydf, aes(x = reorder(Country, Mean_Successfull), y = Mean_Successfull)) +
  geom_col(color = "black", fill = "#1111F7") +
  coord_flip() +
  labs(x = "Countries", y = "Successful Terrorist Attacks from 1970-2016")

# Bar plot: countries with the most unsuccessful attacks.
# geom_col() always uses the identity stat; the `stat` argument that was
# passed before is not a geom_col() parameter and was only ignored with a
# warning.
theme_set(theme_bw())
ggplot(TopFailsAttacks, aes(x = reorder(Country, Mean_Unsuccessfull), y = Mean_Unsuccessfull)) +
  geom_col(color = "black", fill = "#DEFD16") +
  coord_flip() +
  labs(x = "Countries", y = "Unsuccessful Terrorist Attacks from 1970-2016") +
  scale_y_continuous(limits = c(0, 2100), breaks = seq(0, 2100, 200))

# Countries with the least terrorist activity: keep those above 50 attacks
# (i.e. 51-100 given the filter applied to LeastAttackdf).
leastFilter <- LeastAttackdf %>%
  filter(TotalAttacks > 50)

# Bar plot for countries having more than 50 attacks.
theme_set(theme_bw())
ggplot(leastFilter, aes(x = reorder(Country, -TotalAttacks), y = TotalAttacks)) +
  geom_col(color = "black", fill = "#DEFD16") +
  coord_flip() +
  labs(
    x = "Countries with Least Terrorist Attacks",
    y = "Total Terrorist Attacks from 1970-2016"
  )
```
----------
### What were the target of the Terrorists?
```{r}
# Number of attacks per target type, most frequent first.
worldTargetdf <- arrange(
  data.frame(table(terror$targtype1_txt)),
  desc(Freq)
)

# Horizontal bar chart, target types ordered by their attack count.
theme_set(theme_bw())
ggplot(data = worldTargetdf, mapping = aes(x = reorder(Var1, Freq), y = Freq)) +
  geom_col(fill = 'purple') +
  coord_flip()
```
*__As we can see from the plot above, the most targeted entities were citizens, followed by the military, police, government and businesses.__*
-----------
## Creating a new data frame
```{r}
# Countries to drill into.
countryList <- c("United States", "United Kingdom")

# Keep only the attacks in those countries, restricted to the columns used by
# the country-level analysis.
SpecificTargdf <- terror %>%
  filter(country_txt %in% countryList) %>%
  select(
    success, target1, targtype1_txt, attacktype1_txt,
    country_txt, city, iyear, gname, nkill
  )
```
The above data frame contains only attacks in two major developed countries, i.e. the __United States of America__ and the __United Kingdom__, both of which were in the top 20 list of countries with the most successfully executed terrorist attacks.
---------------------
## Analyzing attacks in the USA
Creating a data frame with only some targeted variables.
```{r}
# US attacks only, after discarding every row that has a missing value in any
# of the selected columns.
UStarg <- filter(na.omit(SpecificTargdf), country_txt == "United States")

# Distribution of the number of kills per attack.
summary(UStarg[["nkill"]])
```
#### Cities with the most successful attacks
```{r}
# Attack counts per US city, split by outcome (one row per city).
uscitydf <- data.frame(table(UStarg$city, UStarg$success))
uscitydf <- uscitydf %>% spread(key = Var2, value = Freq)
names(uscitydf) <- c("city", 'unsuccessful', 'success')

# Top 30 cities by number of successful attacks.
# Each city has a single row, so the per-city mean is simply its count.
# top_n() gets an explicit `wt`, so the ranking column no longer depends on
# column order and the "Selecting by" message is gone.
uscitysuccess <- uscitydf %>%
  group_by(city) %>%
  summarise(success = mean(success)) %>%
  arrange(desc(success)) %>%
  top_n(30, success)
# Most attacks in New York.

theme_set(theme_minimal())
ggplot(uscitysuccess, aes(x = reorder(city, success), y = success)) +
  geom_col(fill = "#131D75") +
  coord_flip() +
  labs(x = "City", y = "Number of successfull attacks")
```
#### Time series of year and number of successful attacks in the USA
Creating a dataframe with year and number of attacks.
```{r}
theme_set(theme_bw())

# Attack counts per year for the USA, split by outcome (one row per year).
yearsuccess <- data.frame(table(UStarg$iyear, UStarg$success))
yearsuccess <- yearsuccess %>% spread(key = Var2, value = Freq)
names(yearsuccess) <- c("year", "fail", 'success')

# Numeric year for the x axis.
# table() turns the year into a factor, so it is converted back through its
# labels. Previously the years were hard-coded as 1970-1992 plus 1994-2016 and
# attached as a whole data frame; that only lines up if the data has exactly
# those 46 years, and fails or mislabels rows otherwise.
yearsuccess$attack <- as.numeric(as.character(yearsuccess$year))

# Kept for backward compatibility: a one-column data frame of the years.
year <- data.frame(year = yearsuccess$attack)

# Time series of successful attacks.
success <- ggplot(data = yearsuccess, aes(x = attack, y = success)) +
  geom_point(color = "#E80110", size = 2) +
  geom_line(color = "#E80110") +
  scale_x_continuous(limits = c(1970, 2016), breaks = seq(1970, 2016, 4)) +
  labs(
    x = "Year",
    y = "Number of successful attacks",
    title = "Time series of attacks in USA"
  )
success

# Time series of failed attacks.
fail <- ggplot(data = yearsuccess, aes(x = attack, y = fail)) +
  geom_point(color = "#E18001", size = 2) +
  geom_line(color = "#E18001") +
  scale_x_continuous(limits = c(1970, 2016), breaks = seq(1970, 2016, 4)) +
  labs(
    x = "Year",
    y = "Number of failed attacks",
    title = "Time series of failed attacks in USA"
  )
fail
```
-----------------
## Analyzing attacks in India
I will create a new data frame for India by filtering and selecting only the relevant columns from the original data frame.
```{r}
# All the terrorist attacks in India, restricted to the columns used below.
indiaAttack <- terror %>%
  filter(country_txt == "India") %>%
  select(
    iyear, provstate, city, summary, location, success,
    attacktype1_txt, targtype1_txt, gname, weaptype1_txt, nkill
  )

# Which state has the most attacks?
table(indiaAttack$provstate)

# Attack counts per state, split by outcome (one row per state).
indiastate <- data.frame(table(indiaAttack$provstate, indiaAttack$success))
indiastate <- indiastate %>% spread(key = Var2, value = Freq)
names(indiastate) <- c("state", "Unsuccessful", "successful")

# States and their number of successful attacks.
ggplot(indiastate, aes(x = reorder(state, successful), y = successful)) +
  geom_col(color = "black", fill = "blue", alpha = 0.6) +
  coord_flip() +
  scale_y_continuous(limits = c(0, 2000), breaks = seq(0, 2000, 400)) +
  labs(x = "States", y = "Successful terrorist attacks")
# Most successful terrorist attacks in Jammu and Kashmir.
# We have Punjab, J and K and Assam on the top with most successful attacks.

# Total number of kills per state.
# Only rows missing the state or the kill count are dropped. na.omit() on the
# whole data frame also discarded every attack with a missing `summary`,
# `location` or `city`, which understated the totals. Kills are summed per
# state first so the bars are ordered by the totals they display.
indiaAttack %>%
  filter(!is.na(provstate), !is.na(nkill)) %>%
  group_by(provstate) %>%
  summarise(nkill = sum(nkill)) %>%
  ggplot(aes(x = reorder(provstate, nkill), y = nkill)) +
  geom_col(fill = "#E63B10") +
  coord_flip() +
  labs(x = "State", y = "Number of Kills from 1970-2016")

# Number of attacks per state, ordered by the mean kills per attack.
# na.rm = TRUE is forwarded to mean(): without it any state with a single
# missing kill count got an NA score and lost its place in the ordering.
ggplot(indiaAttack, aes(x = reorder(provstate, nkill, FUN = mean, na.rm = TRUE))) +
  geom_bar(fill = "green", color = "black") +
  coord_flip()
```
We have Punjab, J&K and Assam at the top with the most successful terrorist attacks; the maximum number of fatalities due to terrorist activities occurred in Chhattisgarh, Assam and J&K as well.
```{r}
# Cities with the most terrorist attacks: counts per city split by outcome
# (one row per city).
citydf <- data.frame(table(indiaAttack$city, indiaAttack$success))
citydf <- citydf %>% spread(key = Var2, value = Freq)
names(citydf) <- c("city", "unsuccess", "success")

# Top 25 cities by count of successful attacks.
# Each city has a single row, so the per-city mean is simply its count.
# top_n() gets an explicit `wt`, so the ranking column no longer depends on
# column order and the "Selecting by" message is gone.
cityTopsuccess <- citydf %>%
  group_by(city) %>%
  summarise(success = mean(success)) %>%
  arrange(desc(success)) %>%
  top_n(25, success)

# Plot for cities with the most successful attacks.
ggplot(cityTopsuccess, aes(x = reorder(city, success), y = success)) +
  geom_col(fill = "#FE3C01") +
  coord_flip() +
  scale_y_continuous(limits = c(0, 600), breaks = seq(0, 600, 100)) +
  labs(x = "city name", y = "Count of Successful terrorist attacks")
```
-----------------------
#### Checking the types of attacks that occurred in India
```{r}
# Attack type versus outcome for India, as a long data frame with columns
# attacktype1_txt, success and Freq.
india_typeAttack <- data.frame(
  table(
    attacktype1_txt = indiaAttack$attacktype1_txt,
    success = indiaAttack$success
  )
)

# One row per attack type, outcome codes spread into columns.
india_typeAttack <- spread(india_typeAttack, success, Freq)
india_typeAttack
# Bombings are the most frequent attack type in India, followed by armed
# assaults.

names(india_typeAttack) <- c("type", "fail", "success")

# Success and failure rates in percent, rounded to two decimals.
india_typeAttack <- mutate(
  india_typeAttack,
  successRate = round((success / (success + fail)) * 100, 2),
  failRate = round((fail / (success + fail)) * 100, 2)
)

# Horizontal bar chart of the success rate per attack type.
ggplot(
  data = india_typeAttack,
  mapping = aes(y = successRate, x = reorder(type, successRate))
) +
  geom_col(width = 0.8, fill = '#0000A4', alpha = 0.7) +
  coord_flip() +
  labs(x = "Type of attack", y = "Success Rate of Attacks(in %)")
```
The surprising thing is that __Bombings/Explosions__ have the highest count amongst all types of terror attacks, but their success rate is lower. The type of terrorist activity with the highest success rate is __unarmed assaults__.
---------------------------
#### Analyzing the Target of the Terrorist activities
```{r}
# Target type versus outcome for India, reshaped to one row per target type.
target_india <- data.frame(
  table(
    targtype1_txt = indiaAttack$targtype1_txt,
    success = indiaAttack$success
  )
)
target_india <- spread(target_india, success, Freq)
names(target_india) <- c("target", "fail", "success")

# Total attacks plus success / failure rates in percent (two decimals).
target_india <- mutate(
  target_india,
  total = fail + success,
  successRate = round((success / (success + fail)) * 100, 2),
  failRate = round((fail / (success + fail)) * 100, 2)
)

# Bar plot: total number of attacks per target type.
ggplot(data = target_india, mapping = aes(x = reorder(target, total), y = total)) +
  geom_bar(stat = "identity", color = "white", fill = "#FE0202") +
  coord_flip() +
  labs(x = "Target Type ", y = "Total number of terror attacks")
# Private citizens and property are attacked most, followed by police and
# government.

#---------Most successful targets----------#
theme_set(theme_classic())
ggplot(
  data = target_india,
  mapping = aes(x = reorder(target, successRate), y = successRate)
) +
  geom_col(width = 0.7, fill = "#F22424", alpha = 0.7) +
  coord_flip() +
  labs(x = "Target of Terror attacks", y = "% of Successul Attacks")
```
-------------------
#### Time series analysis of the number of kills over the years
I will create a data frame consisting of the year and the total number of kills for that particular year.
```{r}
#creating a new data frame-grouping by year and summarising by total sum of kills for a year
yearkillsIndia<-na.omit(indiaAttack) %>%group_by(iyear) %>% summarise(nkills=sum(nkill))
#Time series analysis
theme_set(theme_bw())
ggplot(aes(x = iyear, y =nkills),data=yearkillsIndia) +
geom_point(color="purple",size=2) +
geom_line(color="#9124F2") +
scale_x_continuous(limits=c(1975,2016),breaks=seq(1975,2016,4)) +
labs(x="Year",y="number of Kills")
#highest number of kills in year 2010
#another time series chart using dygraphs
require(dygraphs)
dygraph(yearkillsIndia) %>%
dyHighlight(highlightCircleSize = 5,
highlightSeriesBackgroundAlpha = 0.2,
hideOnMouseOut = FALSE)
```
The above time series plot shows that the highest number of kills was in the year __2010__.
------------------------------
### Group names associated with terrorist activities
Creating a new dataframe which is grouped by the __Group name__ .
```{r,message=FALSE,warning=FALSE,fig.height=6,fig.width=12}
# Per-attack records for India with the group name first.
# gname is selected explicitly instead of being carried along implicitly by a
# group_by() that no later step made use of.
gnamedf <- indiaAttack %>%
  select(gname, success, attacktype1_txt, targtype1_txt, nkill, iyear, provstate, city)

# Three-way tabulation: group x outcome x attack type.
mostSuccessGroup <- as.data.frame(
  table(gnamedf$gname, gnamedf$success, gnamedf$attacktype1_txt)
)
# Spread the outcome codes into columns: one row per (group, attack type).
mostSuccessGroup <- mostSuccessGroup %>% spread(key = Var2, value = Freq)
names(mostSuccessGroup) <- c("gname", "attacktype", "Unsuccessful", "success")

SuccessGroup <- mostSuccessGroup %>%
  select(gname, success, Unsuccessful, attacktype) %>%
  arrange(desc(success))

# Remove unknown groups and keep only (group, attack type) rows with more
# than 20 successful attacks.
SuccessGroup <- SuccessGroup %>% filter(gname != "Unknown", success > 20)

# Bar plot of groups vs number of successful attacks, stacked by attack type.
# The bars are ordered with FUN = sum so the order follows each bar's total
# length; reorder() defaults to the mean of the stacked segments, which can
# put a group with several attack types below a shorter bar.
theme_set(theme_bw())
p <- ggplot(SuccessGroup, aes(x = reorder(gname, success, FUN = sum), y = success)) +
  geom_col(aes(fill = attacktype)) +
  coord_flip() +
  labs(
    x = "Group Name",
    y = "Number of Successful attacks",
    title = "Groups and their successful attacks",
    fill = "Type of attack"
  ) +
  scale_y_continuous(breaks = seq(0, 2000, 200))
p + scale_fill_brewer(palette = "Dark2")
```
-----------------------
### The most used Weapon and weapon types used in Attacks
Now let's analyze the weapons most used by the extremists to execute the attacks. We need to create a separate data frame for the arms and weapons used.
```{r}
# Per-attack records for known groups: outcome, kills and weapon type,
# deadliest attacks first.
# Only rows with a missing kill count are dropped. na.omit() on the whole
# data frame also removed every attack with a missing `summary`, `location`,
# `city` or other column, which understated the kill totals below.
weapondf <- indiaAttack %>%
  filter(!is.na(nkill), gname != "Unknown") %>%
  select(gname, success, nkill, weaptype1_txt) %>%
  arrange(desc(nkill))

# Total kills per group; keep the 20 deadliest groups.
# top_n() gets an explicit `wt`, so the ranking column no longer depends on
# column order and the "Selecting by" message is gone.
mostkilldf <- weapondf %>%
  group_by(gname) %>%
  summarise(sum_kill = sum(nkill)) %>%
  arrange(desc(sum_kill)) %>%
  top_n(20, sum_kill)

# Plot of total kills per terrorist group.
theme_set(theme_bw())
ggplot(mostkilldf, aes(x = reorder(gname, sum_kill), y = sum_kill)) +
  geom_col(fill = "red") +
  coord_flip() +
  labs(
    x = "Terrorist group",
    y = "total number of kills",
    title = "Plot of total kills vs Terrorist Group"
  )

# Total kills per weapon type, dropping weapon types with no recorded kills.
weaponKill <- weapondf %>%
  group_by(weaptype1_txt) %>%
  summarise(sum_kill = sum(nkill)) %>%
  filter(sum_kill > 0) %>%
  arrange(desc(sum_kill))
weaponKill
```