TerrorNotebook.Rmd

---
title: "Analyzing global terrorist activities"
output:
  html_document: default
  html_notebook: default
author: "Anish Singh Walia"
---


The main aim of this analytics project is to uncover hidden trends and previously unknown information about the terrorist attacks that took place around the world from 1970 to 2016, and to understand *why they happened, where they happened, which countries were most affected, which entities the terrorist groups targeted most often, and which terrorist groups were the most violent and active throughout this period*, among other questions.

We then __drill down__ into specific countries that were heavily affected by terrorist activity in the past, such as India, the USA and the UK.

```{r,warning=FALSE,message=FALSE}
#analyzing the GLobal Terrorist Activities
require(data.table)
require(readr) #to read data faster
require(ggplot2)
#Data preprocessing and Transformation packages
require(dplyr)
require(tidyr)


terror<-read_csv("F:/globalterrorismdb_0617dist.csv")

#checking the structure of data
#str(terror)
#summary(terror)
#a wide data set with lots of columns


#---------------------


```


## Starting with Descriptive Analytics


```{r}
#checking how many terrorist attacks were successfull
table(terror$success)

success<-as.data.frame(table(terror$success))
success$Var1<-ifelse(success$Var1==1,"Successful","Unsuccessful")
names(success)<-c("Attack","count")
#almost 90% of all attacks were successfull
ggplot(aes(x = Attack,y=count), data = success) + 
  geom_col(width=0.5,color="black",fill="purple",alpha=0.5) + 
  labs(x = "Attack",y="Count",title="Distrubution of Successful and Unsuccessful attacks from 1970-2015")


```
Most of the attacks in the past were successfully executed.

---------------

#### Analyzing what types of attacks were carried out

Aggregating and summarising data.

```{r}
table(terror$attacktype1_txt)
#Most common terror attack is Bombing/Explosion

#Finding Succcessful attacks and their attack types
#percentage of Successful and unsuccessful attacks
#converting to a data frame
attackdf<-data.frame(table(terror$attacktype1_txt,
                                      terror$success))


#Spreading the data frame
attackdf<-attackdf %>% spread(key = Var2,Freq)
colnames(attackdf)<-c("Type_of_Attack","Unsuccessful","Successful")

#Adding new columns which contain Percentage of Successful and Unsucesful attacks
attackdf<-attackdf%>% mutate(PerSuccessful=round((Successful/(Unsuccessful+Successful))*100,2), PerUnsuccessful=round((Unsuccessful/(Unsuccessful+Successful))*100,2))

attackdf

#Plotting
plot<-ggplot(aes(x = "" , y = Successful,fill=Type_of_Attack),data = attackdf) + 
  geom_bar(width=1 , stat="identity") + 
  theme(axis.line = element_blank(), 
        plot.title = element_text(hjust=0.5)) + 
        labs(title="Pie Chart of Count of the Types of Successful Attacks",
             x=NULL,y=NULL , fill = 'Type of Attack')


#Final Plot to make it a Pie Chart

plot + coord_polar(theta ="y" , start = 0)  + scale_color_brewer(palette = "Set1")


#Making a Bar plot
theme_set(theme_grey())

ggplot(aes(x = reorder(Type_of_Attack,PerSuccessful),y = PerSuccessful),data = attackdf) + 
  geom_col(color="black",fill="red",alpha=0.6) + 
  coord_flip() + 
  labs(title="Barplot of Types of Attacks and Successful attacks",x="Type of Attack",y="Percentage of Successful Attacks")
          

```

As the plot shows, the most successful *__terror attacks were Hostage Taking,
Kidnapping, Armed Assaults and Bombings__*.


----------------

#### Analyzing the countries where the terrorist attacks took place

```{r}
countrydf<-data.frame(table(terror$success,terror$country_txt))
#Spreading the Dataframe
countrydf<-countrydf %>% spread(Var1,Freq)
names(countrydf)<-c("Country","Unsuccessful","Successful")
countrydf<-countrydf %>% mutate(TotalAttacks=(Successful+Unsuccessful))
#renaming the columns


#FInding the Top countries with most succcessfully executed terror attacks

Topcountrydf<- countrydf %>% group_by(Country) %>%
        summarise(Mean_Successfull = mean(Successful)) %>%
        top_n(20) %>%
        arrange(desc(Mean_Successfull))


#Finding Countries with Most Unsuccessful Terror Attack attempts
TopFailsAttacks<- countrydf %>% group_by(Country) %>%
        summarise(Mean_Unsuccessfull = mean(Unsuccessful)) %>%
        top_n(20) %>%
        arrange(desc(Mean_Unsuccessfull))


#Countires with least Terrorist Attacks

LeastAttackdf<-countrydf %>% group_by(Country) %>%
      filter(TotalAttacks %in% seq(20,100)) %>%
      select(TotalAttacks) %>%
      top_n(30) %>%
      arrange(TotalAttacks)

#New Zealand ,Malaysia, Hong Kong , Cuba ,UAE etc are countries having very less
#Terrorist attacks less than 100 attempts.


#Plotting the Barplots
theme_set(theme_classic())
ggplot(aes(x = reorder(Country,Mean_Successfull) , y = Mean_Successfull),data = Topcountrydf) + 
  geom_col(color="black",fill="#1111F7") + 
  coord_flip() +
  labs(x = "Countries",y="Successfull Terrorist Attacks from 1970-2015")


#Countires with Most Unsuccessful Terror Attacks
theme_set(theme_bw())
ggplot(aes(x = reorder(Country,Mean_Unsuccessfull),y=Mean_Unsuccessfull),data = TopFailsAttacks) + 
  geom_col(stat="identity",color="black",fill="#DEFD16") + 
  coord_flip() +
  labs(x = "Countries",y="Unsuccessful Terrorist Attacks from 1970-2015") + 
  scale_y_continuous(limits = c(0,2100),breaks=seq(0,2100,200))


#Countries with Least Terrorist Activities 

leastFilter<- LeastAttackdf %>% 
  filter(TotalAttacks > 50) %>% top_n(20)
#Bar plot for Countries having attacks > 50
theme_set(theme_bw())
ggplot(aes(x = reorder(Country,-TotalAttacks),y=TotalAttacks),data = leastFilter) + 
  geom_col(stat="count",color="black",fill="#DEFD16") + 
  coord_flip() +
  labs(x = "Countries with Least Terrorist Attacks",y="Total Terrorist Attacks from 1970-2015")
  

```


----------


### What were the targets of the terrorists?


```{r}
worldTargetdf<-data.frame(table(terror$targtype1_txt)) %>% arrange(desc(Freq))

theme_set(theme_bw())
ggplot(aes(x = reorder(Var1,Freq),y = Freq),data = worldTargetdf) +
  geom_col(fill='purple') +
  coord_flip()


```

*__As we can see from the plot above, the most targeted entities were Citizens, followed by the Military, Police, Government and Businesses.__*


-----------


## Creating a new data frame

```{r}
countryList<-c("United States","United Kingdom")

SpecificTargdf<-terror %>% select(success,target1,targtype1_txt,attacktype1_txt,
                                  country_txt,city,iyear,gname,nkill) %>% 
                          filter(country_txt %in% countryList)
```

The above data frame contains only attacks in two major developed countries, i.e. the __United States of America__ and the __United Kingdom__, both of which were in the top 20 list of countries with the most successfully executed terrorist attacks.


---------------------


## Analyzing attacks in the USA

Creating a data frame with only a few selected variables.

```{r}
UStarg<-na.omit(SpecificTargdf) %>% filter(country_txt=="United States")
summary(UStarg$nkill)

```


#### Cities with the most successful attacks


```{r}
uscitydf<-data.frame(table(UStarg$city,UStarg$success))

uscitydf<-uscitydf %>% spread(Var2,Freq)
names(uscitydf)<-c("city",'unsuccessful','success')

uscitysuccess<-uscitydf %>% group_by(city) %>%
                          summarise(success=mean(success)) %>%
                          arrange(desc(success)) %>%
                          top_n(30)
#most attacks in New York
theme_set(theme_minimal())
ggplot(aes(x=reorder(city,success),y=success),data=uscitysuccess) +
  geom_col(fill="#131D75")+
  coord_flip() +
  labs(x="City",y="Number of successfull attacks")


```


#### Time series of successful attacks per year in the USA

Creating a dataframe with year and number of attacks.

```{r}
theme_set(theme_bw())

yearsuccess<-data.frame(table(UStarg$iyear,UStarg$success))
yearsuccess<-yearsuccess %>% spread(Var2,Freq)
names(yearsuccess)<-c("year","fail",'success')

#generating numeric year column
year<-data.frame(seq(1970,1992))
year[24:46,]<-seq(1994,2016)
names(year)<-c("year")

#adding the year df to yearsuccess df
yearsuccess$attack<-year


#Plotting the time series splot
success=ggplot(data=yearsuccess,aes(x=attack,y=success)) +
  geom_point(color="#E80110",size=2) + 
  geom_line(color="#E80110") + 
  scale_x_continuous(limits=c(1970,2016),breaks=seq(1970,2016,4)) +
  labs(x="Year",y="Number of successful attacks",title="Time series of attacks in USA")

success
  

fail=ggplot(data=yearsuccess,aes(x=attack,y=fail)) +
  geom_point(color="#E18001",size=2) + 
  geom_line(color="#E18001") + 
  scale_x_continuous(limits=c(1970,2016),breaks=seq(1970,2016,4))  +
  labs(x="Year",y="Number of failed attacks",title="Time series of failed attacks in USA")

 fail


```


-----------------


## Analyzing attacks in India

I will create a new data frame for India by filtering and selecting only the relevant columns from the original data frame.

```{r}
#All the terrorist attacks in INDIA
indiaAttack<-terror %>%
  filter(country_txt=="India") %>% select(iyear,provstate,city,summary,location,success,attacktype1_txt,targtype1_txt,gname,weaptype1_txt,nkill) 

#Which state has most attacks 
table(indiaAttack$provstate)
indiastate<-data.frame(table(indiaAttack$provstate,
                             indiaAttack$success))

indiastate<-indiastate %>% spread(Var2,Freq)
names(indiastate)<-c("state","Unsuccessful","successful")

#plotting states and number of Successfull attacks
ggplot(aes(x=reorder(state,successful),y=successful ),data = indiastate) + geom_col(color="black",fill="blue",alpha=0.6) + 
  coord_flip() + 
  scale_y_continuous(limits=c(0,2000),breaks=seq(0,2000,400)) + 
  labs(x = "States",y="Successful terrorist attacks")
#most successfull terrorist attacks in Jammu and Kashmir
#We have Punjab , J and K and Assam on the top with most Successfull attacks
  
#number of kills
ggplot(aes(x = reorder(provstate,nkill), y = nkill),data = na.omit(indiaAttack)) + geom_col(fill="#E63B10") + 
  coord_flip() +
  labs(x="State",y="Number of Kills from 1970-2015")


ggplot(aes(x = reorder(provstate,nkill)),data = indiaAttack) + 
  geom_bar(fill="green",color="black") + 
  coord_flip()
  

```

Punjab, J&K and Assam top the list of states with the most successful terrorist attacks, and the largest numbers of fatalities due to terrorist activities occurred in Chhattisgarh, Assam and J&K as well.

```{r}
#cities with most terrorist attacks
citydf<-data.frame(table(indiaAttack$city,indiaAttack$success))
citydf<-citydf %>%spread(Var2,Freq)
names(citydf)<-c("city","unsuccess","success")

#dataframe consisting of cities with top most count of successful attacks
cityTopsuccess<-citydf %>% group_by(city) %>%
  summarise(success = mean(success)) %>%
  arrange(desc(success)) %>%
  top_n(25)

#Plot for cities with most successful attacks added
ggplot(aes(x = reorder(city,success) , y = success),data = cityTopsuccess) + 
  geom_col(fill="#FE3C01") + 
  coord_flip() + 
  scale_y_continuous(limits=c(0,600),breaks=seq(0,600,100)) +
  labs(x="city name",y="Count of Successful terrorist attacks")


```

-----------------------


#### Checking the types of attacks that occurred in India


```{r}
india_typeAttack<-with(indiaAttack,data.frame(table(attacktype1_txt,success)))
#spreading the dataframe
india_typeAttack<-india_typeAttack %>% spread(success,Freq)
india_typeAttack
#Most Bombings occured in india followed by armed assaults.
names(india_typeAttack)<-c("type","fail","success")

india_typeAttack<-india_typeAttack%>% mutate(successRate=round((success/(success+fail))*100,2) , failRate=round((fail/(success+fail))*100,2))

#plotting barplots now
ggplot(aes(y = successRate , x = reorder(type,successRate)),data = india_typeAttack) + geom_col(width=0.8,fill='#0000A4',alpha=0.7) +
  coord_flip() +
  labs(x="Type of attack",y="Success Rate of Attacks(in %)")


```

The surprising thing is that __Bombings/Explosions__ have the highest count among all types of terror attacks, yet their success rate is lower. The attack type with the highest success rate is __unarmed assault__.

---------------------------


#### Analyzing the Target of the Terrorist activities


```{r}

target_india<-with(indiaAttack,data.frame(table(targtype1_txt,success)))
target_india<-target_india %>% spread(success,Freq)
names(target_india)<-c("target","fail","success")
target_india<-target_india%>% mutate(total=fail+success,successRate=round((success/(success+fail))*100,2) , failRate=round((fail/(success+fail))*100,2))

#barplot with distribution of the most attacks in and its target
ggplot(aes(x = reorder(target,total),y=total),data=target_india) + 
  geom_bar(stat="identity",color="white",fill="#FE0202") +
  coord_flip() +
  labs(x ="Target Type ", y ="Total number of terror attacks")
#Most attacks targetted for Private citizens and property followed by police and goverment


#---------Most successful targets----------#
theme_set(theme_classic())

ggplot(aes(x = reorder(target,successRate),y=successRate),data=target_india) +
  geom_col(width=0.7,fill="#F22424",alpha=0.7) + coord_flip() +
  labs(x="Target of Terror attacks",y="% of Successul Attacks")


```


-------------------


#### Time series analysis of the number of kills over the years

I will create a data frame consisting of each year and the total number of kills in that year.

```{r}

#creating a new data frame-grouping by year and summarising by total sum of kills for a year
yearkillsIndia<-na.omit(indiaAttack) %>%group_by(iyear) %>% summarise(nkills=sum(nkill))

#Time series analysis
theme_set(theme_bw())
ggplot(aes(x = iyear, y =nkills),data=yearkillsIndia) +
  geom_point(color="purple",size=2) + 
  geom_line(color="#9124F2") + 
  scale_x_continuous(limits=c(1975,2016),breaks=seq(1975,2016,4)) + 
  labs(x="Year",y="number of Kills")

#highest number of kills in year 2010

#another time series chart using dygraphs
require(dygraphs)
dygraph(yearkillsIndia) %>%
  dyHighlight(highlightCircleSize = 5, 
              highlightSeriesBackgroundAlpha = 0.2,
              hideOnMouseOut = FALSE)


```

The above time series plot shows that the highest number of kills were in year __2010__.


------------------------------

### Group names associated with terrorist activities

Creating a new dataframe which is grouped by the __Group name__ .

```{r,message=FALSE,warning=FALSE,fig.height=6,fig.width=12}
gnamedf<-indiaAttack %>% group_by(gname) %>%
        select(success,attacktype1_txt,targtype1_txt,nkill,iyear,provstate,city)


mostSuccessGroup<-as.data.frame(table(gnamedf$gname,gnamedf$success,
                                      gnamedf$attacktype1_txt))

#spreading the dataframe
mostSuccessGroup<-mostSuccessGroup %>% spread(Var2,Freq)
names(mostSuccessGroup)<-c("gname","attacktype","Unsuccessful","success")

SuccessGroup<-mostSuccessGroup%>% group_by(gname) %>%
          select(success,Unsuccessful,attacktype) %>%
          arrange(desc(success))
         

#considering only groups which have successful attack>20 and removing unknown groups                        
SuccessGroup <- SuccessGroup %>% filter(gname!="Unknown",success>20)

#Plotting a Barplot of Gropus vs number of successfull attacks 
theme_set(theme_bw())
p<-ggplot(aes(x=reorder(gname,success),y=success),data=SuccessGroup) + 
  geom_col(aes(fill=attacktype)) + 
  coord_flip()+
  labs(x="Group Name",y="Number of Successful attacks",title="Groups and their successful attacks",
       fill="Type of attack") + 
  scale_y_continuous(breaks=seq(0,2000,200))

p+scale_fill_brewer(palette = "Dark2")


```
 
-----------------------


### The most used Weapon and weapon types used in Attacks

Now let's analyze the weapons most often used by the extremists to carry out the attacks. We need to create a separate data frame for the arms and weapons used.


```{r}
#grouping by group_name
weapondf<-na.omit(indiaAttack) %>% filter(gname!="Unknown") %>%
  group_by(gname) %>%
  select(success,nkill,weaptype1_txt ) %>%
  arrange(desc(nkill))

#data frame with most number of kills and grouped by Group name
mostkilldf<- weapondf %>% group_by(gname) %>%
  summarise(sum_kill=sum(nkill)) %>%
  arrange(desc(sum_kill)) %>%
  top_n(20)

#plot of most kills by which terrorist group
theme_set(theme_bw())
ggplot(aes(x = reorder(gname,sum_kill) , y = sum_kill ),data = mostkilldf) +
  geom_col(fill="red") + 
  coord_flip() +
  labs(x = "Terrorist group",y="total number of kills",title="Plot of total kills vs Terrorist Group" )
  

#data frame for weapons which killed most people
weaponKill<-weapondf %>% group_by(weaptype1_txt) %>% 
  summarise(sum_kill=sum(nkill)) %>%
  filter(sum_kill >0 ) %>% 
  arrange(desc(sum_kill))
weaponKill


```