diff --git a/analysis/infographics.Rmd b/analysis/infographics.Rmd deleted file mode 100644 index d138bd9..0000000 --- a/analysis/infographics.Rmd +++ /dev/null @@ -1,91 +0,0 @@ ---- -title: "infographics - Workshop 1 (Sept 2021)" -author: "Natalie Thurlby" -date: "9/30/2021" -output: html_document ---- - -This document creates the info-graphics from Data Hazards Workshop 1 - Sept 2021 - academic focus. -These info-graphics visualise things like how many people found the hazards useful. - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = TRUE) -``` - -```{r} -library(tidyverse) -library(waffle) -``` -## Read in and clean data - -### Data provenance -The data is stored in a private folder called `data`, within the analysis folder. Data can be downloaded from [this](https://uob.sharepoint.com/:f:/r/teams/grp-ethicaldatascience/Shared%20Documents/Data%20Hazards/Results?csf=1&web=1&e=6pvo8u) SharePoint folder, for researchers working on the project only (at time of writing Natalie and Nina). - -The data was collected through a series of surveys via wualtrics, for which the exact questions can be seen on the [Data Hazards project OSF page](https://osf.io/3fv7t/). - -### Read in data - -```{r cars} -survey_data <- read_csv('data/data-hazards-workshop-21-sept-2021.csv') -``` -### Rename labels -```{r} -survey_data <- rename(survey_data, - Name=Q28, - AttendeeType=Q27, - ProjOwner=`Proj Title`, - HazardsPre=`Hazards Pre 1`, - HazardsComments1=`Hazards Comments`, - HazardsPost=`Hazards Post 1`, - HazardsComments2=Q18, - Clarity=Q20, - Ease=Q24, - UsefulHazards=Q21, - UsefulDiscussion=Q26, - Learning=Q23, - OverallHazards=Q27_1, - OverallWorkshop=Q28_1, - EncourageParticipation=Q29, - ) -``` - -### Data cleaning - -Remove first row and use as a lookup for actual wording of questions -```{r} -question_wording <- survey_data[1,] -survey_data <- survey_data[-1,] -``` - -## Plots -To create the pictograms, I followed [this tutorial](https://rud.is/rpubs/building-pictograms.html) as a guide. - -```{r} -``` - -```{r} -extrafont::font_import('~/Downloads/fontawesome-free-5.15.4-web/', prompt=FALSE) -``` - - -### How useful did you think the Data Hazards were for exploring potential downfalls of data science projects? -```{r} -qscale <- c("1"="Very Useful", - "2"="Useful", - "3"="Neutral", - "4"="Not useful", - "5"="Not useful at all") -text <- question_wording$UsefulHazards -data <- tibble(survey_data$UsefulHazards) -#data$qscale = qscale[data] -qscale[data] -i``` -```{r pressure, echo=FALSE} -# TODO: Change format of data so that it works in the geom_waffle -#ggplot(survey_data, aes(fill=UsefulHazards, values=UsefulHazards)) + -# waffle::geom_waffle() -ggplot(xdf, aes(fill=vals)) + - waffle::geom_waffle() -``` - -Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot. diff --git a/analysis/nina-analysis/hazard-ratings-graph.Rmd b/analysis/nina-analysis/hazard-ratings-graph.Rmd deleted file mode 100644 index 7110d37..0000000 --- a/analysis/nina-analysis/hazard-ratings-graph.Rmd +++ /dev/null @@ -1,158 +0,0 @@ ---- -title: "Hazard Ratings Graph" -author: "Nina Di Cara" -date: "30/09/2021" -output: html_document ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = TRUE) - -library(here) -library(dplyr) -library(tidyr) -library(ggplot2) - -data <- read.table(here("..", "data", "21-sept-survey.txt"), sep = "\t", header = TRUE, encoding="utf-8") - -# Add a person ID colum -data$ID <- seq.int(nrow(data)) - -# Do some initial tidying -data <- data %>% - mutate(project = as.factor(project)) - -hazards <- c( - "Data Science (general)", - "Automates decision making", - "Classifies or ranks people", - "Difficult to understand", - "Lacks community involvement", - "Lacks informed consent", - "Danger of misuse", - "May cause direct harm", - "High environmental cost", - "Risk to privacy", - "Reinforces existing biases" -) - -``` - - -```{r scores} - -data %>% - select(contains("eval")) %>% - mutate(across(everything(), as.factor)) %>% - pivot_longer(cols = everything(), names_to = "Question", values_to = "Score", names_prefix = "eval.") %>% - drop_na() %>% - count(Question, Score) %>% - write.csv(., here("..", "data", "ratings.csv")) - -``` - - -```{r get-rankings} - -# Overwrite data with a longer version of the hazard ratings data, so the timepoint of the ratings is a column -data <- data %>% - select(ID, role, project, hazards.1, hazards.2) %>% - pivot_longer(cols = c("hazards.1", "hazards.2"), names_pattern = "hazards.(.)", values_to = "hazards") %>% - rename(timepoint = name) - -ratings <- data %>% pull("hazards") %>% strsplit(., ",") - -# Initialise objects -i = 1 -ratings_clean <-vector("list",42) - -# This loop will go through each set of responses and indicate if each hazard is present or not. -for (response in ratings) { - - # Set up a list for the next person's responses - next_response <- c(i) - - # Make a vector of TRUE/FALSE if each hazard is in the list - for (hazard in hazards) { - val <- hazard %in% response - next_response <- c(next_response, val) - } - - # Add the next response to the list of clean ratings - ratings_clean[[i]] <- next_response - - # Increase i for the next iteration - i = i + 1 -} - -ratings_clean <- as.data.frame(do.call(rbind, ratings_clean)) -colnames(ratings_clean) <- c("ID", hazards) -ratings_clean <- ratings_clean %>% select(!ID) - -# Add the ratings back to the main dataset -data <- cbind(data, ratings_clean) - -# Remove data no longer needed -rm(ratings, ratings_clean) - -``` - -```{r graph} - -data %>% - mutate(ID = as.factor(ID), # Change this so it doesn't get caught in 'numeric' net - timepoint = as.factor(timepoint), - timepoint = recode_factor(timepoint, - "1" = "Before", "2" = "After")) %>% - group_by(project, timepoint) %>% - summarise(across(where(is.numeric), sum)) %>% - ungroup() %>% - pivot_longer(cols = hazards, names_to = "hazard", values_to = "ratings") %>% - ggplot(aes(x=ratings, y=hazard)) + - geom_line(aes(group = hazard)) + - geom_point(aes(color = timepoint), size = 3) + - facet_wrap(~project) + - theme_bw() + - xlab("\nNumber of times selected") + - ylab("Hazard Label\n") + - guides(color=guide_legend(title="Timepoint")) - -ggsave("rating-changes.png", width = 6, height = 4, units = "in", dpi=330) - -``` - - -```{r graph2} - -data %>% - mutate(ID = as.factor(ID), # Change this so it doesn't get caught in 'numeric' net - timepoint = as.factor(timepoint), - timepoint = recode_factor(timepoint, - "1" = "Before", "2" = "After")) %>% - select(!hazards) %>% - pivot_longer(cols = hazards, names_to = "hazard", values_to = "ratings") %>% - pivot_wider(names_from = "timepoint", values_from = "ratings") %>% - mutate(diff = After - Before) %>% - count(hazard, diff) %>% - ungroup() %>% - mutate(value = diff * n) %>% - #pivot_wider(names_from = diff, values_from = n, values_fill = 0) %>% - #rename("Up" = "1", "Down" = "-1") %>% - # Make into a pyramid plot - ggplot(aes(y = hazard, x = value)) + - geom_point(aes(color = as.factor(diff)), size = 3) + - #geom_line(arrow = arrow(ends="first", type = "closed")) + - #geom_line(arrow = arrow(ends="last", type = "closed")) + - geom_arrow(start = 0) + - theme_minimal() + - theme(panel.grid.minor = element_blank()) + - #coord_flip() + - ylab("Hazard Label\n") + - xlab("\nNumber of people who added or removed labels") - - # geom_bar(aes(y = -Down), stat = "identity", fill = "#fc8d62") + - # geom_bar(aes(y = Up), stat = "identity", fill = "#66c2a5") + - - -# ggsave("ratings-moved.png", width = 6, height = 4, units = "in", dpi=330) -``` \ No newline at end of file diff --git a/analysis/nina-analysis/rating-changes.png b/analysis/nina-analysis/rating-changes.png deleted file mode 100644 index 1b2cd51..0000000 Binary files a/analysis/nina-analysis/rating-changes.png and /dev/null differ diff --git a/analysis/nina-analysis/ratings-moved.png b/analysis/nina-analysis/ratings-moved.png deleted file mode 100644 index c94b9d1..0000000 Binary files a/analysis/nina-analysis/ratings-moved.png and /dev/null differ