# Complexity_index.R

library(foreign)
library(dplyr)
library(geometry)
library(readstata13)
#library(easyCODA)
library(sjmisc)
library(ggplot2)
library(tidyverse)
library(tidyr)
library(ade4)
# strongly prefer fixed over scientific notation when printing numbers
options(scipen = 999)
# NOTE(review): clearing the global environment is kept for backward
# compatibility; running the script in a fresh R session makes it unnecessary
rm(list = ls())
# set working directory to the source file location when the script is run
# from a saved document inside RStudio; otherwise keep the current working
# directory (the input files are then read relative to it)
if (requireNamespace("rstudioapi", quietly = TRUE) && rstudioapi::isAvailable()) {
  script_path <- rstudioapi::getActiveDocumentContext()$path
  # an unsaved document has an empty path, and setwd("") would fail
  if (nzchar(script_path)) {
    setwd(dirname(script_path))
  }
}

# Input files (also available at
# "https://github.com/hkwilliamchiu/Complexity-index/")
# descriptor scores
occ1990dd_ONET_2 <- read.dta(file = "occ1990dd_ONET_2.dta")
# main results of Caines et al.
CHK_occupation_level <- read.dta13(file = "CHK_occupation_level.dta")
# PCA generated by data_build_occ_level.do in the submission data
ONET_pca_scores_produced <- read.dta13(file = "ONET_pca_scores_produced.dta")
# titles of occupation by occ1990dd
occ1990dd_names <- read.dta(file = "occ1990dd_names.dta")

#-------------------------------------------------------------------------------
# Data processing
#-------------------------------------------------------------------------------
# attach occupation titles to the PCA scores and move the identifier columns
# (occ1990dd, occ1990dd_title) to the front
ONET_pca_scores_produced <- ONET_pca_scores_produced %>%
  merge(occ1990dd_names, by = "occ1990dd") %>%
  select("occ1990dd", "occ1990dd_title", everything())

# do the same for the descriptor scores
occ1990dd_ONET_2 <- occ1990dd_ONET_2 %>%
  merge(occ1990dd_names, by = "occ1990dd") %>%
  select("occ1990dd", "occ1990dd_title", everything())

# drop unused columns
occ1990dd_ONET_2 <- occ1990dd_ONET_2[
  , !(names(occ1990dd_ONET_2) %in%
        c("_install", "_troubleshooting", "_repair", "_merge")),
  drop = FALSE
]
# drop every column whose name matches the DEMING pattern; a logical mask is
# used because negative indexing with an empty grep() result (-integer(0))
# would drop ALL columns when no column matches
occ1990dd_ONET_2 <- occ1990dd_ONET_2[
  , !grepl("DEMING_*", colnames(occ1990dd_ONET_2)),
  drop = FALSE
]

#-------------------------------------------------------------------------------
# Complexity "Score" (following Equation 1 from Caines et al.; not included
# in their data)
#-------------------------------------------------------------------------------
# calculate factor loadings: PCA of the descriptor columns (names starting with
# "_"), with rows weighted by the `weight` column
descriptor_cols <- grep("^_.*", colnames(occ1990dd_ONET_2))
WPCA <- dudi.pca(occ1990dd_ONET_2[, descriptor_cols],
                 row.w = occ1990dd_ONET_2$weight, scannf = FALSE)
# first-axis column loadings, with the sign flipped
factorloadings_numeric <- -WPCA[["c1"]][["CS1"]]

# calculate complexity scores as in Equation 1: for each occupation, the dot
# product of its descriptor scores with the first-axis loadings.
# note that "factorloadings_numeric" is derived from the same column selection
# as "descriptor_matrix", so both list the descriptors in the same order and
# the dot product can be used directly
descriptor_matrix <- as.matrix(occ1990dd_ONET_2[, descriptor_cols])
complexsco <- data.frame(
  occ1990dd = as.numeric(occ1990dd_ONET_2[["occ1990dd"]]),
  complexsco = vapply(
    seq_len(nrow(descriptor_matrix)),
    function(i) dot(as.numeric(descriptor_matrix[i, ]), factorloadings_numeric),
    numeric(1)
  )
)

# percentile rank of the score; inside mutate() `complexsco` is the column
complexsco <- complexsco %>%
  mutate(pctile_complexsco = percent_rank(complexsco))

# pull the complexity index and its percentile out of the PCA scores
complexind <- ONET_pca_scores_produced[
  , c("occ1990dd", "complexind", "pctile_complexind")
]

# attach the complexity score and the complexity index to occ1990dd_ONET_2 and
# bring the identifier and complexity columns to the front
occ1990dd_ONET_2 <- occ1990dd_ONET_2 %>%
  merge(complexsco, by = "occ1990dd") %>%
  merge(complexind, by = "occ1990dd") %>%
  select("occ1990dd", "occ1990dd_title", "complexsco", "pctile_complexsco",
         "complexind", "pctile_complexind", everything())

# order occupations from highest to lowest pctile_complexind
occ1990dd_ONET_2 <- occ1990dd_ONET_2[
  order(-occ1990dd_ONET_2[["pctile_complexind"]]),
]

write.csv(x = occ1990dd_ONET_2, file = "occ1990dd_ONET_2_CMZ.csv",
          row.names = FALSE)

# Table 2, top 5 occupations (first six columns)
head(occ1990dd_ONET_2[1:6], 5)
# Table 3, bottom 5 occupations (first six columns)
tail(occ1990dd_ONET_2[1:6], 5)
#-------------------------------------------------------------------------------


#-------------------------------------------------------------------------------
# construct overview of descriptors ("descriptor_overview_CMZ.csv")
#-------------------------------------------------------------------------------
# one row per occupation: identifier, title, percentile of the complexity index
# and, across all descriptor columns (names starting with "_"), the sum, the
# mean and the number of descriptors that are exactly zero.
# Computed column-wise instead of growing the data frame row by row.
descriptor_values <- occ1990dd_ONET_2[
  , grep("^_.*", colnames(occ1990dd_ONET_2)),
  drop = FALSE
]

descriptor_overview <- data.frame(
  occ1990dd = as.numeric(occ1990dd_ONET_2[["occ1990dd"]]),
  occ1990dd_title = as.character(occ1990dd_ONET_2[["occ1990dd_title"]]),
  pctile_complexind = occ1990dd_ONET_2[["pctile_complexind"]],
  # unname() keeps the row names of occ1990dd_ONET_2 from leaking into the
  # result through the named vectors returned by rowSums()/rowMeans()
  descriptor_sum = unname(rowSums(descriptor_values)),
  descriptor_mean = unname(rowMeans(descriptor_values)),
  descriptor_zero_count = unname(rowSums(descriptor_values == 0)),
  stringsAsFactors = FALSE
)

write.csv(descriptor_overview, "descriptor_overview_CMZ.csv", row.names = FALSE)

#-------------------------------------------------------------------------------
# SENSITIVITY CHECK ("sensitivity_CMZ.csv")
#-------------------------------------------------------------------------------
# Rank the rows of a two-column table by its second column, largest first.
#
# matrix:   an n x 2 data frame (or matrix); column 1 holds the occupation
#           identifier, column 2 the values to rank by
# col_name: name to give the value column in the result
#
# Returns the n rows sorted by decreasing value with a third column holding the
# rank (1 = largest). Columns are named "occ1990dd", <col_name> and
# "rank_<col_name>". Tied values keep their input order and receive
# consecutive ranks.
get_rank <- function(matrix, col_name) {
  # drop = FALSE keeps a one-row matrix input two-dimensional
  ranked <- matrix[order(-matrix[, 2]), , drop = FALSE]
  # seq_len() (unlike 1:nrow) yields an empty rank column for empty input
  ranked <- cbind(ranked, seq_len(nrow(ranked)))
  colnames(ranked) <- c("occ1990dd", col_name, paste0("rank_", col_name))
  ranked
}

# benchmark: ranking by pctile_complexind (from Caines et al.), with the
# occupation title placed right after the identifier
sensitivity <- occ1990dd_ONET_2[c("occ1990dd", "pctile_complexind")] %>%
  get_rank("pctile_complexind") %>%
  inner_join(occ1990dd_names, by = "occ1990dd") %>%
  select("occ1990dd", "occ1990dd_title", everything())

# ranking by the complexity score
score_rank <- get_rank(
  occ1990dd_ONET_2[c("occ1990dd", "complexsco")],
  "complexsco"
)

# attach the score ranking and compute "rank_complexsco_diff", the absolute
# difference in ranking relative to the benchmark
sensitivity <- sensitivity %>%
  right_join(score_rank, by = "occ1990dd") %>%
  mutate(rank_complexsco_diff = abs(rank_complexsco - rank_pctile_complexind))

# Bind the ranking from a new PCA specification to the benchmark table.
#
# pred:        matrix or data frame whose first column holds one score per row
#              of the global occ1990dd_ONET_2, in the same row order
# col_name:    label of the specification; used for the new column names
# sensitivity: table holding at least "occ1990dd" and "rank_pctile_complexind"
#
# Returns `sensitivity` merged (by occ1990dd) with three new columns:
# <col_name>, "rank_<col_name>" and "rank_<col_name>_diff", the absolute
# difference between the new rank and rank_pctile_complexind.
attach_func <- function(pred, col_name, sensitivity) {
  ids <- occ1990dd_ONET_2[["occ1990dd"]]
  scores <- pred[, 1]
  # scores are paired with identifiers by position; a length mismatch would
  # otherwise be recycled silently and pair scores with the wrong occupations
  if (length(scores) != length(ids)) {
    stop("`pred` must have one row per row of occ1990dd_ONET_2", call. = FALSE)
  }

  ranked <- data.frame(ids, unname(scores))
  colnames(ranked) <- c("occ1990dd", col_name)
  ranked <- get_rank(ranked, col_name)

  sensitivity <- merge(sensitivity, ranked, by = "occ1990dd")

  rank_col <- paste0("rank_", col_name)
  sensitivity[[paste0(rank_col, "_diff")]] <-
    abs(sensitivity[[rank_col]] - sensitivity[["rank_pctile_complexind"]])
  sensitivity
}

# Specification "R_PCA": principal component analysis in R without weighting;
# cor = TRUE runs it on the correlation matrix, i.e. with standardization
PCA <- princomp(
  occ1990dd_ONET_2[, grepl("^_.*", colnames(occ1990dd_ONET_2))],
  cor = TRUE,
  scores = TRUE
)
# PCA[["scores"]][,1]
pred <- predict(
  PCA,
  newdata = occ1990dd_ONET_2[, grepl("^_.*", colnames(occ1990dd_ONET_2))]
)

sensitivity <- attach_func(pred = pred, col_name = "R_PCA",
                           sensitivity = sensitivity)

# Specification "W_R_PCA": principal component analysis in R with row weights
# (the `weight` column) and standardization.
# this is where we replicate Caines et al.'s data: rankings are entirely the
# same (rank_W_R_PCA=rank_pctile_complexind)
WPCA <- dudi.pca(
  occ1990dd_ONET_2[, grepl("^_.*", colnames(occ1990dd_ONET_2))],
  row.w = occ1990dd_ONET_2$weight,
  scannf = FALSE
)

# row coordinates on the first axis, with the sign flipped
pred_weighted <- as.data.frame(-WPCA[["li"]][["Axis1"]])

sensitivity <- attach_func(pred = pred_weighted, col_name = "W_R_PCA",
                           sensitivity = sensitivity)

# Descriptors to drop so that only the 30, 20 or 10 descriptors with the
# highest factor loadings remain (i.e. the lowest 5, 15 and 25 descriptors).
# Each list extends the previous one.
lowest5_descriptors <- c(
  "_percep_speed", "_monitor", "_program", "_science", "_flex_closure"
)
lowest15_descriptors <- c(
  lowest5_descriptors,
  "_evaluate", "_judge_quality", "_math", "_speed_closure", "_number_facility",
  "_creative", "_develop", "_memorization", "_math_reason", "_process"
)
lowest25_descriptors <- c(
  lowest15_descriptors,
  "_cat_flex", "_information_ord", "_update", "_oral_expr", "_originality",
  "_decisions_prob", "_written_expr", "_prob_sensitive", "_analyze", "_fluency"
)

# Re-run the weighted PCA without the given descriptors and attach the
# resulting ranking to the sensitivity table.
#
# lowest_descriptors: names of the descriptor columns to leave out
# col_name:           label for the new specification (passed to attach_func)
# sensitivity:        sensitivity table to extend
#
# Returns the extended sensitivity table produced by attach_func().
check_drop_descriptors <- function(lowest_descriptors, col_name, sensitivity) {
  descriptors <- occ1990dd_ONET_2[, grep("^_.*", colnames(occ1990dd_ONET_2))]
  kept_names <- setdiff(names(descriptors), lowest_descriptors)
  descriptors <- descriptors[, kept_names]

  narrow_fit <- dudi.pca(descriptors, row.w = occ1990dd_ONET_2$weight,
                         scannf = FALSE)
  # row coordinates on the first axis, with the sign flipped
  narrow_scores <- as.data.frame(-narrow_fit[["li"]][["Axis1"]])

  attach_func(narrow_scores, col_name, sensitivity)
}

# add the High30, High20 and High10 specifications in turn; `.` passes the
# running sensitivity table as the third argument
sensitivity <- sensitivity %>%
  check_drop_descriptors(lowest5_descriptors, "High30", .) %>%
  check_drop_descriptors(lowest15_descriptors, "High20", .) %>%
  check_drop_descriptors(lowest25_descriptors, "High10", .)

write.csv(x = sensitivity, file = "sensitivity_CMZ.csv", row.names = FALSE)

#-------------------------------------------------------------------------------
# Pseudo Occupations
#-------------------------------------------------------------------------------
# descriptor columns (names starting with "_") of the original occupations
train <- occ1990dd_ONET_2[, grep("^_.*", colnames(occ1990dd_ONET_2))]

# PCA on the original occupations (no row weights are passed here)
PCA_pseudo <- dudi.pca(train, scale = TRUE, scannf = FALSE)

# creating occupations with different scores across the board; the number of
# descriptors is taken from the data (35 in the original data set)
n_descriptors <- ncol(train)
# score 1 for all descriptors
pseudo_know_nothing <- rep(1, n_descriptors)
# score 2 for all descriptors
pseudo_low <- rep(2, n_descriptors)
# score 3.5 for all descriptors
pseudo_med <- rep(3.5, n_descriptors)
# score 6.125 for the descriptors with the highest factor loadings (20 of 35 in
# the original data) and zero for the 15 lowest ones.
# Exact name matching is used: substring matching would also zero any column
# whose name merely appears inside one of the listed descriptor names.
pseudo_med_specialist <- rep(6.125, n_descriptors)
pseudo_med_specialist[colnames(train) %in% lowest15_descriptors] <- 0
# score 7 for all descriptors
pseudo_know_it_all <- rep(7, n_descriptors)

# bind all the pseudo-occupations below the original occupations
test <- rbind(train, pseudo_know_nothing, pseudo_low, pseudo_med,
              pseudo_med_specialist, pseudo_know_it_all)

# predicting the complexity measure for the pseudo-occupations using the PCA
# generated from the original occupations
pred <- predict(PCA_pseudo, newdata = test)
# pseudo-occupations get the identifiers 9999 to 9995; the sign of the first
# axis is flipped
pred <- cbind.data.frame(
  "occ1990dd" = c(occ1990dd_ONET_2$occ1990dd, 9999, 9998, 9997, 9996, 9995),
  "occ1990dd_title" = c(occ1990dd_ONET_2$occ1990dd_title,
                        "pseudo_know_nothing", "pseudo_low", "pseudo_med",
                        "pseudo_med_specialist", "pseudo_know_it_all"),
  "pseudo" = -pred[, 1]
)

# rank from highest to lowest predicted value
pred <- pred[order(-pred$pseudo), ]
pred <- cbind(pred, "rank_pseudo" = seq_len(nrow(pred)))

write.csv(pred, "pseudo_occupations_CMZ.csv", row.names = FALSE)

#-------------------------------------------------------------------------------
# Plotting Density of Descriptor Scores (Figure A1)
#-------------------------------------------------------------------------------
# descriptor columns (names starting with "_")
train <- occ1990dd_ONET_2[, grep("^_.*", colnames(occ1990dd_ONET_2))]

# reshape to long format (one row per occupation-descriptor pair) and draw one
# density panel per descriptor; columns are selected by name rather than by the
# positions 1:35 so the plot does not depend on the number of descriptors
train %>%
  tidyr::pivot_longer(cols = dplyr::starts_with("_"),
                      names_to = "id", values_to = "value") %>%
  ggplot(aes(x = value)) +
  geom_density(linetype = "dashed") +
  facet_wrap(~ id) +
  labs(y = "Density",
       x = "Descriptor Score (0-7)") +
  theme_minimal()

#-------------------------------------------------------------------------------
# Scatter Plot of Descriptor Scores (Figure A2)
#-------------------------------------------------------------------------------
# descriptor columns (names starting with "_") plus the percentile of the
# complexity index
train <- cbind(occ1990dd_ONET_2[, grep("^_.*", colnames(occ1990dd_ONET_2))],
               "pctile_complexind" = occ1990dd_ONET_2[, "pctile_complexind"])

# reshape only the descriptor columns to long format, keeping pctile_complexind
# as an identifier column, and draw one scatter panel per descriptor; columns
# are selected by name rather than by the positions 1:35
train %>%
  tidyr::pivot_longer(cols = dplyr::starts_with("_"),
                      names_to = "id", values_to = "value") %>%
  ggplot(aes(x = pctile_complexind, y = value)) +
  geom_point() +
  facet_wrap(~ id) +
  labs(y = "Descriptor Score",
       x = "Percentile of Complexity Index") +
  theme_minimal()