hrs-selection.qmd

---
title: "Natural Selection Across Three Generations of Americans"
author: 
  - David Hugh-Jones
  - Tobias Edwards
abstract: |
  We investigate natural selection on polygenic scores
  in the contemporary US, using the Health and Retirement Study. Across
  three generations, scores which correlate negatively (positively) with
  education are selected for (against). However, results only
  partially support the economic theory of fertility as an explanation for 
  natural selection. The theory predicts that
  selection coefficients should be stronger among low-income, 
  less educated, unmarried and younger parents, but these predictions are only
  half borne out: coefficients are larger only among low-income
  parents and unmarried parents. We also estimate effect sizes corrected 
  for noise in the polygenic scores. Selection for some health traits is 
  similar in magnitude to that for cognitive traits. 
date: "February 2024"
format: 
  pdf:
    keep-tex: true
    include-in-header:
      - text: |
          \usepackage{placeins}
    geometry:
    - top=2in
    - left=1.25in
    - right=1.25in
    - heightrounded
editor: visual
editor_options: 
  chunk_output_type: console
knitr:
  opts_chunk: 
    echo: false
bibliography: bibliography-hrs.bib
---

```{r}
#| label: setup
#| cache-lazy: false

# TE One time I tried to render the document with cache on I got the error message on the first chunk: "Error in `lazyLoadDBinsertVariable()`: ! long vectors not supported yet: connections.c:6028" Added cache-lazy: false which seems to fix things

options(digits = 3)
set.seed(271075)

library(tibble)
library(haven)
library(forcats)
library(dplyr, warn.conflicts = FALSE)
library(glue)
library(stringr)
library(purrr)
library(carData)
library(car, warn.conflicts = FALSE) # masks some
library(ggplot2)
library(tidyr)
library(santoku, warn.conflicts = FALSE) # masks tidyr::chop
 # masks dotchart:
suppressPackageStartupMessages(library(survey, warn.conflicts = FALSE))

knitr::knit_hooks$set("inline", 
  \(x) {
    if (! is.na(x <- suppressWarnings(as.numeric(x)))) {
      x <- prettyNum(formatC(x, digits = 3, format = "f"), ",")
    }
    x
  }
)
nice_names <- c(
  SCZ = "Schizophrenia",
  WC = "Waist circ.",
  WHR = "Waist-hip ratio",
  NEUROT = "Neuroticism",
  WELLB = "Well-being",
  DEPSYMP = "Depr. symptoms",
  CD = "Cor. art. disease",
  MI = "Myoc. infarction",
  CORT = "Cortisol",
  T2D = "Type 2 diabetes",
  BIP = "Bipolar",
  ADHD_PGC17 = "ADHD",
  XDISORDER = "Cross-disorder",
  MENA = "Age at menarche",
  MENO = "Age at menopause",
  EXTRAVER = "Extraversion",
  AUTISM = "Autism",
  AB = "Antisoc. behav.",
  OCD = "OCD",
  AFBC = "Age 1st birth combn",
  AFBF = "Age 1st birth F",
  AFBM = "Age 1st birth M",
  MDD2 = "Maj. depr. disorder",
  PTSDAA = "PTSD Afr",
  PTSDEA = "PTSD Eur",
  PTSDC = "PTSD combined",
  HDL = "HDL cholesterol",
  LDL = "LDL cholesterol",
  TC = "Total cholesterol",
  ANXFS = "Anxiety factor",
  ANXCC = "Anxiety case-control",
  BUN = "Blood urea nitro.",
  BUNTE = "Blood urea nitro. TE",
  CKD = "Kidney disease",
  CKDTE = "Kidney disease TE",
  DBP = "Diastolic blood pr.",
  BMI2 = "BMI",
  HEIGHT2 = "Height",
  AI = "Age smoking init.",
  CPD_GSCAN19 = "Cigs per day",
  DPW = "Drinks per week",
  SC = "Smoking cessation",
  SI = "Smoking initiation",
  HTN = "Hypertension",
  CANNABIS = "Cannabis",
  GWAD2NA = "Alzheimer's no APOE",
  ALZ_01AD2NA = "Alzheimer's p 0.01 no APOE",
  GWAD2WA = "Alzheimer's",
  ALZ_01AD2WA = "Alzheimer's p 0.01",
  ALC = "Alcohol dependence",
  PP = "Pulse pressure",
  SBP = "Systolic blood pr.",
  EGFR = "Glom. filtr.",
  EGFRTE = "Glom. filtr. TE",
  EA3 = "Educ. attainment",
  HBA1CAA = "HbA1c Afr",
  HBA1CEA = "HbA1c Eur",
  GCOG2 = "Gen. cognition",
  ACTIVITY = "Physical Activity", # pgirep here
  ADHD = "ADHD",
  ADVENTURE = "Adventurous",
  AFB = "Age First Birth",
  ALLERGYCAT = "Cat Allergy",
  ALLERGYDUST = "Dust Allergy",
  ALLERGYPOLLEN = "Pollen Allergy",
  ASTECZRHI = "Asthma/Eczema/Rhinitis",
  ASTHMA = "Asthma",
  AUDIT = "Alcohol Misuse",
  BMI = "Body Mass Index",
  CANNABISpgirep = "Cannabis Use",
  COGEMP = "Cognitive Empathy",
  COPD = "Chronic Obstructive Pulmonary Disease",
  CPD = "Cigarettes per Day",
  CP = "Cognitive Performance",
  DELAYDISC = "Delay Discounting",
  DEP = "Depressive Symptoms",
  DPWpgirep = "Drinks per Week",
  EA = "Educational Attainment",
  EVERSMOKE = "Ever Smoker",
  EXTRA = "Extraversion",
  FAMSAT = "Satisfaction with Family",
  FINSAT = "Satisfaction with Finances",
  FRIENDSAT = "Satisfaction with Friends",
  HAYFEVER = "Hayfever",
  HEIGHTpgirep = "Height",
  HIGHMATH = "Highest Math",
  LEFTOUT = "Left Out of Social Activity",
  LONELY = "Loneliness",
  MENARCHE = "Age First Menses",
  MIGRAINE = "Migraine",
  MORNING = "Morning Person",
  NARCIS = "Narcissism",
  NEARSIGHTED = "Nearsightedness",
  NEBmen = "Number of Children (men)",
  NEBwomen = "Number of Children (women)",
  NEURO = "Neuroticism",
  OPEN = "Openness",
  READING = "Age Started Reading",
  RELIGATT = "Religious Attendance",
  RISK = "Risk Tolerance",
  SELFHEALTH = "Self-Rated Health",
  SELFMATH = "Self-Rated Math Ability",
  SWB = "Subjective Well-Being",
  VOICEDEEP = "Age Voice Deepened",
  WORKSAT = "Satisfaction with Work"
)

eths <- c("white", "black")

titles <- c(white = "White participants", black = "Black participants")


shorten <- function (x) {
  x <- str_remove(x, "^(A4|E4)_")
  x_short <- str_remove(x, "_.*")

  duplicates <- table(x_short)
  duplicates <- names(duplicates)[duplicates > 1]
  x_short[x_short %in% duplicates] <- x[x_short %in% duplicates]

  x_short <- str_replace(x_short, "^01", "ALZ_01") # two alzheimer's variants
  x_short
}


calc_income <- function (x, eth) {
  x <- x |> filter(ethnicity == eth)
  ages <- x |> select(matches("r\\d+agey_e"))
  income <- x |> select(matches("r\\d+iearn"))
  income <- rowMeans(income, na.rm = TRUE)
  mod <- lm(income ~ factor(x$rabyear), weights = x$weight, na.action =
              na.exclude)
  income_r <- resid(mod)
  income_r <- income_r - min(income_r, na.rm = TRUE) + 1

  income_r
}


calc_n <- function (mods, eths = "white", format = "(N = %s)") {
  nobs <- map_dbl(eths, \(x) nobs(mods[[x]][[1]]) ) 
  if (! is.null(format)) nobs <- sprintf(format, nobs)
  names(nobs) <- eths
  nobs
}


# avoid warnings about zero weights
svyglm_quiet <- function (...) suppressWarnings(svyglm(...))


# for a confidence interval
significant <- function (x) {
  stopifnot(is.numeric(x), length(x) == 2)
  all(x < 0) || all(x > 0)
}


insignificant <- Negate(significant)


# rand HRS longit data. VERY BIG
# Done just once:
# rand_orig <- haven::read_dta("data/randhrs/randhrs1992_2020v1.dta")
# saveRDS(rand_orig, file = "data/randhrs/rand-orig.rds", compress = FALSE)
rand_orig <- readRDS("data/randhrs/rand-orig.rds")

# ragender: 1 male, 2 female
rand_orig <- filter(rand_orig, 
                    (rabyear <= 1965 & ragender == 1) |
                    (rabyear <= 1970 & ragender == 2))

# african ancestry
pgen4a <- haven::read_dta("data/PGENSCORE4r3/pgenscore4a_r.dta")
# european ancestry
pgen4e <- haven::read_dta("data/PGENSCORE4r3/pgenscore4e_r.dta")


pgirep <- readr::read_tsv("data/pgi-repository/HRS_PGIrepo_v1.0.txt",
                          show_col_types = FALSE)
names(pgirep)[1:2] <- c("hhid","pn")

pgirep <- pgirep |> 
  select(hhid, pn, matches("single"), -PGI_NEBwomen_single) |> 
  rename_with(
    \(x) str_replace(x, "PGI_(.*)_single", "\\1")
  )

# TE PGI rep does not come pre-standardised. Going to standardize over whole sample to match
# the approach taken with PGENscores
pgirep[, 3:ncol(pgirep)] <- scale(pgirep[, 3:ncol(pgirep)])

# original HRS tracker file
trk_orig <- haven::read_dta("data/trk2020v3/trk2020tr_r.dta")
trk_weights <- trk_orig |> select(HHID, PN, matches("BIOWGTR"))

# rand family respondents file
# Done just once:
# rand_fam <- haven::read_dta(
#   "data/randhrsfam1992_2018v2_STATA/randhrsfamr1992_2018v2.dta")
# saveRDS(rand_fam, "data/randhrsfam1992_2018v2_STATA/rand_fam.rds")
rand_fam <- readRDS("data/randhrsfam1992_2018v2_STATA/rand_fam.rds")

# only selected columns to avoid creating suffixes when we join with
# rand.
# Age of oldest kid 2016; age of oldest kid 2010; grandchildren; num siblings who died;
# Mother's years of education; father's years of education
rand_fam <- rand_fam |> select(hhid, pn, h13ageokid, h10ageokid, h13gkid, r10sbdied,
                               rameduc, rafeduc) |>
                        mutate(paedyrs = as.numeric(rameduc + rafeduc)/2) #parent ave education

# 2010 (and others) record "age started smoking"
# rand_2010 <- haven::read_dta("data/hd10f6a_STATA/hd10f6a.dta")
# saveRDS(rand_2010, "data/hd10f6a_STATA/rand_2010.rds")
rand_2010 <- readRDS("data/hd10f6a_STATA/rand_2010.rds")
rand_2010 <- rand_2010 |> 
  select(hhid, pn, mc120, matches("mlb041")) |> 
  mutate(
    age_smoked = ifelse(mc120 %in% c(95, 98, 99), NA_real_, mc120)
  )
rand_2010$anxiety <- rand_2010 |> 
  select(matches("mlb041")) |> 
  rowMeans(na.rm = TRUE)
rand_2010$anx_na <- rand_2010 |> 
  select(matches("mlb041")) |> 
  is.na() |> 
  rowSums()
# matches the documentation at 
# https://hrs.isr.umich.edu/sites/default/files/biblio/HRS2006LBQscale.pdf :
rand_2010$anxiety[rand_2010$anx_na > 2] <- NA_real_

# 2016 survey had a module with 18 questions diagnosing adult ADHD
# rand_2016 <- haven::read_dta("data/h16f2c_STATA/h16f2c.dta")
# saveRDS(rand_2016, "data/h16f2c_STATA/rand_2016.rds")
rand_2016 <- readRDS("data/h16f2c_STATA/rand_2016.rds")
rand_2016 <- rand_2016 |> 
  select(hhid, pn, pv001:pv018) |> 
  mutate(
    across(matches("pv\\d\\d\\d"), \(x) ifelse(x %in% 8:9, NA, x)),
    across(matches("pv\\d\\d\\d"), \(x) x == 1) # 1 is "yes', 5 is "no"
  )
rand_2016$adhd_score <- rand_2016 |> 
  select(pv001:pv018) |> 
  rowSums(na.rm = TRUE)
rand_2016$adhd_nas <- rand_2016 |> 
  select(pv001:pv018) |> 
  is.na() |> 
  rowSums(na.rm = TRUE)
rand_2016$adhd_score <- rand_2016$adhd_score/(18 - rand_2016$adhd_nas)

pgens <- list(white = pgen4e, black = pgen4a)

pcs <- c(paste0("PC1_5", LETTERS[1:5]), paste0("PC6_10", LETTERS[1:5]))

rands <- list()

for (eth in eths) {
  pgen <- pgens[[eth]]
  names(pgen) <- shorten(names(pgen))
  rands[[eth]] <- inner_join(rand_orig, pgen,
                     join_by(hhid, pn),
                     unmatched = "drop",
                     relationship = "one-to-one")
}


rand <- list_rbind(rands, names_to = "ethnicity")

old_pgs <- c("EDU2", "EDU3", "HEIGHT", "BMI",
           "AD", "AD2", "GENCOG", "EVRSMK", "LONG", "CPD_TAG10",
           "ADHD_PGC10", "MDD")

# TE going to simplify names of pgen4e. Then merge, then create pgs.
pgen4e_new_names <- pgen4e |>
    rename_with(shorten) |> 
    # weird that we have to operate on whole df to use select semantics
    select(-version,
           -matches("^PC\\d"),
           -matches("^NEB"), # number ever born
           # outdated, or EDU3 without 23andMe:
           -any_of(old_pgs)
           ) 

pgen4e_new_names <- left_join(pgen4e_new_names, pgirep, 
                              by = join_by(hhid == hhid, pn == pn),
                              suffix = c("", "pgirep"))

# Height is a duplicated name, but it's with the old PGS, so fixing this manually
pgen4e_new_names <- rename(pgen4e_new_names, 
                           HEIGHTpgirep = HEIGHT)

# TE here we removed pgenscores that are duplicated in the pgi repository
# we also remove trans ethnic and African ancestry PGS
dupe_PGS <- c("EA3", # TE when I've tested it the pgirep one is slightly better...
              "GCOG2", #CP
              "BMI2",
              "NEUROT",
              "WELLB",
              "DEPSYMP",
              "ADHD_PGC17",
              "EXTRAVER",
              "AFBC", 
              "PTSDAA", # African    
              "PTSDC", # Combined Euro + African.
              #"ANXFS"  ,"ANXCC", continuous vs case-control anxiety 
              # r = 0.6 let's keep them
              # I suppose it's equivalent to having CP and EA
              "BUNTE",  
              "CKDTE", 
              "BMI2",
              "HEIGHT2",
              "CPD_GSCAN19",
              "DPW",
              "SI",
              "CANNABIS", 
              "GWAD2NA", 
              "ALZ_01AD2NA",
              "GWAD2WA", # Three alzheimers deleted. Ones using p value 1 threshold
              # and alzheimers that remove APOE variant region
              "ALC", #AUDIT is alcohol misuse in pgirep
 	            "EGFR",
	            "HBA1CEA" 
              )


dupe_PGS_black <- c("PTSDAA",   
              "PTSDC", 
              "BUNTE", 
              "CKDTE", 
              "BMI2",
              "CANNABIS", 
              "GWAD2NA", 
              "ALZ_01AD2NA",
              "GWAD2WA", 
 	            "EGFR",
	            "HBA1CEA" 
              )
              
pgen4e_new_names <- select(pgen4e_new_names, 
                           -hhid, -pn, -any_of(dupe_PGS), -any_of(old_pgs))
pgs_white <- names(pgen4e_new_names)
pgs_white[pgs_white == "HEIGHT"] <- "HEIGHTpgirep"

pgs_black <- pgen4a |>
    rename_with(shorten) |> 
    select(-hhid, -pn, -version,
           -matches("^PC\\d"),
           -matches("^NEB"), # number ever born
           # outdated, or EDU3 without 23andMe:
           -any_of(old_pgs),
           -any_of(dupe_PGS_black)
           ) |>
    names()

rand <- left_join(rand, pgirep, by = join_by(hhid, pn),suffix=c("","pgirep"))

# weighting
rand <- left_join(rand, trk_weights, by = join_by(hhid == HHID, pn == PN))
rand$weight <- ifelse(is.na(rand$MBIOWGTR), rand$NBIOWGTR, rand$MBIOWGTR)

# Replacing 0 weights with NA because it can create odd problems
rand$weight <- na_if(rand$weight, 0)

# compensate for fecund parents more likely to have child in survey.
# we use living sibs rather than including dead sibs too here. we think that's
# the correct approach:
rand$parent_weight <- rand$weight / (rand$r10livsib + 1)
rand$child_weight <- rand$weight*rand$raevbrn


rand <- rand |> left_join(rand_fam, 
                          by = join_by(hhid, pn),
                          unmatched = "drop",
                          relationship = "one-to-one")

rand <- rand |> left_join(rand_2016, 
                          by = join_by(hhid, pn),
                          unmatched = "drop",
                          relationship = "one-to-one")

rand <- rand |> left_join(rand_2010, 
                          by = join_by(hhid, pn),
                          unmatched = "drop",
                          relationship = "one-to-one")

# TE this is average number of kids that your kids have
rand$div_h13gkid <- rand$h13gkid / rand$raevbrn


# Some people report having grandkids, but having had no children. Weird!
# This code subsets the data by ethnicity then counts all observations
# where grandkids is greater than 0 but number of kids is 0
gkids_unusable_white <- rand |> 
  filter(ethnicity == "white", h13gkid > 0, raevbrn == 0) |> 
  nrow()
gkids_unusable_black <- rand |> 
  filter(ethnicity == "black", h13gkid > 0, raevbrn == 0) |> 
  nrow()

# if grandkids but no kids div_gkid is infinite. Replacing with NA
rand$div_h13gkid[is.infinite(rand$div_h13gkid)] <- NA

rand <- rand |>
  mutate(.by = c(ethnicity, rabyear), # by year and ethnicity
    mean_weight = mean(weight, na.rm = TRUE),
    mean_child_weight = mean(child_weight, na.rm = TRUE),
    mean_raevbrn = mean(raevbrn*weight/mean_weight, na.rm = TRUE),
    rlrs = raevbrn/mean_raevbrn,
    mean_h13gkid = mean(h13gkid*weight/mean_weight, na.rm = TRUE),
    rlrs_h13gkid = h13gkid/mean_h13gkid,
    mean_div_h13gkid = mean(div_h13gkid*child_weight/mean_child_weight, 
                            na.rm = TRUE), 
    rlrs_div_h13gkid = div_h13gkid/mean_div_h13gkid
  ) |>
  mutate(.by = ethnicity, # calculate these stats within-ethnicity
    income_resid = calc_income(rand, ethnicity[1]),
    income_resid = income_resid - min(income_resid, na.rm = TRUE) + 1,
    income_med = chop_equally(income_resid, 2, labels = c("Low", "High")),
    # as.numeric avoids haven.labelled issues:
    raedyrs = as.numeric(raedyrs),
    r10hearte = as.numeric(r10hearte),
    r10diabe = as.numeric(r10diabe),
    edu = chop(as.numeric(raedyrs), 13, lbl_discrete(symbol = "-")),
    born = chop(rabyear, 1942, labels = c("pre-1942", "post-1942")),
    # wave 10 is 2010
    # table(rands$white$r10mstat)
     # 1                 1.married
     # 2   2.married,spouse absent
     # 3               3.partnered
     # 4               4.separated
     # 5                5.divorced
     # 6      6.separated/divorced
     # 7                 7.widowed
     # 8           8.never married
    married = factor(r10mstat == 1, 
                     levels = c(FALSE, TRUE),
                     labels = c("Other", "Married")),
    r10sibs = r10livsib + 1, # n. sibs including oneself!
    agefbn = r10agey_e - h10ageokid,
    agefbn = ifelse(agefbn < 12, NA_real_, agefbn),
  ) |> 
  mutate(.by = c(ethnicity, ragender),
    agefb = chop_equally(agefbn, 2, labels = c("Low", "High"))
  )


rand$neurot <- rand |> 
  select(matches("r\\d+lbneur")) |> 
  rowMeans(na.rm = TRUE)
# two of four Alzheimer's variants with awkward names:
#pgs <- str_replace(pgs, "^01", "ALZ_01")
pgs_white <- str_replace(pgs_white, "^01", "ALZ_01")
pgs_black <- str_replace(pgs_black, "^01", "ALZ_01")
names(rand) <- str_replace(names(rand), "^01", "ALZ_01")
pgs <- unique(c(pgs_white,pgs_black))

##############################################################
# Residualising then standardising PGS, with sampling weights
##############################################################

dat_eth_list <- list()

for (eth in eths) {

  r_eth <- rand |> subset(ethnicity == eth)
  
   if (eth == "white") {
    pgs <- pgs_white
  } else {
    pgs <- pgs_black
  }
  
for (pg in pgs) {

  reg <- lm(get(pg) ~ PC1_5A + PC1_5B + PC1_5C + PC1_5D + PC1_5E +
                    PC6_10A + PC6_10B + PC6_10C + PC6_10D +PC6_10E, weights = weight, 
                    data = r_eth)

  r_eth <- r_eth |> mutate(
                    !!pg := get(pg) - predict(reg, r_eth),
                    mean_weight = mean(weight, na.rm=TRUE),
                    tot_weight = sum(weight, na.rm=TRUE),
                    mean_res_pgs = sum(get(pg)*weight, na.rm=TRUE)/tot_weight,
                    w.dev = weight*(get(pg) - mean_res_pgs)^2,
                    w.var.pgs = sum(w.dev, na.rm =TRUE)/tot_weight,
                    deman_pgs = get(pg) - mean_res_pgs,
                    !!pg := deman_pgs/sqrt(w.var.pgs)
  )
}
  
  dat_eth_list[[eth]] <- r_eth
  
}

rand <- dat_eth_list |> list_rbind()
pgs <- unique(c(pgs_white,pgs_black))

# SD of EA before and after this process goes from 1 to 1.01, implying that the HRS is slightly more diverse in EA than the population and that Z scoring using sampling weights doesn't really matter

```

```{r}
#| label: prog-cleanup


# clean up cache so it doesn't get too big
rm(rand_orig, trk_orig, rands, rand_fam, rand_2010, rand_2016)

# TE we can delete more
rm(pgen, pgen4a, pgen4e, pgirep, trk_weights)

rand <- rand |> 
  select(ethnicity, all_of(pgs), all_of(old_pgs), 
         all_of(pcs), weight, raevbrn, rlrs,
         raedyrs, r10sibs, edu, income_resid, income_med, married, 
         agefb, agefbn, rabyear, born, raehsamp, raestrat, 
         parent_weight, ragender, adhd_score, neurot, r10diabe, 
         r10hearte, r10cesd, age_smoked, anxiety, r10cogtot, 
         rlrs_div_h13gkid, child_weight, h13ageokid, paedyrs)

r_parents <- svydesign(
    id = ~ raehsamp,
		strata = ~ raestrat,
		weights = ~ parent_weight, 
		nest = TRUE,
		data = rand |> drop_na(parent_weight) 
		)

r_child <- svydesign(
    id = ~ raehsamp,
		strata = ~ raestrat,
		weights = ~ child_weight, 
		nest = TRUE,
		data = rand |> drop_na(child_weight) |>
		        filter(!is.na( rlrs_div_h13gkid), h13ageokid > 39)
		)

rand <- svydesign(
    id = ~ raehsamp,
		strata = ~ raestrat,
		weights = ~ weight, 
		nest = TRUE,
		data = rand |> drop_na(weight))

n_white <- nrow(rand |> subset(ethnicity == "white"))
n_black <- nrow(rand |> subset(ethnicity == "black"))

```

@hugh2022human explain patterns of natural selection on polygenic scores in the UK, using an economic theory of fertility derived from @becker1976child. The theory has two components:

1.  There is a trade-off between time spent working and raising children. This "substitution effect" leads people with more human capital and higher expected wages to have fewer children. Evidence for this is that polygenic scores which correlate positively with human capital correlate negatively with number of children, i.e. they are being selected against; conversely, scores which correlate positively with human capital are being selected for.
2.  The trade-off is sharper for low-income people, people with low human capital, and single parents. Because these groups value income more at the margin, the substitution effect is stronger for them. In other groups, the substitution effect is balanced by the "income effect", that children become more affordable when you get richer. As a result, natural selection is stronger among these groups. Evidence for this is that scores' regression coefficients on number of children are larger among people with lower income or less education, and single parents.

Here, we make an independent test of the theory in the US population, using the Health and Retirement Survey [@hrsrandfam2023; @hrsrand2023]. The motivation is to establish the direction and magnitude of natural selection on a range of traits in Americans, and to test whether the economic theory can explain the selection. Using information on respondents' siblings and grandchildren, we can also extend the analysis to three generations of Americans. This is interesting first because selection effects may accumulate over time, and second because an possible alternative explanation for existing findings is the development of the welfare state, which happened in the US during the "Great Society" programs of the 1960s (i.e. after the respondents' parents' generation). To preview our results, we confirm point 1 above across all three generations. But we only see partial and ambiguous support for point 2. 

# Data

The HRS sample focuses on cohorts born between 1920 and 1960, but contains some younger and older participants. We include only male participants born before 1965 and female participants born before 1970, which guarantees that most will have completed their fertility by 2010. The resulting sample contains `r n_white` genotyped white participants. We focus on these because the sample size is large enough. The appendix reports some basic analyses for the `r n_black` genotyped black participants. (Throughout, "white" and "black" refer to participants who (a) self-identified as non-Hispanic and as "White/Caucasian" and "Black/African-American" respectively, and who (b) had principal components of SNP data close to the respective population mean. See @hrspgs2020.)

Genotyping took place in 2006, 2008 and subsequent years. PGS were taken from those pre-calculated by the HRS [@hrspgs2020] and those produced by the Social Science Genetic Association Consortium, as part of their Polygenic Index Repository [@becker2021resource]. Scores created by the HRS were provided for black and white participants, but Polygenic Index Repository scores were only created for white participants.

For the white participants, when scores from the two samples measured the same trait, we only used the PGS from the Polygenic Index Repository. For some traits, polygenic scores were created from both European ancestry and multiple ancestry GWAS. We choose to use polygenic scores trained only on individuals of European ancestry. We discard obsolete PGS for which there is a newer, more accurate score targeting the same phenotype. We also discard PGS for number of children ever born (but keep scores for age at first birth). This leaves a total of `r length(pgs_white)` scores for the white participants. PGS were residualised on the first ten within-ethnicity principal components of the DNA array data, to reduce bias from population stratification. PGS were then rescaled to zero mean and unit variance.

The key dependent variable is relative lifetime reproductive success (RLRS): number of children ever born, divided by the mean number of children of people born in the same year. RLRS is calculated separately by ethnicity. This is not ideal, because it treats the ethnicities as "separate breeding populations" (i.e. subpopulations who rarely interbreed; black and white people intermarried rarely in the HRS generation, but that is due to laws and norms against intermarriage which have now mostly disappeared.) But the alternative of calculating pooled RLRS would effectively be estimating natural selection in the whole US population by treating whites as representative. We therefore focus on the white US population, with the caveat that results from this "one data point" may not replicate in other ethnicities or countries. The mean number of children of people born in each year was calculated using sampling weights.

The intuition behind our analysis is simple: if a polygenic score predicts more reproductive success, then people higher in the PGS will reproduce more than others, causing scores to increase in the population. However the approach is also based on quantitative genetic theory and is able to yield an estimate of genetic change from one generation to the next.

The Robertson-Price Identity states that the change in a genetic trait between one generation and the next is equal to its covariance with relative fitness [@Price1970identity; @Robertson1966identity], assuming no mutations. In humans, infant mortality is so low today that a person's reproductive success is a very close approximation of the fitness. As such, the covariance between a polygenic score and RLRS yields the expected change in the mean polygenic score per generation. Because the polygenic scores are standardized to have a variance of 1, the regression coefficient of RLRS on the PGS is equal to the covariance. This approach is standard in the literature [@Beauchamp2016selection; @hugh2022human; @kong2017selection].

The HRS contains weights which match survey respondents to the US population. We use weights for the biomarker subsample (\*BIOWGTR in the HRS tracker file). Since half the sample enters the extended interview including biomarker data in each biannual survey, we weight individuals by either their 2010 weight or their 2012 weight. This maximizes the available sample of both black and white respondents, and should approximately match the US population of the sample cohorts between 2010 and 2012. Standardization of polygenic scores used estimates of the population mean and variance of the polygenic score, as estimated with sampling weights. Statistical tests are adjusted for clustering and stratification using the R *survey* package [@lumleysurvey2023].

# Results

```{r}
#| label: stat-regressions
# cache seems to cause trouble even with cache-lazy: FALSE 

options("survey.lonely.psu" = "average")

tidy_pgs <- mod_pgs <- list()
tidy_pgs_ed <- mod_pgs_edyrs <- list()
tidy_pgs_ed_parent <- mod_pgs_edyrs_parent <- list()
tidy_pgs_sibs <- mod_pgs_sibs <- list()
tidy_pgs_gkids <- mod_pgs_gkids <- list()
tidy_pgs_x_ed <- mod_pgs_x_ed <- list()
tidy_pgs_x_inc <- mod_pgs_x_inc <- list()
tidy_pgs_x_mar <- mod_pgs_x_mar <- list()
tidy_pgs_x_agefb <- mod_pgs_x_agefb <- list()

coefs_fert_edu <- cors_fert_edu <- tidy_pgs_joined <- list()
cors_sibs_edu <- list()
cors_sibs_edu_parent <- list()
cors_gkids_edu <- list()
tidy_fert_edu_phen <- list()

for (eth in eths) {
  
  if (eth == "white") {
    pgs <- pgs_white
  } else {
    pgs <- pgs_black
  }
  

  r_eth <- rand |> subset(ethnicity == eth)
  r_parents_eth <- r_parents |> subset(ethnicity == eth)
  r_child_eth <- r_child |> subset(ethnicity == eth)
  
  form_pgs <- map(pgs, \(x) reformulate(c(x), response = "rlrs"))
  mod_pgs[[eth]] <- map(form_pgs, 
                               \(x) svyglm_quiet(x, r_eth))
  
  tidy_pgs[[eth]] <- mod_pgs[[eth]] |> 
    map(broom::tidy, conf.int = TRUE, conf.level = 0.95) |> 
    list_rbind() |>
    filter(term %in% pgs)
  
  form_pgs_edyrs <- map(pgs, \(x) reformulate(c(x),
                                              response = "raedyrs"))
  mod_pgs_edyrs[[eth]] <- map(form_pgs_edyrs, 
                                     \(x) svyglm_quiet(x, r_eth))
  names(mod_pgs_edyrs[[eth]]) <- pgs
  
  tidy_pgs_ed[[eth]] <- mod_pgs_edyrs[[eth]] |>  
    map(broom::tidy) |>
    list_rbind() |> 
    filter(term %in% pgs)
  
  
  form_pgs_edyrs_parent <- map(pgs, \(x) reformulate(c(x),
                                              response = "paedyrs"))
  mod_pgs_edyrs_parent[[eth]] <- map(form_pgs_edyrs_parent, 
                                     \(x) svyglm_quiet(x, r_parents_eth))
  names(mod_pgs_edyrs_parent[[eth]]) <- pgs
  
  tidy_pgs_ed_parent[[eth]] <- mod_pgs_edyrs_parent[[eth]] |>  
    map(broom::tidy) |>
    list_rbind() |> 
    filter(term %in% pgs)
  
  
  tidy_pgs_joined[[eth]] <- inner_join(tidy_pgs[[eth]], tidy_pgs_ed[[eth]],
                                by = join_by(term),
                                unmatched = "error",
                                suffix = c(".fert", ".edyrs"),
                                relationship = "one-to-one")
  cors_fert_edu[[eth]] <- cor(tidy_pgs_joined[[eth]]$estimate.edyrs, 
                     tidy_pgs_joined[[eth]]$estimate.fert)
  coefs_fert_edu[[eth]] <- lm(estimate.fert ~ estimate.edyrs,
                              data = tidy_pgs_joined[[eth]]) |> 
                           coef() |> 
                           pluck("estimate.edyrs")
  
  form_pgs_sibs <- map(pgs, 
                       \(x) reformulate(c(x), response = "r10sibs"))
  mod_pgs_sibs[[eth]] <- map(form_pgs_sibs,
                                    \(x) svyglm_quiet(x, r_parents_eth))
  
  tidy_pgs_sibs[[eth]] <- mod_pgs_sibs[[eth]] |> 
    map(broom::tidy, conf.int = TRUE, conf.level = 0.95) |>
    list_rbind() |> 
    filter(term %in% pgs)
  
  cors_sibs_edu[[eth]] <- cor(tidy_pgs_sibs[[eth]]$estimate, 
                     tidy_pgs_ed[[eth]]$estimate)
  
  cors_sibs_edu_parent[[eth]] <- cor(tidy_pgs_sibs[[eth]]$estimate, 
                     tidy_pgs_ed_parent[[eth]]$estimate)
  
  
    form_pgs_gkids <- map(pgs, 
                       \(x) reformulate(c(x), response = "rlrs_div_h13gkid"))
  mod_pgs_gkids[[eth]] <- map(form_pgs_gkids,
                                    \(x) svyglm_quiet(x, r_child_eth))
  
  tidy_pgs_gkids[[eth]] <- mod_pgs_gkids[[eth]] |> 
    map(broom::tidy, conf.int = TRUE, conf.level = 0.95) |>
    list_rbind() |> 
    filter(term %in% pgs)
  
  cors_gkids_edu[[eth]] <- cor(tidy_pgs_gkids[[eth]]$estimate, 
                     tidy_pgs_ed[[eth]]$estimate)
  

  tidy_fert_edu_phen[[eth]] <- 
    svyglm_quiet(rlrs ~ raedyrs, r_eth) |> 
    broom::tidy(conf.int = TRUE) |> 
    filter(term == "raedyrs")
}

# we only look at interactions among whites, the rest is pointless
for (eth in "white") {
  r_eth <- rand |> subset(ethnicity == eth)
  
  
  if (eth == "white") {
    pgs <- pgs_white
  } else {
    pgs <- pgs_black
  }
  
  # the colon interaction estimates the effect separately within each group
  form_pgs_x_ed <- paste0(pgs, ":edu") |>
    map(\(x) reformulate(c(x, "edu"), response = "rlrs"))
  mod_pgs_x_ed[[eth]]  <-
    map(form_pgs_x_ed, 
               \(x) svyglm_quiet(x, r_eth))
  
  tidy_pgs_x_ed[[eth]] <-
    map(mod_pgs_x_ed[[eth]], broom::tidy, conf.int = TRUE)
  names(tidy_pgs_x_ed[[eth]]) <- pgs
  tidy_pgs_x_ed[[eth]] <-
    list_rbind(tidy_pgs_x_ed[[eth]], names_to = "pgs") |>
    filter(!term %in% c("(Intercept)", "edu13-17")) |>
    mutate(edu = str_remove(term, ".*:"))
  
  form_pgs_x_inc <- paste0(pgs, ":income_med") |>
    map(\(x) reformulate(c(x, "income_med"), response = "rlrs"))
  mod_pgs_x_inc[[eth]]  <-
    map(form_pgs_x_inc, 
               \(x) svyglm_quiet(x, r_eth))
  
  tidy_pgs_x_inc[[eth]] <-
    map(mod_pgs_x_inc[[eth]], broom::tidy, conf.int = TRUE)
  names(tidy_pgs_x_inc[[eth]]) <- pgs
  tidy_pgs_x_inc[[eth]] <-
    list_rbind(tidy_pgs_x_inc[[eth]], names_to = "pgs") |>
    filter(!term %in% c("(Intercept)", "income_medHigh")) |>
    mutate(income_med = str_remove(term, ".*:income_med"))
  
  form_pgs_x_mar <- paste0(pgs, ":married") |>
    map(\(x) reformulate(c(x, "married"), response = "rlrs"))
  mod_pgs_x_mar[[eth]]  <-
    map(form_pgs_x_mar, 
               \(x) svyglm_quiet(x, r_eth))
  
  tidy_pgs_x_mar[[eth]] <-
    map(mod_pgs_x_mar[[eth]], broom::tidy, conf.int = TRUE)
  names(tidy_pgs_x_mar[[eth]]) <- pgs
  tidy_pgs_x_mar[[eth]] <-
    list_rbind(tidy_pgs_x_mar[[eth]], names_to = "pgs") |>
    filter(! term %in% c("(Intercept)", "marriedMarried")) |>
    mutate(
      married = str_remove(term, ".*:married")
    ) 
  
  form_pgs_x_agefb <- paste0(pgs, ":agefb") |>
    map(\(x) reformulate(c(x, "agefb"), response = "rlrs"))
  mod_pgs_x_agefb[[eth]]  <-
    map(form_pgs_x_agefb, 
               \(x) svyglm_quiet(x, r_eth))
  
  tidy_pgs_x_agefb[[eth]] <-
    map(mod_pgs_x_agefb[[eth]], broom::tidy, conf.int = TRUE) |> 
    setNames(pgs) |> 
    list_rbind(names_to = "pgs") |>
    filter(! term %in% c("(Intercept)", "agefbHigh")) |>
    mutate(
      agefb = str_remove(term, ".*:agefb")
    ) 
}

tidy_pgs_eth <- list_rbind(tidy_pgs)
n_tests <- nrow(tidy_pgs_eth)
n_sig_bonf <- sum(tidy_pgs_eth$p.value < 0.05/n_tests)


n_tests_white <- nrow(tidy_pgs$white)
n_sig_bonf_white <- sum(tidy_pgs$white$p.value < 0.05/n_tests_white)

n_tests_black <- nrow(tidy_pgs$black)
n_sig_bonf_black <- sum(tidy_pgs$black$p.value < 0.05/n_tests_black)


```

```{r}
#| label: stat-bootstrap-eth-cor
#| cache: false
#| cache-lazy: false
#| eval: false

# I don't mention this below so turning it off for now
# TODO: should I mention it? It's a low correlation.

n_reps_eth_diff <- 199
rand_boot <- rand |> 
    as.svrepdesign(type = "bootstrap", replicates = n_reps_eth_diff)


calc_eth_cor <- function (weights, data) {
  coef_pgs_white <- pgs |> 
        map(\(x) reformulate(c(x), response = "rlrs")) |> 
        map(\(x) lm(x, data, weights = weights, 
                    subset = ethnicity == "white")) |> 
        map2_dbl(pgs, \(x, y) coef(x)[[y]])
  coef_pgs_black <- pgs |> 
        map(\(x) reformulate(c(x), response = "rlrs")) |> 
        map(\(x) lm(x, data, weights = weights, 
                    subset = ethnicity == "black")) |> 
        map2_dbl(pgs, \(x, y) coef(x)[[y]])
  
  c(cor = cor(coef_pgs_black, coef_pgs_white))
}


cor_eths <- cor(tidy_pgs$white$estimate, tidy_pgs$black$estimate)
cor_eths_boot <- withReplicates(rand_boot, theta = calc_eth_cor)
ci_eths <- confint(cor_eths_boot)[1,]
```

We estimate coefficients of PGS on RLRS. These do not identify causal effects; recall that natural selection involves correlation, not necessarily causation, between selected characteristics and fertility. Appendix @fig-rlrs shows coefficients. Standard errors are large because of the relatively low sample sizes. `r n_sig_bonf_white` scores are significant at Bonferroni-corrected p \< 0.05/`r n_tests_white`. The scores are age at first birth, educational attainment, ADHD, self-rated health and having ever smoked. But we are most concerned with looking at patterns across scores rather than judging the significance of individual scores.

```{r}
#| label: stat-bootstraps
#| cache: false
#| cache-lazy: false
#| cache-comments: true

# TODO set cache to false before final build

# Not sure this actually does anything :-)
old_opts <- options(survey.multicore = TRUE) 

# The large n_reps is to get an accurate upper tail on the black sample, 
# since it is close to 0:
n_reps <-  599

calc_cors_edu <- function (weights, data, var) {
  coef_pgs_fert <- pgs |> 
        map(\(x) reformulate(c(x), response = var)) |> 
        map(\(x) lm(x, data, weights = weights)) |> 
        map_dbl(\(x) coef(x)[[2]])
  
  coef_pgs_edu <- pgs |> 
        map(\(x) reformulate(c(x), response = "raedyrs")) |> 
        map(\(x) lm(x, data, weights = weights)) |> 
        map_dbl(\(x) coef(x)[[2]])
  
  c(
    cor = cor(coef_pgs_edu, coef_pgs_fert),
    coef = coef(lm(coef_pgs_fert ~ coef_pgs_edu))["coef_pgs_edu"]
  )
}

calc_cors_edu_parent <- function (weights, data, var) {
  coef_pgs_fert <- pgs |> 
        map(\(x) reformulate(c(x), response = var)) |> 
        map(\(x) lm(x, data, weights = weights)) |> 
        map_dbl(\(x) coef(x)[[2]])
  
  coef_pgs_edu <- pgs |> 
        map(\(x) reformulate(c(x), response = "paedyrs")) |> 
        map(\(x) lm(x, data, weights = weights)) |> 
        map_dbl(\(x) coef(x)[[2]])
  
  c(
    cor = cor(coef_pgs_edu, coef_pgs_fert),
    coef = coef(lm(coef_pgs_fert ~ coef_pgs_edu))["coef_pgs_edu"]
  )
}
  

cors_fert_edu <- cors_sibs_edu <- list()
for (eth in eths) {
  
  
  pgs <- if (eth == "white") pgs_white else pgs_black
  
  rand_boot <- rand |> 
    subset(ethnicity == eth) |> 
    as.svrepdesign(type = "bootstrap", replicates = n_reps)
  r_parents_boot <- r_parents |> 
    subset(ethnicity == eth) |> 
    as.svrepdesign(type = "bootstrap", replicates = n_reps)
  r_child_boot <- r_child |> 
    subset(ethnicity == eth) |> 
    as.svrepdesign(type = "bootstrap", replicates = n_reps)
  
  cors_fert_edu[[eth]] <- 
    withReplicates(rand_boot, theta = calc_cors_edu, var = "rlrs")
  cors_sibs_edu[[eth]] <- 
    withReplicates(r_parents_boot, theta = calc_cors_edu, var = "r10sibs")
  cors_sibs_edu_parent[[eth]] <- 
    withReplicates(r_parents_boot, theta = calc_cors_edu_parent, var = "r10sibs")
  cors_gkids_edu[[eth]] <- 
    withReplicates(r_child_boot, theta = calc_cors_edu, var = "rlrs_div_h13gkid")
}

options(old_opts)
```

```{r}
#| label: stat-bootstrap-eth-diff
#| eval: false

# this was an attempt to see if black coefs were larger in abs size than
# white coefs. Results were always absolutely insignificant (and conf
# intervals too wide to be informative).

n_reps_eth_diff <- 199
rand_boot <- rand |> 
    as.svrepdesign(type = "bootstrap", replicates = n_reps_eth_diff)


calc_eth_diff <- function (weights, data) {
  coef_pgs_white <- pgs |> 
        map(\(x) reformulate(c(x), response = "rlrs")) |> 
        map(\(x) lm(x, data, weights = weights, 
                    subset = ethnicity == "white")) |> 
        map_dbl(\(x) coef(x)[[2]])
  coef_pgs_black <- pgs |> 
        map(\(x) reformulate(c(x), response = "rlrs")) |> 
        map(\(x) lm(x, data, weights = weights, 
                    subset = ethnicity == "black")) |> 
        map_dbl(\(x) coef(x)[[2]])
  
  # this is positive if white was "bigger" than black taking the white sign 
  # as the correct sign. Negative if b was "bigger"  than w.
  diff_coefs <- (coef_pgs_white - coef_pgs_black)*sign(coef_pgs_white)
  names(diff_coefs) <- pgs
  c(diff = mean(diff_coefs))
}

diff_eths <- withReplicates(rand_boot, theta = calc_eth_diff)
```

```{r}
#| label: stat-post-bootstraps

# separate so we don't interfere with the bootstrap cache

ci_boot_fert <- map(cors_fert_edu, \(x) confint(x)[1,])
ci_boot_sibs <- map(cors_sibs_edu, \(x) confint(x)[1,])
ci_boot_sibs_parent <- map(cors_sibs_edu_parent, \(x) confint(x)[1,])
ci_boot_gkids <- map(cors_gkids_edu, \(x) confint(x)[1,])

ci_boot_fert_lm <- map(cors_fert_edu, \(x) confint(x)[2,])
ci_boot_sibs_lm <- map(cors_sibs_edu, \(x) confint(x)[2,])
ci_boot_sibs_lm_parent <- map(cors_sibs_edu_parent, \(x) confint(x)[2,])
ci_boot_gkids_lm <- map(cors_gkids_edu, \(x) confint(x)[2,])

if (insignificant(ci_boot_fert$white)) warning("ci_boot_fert$white insignificant")
if (insignificant(ci_boot_fert$black)) warning("ci_boot_fert$black insignificant")
if (insignificant(ci_boot_sibs$white)) warning("ci_boot_sibs$white insignificant")
if (insignificant(ci_boot_sibs_parent$white)) warning("ci_boot_sibs_parent$white insignificant")
if (insignificant(ci_boot_sibs_parent$white)) warning("ci_boot_sibs_parent$black significant")
if (significant(ci_boot_sibs$black)) warning("ci_boot_sibs$black significant")
if (insignificant(ci_boot_gkids$white)) warning("ci_boot_gkids$white insignificant")
if (significant(ci_boot_gkids$black)) warning("ci_boot_gkids$black significant")
```

@fig-rlrs-edu plots each PGS's regression beta on RLRS against its regression beta on educational attainment. The relationship is negative (correlation `r cors_fert_edu$white[1]`, bootstrap 95% C.I. `r ci_boot_fert$white[1]` to `r ci_boot_fert$white[2]`). Survey bootstraps [@canty1999resampling] are used so as to make inferences from the sample of respondents.

```{r}
#| label: fig-rlrs-edu
#| fig-cap: |
#|   Scatterplot of PGS regression coefficients on RLRS against 
#|   coefficients on years of education. Each dot is one polygenic score.
#|   The dashed line shows a linear regression.


ggplot(tidy_pgs_joined$white, aes(estimate.edyrs, estimate.fert)) +
   geom_hline(yintercept = 0, color = "grey80") +
   geom_vline(xintercept = 0, color = "grey80") +
   geom_point() +
   geom_smooth(method = "lm", formula = y ~ x, se = FALSE, 
               linetype = "dashed", colour = "red", linewidth = 0.5) +
   scale_y_continuous(breaks = seq(-0.2, 0.2, 0.02),
                      labels = scales::label_number(0.01)) +
   coord_cartesian(xlim = c(-.3, .75)) +
   theme_light() + 
   theme(panel.grid = element_blank()) +
   labs(
     x = "Coefficient on years of education",
     y = "Coefficient on RLRS"
   )
  

```

```{r}
#| label: stat-bootstrap-gen-cor
#| cache: false
#| cache-comments: true

# TODO set cache to FALSE before submission

n_reps_gen_cor <- 199
pgs <- pgs_white

f_resp <- map(pgs, \(x) reformulate(c(x), response = "raevbrn"))
f_sibs <- map(pgs, \(x) reformulate(c(x), response = "r10sibs"))
  
estimate_gen_cor <- function (w, data) {
  data$w <- w # avoid w getting lost by map and lm
  m_resp <- map(f_resp, \(x) lm(x, data, weights = w))
  m_sibs <- map(f_sibs, \(x) lm(x, data, weights = w/r10sibs))
  
  coef_resp <- map2_dbl(m_resp, pgs, \(x, y) coef(x)[[y]])
  coef_sibs <- map2_dbl(m_sibs, pgs, \(x, y) coef(x)[[y]])
  c(gen_cor = cor(coef_resp, coef_sibs))
}

rand_boot <- rand |> 
    subset(ethnicity == "white") |> 
    as.svrepdesign(type = "bootstrap", replicates = n_reps_gen_cor)

gen_cor <- withReplicates(rand_boot, theta = estimate_gen_cor)
ci_gen_cor <- confint(gen_cor)
```

```{r}
#| label: stat-bootstrap-gen-cor-gkids
#| cache: false
#| cache-comments: true

# TODO set cache to FALSE before submission

n_reps_gen_cor <-  199
pgs <- pgs_white

f_resp <- map(pgs, \(x) reformulate(c(x), response = "rlrs"))
f_gkids <- map(pgs, \(x) reformulate(c(x), response = "rlrs_div_h13gkid"))
  
estimate_gen_cor <- function (w, data) {
  data$w <- w # avoid w getting lost by map and lm
  m_resp <- map(f_resp, \(x) lm(x, data, weights = w))
  m_gkids <- map(f_gkids, \(x) lm(x, data |> filter(h13ageokid > 39), weights = w*raevbrn))
  # data subset so respondent's kids have 1+ married kids
  coef_resp <- map2_dbl(m_resp, pgs, \(x, y) coef(x)[[y]])
  coef_gkids <- map2_dbl(m_gkids, pgs, \(x, y) coef(x)[[y]])
  c(gen_cor = cor(coef_resp, coef_gkids))
}

rand_boot <- rand |> 
    subset(ethnicity == "white") |> 
    as.svrepdesign(type = "bootstrap", replicates = n_reps_gen_cor)

gen_cor_gkids <- withReplicates(rand_boot, theta = estimate_gen_cor)
ci_gen_cor_gkids <- confint(gen_cor_gkids)

```

We can also examine natural selection in the preceding and succeeding generations, by using reported number of siblings and grandchildren respectively. We regress PGS on respondents' number of living siblings in 2010. Data for dead siblings has too many missing values to use. We reweight respondents by the reciprocal of their number of siblings, to account for parents of many siblings being more likely to be a parent of a respondent. Parents of no siblings cannot be included, so coefficient sizes are not perfectly comparable across the generations. The HRS has data on the years of education for the parents of respondents. We recalculate the coefficients on education using values for the parents, in case the genetics of education and human capital changed between generations. Appendix @fig-siblings plots coefficients on number of siblings versus coefficients on parents' years of education. The correlation is negative and significant (correlation `r cors_sibs_edu_parent$white[1]`, bootstrapped 95% C.I. `r ci_boot_sibs_parent$white[1]` to `r ci_boot_sibs_parent$white[2]`). The same holds if we use coefficients on own years of education (correlation `r cors_sibs_edu$white[1]`, bootstrapped 95% C.I. `r ci_boot_sibs$white[1]` to `r ci_boot_sibs$white[2]`). There is a positive and significant correlation across generations, i.e. between PGS coefficients on number of siblings and number of children (`r gen_cor`, bootstrapped 95% C.I. `r ci_gen_cor[1,1]` to `r ci_gen_cor[1,2]`).

To examine selection in the next generation, we divide the respondents' number of grandchildren by their number of children. In other words, we calculate the average number of children of the respondent's children. This indicates reproductive success in the second generation, which we call RLRS2. We removed `r gkids_unusable_white` respondents who report having grandchildren despite reporting having reported no children. In regressions we reweight respondents by the number of children they have, since more fecund grandparents account for a larger proportion of the next generation.

Older grandparents will have more time for their number of grandkids to accumulate. To deal with this time trend, we subset the data to parents whose oldest child was $40$ years old or older in $2016$, when the number of grandchildren was recorded. Appendix @fig-offspring-cohorts shows the distribution of the birth year of the oldest child, adjusted for sampling weights. A majority of the observations are after $1960$, the most recent year the respondents were born in. This reassures us that we are observing reproduction in later generations.

We regress the respondent's (the grandparent's) PGS on RLRS2. The resulting coefficient is only a proxy for selection in the next generation, not a perfect measure. The grandparent's PGS indicates, but does not determine the PGS of the parent. To know the expected parent's PGS, we would need both grandparents' PGS. Given that some respondents do not have a partner in the HRS and some have had children with multiple partners, that approach is untenable in our sample. We expect the effect of the grandparent's PGS to also depend upon the level of assortative mating for the trait. For traits with high assortative mating, the grandparent's PGS will more strongly predict the parent's PGS, leading to a greater regression slope.

Appendix @fig-gkids plots coefficients on RLRS2 versus coefficients on years of education. Correlations are significant. Standard errors are large (correlation `r cors_gkids_edu$white[1]`, bootstrapped 95% C.I. `r ci_boot_gkids$white[1]` to `r ci_boot_gkids$white[2]`). Again, there is a positive correlation between PGS coefficients on RLRS2 and RLRS (`r gen_cor_gkids`, bootstrapped 95% C.I. `r ci_gen_cor_gkids[1,1]` to `r ci_gen_cor_gkids[1,2]`), indicating stability in selection over time.

## Differences across social groups

We next test part 2 of the theory by interacting PGS with measures of education, income, marital status, and age at first birth. Education is years of education, split at 12 years. Income is respondent's mean wage income over all surveys, residualized on a full set of birth year dummies, and median-split. We call the low-education, low-income, unmarried or younger-AFB group the "disadvantaged group".

We test the null hypothesis that there are no differences in coefficients between social groups for any PGS, against the alternative that for some PGS, coefficients are larger for the disadvantaged group. Here, "larger" means in the direction predicted by the score's correlation with education: more positive for scores which are negatively correlated with education, and more negative for scores which are positively correlated with education. So, we run regressions of the form

$$
RLRS_i=\alpha+\beta PGS_i + \gamma DIS_i + \delta (PGS_i \times DIS_i) + \varepsilon_i
$$

where $DIS_i$ is a dummy for $i$'s membership in the disadvantaged group and where $PGS_i$ has been sign-flipped to correlate negatively with education. Under the null, the estimated $\delta$s will be distributed around zero; in expectation half will be positive and half negative. Under the alternative, more than half the $\delta$s will be positive. We bootstrap respondents and count the number of positive $\delta$s in each resample. If 95% confidence intervals are above `r length(pgs_white)`/2 = `r length(pgs_white)/2`, we reject the null in favour of the alternative.


```{r}
#| label: stat-x-bootstrap-effect-size
#| cache: false
#| cache-lazy: false
#| cache-comments: true

# TODO set cache to false before submission

n_reps_x_es <-  199

calc_delta_coefs <- function (w, data, term) {
  groups <- switch(term,
                   edu = c("0-12", "13-17"),
                   income_med = c("Low", "High"),
                   married = c("Other", "Married"),
                   agefb = c("Low", "High"),
                   born = c("pre-1942", "post-1942"))
  groups <- paste0(":", term, groups) # colon for the cross term
  low_group <- groups[1]
  high_group <- groups[2]
  
  # We estimate separate betas then take the difference,
  # rather than explicitly using a cross term.
  # This is simpler.
  cross_terms <- paste0(pgs_white, ":", term)
  
  # "rlrs ~ pgs:term + term + pc1 + pc2 +..."
  forms <-  map(cross_terms, 
                \(x) reformulate(c(x, term), response = "rlrs"))
  mods <- map(forms, \(x) lm(x, data, weights = w))
  cfs <- map(mods, coef)
  cfs_low <- map_dbl(cfs, \(x) x[str_detect(names(x), low_group)])
  cfs_high <- map_dbl(cfs, \(x) x[str_detect(names(x), high_group)])
  
  # \delta in the estimating equation above.
  delta_coefs <- cfs_low - cfs_high
  
  setNames(delta_coefs, pgs_white)
}


calc_num_positive_deltas <- function (w, data, term) {
  delta_coefs <- calc_delta_coefs(w, data, term)
  # the name is required to workaround a bug in survey
  c(n_pos = sum(delta_coefs > 0))
}


rand_boot <- rand |> 
  subset(ethnicity == "white") |> 
  as.svrepdesign(type = "bootstrap", replicates = n_reps_x_es)

# sign-flip the PGS to correlate negatively with education
for (pgs_name in pgs_white) {
  pgs <- rand_boot$variables[[pgs_name]]
  sign_cor_pgs_edu <- sign(cor(pgs, rand_boot$variables$raedyrs, 
                               use = "complete"))
  rand_boot$variables[[pgs_name]] <- pgs * -1 * sign_cor_pgs_edu
}

# to get the non-bootstrapped deltas do, e.g.:
# calc_delta_coefs(weights(rand_boot, type = "sampling"), 
#                  rand_boot$variables, "edu")

n_pos_deltas <- ci <- est <- tbl <- list()

terms <- c("edu", "income_med", "married", "agefb", "born")

for (term in terms) {
  n_pos_deltas[[term]] <- 
    withReplicates(rand_boot, theta = calc_num_positive_deltas, term = term)
  
  ci[[term]] <- confint(n_pos_deltas[[term]])
  est[[term]] <- round(n_pos_deltas[[term]])
  tbl[[term]] <- sprintf("%d (%.1f to %.1f)", est[[term]], 
                         ci[[term]][, 1], ci[[term]][, 2])
}

# calculate non-bootstrapped deltas
sampling_weights <- weights(rand_boot, type = "sampling")

deltas <- map(terms, \(term) {
                calc_delta_coefs(sampling_weights,
                                 rand_boot$variables, 
                                 term)
              }) 

deltas <- deltas |> 
  setNames(terms) |> 
  as.data.frame() |> 
  as_tibble(rownames = "PGS") |> 
  pivot_longer(-PGS, names_to = "group", values_to = "delta")

rm(rand_boot) # avoid messing with subsequent code

```

```{r}
#| label: fig-deltas
#| fig-cap: Coefficients of interaction terms ($\delta$) for effects of
#|   polygenic scores 
#|   on RLRS among different social groups. A positive $\delta$ means that
#|   the selection coefficient was larger for the disadvantaged group, in 
#|   the direction associated with less education. Horizontal lines
#|   show median coefficients.

deltas |> 
  filter(
    group != "born"
  ) |> 
  mutate(
    med_d = median(delta),
    Group = case_match(group,
              "agefb"      ~ "AFB",
              "edu"        ~ "Education",
              "income_med" ~ "Income",
              "married"    ~ "Marriage"
            ),
    outliers = ifelse(delta > 0.05, nice_names[PGS], ""),
    .by = group
  ) |> 
  ggplot(aes(Group, delta, colour = Group)) + 
    geom_hline(yintercept = 0, color='grey70') + 
    geom_point(alpha = 0.7, 
               position = position_jitter(width = 0.125, height=0.001, seed = 12346)) +
    geom_spoke(aes(y = med_d), angle = 0,  radius = 0.3, colour = "grey40") +
    geom_spoke(aes(y = med_d), angle = pi, radius = 0.3, colour = "grey40") +
    geom_text(aes(label = outliers), size = 2, colour = "black", 
              position = position_jitter(width = 0.125,  height=0.001, seed = 12346), 
              hjust = -0.15) +
    theme_minimal() + 
    theme(
      panel.grid = element_blank(),
      legend.position = "none"
    )
```

@fig-deltas plots values of $\delta$ for each interaction term, without bootstrapping. The majority of $\delta$s are positive for age at first birth, income and marriage but not for education. In other words, for younger parents, poorer participants, and unmarried participants, most PGS have larger effects on RLRS, but this is not true for less educated participants.

```{r}
#| label: stat-control-agefb

# selection effects controlling for age at first birth
pgs <- pgs_white

form_pgs_control_agefb <- map(pgs, \(x) reformulate(c(x, "agefbn"), 
                                                    response = "rlrs"))
mod_pgs_control_agefb <- map(form_pgs_control_agefb,
                             \(x) svyglm_quiet(x, rand, subset = 
                                                 ethnicity == "white"))
tidy_pgs_control_agefb <- map(mod_pgs_control_agefb, broom::tidy) |> 
                          list_rbind() |> 
                          filter(term %in% pgs)

cor_raw_controlled <- cor(tidy_pgs$white$estimate,
                          tidy_pgs_control_agefb$estimate)

stopifnot(cor_raw_controlled >= 0)


```

@tbl-x-bootstraps shows estimates and bootstrapped 95% confidence intervals for the number of scores where the $\delta$ term is positive. Confidence intervals exclude 34 only for income and marriage.

|                    | Estimate (95% C.I.) |
|--------------------|---------------------|
| Education          | `r tbl$edu`         |
| Income             | `r tbl$income_med`  |
| Marriage           | `r tbl$married`     |
| Age at first birth | `r tbl$agefb`       |
| Birth year         | `r tbl$born`        |

: Estimates and bootstrap 95% confidence intervals for numbers of PGS out of `r length(pgs_white)` where the disadvantaged group had a larger selection coefficient than the advantaged group. Groups are: 0-12 years education vs. 13-17 years; below vs. above median income; all others vs. married; below vs. above median age at first birth; born before vs. after 1942. `r n_reps_x_es` bootstraps. {#tbl-x-bootstraps}

To complement the above cross-PGS analysis, we focus on two scores which we a priori expect to show intergroup differences: educational attainment and ADHD. Both of these showed large differences in @hugh2022human, and have significant main effects in this data. @fig-top-pgs-interactions displays coefficients of these scores on RLRS for each of the different groups. 7 out of 8 differences are in the expected direction, with coefficients being larger for the disadvantaged group, except for low- vs- high-education and EA. However, no differences remain significant after Bonferroni correction. 

```{r}
#| label: stat-top-pgs-interactions

run_interaction_model <- function (pgs, interaction) {
  # This formulation gives you separate estimates of `pgs`
  # for each value of  `interaction`, as well as the main effect
  # for `interaction` itself. We can then test separately
  # for the interaction term using e.g. coeftest()
  # I prefer doing this because the coefficients are easy to interpret
  fml_x <- glue("rlrs ~ {interaction} + {interaction}:{pgs}")
  fml_x <- as.formula(fml_x)
  rand_white <- rand |> subset(ethnicity == "white")
  mod_x <- svyglm_quiet(fml_x, rand_white)
  
  mod_x
}


test_interaction_term <- function (mod) {
  # All our models have two interaction terms. This finds them
  # and tests if they are equal, using car::linearHypothesis.
  x_terms <- matchCoefs(mod, ":") 
  x_hypothesis <- glue("{x_terms[1]} = {x_terms[2]}")
  car::linearHypothesis(mod, x_hypothesis)
}


interactions <- c(
    Education = "edu", 
    Income    = "income_med", 
    Marriage  = "married",
    AFB       = "agefb"
  )

top_pgs <- c("EA", "ADHD")

regs_to_run <- expand_grid(pgs = top_pgs, interaction = interactions)

# This creates a tibble of regression results
tidy_regs <- regs_to_run |> 
  # Using rowwise() like this preserves the variables pgs and interaction,
  # but makes it easier to refer to variables in list-columns.
  rowwise(pgs, interaction) |> 
  mutate(
    mod = list(run_interaction_model(pgs = pgs, interaction = interaction)),
    tidy = list(broom::tidy(mod, conf.int = TRUE)),
    # A test that the two interaction terms
    # have the same coefficient:
    test = list(test_interaction_term(mod)),
    # P value of the test. (broom throws a warning which I think we can 
    # ignore.)
    test.p.value = suppressWarnings(broom::tidy(test)) |> pull(p.value)
  )

# this gives all the coefficients for the models
tidy_coefs <- tidy_regs |> 
  unnest(tidy) |> 
  ungroup()

```

```{r}
#| label: fig-top-pgs-interactions
#| fig-cap: |
#|   Coefficients of RLRS across different groups for EA and ADHD 
#|   polygenic scores. Lines show 95 per cent confidence intervals. Groups
#|   are: 0-12 years education vs. 13-17 years; below vs. above median
#|   income; all others vs. married; below vs. above median age at first
#|   birth.

pos_dod <- position_dodge(width = 0.2)

tidy_coefs |> 
  # Select only the interaction terms:
  filter(grepl(":", term)) |> 
  mutate(.by = c(pgs, interaction),
    PGS = pgs,
    # gets the pretty name of the group
    Split = names(interactions)[match(interaction, interactions)],
    Group = c("Disadvantaged group", "Advantaged group"),
    Difference = ifelse(test.p.value < 0.05, "Difference p < 0.05",
                        "Difference n.s.")
  ) |> 
  ggplot(aes(estimate, Split, colour = Group)) + 
    geom_linerange(aes(xmin = conf.low, xmax = conf.high), alpha = 0.5,
                   position = pos_dod) +
    geom_point(aes(shape = Group, colour = Group, fill = Difference),
               position = pos_dod, 
               shape = "circle filled", size = 2) +
    facet_wrap(vars(PGS), ncol = 1) +
    scale_fill_manual(
      values = c("Difference n.s."     = "white", 
                 "Difference p < 0.05" = "orange"),
      guide = guide_legend(title = NULL)) +
    scale_color_manual(
      values = c("Advantaged group"    = "steelblue3", 
                 "Disadvantaged group" = "red4"),
      guide = guide_legend(title = NULL)) +
    theme_minimal() +
    theme(
      panel.grid.major.y = element_blank(),
      panel.grid.minor.x = element_blank()
    ) + 
    labs(x = "Coefficient on RLRS", y = "")

```

```{r}
#| label: stat-pgs-over-time
#| eval: false

# Plot changes in a couple of variables by decade
# Below: "Changes in PGS over time are small for both groups..."
 
rand$variables$birth_decade <- chop_width(rand$variables$rabyear, 10, 1920)
decades_adhd <- svyby(~ADHD_PGC17, ~ birth_decade + ethnicity, rand, 
                      FUN = svymean) 
decades_ea3 <- svyby(~EA3, ~ birth_decade + ethnicity, rand, 
                     FUN = svymean) 

# example plot
ggplot(decades_ea3, aes(birth_decade, EA3, color = ethnicity, 
                        group = ethnicity)) + 
  geom_line() +
  geom_pointrange(aes(ymax = EA3 + 1.96 * se, ymin = EA3 - 1.96 * se))


```

```{r}
#| label: ADHD

# Testing the R2 of the ADHD PGS

adhd_cor <- cor.test(rand$variables$adhd_score, rand$variables$ADHD, 
                     use = "pairwise.complete")

```

```{r}
#| label: stat-pgicorrect
#| cache: false


####################################
# Formatting parameter values to use
####################################

# correct_scores holds h2 and R2 values, as well as useful information about the PGS
# such as group categories taken from the PGI repository and simplified names
correct_scores <- readxl::read_excel("PGI_Heritabilities.xlsx", 
                                     na = "NA", range = "B1:U53")

# Where h2 and R2 are missing, we take values from a similar sample - the WLS
correct_scores$chip_h2 <- ifelse(is.na(correct_scores$HRS_h2), 
                                 correct_scores$WLS_h2,
                                 correct_scores$HRS_h2)

correct_scores$R2 <- ifelse(is.na(correct_scores$HRS_R2),
                            correct_scores$WLS_R2,
                            correct_scores$HRS_R2)

correct_scores$twin_h2 <- correct_scores$TwinH2
correct_scores$pgs     <- correct_scores$Abr

# This removes NEBwomen from the data
correct_scores <- correct_scores[-which(correct_scores$pgs == "NEBwomen"),]

correct_scores$pgs <- case_match(correct_scores$pgs,
  "CANNABIS" ~ "CANNABISpgirep",
  "DPW"      ~ "DPWpgirep",
  "HEIGHT"   ~ "HEIGHTpgirep",
  .default = correct_scores$pgs
)

# some of the PGS have estimates rounded to zero and are thus useless
correct_scores <- filter(correct_scores, chip_h2 > 0, R2 > 0)
correct_scores <- correct_scores[,c("pgs",
                                    "chip_h2",
                                    "twin_h2",
                                    "R2",
                                    "Source", 
                                    "PoldermanICF_ICD10_subchapter",
                                    "Category")]

# TE I originally used 0.28 from the TED sample. But that leads to especially absurd estimates
# 0.15 was estiamted in the American MIDUS sample from the same paper.
# Leading to slightly less absurd estimates. Perhaps more appropriate in the American context.
# Uncertainty regarding parameters means results should be interepreted with CAUTION!!!
# adjustment to use Midus values for NEB is also used

correct_scores$twin_h2[correct_scores$pgs == "AFB"] <- 0.15
#correct_scores$twin_h2[correct_scores$pgs == "NEBwomen"] <- 0.28
pgs_correct <- correct_scores$pgs

####################################
# Running the regressions
####################################

form_pgs_correct <- map(pgs_correct, \(x) reformulate(c(x), response = "rlrs"))
  
corrected_effects <-  map(form_pgs_correct,  \(x) svyglm_quiet(x, rand)) |> 
    map(broom::tidy, conf.int = TRUE, conf.level = 0.95) |> 
    list_rbind() |>
    filter(term %in% pgs) |>
    rename(pgs = term) |>
    left_join(correct_scores, by = join_by(pgs)) 
  
corrected_effects_snp <- corrected_effects |>
    mutate(estimate = estimate*sqrt(chip_h2/R2),
           std.error = std.error*sqrt(chip_h2/R2),
           conf.low = conf.low*sqrt(chip_h2/R2),
           conf.high = conf.high*sqrt(chip_h2/R2),
           expected_phenotypic_change = estimate*sqrt(chip_h2),
           heritability = "Chip") 

corrected_effects_twin <- corrected_effects |>
    mutate(estimate = estimate*sqrt(twin_h2/R2),
           std.error = std.error*sqrt(twin_h2/R2),
           conf.low = conf.low*sqrt(twin_h2/R2),
           conf.high = conf.high*sqrt(twin_h2/R2),
           expected_phenotypic_change = estimate*sqrt(twin_h2),
           heritability = "Twin")

corrected_effects <- rbind(corrected_effects_snp, corrected_effects_twin)

corrected_effects$Category <-
  case_match(
    corrected_effects$Category,
    "Health and health behaviours"     ~ "Health",
    "Personality and well-being"       ~ "Personality",
    "Fertility and sexual development" ~ "Health",
    "Anthropometric"                   ~ "Health",
    "Cognition and education"          ~ "Cognition"
  )

corrected_effects <- corrected_effects |> 
  rename(trait = pgs,
         TwinH2_Source = Source,
         Heritability = heritability) 


corrected_effects |>
  select(trait, estimate, std.error, expected_phenotypic_change, Heritability, chip_h2, 
         twin_h2, R2, TwinH2_Source, PoldermanICF_ICD10_subchapter, Category) |> 
  readr::write_csv("Corrected_effects.csv")


####################################
# Extracting Phenotypic change
####################################

IQgen_change <- corrected_effects_twin |> 
                filter(pgs == "CP") |>
                pull(estimate)

IQphen_change_sd_units <- corrected_effects_twin |> 
                filter(pgs == "CP") |>
                pull(expected_phenotypic_change)
IQphen_change <- IQphen_change_sd_units*15

```


## Corrected effect sizes

Lastly, we would like to know natural selection's effect sizes. The regression beta of RLRS on the PGS gives the change in one generation in the PGS due to natural selection, measured in standard deviations. Polygenic scores contain error, so estimated effects are biased towards zero compared to the effect of the true PGS. They can be scaled up by

$$
\hat{\beta}_{TRUE} = \hat{\beta}_{PGS}\sqrt{\frac{h^2}{R^2_{PGS}}}
$$

where $h^2$ is the heritability of the PGS target phenotype and $R^2_{PGS}$ is the coefficient of determination of the measured PGS on the target phenotype [@becker2021resource]. SNP- or chip-heritabilities and $R^2_{PGS}$ are calculated by @becker2021resource for scores in the Polygenic Index Repository. The authors use GCTA to estimate heritability. When the corresponding phenotype is not available in the HRS to estimate $R^2_{PGS}$, we use parameters estimated by the authors in the Wisconsin Longitudinal Study instead. We also perform error correction with twin heritabilites. To attain precise estimates we use heritabilities from a meta-analysis including over 14,000 twin pairs authored by @polderman2015twin[^1]. We remove polygenic scores with $R^2_{PGS} < 0.005$ to focus on PGS with adequate power. This removed the ADHD PGS which significantly predicted RLRS, even after bonferroni correction[^2]. Error-corrected effect sizes and the $R^2_{PGS}$ and $h^2$ parameters used can be found on the Github page for this paper.

[^1]: For age of first birth, we could not find an appropriately close trait in @polderman2015twin. For this trait we used a twin heritability of 0.15 as estimated in the Midlife in the United States (MIDUS) sample [@briley2017fertility].

[^2]: @becker2021resource estimate $R^2_{PGS}$ for ADHD using binary, yes or no, items of ADHD symptoms. These were PV001-PV018 in the 2016 core survey. They estimated an $R^2_{PGS}$ of $0.003$. When we attempt to replicate it we find an $r_{PGS}$ of `r adhd_cor$estimate`, implying an $R^2_{PGS}$ of `r adhd_cor$estimate^2`. Although this would pass our requirement of $R^2_{PGS} > 0.005$, the correlation was not very significant (p = `r adhd_cor$p.value`) and the confidence intervals were large 95% C.I. `r adhd_cor$conf.int[1]` to `r adhd_cor$conf.int[2]` owing to the fact that few respondents were given the ADHD items N = `r adhd_cor$parameter + 2`. Given the great uncertainty over the true correlation we were unable to justify using the error-corrected estimator with the ADHD polygenic score.

@fig-true-effects reports the error corrected estimates of selection. As a rule of thumb, a 0.1 standard deviation change in a polygenic score over a generation might count as "substantive": about 54% of the new generation will be below the parents' mean. Many upper confidence bounds meet that threshold, but lower bounds are often small or include zero. The confidence bounds capture uncertainty from sampling variation, but not other sources, including uncertainty about the true $h^2$, the true $R^2_{PGS}$, limitations of the within-sample phenotypes, noise from correlated environments, and biases in the polygenic scores such as assortative mating, population stratification, and gene-environment correlations. For twin-heritability, different relationships with fertility among variants not measured on the chip may also bias the estimates. Given all this, the estimates mostly show the limits of our knowledge, and should be treated as best guesses only.

To our knowledge, this is the first time error corrected estimates of natural selection have been made for non-cognitive traits. The point estimates for many traits, including depressive symptoms, asthma, age of first birth and self-rated health, exceed point estimates for educational attainment and cognitive performance. This suggests the effects of natural selection are not limited to socioeconomic outcomes, but are also relevant to population health.

To estimate how natural selection will contribute to changing the phenotype, we can multiply the change in the mean genetic value by the correlation between genetic values and phenotypes $h$, which is the square root of heritability. For cognitive performance we have estimated a genetic change of $`r IQgen_change`$ standard deviations per generation, assuming a twin heritability of $0.51$. This implies a phenotypic change of $`r IQgen_change` \times 0.51^\frac{1}{2} = `r IQphen_change_sd_units`$ standard deviations per generation, equivalent to $`r IQphen_change`$ points in the units of IQ, where a standard deviation is equal to $15$ points. This calculation assumes the heritability of the trait remains constant, that the genetic correlation across time is equal to one and it ignores the environmental contributions to changes in the phenotype. Given the assumptions required for this calculation, on top of the problems involved in estimating the genetic change, it can only be considered a guess.

Looking at the polygenic scores changing the most, many seem to have moved in the opposite direction to how the phenotype has changed over the latter half of the twentieth century. Education, IQ, delayed child birth, abstinence from smoking and life expectancy have all increased in the late twentieth century, contrary to how we believe the genetics have changed. However, the prevalence of self-reported anxiety and depressive symptoms has increased [@Witters2023; @Twenge2000], paralleling the polygenic scores for depression and neuroticism. The prevalence of asthma has also increased [@Moorman2007] in line with the polygenic score. The apparent paradox of phenotypes and natural selection moving in opposite directions suggests that environmental change is a much stronger force than that of natural selection across recent history.

```{r}
#| label: fig-true-effects
#| warning: false
#| fig-cap: |
#|   Estimated coefficients of true polygenic scores on RLRS calculated
#|   using estimates of chip- and twin-heritability and the most recent
#|   polygenic score.
#| fig-height: 6.79
#| fig-width:  5.98

  
corrected_effects |>
  # This removes ADHD due to the low power of the score. 
  # Note ADHD predict RLRS after bonferroni! 
  # It also removes NEB
  filter(R2 > 0.005) |> 
  mutate(
    trait = nice_names[trait],
    trait = fct_reorder(trait, estimate)
  ) |>
  ggplot(aes(y = trait, x = estimate, xmin = conf.low, xmax = conf.high,
           color = Heritability)) +
  geom_vline(xintercept = 0, color = "grey60") +
  geom_pointrange(aes(linetype = Heritability == "Twin"),
                  position = position_dodge(0.2)) +
  scale_linetype(guide = "none") +
  coord_cartesian(xlim = c(-0.3, 0.3)) +
  labs(x = "Corrected coefficient on RLRS", y = "") +
  theme_minimal() +
  facet_grid(Category ~ ., scales = "free_y", space = "free_y") +
  theme(strip.text.y = element_text(angle = -90,face = "bold"))
  
```

```{r}
#| label: stat-bootstrap-true-effects
#| cache: false
#| eval: false

# Now we just do this using the python tool

#' Calculate corrected PGS
#'
#' @param w Weights
#' @param data Data frame
#' @param pgs Name(s) of polygenic score(s) columns in  `data`.
#' @param phenotype Name of target phenotype in `data`
#' @param chip_h2 Chip-heritability
#' @param twin_h2 Twin-study heritability
#'
#' @return A vector of corrected estimates of effects on fertility, 
#'   named like `source.to.target` 
#'   where `source` is one of `pgs` and target is either a `pgs` or the
#'   chip-PGS or true PGS. If `pgs` has more than one element then
#'   the function will correct pgs using each other as a sanity check. If
#'   `source == target` then that is the original uncorrected score.
#' @noRd
#'
#' @examples
boot_correct <- function (w, data, pgs, phenotype, chip_h2, twin_h2) {
  
  fs_fert <- map(pgs, \(x) reformulate(c(x), response = "rlrs"))
  fs_phen <- map(pgs, \(x) reformulate(c(x), response = phenotype))

  ms_fert <- map(fs_fert, \(x) lm(x, data = data, weights = w))
  ms_phen <- map(fs_phen, \(x) lm(x, data = data, weights = w))
  bhat_fert <- map2_dbl(ms_fert, pgs, \(x, y) coef(x)[[y]])
  bhat_phen <- map2_dbl(ms_phen, pgs, \(x, y) coef(x)[[y]])

  r2_phen <- map_dbl(ms_phen, \(x) {
    an <- car::Anova(x)
    an$`Sum Sq`[1]/sum(an$`Sum Sq`)  # incremental R2 of first term
  })

  r2_phen <- c(r2_phen, chip_h2, twin_h2)
  # the correction factor
  # lower triangular part of this shows how much you multiply to get from
  # COLNAME to ROWNAME
  rhos <- outer(r2_phen, r2_phen, \(x, y) sqrt(x/y)) 
  rownames(rhos) <- colnames(rhos) <- c(pgs, "Chip", "Twin")

  # multiply first column by bhat_edu[1] to give predictions from the
  # first PGS to the next ones; etc.
  # The diagonals of b_edu are exactly equal to bhat_edu
  # arow,column gives the predicted value of b by ROWNAME for COLNAME
  b_phen <- apply(rhos, 1, \(x) x * c(bhat_phen, NA, NA))
  b_fert <- apply(rhos, 1, \(x) x * c(bhat_fert, NA, NA))

  # don't include the last two rows of NA predictions "from chip/twin h2"
  b_fert <- b_fert[seq_along(pgs), , drop = FALSE]

  res <- c(b_fert)
  # TODO: are the "backward estimates" wrong?
  # TODO: check against the python tool from the PGI repository paper
  res_names <- outer(rownames(b_fert), colnames(b_fert),
                      \(x, y) paste0(x, ".to.", y))
  names(res) <- c(res_names)
  
  res
}

n_reps_true <- 199
rand_boot <- rand |> as.svrepdesign(type = "bootstrap", 
                                    replicates = n_reps_true)

correct_scores <- tibble::tribble(
  ~pgs                         , ~phenotype  , ~chip_h2, ~twin_h2,
  c("ADHD_PGC10", "ADHD_PGC17"), "adhd_score", 0.21    , 0.75    ,
  c("EDU2", "EDU3", "EA3")     , "raedyrs"   , 0.10    , 0.43    ,  
  "AFBC"                       , "agefbn"    , 0.05    , 0.26    ,
  "NEUROT"                     , "neurot"    , 0.10    , 0.37    ,
  "CD"                         , "r10hearte" , 0.22    , 0.4     ,
  "T2D"                        , "r10diabe"  , 0.18    , 0.22    , 
  c("MDD", "MDD2")             , "r10cesd"   , 0.08    , 0.37    ,
#  "ANXCC"                      , "anxiety"   , 0.31    , 0.45    ,
  "AI"                         , "age_smoked", 0.05    , 0.50    ,
  c("GENCOG", "GCOG2")         , "r10cogtot" , 0.19    , 0.86
)

correct_scores <- drop_na(correct_scores)
# Sources for chip/twin h2; Abdel.
# general cognition used h2 for "adult IQ"
# Except:
# heart disease 0.4 to 0.6 from https://www.nejm.org/doi/full/10.1056/nejm199404143301503
# via https://www.jacc.org/doi/full/10.1016/j.jacc.2014.12.033
# T2D  0.22 to 0.73...
# https://www.cell.com/ajhg/pdf/S0002-9297(12)00031-6.pdf
# https://www.cambridge.org/core/services/aop-cambridge-core/content/view/94E0005D626FB0399AA8FDDAC82F970E/S1832427415000833a.pdf/div-class-title-the-concordance-and-heritability-of-type-2-diabetes-in-34-166-twin-pairs-from-international-twin-registers-the-discordant-twin-discotwin-consortium-div.pdf
# anxiety (nb not social anxiety!)
# https://ajp.psychiatryonline.org/doi/full/10.1176/ajp.156.2.246
# generalized anxiet disorder 42%:
# https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0134865
# age of smoking initiation, including chip heritability
# http://www.matthewckeller.com/publications/Evans_Addiction_2021.pdf
# CAD chip heritability
# https://academic.oup.com/cardiovascres/article/113/8/973/2965332?login=false
# type ii diabetes chip heritability
# https://www.nature.com/articles/s41588-018-0241-6
# 'current anxiety symptoms' chip heritability
# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7237282/

corrected_effects <- list()
for (i in seq(nrow(correct_scores))) {
  source_pgs <- correct_scores$pgs[[i]]
  phenotype <- correct_scores$phenotype[[i]]
  chip_h2 <- correct_scores$chip_h2[[i]]
  twin_h2 <- correct_scores$twin_h2[[i]]
  
  results <- withReplicates(rand_boot, theta = boot_correct, 
                            pgs = source_pgs, phenotype = phenotype, 
                            chip_h2 = chip_h2, twin_h2 = twin_h2)
  corrected_effects[[phenotype]] <- results |>
    as.data.frame() |>
    tibble::rownames_to_column(var = "rn") |>
    cbind(confint(results)) |>
    tibble::remove_rownames() |>
    setNames(c("rn", "mean", "SE", "conf.low", "conf.high")) |> 
    mutate(
      phenotype = phenotype,
      source.pgs = str_remove(rn, ".to.*"),
      target.pgs = str_remove(rn, ".*.to."),
      rn = NULL,
      .before = 1
    )
}

corrected_effects <- list_rbind(corrected_effects)
```


# Discussion

Across three generations of Americans, estimates of selection correlate over time, and selection is always more positive for traits that negatively predict educational attainment. Patterns of selection appear consistent across the twentieth and early twenty-first century. This result fits that of @Zietsch2014perfect, who found a perfect genetic correlation between number of children and number of grandchildren in a sample of Swedish twins. The underlying causes of selection seem to have stayed the same over time, so that the effects of selection have accumulated. In particular, selection against traits associated with education clearly predates the development of the US welfare state.

The economic theory of fertility is driven by the trade-off between children and income. On the one hand, a higher hourly wage increases the opportunity cost of raising children (a "substitution effect"). On the other hand, higher expected income makes children more affordable (an "income effect"). Prediction 1 of the theory holds when substitution effects dominate income effects overall. Prediction 2 is driven by the specific form of individuals' preferences for income: when marginal utility is higher at lower income levels, the substitution effect is stronger for those who expect to earn less.

The results here support prediction 1 but are more ambiguous for prediction 2. PGS coefficients on RLRS appear larger for low-income groups and unmarried respondents. But there is little evidence for larger coefficients among people with lower education, or younger parents. This may be due to the low sample size. But in the UK, the between-group differences were large [@hugh2022human]; differences that big would surely have been visible here. The theory can accommodate this non-result, if preferences for income are less curved in the US for any reason. But note that *any* theory with a negative relationship between education and fertility will give prediction 1.[^3] In this sense, results here are less supportive of the economic theory specifically.

[^3]: See @balbo2013fertility for a broad review of fertility theories.

Why does the US data show fewer differences by socio-economic status (SES) than the UK? One possibility is that in the US, SES maps on to race, so that ethnic differences capture some of the variation seen in the UK. The regression coefficient of phenotypic educational attainment on RLRS is more negative among black than white respondents (black respondents: `r tidy_fert_edu_phen$black$estimate`, 95% C.I. `r tidy_fert_edu_phen$black$conf.low` to `r tidy_fert_edu_phen$black$conf.high`, white respondents: `r tidy_fert_edu_phen$white$estimate`, 95% C.I. `r tidy_fert_edu_phen$white$conf.low` to `r tidy_fert_edu_phen$white$conf.high`; cf. @goldscheider1969minority, @johnson1979minority, @yang2003big, @maralani2013demography). And the slope of PGS education coefficients on fertility coefficients is larger among black respondents, though imprecisely estimated. But comparisons of PGS selection coefficients between the ethnic groups are hard because of the smaller sample size and differences in the scores' predictive power, so this hypothesis can only be speculative. Another possibility is that the US cohort was exposed to a smaller welfare state than the UK cohort, since many of them had children before the "Great Society" programs of the 1960s.

Lastly, we provide the first estimates of selection differentials of "true" polygenic scores for traits other than intelligence and education, finding results typically between 0-0.2 standard deviations. Our estimate of how much natural selection is changing intelligence is slightly smaller than in previous work. For example, @kong2017selection provide an estimate of $-0.9$ IQ points every $30$ years in Iceland, compared to our $`r round(IQphen_change,1)`$ points per generation. However, cognitive traits are only one part of the story of natural selection in humans. Whilst past research has focused on these traits, we find selection differentials are often of similar magnitudes for health related traits. The selection differential for self-rated health is greater than those for education and cognitive performance. The most significant, positively selected trait was ADHD, which was also found in the UK [@hugh2022human]. Future research should study the health and medical implications of natural selection, in addition to its social implications. To know more, we must await more accurate polygenic scores.

# Appendix

## Acknowledgements

The HRS (Health and Retirement Study) is sponsored by the National Institute on Aging (grant number NIA U01AG009740) and is conducted by the University of Michigan.

Code to reproduce this paper is available at <https://github.com/hughjonesd/hrs-selection>.

## Black respondents

There are `r n_black` genotyped black survey participants. Among this sample, the relationship between polygenic scores' coefficients on RLRS and on education is negative and significant (correlation `r cors_fert_edu$black[1]`, bootstrap 95% C.I. `r ci_boot_fert$black[1]` to `r ci_boot_fert$black[2]`). Correlations between number of siblings and education are insignificant (correlation `r cors_sibs_edu$black[1]`, bootstrapped 95% C.I. `r ci_boot_sibs$black[1]` to `r ci_boot_sibs$black[2]`). Similarly, coefficients on RLRS2 (children's average number of children) were insignificantly correlated with coefficients on years of education, with large standard errors (correlation `r cors_gkids_edu$black[1]`, bootstrapped 95% C.I. `r ci_boot_gkids$black[1]` to `r ci_boot_gkids$black[2]`; removed `r gkids_unusable_black` respondents reporting grandchildren but no children).

## Figures

```{r}
#| label: fig-cohorts
#| fig-cap: Distribution of birth years adjusted for sampling weights. 

# TE old version saved here in case we want to revert

#rand$variables |> 
#    mutate(
#      Ethnicity = fct_relevel(titles[ethnicity], "White participants",
#                              "Black participants")
#    ) |> 
#    ggplot(aes(rabyear, fill = Ethnicity)) + 
#      geom_bar(position = "identity") +
#      labs(x = "Birth year", y = "Count") +
#      scale_fill_manual(values = c(Black = "orange2", White = "steelblue4")) +
#      theme_minimal() +
#      theme(legend.title = element_blank())  


# Various packages and functions exist for making a weighted barplot
# But the y axes show the count for the population, not a representative subsample. So I've made the weighted plot mannually. This way the y axis has sensible values rather than millions

rand$variables <- rand$variables |> 
    mutate(
      weights_sample = (n_white+n_black)*weight/sum(weight)
    )

year_counts <- tibble()

for (eth in eths) {
  rand$variables |>
    subset(ethnicity == eth) |>
    count(rabyear, wt = weights_sample) |>
    cbind(eth) |>
    rbind(year_counts) -> year_counts
}


year_counts |>
  mutate(
      Ethnicity = fct_relevel(titles[eth], "White participants",
                              "Black participants"),
    ) |> 
  ggplot(aes(x = rabyear, y = n, fill = Ethnicity)) + 
      geom_bar(stat="identity") +
      labs(x = "Birth year", y = "Count") +
      scale_fill_manual(values = c("Black participants" = "orange2",
                                   "White participants" = "steelblue4")) +
      theme_minimal() +
      theme(legend.title = element_blank())  

```

```{r}
#| label: fig-sibling-cohorts
#| eval: false
#| fig-cap: Distribution of birth years adjusted for parent weights.

# Ideally we'd like to know the time period we're analysing when we look at selection in the parent's generation. Unfortuantely we don't have the parents' age of birth only the parent's age if they are still alive or the age at which they died.

# The next best thing we can do is look at the respondent's birth year adjusted for the parent weight. But this isn't very interesting and almost identical to weight by the respondent weight so the graph is left unevaluated.


r_parents$variables <- r_parents$variables |> 
    mutate(
      weights_sample = (n_white+n_black)*parent_weight/
                        sum(parent_weight)
    )

year_counts <- tibble()

for (eth in eths) {
  r_parents$variables |>
    subset(ethnicity == eth) |>
    count(rabyear, wt = weights_sample) |>
    cbind(eth) |>
    rbind(year_counts) -> year_counts
}


year_counts |>
   mutate(
      Ethnicity = fct_relevel(titles[eth], "White participants", 
                              "Black participants"),
    ) |> 
  ggplot(aes(x = rabyear, y = n, fill = Ethnicity )) + 
      geom_bar(stat="identity") +
      labs(x = "Birth year", y = "Count") +
      scale_fill_manual(values = c("Black participants" = "orange2",
                                   "White participants" = "steelblue4")) +
      theme_minimal() +
      theme(legend.title = element_blank())  


```

```{r}
#| label: fig-offspring-cohorts
#| fig-cap: Distribution of birth years for the oldest child of respondents, adjusted for sampling weights.


n_oldest <- r_child$variables |> 
     filter(!is.na(rlrs_div_h13gkid)) |>
             nrow()

child_graph_data <- r_child$variables |> 
    filter(!is.na(rlrs_div_h13gkid)) |>
    mutate(
      yearageokid = 2016 - h13ageokid,
      weights_sample = n_oldest*child_weight/
                        sum(child_weight)
      )


year_counts <- tibble()

for (eth in eths) {
  child_graph_data |>
    subset(ethnicity == eth) |>
    count(yearageokid, wt = weights_sample) |>
    cbind(eth) |>
    rbind(year_counts) -> year_counts
}


year_counts |>
  mutate(
      Ethnicity = fct_relevel(titles[eth], "White participants",
                              "Black participants"),
    )  |> 
  ggplot(aes(x = yearageokid, y = n, fill = Ethnicity )) + 
      geom_bar(stat="identity") +
      labs(x = "Birth year", y = "Count") +
      scale_fill_manual(values = c("Black participants" = "orange2",
                                   "White participants" = "steelblue4")) +
      theme_minimal() +
      theme(legend.title = element_blank())  

mean_yold_kid <- sum(year_counts$yearageokid*
                       year_counts$n)/sum(year_counts$n)
```

```{r}
#| label: fig-rlrs
#| fig-cap: |
#|  Coefficients of PGS on RLRS. Lines are 95% confidence intervals.
#| fig-height: 6


tidy_pgs$white |>
  mutate(
    term = nice_names[term],
    term = fct_reorder(term, estimate)
  ) |>
  ggplot(aes(estimate, term)) +
    geom_vline(xintercept = 0) +
    geom_pointrange(aes(xmin = conf.low, xmax = conf.high), size = 0.3,
                    color = "steelblue3") +
    theme_light() + 
    labs(
      x = "", y = "",
      title = "Coefficients of polygenic scores on RLRS"
    ) +
    theme(
      axis.text.y = element_text(size = 7),
      panel.grid.major.y = element_blank(),
      panel.grid.minor.x = element_blank(),
      legend.position = "none"
    )


```

```{r}
#| label: fig-siblings
#| fig-cap: |
#|   Scatterplot of PGS coefficients on number of live siblings and years of
#|   education for parents. The dashed line shows a linear regression.


tidy_pgs_sibs$white |> 
  inner_join(tidy_pgs_ed_parent$white, 
             by = join_by(term), suffix = c(".sibs", ".edyrs")) |> 
  ggplot(aes(estimate.edyrs, estimate.sibs)) +
    geom_hline(yintercept = 0, color = "grey80") +
    geom_vline(xintercept = 0, color = "grey80") +
    geom_point() +
    geom_smooth(method = "lm", formula = y ~ x, se = FALSE, 
                linetype = "dashed", colour = "red", linewidth = 0.5) +
    coord_cartesian(xlim = c(-.3, .75)) +
    theme_light() + 
    theme(panel.grid = element_blank()) +
    labs(
        x = "Coefficient on years of education",
        y = "Coefficient on number of siblings"
    )

```

```{r}
#| label: fig-gkids
#| fig-cap: |
#|   Scatterplot of PGS coefficients on RLRS2 and years of
#|   education. RLRS2 is the reproductive success of respondent's offspring
#|   relative to the success of other offspring. The dashed line shows a 
#|   linear regression.


tidy_pgs_gkids$white |> 
  inner_join(tidy_pgs_ed$white, 
             by = join_by(term), suffix = c(".gkids", ".edyrs")) |> 
  ggplot(aes(estimate.edyrs, estimate.gkids)) +
    geom_hline(yintercept = 0, color = "grey80") +
    geom_vline(xintercept = 0, color = "grey80") +
    geom_point() +
    geom_smooth(method = "lm", formula = y ~ x, se = FALSE, 
                linetype = "dashed", colour = "red", linewidth = 0.5) +
    coord_cartesian(xlim = c(-.3, .75)) +
    theme_light() + 
    theme(panel.grid = element_blank()) +
    labs(
      x = "Coefficient on years of education",
      y = "Coefficient on RLRS2"
    )
  

```


\FloatBarrier

# Bibliography