-
Notifications
You must be signed in to change notification settings - Fork 0
/
hrs-selection.qmd
2036 lines (1631 loc) · 88.3 KB
/
hrs-selection.qmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
---
title: "Natural Selection Across Three Generations of Americans"
author:
- David Hugh-Jones
- Tobias Edwards
abstract: |
We investigate natural selection on polygenic scores
in the contemporary US, using the Health and Retirement Study. Across
three generations, scores which correlate negatively (positively) with
education are selected for (against). However, results only
partially support the economic theory of fertility as an explanation for
natural selection. The theory predicts that
selection coefficients should be stronger among low-income,
less educated, unmarried and younger parents, but these predictions are only
half borne out: coefficients are larger only among low-income
parents and unmarried parents. We also estimate effect sizes corrected
for noise in the polygenic scores. Selection for some health traits is
similar in magnitude to that for cognitive traits.
date: "February 2024"
format:
pdf:
keep-tex: true
include-in-header:
- text: |
\usepackage{placeins}
geometry:
- top=2in
- left=1.25in
- right=1.25in
- heightrounded
editor: visual
editor_options:
chunk_output_type: console
knitr:
opts_chunk:
echo: false
bibliography: bibliography-hrs.bib
---
```{r}
#| label: setup
#| cache-lazy: false
# TE One time I tried to render the document with cache on I got the error message on the first chunk: "Error in `lazyLoadDBinsertVariable()`: ! long vectors not supported yet: connections.c:6028" Added cache-lazy: false which seems to fix things
# Session setup: print numbers to 3 significant digits, fix the RNG seed
# for reproducibility, and load all packages used in the analysis.
options(digits = 3)
set.seed(271075)
library(tibble)
library(haven)
library(forcats)
library(dplyr, warn.conflicts = FALSE)
library(glue)
library(stringr)
library(purrr)
library(carData)
library(car, warn.conflicts = FALSE) # masks some
library(ggplot2)
library(tidyr)
library(santoku, warn.conflicts = FALSE) # masks tidyr::chop
# masks dotchart:
suppressPackageStartupMessages(library(survey, warn.conflicts = FALSE))
# Inline knitr hook: any inline result that parses as a number is
# reformatted with 3 decimal places and comma thousands separators;
# non-numeric inline text is passed through unchanged.
knitr::knit_hooks$set("inline",
\(x) {
if (! is.na(x <- suppressWarnings(as.numeric(x)))) {
x <- prettyNum(formatC(x, digits = 3, format = "f"), ",")
}
x
}
)
# Human-readable labels for every polygenic score, keyed by the
# shortened column name (see shorten() below). Used for tables and
# figure axes. Entries from "ACTIVITY" onwards come from the PGI
# repository rather than the PGENSCORE4 files.
nice_names <- c(
SCZ = "Schizophrenia",
WC = "Waist circ.",
WHR = "Waist-hip ratio",
NEUROT = "Neuroticism",
WELLB = "Well-being",
DEPSYMP = "Depr. symptoms",
CD = "Cor. art. disease",
MI = "Myoc. infarction",
CORT = "Cortisol",
T2D = "Type 2 diabetes",
BIP = "Bipolar",
ADHD_PGC17 = "ADHD",
XDISORDER = "Cross-disorder",
MENA = "Age at menarche",
MENO = "Age at menopause",
EXTRAVER = "Extraversion",
AUTISM = "Autism",
AB = "Antisoc. behav.",
OCD = "OCD",
AFBC = "Age 1st birth combn",
AFBF = "Age 1st birth F",
AFBM = "Age 1st birth M",
MDD2 = "Maj. depr. disorder",
PTSDAA = "PTSD Afr",
PTSDEA = "PTSD Eur",
PTSDC = "PTSD combined",
HDL = "HDL cholesterol",
LDL = "LDL cholesterol",
TC = "Total cholesterol",
ANXFS = "Anxiety factor",
ANXCC = "Anxiety case-control",
BUN = "Blood urea nitro.",
BUNTE = "Blood urea nitro. TE",
CKD = "Kidney disease",
CKDTE = "Kidney disease TE",
DBP = "Diastolic blood pr.",
BMI2 = "BMI",
HEIGHT2 = "Height",
AI = "Age smoking init.",
CPD_GSCAN19 = "Cigs per day",
DPW = "Drinks per week",
SC = "Smoking cessation",
SI = "Smoking initiation",
HTN = "Hypertension",
CANNABIS = "Cannabis",
GWAD2NA = "Alzheimer's no APOE",
ALZ_01AD2NA = "Alzheimer's p 0.01 no APOE",
GWAD2WA = "Alzheimer's",
ALZ_01AD2WA = "Alzheimer's p 0.01",
ALC = "Alcohol dependence",
PP = "Pulse pressure",
SBP = "Systolic blood pr.",
EGFR = "Glom. filtr.",
EGFRTE = "Glom. filtr. TE",
EA3 = "Educ. attainment",
HBA1CAA = "HbA1c Afr",
HBA1CEA = "HbA1c Eur",
GCOG2 = "Gen. cognition",
ACTIVITY = "Physical Activity", # pgirep here
ADHD = "ADHD",
ADVENTURE = "Adventurous",
AFB = "Age First Birth",
ALLERGYCAT = "Cat Allergy",
ALLERGYDUST = "Dust Allergy",
ALLERGYPOLLEN = "Pollen Allergy",
ASTECZRHI = "Asthma/Eczema/Rhinitis",
ASTHMA = "Asthma",
AUDIT = "Alcohol Misuse",
BMI = "Body Mass Index",
CANNABISpgirep = "Cannabis Use",
COGEMP = "Cognitive Empathy",
COPD = "Chronic Obstructive Pulmonary Disease",
CPD = "Cigarettes per Day",
CP = "Cognitive Performance",
DELAYDISC = "Delay Discounting",
DEP = "Depressive Symptoms",
DPWpgirep = "Drinks per Week",
EA = "Educational Attainment",
EVERSMOKE = "Ever Smoker",
EXTRA = "Extraversion",
FAMSAT = "Satisfaction with Family",
FINSAT = "Satisfaction with Finances",
FRIENDSAT = "Satisfaction with Friends",
HAYFEVER = "Hayfever",
HEIGHTpgirep = "Height",
HIGHMATH = "Highest Math",
LEFTOUT = "Left Out of Social Activity",
LONELY = "Loneliness",
MENARCHE = "Age First Menses",
MIGRAINE = "Migraine",
MORNING = "Morning Person",
NARCIS = "Narcissism",
NEARSIGHTED = "Nearsightedness",
NEBmen = "Number of Children (men)",
NEBwomen = "Number of Children (women)",
NEURO = "Neuroticism",
OPEN = "Openness",
READING = "Age Started Reading",
RELIGATT = "Religious Attendance",
RISK = "Risk Tolerance",
SELFHEALTH = "Self-Rated Health",
SELFMATH = "Self-Rated Math Ability",
SWB = "Subjective Well-Being",
VOICEDEEP = "Age Voice Deepened",
WORKSAT = "Satisfaction with Work"
)
# The two ancestry groups analysed throughout, with display titles.
eths <- c("white", "black")
titles <- c(white = "White participants", black = "Black participants")
# Abbreviate PGS column names. Drops the ancestry prefix ("A4_"/"E4_"),
# then truncates each name at its first underscore. Names that would
# collide after truncation keep their full prefix-stripped form, and
# the two Alzheimer's variants beginning "01" get an "ALZ_" prefix so
# they remain syntactically valid R names.
shorten <- function (x) {
  full <- sub("^(A4|E4)_", "", x)
  abbrev <- sub("_.*", "", full)
  tally <- table(abbrev)
  clashing <- names(tally)[tally > 1]
  keep_full <- abbrev %in% clashing
  abbrev[keep_full] <- full[keep_full]
  sub("^01", "ALZ_01", abbrev) # two alzheimer's variants
}
# Per-respondent income residuals for one ethnicity.
#
# Averages earned income (r<wave>iearn) over all waves for respondents
# of ethnicity `eth`, then residualises it on birth-year fixed effects,
# weighted by the sampling weight. Residuals are shifted so the minimum
# equals 1 (strictly positive, e.g. for later log/ratio use).
#
# x   : data frame with ethnicity, rabyear, weight and r<wave>iearn columns
# eth : ethnicity label to subset on ("white" or "black")
# Returns a numeric vector aligned to the subsetted rows; NA where the
# regression could not produce a residual (na.exclude keeps alignment).
calc_income <- function (x, eth) {
  x <- x |> filter(ethnicity == eth)
  # Fix: the original also selected the r<wave>agey_e columns into an
  # unused local (`ages`); that dead computation is removed.
  income <- x |>
    select(matches("r\\d+iearn")) |>
    rowMeans(na.rm = TRUE)
  mod <- lm(income ~ factor(x$rabyear), weights = x$weight,
            na.action = na.exclude)
  income_r <- resid(mod)
  # shift so the smallest residual is exactly 1
  income_r - min(income_r, na.rm = TRUE) + 1
}
# Observation counts for the first fitted model of each ethnicity.
# `mods` is a list keyed by ethnicity, each element a list of models;
# the count is taken from the first model via stats::nobs(). When
# `format` is non-NULL the counts are rendered as strings with
# sprintf(); the result is named by ethnicity.
calc_n <- function (mods, eths = "white", format = "(N = %s)") {
  counts <- unname(vapply(eths, \(e) nobs(mods[[e]][[1]]), numeric(1)))
  if (!is.null(format)) {
    counts <- sprintf(format, counts)
  }
  names(counts) <- eths
  counts
}
# Wrapper around survey::svyglm that silences the warnings triggered
# by zero sampling weights in the design.
svyglm_quiet <- function (...) suppressWarnings(svyglm(...))
# Does a confidence interval exclude zero?
# `x` is a length-2 numeric vector (lower, upper bound); TRUE when both
# endpoints share the same strict sign.
significant <- function (x) {
  stopifnot(is.numeric(x), length(x) == 2)
  min(x) > 0 || max(x) < 0
}
# Complement: the interval contains (or touches) zero.
insignificant <- Negate(significant)
# ------------------------------------------------------------------
# Data loading: RAND HRS longitudinal file, polygenic score files,
# PGI repository, tracker weights, family file, 2010 and 2016 waves.
# Large .dta files are converted to .rds once (commented lines) and
# the cached .rds is read on subsequent renders.
# ------------------------------------------------------------------
# rand HRS longit data. VERY BIG
# Done just once:
# rand_orig <- haven::read_dta("data/randhrs/randhrs1992_2020v1.dta")
# saveRDS(rand_orig, file = "data/randhrs/rand-orig.rds", compress = FALSE)
rand_orig <- readRDS("data/randhrs/rand-orig.rds")
# ragender: 1 male, 2 female
# Restrict to cohorts old enough to have (near-)completed fertility;
# the cutoff differs by gender.
rand_orig <- filter(rand_orig,
(rabyear <= 1965 & ragender == 1) |
(rabyear <= 1970 & ragender == 2))
# african ancestry
pgen4a <- haven::read_dta("data/PGENSCORE4r3/pgenscore4a_r.dta")
# european ancestry
pgen4e <- haven::read_dta("data/PGENSCORE4r3/pgenscore4e_r.dta")
pgirep <- readr::read_tsv("data/pgi-repository/HRS_PGIrepo_v1.0.txt",
show_col_types = FALSE)
names(pgirep)[1:2] <- c("hhid","pn")
# Keep only the single-trait PGIs (dropping the women's number-ever-born
# score) and strip the PGI_*_single wrapper from the column names.
pgirep <- pgirep |>
select(hhid, pn, matches("single"), -PGI_NEBwomen_single) |>
rename_with(
\(x) str_replace(x, "PGI_(.*)_single", "\\1")
)
# TE PGI rep does not come pre-standardised. Going to standardize over whole sample to match
# the approach taken with PGENscores
pgirep[, 3:ncol(pgirep)] <- scale(pgirep[, 3:ncol(pgirep)])
# original HRS tracker file
trk_orig <- haven::read_dta("data/trk2020v3/trk2020tr_r.dta")
trk_weights <- trk_orig |> select(HHID, PN, matches("BIOWGTR"))
# rand family respondents file
# Done just once:
# rand_fam <- haven::read_dta(
#   "data/randhrsfam1992_2018v2_STATA/randhrsfamr1992_2018v2.dta")
# saveRDS(rand_fam, "data/randhrsfam1992_2018v2_STATA/rand_fam.rds")
rand_fam <- readRDS("data/randhrsfam1992_2018v2_STATA/rand_fam.rds")
# only selected columns to avoid creating suffixes when we join with
# rand.
# Age of oldest kid 2016; age of oldest kid 2010; grandchildren; num siblings who died;
# Mother's years of education; father's years of education
rand_fam <- rand_fam |> select(hhid, pn, h13ageokid, h10ageokid, h13gkid, r10sbdied,
rameduc, rafeduc) |>
mutate(paedyrs = as.numeric(rameduc + rafeduc)/2) #parent ave education
# 2010 (and others) record "age started smoking"
# rand_2010 <- haven::read_dta("data/hd10f6a_STATA/hd10f6a.dta")
# saveRDS(rand_2010, "data/hd10f6a_STATA/rand_2010.rds")
rand_2010 <- readRDS("data/hd10f6a_STATA/rand_2010.rds")
rand_2010 <- rand_2010 |>
select(hhid, pn, mc120, matches("mlb041")) |>
mutate(
# 95/98/99 are survey codes for refused/don't know, not real ages
age_smoked = ifelse(mc120 %in% c(95, 98, 99), NA_real_, mc120)
)
# Anxiety scale: mean of the mlb041* leave-behind items...
rand_2010$anxiety <- rand_2010 |>
select(matches("mlb041")) |>
rowMeans(na.rm = TRUE)
rand_2010$anx_na <- rand_2010 |>
select(matches("mlb041")) |>
is.na() |>
rowSums()
# ...set to NA when more than 2 items are missing. This
# matches the documentation at
# https://hrs.isr.umich.edu/sites/default/files/biblio/HRS2006LBQscale.pdf :
rand_2010$anxiety[rand_2010$anx_na > 2] <- NA_real_
# 2016 survey had a module with 18 questions diagnosing adult ADHD
# rand_2016 <- haven::read_dta("data/h16f2c_STATA/h16f2c.dta")
# saveRDS(rand_2016, "data/h16f2c_STATA/rand_2016.rds")
rand_2016 <- readRDS("data/h16f2c_STATA/rand_2016.rds")
rand_2016 <- rand_2016 |>
select(hhid, pn, pv001:pv018) |>
mutate(
across(matches("pv\\d\\d\\d"), \(x) ifelse(x %in% 8:9, NA, x)),
across(matches("pv\\d\\d\\d"), \(x) x == 1) # 1 is "yes', 5 is "no"
)
# ADHD score = share of "yes" answers among the non-missing items.
rand_2016$adhd_score <- rand_2016 |>
select(pv001:pv018) |>
rowSums(na.rm = TRUE)
rand_2016$adhd_nas <- rand_2016 |>
select(pv001:pv018) |>
is.na() |>
rowSums(na.rm = TRUE)
rand_2016$adhd_score <- rand_2016$adhd_score/(18 - rand_2016$adhd_nas)
# Join each ancestry's PGS file onto the RAND file; only genotyped
# respondents are kept (unmatched = "drop").
pgens <- list(white = pgen4e, black = pgen4a)
pcs <- c(paste0("PC1_5", LETTERS[1:5]), paste0("PC6_10", LETTERS[1:5]))
rands <- list()
for (eth in eths) {
pgen <- pgens[[eth]]
names(pgen) <- shorten(names(pgen))
rands[[eth]] <- inner_join(rand_orig, pgen,
join_by(hhid, pn),
unmatched = "drop",
relationship = "one-to-one")
}
# Stack both ancestry groups into one frame, tagged by ethnicity.
rand <- list_rbind(rands, names_to = "ethnicity")
# Superseded scores from earlier PGENSCORE releases, excluded throughout.
old_pgs <- c("EDU2", "EDU3", "HEIGHT", "BMI",
"AD", "AD2", "GENCOG", "EVRSMK", "LONG", "CPD_TAG10",
"ADHD_PGC10", "MDD")
# TE going to simplify names of pgen4e. Then merge, then create pgs.
pgen4e_new_names <- pgen4e |>
rename_with(shorten) |>
# weird that we have to operate on whole df to use select semantics
select(-version,
-matches("^PC\\d"),
-matches("^NEB"), # number ever born
# outdated, or EDU3 without 23andMe:
-any_of(old_pgs)
)
pgen4e_new_names <- left_join(pgen4e_new_names, pgirep,
by = join_by(hhid == hhid, pn == pn),
suffix = c("", "pgirep"))
# Height is a duplicated name, but it's with the old PGS, so fixing this manually
pgen4e_new_names <- rename(pgen4e_new_names,
HEIGHTpgirep = HEIGHT)
# TE here we removed pgenscores that are duplicated in the pgi repository
# we also remove trans ethnic and African ancestry PGS
dupe_PGS <- c("EA3", # TE when I've tested it the pgirep one is slightly better...
"GCOG2", #CP
"BMI2",
"NEUROT",
"WELLB",
"DEPSYMP",
"ADHD_PGC17",
"EXTRAVER",
"AFBC",
"PTSDAA", # African
"PTSDC", # Combined Euro + African.
#"ANXFS" ,"ANXCC", continuous vs case-control anxiety
# r = 0.6 let's keep them
# I suppose it's equivalent to having CP and EA
"BUNTE",
"CKDTE",
"BMI2",
# NOTE(review): "BMI2" appears twice in this vector (also above);
# harmless for any_of() but redundant.
"HEIGHT2",
"CPD_GSCAN19",
"DPW",
"SI",
"CANNABIS",
"GWAD2NA",
"ALZ_01AD2NA",
"GWAD2WA", # Three alzheimers deleted. Ones using p value 1 threshold
# and alzheimers that remove APOE variant region
"ALC", #AUDIT is alcohol misuse in pgirep
"EGFR",
"HBA1CEA"
)
# Smaller exclusion list for the African-ancestry score file.
dupe_PGS_black <- c("PTSDAA",
"PTSDC",
"BUNTE",
"CKDTE",
"BMI2",
"CANNABIS",
"GWAD2NA",
"ALZ_01AD2NA",
"GWAD2WA",
"EGFR",
"HBA1CEA"
)
pgen4e_new_names <- select(pgen4e_new_names,
-hhid, -pn, -any_of(dupe_PGS), -any_of(old_pgs))
# Final list of white-sample PGS column names.
pgs_white <- names(pgen4e_new_names)
# NOTE(review): HEIGHT was already renamed to HEIGHTpgirep above, so
# this replacement looks like a no-op — confirm before removing.
pgs_white[pgs_white == "HEIGHT"] <- "HEIGHTpgirep"
# Final list of black-sample PGS column names (names only; the data
# themselves were joined into `rand` earlier).
pgs_black <- pgen4a |>
rename_with(shorten) |>
select(-hhid, -pn, -version,
-matches("^PC\\d"),
-matches("^NEB"), # number ever born
# outdated, or EDU3 without 23andMe:
-any_of(old_pgs),
-any_of(dupe_PGS_black)
) |>
names()
# Join PGI-repository scores onto the stacked frame. The "pgirep"
# suffix disambiguates columns that clash with PGENSCORE names.
rand <- left_join(rand, pgirep, by = join_by(hhid, pn),suffix=c("","pgirep"))
# weighting
rand <- left_join(rand, trk_weights, by = join_by(hhid == HHID, pn == PN))
# Prefer the 2016 biomarker weight; fall back to the 2018 one.
rand$weight <- ifelse(is.na(rand$MBIOWGTR), rand$NBIOWGTR, rand$MBIOWGTR)
# Replacing 0 weights with NA because it can create odd problems
rand$weight <- na_if(rand$weight, 0)
# compensate for fecund parents more likely to have child in survey.
# we use living sibs rather than including dead sibs too here. we think that's
# the correct approach:
rand$parent_weight <- rand$weight / (rand$r10livsib + 1)
# Weight for the children's generation: respondents stand in for each
# of their raevbrn children.
rand$child_weight <- rand$weight*rand$raevbrn
rand <- rand |> left_join(rand_fam,
by = join_by(hhid, pn),
unmatched = "drop",
relationship = "one-to-one")
rand <- rand |> left_join(rand_2016,
by = join_by(hhid, pn),
unmatched = "drop",
relationship = "one-to-one")
rand <- rand |> left_join(rand_2010,
by = join_by(hhid, pn),
unmatched = "drop",
relationship = "one-to-one")
# TE this is average number of kids that your kids have
rand$div_h13gkid <- rand$h13gkid / rand$raevbrn
# Some people report having grandkids, but having had no children. Weird!
# This code subsets the data by ethnicity then counts all observations
# where grandkids is greater than 0 but number of kids is 0
gkids_unusable_white <- rand |>
filter(ethnicity == "white", h13gkid > 0, raevbrn == 0) |>
nrow()
gkids_unusable_black <- rand |>
filter(ethnicity == "black", h13gkid > 0, raevbrn == 0) |>
nrow()
# if grandkids but no kids div_gkid is infinite. Replacing with NA
rand$div_h13gkid[is.infinite(rand$div_h13gkid)] <- NA
# Relative lifetime reproductive success (rlrs): each fertility measure
# divided by its weighted mean within birth-year x ethnicity cells.
rand <- rand |>
mutate(.by = c(ethnicity, rabyear), # by year and ethnicity
mean_weight = mean(weight, na.rm = TRUE),
mean_child_weight = mean(child_weight, na.rm = TRUE),
mean_raevbrn = mean(raevbrn*weight/mean_weight, na.rm = TRUE),
rlrs = raevbrn/mean_raevbrn,
mean_h13gkid = mean(h13gkid*weight/mean_weight, na.rm = TRUE),
rlrs_h13gkid = h13gkid/mean_h13gkid,
mean_div_h13gkid = mean(div_h13gkid*child_weight/mean_child_weight,
na.rm = TRUE),
rlrs_div_h13gkid = div_h13gkid/mean_div_h13gkid
) |>
mutate(.by = ethnicity, # calculate these stats within-ethnicity
income_resid = calc_income(rand, ethnicity[1]),
# NOTE(review): calc_income() already shifts its result to min 1, so
# this second shift appears to be a no-op — confirm.
income_resid = income_resid - min(income_resid, na.rm = TRUE) + 1,
income_med = chop_equally(income_resid, 2, labels = c("Low", "High")),
# as.numeric avoids haven.labelled issues:
raedyrs = as.numeric(raedyrs),
r10hearte = as.numeric(r10hearte),
r10diabe = as.numeric(r10diabe),
edu = chop(as.numeric(raedyrs), 13, lbl_discrete(symbol = "-")),
born = chop(rabyear, 1942, labels = c("pre-1942", "post-1942")),
# wave 10 is 2010
# table(rands$white$r10mstat)
# 1     1.married
# 2     2.married,spouse absent
# 3     3.partnered
# 4     4.separated
# 5     5.divorced
# 6     6.separated/divorced
# 7     7.widowed
# 8     8.never married
married = factor(r10mstat == 1,
levels = c(FALSE, TRUE),
labels = c("Other", "Married")),
r10sibs = r10livsib + 1, # n. sibs including oneself!
# Age at first birth, inferred from own age minus oldest child's age;
# implausibly low values (< 12) are treated as missing.
agefbn = r10agey_e - h10ageokid,
agefbn = ifelse(agefbn < 12, NA_real_, agefbn),
) |>
mutate(.by = c(ethnicity, ragender),
agefb = chop_equally(agefbn, 2, labels = c("Low", "High"))
)
# Phenotypic neuroticism: mean of the leave-behind neuroticism items
# across waves.
rand$neurot <- rand |>
select(matches("r\\d+lbneur")) |>
rowMeans(na.rm = TRUE)
# two of four Alzheimer's variants with awkward names:
#pgs <- str_replace(pgs, "^01", "ALZ_01")
pgs_white <- str_replace(pgs_white, "^01", "ALZ_01")
pgs_black <- str_replace(pgs_black, "^01", "ALZ_01")
names(rand) <- str_replace(names(rand), "^01", "ALZ_01")
# Union of both ancestry groups' score names.
pgs <- unique(c(pgs_white,pgs_black))
##############################################################
# Residualising then standardising PGS, with sampling weights
##############################################################
# For each ethnicity: regress each PGS on the ten principal components
# (weighted by sampling weight), take residuals, then z-score those
# residuals using the weighted mean and weighted variance.
dat_eth_list <- list()
for (eth in eths) {
r_eth <- rand |> subset(ethnicity == eth)
# Each ancestry group has its own set of available scores.
# NOTE(review): this clobbers the global `pgs`; it is rebuilt after
# the loop.
if (eth == "white") {
pgs <- pgs_white
} else {
pgs <- pgs_black
}
for (pg in pgs) {
reg <- lm(get(pg) ~ PC1_5A + PC1_5B + PC1_5C + PC1_5D + PC1_5E +
PC6_10A + PC6_10B + PC6_10C + PC6_10D +PC6_10E, weights = weight,
data = r_eth)
r_eth <- r_eth |> mutate(
!!pg := get(pg) - predict(reg, r_eth),
mean_weight = mean(weight, na.rm=TRUE),
tot_weight = sum(weight, na.rm=TRUE),
mean_res_pgs = sum(get(pg)*weight, na.rm=TRUE)/tot_weight,
w.dev = weight*(get(pg) - mean_res_pgs)^2,
w.var.pgs = sum(w.dev, na.rm =TRUE)/tot_weight,
deman_pgs = get(pg) - mean_res_pgs,
!!pg := deman_pgs/sqrt(w.var.pgs)
)
}
# NOTE(review): the scratch columns (mean_res_pgs, w.dev, w.var.pgs,
# deman_pgs, tot_weight) retain values from the last PGS iterated;
# they are discarded by the select() in the next chunk.
dat_eth_list[[eth]] <- r_eth
}
rand <- dat_eth_list |> list_rbind()
# Restore the union of both ancestry groups' score names.
pgs <- unique(c(pgs_white,pgs_black))
# SD of EA before and after this process goes from 1 to 1.01, implying that the HRS is slightly more diverse in EA than the population and that Z scoring using sampling weights doesn't really matter
```
```{r}
#| label: prog-cleanup
# clean up cache so it doesn't get too big
rm(rand_orig, trk_orig, rands, rand_fam, rand_2010, rand_2016)
# TE we can delete more
rm(pgen, pgen4a, pgen4e, pgirep, trk_weights)
# Keep only the columns used in the analyses below.
rand <- rand |>
select(ethnicity, all_of(pgs), all_of(old_pgs),
all_of(pcs), weight, raevbrn, rlrs,
raedyrs, r10sibs, edu, income_resid, income_med, married,
agefb, agefbn, rabyear, born, raehsamp, raestrat,
parent_weight, ragender, adhd_score, neurot, r10diabe,
r10hearte, r10cesd, age_smoked, anxiety, r10cogtot,
rlrs_div_h13gkid, child_weight, h13ageokid, paedyrs)
# Survey design for the parents' generation: respondents stand in for
# their own parents, down-weighted by sibship size (parent_weight).
r_parents <- svydesign(
id = ~ raehsamp,
strata = ~ raestrat,
weights = ~ parent_weight,
nest = TRUE,
data = rand |> drop_na(parent_weight)
)
# Survey design for the children's generation: respondents weighted by
# number of children, restricted to those whose oldest child is 40+
# (so the children's fertility is near complete).
r_child <- svydesign(
id = ~ raehsamp,
strata = ~ raestrat,
weights = ~ child_weight,
nest = TRUE,
data = rand |> drop_na(child_weight) |>
filter(!is.na( rlrs_div_h13gkid), h13ageokid > 39)
)
# Main design for the respondents' own generation. Note `rand` is now a
# svydesign object, not a data frame.
rand <- svydesign(
id = ~ raehsamp,
strata = ~ raestrat,
weights = ~ weight,
nest = TRUE,
data = rand |> drop_na(weight))
# Sample sizes quoted inline in the text.
n_white <- nrow(rand |> subset(ethnicity == "white"))
n_black <- nrow(rand |> subset(ethnicity == "black"))
```
@hugh2022human explain patterns of natural selection on polygenic scores in the UK, using an economic theory of fertility derived from @becker1976child. The theory has two components:
1. There is a trade-off between time spent working and raising children. This "substitution effect" leads people with more human capital and higher expected wages to have fewer children. Evidence for this is that polygenic scores which correlate positively with human capital correlate negatively with number of children, i.e. they are being selected against; conversely, scores which correlate negatively with human capital are being selected for.
2. The trade-off is sharper for low-income people, people with low human capital, and single parents. Because these groups value income more at the margin, the substitution effect is stronger for them. In other groups, the substitution effect is balanced by the "income effect", that children become more affordable when you get richer. As a result, natural selection is stronger among these groups. Evidence for this is that scores' regression coefficients on number of children are larger among people with lower income or less education, and single parents.
Here, we make an independent test of the theory in the US population, using the Health and Retirement Study [@hrsrandfam2023; @hrsrand2023]. The motivation is to establish the direction and magnitude of natural selection on a range of traits in Americans, and to test whether the economic theory can explain the selection. Using information on respondents' siblings and grandchildren, we can also extend the analysis to three generations of Americans. This is interesting first because selection effects may accumulate over time, and second because a possible alternative explanation for existing findings is the development of the welfare state, which happened in the US during the "Great Society" programs of the 1960s (i.e. after the respondents' parents' generation). To preview our results, we confirm point 1 above across all three generations. But we only see partial and ambiguous support for point 2.
# Data
The HRS sample focuses on cohorts born between 1920 and 1960, but contains some younger and older participants. We include only male participants born before 1965 and female participants born before 1970, which guarantees that most will have completed their fertility by 2010. The resulting sample contains `r n_white` genotyped white participants. We focus on these because the sample size is large enough. The appendix reports some basic analyses for the `r n_black` genotyped black participants. (Throughout, "white" and "black" refer to participants who (a) self-identified as non-Hispanic and as "White/Caucasian" and "Black/African-American" respectively, and who (b) had principal components of SNP data close to the respective population mean. See @hrspgs2020.)
Genotyping took place in 2006, 2008 and subsequent years. PGS were taken from those pre-calculated by the HRS [@hrspgs2020] and those produced by the Social Science Genetic Association Consortium, as part of their Polygenic Index Repository [@becker2021resource]. Scores created by the HRS were provided for black and white participants, but Polygenic Index Repository scores were only created for white participants.
For the white participants, when scores from the two samples measured the same trait, we only used the PGS from the Polygenic Index Repository. For some traits, polygenic scores were created from both European ancestry and multiple ancestry GWAS. We choose to use polygenic scores trained only on individuals of European ancestry. We discard obsolete PGS for which there is a newer, more accurate score targeting the same phenotype. We also discard PGS for number of children ever born (but keep scores for age at first birth). This leaves a total of `r length(pgs_white)` scores for the white participants. PGS were residualised on the first ten within-ethnicity principal components of the DNA array data, to reduce bias from population stratification. PGS were then rescaled to zero mean and unit variance.
The key dependent variable is relative lifetime reproductive success (RLRS): number of children ever born, divided by the mean number of children of people born in the same year. RLRS is calculated separately by ethnicity. This is not ideal, because it treats the ethnicities as "separate breeding populations" (i.e. subpopulations who rarely interbreed; black and white people intermarried rarely in the HRS generation, but that is due to laws and norms against intermarriage which have now mostly disappeared.) But the alternative of calculating pooled RLRS would effectively be estimating natural selection in the whole US population by treating whites as representative. We therefore focus on the white US population, with the caveat that results from this "one data point" may not replicate in other ethnicities or countries. The mean number of children of people born in each year was calculated using sampling weights.
The intuition behind our analysis is simple: if a polygenic score predicts more reproductive success, then people higher in the PGS will reproduce more than others, causing scores to increase in the population. However the approach is also based on quantitative genetic theory and is able to yield an estimate of genetic change from one generation to the next.
The Robertson-Price Identity states that the change in a genetic trait between one generation and the next is equal to its covariance with relative fitness [@Price1970identity; @Robertson1966identity], assuming no mutations. In humans, infant mortality is so low today that a person's reproductive success is a very close approximation of their fitness. As such, the covariance between a polygenic score and RLRS yields the expected change in the mean polygenic score per generation. Because the polygenic scores are standardized to have a variance of 1, the regression coefficient of RLRS on the PGS is equal to the covariance. This approach is standard in the literature [@Beauchamp2016selection; @hugh2022human; @kong2017selection].
The HRS contains weights which match survey respondents to the US population. We use weights for the biomarker subsample (\*BIOWGTR in the HRS tracker file). Since half the sample enters the extended interview including biomarker data in each biennial survey, we weight individuals by either their 2010 weight or their 2012 weight. This maximizes the available sample of both black and white respondents, and should approximately match the US population of the sample cohorts between 2010 and 2012. Standardization of polygenic scores used estimates of the population mean and variance of the polygenic score, as estimated with sampling weights. Statistical tests are adjusted for clustering and stratification using the R *survey* package [@lumleysurvey2023].
# Results
```{r}
#| label: stat-regressions
# cache seems to cause trouble even with cache-lazy: FALSE
options(survey.lonely.psu = "average")
# Result containers, filled per ethnicity below: fitted svyglm objects
# ("mod_*") and tidied coefficient tables ("tidy_*").
tidy_pgs <- list()
mod_pgs <- list()
tidy_pgs_ed <- list()
mod_pgs_edyrs <- list()
tidy_pgs_ed_parent <- list()
mod_pgs_edyrs_parent <- list()
tidy_pgs_sibs <- list()
mod_pgs_sibs <- list()
tidy_pgs_gkids <- list()
mod_pgs_gkids <- list()
tidy_pgs_x_ed <- list()
mod_pgs_x_ed <- list()
tidy_pgs_x_inc <- list()
mod_pgs_x_inc <- list()
tidy_pgs_x_mar <- list()
mod_pgs_x_mar <- list()
tidy_pgs_x_agefb <- list()
mod_pgs_x_agefb <- list()
# Cross-score summaries (correlations / slopes between coefficient sets).
coefs_fert_edu <- list()
cors_fert_edu <- list()
tidy_pgs_joined <- list()
cors_sibs_edu <- list()
cors_sibs_edu_parent <- list()
cors_gkids_edu <- list()
tidy_fert_edu_phen <- list()
# Main estimation loop: for each ethnicity, fit one survey-weighted model
# per polygenic score, for several outcomes spanning three generations.
for (eth in eths) {
# Ethnicity-specific score lists (different PGS sets were available).
if (eth == "white") {
pgs <- pgs_white
} else {
pgs <- pgs_black
}
# Per-ethnicity survey designs: respondents, parents' generation
# (sibling counts), children's generation (grandchild counts).
r_eth <- rand |> subset(ethnicity == eth)
r_parents_eth <- r_parents |> subset(ethnicity == eth)
r_child_eth <- r_child |> subset(ethnicity == eth)
# RLRS ~ PGS: the coefficient estimates selection on the score
# (Robertson-Price identity; scores are standardised, so coef = cov).
form_pgs <- map(pgs, \(x) reformulate(c(x), response = "rlrs"))
mod_pgs[[eth]] <- map(form_pgs,
\(x) svyglm_quiet(x, r_eth))
tidy_pgs[[eth]] <- mod_pgs[[eth]] |>
map(broom::tidy, conf.int = TRUE, conf.level = 0.95) |>
list_rbind() |>
filter(term %in% pgs)
# Own years of education ~ PGS: how strongly each score predicts
# human capital.
form_pgs_edyrs <- map(pgs, \(x) reformulate(c(x),
response = "raedyrs"))
mod_pgs_edyrs[[eth]] <- map(form_pgs_edyrs,
\(x) svyglm_quiet(x, r_eth))
names(mod_pgs_edyrs[[eth]]) <- pgs
tidy_pgs_ed[[eth]] <- mod_pgs_edyrs[[eth]] |>
map(broom::tidy) |>
list_rbind() |>
filter(term %in% pgs)
# Parental years of education ~ PGS (parents' generation weights).
form_pgs_edyrs_parent <- map(pgs, \(x) reformulate(c(x),
response = "paedyrs"))
mod_pgs_edyrs_parent[[eth]] <- map(form_pgs_edyrs_parent,
\(x) svyglm_quiet(x, r_parents_eth))
names(mod_pgs_edyrs_parent[[eth]]) <- pgs
tidy_pgs_ed_parent[[eth]] <- mod_pgs_edyrs_parent[[eth]] |>
map(broom::tidy) |>
list_rbind() |>
filter(term %in% pgs)
# Join fertility and education coefficients by score, then compute the
# cross-score correlation and the slope of fertility on education coefs.
tidy_pgs_joined[[eth]] <- inner_join(tidy_pgs[[eth]], tidy_pgs_ed[[eth]],
by = join_by(term),
unmatched = "error",
suffix = c(".fert", ".edyrs"),
relationship = "one-to-one")
cors_fert_edu[[eth]] <- cor(tidy_pgs_joined[[eth]]$estimate.edyrs,
tidy_pgs_joined[[eth]]$estimate.fert)
coefs_fert_edu[[eth]] <- lm(estimate.fert ~ estimate.edyrs,
data = tidy_pgs_joined[[eth]]) |>
coef() |>
pluck("estimate.edyrs")
# Number of siblings ~ PGS: selection in the parents' generation.
form_pgs_sibs <- map(pgs,
\(x) reformulate(c(x), response = "r10sibs"))
mod_pgs_sibs[[eth]] <- map(form_pgs_sibs,
\(x) svyglm_quiet(x, r_parents_eth))
tidy_pgs_sibs[[eth]] <- mod_pgs_sibs[[eth]] |>
map(broom::tidy, conf.int = TRUE, conf.level = 0.95) |>
list_rbind() |>
filter(term %in% pgs)
# Correlate sibling-count coefficients with education coefficients,
# measuring human capital by own and by parental education.
cors_sibs_edu[[eth]] <- cor(tidy_pgs_sibs[[eth]]$estimate,
tidy_pgs_ed[[eth]]$estimate)
cors_sibs_edu_parent[[eth]] <- cor(tidy_pgs_sibs[[eth]]$estimate,
tidy_pgs_ed_parent[[eth]]$estimate)
# Grandchild-based RLRS ~ PGS: selection in the children's generation.
form_pgs_gkids <- map(pgs,
\(x) reformulate(c(x), response = "rlrs_div_h13gkid"))
mod_pgs_gkids[[eth]] <- map(form_pgs_gkids,
\(x) svyglm_quiet(x, r_child_eth))
tidy_pgs_gkids[[eth]] <- mod_pgs_gkids[[eth]] |>
map(broom::tidy, conf.int = TRUE, conf.level = 0.95) |>
list_rbind() |>
filter(term %in% pgs)
cors_gkids_edu[[eth]] <- cor(tidy_pgs_gkids[[eth]]$estimate,
tidy_pgs_ed[[eth]]$estimate)
# Phenotypic benchmark: RLRS regressed on observed years of education.
tidy_fert_edu_phen[[eth]] <-
svyglm_quiet(rlrs ~ raedyrs, r_eth) |>
broom::tidy(conf.int = TRUE) |>
filter(term == "raedyrs")
}
# we only look at interactions among whites, the rest is pointless
# Interaction models: does selection (PGS -> RLRS) differ by education,
# income, marital status and age at first birth? (Theory point 2.)
for (eth in "white") {
r_eth <- rand |> subset(ethnicity == eth)
# NOTE(review): the else branch is unreachable while the loop runs over
# "white" only; kept for symmetry with the main loop above.
if (eth == "white") {
pgs <- pgs_white
} else {
pgs <- pgs_black
}
# the colon interaction estimates the effect separately within each group
form_pgs_x_ed <- paste0(pgs, ":edu") |>
map(\(x) reformulate(c(x, "edu"), response = "rlrs"))
mod_pgs_x_ed[[eth]] <-
map(form_pgs_x_ed,
\(x) svyglm_quiet(x, r_eth))
tidy_pgs_x_ed[[eth]] <-
map(mod_pgs_x_ed[[eth]], broom::tidy, conf.int = TRUE)
names(tidy_pgs_x_ed[[eth]]) <- pgs
# Drop the intercept and the education main effect, and label each row
# with the education group extracted from the interaction term name.
tidy_pgs_x_ed[[eth]] <-
list_rbind(tidy_pgs_x_ed[[eth]], names_to = "pgs") |>
filter(!term %in% c("(Intercept)", "edu13-17")) |>
mutate(edu = str_remove(term, ".*:"))
# Same pattern, split by above/below-median income.
form_pgs_x_inc <- paste0(pgs, ":income_med") |>
map(\(x) reformulate(c(x, "income_med"), response = "rlrs"))
mod_pgs_x_inc[[eth]] <-
map(form_pgs_x_inc,
\(x) svyglm_quiet(x, r_eth))
tidy_pgs_x_inc[[eth]] <-
map(mod_pgs_x_inc[[eth]], broom::tidy, conf.int = TRUE)
names(tidy_pgs_x_inc[[eth]]) <- pgs
tidy_pgs_x_inc[[eth]] <-
list_rbind(tidy_pgs_x_inc[[eth]], names_to = "pgs") |>
filter(!term %in% c("(Intercept)", "income_medHigh")) |>
mutate(income_med = str_remove(term, ".*:income_med"))
# Same pattern, split by marital status.
form_pgs_x_mar <- paste0(pgs, ":married") |>
map(\(x) reformulate(c(x, "married"), response = "rlrs"))
mod_pgs_x_mar[[eth]] <-
map(form_pgs_x_mar,
\(x) svyglm_quiet(x, r_eth))
tidy_pgs_x_mar[[eth]] <-
map(mod_pgs_x_mar[[eth]], broom::tidy, conf.int = TRUE)
names(tidy_pgs_x_mar[[eth]]) <- pgs
tidy_pgs_x_mar[[eth]] <-
list_rbind(tidy_pgs_x_mar[[eth]], names_to = "pgs") |>
filter(! term %in% c("(Intercept)", "marriedMarried")) |>
mutate(
married = str_remove(term, ".*:married")
)
# Same pattern, split by age at first birth.
form_pgs_x_agefb <- paste0(pgs, ":agefb") |>
map(\(x) reformulate(c(x, "agefb"), response = "rlrs"))
mod_pgs_x_agefb[[eth]] <-
map(form_pgs_x_agefb,
\(x) svyglm_quiet(x, r_eth))
tidy_pgs_x_agefb[[eth]] <-
map(mod_pgs_x_agefb[[eth]], broom::tidy, conf.int = TRUE) |>
setNames(pgs) |>
list_rbind(names_to = "pgs") |>
filter(! term %in% c("(Intercept)", "agefbHigh")) |>
mutate(
agefb = str_remove(term, ".*:agefb")
)
}
# Counts of tests and of Bonferroni-significant PGS coefficients,
# overall and within each ethnicity; these feed inline text below.
tidy_pgs_eth <- tidy_pgs |> list_rbind()
n_tests <- nrow(tidy_pgs_eth)
n_sig_bonf <- with(tidy_pgs_eth, sum(p.value < 0.05 / n_tests))
n_tests_white <- nrow(tidy_pgs[["white"]])
n_sig_bonf_white <- with(tidy_pgs[["white"]], sum(p.value < 0.05 / n_tests_white))
n_tests_black <- nrow(tidy_pgs[["black"]])
n_sig_bonf_black <- with(tidy_pgs[["black"]], sum(p.value < 0.05 / n_tests_black))
```
```{r}
#| label: stat-bootstrap-eth-cor
#| cache: false
#| cache-lazy: false
#| eval: false
# I don't mention this below so turning it off for now
# TODO: should I mention it? It's a low correlation.
# NOTE(review): chunk is disabled (eval: false). `calc_eth_cor` reads the
# global `pgs`, whose last assignment before this point was inside the
# interaction loop (white scores only) — confirm before re-enabling.
n_reps_eth_diff <- 199
# Replicate-weight design for bootstrapping over the full sample.
rand_boot <- rand |>
as.svrepdesign(type = "bootstrap", replicates = n_reps_eth_diff)
# Bootstrap statistic: correlation, across scores, between the black and
# white PGS coefficients on RLRS.
calc_eth_cor <- function (weights, data) {
coef_pgs_white <- pgs |>
map(\(x) reformulate(c(x), response = "rlrs")) |>
map(\(x) lm(x, data, weights = weights,
subset = ethnicity == "white")) |>
map2_dbl(pgs, \(x, y) coef(x)[[y]])
coef_pgs_black <- pgs |>
map(\(x) reformulate(c(x), response = "rlrs")) |>
map(\(x) lm(x, data, weights = weights,
subset = ethnicity == "black")) |>
map2_dbl(pgs, \(x, y) coef(x)[[y]])
c(cor = cor(coef_pgs_black, coef_pgs_white))
}
# Point estimate from the per-ethnicity tidy tables, plus bootstrap CI.
cor_eths <- cor(tidy_pgs$white$estimate, tidy_pgs$black$estimate)
cor_eths_boot <- withReplicates(rand_boot, theta = calc_eth_cor)
ci_eths <- confint(cor_eths_boot)[1,]
```
We estimate coefficients of PGS on RLRS. These do not identify causal effects; recall that natural selection involves correlation, not necessarily causation, between selected characteristics and fertility. Appendix @fig-rlrs shows coefficients. Standard errors are large because of the relatively low sample sizes. `r n_sig_bonf_white` scores are significant at Bonferroni-corrected p \< 0.05/`r n_tests_white`. The scores are age at first birth, educational attainment, ADHD, self-rated health and having ever smoked. But we are most concerned with looking at patterns across scores rather than judging the significance of individual scores.
```{r}
#| label: stat-bootstraps
#| cache: false
#| cache-lazy: false
#| cache-comments: true
# TODO set cache to false before final build
# Not sure this actually does anything :-)
old_opts <- options(survey.multicore = TRUE)
# The large n_reps is to get an accurate upper tail on the black sample,
# since it is close to 0:
n_reps <- 599
# Bootstrap statistic for withReplicates(). For every score in the global
# `pgs` (set per ethnicity in the loop below), regress the fertility
# measure `var` and years of education (`edu_var`) on the score; return
# (a) the cross-score correlation between the two coefficient vectors and
# (b) the slope from regressing fertility coefficients on education
# coefficients.
calc_cors_edu <- function (weights, data, var, edu_var = "raedyrs") {
  coef_pgs_fert <- pgs |>
    map(\(x) reformulate(c(x), response = var)) |>
    map(\(x) lm(x, data, weights = weights)) |>
    map_dbl(\(x) coef(x)[[2]])
  coef_pgs_edu <- pgs |>
    map(\(x) reformulate(c(x), response = edu_var)) |>
    map(\(x) lm(x, data, weights = weights)) |>
    map_dbl(\(x) coef(x)[[2]])
  c(
    cor = cor(coef_pgs_edu, coef_pgs_fert),
    coef = coef(lm(coef_pgs_fert ~ coef_pgs_edu))["coef_pgs_edu"]
  )
}
# As above, but measuring human capital by *parental* years of education.
# Kept as a named wrapper (rather than duplicated code) so the existing
# withReplicates() calls keep working unchanged.
calc_cors_edu_parent <- function (weights, data, var) {
  calc_cors_edu(weights, data, var, edu_var = "paedyrs")
}
# Bootstrap results below replace the point estimates computed in the
# stat-regressions chunk.
cors_fert_edu <- cors_sibs_edu <- list()
for (eth in eths) {
  pgs <- if (eth == "white") pgs_white else pgs_black
  # Replicate-weight designs for respondents, parents' generation and
  # children's generation, within this ethnicity.
  rand_boot <- rand |>
    subset(ethnicity == eth) |>
    as.svrepdesign(type = "bootstrap", replicates = n_reps)
  r_parents_boot <- r_parents |>
    subset(ethnicity == eth) |>
    as.svrepdesign(type = "bootstrap", replicates = n_reps)
  r_child_boot <- r_child |>
    subset(ethnicity == eth) |>
    as.svrepdesign(type = "bootstrap", replicates = n_reps)
  # Own generation: RLRS coefficients vs education coefficients.
  cors_fert_edu[[eth]] <-
    withReplicates(rand_boot, theta = calc_cors_edu, var = "rlrs")
  # Parents' generation: sibling counts vs own / parental education.
  cors_sibs_edu[[eth]] <-
    withReplicates(r_parents_boot, theta = calc_cors_edu, var = "r10sibs")
  cors_sibs_edu_parent[[eth]] <-
    withReplicates(r_parents_boot, theta = calc_cors_edu_parent, var = "r10sibs")
  # Children's generation: grandchild-based RLRS vs education.
  cors_gkids_edu[[eth]] <-
    withReplicates(r_child_boot, theta = calc_cors_edu, var = "rlrs_div_h13gkid")
}
# Restore the previous survey.multicore setting.
options(old_opts)
```
```{r}
#| label: stat-bootstrap-eth-diff
#| eval: false
# this was an attempt to see if black coefs were larger in abs size than
# white coefs. Results were always absolutely insignificant (and conf
# intervals too wide to be informative).
# NOTE(review): chunk is disabled (eval: false). It reads the global
# `pgs`, which at this point holds the white score list — confirm the
# intended score set before re-enabling.
n_reps_eth_diff <- 199
# Replicate-weight design for bootstrapping over the full sample.
rand_boot <- rand |>
as.svrepdesign(type = "bootstrap", replicates = n_reps_eth_diff)
# Bootstrap statistic: mean signed difference between white and black
# PGS coefficients on RLRS, taking the white coefficient's sign as the
# reference direction.
calc_eth_diff <- function (weights, data) {
coef_pgs_white <- pgs |>
map(\(x) reformulate(c(x), response = "rlrs")) |>
map(\(x) lm(x, data, weights = weights,
subset = ethnicity == "white")) |>
map_dbl(\(x) coef(x)[[2]])
coef_pgs_black <- pgs |>
map(\(x) reformulate(c(x), response = "rlrs")) |>
map(\(x) lm(x, data, weights = weights,
subset = ethnicity == "black")) |>
map_dbl(\(x) coef(x)[[2]])
# this is positive if white was "bigger" than black taking the white sign
# as the correct sign. Negative if b was "bigger" than w.
diff_coefs <- (coef_pgs_white - coef_pgs_black)*sign(coef_pgs_white)
names(diff_coefs) <- pgs
c(diff = mean(diff_coefs))
}
diff_eths <- withReplicates(rand_boot, theta = calc_eth_diff)
```