forked from Simon-W-M/intermediate_R_training
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathinter_train.R
1921 lines (1343 loc) · 50.2 KB
/
inter_train.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# TRAINER VERSION #
# the other version does not have example answers in it
##########
# help #
##########
# hover over a function and press F1
# or
# type ?<function name>
#
# search within help tab
#
# google.com!
##########################
# what we is going to do #
##########################
# a lot of intermediate data wrangling
# we may do a few minor graphs but the focus will be wrangling
# other training is available re charts, markdown and tables
# This course assumes a basic level of knowledge of R and dpylr
# It is expected that you are at least familiar with the contents of the
# NHS-R Introduction to R and R Studio course
# in this tutorial we will be using modern |> pipes
# these work exactly the same as old pipes %>%
##############################
# loading required libraries #
##############################
# load in initial libraries
# will come back to the code around this
# Specify required packages
my_packages <- c("tidyverse",
"NHSRdatasets",
"pacman")
# Extract not installed packages
not_installed <- my_packages[!(my_packages %in%
installed.packages()[ , "Package"])]
# Install not installed packages
if(length(not_installed)) install.packages(not_installed)
# Load all packages
pacman::p_load(char = my_packages)
################################
# Some useful base R functions #
################################
# make a dataframe
data <- ae_attendances
# get list of column names
colnames(data)
# little bit of base R - using $ sign to select a column
data$type
# get list of unique data items in a variable
unique(data$org_code)
# notice the bit of base R
# base R is useful for certain things and it is a good idea
# to understand some of the basics at how some of the things work
# The $ sign allows us to call a column of a data frame as a vector.
data$type
# it is also possible to to select a specific entry within that vector with
# square brackets
# to call the 4th entry we can
data$type[4]
# we could also call the 4th to 10th entries
data$type[4:10]
# note to python users, we start counting at 1 and not zero
# we also include the 10th entry
# this is very different to how python does indexing
# get number of distinct entries
n_distinct(data$org_code)
# get the range of a variable (useful for dates)
range(data$period)
# see structure of data to check data types
str(data)
# see the first 5 rows of data
head(data)
# see the last 5 rows of data
tail(data)
# see the first 15 rows of data
head(data,
15)
# see the first 15 rows of data - but using dpylr
top_n(data,
15)
# see the first 15% of total rows of data - but using dpylr
# (defaults to last variable to order by)
top_frac(data,
.15)
# see the first 15% of total rows of data - but using dpylr
# ordered by attendances
top_frac(data,
.15,
attendances)
# <<< Over to you >>>>
# see if you can find the lowest 5 attendances
top_n(data,
-5,
attendances)
#########################
# see summary statistics of a dataframe
# summary is a super function that also can give great summaries of other
# objects - such as linear models
summary(data)
# quick counts by a column
table (data$type)
# or by 2 columns
table (data$type,
data$org_code)
# can do 3 columns but it starts to get silly how that is displayed
####################
# renaming columns #
####################
# dealing with spaces in variables (looking at you excel!)
# or if you want to rename a variable to a readable format for table
# lets rename our 'org_code' to 'Organisation Code'
# use back ticks to enclose a variable with with a space in it
# kind of the equivalent to square brackets in sql
data <- data |>
rename(`Organisation Code` = org_code)
# lets also rename some more variables to something horrible
data <- data |>
rename(Breaches = breaches,
PERIOD = period,
` type` = type)
# we now have the look of a typical NHS table
# we could clean all those names manually
# or we could call in a janitor to use a function to do this for us
library(janitor)
# if this is not installed, install it manually
data <- clean_names(data)
# phew! all better.
# <<< Over to you >>>>
# have a quick try at renaming breaches to 'number of breaches'
# then put it back again
# HINT - check the order or your rename!
data <- data |>
rename(`no of breaches` = breaches)
colnames(data)
data <- data |>
rename('breaches' = `no of breaches`)
#########################
# janitor has some other really helpful functions to
# convert excel dates to dates (if they do that weird numeric thing)
# find duplicate rows
# add quick totals
# check it out, tidyverse and janitor are probably the two librarys
# I call at the start of any analysis
# best to run this to ensure we are back to clean data
data <- ae_attendances
####################
# select statement #
####################
# The super useful select statement
# in very simple terms this works like a sql select statement
# we can select columns from our dataframe and in addition they
# will be ordered in the order we select them
data_select <- data |>
select (period,
org_code)
view(data_select)
# or
data_select <- data |>
select (org_code,
period)
view(data_select)
# I have included a view statement here to look at our data, there are various
# other ways to achieve the same without having do so so much or indeed any
# typing
# you can simply call the name of the dataframe object
data_select
# you can select over there on the top right hand size in the environment
# or my absolutely favourite is to hold control and click on an object name
# anyway back to codin'
# we can also do a simple rename at the select stage
data_select <- data |>
select (date_period = period,
organisation = org_code)
# can also do a ! or - for negative select
data_select <- data |>
select (!c(org_code,
period))
# NOTE - you have to put the multiples into a vector
data_select <- data |>
select (!org_code,
!period)
# THIS DOES NOT WORK
data_select <- data |>
select (-org_code,
-period)
# this does work
# we can use some additional verbs in our select statement
# select any column name that contains 'es'
data_select <- data |>
select (contains ('es'))
# select any column name that does not contains 'es'
data_select <- data |>
select (!contains ('es'))
# select columns where the data is numeric
data_select <- data |>
select (where(is.numeric))
# select breaches then admissions and then everything else as it is
data_select <- data |>
select (admissions,
breaches,
everything())
# <<< Over to you >>>>
# select the data so that it is order of admissions,
# any column that is a factor and then anything else
data_select <- data |>
select(admissions,
where(is.factor),
everything())
#########################
####################################################
# alternative joins - row and column concatenation #
####################################################
# a few more things to join datasets together
# standard joins are covered in the introduction course in some detail
# this is about joining tables without keys
df_one <- data |>
select (period,
org_code,
type,
attendances) |>
head()
df_two <- data |>
select (where(is.numeric)) |>
head()
# jam the two data frames together, side by side
# column bind
df_new <- cbind(df_one,
df_two)
# note duplicated column is duplicated - which can cause issues - best to remove
# or rename - also maintains order - need to be mindful you have rows that line
# up
# we can also bind data frames by rows, this is similar to concatenation
# in sql, need matching columns
df_one <- data |>
head()
df_two <- data |>
tail()
df_new <- rbind(df_one,
df_two)
# if we dont have matching columns can use bind_rows
# which matches columns where they match and puts in na for where data does not
# match
df_one <- data |>
select (period,
org_code,
type,
attendances) |>
head()
df_two <- data |>
tail()
df_new <- bind_rows(df_one,
df_two)
# or in a pipe
df_new <- df_one |>
bind_rows(df_two)
# <<< Over to you >>>>
# create a dataframe of the top 5 admissions only
# and the bottom 5 attendances only and join the two columns together
df_one <- data |>
select(admissions) |>
top_n(5, admissions)
df_two <- data |>
select(attendances) |>
top_n(-5, attendances)
df_new <- bind_cols(df_one,
df_two)
#########################
# if you want to join columns of different sizes you are
# probably better off using the join family of functions
# Combining rows that exist in both tables and dropping duplicates
# going to rename breaches to admissions to create some duplicates
df_one <- data |>
select (org_code,
admissions = breaches)
df_two <- data |>
select (org_code,
admissions)
df_new_union <- union(df_one,
df_two)
# Finding identical columns in both tables
df_new_intersect <- intersect(df_one,
df_two)
# Finding rows that don’t exist in another table
df_new_diff <- setdiff(df_one,
df_two)
#####################################
# Group within mutate and summarise #
#####################################
# this may or may not be new to you, depending on then you learnt R
# old skool would be
data_old <- data |>
group_by(type) |>
summarise (count = n()) |>
ungroup ()
# nu skool is
data_nu <- data |>
summarise (count = n(),
.by = type)
# old skool would be
data_old <- data |>
group_by(type) |>
mutate (double_admissions = admissions * 2) |>
ungroup ()
# nu skool is
data_nu <- data |>
mutate (double_admissions = admissions * 2,
.by = type)
# nu skool, dropping groups is default, no need to ungroup()
###################
# Count functions #
###################
# lets do some more dpylr
# lets do a tidy version of 'table'
# this is useful if we want that kind of summary at the end
# of a longer pipe of stuff
data_count <- data |>
count(type)
# this could be the same as
data_count <- data |>
summarise (count = n(),
.by = type)
# creates a new column like a mutate with a count by feature
# eg how many times an org_code has submitted
data_count <- data |>
add_count(org_code)
# is the same as
data_count <- data |>
mutate (count = n(),
.by = type)
###################
# fancy filtering #
###################
# filters data to org codes that contain a 'R' at any point
data_filter <- data |>
filter(str_detect(org_code, "R"))
# filters data to latest date period per org code
# (such an awesome feature to put a group by in a filter!)
data_filter <- data |>
filter(period == max(period),
.by = org_code)
# filters data to type 1 AND 'attendances over 10,000
data_filter <- data |>
filter(type == '1',
attendances > 10000)
# filters data to type 1 OR 'attendances over 10,000
data_filter <- data |>
filter(type == '1' |
attendances > 10000)
# <<< Over to you >>>>
# can you write a script to check if we have one row per org_code
# if we have not, return only those where we have more than one row?
# and for bonus points put them in order by number of rows?
# HINT - you can count on the fact we have already covered how to do this
# and maybe look at what else the function can do (?count)
data_filter <- data |>
count(org_code,
sort = TRUE) |>
filter (n > 1)
#########################
################
# conditionals #
################
# basic two part conditional - if_else
# lets flag all instances where attendances were above 20,000
# a simple if_else statement if condition, do this else do that
data <- data |>
mutate(above_20000 = if_else (attendances >= 20000,
'Y',
'N'))
# NOTE data types and outputs for two conditions need to be the same
data <- data |>
mutate(above_20000 = if_else (attendances >= 20000,
100,
"9999"))
# will throw a wobble
# if_else is great for a single conditional - you can nest if_else statements
# but that gets really messy quickly, especially with the amount of brackets at
# the end
# multi part conditional - or case statement
# lets create a grouping column for our attendances in 5000s
data <- data |>
mutate(
attendance_grouping = case_when(
attendances < 5000 ~ 'Less than 5,000',
attendances < 10000 ~ '5,000 to 9,999',
attendances < 15000 ~ '10,000 to 14,999',
attendances < 20000 ~ '15,000 to 19,999',
attendances < 25000 ~ '20,000 to 24,999',
.default = 'Over 25,0000'
)
)
# the .default gives the default or 'else' statement
# the '~' is called a tilde and can be found as shift # next to the return key
# there is also an old skool way of doing this
data <- data |>
mutate(
attendance_grouping = case_when(
attendances < 5000 ~ 'Less than 5,000',
attendances < 10000 ~ '5,000 to 9,999',
attendances < 15000 ~ '10,000 to 14,999',
attendances < 20000 ~ '15,000 to 19,999',
attendances < 25000 ~ '20,000 to 24,999',
TRUE ~ 'Over 25,0000'
)
)
# note the 'true' is the else statement and you use a tilde rather than equals
# both work the same, the .default is the more modern method
# however lets make a deliberate issue
# for example this case statement fails if we have a value of exactly 25,000
# well not fail, just puts something in the wrong category
data$attendances[1] <- 25000
# now rerun the above case statement
# Lets also add some null data
data$attendances[2] <- NA
# again the case statement 'works' but is incorrect
# personally I find it best to use the true statement as an error catch
data <- data |>
mutate(
attendance_grouping = case_when(
attendances < 5000 ~ 'Less than 5,000',
attendances < 10000 ~ '5,000 to 9,999',
attendances < 15000 ~ '10,000 to 14,999',
attendances < 20000 ~ '15,000 to 19,999',
attendances < 25000 ~ '20,000 to 24,999',
attendances > 25000 ~ 'Over 25,0000',
.default = 'ERROR - does not compute'
)
)
# we can fix this by adding a >= and changing our grouping
# description to '25,000 and over'
# <<< Over to you >>>>
# add a column that if type 1 halves the attendances
# if type 2 triples the attendances
# if type other quads the attendances
# if error a suitable error
# HINT - for your returns ensure they are all of the same data type
data <- data |>
mutate(attendtance_multliper = case_when(type == '1' ~ attendances * 0.5,
type == '2' ~ attendances * 3,
type == 'other' ~ attendances * 4,
.default = -9999))
#########################
# base R if statement - allows us to do what I call a one-sided if statement
# really useful if you want to trigger a conditional process
# curly brackets denote a 'scope' - a scope being a piece of code that may
# not necessarily be evaluated and does not always affect the global environment
a <- 5
if (a == 5) {
a <- 10
print ('a is now 10')
b <- a
print('b has been created as a variable and is now a')
}
# this is really powerful as works by if condition in brackets is met
# to do ALL of what is in the brackets
# the 'scope' of the if statement is global
a <- 5
if (a == 5) {
a <- 10
print ('a is now 10')
c <- a
print('c has been created as a variable and is now a')
} else {
print(paste0('a is ', a))
print('c does not exist')
}
#####################
# group and mutate #
# to make sub total #
#####################
# grouping and mutating rather than summarising
# calc total and percentages by month and org
# lets create a total number of attendances across all types by org and month
data_tot_perc <- data |>
mutate (total_attend = sum(attendances),
perc_attend = (attendances / total_attend) * 100,
.by = c(org_code, period))
# <<< Over to you >>>>
# that creates a pretty long decimal as a percentage
# can you round it to 1 decimal place?
# may need a little google fu
# HINT - be very mindful of your commas and brackets!
data_tot_perc <- data |>
mutate (total_attend = sum(attendances),
perc_attend = round((attendances / total_attend) * 100 ,1),
.by = c(org_code, period))
# or alternatively if you are not confident with your brackets and commas
data_tot_perc <- data |>
mutate (total_attend = sum(attendances),
perc_attend = (attendances / total_attend) * 100,
perc_attend = round(perc_attend,1),
.by = c(org_code, period))
#########################
# say we wanted to do that on all our numeric data
# the awesome across function allows us to do pretty fancy stuff
data_tot_perc <- data |>
mutate (across(where(is.numeric),
~(. / sum(.)) * 100,
.names = "perc_{.col}"),
.by = c(org_code, period))
# across also uses tidy select functions
data_tot_perc <- data |>
mutate (across(contains('es'),
~(. / sum(.)) * 100,
.names = "perc_{.col}"),
.by = c(org_code, period))
# this may be a little too advanced for now, but do come back to it!
#######################
# row wise operations #
#######################
# we are now going to do a rowwise operation to find the maximum of our
# attendances, admissions and our newly created column
# in essence rowwise treats each each row as its own group
data <- data |>
rowwise() |>
mutate(max_col = max(attendances,
admissions,
breaches,
na.rm = TRUE)) |>
ungroup()
# note that rowwise is a grouping function and needs to be ungrouped
##########################
# pivot wider and longer #
##########################
# pivoting data to longer or wider formats
# often we want long data for charts and wide data for tables and often have to
# convert from one to another
# lets go wide, lets look at a handful of sites and attendances and pivot wider
# on date
# ie convert
# org period attendances
# abd jan 100
# abd feb 200
# abd mar 300
# to
# org jan feb mar
# abd 100 200 300
# lets start with filtering our data and selecting only a few columns
data_wide <- data |>
filter(org_code == 'RQM',
type == '1',
period >= '2018-08-01') |>
select (org_code,
period,
attendances)
# have quick look at your data and see what shape it is in
# lets pivot
data_wide <- data_wide |>
pivot_wider(names_from = period,
values_from = attendances)
# <<< Over to you >>>>
# do the same but for breaches
# HINT - remember to run the first bit and pull through the right data before
# you pivot
# for bonus points make the process one pipe
data_wide <- data |>
filter(org_code == 'RQM',
type == '1',
period >= '2018-08-01') |>
select (org_code,
period,
breaches) |>
pivot_wider(names_from = period,
values_from = breaches)
#########################
# lets do a complex version
data_wide <- data |>
filter(org_code == 'RF4',
#type == '1', # have not removed the type this time
period >= '2018-08-01') |>
select (org_code,
period,
attendances,
type) # have included type back in
# look at the data
# lets pivot
data_wide <- data_wide |>
pivot_wider(names_from = period,
values_from = attendances)
# lets do a complex version with more sites
data_wide <- data |>
filter(org_code %in% c('RQM',
'RJ1',
'RF4'),
# type == '1', # have not removed the type
period >= '2018-08-01') |>
select (org_code,
period,
attendances,
type) # have included type back in
# look at the data
# lets pivot
data_wide <- data_wide |>
pivot_wider(names_from = period,
values_from = attendances)
# lets do a complex version with breaches as well - another period dependent
# variable
data_wide <- data |>
filter(org_code %in% c('RQM',
'RJ1',
'RF4'),
period >= '2018-08-01') |>
select (org_code,
period,
attendances,
breaches, # have breaches type back in
type)
# look at the data
# lets pivot
data_wide <- data_wide |>
pivot_wider(names_from = period,
values_from = attendances)
# yuck not what we want - reset the data and lets try again
data_wide <- data |>
filter(org_code %in% c('RQM','RJ1', 'RF4'),
period >= '2018-08-01') |>
select (org_code,
period,
attendances,
breaches, # have breaches type back in
type)
# lets pivot
data_wide <- data_wide |>
pivot_wider(names_from = period,
values_from = c(attendances,
breaches))
# r has given us automatically generated column names
# (there are options to change how that is handled, but not going to go into
# that now)
# lets make our wide data long
# start with a basic wide dataset again
data_wide <- data |>
filter(org_code == 'RQM',
type == '1',
period >= '2018-08-01') |>
select (org_code,
period,
attendances)
# lets pivot
data_wide <- data_wide |>
pivot_wider(names_from = period,
values_from = attendances)
# and make it long
data_long <- data_wide |>
pivot_longer(cols = where(is.numeric),
names_to = 'period',
values_to = 'attendances')
# not going to lie, pivoting wide data to long is harder and requires much more
# wrangling, however the principle is the same and so going to move on
#####################
# rolling functions #
#####################
# lets just do some other bits of wrangling
# say we wanted a 6 month rolling mean of attendances by each of the sites in
# our data...
library(zoo)
# zoo has a lovely function for rolling windows
data <- ae_attendances
data_roll <- data |>
filter(org_code %in% c('RQM',
'RJ1',
'RDD')
) |>
arrange(org_code,
type,
period) |>
mutate(rolling = rollapply(attendances,
6,
mean,
align = 'right',
fill = NA),
.by = c(org_code,
type))
# <<< Over to you >>>>
# see if you can change the window to 3 months
# then add an additional new column with a median over 3 months
# with the median, see if you can calculate it on the middle time period
# and replace any blanks with 9999
# HINT - read your error messages
data_roll <- data |>
filter(org_code %in% c('RQM',
'RJ1',
'RDD')
) |>