forked from rafapereirabr/gtfs_to_igraph
-
Notifications
You must be signed in to change notification settings - Fork 0
/
.Rhistory
512 lines (512 loc) · 29.6 KB
/
.Rhistory
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
df <- unique(df) # remove duplicate of identical stops
head(df)
df[, quant := .N, by = clust]
table(df$quant)
nrow(df[ parent_station ==""])
df[ location_type==1, parent_station := stop_id ]
df[ parent_station=="" & stop_id %in% df$parent_station, parent_station := stop_id ]
df[ quant > 1 , parent_station:= ifelse( parent_station !="" , parent_station,
ifelse( parent_station=="" & stop_id %in% df$parent_station, stop_id, "")), by=clust]
nrow(df[ parent_station ==""]) # 2014: 7349 |||| 2017: 5938
# Update Parent Stations for each cluster
# a) Stops which alread have parent stations stay the same
# b) stations with no parent, will receive the parent of the cluster
df[ quant > 1 , parent_station:= ifelse( parent_station !="" , parent_station,
ifelse( parent_station=="", max(parent_station), "")), by=clust]
nrow(df[ parent_station ==""]) # total number of stops without Parent Station
# d) For those clusters with no parent stations, get the 1st stop to be a Parent
df[ quant > 1 , parent_station:= ifelse( parent_station !="" , parent_station,
ifelse( parent_station== "", stop_id[1L], "")), by=clust]
nrow(df[ parent_station ==""]) # total number of stops without Parent Station
# all clusters > 1 have a parent station
df[quant > 1 & parent_station==""][order(clust)] # should be empty
# Remaining stops without Parent Station
nrow(df[ parent_station ==""]) # 5149 (2014) ||| 4397 (2017mix)
# make sure intersection does not change (before and after)
intersect(df$parent_station, stop_times$stop_id) %>% length()
# make sure parent stations are consistent within each cluster with more than one stop
df[ quant > 1 , parent_station := max(parent_station), by=clust]
unique(df$parent_station) %>% length()
#### Correction approach based on stop_name ------------------
### 2. BRT stops ------------------
# df[ parent_station=="" & stop_name %like% "BRT" ]
# df[ stop_name %like% "Magar?a" ]
# df[ clust ==41] # 18223857
# df[ stop_id ==18225872]
if (year=="2017mix" | year=="2017counter"){
# all stops on 2017-05-05
df[ grepl( "BRT.*Madureira", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Bosque da Barra", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Mato Alto", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Pastor José Santos", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Praça do Carmo", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Guaporé", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Marambaia", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Vila Kosmos", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Pedro Taques", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Vila Queiroz", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Vaz Lobo", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Mercadão", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Otaviano", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Rio 2", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Merck", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Arroio Pavuna", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Vila Sapé", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Recanto das Palmeiras", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Divina Providência", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Pedro Correia", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Curicica", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Praça do Bandolim", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Tanque", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Galeão 2", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Galeão 1", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Guiomar Novaes", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Santa Efigênia", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*André Rocha", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Aracy Cabral", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Taquara", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Centro Metropolitano", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Vila Militar", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Rede Sarah", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Marechal Fontenelle", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*São José de Magalhães Bastos", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Pe. João Chribbin", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Ventura", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Aeroporto de Jacarepaguá", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Colônia", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Lourenço Jorge", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Outeiro Santo", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Boiúna", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Leila Diniz", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Ilha Pura", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Tapebuias", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Catedral do Recreio", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Minha Praia", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Asa Branca", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Morro do Outeiro", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Parque Olímpico", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Olof Palme", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Riocentro", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Via Parque", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Gilka Machado", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Bosque Marapendi", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Paulo Malta Rezende", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Dom Bosco", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Ricardo Marinho", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Riviera", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Afrânio Costa", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Parque das Rosas", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Pontões/Barrasul", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Ibiapina", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Maré", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Santa Luzia", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Cardoso de Moraes", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Fundão", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Barra Shopping", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Benvindo de Novaes", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Ilha de Guaratiba", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Curral Falso", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Interlagos", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Nova Barra", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Gelson Fonseca", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Guignard", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Pedra de Itaúna", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Riomar", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Novo Leblon", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Américas Park", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Golfe Olímpico", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Cesarinho", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Três Pontes", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*31 de Outubro", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Cesarão II", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Cesarão I", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Vila Paciência", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Cesarão III", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Pingo d'Água", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Cajueiros", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Vendas de Varanda", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Cetex", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Embrapa", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Pontal", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Notre Dame", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Santa Cruz", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*General Olímpio", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Gastão Rangel", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Recanto das Garças", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Gláucio Gil", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Recreio Shopping", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Santa Mônica Jardins", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Salvador Allende", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Santa Veridiana", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Pinto Teles", stop_name), parent_station := max(parent_station) ]
df[ stop_name %like% "BRT.*Penha|Terminal Penha", parent_station := max(parent_station) ]
#### BRT special cases
df[ grepl( "BRT.*Tanque|Terminal.*Tanque", stop_name) , parent_station := max(parent_station) ]
df[ grepl( "BRT.*Magarça|Terminal.*Magarça", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Alvorada|Terminal.*Alvorada", stop_name), parent_station := max(parent_station) ]
df[ grepl( "BRT.*Taquara", stop_name), parent_station := "31827958" ]
# BRT Transcarioca - Fundão
temp <- df[ grepl( "BRT.*Fundão", stop_name), .(parent_station) ][1]
df[ stop_name %like% "Terminal Aroldo Melodia", parent_station := temp]
# BRT Santa Eugênia merge integracao a Trem paciencia
df[ stop_id =="9641", parent_station := "49666230" ]
# Jardim Oceanico, Metro e BRT
df[ stop_name %like% "Jardim Oceânico", parent_station := max(parent_station)]
}
# Remaining stops without Parent Station
nrow(df[ parent_station ==""]) # ||| 2017mix: 4396
### 3. stops in Terminals ------------------
# df[ parent_station=="" & stop_name %like% "Terminal" ]
# df[ stop_name %like% "Terminal Aroldo Melodia" ]
terminal_to_correct <- c( "Terminal Da Puc próximo"
, "Terminal da Puc próximo"
, "Terminal Alvorada"
, "Terminal Padre Henrique Otte - Plataforma")
for (i in terminal_to_correct ){
df[ stop_name %like% i, parent_station := ifelse( parent_station !="" , parent_station, # a
ifelse( parent_station=="", max(stop_id), "ERRO"))]
}
# Remaining stops without Parent Station
nrow(df[ parent_station ==""]) # 2014: 5144 ||| 2017: 4396
### 4. stops with Plataforma ------------------
df[ parent_station=="" & stop_name %like% "Plataforma", ]
df[ parent_station=="" & stop_name %like% "plataforma", ]
### 5. stops with Rodoviária ------------------
if (year=="2017mix" | year=="2017counter"){
# df[ parent_station=="" & stop_name %like% "rodoviária", ]
# df[ parent_station=="" & stop_name %like% "Rodoviária", ]
# df[ parent_station=="" & stop_name %like% "Terminal Rodo", ]
# Fix Terminal Rodoviário Novo Rio
df[ stop_id == '28420676', parent_station := '28420676'] # Novo Rio
df[ stop_id == '18242377', parent_station := '28420676']
df[ stop_id == '47998886', parent_station := '28420676']
df[ stop_id == '47999015', parent_station := '28420676']
}
df[ stop_name %like% 'Novo Rio',]
# Remaining stops without Parent Station
nrow(df[ parent_station ==""]) # 2014: 5080 ||||| 017mix: 4394
### 5. stops with Estação ------------------
#ok ??? necesarrio?
df[ parent_station=="" & stop_name %like% "Estação", ]
df[ parent_station=="" & stop_name %like% "estação", ]
if (year=="2014" | year=="2017counter"){
# Estação de Integração - UFRJ
df[ stop_id =="19859078" , parent_station := "19859078"]
df[ stop_id =="19859062" , parent_station := "19859078"] }
### 6. stops with Central do Brasil ------------------
df[ stop_name %like% "Central do Brasil", ]
df[ stop_name %like% "Central", ]
temp <- "parent_CentraldoBrasil"
df[ stop_name %like% "Central do Brasil", parent_station := temp]
df[ stop_name == "Central", parent_station := temp]
df[ stop_id == "18270480", parent_station := temp]
df[ stop_id == "18223617", parent_station := temp]
df[ stop_id == "18226161", parent_station := temp] # Procópio Ferreira
df[ stop_id == "31658487", parent_station := temp]
df[ stop_id == "18370876", parent_station := temp]
df[ stop_name %like% "Procópio Ferreira|PROCÓPIO FERREIRA", parent_station := temp]
# ### 6. Train Stations ------------------
# um solucao seria (1) identifica paradas q sao de trem, depois aplica essa formula soh para 2014
# only for 2014 because it has train statations and plataforms as different stops
# # get a list with all train station names form 2017
# dftrens17 <- "./GTFS data/GTFS Rio feed_supervia 2017-03-06.zip"
# dftrens17 <- lapply( list(dftrens17) , unzip_fread_stops, file="stops.txt") %>% rbindlist()
# Encoding(dftrens17$stop_name) <- "UTF-8"
if (year=="2014" | year=="2017counter"){
trains_to_correct <- c( "São Cristóvão"
, "Maracanã"
, "Praça da Bandeira"
, "São Francisco Xavier"
, "Silva Freire"
, "Méier"
, "Engenho de Dentro"
, "Piedade"
, "Quintino"
, "Cascadura"
, "Ricardo de Albuquerque"
, "Vila Militar"
, "Magalhães Bastos"
, "Guilherme da Silveira"
, "Senador Camará"
, "Santíssimo"
, "Augusto Vasconcelos"
, "Benjamin do Monte"
, "Triagem"
, "Tancredo Neves"
, "Jacarezinho"
, "Pilares"
, "Mercadão de Madureira"
, "Rocha Miranda"
, "Honório Gurgel"
, "Barros Filho"
, "Costa Barros"
, "Mangueira"
, "Manguinhos"
, "Penha Circular"
, "Brás de Pina"
, "Parada de Lucas"
, "Vigário Geral")
for (i in trains_to_correct ){
df[ stop_name %like% i, parent_station := max(stop_id) ] }
}
# Train station with names common to other places
# in general, they are Ok, tentativa de correcao pode confundir e atrapalhar
nrow(df[ parent_station ==""]) # 5139 (2014) ||| 4390 (2017mix)
### Special Cases
# parada na Uranos da Direto na Estacao de Trem Bonsucesso, 2014, 2017
df[ stop_id == "18223413", parent_station := "18710007"]
# parada da Uranos perto da Estacao de Trem Ramos, mas inacessivel
df[ stop_id == '18227189', parent_station := '18227189']
# Estacao Deodoro
df[ stop_name %like% "Deodoro", parent_station := max(parent_station)]
nrow(df[ parent_station ==""]) # 5138 (2014) ||| 4389 (2017mix)
# # parada de onibus muito perto do metro em 2017
# df[ stop_id =='18370886', parent_station := stop_id] # metro Gloria
# df[ stop_id =='18370884', parent_station := stop_id] # metro Cinelandia
# df[ stop_id =='18370882', parent_station := stop_id] # metro Carioca
# merge trem e BRT
if (year=="2017mix" | year=="2017counter"){
# Estacao Madureira, merge trem e BRT
df[ stop_id == "9608", parent_station := "31827977"]
df[ stop_id == "18231461", parent_station := "31827977"]
# Estacao Olaria, merge trem e BRT
df[ grepl( "BRT.*Olaria", stop_name), parent_station := max(parent_station) ]
df[ stop_id == "9687", parent_station := "31828618"]
# Estacao Vicente de Carvalho, merge trem e BRT
temp <- df[ grepl( "BRT.*Vicente de Carvalho", stop_name)] %>% .$parent_station %>% max()
df[ grepl( "BRT.*Vicente de Carvalho", stop_name), parent_station := temp]
df[ stop_id %like% "18371101", parent_station := temp ]
df[ stop_id %like% "38734816", parent_station := temp ]
df[ stop_id %like% "38734700", parent_station := temp ]
df[ stop_id %like% "18228584", parent_station := temp ]
df[ stop_id %like% "38734698", parent_station := temp ]
# Estacao Vila Militar, merge trem e BRT
df[ stop_id == "9630", parent_station := "48602908"]
}
### 6. Subway Stations ------------------
# df[ stop_name %like% "Vicente de Carvalho"]
# df[ clust ==49 ] # 18223857
# df[ stop_id ==18370876]
# this should be empty
stop_times[ !(stop_id %in% df$stop_id) & (stop_id %in% df$parent_station)]
### identify and remove stops with no links at all ---------------------
df[ quant ==1 & parent_station=="", isolated_dummy := if_else( stop_id %in% stop_times$stop_id, 0,1)]
sum(df$isolated_dummy, na.rm = T)
ggplot(data=subset(df, isolated_dummy==1), aes(x=stop_lon,y=stop_lat)) + geom_point()
# Do they have any links via a parent station?
# if the isolated stop has a parent_station that is present in the stop_times file, then replace stop_id with parent station, change back isolaton status
df[ isolated_dummy ==1 & parent_station %in% stop_times$stop_id, c('stop_id', 'isolated_dummy'):= list(parent_station, 0)]
sum(df$isolated_dummy, na.rm = T)
# isolated stops excluded: ||||| 2017: 127
stops_excluded <- df[ (isolated_dummy==1 & parent_station!="") ]
cat(paste("number of isolated stops (inactive with no trips) excluded:", nrow(stops_excluded) ))
# # Andherson
# fwrite(stops_excluded, "stops_with_no_trips.csv")
# if no connection at all, remove isolated (inactive) stops
df <- subset(df, !(stop_id %in% stops_excluded$stop_id))
sum(df$isolated_dummy, na.rm=T)
#### Final evaluation -----------------------
# all clusters > 1 have a parent station
df[quant > 1 & parent_station==""][order(clust)] # this should be empty
# number of stops per cluster
table(df$quant)
# Remaining stops without Parent Station
nrow(df[ quant ==1 & parent_station ==""]) # 5071 ||| 4250
# for the lonly stops, make sure they are the Parent station of themselves
df[ quant ==1 & parent_station=="" , parent_station := stop_id , by=stop_id]
nrow(df[ parent_station ==""]) # 0
# update lat + long to be the same as 1st Parent Station
df[, stop_lon := stop_lon[1], by=parent_station]
df[, stop_lat := stop_lat[1], by=parent_station]
unique(df$stop_id) %>% length() # (2014, 7626)(2017mix, 6338)
unique(df$parent_station) %>% length() # (2014, 6283)(2017mix, 5290)
# # fazer mapa antes e depois
# map_after <- ggplot() + geom_point(data=df, aes(x=stop_lon,y=stop_lat,label=c(parent_station)), color="gray") + coord_equal() + ggtitle("After")
# map_after
# map_before + geom_point(data=df, aes(x=stop_lon,y=stop_lat,label=c(parent_station)), color="gray") + coord_equal() + ggtitle("After")
# the core of the CORRECTION is this ---------------------
# the core of the CORRECTION is this
# the core of the CORRECTION is this
# stop_times2 <- copy(stop_times)
# stop_times <- copy(stop_times2)
#
# Add parent_station info to stop_times
# based on correspondence btwn stop_times$stop_id and df$stop_id
stop_times[df, on= 'stop_id', c('clust', 'parent_station') := list(i.clust, i.parent_station) ]
# trips with non-identified stops (2014: 766,791 ||| 2017: 328,421)
sum(is.na(stop_times$parent_station))
# non-identified stops (2014: 1003 ||| 2017mix: 0)
stop_times[is.na(parent_station), .(stop_id, parent_station)] %>% unique() %>% nrow()
# CRUX: Replace stop_id with parent_station --------------------
stop_times[ !is.na(parent_station) , stop_id := parent_station ]
df[ , stop_id := parent_station ]
# remove repeated stops
df <- unique(df)
# identify transport modes, route and service level for each trip
routes <- routes[,.(route_id, route_type)] # keep necessary cols
trips <- trips[,.(route_id, trip_id, service_id)] # keep necessary cols
trips[routes, on=.(route_id), route_type := i.route_type] # add route_type to trips
# add these columns to stop_times: route_id, route_type, service_id
stop_times[trips, on=.(trip_id), c('route_id', 'route_type', 'service_id') := list(i.route_id, i.route_type, i.service_id) ]
gc(reset = T)
# # Only keep trips during weekdays
# # remove columns with weekends
# calendar <- calendar[, -c('saturday', 'sunday')]
#
# # keep only rows that are not zero (i.e. that have service during weekday)
# calendar <- calendar[rowMeans(calendar >0)==T,]
#
# # Only keep those trips which run on weekdays
# stop_times2 <- subset(stop_times, service_id %in% calendar$service_id)
# subset columns
stop_times <- stop_times[, .(route_type, route_id, trip_id, stop_id, stop_sequence, arrival_time, departure_time)]
# get outputs stop_times and stops_edited in a list with name corresponding to year of input data
newList <- list("stop_times_" = stop_times, "stops_edited_"= df)
names(newList) <- paste0(names(newList), year) # attach year to name of output
# return result of function
return(list2env(newList ,.GlobalEnv))
list_gtfs_2017mix <- list.files("./GTFS data/GTFS_used/2017mix", full.names = T)
source("./R scripts/00_correct_GTFS_stops.R")
correct_gtfs_for_igraph(list_gtfs_2017mix, dist_threshold = 30, year="2017mix")
rm(list=setdiff(ls(), c('stop_times_2014','stop_times_2017mix', 'stop_times_2017counter', 'stops_edited_2014', 'stops_edited_2017mix', 'stops_edited_2017counter')))
gc(reset = T)
beep()
stop_times <- copy(stop_times_2017mix)
stops <- copy(stops_edited_2017mix)
stop_times[, stop_sequence := as.numeric(stop_sequence) ]
head(stop_times)
stop_times[order(trip_id, stop_sequence, route_id)] %>% head
head(stop_times)
stop_times[order(trip_id, stop_sequence, route_id)] %>% head
stop_times[order(route_id, trip_id, stop_sequence)] %>% head
stop_times[order(route_id, trip_id, stop_sequence)] %>% head
stop_times[order(trip_id, stop_sequence, route_id)] %>% head
setorder(stop_times, trip_id, stop_sequence, route_id)
head(stop_times)
setwd("R:/Dropbox/github/gtfs_to_igraph")
setwd("R:/Dropbox/github/gtfs_to_igraph")
my_gtfs_feeds <- list.files(path = ".", pattern =".zip", full.names = T)
list_gtfs = my_gtfs_feeds
dist_threshold =30
save_muxviz =T
##################### Load packages -------------------------------------------------------
library(igraph)
library(data.table)
library(dplyr)
library(magrittr)
library(sp)
library(geosphere)
############ 0. read GTFS files -----------------
# list_gtfs= list_of_gtfs_feeds
# dist_threshold = 30
cat("reading GTFS data \n")
# function to read and rbind files from a list with different GTFS.zip
tmpd <- tempdir()
unzip_fread_gtfs <- function(zip, file) { unzip(zip, file, exdir=tmpd) %>% fread(colClasses = "character") }
unzip_fread_routes <- function(zip, file) { unzip(zip, file, exdir=tmpd) %>% fread(colClasses = "character", select= c('route_id', 'route_short_name', 'route_type', 'route_long_name')) }
unzip_fread_trips <- function(zip, file) { unzip(zip, file, exdir=tmpd) %>% fread(colClasses = "character", select= c('route_id', 'service_id', 'trip_id', 'direction_id')) }
unzip_fread_stops <- function(zip, file) { unzip(zip, file, exdir=tmpd) %>% fread(colClasses = "character", select= c('stop_id', 'stop_name', 'stop_lat', 'stop_lon', 'parent_station', 'location_type')) }
unzip_fread_stoptimes <- function(zip, file) { unzip(zip, file, exdir=tmpd) %>% fread(colClasses = "character", select= c('trip_id', 'arrival_time', 'departure_time', 'stop_id', 'stop_sequence')) }
# Read
stops <- lapply( list_gtfs , unzip_fread_stops, file="stops.txt") %>% rbindlist()
stop_times <- lapply( list_gtfs , unzip_fread_stoptimes, file="stop_times.txt") %>% rbindlist()
routes <- lapply( list_gtfs , unzip_fread_routes, file="routes.txt") %>% rbindlist()
trips <- lapply( list_gtfs , unzip_fread_trips, file="trips.txt") %>% rbindlist()
calendar <- lapply( list_gtfs , unzip_fread_gtfs, file="calendar.txt") %>% rbindlist()
# make sure lat long are numeric, and text is encoded
stops[, stop_lon := as.numeric(stop_lon) ][, stop_lat := as.numeric(stop_lat) ]
Encoding(stops$stop_name) <- "UTF-8"
############ 1. Identify stops that closee than distance Threshold in meters ------------------
cat("calculating distances between stops \n")
### Convert stops into SpatialPointsDataFrame
# lat long projection
myprojection_latlong <- CRS("+proj=longlat +datum=WGS84 +ellps=WGS84 +towgs84=0,0,0")
# convert stops into spatial points
coordinates(stops) <- c("stop_lon", "stop_lat")
proj4string(stops) <- myprojection_latlong
stops <- spTransform(stops, myprojection_latlong)
# use the distm function to generate a geodesic distance matrix in meters
mdist <- distm(stops, stops, fun=distHaversine)
# cluster all points using a hierarchical clustering approach
hc <- hclust(as.dist(mdist), method="complete")
# define clusters based on a tree "height" cutoff "d" and add them to the SpDataFrame
stops@data$clust <- cutree(hc, h=dist_threshold)
gc(reset = T)
# convert stops back into data frame
df <- as.data.frame(stops) %>% setDT()
df <- df[order(clust)]
df <- unique(df) # remove duplicate of identical stops
head(df)
# identify how many stops per cluster
df[, quant := .N, by = clust]
table(df$quant)
plot(df$stop_lon, df$stop_lat, col=df$clust)
############ 2. Identify and update Parent Stations ------------------
cat("Identifying and updating Parent Stations \n")
# How many stops have a Parent Station
nrow(df[ parent_station !=""])
# How many stops without Parent Station
nrow(df[ parent_station ==""])
# Stops which are Parent Stations (location_type==1) will be Parent Stations of themselves
df[ location_type==1, parent_station := stop_id ]
# in case the field location_type is missinformed
df[ parent_station=="" & stop_id %in% df$parent_station, parent_station := stop_id ]
df[ quant > 1 , parent_station:= ifelse( parent_station !="" , parent_station,
ifelse( parent_station=="" & stop_id %in% df$parent_station, stop_id, "")), by=clust]
#total number of stops without Parent Station
nrow(df[ parent_station ==""])
# Update Parent Stations for each cluster
# a) Stops which alread have parent stations stay the same
# b) stations with no parent, will receive the parent of the cluster
df[ quant > 1 , parent_station:= ifelse( parent_station !="" , parent_station,
ifelse( parent_station=="", max(parent_station), "")), by=clust]
nrow(df[ parent_station ==""])
# d) For those clusters with no parent stations, get the 1st stop to be a Parent
df[ quant > 1 , parent_station:= ifelse( parent_station !="" , parent_station,
ifelse( parent_station== "", stop_id[1L], "")), by=clust]
nrow(df[ parent_station ==""])
# all clusters > 1 have a parent station
df[quant > 1 & parent_station==""][order(clust)] # should be empty
# Remaining stops without Parent Station
nrow(df[ parent_station ==""])
# make sure parent stations are consistent within each cluster with more than one stop
df[ quant > 1 , parent_station := max(parent_station), by=clust]
unique(df$parent_station) %>% length()
# for the lonly stops, make sure they are the Parent station of themselves
df[ quant ==1 & parent_station=="" , parent_station := stop_id , by=stop_id]
nrow(df[ parent_station ==""]) == 0
############ 3. Update Lat long of stops based on parent_station -----------------------
# Update in stops data: get lat long to be the same as 1st Parent Station
df[, stop_lon := stop_lon[1], by=parent_station]
df[, stop_lat := stop_lat[1], by=parent_station]
# Update in stop_times data: get lat long to be the same as 1st Parent Station
# Add parent_station info to stop_times
# merge stops and stop_times based on correspondence btwn stop_times$stop_id and df$stop_id
stop_times[df, on= 'stop_id', c('clust', 'parent_station') := list(i.clust, i.parent_station) ]
# CRUX: Replace stop_id with parent_station
stop_times[ !is.na(parent_station) , stop_id := parent_station ]
df[ , stop_id := parent_station ]
# remove repeated stops
df <- unique(df)
############ 4. identify transport modes, route and service level for each trip -----------------------
routes <- routes[,.(route_id, route_type)] # keep only necessary cols
trips <- trips[,.(route_id, trip_id, service_id)] # keep only necessary cols
trips[routes, on=.(route_id), route_type := i.route_type] # add route_type to trips
# add these columns to stop_times: route_id, route_type, service_id
stop_times[trips, on=.(trip_id), c('route_id', 'route_type', 'service_id') := list(i.route_id, i.route_type, i.service_id) ]
gc(reset = T)
# # Only keep trips during weekdays
# # remove columns with weekends
# calendar <- calendar[, -c('saturday', 'sunday')]
#
# # keep only rows that are not zero (i.e. that have service during weekday)
# calendar <- calendar[rowMeans(calendar >0)==T,]
#
# # Only keep those trips which run on weekdays
# stop_times2 <- subset(stop_times, service_id %in% calendar$service_id)
stops_edited <- df[, .(stop_id, stop_name, parent_station, location_type, stop_lon, stop_lat)]
stop_times_edited <- stop_times[, .(route_type, route_id, trip_id, stop_id, stop_sequence, arrival_time, departure_time)]
stop_times_edited[, stop_sequence := as.numeric(stop_sequence) ]
head(stop_times_edited)
stop_times_edited[order(trip_id, stop_sequence, route_id)] %>% head
stop_times_edited[order(trip_id, stop_sequence, route_id)] %>% head