novos codigos

curso-r · Dec 10, 2024 · 5f77e41 · 5f77e41
1 parent 5e27565
commit 5f77e41
Show file tree

Hide file tree

Showing 4 changed files with 434 additions and 4 deletions.
diff --git a/exemplos_de_aula/08-logistic-reg.R b/exemplos_de_aula/08-logistic-reg.R
@@ -18,6 +18,9 @@ dados_numericos <- credit_data |>
     Assets, Debt, Amount, Price
   ) |>
   drop_na()
+
+dados <- credit_data |>
+  drop_na()
 # isso aqui nao é muito legal! na proxima aula vamos ver como
 # melhorar para nao precisar manipular desse jeito antes de
 # modelar
@@ -29,15 +32,20 @@ dados_numericos <- credit_data |>
 
 # split inicial -----------------------------------------------------------
 
-credit_initial_split <- initial_split(dados_numericos)
+credit_initial_split <- initial_split(dados)
 
 treino <- training(credit_initial_split)
 teste <- testing(credit_initial_split)
 
 # receita -----------------------------------------------------------------
 
 receita <- recipe(Status ~ ., data = treino) |>
-  step_normalize(all_numeric_predictors())
+  step_dummy(Records)
+
+# esse comando abaixo é útil para que eu veja o resultado dos passos da receita
+prep(receita) |>
+  juice() |>
+  View()
 
 # modelo ------------------------------------------------------------------
 
@@ -52,7 +60,6 @@ meu_fluxo <- workflow() |>
   add_recipe(receita) |>
   add_model(modelo)
 
-
 # cv ----------------------------------------------------------------------
 
 reamostras <- vfold_cv(treino, v = 5)
@@ -65,9 +72,11 @@ tunagem <- tune_grid(
   meu_fluxo,
   resamples = reamostras,
   metrics = metricas,
-  grid =grid_regular(penalty(c(-12, 0)))
+  grid = grid_random(levels = 5, penalty(c(-12, 0)), degree(range = c(1, 5)))
 )
 
+tune_bayes(meu_fluxo, resamples = reamostras)
+
 
 # graficos ----------------------------------------------------------------
 

diff --git a/exemplos_de_aula/09-arvore-vs-regressao.R b/exemplos_de_aula/09-arvore-vs-regressao.R
@@ -0,0 +1,131 @@
+
+# Pacote ------------------------------------------------------------------
+
+library(tidymodels)
+library(tidyverse)
+library(pROC)
+
+
+# Base de dados -----------------------------------------------------------
+
+# Load the `credit_data` example dataset and keep a working copy under a
+# shorter name.
+data("credit_data")
+
+dados <- credit_data
+
+# initial split -----------------------------------------------------------
+
+# Fix the RNG seed so the train/test partition is the same on every run.
+set.seed(1)
+
+split_inicial <- dados |>
+  initial_split()
+
+# Pull the two partitions out of the split object.
+treino <- split_inicial |> training()
+teste <- split_inicial |> testing()
+
+# receitas ----------------------------------------------------------------
+
+# Recipe for the penalized logistic regression. Step order matters: missing
+# values are handled first, then nominal predictors become dummy columns,
+# and finally polynomial terms are added for some numeric predictors.
+receita_regressao <- recipe(Status ~ ., data = treino) |>
+  #step_naomit(everything()) |>
+  # Missing categories get their own explicit factor level.
+  step_unknown(Home, Marital, Job) |>
+  # Median imputation stays disabled: it would fill every NA in these
+  # columns before the KNN step runs, so the KNN imputation (and the tuning
+  # of `neighbors`) would have nothing left to do.
+  #step_impute_median(Assets, Debt, Income) |>
+  step_impute_knn(Assets, Debt, Income, neighbors = tune()) |>
+  step_dummy(all_nominal_predictors()) |>
+  step_poly(Assets, Debt, Amount, Income, degree = tune())
+
+# Recipe for the decision tree: its only step is step_zv() on the nominal
+# predictors; no imputation or dummy encoding is declared here.
+receita_arvore <- recipe(Status ~ ., data = treino) |>
+  step_zv(all_nominal_predictors())
+
+# modelos -----------------------------------------------------------------
+
+# Decision tree for classification on the rpart engine; all three
+# hyperparameters are left as placeholders to be tuned.
+modelo_arvore <- decision_tree(
+  mode = "classification",
+  cost_complexity = tune(),
+  tree_depth = tune(),
+  min_n = tune()
+) |>
+  set_engine("rpart")
+
+# Logistic regression on the glmnet engine; the penalty is tuned.
+modelo_regressao <- logistic_reg(penalty = tune()) |>
+  set_engine("glmnet")
+
+
+# workflows ---------------------------------------------------------------
+
+# Bundle each recipe with its model specification.
+workflow_arvore <- workflow(
+  preprocessor = receita_arvore,
+  spec = modelo_arvore
+)
+
+workflow_regressao <- workflow(
+  preprocessor = receita_regressao,
+  spec = modelo_regressao
+)
+
+# tunagem -----------------------------------------------------------------
+
+
+# Regular grid for the tree: 5 levels for each of the three tuned
+# hyperparameters.
+grid_arvore <- grid_regular(
+  tree_depth(range = c(10, 15)),
+  min_n(range = c(10, 30)),
+  cost_complexity(range = c(-10, -3)),
+  levels = 5
+)
+
+# Print progress while tuning and allow parallel processing.
+controle <- control_grid(verbose = TRUE, allow_par = TRUE)
+
+metricas <- metric_set(roc_auc, accuracy, sensitivity)
+
+# 3-fold cross-validation on the training partition.
+reamostras <- treino |>
+  vfold_cv(v = 3)
+
+# The tree is tuned over the explicit grid with the custom metrics and
+# control object.
+tunagem_arvore <- workflow_arvore |>
+  tune_grid(
+    resamples = reamostras,
+    grid = grid_arvore,
+    metrics = metricas,
+    control = controle
+  )
+
+# The regression is tuned with tune_grid()'s defaults for grid, metrics and
+# control.
+tunagem_regressao <- workflow_regressao |>
+  tune_grid(resamples = reamostras)
+
+autoplot(tunagem_arvore, metric = "roc_auc")
+autoplot(tunagem_regressao)
+
+# finalizando modelo ------------------------------------------------------
+
+# Plug the best hyperparameters of each tuning run into its workflow.
+# The metric is named explicitly instead of relying on select_best()'s
+# implicit choice; ROC AUC is used because the two models are compared
+# through their ROC curves at the end of this script.
+workflow_arvore_final <- workflow_arvore |>
+  finalize_workflow(
+    select_best(tunagem_arvore, metric = "roc_auc")
+  )
+
+workflow_regressao_final <- workflow_regressao |>
+  finalize_workflow(
+    select_best(tunagem_regressao, metric = "roc_auc")
+  )
+
+
+# ultimo ajuste -----------------------------------------------------------
+
+# Final fit of each finalized workflow, driven by the initial split object.
+last_fit_arvore <- workflow_arvore_final |>
+  last_fit(split = split_inicial)
+
+last_fit_regressao <- workflow_regressao_final |>
+  last_fit(split = split_inicial)
+
+# Metrics collected from each final fit.
+last_fit_arvore |> collect_metrics()
+last_fit_regressao |> collect_metrics()
+
+# curvas ROC --------------------------------------------------------------
+
+# ROC curve of each model, built from the predicted probability of `bad`.
+curva_arvore <- last_fit_arvore |>
+  collect_predictions() |>
+  roc_curve(Status, .pred_bad)
+
+curva_regressao <- last_fit_regressao |>
+  collect_predictions() |>
+  roc_curve(Status, .pred_bad)
+
+# Tree on its own first.
+autoplot(curva_arvore)
+
+# Then both curves on the same axes, labelled by model.
+curvas <- bind_rows(
+  curva_arvore |> mutate(modelo = "Árvore"),
+  curva_regressao |> mutate(modelo = "Regressão")
+)
+
+ggplot(curvas, aes(x = 1 - specificity, y = sensitivity, color = modelo)) +
+  geom_line() +
+  theme_bw()
diff --git a/exemplos_de_aula/09-logistic-reg-receita.R b/exemplos_de_aula/09-logistic-reg-receita.R
@@ -0,0 +1,134 @@
+# objetivo: ajustar uma regressao logistica no R
+# com regularizacao
+
+
+# Pacotes -----------------------------------------------------------------
+
+library(tidymodels)
+library(ISLR)
+
+
+# Base dados --------------------------------------------------------------
+
+data("credit_data")
+
+# Outcome plus numeric predictors only, restricted to complete rows.
+dados_numericos <- credit_data |>
+  select(all_of(c(
+    "Status", "Seniority", "Time", "Age", "Expenses", "Income",
+    "Assets", "Debt", "Amount", "Price"
+  ))) |>
+  drop_na()
+
+# Full dataset, missing values included (drop_na() intentionally not applied).
+dados <- credit_data
+# Manipulating the data by hand like this before modelling is not great;
+# the next class shows how to avoid it.
+
+# Quick summary of every column.
+skimr::skim(dados)
+
+# Analises iniciais -------------------------------------------------------
+
+# nao vou fazer hoje
+
+
+# split inicial -----------------------------------------------------------
+
+# Fix the RNG seed so the train/test partition (and everything derived from
+# it downstream: folds, tuning results, final metrics) is reproducible.
+set.seed(1)
+
+credit_initial_split <- initial_split(dados)
+
+# Training and testing partitions extracted from the split object.
+treino <- training(credit_initial_split)
+teste <- testing(credit_initial_split)
+
+# receita -----------------------------------------------------------------
+
+# Preprocessing recipe for the logistic regression, with Status as the
+# outcome and every other column of `treino` as a predictor. Steps run in
+# the order written. Two hyperparameters are left as tune() placeholders:
+# `neighbors` of the KNN imputation and `degree` of the polynomial expansion.
+receita <- recipe(Status ~ ., data = treino) |>
+  #step_naomit(everything()) |>
+  step_unknown(Home, Marital, Job) |>
+  #step_impute_median(Assets, Debt, Income) |>
+  step_impute_knn(Assets, Debt, Income, neighbors = tune()) |>
+  step_dummy(all_nominal_predictors()) |>
+  step_poly(Assets, Debt, Amount, Income, degree = tune())
+
+# The command below is handy for inspecting the result of the recipe steps.
+# NOTE(review): with tune() placeholders still in the recipe, prep()
+# presumably cannot run as-is; confirm, or replace them with fixed values
+# before uncommenting.
+
+# prep(receita) |>
+#   juice() |>
+#   skimr::skim()
+
+# modelo ------------------------------------------------------------------
+
+# Logistic regression on the glmnet engine; the penalty is tuned.
+modelo <- logistic_reg(penalty = tune()) |>
+  set_engine("glmnet")
+
+# workflow ----------------------------------------------------------------
+
+# Bundle the recipe and the model specification.
+meu_fluxo <- workflow(
+  preprocessor = receita,
+  spec = modelo
+)
+
+# cv ----------------------------------------------------------------------
+
+# 5-fold cross-validation on the training partition.
+reamostras <- treino |>
+  vfold_cv(v = 5)
+
+# tunagem -----------------------------------------------------------------
+
+metricas <- metric_set(mn_log_loss, accuracy, roc_auc)
+
+# Regular grid with 5 levels per parameter: penalty, polynomial degree and
+# number of neighbours.
+grade_tunagem <- grid_regular(
+  penalty(range = c(-4, -2)),
+  degree(range = c(1, 5)),
+  neighbors(),
+  levels = 5
+)
+
+# `control` governs how the tuning runs; verbose = TRUE makes it print to
+# the screen as the models are being fitted.
+tunagem <- meu_fluxo |>
+  tune_grid(
+    resamples = reamostras,
+    grid = grade_tunagem,
+    metrics = metricas,
+    control = control_grid(verbose = TRUE)
+  )
+
+autoplot(tunagem)
+
+tunagem |>
+  show_best(metric = "accuracy")
+
+# graficos ----------------------------------------------------------------
+
+autoplot(tunagem)
+
+
+# finalizar workflow ------------------------------------------------------
+
+# Best hyperparameter combination according to accuracy, plugged into the
+# workflow in place of the tune() placeholders.
+melhores_parametros <- select_best(tunagem, metric = "accuracy")
+
+workflow_final <- finalize_workflow(meu_fluxo, melhores_parametros)
+
+
+# ultimo fit --------------------------------------------------------------
+
+# Fit on the training partition of the split and evaluate on its test
+# partition, computing the same metrics used during tuning.
+ultimo_modelo <- workflow_final |>
+  last_fit(split = credit_initial_split, metrics = metricas)
+
+ultimo_modelo |> collect_metrics()
+
+# ROC curve on the test predictions, using the predicted probability of `bad`.
+ultimo_modelo |>
+  collect_predictions() |>
+  roc_curve(Status, .pred_bad) |>
+  autoplot()
+
+# modelo final
+
+# Before this step one should check the train/test results above to make
+# sure the model did not overfit.
+
+# Refit the finalized workflow on the whole dataset. This must be `dados`
+# (every column of credit_data): the recipe inside the workflow references
+# Home, Marital, Job and the other nominal predictors, which
+# `dados_numericos` does not contain, so fitting on it fails.
+modelo_final <- fit(workflow_final, data = dados)
+
+# Original columns plus predicted class probabilities and predicted class.
+dados_com_previsao <- dados |>
+  bind_cols(
+    predict(modelo_final, new_data = dados, type = "prob"),
+    predict(modelo_final, new_data = dados, type = "class")
+  )
+
+# Density of the predicted probability of `good`, split by observed Status.
+# NOTE(review): `prob_grafico` is computed but not used by the plot below;
+# confirm whether it was meant to be the x aesthetic.
+dados_com_previsao |>
+  mutate(prob_grafico = ifelse(Status == "good", .pred_good, .pred_bad)) |>
+  ggplot(aes(x = .pred_good, fill = Status)) +
+  geom_density(alpha = 0.5)