diff --git a/doc/pub/week48/html/week48-bs.html b/doc/pub/week48/html/week48-bs.html index 1c2786548..0a899735c 100644 --- a/doc/pub/week48/html/week48-bs.html +++ b/doc/pub/week48/html/week48-bs.html @@ -38,6 +38,18 @@ {'highest level': 2, 'sections': [('Overview of week 48', 2, None, 'overview-of-week-48'), ('Plan for week 47', 2, None, 'plan-for-week-47'), + ('Random Forest Algorithm, reminder from last week', + 2, + None, + 'random-forest-algorithm-reminder-from-last-week'), + ('Random Forests Compared with other Methods on the Cancer Data', + 2, + None, + 'random-forests-compared-with-other-methods-on-the-cancer-data'), + ('Compare Bagging on Trees with Random Forests', + 2, + None, + 'compare-bagging-on-trees-with-random-forests'), ("Boosting, a Bird's Eye View", 2, None, @@ -267,74 +279,77 @@ @@ -386,7 +401,7 @@

Nov 23, 2024

  • diff --git a/doc/pub/week48/html/week48-reveal.html b/doc/pub/week48/html/week48-reveal.html index e1ed40818..d4fa5f1a5 100644 --- a/doc/pub/week48/html/week48-reveal.html +++ b/doc/pub/week48/html/week48-reveal.html @@ -209,7 +209,7 @@

    Plan for week 47

  • Lab sessions at usual times.
  • -

  • For the week of December 2-6, lab sessions atart at 10am and end 4pm, room FØ434, Tuesday and Wednesday
  • +

  • For the week of December 2-6, lab sessions start at 10am and end at 4pm, room FØ434, Tuesday and Wednesday
  • @@ -222,8 +222,8 @@

    Plan for week 47

  • Summary of course
  • Readings and Videos:
      -

    1. These lecture notes at https://github.com/CompPhysics/MachineLearning/blob/master/doc/pub/week47/ipynb/week48.ipynb
    2. -

    3. See also lecture notes from week 47 at https://github.com/CompPhysics/MachineLearning/blob/master/doc/pub/week46/ipynb/week47.ipynb. The lecture on Monday starts with a repetition on AdaBoost before we move over to gradient boosting with examples +

    4. These lecture notes at https://github.com/CompPhysics/MachineLearning/blob/master/doc/pub/week48/ipynb/week48.ipynb
    5. +

    6. See also lecture notes from week 47 at https://github.com/CompPhysics/MachineLearning/blob/master/doc/pub/week47/ipynb/week47.ipynb. The lecture on Monday starts with a repetition on AdaBoost before we move over to gradient boosting with examples
    7. Video on Decision trees https://www.youtube.com/watch?v=RmajweUFKvM&ab_channel=Simplilearn
    8. @@ -237,6 +237,183 @@

      Plan for week 47

      +
      +

      Random Forest Algorithm, reminder from last week

      + +

      The algorithm described here can be applied to both classification and regression problems.

      + +

      We will grow a forest of, say, \( B \) trees.

      +
        +

      1. For \( b=1:B \) +
          +

        1. Draw a bootstrap sample from the training data organized in our \( \boldsymbol{X} \) matrix.
        2. +

        3. We then grow a random forest tree \( T_b \) based on the bootstrapped data by repeating the steps below until the maximum node size is reached
        4. +
            + +

          1. we select \( m \le p \) variables at random from the \( p \) predictors/features
          2. + +

          3. pick the best split point among the \( m \) features using for example the CART algorithm and create a new node
          4. + +

          5. split the node into daughter nodes
          6. +
          +

          +

        +

        +

      2. Then output the ensemble of trees \( \{T_b\}_1^{B} \) and use it to make predictions for either a regression or a classification problem.
      3. +
      +
      + +
      +
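
      To make the steps above concrete, here is a minimal editorial sketch of the algorithm (an illustration added here, not part of the original notes). It assumes scikit-learn is available and reuses DecisionTreeClassifier for the inner CART splits; its max_features=m option draws \( m \) features at random at every split and picks the best split among them, which covers the inner steps of the list. The names B and m match the list above.

      import numpy as np
      from sklearn.datasets import load_breast_cancer
      from sklearn.tree import DecisionTreeClassifier

      rng = np.random.default_rng(2024)
      X, y = load_breast_cancer(return_X_y=True)
      n, p = X.shape
      B = 100              # number of trees in the forest
      m = int(np.sqrt(p))  # features drawn at random at each split, m <= p

      forest = []
      for b in range(B):
          # Step 1a: draw a bootstrap sample (sample n rows with replacement)
          idx = rng.integers(0, n, size=n)
          # Steps 1b i-iii: grow a CART tree on the bootstrap sample;
          # max_features=m performs the random feature selection at every split
          tree = DecisionTreeClassifier(max_features=m, random_state=b)
          forest.append(tree.fit(X[idx], y[idx]))

      # Step 2: aggregate the ensemble, majority vote for classification
      # (for regression one would average the tree predictions instead)
      votes = np.stack([tree.predict(X) for tree in forest])
      y_pred = (votes.mean(axis=0) > 0.5).astype(int)
      print("Agreement with training labels: {:.2f}".format((y_pred == y).mean()))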

      Random Forests Compared with other Methods on the Cancer Data

      + + + +
      +
      +
      +
      +
      +
      import matplotlib.pyplot as plt
      +import numpy as np
      +from sklearn.model_selection import  train_test_split 
      +from sklearn.datasets import load_breast_cancer
      +from sklearn.svm import SVC
      +from sklearn.linear_model import LogisticRegression
      +from sklearn.tree import DecisionTreeClassifier
      +from sklearn.ensemble import BaggingClassifier
      +
      +# Load the data
      +cancer = load_breast_cancer()
      +
      +X_train, X_test, y_train, y_test = train_test_split(cancer.data,cancer.target,random_state=0)
      +print(X_train.shape)
      +print(X_test.shape)
      +#define methods
      +# Logistic Regression
      +logreg = LogisticRegression(solver='lbfgs')
      +# Support vector machine
      +svm = SVC(gamma='auto', C=100)
      +# Decision Trees
      +deep_tree_clf = DecisionTreeClassifier(max_depth=None)
      +#Scale the data
      +from sklearn.preprocessing import StandardScaler
      +scaler = StandardScaler()
      +scaler.fit(X_train)
      +X_train_scaled = scaler.transform(X_train)
      +X_test_scaled = scaler.transform(X_test)
      +# Logistic Regression
      +logreg.fit(X_train_scaled, y_train)
      +print("Test set accuracy Logistic Regression with scaled data: {:.2f}".format(logreg.score(X_test_scaled,y_test)))
      +# Support Vector Machine
      +svm.fit(X_train_scaled, y_train)
      +print("Test set accuracy SVM with scaled data: {:.2f}".format(logreg.score(X_test_scaled,y_test)))
      +# Decision Trees
      +deep_tree_clf.fit(X_train_scaled, y_train)
      +print("Test set accuracy with Decision Trees and scaled data: {:.2f}".format(deep_tree_clf.score(X_test_scaled,y_test)))
      +
      +
      +from sklearn.ensemble import RandomForestClassifier
      +from sklearn.preprocessing import LabelEncoder
      +from sklearn.model_selection import cross_validate
      +# Data set not specified
      +#Instantiate the model with 500 trees and entropy as the splitting criterion
      +Random_Forest_model = RandomForestClassifier(n_estimators=500,criterion="entropy")
      +Random_Forest_model.fit(X_train_scaled, y_train)
      +#Cross validation
      +accuracy = cross_validate(Random_Forest_model,X_test_scaled,y_test,cv=10)['test_score']
      +print(accuracy)
      +print("Test set accuracy with Random Forests and scaled data: {:.2f}".format(Random_Forest_model.score(X_test_scaled,y_test)))
      +
      +
      +import scikitplot as skplt
      +y_pred = Random_Forest_model.predict(X_test_scaled)
      +skplt.metrics.plot_confusion_matrix(y_test, y_pred, normalize=True)
      +plt.show()
      +y_probas = Random_Forest_model.predict_proba(X_test_scaled)
      +skplt.metrics.plot_roc(y_test, y_probas)
      +plt.show()
      +skplt.metrics.plot_cumulative_gain(y_test, y_probas)
      +plt.show()
      +
      +
      +
      +
      +
      +
      +
      +
      +
      +
      +
      +
      +
      +
      + +

      Recall that the cumulative gains curve shows the percentage of the overall number of cases in a given category gained by targeting a percentage of the total number of cases.

      + +

      Similarly, the receiver operating characteristic curve, or ROC curve, displays the diagnostic ability of a binary classifier system as its discrimination threshold is varied. It plots the true positive rate against the false positive rate.

      +
      + +
      +
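
      As a small editorial illustration (not from the original notes), the ROC construction can be made explicit by sweeping the discrimination threshold by hand; the probabilities y_probas[:, 1] from the code above would serve as the scores:

      import numpy as np

      def roc_points(y_true, scores, thresholds):
          # One (false positive rate, true positive rate) pair per threshold
          y_true = np.asarray(y_true).astype(bool)
          scores = np.asarray(scores)
          fpr, tpr = [], []
          for t in thresholds:
              y_hat = scores >= t
              tpr.append(np.sum(y_hat & y_true) / np.sum(y_true))
              fpr.append(np.sum(y_hat & ~y_true) / np.sum(~y_true))
          return np.array(fpr), np.array(tpr)

      # Example usage with the random forest above:
      # fpr, tpr = roc_points(y_test, y_probas[:, 1], np.linspace(0, 1, 101))
      # plt.plot(fpr, tpr); plt.xlabel("False positive rate"); plt.ylabel("True positive rate"); plt.show()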

      Compare Bagging on Trees with Random Forests

      + + +
      +
      +
      +
      +
      +
      bag_clf = BaggingClassifier(
      +    DecisionTreeClassifier(splitter="random", max_leaf_nodes=16, random_state=42),
      +    n_estimators=500, max_samples=1.0, bootstrap=True, n_jobs=-1, random_state=42)
      +
      +
      +
      +
      +
      +
      +
      +
      +
      +
      +
      +
      +
      + +
      +
      +
      +
      +
      +
      bag_clf.fit(X_train, y_train)
      +y_pred = bag_clf.predict(X_test)
      +from sklearn.ensemble import RandomForestClassifier
      +rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1, random_state=42)
      +rnd_clf.fit(X_train, y_train)
      +y_pred_rf = rnd_clf.predict(X_test)
      +np.sum(y_pred == y_pred_rf) / len(y_pred) 
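
      The last line above reports the fraction of test points on which the two ensembles agree. A fraction close to one indicates that bagging trees grown with splitter="random" behaves much like a random forest: both inject randomness into how the individual trees are grown before aggregating their predictions.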
      +
      +
      +
      +
      +
      +
      +
      +
      +
      +
      +
      +
      +
      +
      +
      +

      Boosting, a Bird's Eye View

      diff --git a/doc/pub/week48/html/week48-solarized.html b/doc/pub/week48/html/week48-solarized.html index 98b14445d..6a80bf332 100644 --- a/doc/pub/week48/html/week48-solarized.html +++ b/doc/pub/week48/html/week48-solarized.html @@ -65,6 +65,18 @@ {'highest level': 2, 'sections': [('Overview of week 48', 2, None, 'overview-of-week-48'), ('Plan for week 47', 2, None, 'plan-for-week-47'), + ('Random Forest Algorithm, reminder from last week', + 2, + None, + 'random-forest-algorithm-reminder-from-last-week'), + ('Random Forests Compared with other Methods on the Cancer Data', + 2, + None, + 'random-forests-compared-with-other-methods-on-the-cancer-data'), + ('Compare Bagging on Trees with Random Forests', + 2, + None, + 'compare-bagging-on-trees-with-random-forests'), ("Boosting, a Bird's Eye View", 2, None, @@ -309,7 +321,7 @@

      Plan for week 47

    9. Work and Discussion of project 3
    10. Last weekly exercise
    11. Lab sessions at usual times.
    12. -
    13. For the week of December 2-6, lab sessions atart at 10am and end 4pm, room FØ434, Tuesday and Wednesday
    14. +
    15. For the week of December 2-6, lab sessions start at 10am and end at 4pm, room FØ434, Tuesday and Wednesday
    16. @@ -322,8 +334,8 @@

      Plan for week 47

    17. Summary of course
    18. Readings and Videos:
        -
      1. These lecture notes at https://github.com/CompPhysics/MachineLearning/blob/master/doc/pub/week47/ipynb/week48.ipynb
      2. -
      3. See also lecture notes from week 47 at https://github.com/CompPhysics/MachineLearning/blob/master/doc/pub/week46/ipynb/week47.ipynb. The lecture on Monday starts with a repetition on AdaBoost before we move over to gradient boosting with examples +
      4. These lecture notes at https://github.com/CompPhysics/MachineLearning/blob/master/doc/pub/week48/ipynb/week48.ipynb
      5. +
      6. See also lecture notes from week 47 at https://github.com/CompPhysics/MachineLearning/blob/master/doc/pub/week47/ipynb/week47.ipynb. The lecture on Monday starts with a repetition on AdaBoost before we move over to gradient boosting with examples
      7. Video on Decision trees https://www.youtube.com/watch?v=RmajweUFKvM&ab_channel=Simplilearn
      8. @@ -336,6 +348,175 @@

        Plan for week 47

        +









        +

        Random Forest Algorithm, reminder from last week

        + +

        The algorithm described here can be applied to both classification and regression problems.

        + +

        We will grow a forest of, say, \( B \) trees.

        +
          +
        1. For \( b=1:B \) +
            +
          1. Draw a bootstrap sample from the training data organized in our \( \boldsymbol{X} \) matrix.
          2. +
          3. We then grow a random forest tree \( T_b \) based on the bootstrapped data by repeating the steps below until the maximum node size is reached
          4. +
              +
            1. we select \( m \le p \) variables at random from the \( p \) predictors/features
            2. +
            3. pick the best split point among the \( m \) features using for example the CART algorithm and create a new node
            4. +
            5. split the node into daughter nodes
            6. +
            +
          +
        2. Then output the ensemble of trees \( \{T_b\}_1^{B} \) and use it to make predictions for either a regression or a classification problem.
        3. +
        +









        +

        Random Forests Compared with other Methods on the Cancer Data

        + + + +
        +
        +
        +
        +
        +
        import matplotlib.pyplot as plt
        +import numpy as np
        +from sklearn.model_selection import  train_test_split 
        +from sklearn.datasets import load_breast_cancer
        +from sklearn.svm import SVC
        +from sklearn.linear_model import LogisticRegression
        +from sklearn.tree import DecisionTreeClassifier
        +from sklearn.ensemble import BaggingClassifier
        +
        +# Load the data
        +cancer = load_breast_cancer()
        +
        +X_train, X_test, y_train, y_test = train_test_split(cancer.data,cancer.target,random_state=0)
        +print(X_train.shape)
        +print(X_test.shape)
        +#define methods
        +# Logistic Regression
        +logreg = LogisticRegression(solver='lbfgs')
        +# Support vector machine
        +svm = SVC(gamma='auto', C=100)
        +# Decision Trees
        +deep_tree_clf = DecisionTreeClassifier(max_depth=None)
        +#Scale the data
        +from sklearn.preprocessing import StandardScaler
        +scaler = StandardScaler()
        +scaler.fit(X_train)
        +X_train_scaled = scaler.transform(X_train)
        +X_test_scaled = scaler.transform(X_test)
        +# Logistic Regression
        +logreg.fit(X_train_scaled, y_train)
        +print("Test set accuracy Logistic Regression with scaled data: {:.2f}".format(logreg.score(X_test_scaled,y_test)))
        +# Support Vector Machine
        +svm.fit(X_train_scaled, y_train)
        +print("Test set accuracy SVM with scaled data: {:.2f}".format(logreg.score(X_test_scaled,y_test)))
        +# Decision Trees
        +deep_tree_clf.fit(X_train_scaled, y_train)
        +print("Test set accuracy with Decision Trees and scaled data: {:.2f}".format(deep_tree_clf.score(X_test_scaled,y_test)))
        +
        +
        +from sklearn.ensemble import RandomForestClassifier
        +from sklearn.preprocessing import LabelEncoder
        +from sklearn.model_selection import cross_validate
        +# Data set not specified
        +#Instantiate the model with 500 trees and entropy as the splitting criterion
        +Random_Forest_model = RandomForestClassifier(n_estimators=500,criterion="entropy")
        +Random_Forest_model.fit(X_train_scaled, y_train)
        +#Cross validation
        +accuracy = cross_validate(Random_Forest_model,X_test_scaled,y_test,cv=10)['test_score']
        +print(accuracy)
        +print("Test set accuracy with Random Forests and scaled data: {:.2f}".format(Random_Forest_model.score(X_test_scaled,y_test)))
        +
        +
        +import scikitplot as skplt
        +y_pred = Random_Forest_model.predict(X_test_scaled)
        +skplt.metrics.plot_confusion_matrix(y_test, y_pred, normalize=True)
        +plt.show()
        +y_probas = Random_Forest_model.predict_proba(X_test_scaled)
        +skplt.metrics.plot_roc(y_test, y_probas)
        +plt.show()
        +skplt.metrics.plot_cumulative_gain(y_test, y_probas)
        +plt.show()
        +
        +
        +
        +
        +
        +
        +
        +
        +
        +
        +
        +
        +
        +
        + +

        Recall that the cumulative gains curve shows the percentage of the overall number of cases in a given category gained by targeting a percentage of the total number of cases.

        + +

        Similarly, the receiver operating characteristic curve, or ROC curve, displays the diagnostic ability of a binary classifier system as its discrimination threshold is varied. It plots the true positive rate against the false positive rate.

        + +









        +

        Compare Bagging on Trees with Random Forests

        + + +
        +
        +
        +
        +
        +
        bag_clf = BaggingClassifier(
        +    DecisionTreeClassifier(splitter="random", max_leaf_nodes=16, random_state=42),
        +    n_estimators=500, max_samples=1.0, bootstrap=True, n_jobs=-1, random_state=42)
        +
        +
        +
        +
        +
        +
        +
        +
        +
        +
        +
        +
        +
        + +
        +
        +
        +
        +
        +
        bag_clf.fit(X_train, y_train)
        +y_pred = bag_clf.predict(X_test)
        +from sklearn.ensemble import RandomForestClassifier
        +rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1, random_state=42)
        +rnd_clf.fit(X_train, y_train)
        +y_pred_rf = rnd_clf.predict(X_test)
        +np.sum(y_pred == y_pred_rf) / len(y_pred) 
        +
        +
        +
        +
        +
        +
        +
        +
        +
        +
        +
        +
        +
        +
        + +









        Boosting, a Bird's Eye View

        diff --git a/doc/pub/week48/html/week48.html b/doc/pub/week48/html/week48.html index 93cdc9f13..ebd73a6cc 100644 --- a/doc/pub/week48/html/week48.html +++ b/doc/pub/week48/html/week48.html @@ -142,6 +142,18 @@ {'highest level': 2, 'sections': [('Overview of week 48', 2, None, 'overview-of-week-48'), ('Plan for week 47', 2, None, 'plan-for-week-47'), + ('Random Forest Algorithm, reminder from last week', + 2, + None, + 'random-forest-algorithm-reminder-from-last-week'), + ('Random Forests Compared with other Methods on the Cancer Data', + 2, + None, + 'random-forests-compared-with-other-methods-on-the-cancer-data'), + ('Compare Bagging on Trees with Random Forests', + 2, + None, + 'compare-bagging-on-trees-with-random-forests'), ("Boosting, a Bird's Eye View", 2, None, @@ -386,7 +398,7 @@

        Plan for week 47

      9. Work and Discussion of project 3
      10. Last weekly exercise
      11. Lab sessions at usual times.
      12. -
      13. For the week of December 2-6, lab sessions atart at 10am and end 4pm, room FØ434, Tuesday and Wednesday
      14. +
      15. For the week of December 2-6, lab sessions start at 10am and end at 4pm, room FØ434, Tuesday and Wednesday
      16. @@ -399,8 +411,8 @@

        Plan for week 47

      17. Summary of course
      18. Readings and Videos:
          -
        1. These lecture notes at https://github.com/CompPhysics/MachineLearning/blob/master/doc/pub/week47/ipynb/week48.ipynb
        2. -
        3. See also lecture notes from week 47 at https://github.com/CompPhysics/MachineLearning/blob/master/doc/pub/week46/ipynb/week47.ipynb. The lecture on Monday starts with a repetition on AdaBoost before we move over to gradient boosting with examples +
        4. These lecture notes at https://github.com/CompPhysics/MachineLearning/blob/master/doc/pub/week48/ipynb/week48.ipynb
        5. +
        6. See also lecture notes from week 47 at https://github.com/CompPhysics/MachineLearning/blob/master/doc/pub/week47/ipynb/week47.ipynb. The lecture on Monday starts with a repetition on AdaBoost before we move over to gradient boosting with examples
        7. Video on Decision trees https://www.youtube.com/watch?v=RmajweUFKvM&ab_channel=Simplilearn
        8. @@ -413,6 +425,175 @@

          Plan for week 47

          +









          +

          Random Forest Algorithm, reminder from last week

          + +

          The algorithm described here can be applied to both classification and regression problems.

          + +

          We will grow a forest of, say, \( B \) trees.

          +
            +
          1. For \( b=1:B \) +
              +
            1. Draw a bootstrap sample from the training data organized in our \( \boldsymbol{X} \) matrix.
            2. +
            3. We then grow a random forest tree \( T_b \) based on the bootstrapped data by repeating the steps below until the maximum node size is reached
            4. +
                +
              1. we select \( m \le p \) variables at random from the \( p \) predictors/features
              2. +
              3. pick the best split point among the \( m \) features using for example the CART algorithm and create a new node
              4. +
              5. split the node into daughter nodes
              6. +
              +
            +
          2. Then output the ensemble of trees \( \{T_b\}_1^{B} \) and use it to make predictions for either a regression or a classification problem.
          3. +
          +









          +

          Random Forests Compared with other Methods on the Cancer Data

          + + + +
          +
          +
          +
          +
          +
          import matplotlib.pyplot as plt
          +import numpy as np
          +from sklearn.model_selection import  train_test_split 
          +from sklearn.datasets import load_breast_cancer
          +from sklearn.svm import SVC
          +from sklearn.linear_model import LogisticRegression
          +from sklearn.tree import DecisionTreeClassifier
          +from sklearn.ensemble import BaggingClassifier
          +
          +# Load the data
          +cancer = load_breast_cancer()
          +
          +X_train, X_test, y_train, y_test = train_test_split(cancer.data,cancer.target,random_state=0)
          +print(X_train.shape)
          +print(X_test.shape)
          +#define methods
          +# Logistic Regression
          +logreg = LogisticRegression(solver='lbfgs')
          +# Support vector machine
          +svm = SVC(gamma='auto', C=100)
          +# Decision Trees
          +deep_tree_clf = DecisionTreeClassifier(max_depth=None)
          +#Scale the data
          +from sklearn.preprocessing import StandardScaler
          +scaler = StandardScaler()
          +scaler.fit(X_train)
          +X_train_scaled = scaler.transform(X_train)
          +X_test_scaled = scaler.transform(X_test)
          +# Logistic Regression
          +logreg.fit(X_train_scaled, y_train)
          +print("Test set accuracy Logistic Regression with scaled data: {:.2f}".format(logreg.score(X_test_scaled,y_test)))
          +# Support Vector Machine
          +svm.fit(X_train_scaled, y_train)
          +print("Test set accuracy SVM with scaled data: {:.2f}".format(logreg.score(X_test_scaled,y_test)))
          +# Decision Trees
          +deep_tree_clf.fit(X_train_scaled, y_train)
          +print("Test set accuracy with Decision Trees and scaled data: {:.2f}".format(deep_tree_clf.score(X_test_scaled,y_test)))
          +
          +
          +from sklearn.ensemble import RandomForestClassifier
          +from sklearn.preprocessing import LabelEncoder
          +from sklearn.model_selection import cross_validate
          +# Data set not specified
          +#Instantiate the model with 500 trees and entropy as the splitting criterion
          +Random_Forest_model = RandomForestClassifier(n_estimators=500,criterion="entropy")
          +Random_Forest_model.fit(X_train_scaled, y_train)
          +#Cross validation
          +accuracy = cross_validate(Random_Forest_model,X_test_scaled,y_test,cv=10)['test_score']
          +print(accuracy)
          +print("Test set accuracy with Random Forests and scaled data: {:.2f}".format(Random_Forest_model.score(X_test_scaled,y_test)))
          +
          +
          +import scikitplot as skplt
          +y_pred = Random_Forest_model.predict(X_test_scaled)
          +skplt.metrics.plot_confusion_matrix(y_test, y_pred, normalize=True)
          +plt.show()
          +y_probas = Random_Forest_model.predict_proba(X_test_scaled)
          +skplt.metrics.plot_roc(y_test, y_probas)
          +plt.show()
          +skplt.metrics.plot_cumulative_gain(y_test, y_probas)
          +plt.show()
          +
          +
          +
          +
          +
          +
          +
          +
          +
          +
          +
          +
          +
          +
          + +

          Recall that the cumulative gains curve shows the percentage of the overall number of cases in a given category gained by targeting a percentage of the total number of cases.

          + +

          Similarly, the receiver operating characteristic curve, or ROC curve, displays the diagnostic ability of a binary classifier system as its discrimination threshold is varied. It plots the true positive rate against the false positive rate.

          + +









          +

          Compare Bagging on Trees with Random Forests

          + + +
          +
          +
          +
          +
          +
          bag_clf = BaggingClassifier(
          +    DecisionTreeClassifier(splitter="random", max_leaf_nodes=16, random_state=42),
          +    n_estimators=500, max_samples=1.0, bootstrap=True, n_jobs=-1, random_state=42)
          +
          +
          +
          +
          +
          +
          +
          +
          +
          +
          +
          +
          +
          + +
          +
          +
          +
          +
          +
          bag_clf.fit(X_train, y_train)
          +y_pred = bag_clf.predict(X_test)
          +from sklearn.ensemble import RandomForestClassifier
          +rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1, random_state=42)
          +rnd_clf.fit(X_train, y_train)
          +y_pred_rf = rnd_clf.predict(X_test)
          +np.sum(y_pred == y_pred_rf) / len(y_pred) 
          +
          +
          +
          +
          +
          +
          +
          +
          +
          +
          +
          +
          +
          +
          + +









          Boosting, a Bird's Eye View

diff --git a/doc/pub/week48/ipynb/ipynb-week48-src.tar.gz b/doc/pub/week48/ipynb/ipynb-week48-src.tar.gz index 581bc0935..8526beceb 100644 Binary files a/doc/pub/week48/ipynb/ipynb-week48-src.tar.gz and b/doc/pub/week48/ipynb/ipynb-week48-src.tar.gz differ diff --git a/doc/pub/week48/ipynb/week48.ipynb b/doc/pub/week48/ipynb/week48.ipynb index 5b6adc95a..baa902728 100644 --- a/doc/pub/week48/ipynb/week48.ipynb +++ b/doc/pub/week48/ipynb/week48.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "a39ebc71", + "id": "b8e98ac0", "metadata": { "editable": true }, @@ -14,7 +14,7 @@ }, { "cell_type": "markdown", - "id": "1e7972ed", + "id": "12cb369c", "metadata": { "editable": true }, @@ -29,7 +29,7 @@ }, { "cell_type": "markdown", - "id": "4e7d57ca", + "id": "25b1691e", "metadata": { "editable": true }, @@ -39,7 +39,7 @@ }, { "cell_type": "markdown", - "id": "208cc80b", + "id": "c213ed56", "metadata": { "editable": true }, @@ -54,7 +54,7 @@ "\n", " * Lab sessions at usual times.\n", "\n", - " * For the week of December 2-6, lab sessions atart at 10am and end 4pm, room FØ434, Tuesday and Wednesday\n", + " * For the week of December 2-6, lab sessions start at 10am and end at 4pm, room FØ434, Tuesday and Wednesday\n", "\n", " \n", "\n", @@ -66,9 +66,9 @@ "\n", "3. Readings and Videos:\n", "\n", - "a. These lecture notes at https://github.com/CompPhysics/MachineLearning/blob/master/doc/pub/week47/ipynb/week48.ipynb\n", + "a. These lecture notes at https://github.com/CompPhysics/MachineLearning/blob/master/doc/pub/week48/ipynb/week48.ipynb\n", "\n", - "b. See also lecture notes from week 47 at https://github.com/CompPhysics/MachineLearning/blob/master/doc/pub/week46/ipynb/week47.ipynb. The lecture on Monday starts with a repetition on AdaBoost before we move over to gradient boosting with examples\n", + "b. See also lecture notes from week 47 at https://github.com/CompPhysics/MachineLearning/blob/master/doc/pub/week47/ipynb/week47.ipynb. The lecture on Monday starts with a repetition on AdaBoost before we move over to gradient boosting with examples\n", "\n", "\n", "\n", @@ -85,7 +85,179 @@ }, { "cell_type": "markdown", - "id": "26deb357", + "id": "c3891e8a", + "metadata": { + "editable": true + }, + "source": [ + "## Random Forest Algorithm, reminder from last week\n", + "\n", + "The algorithm described here can be applied to both classification and regression problems.\n", + "\n", + "We will grow a forest of, say, $B$ trees.\n", + "1. For $b=1:B$\n", + "\n", + "a. Draw a bootstrap sample from the training data organized in our $\\boldsymbol{X}$ matrix.\n", + "\n", + "b. We then grow a random forest tree $T_b$ based on the bootstrapped data by repeating the steps below until the maximum node size is reached\n", + "\n", + "1. we select $m \\le p$ variables at random from the $p$ predictors/features\n", + "\n", + "2. pick the best split point among the $m$ features using for example the CART algorithm and create a new node\n", + "\n", + "3. split the node into daughter nodes\n", + "\n", + "4. Then output the ensemble of trees $\\{T_b\\}_1^{B}$ and use it to make predictions for either a regression or a classification problem."
+ ] + }, + { + "cell_type": "markdown", + "id": "dbe2476d", + "metadata": { + "editable": true + }, + "source": [ + "## Random Forests Compared with other Methods on the Cancer Data" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "d03f9475", + "metadata": { + "collapsed": false, + "editable": true + }, + "outputs": [], + "source": [ + "%matplotlib inline\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "from sklearn.model_selection import train_test_split \n", + "from sklearn.datasets import load_breast_cancer\n", + "from sklearn.svm import SVC\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.tree import DecisionTreeClassifier\n", + "from sklearn.ensemble import BaggingClassifier\n", + "\n", + "# Load the data\n", + "cancer = load_breast_cancer()\n", + "\n", + "X_train, X_test, y_train, y_test = train_test_split(cancer.data,cancer.target,random_state=0)\n", + "print(X_train.shape)\n", + "print(X_test.shape)\n", + "#define methods\n", + "# Logistic Regression\n", + "logreg = LogisticRegression(solver='lbfgs')\n", + "# Support vector machine\n", + "svm = SVC(gamma='auto', C=100)\n", + "# Decision Trees\n", + "deep_tree_clf = DecisionTreeClassifier(max_depth=None)\n", + "#Scale the data\n", + "from sklearn.preprocessing import StandardScaler\n", + "scaler = StandardScaler()\n", + "scaler.fit(X_train)\n", + "X_train_scaled = scaler.transform(X_train)\n", + "X_test_scaled = scaler.transform(X_test)\n", + "# Logistic Regression\n", + "logreg.fit(X_train_scaled, y_train)\n", + "print(\"Test set accuracy Logistic Regression with scaled data: {:.2f}\".format(logreg.score(X_test_scaled,y_test)))\n", + "# Support Vector Machine\n", + "svm.fit(X_train_scaled, y_train)\n", + "print(\"Test set accuracy SVM with scaled data: {:.2f}\".format(svm.score(X_test_scaled,y_test)))\n", + "# Decision Trees\n", + "deep_tree_clf.fit(X_train_scaled, y_train)\n", + "print(\"Test set accuracy with Decision Trees and scaled data: {:.2f}\".format(deep_tree_clf.score(X_test_scaled,y_test)))\n", + "\n", + "\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.preprocessing import LabelEncoder\n", + "from sklearn.model_selection import cross_validate\n", + "# Data set not specified\n", + "#Instantiate the model with 500 trees and entropy as the splitting criterion\n", + "Random_Forest_model = RandomForestClassifier(n_estimators=500,criterion=\"entropy\")\n", + "Random_Forest_model.fit(X_train_scaled, y_train)\n", + "#Cross validation\n", + "accuracy = cross_validate(Random_Forest_model,X_test_scaled,y_test,cv=10)['test_score']\n", + "print(accuracy)\n", + "print(\"Test set accuracy with Random Forests and scaled data: {:.2f}\".format(Random_Forest_model.score(X_test_scaled,y_test)))\n", + "\n", + "\n", + "import scikitplot as skplt\n", + "y_pred = Random_Forest_model.predict(X_test_scaled)\n", + "skplt.metrics.plot_confusion_matrix(y_test, y_pred, normalize=True)\n", + "plt.show()\n", + "y_probas = Random_Forest_model.predict_proba(X_test_scaled)\n", + "skplt.metrics.plot_roc(y_test, y_probas)\n", + "plt.show()\n", + "skplt.metrics.plot_cumulative_gain(y_test, y_probas)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "6a5a918f", + "metadata": { + "editable": true + }, + "source": [ + "Recall that the cumulative gains curve shows the percentage of the\n", + "overall number of cases in a given category *gained* by targeting a\n", + "percentage of the total number of cases.\n", + "\n", + "Similarly, 
the receiver operating characteristic curve, or ROC curve,\n", + "displays the diagnostic ability of a binary classifier system as its\n", + "discrimination threshold is varied. It plots the true positive rate against the false positive rate." + ] + }, + { + "cell_type": "markdown", + "id": "fef4d0d0", + "metadata": { + "editable": true + }, + "source": [ + "## Compare Bagging on Trees with Random Forests" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "38b4f201", + "metadata": { + "collapsed": false, + "editable": true + }, + "outputs": [], + "source": [ + "bag_clf = BaggingClassifier(\n", + " DecisionTreeClassifier(splitter=\"random\", max_leaf_nodes=16, random_state=42),\n", + " n_estimators=500, max_samples=1.0, bootstrap=True, n_jobs=-1, random_state=42)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "8c186195", + "metadata": { + "collapsed": false, + "editable": true + }, + "outputs": [], + "source": [ + "bag_clf.fit(X_train, y_train)\n", + "y_pred = bag_clf.predict(X_test)\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1, random_state=42)\n", + "rnd_clf.fit(X_train, y_train)\n", + "y_pred_rf = rnd_clf.predict(X_test)\n", + "np.sum(y_pred == y_pred_rf) / len(y_pred)" + ] + }, + { + "cell_type": "markdown", + "id": "6a6f19a9", "metadata": { "editable": true }, @@ -105,7 +277,7 @@ }, { "cell_type": "markdown", - "id": "7a2f8226", + "id": "ea3a24e9", "metadata": { "editable": true }, @@ -119,7 +291,7 @@ }, { "cell_type": "markdown", - "id": "dc77215d", + "id": "1a3c318c", "metadata": { "editable": true }, @@ -131,7 +303,7 @@ }, { "cell_type": "markdown", - "id": "f85044fd", + "id": "e16cdd0d", "metadata": { "editable": true }, @@ -148,7 +320,7 @@ }, { "cell_type": "markdown", - "id": "0203e8af", + "id": "8d3ad320", "metadata": { "editable": true }, @@ -160,7 +332,7 @@ }, { "cell_type": "markdown", - "id": "cf1ed563", + "id": "fd25b1e3", "metadata": { "editable": true }, @@ -174,7 +346,7 @@ }, { "cell_type": "markdown", - "id": "e6bf0fad", + "id": "10404a9e", "metadata": { "editable": true }, @@ -186,7 +358,7 @@ }, { "cell_type": "markdown", - "id": "8130f888", + "id": "124bd4cf", "metadata": { "editable": true }, @@ -199,7 +371,7 @@ }, { "cell_type": "markdown", - "id": "6c2724ee", + "id": "8fb8c197", "metadata": { "editable": true }, @@ -211,7 +383,7 @@ }, { "cell_type": "markdown", - "id": "02ee12e9", + "id": "ced4e7f4", "metadata": { "editable": true }, @@ -221,7 +393,7 @@ }, { "cell_type": "markdown", - "id": "b79767ef", + "id": "161eec23", "metadata": { "editable": true }, @@ -249,7 +421,7 @@ }, { "cell_type": "markdown", - "id": "9af520b1", + "id": "a1964067", "metadata": { "editable": true }, @@ -265,7 +437,7 @@ }, { "cell_type": "markdown", - "id": "bde4df04", + "id": "1933dc63", "metadata": { "editable": true }, @@ -277,7 +449,7 @@ }, { "cell_type": "markdown", - "id": "fbd18e93", + "id": "d932cfbd", "metadata": { "editable": true }, @@ -288,7 +460,7 @@ }, { "cell_type": "markdown", - "id": "aa97ee02", + "id": "24f7a02a", "metadata": { "editable": true }, @@ -300,7 +472,7 @@ }, { "cell_type": "markdown", - "id": "d2aa39e9", + "id": "6eb9e426", "metadata": { "editable": true }, @@ -310,7 +482,7 @@ }, { "cell_type": "markdown", - "id": "1e3955c2", + "id": "0a48ddf7", "metadata": { "editable": true }, @@ -322,7 +494,7 @@ }, { "cell_type": "markdown", - "id": "372e44b9", + "id": "d0087ffb", "metadata": { "editable": true }, @@ -332,7 +504,7 @@ 
}, { "cell_type": "markdown", - "id": "19c28d10", + "id": "b455ce5e", "metadata": { "editable": true }, @@ -344,7 +516,7 @@ }, { "cell_type": "markdown", - "id": "27059383", + "id": "5aeac1a5", "metadata": { "editable": true }, @@ -354,7 +526,7 @@ }, { "cell_type": "markdown", - "id": "6fb75d7d", + "id": "86dfaddf", "metadata": { "editable": true }, @@ -366,7 +538,7 @@ }, { "cell_type": "markdown", - "id": "db81a8d2", + "id": "4826590f", "metadata": { "editable": true }, @@ -380,7 +552,7 @@ }, { "cell_type": "markdown", - "id": "c005776e", + "id": "3de0828a", "metadata": { "editable": true }, @@ -396,7 +568,7 @@ }, { "cell_type": "markdown", - "id": "41b295d1", + "id": "961265b1", "metadata": { "editable": true }, @@ -408,7 +580,7 @@ }, { "cell_type": "markdown", - "id": "4e153aad", + "id": "a74db275", "metadata": { "editable": true }, @@ -424,7 +596,7 @@ }, { "cell_type": "markdown", - "id": "b507d423", + "id": "81d27911", "metadata": { "editable": true }, @@ -436,7 +608,7 @@ }, { "cell_type": "markdown", - "id": "c1293602", + "id": "3b7dbe21", "metadata": { "editable": true }, @@ -446,7 +618,7 @@ }, { "cell_type": "markdown", - "id": "ee91ca92", + "id": "e664eec9", "metadata": { "editable": true }, @@ -458,7 +630,7 @@ }, { "cell_type": "markdown", - "id": "18cda998", + "id": "50fe52a0", "metadata": { "editable": true }, @@ -470,7 +642,7 @@ }, { "cell_type": "markdown", - "id": "f4882409", + "id": "a4cd6cb0", "metadata": { "editable": true }, @@ -482,7 +654,7 @@ }, { "cell_type": "markdown", - "id": "76865a9d", + "id": "116d1038", "metadata": { "editable": true }, @@ -493,7 +665,7 @@ }, { "cell_type": "markdown", - "id": "00f40cd2", + "id": "690f6882", "metadata": { "editable": true }, @@ -505,7 +677,7 @@ }, { "cell_type": "markdown", - "id": "8e2ecc8f", + "id": "436d0a30", "metadata": { "editable": true }, @@ -516,7 +688,7 @@ }, { "cell_type": "markdown", - "id": "6b366af8", + "id": "779a3214", "metadata": { "editable": true }, @@ -528,7 +700,7 @@ }, { "cell_type": "markdown", - "id": "3a1a1034", + "id": "baf4ef97", "metadata": { "editable": true }, @@ -538,7 +710,7 @@ }, { "cell_type": "markdown", - "id": "58fb6fa7", + "id": "819d9a45", "metadata": { "editable": true }, @@ -550,7 +722,7 @@ }, { "cell_type": "markdown", - "id": "eaee6ea7", + "id": "9de7ba47", "metadata": { "editable": true }, @@ -562,7 +734,7 @@ }, { "cell_type": "markdown", - "id": "6de435fd", + "id": "3130e2cc", "metadata": { "editable": true }, @@ -574,7 +746,7 @@ }, { "cell_type": "markdown", - "id": "1139bd2b", + "id": "88bee939", "metadata": { "editable": true }, @@ -586,7 +758,7 @@ }, { "cell_type": "markdown", - "id": "bfe4569a", + "id": "81de3757", "metadata": { "editable": true }, @@ -596,7 +768,7 @@ }, { "cell_type": "markdown", - "id": "13d0b962", + "id": "b1897c2e", "metadata": { "editable": true }, @@ -608,7 +780,7 @@ }, { "cell_type": "markdown", - "id": "c46749a8", + "id": "eee8697d", "metadata": { "editable": true }, @@ -618,7 +790,7 @@ }, { "cell_type": "markdown", - "id": "fb9707b0", + "id": "56355f2a", "metadata": { "editable": true }, @@ -630,7 +802,7 @@ }, { "cell_type": "markdown", - "id": "7599d5c4", + "id": "88a20b27", "metadata": { "editable": true }, @@ -640,7 +812,7 @@ }, { "cell_type": "markdown", - "id": "6be2f8e6", + "id": "2a02a6d7", "metadata": { "editable": true }, @@ -652,7 +824,7 @@ }, { "cell_type": "markdown", - "id": "f7519116", + "id": "816d8d8f", "metadata": { "editable": true }, @@ -662,7 +834,7 @@ }, { "cell_type": "markdown", - "id": "154c4a3a", + "id": "a5a820fc", "metadata": 
{ "editable": true }, @@ -674,7 +846,7 @@ }, { "cell_type": "markdown", - "id": "39a20802", + "id": "6517fb85", "metadata": { "editable": true }, @@ -684,7 +856,7 @@ }, { "cell_type": "markdown", - "id": "efb002ef", + "id": "4a3c7a45", "metadata": { "editable": true }, @@ -696,7 +868,7 @@ }, { "cell_type": "markdown", - "id": "9b9cf35a", + "id": "d295d135", "metadata": { "editable": true }, @@ -716,7 +888,7 @@ }, { "cell_type": "markdown", - "id": "c72455a5", + "id": "710b2ab1", "metadata": { "editable": true }, @@ -728,7 +900,7 @@ }, { "cell_type": "markdown", - "id": "a76d98a0", + "id": "1e153c11", "metadata": { "editable": true }, @@ -738,7 +910,7 @@ }, { "cell_type": "markdown", - "id": "2d094d5e", + "id": "7ca4618e", "metadata": { "editable": true }, @@ -754,7 +926,7 @@ }, { "cell_type": "markdown", - "id": "1c4386fd", + "id": "ce93cd0f", "metadata": { "editable": true }, @@ -766,7 +938,7 @@ }, { "cell_type": "markdown", - "id": "9d052a68", + "id": "3405f033", "metadata": { "editable": true }, @@ -794,7 +966,7 @@ }, { "cell_type": "markdown", - "id": "ea5f4148", + "id": "4e254d44", "metadata": { "editable": true }, @@ -806,8 +978,8 @@ }, { "cell_type": "code", - "execution_count": 1, - "id": "9808c49a", + "execution_count": 4, + "id": "a2372724", "metadata": { "collapsed": false, "editable": true @@ -832,7 +1004,7 @@ }, { "cell_type": "markdown", - "id": "7fce1041", + "id": "42c1e9ee", "metadata": { "editable": true }, @@ -842,8 +1014,8 @@ }, { "cell_type": "code", - "execution_count": 2, - "id": "6f645036", + "execution_count": 5, + "id": "bad891e4", "metadata": { "collapsed": false, "editable": true @@ -932,7 +1104,7 @@ }, { "cell_type": "markdown", - "id": "7ba94acc", + "id": "87c499dc", "metadata": { "editable": true }, @@ -950,7 +1122,7 @@ }, { "cell_type": "markdown", - "id": "01cc8969", + "id": "d78bf458", "metadata": { "editable": true }, @@ -963,7 +1135,7 @@ }, { "cell_type": "markdown", - "id": "792b01b0", + "id": "12318d85", "metadata": { "editable": true }, @@ -975,7 +1147,7 @@ }, { "cell_type": "markdown", - "id": "7b248aa2", + "id": "2d96597d", "metadata": { "editable": true }, @@ -985,7 +1157,7 @@ }, { "cell_type": "markdown", - "id": "2e9c37b8", + "id": "ca649b47", "metadata": { "editable": true }, @@ -997,7 +1169,7 @@ }, { "cell_type": "markdown", - "id": "2e4ce635", + "id": "f3e57f43", "metadata": { "editable": true }, @@ -1007,7 +1179,7 @@ }, { "cell_type": "markdown", - "id": "9b82b90d", + "id": "d8cc18d6", "metadata": { "editable": true }, @@ -1019,7 +1191,7 @@ }, { "cell_type": "markdown", - "id": "00cdf85c", + "id": "43d98194", "metadata": { "editable": true }, @@ -1032,7 +1204,7 @@ }, { "cell_type": "markdown", - "id": "75879888", + "id": "b35a4881", "metadata": { "editable": true }, @@ -1044,7 +1216,7 @@ }, { "cell_type": "markdown", - "id": "7df5cbe2", + "id": "b058af39", "metadata": { "editable": true }, @@ -1056,7 +1228,7 @@ }, { "cell_type": "markdown", - "id": "ed39df0e", + "id": "3ad897cf", "metadata": { "editable": true }, @@ -1068,7 +1240,7 @@ }, { "cell_type": "markdown", - "id": "17a6c806", + "id": "bd31fcbe", "metadata": { "editable": true }, @@ -1078,7 +1250,7 @@ }, { "cell_type": "markdown", - "id": "4853b938", + "id": "f8376f29", "metadata": { "editable": true }, @@ -1090,7 +1262,7 @@ }, { "cell_type": "markdown", - "id": "e48f6eff", + "id": "be665473", "metadata": { "editable": true }, @@ -1100,7 +1272,7 @@ }, { "cell_type": "markdown", - "id": "8e16d35d", + "id": "e1091a41", "metadata": { "editable": true }, @@ -1116,7 +1288,7 @@ }, { 
"cell_type": "markdown", - "id": "8582402a", + "id": "e84e0298", "metadata": { "editable": true }, @@ -1128,7 +1300,7 @@ }, { "cell_type": "markdown", - "id": "f3affd6c", + "id": "c21417da", "metadata": { "editable": true }, @@ -1149,7 +1321,7 @@ }, { "cell_type": "markdown", - "id": "0e7145f7", + "id": "5525cf60", "metadata": { "editable": true }, @@ -1159,16 +1331,14 @@ }, { "cell_type": "code", - "execution_count": 3, - "id": "94ef427d", + "execution_count": 6, + "id": "a06c6902", "metadata": { "collapsed": false, "editable": true }, "outputs": [], "source": [ - "%matplotlib inline\n", - "\n", "import matplotlib.pyplot as plt\n", "import numpy as np\n", "from sklearn.model_selection import train_test_split\n", @@ -1214,7 +1384,7 @@ }, { "cell_type": "markdown", - "id": "962c417a", + "id": "b515314f", "metadata": { "editable": true }, @@ -1224,8 +1394,8 @@ }, { "cell_type": "code", - "execution_count": 4, - "id": "595aada5", + "execution_count": 7, + "id": "4231d351", "metadata": { "collapsed": false, "editable": true @@ -1276,7 +1446,7 @@ }, { "cell_type": "markdown", - "id": "e83be453", + "id": "121a1d8a", "metadata": { "editable": true }, @@ -1299,7 +1469,7 @@ }, { "cell_type": "markdown", - "id": "8d3d5790", + "id": "a9c5f373", "metadata": { "editable": true }, @@ -1309,8 +1479,8 @@ }, { "cell_type": "code", - "execution_count": 5, - "id": "34e94a90", + "execution_count": 8, + "id": "16f56e04", "metadata": { "collapsed": false, "editable": true @@ -1362,7 +1532,7 @@ }, { "cell_type": "markdown", - "id": "21b9da8e", + "id": "af8e732d", "metadata": { "editable": true }, @@ -1374,8 +1544,8 @@ }, { "cell_type": "code", - "execution_count": 6, - "id": "249fe2dc", + "execution_count": 9, + "id": "b63afee6", "metadata": { "collapsed": false, "editable": true @@ -1438,7 +1608,7 @@ }, { "cell_type": "markdown", - "id": "7f908d19", + "id": "84c9ab3e", "metadata": { "editable": true }, @@ -1448,8 +1618,8 @@ }, { "cell_type": "code", - "execution_count": 7, - "id": "a03ed5b7", + "execution_count": 10, + "id": "01952655", "metadata": { "collapsed": false, "editable": true @@ -1536,7 +1706,7 @@ }, { "cell_type": "markdown", - "id": "05b8f3c7", + "id": "9a762674", "metadata": { "editable": true }, @@ -1546,7 +1716,7 @@ }, { "cell_type": "markdown", - "id": "9c4496bd", + "id": "1b17200a", "metadata": { "editable": true }, @@ -1561,7 +1731,7 @@ }, { "cell_type": "markdown", - "id": "e81bb354", + "id": "4acb7811", "metadata": { "editable": true }, @@ -1577,7 +1747,7 @@ }, { "cell_type": "markdown", - "id": "969bddcc", + "id": "6b2f2ccd", "metadata": { "editable": true }, @@ -1602,7 +1772,7 @@ }, { "cell_type": "markdown", - "id": "d4949b85", + "id": "e6bdbe75", "metadata": { "editable": true }, @@ -1649,7 +1819,7 @@ }, { "cell_type": "markdown", - "id": "45cb9fdd", + "id": "5900b979", "metadata": { "editable": true }, @@ -1684,7 +1854,7 @@ }, { "cell_type": "markdown", - "id": "922f087a", + "id": "97bf13a4", "metadata": { "editable": true }, @@ -1707,7 +1877,7 @@ }, { "cell_type": "markdown", - "id": "5215e2cc", + "id": "1e152734", "metadata": { "editable": true }, @@ -1730,7 +1900,7 @@ }, { "cell_type": "markdown", - "id": "b47badea", + "id": "1ed0e449", "metadata": { "editable": true }, @@ -1750,7 +1920,7 @@ }, { "cell_type": "markdown", - "id": "db0ba96d", + "id": "b24640ed", "metadata": { "editable": true }, @@ -1766,7 +1936,7 @@ }, { "cell_type": "markdown", - "id": "3ce90bd6", + "id": "33957222", "metadata": { "editable": true }, @@ -1794,7 +1964,7 @@ }, { "cell_type": "markdown", - "id": 
"3580f610", + "id": "56ba1280", "metadata": { "editable": true }, @@ -1816,7 +1986,7 @@ }, { "cell_type": "markdown", - "id": "3996d3be", + "id": "fb062869", "metadata": { "editable": true }, @@ -1841,7 +2011,7 @@ }, { "cell_type": "markdown", - "id": "97989551", + "id": "808e4541", "metadata": { "editable": true }, @@ -1859,7 +2029,7 @@ }, { "cell_type": "markdown", - "id": "c347f2fe", + "id": "1ac3860d", "metadata": { "editable": true }, @@ -1889,7 +2059,7 @@ }, { "cell_type": "markdown", - "id": "44e9af30", + "id": "c9bead32", "metadata": { "editable": true }, @@ -1903,7 +2073,7 @@ }, { "cell_type": "markdown", - "id": "8077d8fa", + "id": "982d4930", "metadata": { "editable": true }, @@ -1929,7 +2099,7 @@ }, { "cell_type": "markdown", - "id": "07aa4c82", + "id": "369bc8e7", "metadata": { "editable": true }, @@ -1958,7 +2128,7 @@ }, { "cell_type": "markdown", - "id": "78c3e014", + "id": "539550ba", "metadata": { "editable": true }, @@ -1975,7 +2145,7 @@ }, { "cell_type": "markdown", - "id": "9323bb0e", + "id": "45a3026e", "metadata": { "editable": true }, @@ -1997,7 +2167,7 @@ }, { "cell_type": "markdown", - "id": "75509bf3", + "id": "3ba33ca6", "metadata": { "editable": true }, @@ -2015,7 +2185,7 @@ }, { "cell_type": "markdown", - "id": "35674c96", + "id": "512af4a3", "metadata": { "editable": true }, @@ -2038,7 +2208,7 @@ }, { "cell_type": "markdown", - "id": "29f53333", + "id": "bafc70a8", "metadata": { "editable": true }, @@ -2057,7 +2227,7 @@ }, { "cell_type": "markdown", - "id": "5dbba68a", + "id": "c8db3bb3", "metadata": { "editable": true }, @@ -2073,7 +2243,7 @@ }, { "cell_type": "markdown", - "id": "6828d447", + "id": "19a62799", "metadata": { "editable": true }, @@ -2088,7 +2258,7 @@ }, { "cell_type": "markdown", - "id": "cdf3f161", + "id": "dee23d4a", "metadata": { "editable": true }, @@ -2112,7 +2282,7 @@ }, { "cell_type": "markdown", - "id": "72c3fb89", + "id": "27b6fe1b", "metadata": { "editable": true }, @@ -2124,7 +2294,7 @@ }, { "cell_type": "markdown", - "id": "37629ea0", + "id": "671ebf4b", "metadata": { "editable": true }, @@ -2142,7 +2312,7 @@ }, { "cell_type": "markdown", - "id": "98aab4d7", + "id": "1fe52803", "metadata": { "editable": true }, @@ -2152,7 +2322,7 @@ }, { "cell_type": "markdown", - "id": "87b99c69", + "id": "5a2a9f51", "metadata": { "editable": true }, @@ -2170,7 +2340,7 @@ }, { "cell_type": "markdown", - "id": "104bf3d9", + "id": "bad1a098", "metadata": { "editable": true }, @@ -2180,7 +2350,7 @@ }, { "cell_type": "markdown", - "id": "dd8e8532", + "id": "942d899e", "metadata": { "editable": true }, @@ -2199,7 +2369,7 @@ }, { "cell_type": "markdown", - "id": "0fcf899e", + "id": "78dc7403", "metadata": { "editable": true }, @@ -2211,7 +2381,7 @@ }, { "cell_type": "markdown", - "id": "73aab9a5", + "id": "7c2bfde9", "metadata": { "editable": true }, @@ -2225,7 +2395,7 @@ }, { "cell_type": "markdown", - "id": "c60f69c7", + "id": "2f152023", "metadata": { "editable": true }, @@ -2240,7 +2410,7 @@ }, { "cell_type": "markdown", - "id": "4e511fae", + "id": "70f4a4a4", "metadata": { "editable": true }, @@ -2258,7 +2428,7 @@ }, { "cell_type": "markdown", - "id": "fa972f2d", + "id": "0dc78848", "metadata": { "editable": true }, @@ -2272,7 +2442,7 @@ }, { "cell_type": "markdown", - "id": "8a994f1a", + "id": "831b279e", "metadata": { "editable": true }, @@ -2290,7 +2460,7 @@ }, { "cell_type": "markdown", - "id": "03b5397c", + "id": "8b2e5067", "metadata": { "editable": true }, @@ -2314,7 +2484,7 @@ }, { "cell_type": "markdown", - "id": "45018dc5", + "id": 
"5a7fd9c4", "metadata": { "editable": true }, @@ -2352,7 +2522,7 @@ }, { "cell_type": "markdown", - "id": "b2cb45a9", + "id": "3fff9e22", "metadata": { "editable": true }, @@ -2375,7 +2545,7 @@ }, { "cell_type": "markdown", - "id": "3865af47", + "id": "06e08aa9", "metadata": { "editable": true }, @@ -2411,7 +2581,7 @@ }, { "cell_type": "markdown", - "id": "6f19b509", + "id": "564bb5de", "metadata": { "editable": true }, @@ -2432,7 +2602,7 @@ }, { "cell_type": "markdown", - "id": "9343ee7f", + "id": "a50184f4", "metadata": { "editable": true }, @@ -2454,7 +2624,7 @@ }, { "cell_type": "markdown", - "id": "b2dd9738", + "id": "0abcc34b", "metadata": { "editable": true }, @@ -2474,7 +2644,7 @@ }, { "cell_type": "markdown", - "id": "b9be3dbc", + "id": "8d2c9af4", "metadata": { "editable": true }, @@ -2489,7 +2659,7 @@ }, { "cell_type": "markdown", - "id": "8cbdca7e", + "id": "f477a76f", "metadata": { "editable": true }, @@ -2507,7 +2677,7 @@ }, { "cell_type": "markdown", - "id": "537082f4", + "id": "6155600c", "metadata": { "editable": true }, @@ -2537,7 +2707,7 @@ }, { "cell_type": "markdown", - "id": "db559f68", + "id": "83336d35", "metadata": { "editable": true }, @@ -2566,7 +2736,7 @@ }, { "cell_type": "markdown", - "id": "ac53848b", + "id": "99eb3e53", "metadata": { "editable": true }, @@ -2597,7 +2767,7 @@ }, { "cell_type": "markdown", - "id": "d8d03d63", + "id": "283b9930", "metadata": { "editable": true }, @@ -2620,7 +2790,7 @@ }, { "cell_type": "markdown", - "id": "e3f5660d", + "id": "defb65a0", "metadata": { "editable": true }, @@ -2638,7 +2808,7 @@ }, { "cell_type": "markdown", - "id": "b8d80de2", + "id": "027093b9", "metadata": { "editable": true }, @@ -2661,7 +2831,7 @@ }, { "cell_type": "markdown", - "id": "de54e1e2", + "id": "78a77435", "metadata": { "editable": true }, @@ -2682,7 +2852,7 @@ }, { "cell_type": "markdown", - "id": "be906a03", + "id": "500291ea", "metadata": { "editable": true }, @@ -2698,7 +2868,7 @@ }, { "cell_type": "markdown", - "id": "d2090cb2", + "id": "62d8538b", "metadata": { "editable": true }, diff --git a/doc/src/week48/week48.do.txt b/doc/src/week48/week48.do.txt index 7608590eb..02de1f006 100644 --- a/doc/src/week48/week48.do.txt +++ b/doc/src/week48/week48.do.txt @@ -12,15 +12,15 @@ DATE: today * Work and Discussion of project 3 * Last weekly exercise * Lab sessions at usual times. - * For the week of December 2-6, lab sessions atart at 10am and end 4pm, room FØ434, Tuesday and Wednesday + * For the week of December 2-6, lab sessions start at 10am and end at 4pm, room FØ434, Tuesday and Wednesday !eblock !bblock Plans for the lecture Monday 25 November, with video suggestions etc o Boosting and gradient boosting and ensemble models o Summary of course o Readings and Videos: - o These lecture notes at URL:"https://github.com/CompPhysics/MachineLearning/blob/master/doc/pub/week47/ipynb/week48.ipynb" - o See also lecture notes from week 47 at URL:"https://github.com/CompPhysics/MachineLearning/blob/master/doc/pub/week46/ipynb/week47.ipynb". The lecture on Monday starts with a repetition on AdaBoost before we move over to gradient boosting with examples + o These lecture notes at URL:"https://github.com/CompPhysics/MachineLearning/blob/master/doc/pub/week48/ipynb/week48.ipynb" + o See also lecture notes from week 47 at URL:"https://github.com/CompPhysics/MachineLearning/blob/master/doc/pub/week47/ipynb/week47.ipynb". 
The lecture on Monday starts with a repetition on AdaBoost before we move over to gradient boosting with examples # o Video of lecture at URL:"https://youtu.be/RIHzmLv05DA" # o Whiteboard notes at URL:"https://github.com/CompPhysics/MachineLearning/blob/master/doc/HandWrittenNotes/2024/NotesNovember25.pdf" o Video on Decision trees URL:"https://www.youtube.com/watch?v=RmajweUFKvM&ab_channel=Simplilearn" @@ -31,6 +31,128 @@ o Readings and Videos: !eblock +!split +===== Random Forest Algorithm, reminder from last week ===== + + +The algorithm described here can be applied to both classification and regression problems. + +We will grow a forest of, say, $B$ trees. +o For $b=1:B$ + o Draw a bootstrap sample from the training data organized in our $\bm{X}$ matrix. + o We then grow a random forest tree $T_b$ based on the bootstrapped data by repeating the steps below until the maximum node size is reached + o we select $m \le p$ variables at random from the $p$ predictors/features + o pick the best split point among the $m$ features using for example the CART algorithm and create a new node + o split the node into daughter nodes +o Then output the ensemble of trees $\{T_b\}_1^{B}$ and use it to make predictions for either a regression or a classification problem. + + + +!split +===== Random Forests Compared with other Methods on the Cancer Data ===== + + +!bc pycod +import matplotlib.pyplot as plt +import numpy as np +from sklearn.model_selection import train_test_split +from sklearn.datasets import load_breast_cancer +from sklearn.svm import SVC +from sklearn.linear_model import LogisticRegression +from sklearn.tree import DecisionTreeClassifier +from sklearn.ensemble import BaggingClassifier + +# Load the data +cancer = load_breast_cancer() + +X_train, X_test, y_train, y_test = train_test_split(cancer.data,cancer.target,random_state=0) +print(X_train.shape) +print(X_test.shape) +#define methods +# Logistic Regression +logreg = LogisticRegression(solver='lbfgs') +# Support vector machine +svm = SVC(gamma='auto', C=100) +# Decision Trees +deep_tree_clf = DecisionTreeClassifier(max_depth=None) +#Scale the data +from sklearn.preprocessing import StandardScaler +scaler = StandardScaler() +scaler.fit(X_train) +X_train_scaled = scaler.transform(X_train) +X_test_scaled = scaler.transform(X_test) +# Logistic Regression +logreg.fit(X_train_scaled, y_train) +print("Test set accuracy Logistic Regression with scaled data: {:.2f}".format(logreg.score(X_test_scaled,y_test))) +# Support Vector Machine +svm.fit(X_train_scaled, y_train) +print("Test set accuracy SVM with scaled data: {:.2f}".format(svm.score(X_test_scaled,y_test))) +# Decision Trees +deep_tree_clf.fit(X_train_scaled, y_train) +print("Test set accuracy with Decision Trees and scaled data: {:.2f}".format(deep_tree_clf.score(X_test_scaled,y_test))) + + +from sklearn.ensemble import RandomForestClassifier +from sklearn.preprocessing import LabelEncoder +from sklearn.model_selection import cross_validate +# Data set not specified +#Instantiate the model with 500 trees and entropy as the splitting criterion +Random_Forest_model = RandomForestClassifier(n_estimators=500,criterion="entropy") +Random_Forest_model.fit(X_train_scaled, y_train) +#Cross validation +accuracy = cross_validate(Random_Forest_model,X_test_scaled,y_test,cv=10)['test_score'] +print(accuracy) +print("Test set accuracy with Random Forests and scaled data: {:.2f}".format(Random_Forest_model.score(X_test_scaled,y_test))) + + +import scikitplot as skplt +y_pred 
= Random_Forest_model.predict(X_test_scaled) +skplt.metrics.plot_confusion_matrix(y_test, y_pred, normalize=True) +plt.show() +y_probas = Random_Forest_model.predict_proba(X_test_scaled) +skplt.metrics.plot_roc(y_test, y_probas) +plt.show() +skplt.metrics.plot_cumulative_gain(y_test, y_probas) +plt.show() + +!ec + + +Recall that the cumulative gains curve shows the percentage of the +overall number of cases in a given category *gained* by targeting a +percentage of the total number of cases. + +Similarly, the receiver operating characteristic curve, or ROC curve, +displays the diagnostic ability of a binary classifier system as its +discrimination threshold is varied. It plots the true positive rate against the false positive rate. + + +!split +===== Compare Bagging on Trees with Random Forests ===== +!bc pycod +bag_clf = BaggingClassifier( + DecisionTreeClassifier(splitter="random", max_leaf_nodes=16, random_state=42), + n_estimators=500, max_samples=1.0, bootstrap=True, n_jobs=-1, random_state=42) +!ec + + + +!bc pycod +bag_clf.fit(X_train, y_train) +y_pred = bag_clf.predict(X_test) +from sklearn.ensemble import RandomForestClassifier +rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1, random_state=42) +rnd_clf.fit(X_train, y_train) +y_pred_rf = rnd_clf.predict(X_test) +np.sum(y_pred == y_pred_rf) / len(y_pred) +!ec + + + + + + + !split @@ -1539,3 +1661,4 @@ FIGURE: [figures/Nebbdyr2.png, width=500 frac=0.6] +