From 1c143f339923d1183cb75779ba429722c9ff2e32 Mon Sep 17 00:00:00 2001 From: Doubleking-1 <71910936+Doubleking-1@users.noreply.github.com> Date: Wed, 3 Jan 2024 23:17:02 +0800 Subject: [PATCH] Wangzun second homework of ML --- ...ork_credit_scoring_finetune_ensemble.ipynb | 2073 +++++++++++++++++ 1 file changed, 2073 insertions(+) create mode 100644 2023/homework/Zun_Wang/homework_credit_scoring_finetune_ensemble.ipynb diff --git a/2023/homework/Zun_Wang/homework_credit_scoring_finetune_ensemble.ipynb b/2023/homework/Zun_Wang/homework_credit_scoring_finetune_ensemble.ipynb new file mode 100644 index 00000000..87a7e7a5 --- /dev/null +++ b/2023/homework/Zun_Wang/homework_credit_scoring_finetune_ensemble.ipynb @@ -0,0 +1,2073 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 一起来打怪之 Credit Scoring 练习" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "-------\n", + "## >>>说明:\n", + "### 1. 答题步骤:\n", + "- 回答问题**请保留每一步**操作过程,请不要仅仅给出最后答案\n", + "- 请养成代码注释的好习惯\n", + "\n", + "### 2. 解题思路:\n", + "- 为方便大家准确理解题目,在习题实战中有所收获,本文档提供了解题思路提示\n", + "- 解题思路**仅供参考**,鼓励原创解题方法\n", + "- 为督促同学们自己思考,解题思路内容设置为**注释**,请注意查看\n", + "\n", + "### 3. 所用数据:\n", + "- 问题使用了多个数据库,请注意导入每个数据库后都先**查看和了解数据的基本性质**,后面的问题不再一一提醒" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "--------\n", + "## 操作题" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 信用卡欺诈项目" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " #### 前期数据导入,预览及处理(此部分勿修改,涉及的数据文件无需复制移动)" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SeriousDlqin2yrsRevolvingUtilizationOfUnsecuredLinesageNumberOfTime30-59DaysPastDueNotWorseDebtRatioMonthlyIncomeNumberOfOpenCreditLinesAndLoansNumberOfTimes90DaysLateNumberRealEstateLoansOrLinesNumberOfTime60-89DaysPastDueNotWorseNumberOfDependents
010.76612745.02.00.8029829120.013.00.06.00.02.0
100.95715140.00.00.1218762600.04.00.00.00.01.0
200.65818038.01.00.0851133042.02.01.00.00.00.0
300.23381030.00.00.0360503300.05.00.00.00.00.0
400.90723949.01.00.02492663588.07.00.01.00.00.0
\n", + "
" + ], + "text/plain": [ + " SeriousDlqin2yrs RevolvingUtilizationOfUnsecuredLines age \\\n", + "0 1 0.766127 45.0 \n", + "1 0 0.957151 40.0 \n", + "2 0 0.658180 38.0 \n", + "3 0 0.233810 30.0 \n", + "4 0 0.907239 49.0 \n", + "\n", + " NumberOfTime30-59DaysPastDueNotWorse DebtRatio MonthlyIncome \\\n", + "0 2.0 0.802982 9120.0 \n", + "1 0.0 0.121876 2600.0 \n", + "2 1.0 0.085113 3042.0 \n", + "3 0.0 0.036050 3300.0 \n", + "4 1.0 0.024926 63588.0 \n", + "\n", + " NumberOfOpenCreditLinesAndLoans NumberOfTimes90DaysLate \\\n", + "0 13.0 0.0 \n", + "1 4.0 0.0 \n", + "2 2.0 1.0 \n", + "3 5.0 0.0 \n", + "4 7.0 0.0 \n", + "\n", + " NumberRealEstateLoansOrLines NumberOfTime60-89DaysPastDueNotWorse \\\n", + "0 6.0 0.0 \n", + "1 0.0 0.0 \n", + "2 0.0 0.0 \n", + "3 0.0 0.0 \n", + "4 1.0 0.0 \n", + "\n", + " NumberOfDependents \n", + "0 2.0 \n", + "1 1.0 \n", + "2 0.0 \n", + "3 0.0 \n", + "4 0.0 " + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "pd.set_option('display.max_columns', 500)\n", + "import zipfile\n", + "with zipfile.ZipFile('KaggleCredit2.csv.zip', 'r') as z:\n", + " f = z.open('KaggleCredit2.csv')\n", + " data = pd.read_csv(f, index_col=0)\n", + "data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(112915, 11)" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 检查数据维度\n", + "data.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "SeriousDlqin2yrs 0\n", + "RevolvingUtilizationOfUnsecuredLines 0\n", + "age 4267\n", + "NumberOfTime30-59DaysPastDueNotWorse 0\n", + "DebtRatio 0\n", + "MonthlyIncome 0\n", + 
"NumberOfOpenCreditLinesAndLoans 0\n", + "NumberOfTimes90DaysLate 0\n", + "NumberRealEstateLoansOrLines 0\n", + "NumberOfTime60-89DaysPastDueNotWorse 0\n", + "NumberOfDependents 4267\n", + "dtype: int64" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 查看数据缺失值情况\n", + "data.isnull().sum(axis=0)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_253566/2980780030.py:3: UserWarning: Pandas doesn't allow columns to be created via a new attribute name - see https://pandas.pydata.org/pandas-docs/stable/indexing.html#attribute-access\n", + " data.shapey = data['SeriousDlqin2yrs']\n" + ] + } + ], + "source": [ + "# 清除缺失值\n", + "data.dropna(inplace=True)\n", + "data.shapey = data['SeriousDlqin2yrs']\n", + "X = data.drop('SeriousDlqin2yrs', axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.06742876076872101" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 取出对应的X和y\n", + "y = data['SeriousDlqin2yrs']\n", + "X = data.drop('SeriousDlqin2yrs', axis=1)\n", + "# 查看平均的欺诈率\n", + "y.mean()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 以下为操作题" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 1.把数据切分成训练集和测试集" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "((76053, 10), (32595, 10), (76053,), (32595,), (108648, 10))" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + 
} + ], + "source": [ + "# 提示:查看train_test_split函数\n", + "from sklearn.model_selection import train_test_split\n", + "# 把数据切分成70%的训练集,30%的测试集\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)\n", + "\n", + "# 查看训练集跟测试集的维度---原来数据集的维度\n", + "X_train.shape, X_test.shape, y_train.shape, y_test.shape, X.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Text(0.5, 0, 'Catalog')" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# 通过SeriousDlqin2yrs字段查看正负样本分布情况\n", + "# 提示:value_counts\n", + "data_counts = data['SeriousDlqin2yrs'].value_counts()\n", + "\n", + "# 绘制两种类别的柱状图\n", + "# 提示:dataframe可以直接plot(kind='bar')\n", + "import matplotlib.pyplot as plt\n", + "fig = plt.figure()\n", + "data_counts.plot(kind='bar')\n", + "plt.title('Positive and negative sample distribution')\n", + "plt.ylabel('Number')\n", + "plt.xlabel('Catalog')\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 2.数据预处理之离散化" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + } + }, + "outputs": [], + "source": [ + "# 请对年龄按照3岁一个区间进行离散化\n", + "# 提示:可以先计算出分桶边界,再基于pandas的cut函数进行离散化(分箱、分桶)\n", + "low = min(data['age'])\n", + "high = max(data['age'])\n", + "\n", + "bins = list(range(int(low), int(high), 3)) # 以3岁为一个区间\n", + "\n", + "# 使用 cut 函数进行离散化\n", + "data['age_group'] = pd.cut(data['age'], bins)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 3.数据预处理之独热向量编码" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SeriousDlqin2yrsRevolvingUtilizationOfUnsecuredLinesageNumberOfTime30-59DaysPastDueNotWorseDebtRatioMonthlyIncomeNumberOfOpenCreditLinesAndLoansNumberOfTimes90DaysLateNumberRealEstateLoansOrLinesNumberOfTime60-89DaysPastDueNotWorseNumberOfDependentsage_groupage_group_(0, 3]age_group_(3, 6]age_group_(6, 9]age_group_(9, 12]age_group_(12, 15]age_group_(15, 18]age_group_(18, 21]age_group_(21, 24]age_group_(24, 27]age_group_(27, 30]age_group_(30, 33]age_group_(33, 36]age_group_(36, 39]age_group_(39, 42]age_group_(42, 45]age_group_(45, 48]age_group_(48, 51]age_group_(51, 54]age_group_(54, 57]age_group_(57, 60]age_group_(60, 63]age_group_(63, 66]age_group_(66, 69]age_group_(69, 72]age_group_(72, 75]age_group_(75, 78]age_group_(78, 81]age_group_(81, 84]age_group_(84, 87]age_group_(87, 90]age_group_(90, 93]age_group_(93, 96]age_group_(96, 99]age_group_(99, 102]
010.76612745.02.00.8029829120.013.00.06.00.02.0(42, 45]0000000000000010000000000000000000
100.95715140.00.00.1218762600.04.00.00.00.01.0(39, 42]0000000000000100000000000000000000
200.65818038.01.00.0851133042.02.01.00.00.00.0(36, 39]0000000000001000000000000000000000
300.23381030.00.00.0360503300.05.00.00.00.00.0(27, 30]0000000001000000000000000000000000
400.90723949.01.00.02492663588.07.00.01.00.00.0(48, 51]0000000000000000100000000000000000
.............................................................................................................................................
11291000.38574250.00.00.4042933400.07.00.00.00.00.0(48, 51]0000000000000000100000000000000000
11291100.04067474.00.00.2251312100.04.00.01.00.00.0(72, 75]0000000000000000000000001000000000
11291200.29974544.00.00.7165625584.04.00.01.00.02.0(42, 45]0000000000000010000000000000000000
11291300.00000030.00.00.0000005716.04.00.00.00.00.0(27, 30]0000000001000000000000000000000000
11291400.85028364.00.00.2499088158.08.00.02.00.00.0(63, 66]0000000000000000000001000000000000
\n", + "

108648 rows × 46 columns

\n", + "
" + ], + "text/plain": [ + " SeriousDlqin2yrs RevolvingUtilizationOfUnsecuredLines age \\\n", + "0 1 0.766127 45.0 \n", + "1 0 0.957151 40.0 \n", + "2 0 0.658180 38.0 \n", + "3 0 0.233810 30.0 \n", + "4 0 0.907239 49.0 \n", + "... ... ... ... \n", + "112910 0 0.385742 50.0 \n", + "112911 0 0.040674 74.0 \n", + "112912 0 0.299745 44.0 \n", + "112913 0 0.000000 30.0 \n", + "112914 0 0.850283 64.0 \n", + "\n", + " NumberOfTime30-59DaysPastDueNotWorse DebtRatio MonthlyIncome \\\n", + "0 2.0 0.802982 9120.0 \n", + "1 0.0 0.121876 2600.0 \n", + "2 1.0 0.085113 3042.0 \n", + "3 0.0 0.036050 3300.0 \n", + "4 1.0 0.024926 63588.0 \n", + "... ... ... ... \n", + "112910 0.0 0.404293 3400.0 \n", + "112911 0.0 0.225131 2100.0 \n", + "112912 0.0 0.716562 5584.0 \n", + "112913 0.0 0.000000 5716.0 \n", + "112914 0.0 0.249908 8158.0 \n", + "\n", + " NumberOfOpenCreditLinesAndLoans NumberOfTimes90DaysLate \\\n", + "0 13.0 0.0 \n", + "1 4.0 0.0 \n", + "2 2.0 1.0 \n", + "3 5.0 0.0 \n", + "4 7.0 0.0 \n", + "... ... ... \n", + "112910 7.0 0.0 \n", + "112911 4.0 0.0 \n", + "112912 4.0 0.0 \n", + "112913 4.0 0.0 \n", + "112914 8.0 0.0 \n", + "\n", + " NumberRealEstateLoansOrLines NumberOfTime60-89DaysPastDueNotWorse \\\n", + "0 6.0 0.0 \n", + "1 0.0 0.0 \n", + "2 0.0 0.0 \n", + "3 0.0 0.0 \n", + "4 1.0 0.0 \n", + "... ... ... \n", + "112910 0.0 0.0 \n", + "112911 1.0 0.0 \n", + "112912 1.0 0.0 \n", + "112913 0.0 0.0 \n", + "112914 2.0 0.0 \n", + "\n", + " NumberOfDependents age_group age_group_(0, 3] age_group_(3, 6] \\\n", + "0 2.0 (42, 45] 0 0 \n", + "1 1.0 (39, 42] 0 0 \n", + "2 0.0 (36, 39] 0 0 \n", + "3 0.0 (27, 30] 0 0 \n", + "4 0.0 (48, 51] 0 0 \n", + "... ... ... ... ... 
\n", + "112910 0.0 (48, 51] 0 0 \n", + "112911 0.0 (72, 75] 0 0 \n", + "112912 2.0 (42, 45] 0 0 \n", + "112913 0.0 (27, 30] 0 0 \n", + "112914 0.0 (63, 66] 0 0 \n", + "\n", + " age_group_(6, 9] age_group_(9, 12] age_group_(12, 15] \\\n", + "0 0 0 0 \n", + "1 0 0 0 \n", + "2 0 0 0 \n", + "3 0 0 0 \n", + "4 0 0 0 \n", + "... ... ... ... \n", + "112910 0 0 0 \n", + "112911 0 0 0 \n", + "112912 0 0 0 \n", + "112913 0 0 0 \n", + "112914 0 0 0 \n", + "\n", + " age_group_(15, 18] age_group_(18, 21] age_group_(21, 24] \\\n", + "0 0 0 0 \n", + "1 0 0 0 \n", + "2 0 0 0 \n", + "3 0 0 0 \n", + "4 0 0 0 \n", + "... ... ... ... \n", + "112910 0 0 0 \n", + "112911 0 0 0 \n", + "112912 0 0 0 \n", + "112913 0 0 0 \n", + "112914 0 0 0 \n", + "\n", + " age_group_(24, 27] age_group_(27, 30] age_group_(30, 33] \\\n", + "0 0 0 0 \n", + "1 0 0 0 \n", + "2 0 0 0 \n", + "3 0 1 0 \n", + "4 0 0 0 \n", + "... ... ... ... \n", + "112910 0 0 0 \n", + "112911 0 0 0 \n", + "112912 0 0 0 \n", + "112913 0 1 0 \n", + "112914 0 0 0 \n", + "\n", + " age_group_(33, 36] age_group_(36, 39] age_group_(39, 42] \\\n", + "0 0 0 0 \n", + "1 0 0 1 \n", + "2 0 1 0 \n", + "3 0 0 0 \n", + "4 0 0 0 \n", + "... ... ... ... \n", + "112910 0 0 0 \n", + "112911 0 0 0 \n", + "112912 0 0 0 \n", + "112913 0 0 0 \n", + "112914 0 0 0 \n", + "\n", + " age_group_(42, 45] age_group_(45, 48] age_group_(48, 51] \\\n", + "0 1 0 0 \n", + "1 0 0 0 \n", + "2 0 0 0 \n", + "3 0 0 0 \n", + "4 0 0 1 \n", + "... ... ... ... \n", + "112910 0 0 1 \n", + "112911 0 0 0 \n", + "112912 1 0 0 \n", + "112913 0 0 0 \n", + "112914 0 0 0 \n", + "\n", + " age_group_(51, 54] age_group_(54, 57] age_group_(57, 60] \\\n", + "0 0 0 0 \n", + "1 0 0 0 \n", + "2 0 0 0 \n", + "3 0 0 0 \n", + "4 0 0 0 \n", + "... ... ... ... 
\n", + "112910 0 0 0 \n", + "112911 0 0 0 \n", + "112912 0 0 0 \n", + "112913 0 0 0 \n", + "112914 0 0 0 \n", + "\n", + " age_group_(60, 63] age_group_(63, 66] age_group_(66, 69] \\\n", + "0 0 0 0 \n", + "1 0 0 0 \n", + "2 0 0 0 \n", + "3 0 0 0 \n", + "4 0 0 0 \n", + "... ... ... ... \n", + "112910 0 0 0 \n", + "112911 0 0 0 \n", + "112912 0 0 0 \n", + "112913 0 0 0 \n", + "112914 0 1 0 \n", + "\n", + " age_group_(69, 72] age_group_(72, 75] age_group_(75, 78] \\\n", + "0 0 0 0 \n", + "1 0 0 0 \n", + "2 0 0 0 \n", + "3 0 0 0 \n", + "4 0 0 0 \n", + "... ... ... ... \n", + "112910 0 0 0 \n", + "112911 0 1 0 \n", + "112912 0 0 0 \n", + "112913 0 0 0 \n", + "112914 0 0 0 \n", + "\n", + " age_group_(78, 81] age_group_(81, 84] age_group_(84, 87] \\\n", + "0 0 0 0 \n", + "1 0 0 0 \n", + "2 0 0 0 \n", + "3 0 0 0 \n", + "4 0 0 0 \n", + "... ... ... ... \n", + "112910 0 0 0 \n", + "112911 0 0 0 \n", + "112912 0 0 0 \n", + "112913 0 0 0 \n", + "112914 0 0 0 \n", + "\n", + " age_group_(87, 90] age_group_(90, 93] age_group_(93, 96] \\\n", + "0 0 0 0 \n", + "1 0 0 0 \n", + "2 0 0 0 \n", + "3 0 0 0 \n", + "4 0 0 0 \n", + "... ... ... ... \n", + "112910 0 0 0 \n", + "112911 0 0 0 \n", + "112912 0 0 0 \n", + "112913 0 0 0 \n", + "112914 0 0 0 \n", + "\n", + " age_group_(96, 99] age_group_(99, 102] \n", + "0 0 0 \n", + "1 0 0 \n", + "2 0 0 \n", + "3 0 0 \n", + "4 0 0 \n", + "... ... ... 
\n", + "112910 0 0 \n", + "112911 0 0 \n", + "112912 0 0 \n", + "112913 0 0 \n", + "112914 0 0 \n", + "\n", + "[108648 rows x 46 columns]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 请对上述分箱后的年龄段进行独热向量编码\n", + "# 提示:使用pandas的get_dummies完成\n", + "# 对年龄段进行独热向量编码\n", + "one_hot_encoded = pd.get_dummies(data['age_group'], prefix='age_group')\n", + "\n", + "# 将独热编码结果与原始 DataFrame 进行合并\n", + "data = pd.concat([data, one_hot_encoded], axis=1)\n", + "\n", + "data\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 4.数据预处理之幅度缩放" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + } + }, + "outputs": [], + "source": [ + "# 请对连续值特征进行幅度缩放\n", + "# 提示:可以使用StandardScaler等幅度缩放器进行处理\n", + "from sklearn.preprocessing import StandardScaler\n", + "\n", + "sc = StandardScaler()\n", + "X_train_std = sc.fit_transform(X_train)\n", + "X_test_std = sc.transform(X_test)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 5.使用logistic regression建模,并且输出一下系数,分析重要度。 " + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "系数: [[-0.01427294 -0.36429202 1.72869067 0.31207913 -0.11534462 -0.09187206\n", + " 1.68994946 -0.19639706 -3.2487085 0.11638382]]\n", + "截距: [-2.85903863]\n" + ] + } + ], + "source": [ + "# 提示:fit建模,建完模之后可以取出coef属性\n", + "from sklearn.linear_model import LogisticRegression\n", + "import seaborn as sns\n", + "\n", + "lr = LogisticRegression(C=1000.0,random_state=0, penalty='l2', solver='liblinear') \n", + "lr.fit(X_train_std, y_train)\n", + "# 输出系数\n", + "coefficients = lr.coef_\n", + "intercept = lr.intercept_\n", + "\n", + "print(\"系数:\", coefficients)\n", + "print(\"截距:\", intercept)" + ]
+ }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 6.使用网格搜索交叉验证进行调参\n", + "调整penalty和C参数,其中penalty候选为\"l1\"和\"l2\",C的候选为[1,10,100,500]" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "最佳参数: {'C': 1, 'penalty': 'l1'}\n", + "最佳模型: LogisticRegression(C=1, penalty='l1', random_state=0, solver='liblinear')\n", + "在测试集上的准确率: 0.9339162448228255\n" + ] + } + ], + "source": [ + "# 提示:先按照上面要求准备好网格字典,再使用GridSearchCV进行调参\n", + "# 设置参数候选\n", + "from sklearn.model_selection import GridSearchCV\n", + "\n", + "param_grid = {\n", + " 'penalty': ['l1', 'l2'],\n", + " 'C': [1, 10, 100, 500]\n", + "}\n", + "\n", + "# 初始化 GridSearchCV\n", + "grid_search = GridSearchCV(lr, param_grid, cv=5)\n", + "\n", + "# 执行网格搜索交叉验证\n", + "grid_search.fit(X_train_std, y_train)\n", + "\n", + "# 输出最佳参数\n", + "print(\"最佳参数:\", grid_search.best_params_)\n", + "\n", + "# 输出最佳模型\n", + "best_model = grid_search.best_estimator_\n", + "print(\"最佳模型:\", best_model)\n", + "\n", + "# 在测试集上评估最佳模型\n", + "accuracy = best_model.score(X_test_std, y_test)\n", + "print(\"在测试集上的准确率:\", accuracy)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 7.在测试集上进行预测,计算 查准率/查全率/auc/混淆矩阵/f1值 等测试指标" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "准确率: 0.9339162448228255\n", + "查全率: 0.046061722708429294\n", + "AUC: 0.6979862258129022\n", + "F1值: 0.08496176720475784\n", + "混淆矩阵:\n", + "[[30341 83]\n", + " [ 2071 100]]\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# 提示:在测试集上预测可以使用predict\n", + "# 提示:各种指标可以在sklearn.metrics中查到各种评估指标,分别是accuracy_score、recall_score、auc、confusion_matrix、f1_score\n", + "\n", + "from sklearn.metrics import accuracy_score, recall_score, roc_auc_score, confusion_matrix, f1_score,ConfusionMatrixDisplay\n", + "\n", + "# 在测试集上进行预测\n", + "y_pred = best_model.predict(X_test_std)\n", + "\n", + "# 计算准确率\n", + "accuracy = accuracy_score(y_test, y_pred)\n", + "print(\"准确率:\", accuracy)\n", + "\n", + "# 计算查全率\n", + "recall = recall_score(y_test, y_pred)\n", + "print(\"查全率:\", recall)\n", + "\n", + "# 计算AUC\n", + "y_pred_proba = best_model.predict_proba(X_test_std)[:, 1] # 预测为正例的概率\n", + "auc = roc_auc_score(y_test, y_pred_proba)\n", + "print(\"AUC:\", auc)\n", + "\n", + "# 计算F1值\n", + "f1 = f1_score(y_test, y_pred)\n", + "print(\"F1值:\", f1)\n", + "\n", + "# 计算混淆矩阵\n", + "cm = confusion_matrix(y_test, y_pred)\n", + "print(\"混淆矩阵:\")\n", + "print(cm)\n", + "disp_lr = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=lr.classes_)\n", + "\n", + "disp_lr.plot()\n", + "plt.show()\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 8.更多优化\n", + "银行通常会有更严格的要求,因为欺诈带来的后果通常比较严重,一般我们会调整模型的标准。 \n", + "\n", + "比如在logistic regression当中,一般我们的概率判定边界为0.5,但是我们可以把阈值设定低一些,来提高模型的“敏感度” \n", + "试试看把阈值设定为0.3,再看看这个时候的混淆矩阵等评估指标。" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "阈值为 0.1 时的混淆矩阵:\n", + "[[26778 3646]\n", + " [ 1253 918]]\n", + "阈值为 0.1 时的准确率: 0.8497008743672343\n", + "阈值为 0.1 时的查全率: 0.42284661446338095\n", + "阈值为 0.1 时的AUC: 0.6515035070739202\n", + "阈值为 0.1 时的F1值: 0.2726057906458797\n", + "--------------------\n", + "阈值为 0.2 时的混淆矩阵:\n", + "[[29799 625]\n", + " [ 1754 417]]\n", + "阈值为 0.2 
时的准确率: 0.9270133456051541\n", + "阈值为 0.2 时的查全率: 0.19207738369415017\n", + "阈值为 0.2 时的AUC: 0.5857671956598545\n", + "阈值为 0.2 时的F1值: 0.25957049486461253\n", + "--------------------\n", + "阈值为 0.3 时的混淆矩阵:\n", + "[[30178 246]\n", + " [ 1927 244]]\n", + "阈值为 0.3 时的准确率: 0.9333333333333333\n", + "阈值为 0.3 时的查全率: 0.11239060340856748\n", + "阈值为 0.3 时的AUC: 0.5521524408049937\n", + "阈值为 0.3 时的F1值: 0.18338970311912817\n", + "--------------------\n", + "阈值为 0.4 时的混淆矩阵:\n", + "[[30291 133]\n", + " [ 2018 153]]\n", + "阈值为 0.4 时的准确率: 0.9340082834790612\n", + "阈值为 0.4 时的查全率: 0.07047443574389682\n", + "阈值为 0.4 时的AUC: 0.5330514434833079\n", + "阈值为 0.4 时的F1值: 0.12454212454212456\n", + "--------------------\n", + "阈值为 0.5 时的混淆矩阵:\n", + "[[30341 83]\n", + " [ 2071 100]]\n", + "阈值为 0.5 时的准确率: 0.9339162448228255\n", + "阈值为 0.5 时的查全率: 0.046061722708429294\n", + "阈值为 0.5 时的AUC: 0.5216668066605518\n", + "阈值为 0.5 时的F1值: 0.08496176720475784\n", + "--------------------\n", + "阈值为 0.6 时的混淆矩阵:\n", + "[[30371 53]\n", + " [ 2104 67]]\n", + "阈值为 0.6 时的准确率: 0.93382420616659\n", + "阈值为 0.6 时的查全率: 0.030861354214647627\n", + "阈值为 0.6 时的AUC: 0.5145596542306475\n", + "阈值为 0.6 时的F1值: 0.058489742470536885\n", + "--------------------\n", + "阈值为 0.7 时的混淆矩阵:\n", + "[[30390 34]\n", + " [ 2128 43]]\n", + "阈值为 0.7 时的准确率: 0.9336708084061973\n", + "阈值为 0.7 时的查全率: 0.019806540764624597\n", + "阈值为 0.7 时的AUC: 0.5093445009897275\n", + "阈值为 0.7 时的F1值: 0.03825622775800712\n", + "--------------------\n", + "阈值为 0.8 时的混淆矩阵:\n", + "[[30403 21]\n", + " [ 2143 28]]\n", + "阈值为 0.8 时的准确率: 0.9336094493020402\n", + "阈值为 0.8 时的查全率: 0.012897282358360202\n", + "阈值为 0.8 时的AUC: 0.5061035189072896\n", + "阈值为 0.8 时的F1值: 0.025225225225225224\n", + "--------------------\n", + "阈值为 0.9 时的混淆矩阵:\n", + "[[30418 6]\n", + " [ 2167 4]]\n", + "阈值为 0.9 时的准确率: 0.9333333333333333\n", + "阈值为 0.9 时的查全率: 0.0018424689083371719\n", + "阈值为 0.9 时的AUC: 0.5008226280907713\n", + "阈值为 0.9 时的F1值: 0.003668042182485099\n", + "--------------------\n" + ] + } + ], + 
"source": [ + "# 提示:thresholds = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]\n", + "# 根据predict_proba的结果和threshold的比较确定结果,再评估各种结果指标\n", + "import numpy as np\n", + "thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]\n", + "\n", + "for threshold in thresholds:\n", + " # 根据阈值确定最终的分类结果\n", + " y_pred_threshold = np.where(y_pred_proba > threshold, 1, 0)\n", + "\n", + " # 计算混淆矩阵\n", + " cm_threshold = confusion_matrix(y_test, y_pred_threshold)\n", + " print(\"阈值为\", threshold, \"时的混淆矩阵:\")\n", + " print(cm_threshold)\n", + "\n", + " # 计算准确率\n", + " accuracy_threshold = accuracy_score(y_test, y_pred_threshold)\n", + " print(\"阈值为\", threshold, \"时的准确率:\", accuracy_threshold)\n", + "\n", + " # 计算查全率\n", + " recall_threshold = recall_score(y_test, y_pred_threshold)\n", + " print(\"阈值为\", threshold, \"时的查全率:\", recall_threshold)\n", + "\n", + " # 计算AUC\n", + " auc_threshold = roc_auc_score(y_test, y_pred_threshold)\n", + " print(\"阈值为\", threshold, \"时的AUC:\", auc_threshold)\n", + "\n", + " # 计算F1值\n", + " f1_threshold = f1_score(y_test, y_pred_threshold)\n", + " print(\"阈值为\", threshold, \"时的F1值:\", f1_threshold)\n", + "\n", + " print(\"--------------------\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 9.尝试对不同特征的重要度进行排序,通过特征选择的方式,对特征进行筛选。并重新建模,观察此时的模型准确率等评估指标。" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "feat_labels = data.columns[1:]\n", + "\n", + "# 获取特征重要性\n", + "feature_importance = lr.coef_[0] # 假设model是你之前训练的逻辑回归模型\n", + "\n", + "# 将特征重要性进行排序\n", + "sorted_idx = feature_importance.argsort()\n", + "\n", + "# 创建水平条形图\n", + "plt.figure(figsize=(10, 8))\n", + "plt.barh(range(len(sorted_idx)), feature_importance[sorted_idx], align='center')\n", + "plt.yticks(range(len(sorted_idx)), np.array(feat_labels)[sorted_idx])\n", + "plt.xlabel('Feature Importance')\n", + "plt.ylabel('Features')\n", + "plt.title('Feature Importance Ranking')\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "特征选择后的模型准确率: 0.9339469243749041\n", + "特征选择后的模型查全率: 0.036849378166743434\n", + "特征选择后的模型AUC: 0.5174057566615994\n", + "特征选择后的模型F1值: 0.06917423259835712\n" + ] + } + ], + "source": [ + "# 可以根据逻辑回归的系数绝对值大小进行排序,也可以基于树模型的特征重要度进行排序\n", + "# 特征选择可以使用RFE或者selectFromModel\n", + "\n", + "from sklearn.feature_selection import SelectFromModel\n", + "\n", + "\n", + "# 使用 SelectFromModel 进行特征选择\n", + "selector = SelectFromModel(lr)\n", + "selector.fit(X_train_std, y_train)\n", + "\n", + "# 获取选择的特征索引\n", + "selected_features = selector.get_support(indices=True)\n", + "\n", + "# # 根据选择的特征重新构建训练集和测试集\n", + "X_train_selected = X_train_std[:, selected_features]\n", + "X_test_selected = X_test_std[:, selected_features]\n", + "\n", + "# 在选择的特征上重新训练模型\n", + "model_selected = LogisticRegression()\n", + "model_selected.fit(X_train_selected, y_train)\n", + "\n", + "# 在测试集上进行预测\n", + "y_pred_selected = model_selected.predict(X_test_selected)\n", + "\n", + "# 计算准确率\n", + "accuracy_selected = accuracy_score(y_test, y_pred_selected)\n", + 
"print(\"特征选择后的模型准确率:\", accuracy_selected)\n", + "\n", + "# 计算查全率\n", + "recall_selected = recall_score(y_test, y_pred_selected)\n", + "print(\"特征选择后的模型查全率:\", recall_selected)\n", + "\n", + "# 计算AUC\n", + "auc_selected = roc_auc_score(y_test, y_pred_selected)\n", + "print(\"特征选择后的模型AUC:\", auc_selected)\n", + "\n", + "# 计算F1值\n", + "f1_selected = f1_score(y_test, y_pred_selected)\n", + "print(\"特征选择后的模型F1值:\", f1_selected)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 10.其他模型算法尝试\n", + "使用RandomForestClassifier/SVM/KNN等sklearn分类算法进行分类,尝试上述超参数调优算法过程。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + } + }, + "outputs": [], + "source": [ + "# 随机森林\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "# 支持向量机\n", + "from sklearn.svm import SVC\n", + "# K最近邻\n", + "from sklearn.neighbors import KNeighborsClassifier\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/dk/anaconda3/envs/igwn-py39/lib/python3.9/site-packages/sklearn/neighbors/_classification.py:211: FutureWarning: Unlike other reduction functions (e.g. `skew`, `kurtosis`), the default behavior of `mode` typically preserves the axis it acts along. In SciPy 1.11.0, this behavior will change: the default value of `keepdims` will become False, the `axis` over which the statistic is taken will be eliminated, and the value None will no longer be accepted. Set `keepdims` to True or False to avoid this warning.\n", + " mode, _ = stats.mode(_y[neigh_ind, k], axis=1)\n", + "/home/dk/anaconda3/envs/igwn-py39/lib/python3.9/site-packages/sklearn/neighbors/_classification.py:211: FutureWarning: Unlike other reduction functions (e.g. `skew`, `kurtosis`), the default behavior of `mode` typically preserves the axis it acts along. 
In SciPy 1.11.0, this behavior will change: the default value of `keepdims` will become False, the `axis` over which the statistic is taken will be eliminated, and the value None will no longer be accepted. Set `keepdims` to True or False to avoid this warning.\n", + " mode, _ = stats.mode(_y[neigh_ind, k], axis=1)\n", + "/home/dk/anaconda3/envs/igwn-py39/lib/python3.9/site-packages/sklearn/neighbors/_classification.py:211: FutureWarning: Unlike other reduction functions (e.g. `skew`, `kurtosis`), the default behavior of `mode` typically preserves the axis it acts along. In SciPy 1.11.0, this behavior will change: the default value of `keepdims` will become False, the `axis` over which the statistic is taken will be eliminated, and the value None will no longer be accepted. Set `keepdims` to True or False to avoid this warning.\n", + " mode, _ = stats.mode(_y[neigh_ind, k], axis=1)\n", + "/home/dk/anaconda3/envs/igwn-py39/lib/python3.9/site-packages/sklearn/neighbors/_classification.py:211: FutureWarning: Unlike other reduction functions (e.g. `skew`, `kurtosis`), the default behavior of `mode` typically preserves the axis it acts along. In SciPy 1.11.0, this behavior will change: the default value of `keepdims` will become False, the `axis` over which the statistic is taken will be eliminated, and the value None will no longer be accepted. Set `keepdims` to True or False to avoid this warning.\n", + " mode, _ = stats.mode(_y[neigh_ind, k], axis=1)\n", + "/home/dk/anaconda3/envs/igwn-py39/lib/python3.9/site-packages/sklearn/neighbors/_classification.py:211: FutureWarning: Unlike other reduction functions (e.g. `skew`, `kurtosis`), the default behavior of `mode` typically preserves the axis it acts along. In SciPy 1.11.0, this behavior will change: the default value of `keepdims` will become False, the `axis` over which the statistic is taken will be eliminated, and the value None will no longer be accepted. 
Set `keepdims` to True or False to avoid this warning.\n", + " mode, _ = stats.mode(_y[neigh_ind, k], axis=1)\n", + "/home/dk/anaconda3/envs/igwn-py39/lib/python3.9/site-packages/sklearn/neighbors/_classification.py:211: FutureWarning: Unlike other reduction functions (e.g. `skew`, `kurtosis`), the default behavior of `mode` typically preserves the axis it acts along. In SciPy 1.11.0, this behavior will change: the default value of `keepdims` will become False, the `axis` over which the statistic is taken will be eliminated, and the value None will no longer be accepted. Set `keepdims` to True or False to avoid this warning.\n", + " mode, _ = stats.mode(_y[neigh_ind, k], axis=1)\n", + "/home/dk/anaconda3/envs/igwn-py39/lib/python3.9/site-packages/sklearn/neighbors/_classification.py:211: FutureWarning: Unlike other reduction functions (e.g. `skew`, `kurtosis`), the default behavior of `mode` typically preserves the axis it acts along. In SciPy 1.11.0, this behavior will change: the default value of `keepdims` will become False, the `axis` over which the statistic is taken will be eliminated, and the value None will no longer be accepted. Set `keepdims` to True or False to avoid this warning.\n", + " mode, _ = stats.mode(_y[neigh_ind, k], axis=1)\n", + "/home/dk/anaconda3/envs/igwn-py39/lib/python3.9/site-packages/sklearn/neighbors/_classification.py:211: FutureWarning: Unlike other reduction functions (e.g. `skew`, `kurtosis`), the default behavior of `mode` typically preserves the axis it acts along. In SciPy 1.11.0, this behavior will change: the default value of `keepdims` will become False, the `axis` over which the statistic is taken will be eliminated, and the value None will no longer be accepted. 
Set `keepdims` to True or False to avoid this warning.\n", + " mode, _ = stats.mode(_y[neigh_ind, k], axis=1)\n", + "/home/dk/anaconda3/envs/igwn-py39/lib/python3.9/site-packages/sklearn/neighbors/_classification.py:211: FutureWarning: Unlike other reduction functions (e.g. `skew`, `kurtosis`), the default behavior of `mode` typically preserves the axis it acts along. In SciPy 1.11.0, this behavior will change: the default value of `keepdims` will become False, the `axis` over which the statistic is taken will be eliminated, and the value None will no longer be accepted. Set `keepdims` to True or False to avoid this warning.\n", + " mode, _ = stats.mode(_y[neigh_ind, k], axis=1)\n", + "/home/dk/anaconda3/envs/igwn-py39/lib/python3.9/site-packages/sklearn/neighbors/_classification.py:211: FutureWarning: Unlike other reduction functions (e.g. `skew`, `kurtosis`), the default behavior of `mode` typically preserves the axis it acts along. In SciPy 1.11.0, this behavior will change: the default value of `keepdims` will become False, the `axis` over which the statistic is taken will be eliminated, and the value None will no longer be accepted. Set `keepdims` to True or False to avoid this warning.\n", + " mode, _ = stats.mode(_y[neigh_ind, k], axis=1)\n", + "/home/dk/anaconda3/envs/igwn-py39/lib/python3.9/site-packages/sklearn/neighbors/_classification.py:211: FutureWarning: Unlike other reduction functions (e.g. `skew`, `kurtosis`), the default behavior of `mode` typically preserves the axis it acts along. In SciPy 1.11.0, this behavior will change: the default value of `keepdims` will become False, the `axis` over which the statistic is taken will be eliminated, and the value None will no longer be accepted. 
Set `keepdims` to True or False to avoid this warning.\n", + " mode, _ = stats.mode(_y[neigh_ind, k], axis=1)\n", + "/home/dk/anaconda3/envs/igwn-py39/lib/python3.9/site-packages/sklearn/neighbors/_classification.py:211: FutureWarning: Unlike other reduction functions (e.g. `skew`, `kurtosis`), the default behavior of `mode` typically preserves the axis it acts along. In SciPy 1.11.0, this behavior will change: the default value of `keepdims` will become False, the `axis` over which the statistic is taken will be eliminated, and the value None will no longer be accepted. Set `keepdims` to True or False to avoid this warning.\n", + " mode, _ = stats.mode(_y[neigh_ind, k], axis=1)\n", + "/home/dk/anaconda3/envs/igwn-py39/lib/python3.9/site-packages/sklearn/neighbors/_classification.py:211: FutureWarning: Unlike other reduction functions (e.g. `skew`, `kurtosis`), the default behavior of `mode` typically preserves the axis it acts along. In SciPy 1.11.0, this behavior will change: the default value of `keepdims` will become False, the `axis` over which the statistic is taken will be eliminated, and the value None will no longer be accepted. Set `keepdims` to True or False to avoid this warning.\n", + " mode, _ = stats.mode(_y[neigh_ind, k], axis=1)\n", + "/home/dk/anaconda3/envs/igwn-py39/lib/python3.9/site-packages/sklearn/neighbors/_classification.py:211: FutureWarning: Unlike other reduction functions (e.g. `skew`, `kurtosis`), the default behavior of `mode` typically preserves the axis it acts along. In SciPy 1.11.0, this behavior will change: the default value of `keepdims` will become False, the `axis` over which the statistic is taken will be eliminated, and the value None will no longer be accepted. 
Set `keepdims` to True or False to avoid this warning.\n", + " mode, _ = stats.mode(_y[neigh_ind, k], axis=1)\n", + "/home/dk/anaconda3/envs/igwn-py39/lib/python3.9/site-packages/sklearn/neighbors/_classification.py:211: FutureWarning: Unlike other reduction functions (e.g. `skew`, `kurtosis`), the default behavior of `mode` typically preserves the axis it acts along. In SciPy 1.11.0, this behavior will change: the default value of `keepdims` will become False, the `axis` over which the statistic is taken will be eliminated, and the value None will no longer be accepted. Set `keepdims` to True or False to avoid this warning.\n", + " mode, _ = stats.mode(_y[neigh_ind, k], axis=1)\n", + "/home/dk/anaconda3/envs/igwn-py39/lib/python3.9/site-packages/sklearn/neighbors/_classification.py:211: FutureWarning: Unlike other reduction functions (e.g. `skew`, `kurtosis`), the default behavior of `mode` typically preserves the axis it acts along. In SciPy 1.11.0, this behavior will change: the default value of `keepdims` will become False, the `axis` over which the statistic is taken will be eliminated, and the value None will no longer be accepted. Set `keepdims` to True or False to avoid this warning.\n", + " mode, _ = stats.mode(_y[neigh_ind, k], axis=1)\n", + "/home/dk/anaconda3/envs/igwn-py39/lib/python3.9/site-packages/sklearn/neighbors/_classification.py:211: FutureWarning: Unlike other reduction functions (e.g. `skew`, `kurtosis`), the default behavior of `mode` typically preserves the axis it acts along. In SciPy 1.11.0, this behavior will change: the default value of `keepdims` will become False, the `axis` over which the statistic is taken will be eliminated, and the value None will no longer be accepted. 
Set `keepdims` to True or False to avoid this warning.\n", + " mode, _ = stats.mode(_y[neigh_ind, k], axis=1)\n", + "/home/dk/anaconda3/envs/igwn-py39/lib/python3.9/site-packages/sklearn/neighbors/_classification.py:211: FutureWarning: Unlike other reduction functions (e.g. `skew`, `kurtosis`), the default behavior of `mode` typically preserves the axis it acts along. In SciPy 1.11.0, this behavior will change: the default value of `keepdims` will become False, the `axis` over which the statistic is taken will be eliminated, and the value None will no longer be accepted. Set `keepdims` to True or False to avoid this warning.\n", + " mode, _ = stats.mode(_y[neigh_ind, k], axis=1)\n", + "/home/dk/anaconda3/envs/igwn-py39/lib/python3.9/site-packages/sklearn/neighbors/_classification.py:211: FutureWarning: Unlike other reduction functions (e.g. `skew`, `kurtosis`), the default behavior of `mode` typically preserves the axis it acts along. In SciPy 1.11.0, this behavior will change: the default value of `keepdims` will become False, the `axis` over which the statistic is taken will be eliminated, and the value None will no longer be accepted. Set `keepdims` to True or False to avoid this warning.\n", + " mode, _ = stats.mode(_y[neigh_ind, k], axis=1)\n", + "/home/dk/anaconda3/envs/igwn-py39/lib/python3.9/site-packages/sklearn/neighbors/_classification.py:211: FutureWarning: Unlike other reduction functions (e.g. `skew`, `kurtosis`), the default behavior of `mode` typically preserves the axis it acts along. In SciPy 1.11.0, this behavior will change: the default value of `keepdims` will become False, the `axis` over which the statistic is taken will be eliminated, and the value None will no longer be accepted. 
Set `keepdims` to True or False to avoid this warning.\n", + " mode, _ = stats.mode(_y[neigh_ind, k], axis=1)\n", + "/home/dk/anaconda3/envs/igwn-py39/lib/python3.9/site-packages/sklearn/neighbors/_classification.py:211: FutureWarning: Unlike other reduction functions (e.g. `skew`, `kurtosis`), the default behavior of `mode` typically preserves the axis it acts along. In SciPy 1.11.0, this behavior will change: the default value of `keepdims` will become False, the `axis` over which the statistic is taken will be eliminated, and the value None will no longer be accepted. Set `keepdims` to True or False to avoid this warning.\n", + " mode, _ = stats.mode(_y[neigh_ind, k], axis=1)\n", + "/home/dk/anaconda3/envs/igwn-py39/lib/python3.9/site-packages/sklearn/neighbors/_classification.py:211: FutureWarning: Unlike other reduction functions (e.g. `skew`, `kurtosis`), the default behavior of `mode` typically preserves the axis it acts along. In SciPy 1.11.0, this behavior will change: the default value of `keepdims` will become False, the `axis` over which the statistic is taken will be eliminated, and the value None will no longer be accepted. Set `keepdims` to True or False to avoid this warning.\n", + " mode, _ = stats.mode(_y[neigh_ind, k], axis=1)\n", + "/home/dk/anaconda3/envs/igwn-py39/lib/python3.9/site-packages/sklearn/neighbors/_classification.py:211: FutureWarning: Unlike other reduction functions (e.g. `skew`, `kurtosis`), the default behavior of `mode` typically preserves the axis it acts along. In SciPy 1.11.0, this behavior will change: the default value of `keepdims` will become False, the `axis` over which the statistic is taken will be eliminated, and the value None will no longer be accepted. 
Set `keepdims` to True or False to avoid this warning.\n", + " mode, _ = stats.mode(_y[neigh_ind, k], axis=1)\n", + "/home/dk/anaconda3/envs/igwn-py39/lib/python3.9/site-packages/sklearn/neighbors/_classification.py:211: FutureWarning: Unlike other reduction functions (e.g. `skew`, `kurtosis`), the default behavior of `mode` typically preserves the axis it acts along. In SciPy 1.11.0, this behavior will change: the default value of `keepdims` will become False, the `axis` over which the statistic is taken will be eliminated, and the value None will no longer be accepted. Set `keepdims` to True or False to avoid this warning.\n", + " mode, _ = stats.mode(_y[neigh_ind, k], axis=1)\n", + "/home/dk/anaconda3/envs/igwn-py39/lib/python3.9/site-packages/sklearn/neighbors/_classification.py:211: FutureWarning: Unlike other reduction functions (e.g. `skew`, `kurtosis`), the default behavior of `mode` typically preserves the axis it acts along. In SciPy 1.11.0, this behavior will change: the default value of `keepdims` will become False, the `axis` over which the statistic is taken will be eliminated, and the value None will no longer be accepted. Set `keepdims` to True or False to avoid this warning.\n", + " mode, _ = stats.mode(_y[neigh_ind, k], axis=1)\n", + "/home/dk/anaconda3/envs/igwn-py39/lib/python3.9/site-packages/sklearn/neighbors/_classification.py:211: FutureWarning: Unlike other reduction functions (e.g. `skew`, `kurtosis`), the default behavior of `mode` typically preserves the axis it acts along. In SciPy 1.11.0, this behavior will change: the default value of `keepdims` will become False, the `axis` over which the statistic is taken will be eliminated, and the value None will no longer be accepted. 
Set `keepdims` to True or False to avoid this warning.\n", + " mode, _ = stats.mode(_y[neigh_ind, k], axis=1)\n", + "/home/dk/anaconda3/envs/igwn-py39/lib/python3.9/site-packages/sklearn/neighbors/_classification.py:211: FutureWarning: Unlike other reduction functions (e.g. `skew`, `kurtosis`), the default behavior of `mode` typically preserves the axis it acts along. In SciPy 1.11.0, this behavior will change: the default value of `keepdims` will become False, the `axis` over which the statistic is taken will be eliminated, and the value None will no longer be accepted. Set `keepdims` to True or False to avoid this warning.\n", + " mode, _ = stats.mode(_y[neigh_ind, k], axis=1)\n", + "/home/dk/anaconda3/envs/igwn-py39/lib/python3.9/site-packages/sklearn/neighbors/_classification.py:211: FutureWarning: Unlike other reduction functions (e.g. `skew`, `kurtosis`), the default behavior of `mode` typically preserves the axis it acts along. In SciPy 1.11.0, this behavior will change: the default value of `keepdims` will become False, the `axis` over which the statistic is taken will be eliminated, and the value None will no longer be accepted. Set `keepdims` to True or False to avoid this warning.\n", + " mode, _ = stats.mode(_y[neigh_ind, k], axis=1)\n", + "/home/dk/anaconda3/envs/igwn-py39/lib/python3.9/site-packages/sklearn/neighbors/_classification.py:211: FutureWarning: Unlike other reduction functions (e.g. `skew`, `kurtosis`), the default behavior of `mode` typically preserves the axis it acts along. In SciPy 1.11.0, this behavior will change: the default value of `keepdims` will become False, the `axis` over which the statistic is taken will be eliminated, and the value None will no longer be accepted. 
Set `keepdims` to True or False to avoid this warning.\n", + " mode, _ = stats.mode(_y[neigh_ind, k], axis=1)\n", + "/home/dk/anaconda3/envs/igwn-py39/lib/python3.9/site-packages/sklearn/neighbors/_classification.py:211: FutureWarning: Unlike other reduction functions (e.g. `skew`, `kurtosis`), the default behavior of `mode` typically preserves the axis it acts along. In SciPy 1.11.0, this behavior will change: the default value of `keepdims` will become False, the `axis` over which the statistic is taken will be eliminated, and the value None will no longer be accepted. Set `keepdims` to True or False to avoid this warning.\n", + " mode, _ = stats.mode(_y[neigh_ind, k], axis=1)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Random Forest - Best Parameters: {'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 200}\n", + "Random Forest - Best Score: 0.935492337006019\n", + "SVM - Best Parameters: {'C': 1, 'gamma': 1}\n", + "SVM - Best Score: 0.9344404476310031\n", + "KNN - Best Parameters: {'algorithm': 'ball_tree', 'n_neighbors': 7, 'weights': 'uniform'}\n", + "KNN - Best Score: 0.9333754150632625\n" + ] + } + ], + "source": [ + "from sklearn.model_selection import GridSearchCV\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.svm import SVC\n", + "from sklearn.neighbors import KNeighborsClassifier\n", + "\n", + "from sklearn.metrics import precision_score, recall_score, roc_auc_score, confusion_matrix, f1_score, roc_curve, auc\n", + "\n", + "# 定义参数网格\n", + "param_grid_rf = {\n", + " 'n_estimators': [100, 200],\n", + " 'max_depth': [5, 10],\n", + " 'min_samples_split': [5, 10]\n", + "}\n", + "\n", + "param_grid_svm = {\n", + " 'C': [0.1, 1],\n", + " 'gamma': [0.1,1]\n", + "}\n", + "\n", + "param_grid_knn = {\n", + " 'n_neighbors': [3, 5, 7],\n", + " 'weights': ['uniform', 'distance'],\n", + " 'algorithm': ['ball_tree', 'kd_tree']\n", + "}\n", + "\n", + "# 创建分类器对象\n", + "rf_classifier = 
RandomForestClassifier()\n", + "svm_classifier = SVC()\n", + "knn_classifier = KNeighborsClassifier()\n", + "\n", + "# 创建网格搜索对象\n", + "grid_search_rf = GridSearchCV(rf_classifier, param_grid_rf, cv=5)\n", + "grid_search_svm = GridSearchCV(svm_classifier, param_grid_svm, cv=5)\n", + "grid_search_knn = GridSearchCV(knn_classifier, param_grid_knn, cv=5)\n", + "\n", + "# 在训练集上进行网格搜索\n", + "grid_search_rf.fit(X_train_std, y_train)\n", + "grid_search_svm.fit(X_train_std, y_train)\n", + "grid_search_knn.fit(X_train_std, y_train)\n", + "\n", + "# 输出最佳参数组合和对应的得分\n", + "print(\"Random Forest - Best Parameters:\", grid_search_rf.best_params_)\n", + "print(\"Random Forest - Best Score:\", grid_search_rf.best_score_)\n", + "print(\"SVM - Best Parameters:\", grid_search_svm.best_params_)\n", + "print(\"SVM - Best Score:\", grid_search_svm.best_score_)\n", + "print(\"KNN - Best Parameters:\", grid_search_knn.best_params_)\n", + "print(\"KNN - Best Score:\", grid_search_knn.best_score_)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/dk/anaconda3/envs/igwn-py39/lib/python3.9/site-packages/sklearn/neighbors/_classification.py:211: FutureWarning: Unlike other reduction functions (e.g. `skew`, `kurtosis`), the default behavior of `mode` typically preserves the axis it acts along. In SciPy 1.11.0, this behavior will change: the default value of `keepdims` will become False, the `axis` over which the statistic is taken will be eliminated, and the value None will no longer be accepted. 
Set `keepdims` to True or False to avoid this warning.\n", + " mode, _ = stats.mode(_y[neigh_ind, k], axis=1)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Random Forest - Precision: 0.615071283095723\n", + "Random Forest - Recall: 0.13910640257945647\n", + "Random Forest - F1 Score: 0.22689706987227648\n", + "Random Forest - AUC: 0.8532556239086381\n", + "SVM - Precision: 0.6153846153846154\n", + "SVM - Recall: 0.07369875633348687\n", + "SVM - F1 Score: 0.1316330728095434\n", + "SVM - AUC: 0.6692492308612815\n", + "KNN - Precision: 0.5091324200913242\n", + "KNN - Recall: 0.10271764163979732\n", + "KNN - F1 Score: 0.1709467228823304\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "\n", + "\n", + "# 使用最佳参数进行训练\n", + "best_rf = grid_search_rf.best_estimator_\n", + "best_svm = grid_search_svm.best_estimator_\n", + "best_knn = grid_search_knn.best_estimator_\n", + "\n", + "best_rf.fit(X_train_std, y_train)\n", + "best_svm.fit(X_train_std, y_train)\n", + "best_knn.fit(X_train_std, y_train)\n", + "\n", + "# 预测\n", + "y_pred_rf = best_rf.predict(X_test_std)\n", + "y_pred_svm = best_svm.predict(X_test_std)\n", + "y_pred_knn = best_knn.predict(X_test_std)\n", + "\n", + "# 计算评估指标\n", + "precision_rf = precision_score(y_test, y_pred_rf)\n", + "precision_svm = precision_score(y_test, y_pred_svm)\n", + "precision_knn = precision_score(y_test, y_pred_knn)\n", + "\n", + "recall_rf = recall_score(y_test, y_pred_rf)\n", + "recall_svm = recall_score(y_test, y_pred_svm)\n", + "recall_knn = recall_score(y_test, y_pred_knn)\n", + "\n", + "f1_rf = f1_score(y_test, y_pred_rf)\n", + "f1_svm = f1_score(y_test, y_pred_svm)\n", + "f1_knn = f1_score(y_test, y_pred_knn)\n", + "\n", + "auc_rf = roc_auc_score(y_test, best_rf.predict_proba(X_test_std)[:, 1])\n", + "auc_svm = roc_auc_score(y_test, best_svm.decision_function(X_test_std))\n", + "fpr_rf, tpr_rf, thresholds_rf = roc_curve(y_test, best_rf.predict_proba(X_test_std)[:, 1])\n", + "fpr_svm, tpr_svm, thresholds_svm = roc_curve(y_test, best_svm.decision_function(X_test_std))\n", + "\n", + "conf_matrix_rf = confusion_matrix(y_test, y_pred_rf)\n", + "conf_matrix_svm = confusion_matrix(y_test, y_pred_svm)\n", + "conf_matrix_knn = confusion_matrix(y_test, y_pred_knn)\n", + "\n", + "# 输出评估指标\n", + "print(\"Random Forest - Precision:\", precision_rf)\n", + "print(\"Random Forest - Recall:\", recall_rf)\n", + "print(\"Random Forest - F1 Score:\", f1_rf)\n", + "print(\"Random Forest - AUC:\", auc_rf)\n", + "print(\"SVM - Precision:\", precision_svm)\n", + "print(\"SVM - Recall:\", recall_svm)\n", + 
"print(\"SVM - F1 Score:\", f1_svm)\n", + "print(\"SVM - AUC:\", auc_svm)\n", + "print(\"KNN - Precision:\", precision_knn)\n", + "print(\"KNN - Recall:\", recall_knn)\n", + "print(\"KNN - F1 Score:\", f1_knn)\n", + "\n", + "# 绘制混淆矩阵\n", + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "\n", + "plt.figure(figsize=(12, 6))\n", + "\n", + "plt.subplot(1, 3, 1)\n", + "sns.heatmap(conf_matrix_rf, annot=True, fmt=\"d\", cmap=\"YlGnBu\", cbar=False)\n", + "plt.title(\"Random Forest Confusion Matrix\")\n", + "plt.xlabel(\"Predicted\")\n", + "plt.ylabel(\"Actual\")\n", + "\n", + "plt.subplot(1, 3, 2)\n", + "sns.heatmap(conf_matrix_svm, annot=True, fmt=\"d\", cmap=\"YlGnBu\", cbar=False)\n", + "plt.title(\"SVM Confusion Matrix\")\n", + "plt.xlabel(\"Predicted\")\n", + "plt.ylabel(\"Actual\")\n", + "\n", + "plt.subplot(1, 3, 3)\n", + "sns.heatmap(conf_matrix_knn, annot=True, fmt=\"d\", cmap=\"YlGnBu\", cbar=False)\n", + "plt.title(\"KNN Confusion Matrix\")\n", + "plt.xlabel(\"Predicted\")\n", + "plt.ylabel(\"Actual\")\n", + "\n", + "plt.show()\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}