Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Final Submission of Team Cipher #8

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
142 changes: 142 additions & 0 deletions Cipher.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 159,
"id": "7cda9455-b81f-4d01-bbc2-cebc8b57cc4e",
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt, pandas as pd,numpy as np\n",
"from sklearn.impute import SimpleImputer, KNNImputer\n",
"from sklearn.ensemble import AdaBoostClassifier\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"import sklearn.model_selection"
]
},
{
"cell_type": "code",
"execution_count": 160,
"id": "626d5b54-0be2-4923-86ca-f3ad5a3b4c17",
"metadata": {},
"outputs": [],
"source": [
"# Load the training and test CSV files with pandas.\n",
"# Training rows missing a value in any of the required columns are discarded and the index is renumbered.\n",
"# Gender, Average Daily Steps and Hours of Sleep are then removed from both frames; the original\n",
"# author's note gives the reason as least covariance with systolic and diastolic BP.\n",
"required_columns = ['Caloric Intake','Height','Cholesterol level','Average Daily Steps','Hours of Sleep','Gender','LifeStyle','Systolic BP','Diastolic BP']\n",
"unused_columns = ['Gender', 'Average Daily Steps', 'Hours of Sleep']\n",
"df = (\n",
"    pd.read_csv(\"ML101_train_dataset.csv\")\n",
"    .dropna(subset=required_columns)\n",
"    .reset_index(drop=True)\n",
"    .drop(columns=unused_columns)\n",
")\n",
"df_test = pd.read_csv(\"ML101_dataset_test_feature.csv\").drop(columns=unused_columns)\n",
"# Column order of the filtered training frame; later cells slice it by position.\n",
"attributes = df.columns"
]
},
{
"cell_type": "code",
"execution_count": 161,
"id": "1caa087e-bc68-4941-b85f-6764b574d9cd",
"metadata": {},
"outputs": [],
"source": [
"# Replace missing values in the numeric feature columns with the per-column mean of the training data.\n",
"columns_to_impute = ['Caloric Intake','Age','Weight','Height','Cholesterol level','Blood Sugar level']\n",
"imputer = SimpleImputer(strategy='mean')\n",
"df[columns_to_impute] = imputer.fit_transform(df[columns_to_impute])\n",
"# Fill the test features with the same training means. The test frame is converted to an array as-is\n",
"# further down, so a missing value left here would surface as a NaN prediction.\n",
"# NOTE(review): assumes the test file carries the same six columns - confirm against the dataset.\n",
"df_test[columns_to_impute] = imputer.transform(df_test[columns_to_impute])"
]
},
{
"cell_type": "code",
"execution_count": 162,
"id": "4c31cb17-dec0-4cb5-b6be-3767535d106b",
"metadata": {},
"outputs": [],
"source": [
"# Build the design matrices: every row becomes [1, feature values...], so the first weight of the\n",
"# linear model acts as the intercept.\n",
"y1=df['Systolic BP']\n",
"y2=df['Diastolic BP']\n",
"y3=df['LifeStyle']\n",
"x_test=np.array(df_test)\n",
"# Everything except the last three columns of the training frame is used as a feature.\n",
"# NOTE(review): assumes the three target columns are the last three columns of the file - confirm.\n",
"x=np.array(df[attributes[0:len(attributes)-3]])\n",
"n_features = x.shape[1]\n",
"# Fail early with a clear message instead of an IndexError deep inside the copy.\n",
"if x_test.shape[1] < n_features:\n",
"    raise ValueError('test data has fewer columns than the training features')\n",
"# Prepend the constant column in one vectorised step instead of copying element by element.\n",
"x_final=np.hstack([np.ones((len(x), 1)), x.astype(float)])\n",
"# Only the first n_features test columns are used; they are matched to the training features by position.\n",
"x_test1=np.hstack([np.ones((len(x_test), 1)), x_test[:, :n_features].astype(float)])"
]
},
{
"cell_type": "code",
"execution_count": 163,
"id": "8b0b33cf-37f8-46e4-b493-82b6962e4b57",
"metadata": {},
"outputs": [],
"source": [
"# Fit one least-squares linear model per blood-pressure target by multiplying the pseudo-inverse\n",
"# (dagger) of the design matrix with the target vector, write the test predictions to a CSV file,\n",
"# and append them to the test frame so the classification step can use them as features.\n",
"# The pseudo-inverse is computed once and shared by both targets; unlike inverting X^T X explicitly,\n",
"# it stays well defined when feature columns are collinear.\n",
"x_dagger=np.linalg.pinv(x_final)\n",
"w_sys=x_dagger @ np.asarray(y1, dtype=float)\n",
"w_dia=x_dagger @ np.asarray(y2, dtype=float)\n",
"# One matrix product predicts every test row at once.\n",
"systolic_bp=x_test1 @ w_sys\n",
"diastolic_bp=x_test1 @ w_dia\n",
"op_df=pd.DataFrame({\"Systolic BP\":pd.Series(systolic_bp),\"Diastolic BP\":pd.Series(diastolic_bp)})\n",
"op_df=op_df.rename_axis(\"ID\")\n",
"\n",
"op_df.to_csv(\"regression_cipher.csv\")\n",
"\n",
"df_test['Systolic BP']=systolic_bp\n",
"df_test['Diastolic BP']=diastolic_bp\n",
"x_test2=np.array(df_test)\n",
"# Classifier inputs: every training column except the last one.\n",
"x_train2=np.array(df[attributes[:len(attributes)-1]])"
]
},
{
"cell_type": "code",
"execution_count": 164,
"id": "e597c37f-b378-4530-8ff7-d40e3fbacb2d",
"metadata": {},
"outputs": [],
"source": [
"# Train an AdaBoost ensemble with a decision tree as the base classifier, predict the LifeStyle\n",
"# label for the test rows and write the predictions to a CSV file.\n",
"# NOTE(review): the base tree has no depth limit; boosting is usually paired with shallow trees - consider max_depth.\n",
"base_classifier = DecisionTreeClassifier()\n",
"# A fixed seed makes the fitted ensemble, and therefore the submitted predictions, repeatable across runs.\n",
"ensemble_classifier = AdaBoostClassifier(base_classifier, n_estimators=50, random_state=0)\n",
"ensemble_classifier.fit(x_train2, y3)\n",
"y_pred = ensemble_classifier.predict(x_test2)\n",
"df_final=pd.DataFrame({\"LifeStyle\":y_pred})\n",
"df_final.to_csv(\"classification_cipher.csv\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.1"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Loading