-
Notifications
You must be signed in to change notification settings - Fork 121
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #859 from sanchitc05/sanchitc05/issue858
Feature request: Add Anomaly Detection Project
- Loading branch information
Showing
8 changed files
with
382 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": { | ||
"vscode": { | ||
"languageId": "plaintext" | ||
} | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"# Import necessary libraries\n", | ||
"import numpy as np\n", | ||
"import pandas as pd\n", | ||
"import matplotlib.pyplot as plt\n", | ||
"import seaborn as sns\n", | ||
"from sklearn.ensemble import IsolationForest\n", | ||
"from sklearn.cluster import KMeans\n", | ||
"from sklearn.preprocessing import StandardScaler\n", | ||
"from sklearn.metrics import confusion_matrix, classification_report\n", | ||
"from sklearn.decomposition import PCA\n", | ||
"\n", | ||
"# Set plot style\n", | ||
"sns.set(style=\"whitegrid\")\n", | ||
"\n", | ||
"# Load Dataset\n", | ||
"# Replace 'data.csv' with your dataset file\n", | ||
"data = pd.read_csv('data.csv')\n", | ||
"\n", | ||
"# Display the first few rows\n", | ||
"data.head()\n" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"language_info": { | ||
"name": "python" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
# anomalies_report.py
"""Write a CSV report of the rows flagged by KMeans and/or Isolation Forest.

The input CSV must already contain two label columns:

* ``Cluster`` -- rows with value 1 are treated as KMeans anomalies.
* ``Anomaly`` -- rows with value 1 are treated as Isolation Forest anomalies.
"""

import pandas as pd

INPUT_PATH = 'preprocessed_data.csv'
OUTPUT_PATH = 'anomalies_report.csv'
# Label columns that must be present before a report can be built.
REQUIRED_COLUMNS = ('Cluster', 'Anomaly')


def select_anomalies(data):
    """Return the rows of *data* flagged by either detection method.

    Args:
        data: DataFrame holding the features plus the ``Cluster`` and
            ``Anomaly`` label columns.

    Returns:
        The subset of *data* where ``Cluster == 1`` or ``Anomaly == 1``,
        in the original row order.

    Raises:
        KeyError: If ``Cluster`` or ``Anomaly`` is missing from *data*.
    """
    missing = [name for name in REQUIRED_COLUMNS if name not in data.columns]
    if missing:
        raise KeyError(
            f"Input data is missing required column(s) {missing}; "
            "add the KMeans 'Cluster' and Isolation Forest 'Anomaly' labels "
            "to the input file before building the report."
        )

    # A single boolean mask keeps each flagged row exactly once.  Concatenating
    # the two subsets and calling drop_duplicates() would also merge distinct
    # rows that merely share the same values.
    flagged = (data['Cluster'] == 1) | (data['Anomaly'] == 1)
    return data[flagged]


def main():
    """Load the labelled data, select the anomalies and save them as CSV."""
    data = pd.read_csv(INPUT_PATH)
    anomalies_report = select_anomalies(data)
    anomalies_report.to_csv(OUTPUT_PATH, index=False)
    print("Anomalies report saved as 'anomalies_report.csv'.")


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
# data_preprocessing.py
"""Clean the raw dataset and write a standardized copy to disk.

Reads 'raw_data.csv', removes incomplete rows, standardizes every column with
StandardScaler and saves the result as 'preprocessed_data.csv'.
"""

import pandas as pd
from sklearn.preprocessing import StandardScaler

# Read the raw dataset (replace with the actual path to your dataset) and
# discard every row that contains at least one missing value.
data = pd.read_csv('raw_data.csv').dropna()

# Fit the scaler on the cleaned data, then apply it to the same data.
scaler = StandardScaler().fit(data)
data_scaled = pd.DataFrame(scaler.transform(data), columns=data.columns)

# Persist the standardized table without the index column.
data_scaled.to_csv('preprocessed_data.csv', index=False)
print("Preprocessed data saved to 'preprocessed_data.csv'.")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
# eda_visualizations.py
"""Plot a histogram (with KDE overlay) for every column of the preprocessed data.

The figure is saved as 'eda_feature_distributions.png' and then displayed.
"""

import math

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load preprocessed data
data = pd.read_csv('preprocessed_data.csv')

# Set plot style
sns.set(style="whitegrid")

# Size the grid from the number of columns instead of assuming six features:
# a fixed 2x3 grid raises IndexError for wider datasets and leaves blank
# panels for narrower ones.
n_features = len(data.columns)
n_grid_cols = 3
n_grid_rows = max(1, math.ceil(n_features / n_grid_cols))

# squeeze=False guarantees a 2-D array of axes even for a single row, so
# flatten() always works.
fig, axes = plt.subplots(
    nrows=n_grid_rows,
    ncols=n_grid_cols,
    figsize=(15, 5 * n_grid_rows),
    squeeze=False,
)
axes = axes.flatten()

# Plot feature distributions, one panel per column
for ax, col in zip(axes, data.columns):
    sns.histplot(data[col], kde=True, ax=ax)
    ax.set_title(f"Distribution of {col}")

# Hide the trailing panels that have no feature to display
for ax in axes[n_features:]:
    ax.set_visible(False)

plt.tight_layout()
plt.savefig('eda_feature_distributions.png')
plt.show()
38 changes: 38 additions & 0 deletions
38
Anomaly Detection Project/isolation_forest_anomaly_detection.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
# isolation_forest_anomaly_detection.py
"""Flag anomalies with an Isolation Forest and plot them in PCA space.

Reads 'preprocessed_data.csv', fits the model, stores a 0/1 'Anomaly' column
on the in-memory frame, saves the fitted model with joblib and writes a 2-D
PCA scatter plot to 'isolation_forest_anomalies_pca.png'.
"""

import pandas as pd
from sklearn.ensemble import IsolationForest
import joblib
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Input: the standardized feature table
data = pd.read_csv('preprocessed_data.csv')

# Tunable model settings
contamination_rate = 0.05  # tune to the share of anomalies you expect in your data
max_samples_value = "auto"  # an integer, or "auto" (256 or the total sample size, whichever is smaller)

# Fit the forest on the features and collect its raw predictions
iso_forest = IsolationForest(
    contamination=contamination_rate,
    max_samples=max_samples_value,
    random_state=42,
)
raw_predictions = iso_forest.fit_predict(data)

# Recode the predictions: -1 becomes 1 (anomaly), everything else becomes 0
data['Anomaly'] = [1 if prediction == -1 else 0 for prediction in raw_predictions]

# Persist the fitted model
joblib.dump(iso_forest, 'isolation_forest_model.pkl')
print("Isolation Forest model saved as 'isolation_forest_model.pkl'.")

# Project the features (label column excluded) onto two principal components
pca = PCA(n_components=2)
pca_data = pca.fit_transform(data.drop(columns=['Anomaly']))

# Scatter plot of the projection, coloured by the anomaly flag
fig, ax = plt.subplots(figsize=(10, 6))
points = ax.scatter(
    pca_data[:, 0],
    pca_data[:, 1],
    c=data['Anomaly'],
    cmap='coolwarm',
    marker='o',
    alpha=0.6,
)
ax.set_title("Isolation Forest Anomaly Detection with PCA")
ax.set_xlabel("PCA Component 1")
ax.set_ylabel("PCA Component 2")
fig.colorbar(points, ax=ax, label="Anomaly")
fig.savefig('isolation_forest_anomalies_pca.png')
plt.show()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
# kmeans_anomaly_detection.py
"""Cluster the preprocessed data with KMeans and plot the clusters in PCA space.

Reads 'preprocessed_data.csv', fits a two-cluster KMeans model, stores the
labels in a 'Cluster' column on the in-memory frame, saves the fitted model
with joblib and writes a 2-D PCA scatter plot to 'kmeans_clusters_pca.png'.
"""

import pandas as pd
from sklearn.cluster import KMeans
import joblib
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Input: the standardized feature table
data = pd.read_csv('preprocessed_data.csv')

# Fit a two-cluster model and attach each row's cluster label
kmeans = KMeans(n_clusters=2, random_state=42).fit(data)
data['Cluster'] = kmeans.labels_

# Persist the fitted model
joblib.dump(kmeans, 'kmeans_model.pkl')
print("KMeans model saved as 'kmeans_model.pkl'.")

# Rows assigned to cluster 1 are treated as the anomalies.
# NOTE(review): KMeans numbers its clusters arbitrarily, so cluster 1 is not
# guaranteed to be the unusual group -- confirm this is the intended rule.
# This selection is not used again later in this script.
anomalies_kmeans = data.loc[data['Cluster'] == 1]

# Project the features (label column excluded) onto two principal components
pca = PCA(n_components=2)
pca_data = pca.fit_transform(data.drop(columns=['Cluster']))

# Scatter plot of the projection, coloured by cluster label
fig, ax = plt.subplots(figsize=(10, 6))
points = ax.scatter(
    pca_data[:, 0],
    pca_data[:, 1],
    c=data['Cluster'],
    cmap='viridis',
    marker='o',
    alpha=0.6,
)
ax.set_title("KMeans Clustering with PCA")
ax.set_xlabel("PCA Component 1")
ax.set_ylabel("PCA Component 2")
fig.colorbar(points, ax=ax, label="Cluster")
fig.savefig('kmeans_clusters_pca.png')
plt.show()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
# model_evaluation.py
"""Template for scoring the Isolation Forest output against ground truth.

As written, the script only loads 'preprocessed_data.csv'.  The evaluation
itself is kept below as a commented-out template because it needs a
ground-truth column, which may not be available.
"""

import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report

PREPROCESSED_CSV = 'preprocessed_data.csv'

# Load the data that should carry the true labels.
data = pd.read_csv(PREPROCESSED_CSV)

# ---------------------------------------------------------------------------
# Evaluation template -- uncomment once ground-truth labels are available.
#
# It assumes the loaded data has:
#   * 'TrueLabels' -- the ground-truth anomaly flag (rename to your column)
#   * 'Anomaly'    -- the Isolation Forest prediction
# ---------------------------------------------------------------------------

# true_labels = data['TrueLabels']
# print("Confusion Matrix (Isolation Forest):\n", confusion_matrix(true_labels, data['Anomaly']))
# print("Classification Report (Isolation Forest):\n", classification_report(true_labels, data['Anomaly']))

# Save the evaluation report to file
# with open("evaluation_report.txt", "w") as f:
#     f.write("Confusion Matrix (Isolation Forest):\n")
#     f.write(str(confusion_matrix(true_labels, data['Anomaly'])) + "\n\n")
#     f.write("Classification Report (Isolation Forest):\n")
#     f.write(classification_report(true_labels, data['Anomaly']))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,188 @@ | ||
# Anomaly Detection Project | ||
|
||
This project demonstrates how to detect anomalies (unusual patterns) in a dataset using machine learning techniques like **Isolation Forest** and **KMeans Clustering**. The goal is to identify anomalous behaviors that could indicate fraudulent transactions, network intrusions, or other forms of outliers. | ||
|
||
## Table of Contents | ||
|
||
- [Project Overview](#project-overview) | ||
- [Files and Their Purpose](#files-and-their-purpose) | ||
- [Installation Instructions](#installation-instructions) | ||
- [Usage Instructions](#usage-instructions) | ||
- [Dependencies](#dependencies) | ||
- [Model Explanation](#model-explanation) | ||
- [Results and Visualizations](#results-and-visualizations) | ||
- [Contributing](#contributing) | ||
|
||
## Project Overview | ||
|
||
This project uses **unsupervised machine learning techniques** to identify anomalies in a given dataset. The anomaly detection process involves: | ||
|
||
1. **Data Collection & Preprocessing**: Gathering and cleaning data by handling missing values and outliers. | ||
2. **Exploratory Data Analysis (EDA)**: Visualizing the data distributions, identifying potential anomalies, and calculating summary statistics. | ||
3. **Anomaly Detection Techniques**: Using **KMeans clustering** and **Isolation Forest** to detect anomalies. | ||
4. **Model Evaluation**: Evaluating the model performance with metrics like precision, recall, and F1 score (if ground truth labels are available). | ||
5. **Visualization**: Visualizing the anomalies detected using **PCA** and other plots. | ||
|
||
## Files and Their Purpose | ||
|
||
Here’s a breakdown of each script in the project: | ||
|
||
1. **`data_preprocessing.py`**: | ||
- Handles data loading, cleaning, and preprocessing tasks. | ||
- Drops rows with missing values and standardizes the features with `StandardScaler`. | ||
|
||
2. **`eda_visualizations.py`**: | ||
- Performs Exploratory Data Analysis (EDA). | ||
- Plots a histogram with a KDE curve for each feature so that potential anomalies can be spotted visually. | ||
|
||
3. **`kmeans_anomaly_detection.py`**: | ||
- Implements anomaly detection using **KMeans clustering**. | ||
- Identifies outliers based on clustering results. | ||
|
||
4. **`isolation_forest_anomaly_detection.py`**: | ||
- Implements anomaly detection using the **Isolation Forest** algorithm. | ||
- Detects anomalies by isolating points that are far from the rest of the data. | ||
- Allows you to adjust `contamination` and `max_samples` parameters. | ||
|
||
5. **`model_evaluation.py`** (Optional): | ||
- Evaluates the performance of anomaly detection techniques (if ground truth labels are available). | ||
- Calculates precision, recall, F1 score, and other evaluation metrics. | ||
|
||
6. **`anomalies_report.py`**: | ||
- Generates a report of the detected anomalies, which can be saved to a CSV or viewed as a summary. | ||
|
||
7. **`Anomaly_Detection_Project.ipynb`**: | ||
- Jupyter notebook that integrates all the scripts above. | ||
- Contains code, visualizations, and detailed explanations for the entire anomaly detection process. | ||
|
||
## Installation Instructions | ||
|
||
### Clone this repository | ||
To get started, clone the repository to your local machine using: | ||
|
||
```bash | ||
git clone https://github.com/your-username/Anomaly-Detection-Project.git | ||
cd Anomaly-Detection-Project | ||
``` | ||
|
||
### Install dependencies | ||
This project requires Python 3.x. You can install the required libraries using **pip** by running the following command in your terminal: | ||
|
||
```bash | ||
pip install -r requirements.txt | ||
``` | ||
|
||
If `requirements.txt` is not included, you can manually install the dependencies: | ||
|
||
```bash | ||
pip install numpy pandas scikit-learn matplotlib seaborn joblib | ||
``` | ||
|
||
## Usage Instructions | ||
|
||
### Step 1: Data Preprocessing | ||
Run the `data_preprocessing.py` script to preprocess the raw data. This will clean the data, handle missing values, and normalize it for anomaly detection: | ||
|
||
```bash | ||
python data_preprocessing.py | ||
``` | ||
|
||
### Step 2: Exploratory Data Analysis (EDA) | ||
Run the `eda_visualizations.py` script to visualize the data distributions and identify potential anomalies: | ||
|
||
```bash | ||
python eda_visualizations.py | ||
``` | ||
|
||
### Step 3: Anomaly Detection with KMeans | ||
Use **KMeans** to detect anomalies by running the `kmeans_anomaly_detection.py` script: | ||
|
||
```bash | ||
python kmeans_anomaly_detection.py | ||
``` | ||
|
||
### Step 4: Anomaly Detection with Isolation Forest | ||
Use **Isolation Forest** to detect anomalies by running the `isolation_forest_anomaly_detection.py` script. Adjust parameters like `contamination` and `max_samples` as needed. | ||
|
||
```bash | ||
python isolation_forest_anomaly_detection.py | ||
``` | ||
|
||
### Step 5: Model Evaluation (Optional) | ||
If you have ground truth labels, run `model_evaluation.py` to evaluate the performance of the anomaly detection models: | ||
|
||
```bash | ||
python model_evaluation.py | ||
``` | ||
|
||
### Step 6: Generate Anomalies Report | ||
Run the `anomalies_report.py` script to generate and save a report of detected anomalies: | ||
|
||
```bash | ||
python anomalies_report.py | ||
``` | ||
|
||
### Step 7: Jupyter Notebook Workflow | ||
Run the entire workflow within a Jupyter notebook for better visualization and step-by-step execution. Open the notebook: | ||
|
||
```bash | ||
jupyter notebook Anomaly_Detection_Project.ipynb | ||
``` | ||
|
||
## Dependencies | ||
|
||
This project requires the following Python libraries: | ||
|
||
- `pandas`: For data manipulation and analysis. | ||
- `numpy`: For numerical operations. | ||
- `scikit-learn`: For machine learning models (KMeans and Isolation Forest). | ||
- `matplotlib`: For data visualization. | ||
- `seaborn`: For enhanced data visualizations. | ||
- `joblib`: For saving and loading models. | ||
|
||
You can install all dependencies with the command: | ||
|
||
```bash | ||
pip install -r requirements.txt | ||
``` | ||
|
||
## Model Explanation | ||
|
||
- **KMeans Clustering**: This unsupervised learning algorithm groups data points into clusters. In this project the data is split into two clusters, and the points assigned to cluster `1` are treated as anomalies. | ||
- **Isolation Forest**: This tree-based algorithm isolates anomalies by randomly selecting a feature and splitting the data. Anomalous points are those that are isolated more quickly than normal points. | ||
|
||
## Results and Visualizations | ||
|
||
The results of the anomaly detection process are visualized in several ways: | ||
|
||
1. **PCA Visualizations**: The results of the anomaly detection models (KMeans and Isolation Forest) are visualized using PCA (Principal Component Analysis), which reduces the dimensionality of the data to 2D for easy visualization. | ||
2. **Anomaly Distribution**: The distribution of detected anomalies is shown using scatter plots and heatmaps. | ||
|
||
## Contributing | ||
|
||
We welcome contributions to this project! If you have suggestions or want to add new features, feel free to fork this repository and submit a pull request. | ||
|
||
Please follow these steps to contribute: | ||
|
||
1. Fork the repository. | ||
2. Clone your fork to your local machine. | ||
3. Create a new branch for your feature (`git checkout -b feature-name`). | ||
4. Commit your changes (`git commit -m 'Add new feature'`). | ||
5. Push to your forked repository (`git push origin feature-name`). | ||
6. Submit a pull request. | ||
|
||
--- | ||
|
||
Thank you for checking out the **Anomaly Detection Project**! | ||
|
||
### Explanation of Sections | ||
1. **Project Overview**: Describes the project goals and steps involved. | ||
2. **Files and Their Purpose**: Details the role of each file in the project. | ||
3. **Installation Instructions**: How to clone the repo and set up dependencies. | ||
4. **Usage Instructions**: Walks through how to run each script step by step. | ||
5. **Model Explanation**: Brief overview of the machine learning models used (KMeans and Isolation Forest). | ||
6. **Results and Visualizations**: Explains how the anomalies are visualized and evaluated. | ||
7. **Contributing**: Encourages other developers to contribute by forking the repo and submitting pull requests. |