diff --git a/Anomaly Detection Project/Anomaly_Detection_Project.ipynb b/Anomaly Detection Project/Anomaly_Detection_Project.ipynb
new file mode 100644
index 00000000..6f98f841
--- /dev/null
+++ b/Anomaly Detection Project/Anomaly_Detection_Project.ipynb
@@ -0,0 +1,43 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "vscode": {
+     "languageId": "python"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Import necessary libraries\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import matplotlib.pyplot as plt\n",
+    "import seaborn as sns\n",
+    "from sklearn.ensemble import IsolationForest\n",
+    "from sklearn.cluster import KMeans\n",
+    "from sklearn.preprocessing import StandardScaler\n",
+    "from sklearn.metrics import confusion_matrix, classification_report\n",
+    "from sklearn.decomposition import PCA\n",
+    "\n",
+    "# Set plot style\n",
+    "sns.set(style=\"whitegrid\")\n",
+    "\n",
+    "# Load Dataset\n",
+    "# Replace 'data.csv' with your dataset file\n",
+    "data = pd.read_csv('data.csv')\n",
+    "\n",
+    "# Display the first few rows\n",
+    "data.head()\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/Anomaly Detection Project/anomalies_report.py b/Anomaly Detection Project/anomalies_report.py
new file mode 100644
index 00000000..17f84884
--- /dev/null
+++ b/Anomaly Detection Project/anomalies_report.py
@@ -0,0 +1,17 @@
+# anomalies_report.py
+
+import pandas as pd
+
+# Load the labelled outputs saved by the two detection scripts
+anomalies_kmeans = pd.read_csv('kmeans_labeled_data.csv')
+anomalies_isoforest = pd.read_csv('isoforest_labeled_data.csv')
+
+# Keep only the rows flagged as anomalous, and drop the label columns so
+# both frames share the same feature columns
+anomalies_kmeans = anomalies_kmeans[anomalies_kmeans['Cluster'] == 1].drop(columns=['Cluster'])
+anomalies_isoforest = anomalies_isoforest[anomalies_isoforest['Anomaly'] == 1].drop(columns=['Anomaly'])
+
+# Combine both methods and deduplicate rows flagged by both
+anomalies_report = pd.concat([anomalies_kmeans, anomalies_isoforest]).drop_duplicates()
+anomalies_report.to_csv('anomalies_report.csv', index=False)
+print("Anomalies report saved as 'anomalies_report.csv'.")
diff --git a/Anomaly Detection Project/data_preprocessing.py b/Anomaly Detection Project/data_preprocessing.py
new file mode 100644
index 00000000..58c3879e
--- /dev/null
+++ b/Anomaly Detection Project/data_preprocessing.py
@@ -0,0 +1,22 @@
+# data_preprocessing.py
+
+import pandas as pd
+from sklearn.preprocessing import StandardScaler
+
+# Load dataset
+data = pd.read_csv('raw_data.csv')  # Replace with the actual path to your dataset
+
+# Drop rows with missing values
+data = data.dropna()
+
+# Keep numeric columns only; StandardScaler cannot handle non-numeric data
+data = data.select_dtypes(include='number')
+
+# Standardize features to zero mean and unit variance
+scaler = StandardScaler()
+data_scaled = scaler.fit_transform(data)
+data_scaled = pd.DataFrame(data_scaled, columns=data.columns)
+
+# Save preprocessed data to CSV
+data_scaled.to_csv('preprocessed_data.csv', index=False)
+print("Preprocessed data saved to 'preprocessed_data.csv'.")
diff --git a/Anomaly Detection Project/eda_visualizations.py b/Anomaly Detection Project/eda_visualizations.py
new file mode 100644
index 00000000..0c0dcd91
--- /dev/null
+++ b/Anomaly Detection Project/eda_visualizations.py
@@ -0,0 +1,28 @@
+# eda_visualizations.py
+
+import math
+
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+# Load preprocessed data
+data = pd.read_csv('preprocessed_data.csv')
+
+# Set plot style
+sns.set(style="whitegrid")
+
+# Plot feature distributions; size the grid to the number of columns
+n_cols = 3
+n_rows = math.ceil(len(data.columns) / n_cols)
+fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(15, 5 * n_rows), squeeze=False)
+axes = axes.flatten()
+for i, col in enumerate(data.columns):
+    sns.histplot(data[col], kde=True, ax=axes[i])
+    axes[i].set_title(f"Distribution of {col}")
+# Hide any unused subplots
+for ax in axes[len(data.columns):]:
+    ax.set_visible(False)
+plt.tight_layout()
+plt.savefig('eda_feature_distributions.png')
+plt.show()
diff --git a/Anomaly Detection Project/isolation_forest_anomaly_detection.py b/Anomaly Detection Project/isolation_forest_anomaly_detection.py
new file mode 100644
index 00000000..c03265ff
--- /dev/null
+++ b/Anomaly Detection Project/isolation_forest_anomaly_detection.py
@@ -0,0 +1,41 @@
+# isolation_forest_anomaly_detection.py
+
+import joblib
+import matplotlib.pyplot as plt
+import pandas as pd
+from sklearn.decomposition import PCA
+from sklearn.ensemble import IsolationForest
+
+# Load preprocessed data
+data = pd.read_csv('preprocessed_data.csv')
+
+# Set Isolation Forest parameters
+contamination_rate = 0.05  # Expected share of anomalies; tune to your dataset
+max_samples_value = "auto"  # An integer, or "auto" (min(256, n_samples))
+
+# Isolation Forest with custom parameters
+iso_forest = IsolationForest(contamination=contamination_rate, max_samples=max_samples_value, random_state=42)
+data['Anomaly'] = iso_forest.fit_predict(data)
+
+# Map scikit-learn's labels (-1 = anomaly, 1 = normal) to 1/0
+data['Anomaly'] = data['Anomaly'].apply(lambda x: 1 if x == -1 else 0)
+
+# Save the labelled data for anomalies_report.py and model_evaluation.py
+data.to_csv('isoforest_labeled_data.csv', index=False)
+
+# Save the Isolation Forest model
+joblib.dump(iso_forest, 'isolation_forest_model.pkl')
+print("Isolation Forest model saved as 'isolation_forest_model.pkl'.")
+
+# Visualize anomalies in 2D with PCA
+pca = PCA(n_components=2)
+pca_data = pca.fit_transform(data.drop(columns=['Anomaly']))
+
+plt.figure(figsize=(10, 6))
+plt.scatter(pca_data[:, 0], pca_data[:, 1], c=data['Anomaly'], cmap='coolwarm', marker='o', alpha=0.6)
+plt.title("Isolation Forest Anomaly Detection with PCA")
+plt.xlabel("PCA Component 1")
+plt.ylabel("PCA Component 2")
+plt.colorbar(label="Anomaly")
+plt.savefig('isolation_forest_anomalies_pca.png')
+plt.show()
diff --git a/Anomaly Detection Project/kmeans_anomaly_detection.py b/Anomaly Detection Project/kmeans_anomaly_detection.py
new file mode 100644
index 00000000..aa18c662
--- /dev/null
+++ b/Anomaly Detection Project/kmeans_anomaly_detection.py
@@ -0,0 +1,41 @@
+# kmeans_anomaly_detection.py
+
+import joblib
+import matplotlib.pyplot as plt
+import pandas as pd
+from sklearn.cluster import KMeans
+from sklearn.decomposition import PCA
+
+# Load preprocessed data
+data = pd.read_csv('preprocessed_data.csv')
+
+# KMeans clustering with two clusters: "normal" and "anomalous"
+kmeans = KMeans(n_clusters=2, random_state=42)
+kmeans.fit(data)
+
+# Treat the smaller cluster as anomalous and relabel it 1 (normal = 0)
+anomaly_cluster = pd.Series(kmeans.labels_).value_counts().idxmin()
+data['Cluster'] = (kmeans.labels_ == anomaly_cluster).astype(int)
+
+# Save the labelled data for anomalies_report.py
+data.to_csv('kmeans_labeled_data.csv', index=False)
+
+# Save the KMeans model
+joblib.dump(kmeans, 'kmeans_model.pkl')
+print("KMeans model saved as 'kmeans_model.pkl'.")
+
+# Identify anomalies based on cluster assignment
+anomalies_kmeans = data[data['Cluster'] == 1]
+
+# Visualize clusters in 2D with PCA
+pca = PCA(n_components=2)
+pca_data = pca.fit_transform(data.drop(columns=['Cluster']))
+
+plt.figure(figsize=(10, 6))
+plt.scatter(pca_data[:, 0], pca_data[:, 1], c=data['Cluster'], cmap='viridis', marker='o', alpha=0.6)
+plt.title("KMeans Clustering with PCA")
+plt.xlabel("PCA Component 1")
+plt.ylabel("PCA Component 2")
+plt.colorbar(label="Cluster")
+plt.savefig('kmeans_clusters_pca.png')
+plt.show()
diff --git a/Anomaly Detection Project/model_evaluation.py b/Anomaly Detection Project/model_evaluation.py
new file mode 100644
index 00000000..f2591857
--- /dev/null
+++ b/Anomaly Detection Project/model_evaluation.py
@@ -0,0 +1,22 @@
+# model_evaluation.py
+
+import pandas as pd
+from sklearn.metrics import confusion_matrix, classification_report
+
+# Load the Isolation Forest output (contains the 'Anomaly' column)
+data = pd.read_csv('isoforest_labeled_data.csv')
+
+# Evaluation requires ground-truth labels. If your original dataset has a
+# column of true anomaly labels (here called 'TrueLabels'), merge it into
+# `data`, then uncomment the block below.
+
+# true_labels = data['TrueLabels']
+# print("Confusion Matrix (Isolation Forest):\n", confusion_matrix(true_labels, data['Anomaly']))
+# print("Classification Report (Isolation Forest):\n", classification_report(true_labels, data['Anomaly']))
+
+# Save the evaluation report to file
+# with open("evaluation_report.txt", "w") as f:
+#     f.write("Confusion Matrix (Isolation Forest):\n")
+#     f.write(str(confusion_matrix(true_labels, data['Anomaly'])) + "\n\n")
+#     f.write("Classification Report (Isolation Forest):\n")
+#     f.write(classification_report(true_labels, data['Anomaly']))
diff --git a/Anomaly Detection Project/readme.md b/Anomaly Detection Project/readme.md
new file mode 100644
index 00000000..8516573d
--- /dev/null
+++ b/Anomaly Detection Project/readme.md
@@ -0,0 +1,207 @@
+# Anomaly Detection Project
+
+This project demonstrates how to detect anomalies (unusual patterns) in a dataset using machine learning techniques like **Isolation Forest** and **KMeans Clustering**. The goal is to identify anomalous behaviors that could indicate fraudulent transactions, network intrusions, or other forms of outliers.
+
+## Table of Contents
+
+- [Project Overview](#project-overview)
+- [Files and Their Purpose](#files-and-their-purpose)
+- [Installation Instructions](#installation-instructions)
+- [Usage Instructions](#usage-instructions)
+- [Dependencies](#dependencies)
+- [Model Explanation](#model-explanation)
+- [Results and Visualizations](#results-and-visualizations)
+- [Contributing](#contributing)
+
+## Project Overview
+
+This project uses **unsupervised machine learning techniques** to identify anomalies in a given dataset. The anomaly detection process involves:
+
+1. **Data Collection & Preprocessing**: Gathering and cleaning data by handling missing values and standardizing features.
+2. **Exploratory Data Analysis (EDA)**: Visualizing the data distributions, identifying potential anomalies, and calculating summary statistics.
+3. **Anomaly Detection Techniques**: Using **KMeans clustering** and **Isolation Forest** to detect anomalies.
+4. **Model Evaluation**: Evaluating model performance with metrics like precision, recall, and F1 score (if ground-truth labels are available).
+5. **Visualization**: Visualizing the detected anomalies using **PCA** and other plots.
+
+## Files and Their Purpose
+
+Here's a breakdown of each script in the project:
+
+1. **`data_preprocessing.py`**:
+   - Handles data loading, cleaning, and preprocessing.
+   - Drops rows with missing values, keeps numeric columns, and standardizes the features.
+
+2. **`eda_visualizations.py`**:
+   - Performs Exploratory Data Analysis (EDA).
+   - Visualizes the data distributions and highlights potential anomalies through histograms.
+
+3. **`kmeans_anomaly_detection.py`**:
+   - Implements anomaly detection using **KMeans clustering**.
+   - Treats the smaller of the two clusters as the anomalous one.
+
+4. **`isolation_forest_anomaly_detection.py`**:
+   - Implements anomaly detection using the **Isolation Forest** algorithm.
+   - Detects anomalies by isolating points that are far from the rest of the data.
+   - Lets you adjust the `contamination` and `max_samples` parameters.
+
+5. **`model_evaluation.py`** (optional):
+   - Evaluates the performance of the anomaly detection techniques (if ground-truth labels are available).
+   - Calculates precision, recall, F1 score, and other evaluation metrics.
+
+6. **`anomalies_report.py`**:
+   - Combines the anomalies flagged by both methods into a single CSV report.
+
+7. **`Anomaly_Detection_Project.ipynb`**:
+   - Jupyter notebook that integrates the scripts above.
+   - Contains code, visualizations, and explanations for the entire anomaly detection process.
+
+## Installation Instructions
+
+### Clone this repository
+To get started, clone the repository to your local machine:
+
+```bash
+git clone https://github.com/your-username/Anomaly-Detection-Project.git
+cd Anomaly-Detection-Project
+```
+
+### Install dependencies
+This project requires Python 3.x. Install the required libraries with **pip**:
+
+```bash
+pip install -r requirements.txt
+```
+
+If `requirements.txt` is not included, you can install the dependencies manually:
+
+```bash
+pip install pandas scikit-learn matplotlib seaborn joblib
+```
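+
+If you need to create `requirements.txt` yourself, a minimal version just lists the libraries used by the scripts (pin versions as appropriate for your environment):
+
+```text
+pandas
+numpy
+scikit-learn
+matplotlib
+seaborn
+joblib
+```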
+
+## Usage Instructions
+
+### Step 1: Data Preprocessing
+Run the `data_preprocessing.py` script to preprocess the raw data. This will clean the data, drop missing values, and standardize the features for anomaly detection:
+
+```bash
+python data_preprocessing.py
+```
+
+### Step 2: Exploratory Data Analysis (EDA)
+Run the `eda_visualizations.py` script to visualize the data distributions and identify potential anomalies:
+
+```bash
+python eda_visualizations.py
+```
+
+### Step 3: Anomaly Detection with KMeans
+Use **KMeans** to detect anomalies by running the `kmeans_anomaly_detection.py` script:
+
+```bash
+python kmeans_anomaly_detection.py
+```
+
+### Step 4: Anomaly Detection with Isolation Forest
+Use **Isolation Forest** to detect anomalies by running the `isolation_forest_anomaly_detection.py` script. Adjust parameters like `contamination` and `max_samples` as needed:
+
+```bash
+python isolation_forest_anomaly_detection.py
+```
+
+### Step 5: Model Evaluation (Optional)
+If you have ground-truth labels, run `model_evaluation.py` to evaluate the performance of the anomaly detection models:
+
+```bash
+python model_evaluation.py
+```
+
+### Step 6: Generate Anomalies Report
+Run the `anomalies_report.py` script to generate and save a report of the detected anomalies:
+
+```bash
+python anomalies_report.py
+```
+
+### Step 7: Jupyter Notebook Workflow
+Alternatively, run the entire workflow in a Jupyter notebook for step-by-step execution and inline visualization:
+
+```bash
+jupyter notebook Anomaly_Detection_Project.ipynb
+```
+
+## Dependencies
+
+This project requires the following Python libraries:
+
+- `pandas`: For data manipulation and analysis.
+- `numpy`: For numerical operations.
+- `scikit-learn`: For machine learning models (KMeans and Isolation Forest).
+- `matplotlib`: For data visualization.
+- `seaborn`: For enhanced data visualizations.
+- `joblib`: For saving and loading models.
+
+You can install all dependencies with:
+
+```bash
+pip install -r requirements.txt
+```
+
+## Model Explanation
+
+- **KMeans Clustering**: This unsupervised learning algorithm groups data points into clusters. With two clusters, points in the smaller cluster (or far from every centroid) are treated as anomalous.
+- **Isolation Forest**: This tree-based algorithm isolates points by repeatedly selecting a random feature and a random split value. Anomalous points are isolated in fewer splits than normal points.
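+
+As a rough illustration of how each model scores anomalies, the sketch below ranks rows by anomaly score, assuming the models and `preprocessed_data.csv` produced by the scripts above:
+
+```python
+import joblib
+import numpy as np
+import pandas as pd
+
+data = pd.read_csv('preprocessed_data.csv')
+
+# Isolation Forest: lower decision_function scores mean more anomalous
+iso_forest = joblib.load('isolation_forest_model.pkl')
+iso_scores = iso_forest.decision_function(data)
+print("Most anomalous rows (Isolation Forest):", np.argsort(iso_scores)[:5])
+
+# KMeans: larger distance to the nearest centroid means more anomalous
+kmeans = joblib.load('kmeans_model.pkl')
+distances = kmeans.transform(data).min(axis=1)
+print("Most anomalous rows (KMeans):", np.argsort(distances)[-5:])
+```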
+
+## Results and Visualizations
+
+The results of the anomaly detection process are visualized in several ways:
+
+1. **PCA Visualizations**: The output of both models (KMeans and Isolation Forest) is projected to 2D with PCA (Principal Component Analysis) for easy visualization.
+2. **Anomaly Distribution**: The distribution of the features and the detected anomalies is shown using histograms and scatter plots.
+
+## Contributing
+
+We welcome contributions to this project! If you have suggestions or want to add new features, feel free to fork this repository and submit a pull request.
+
+Please follow these steps to contribute:
+
+1. Fork the repository.
+2. Clone your fork to your local machine.
+3. Create a new branch for your feature (`git checkout -b feature-name`).
+4. Commit your changes (`git commit -m 'Add new feature'`).
+5. Push to your forked repository (`git push origin feature-name`).
+6. Submit a pull request.
+
+---
+
+Thank you for checking out the **Anomaly Detection Project**!