#major v3.0.0 Merge pull request #56 from stratosphereips/development

v3.0.0
stratosphereips · Oct 30, 2024 · 0234de5 · 0234de5
2 parents bd56abf + 2c04f48
commit 0234de5
Show file tree

Hide file tree

Showing 18 changed files with 345 additions and 233 deletions.
diff --git a/.dockerignore b/.dockerignore
@@ -0,0 +1,34 @@
+venv/
+env/
+*.pyc
+*.pyo
+*.pyd
+__pycache__/
+*.so
+*.egg
+*.egg-info/
+.eggs/
+.git/
+.gitignore
+.DS_Store
+Thumbs.db
+*.log
+*.swp
+*.tmp
+build/
+dist/
+*.egg-info/
+*.tar.gz
+*.zip
+node_modules/
+.idea/
+.vscode/
+*.sublime-project
+*.sublime-workspace
+.dockerignore
+Dockerfile
+Dockerfile_MacM1
+images/
+.github
+data/
+tests/
diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md
@@ -20,6 +20,10 @@ to follow when contributing:
 - refactor-<>:      pull request branch, contains code refactoring,
 
 
+## Tests
+
+Our project uses `unittest` for testing. To ensure code quality and maintainability, please run all tests before opening a pull request.
+
 ## Creating a pull request
 
 Commits:

diff --git a/.gitignore b/.gitignore
@@ -1,8 +1,8 @@
 .idea/
 .env
-*__pycache__*
-src/data/__pycache__/
-edna-run.sh
-automatic_run.sh
+__pycache__/
+*-run.sh
 venv/
+*.venv/
 *.swp
+
diff --git a/CITATION.cff b/CITATION.cff
@@ -0,0 +1,25 @@
+cff-version: 1.2.0
+title: "Stratosphere AIP: Attacker IP Prioritizer"
+message: 'If you use this software, please cite it as specified below.'
+url: "https://github.com/stratosphereips/AIP"
+type: software
+authors:
+  - given-names: Thomas
+    family-names: O'Hara
+  - given-names: Joaquin
+    family-names: Bogado
+    orcid: 'https://orcid.org/0000-0001-9491-5698'
+  - given-names: Veronica
+    family-names: Valeros
+    email: [email protected]
+    affiliation: >-
+      Stratosphere Laboratory, AIC, FEL, Czech
+      Technical University in Prague
+    orcid: 'https://orcid.org/0000-0003-2554-3231'
+  - given-names: Sebastian
+    family-names: Garcia
+    email: [email protected]
+    affiliation: >-
+      Stratosphere Laboratory, AIC, FEL, Czech
+      Technical University in Prague
+    orcid: 'https://orcid.org/0000-0001-6238-9910'
diff --git a/README.md b/README.md
@@ -1,12 +1,55 @@
 # Attacker IP Prioritization (AIP) Tool
-The Attacker IP Prioritization (AIP) is a tool to generate IP blocklists based on network traffic captured from honeypot networks. Originally designed to create the blocklists for the [Stratosphere Blocklist Generation project](https://mcfp.felk.cvut.cz/publicDatasets/CTU-AIPP-BlackList/), it aims to generate an IoT-friendly blocklist. With the advent of 5G, IoT devices will be directly connected to the Internet instead of being protected by a router's firewall. Therefore we need blocklists that are small and portable and designed to block those IPs that are targeting IoT devices. The main models used to this end are the Prioritize Consistent and the Prioritize New.
+The Attacker IP Prioritization (AIP) is a tool to generate efficient and economic IP blocklists based on network traffic captured from honeypot networks. 
 
+With the advent of 5G, IoT devices are often connected directly to the Internet without firewall protection. Therefore, we need blocklists that are small, efficient, and economical. The AIP structure is shown below.
 
-Eventually, the project evolved, aiming to test new blocklists generation models beyond the PN and PC. The actual codebase allows a fast developing and testing of those new models, providing a common interface to access the attacks from several sensors deployed on the Public Internet, and a common set of metrics to compare the output of the models.
+![Description of the AIP pipeline](images/AIP_Diagram.png "AIP Tool pipeline")
 
+## AIP Models
 
-Given a honeypot network in your organization, it should be easy to use AIP to generate your own local blocklists based on the traffic reaching the honeypots.
+Each AIP model generates its own blocklist based on specific criteria. The main models are:
 
-![Description of the AIP pipeline](images/AIP_Diagram.png "AIP Tool pipeline")
+1. **Prioritize New (PN)**
+    - Focuses on IPs that are new or have not been seen frequently in previous data.
+    - Useful to identify emerging attackers that are starting to target a network.
+2. **Prioritize Consistent (PC)**
+    - Focuses on IPs that have consistently attacked over time in previous data.
+    - Useful to identify persistent attackers that continuously target a network.
+3. **Alpha**
+    - Provides a baseline identifying all attackers seen in the last 24 hours.
+    - Useful to compare the effectiveness of other models.
+4. **Alpha7**
+    - Provides a baseline identifying all attackers seen in the last 7 days. 
+    - Useful to further compare the effectiveness of other models.
+5. **Random Forest**
+    - Focuses on IPs that are more likely to attack in the future.
+    - A more experimental approach to increase blocklist efficiency.
+
+
+## AIP Docker
+
+The best way to run AIP right now is using [Docker](etc/docker/README.md).
+
+## Usage
+
+AIP will automatically attempt to run all the models using the available data. Assuming the Zeek data is located in its usual location:
+
+```bash
+:~$ cd AIP
+:~$ docker run --rm -v /opt/zeek/logs/:/home/aip/AIP/data/raw:ro -v ${PWD}/data/:/home/aip/AIP/data/:rw --name aip stratosphereips/aip:latest bin/aip
+```
+
+To run AIP for a specific day:
+```bash
+:~$ cd AIP
+:~$ docker run --rm -v /opt/zeek/logs/:/home/aip/AIP/data/raw:ro -v ${PWD}/data/:/home/aip/AIP/data/:rw --name aip stratosphereips/aip:latest bin/aip --date YYYY-MM-DD
+```
+
+## License
+
+The Stratosphere AIP tool is licensed under [GNU General Public License v3.0](https://github.com/stratosphereips/AIP/blob/main/LICENSE).
 
+## About
+This tool was developed at the Stratosphere Laboratory at the Czech Technical University in Prague. This is part of the [Stratosphere blocklist generation project](https://mcfp.felk.cvut.cz/publicDatasets/CTU-AIPP-BlackList/).
 
+This tool was originally born from the bachelor thesis of Thomas O'Hara, [The Attacker IP Prioritizer: An IoT Optimized Blacklisting Algorithm (2021)](https://dspace.cvut.cz/handle/10467/96722).
diff --git a/bin/aip b/bin/aip
@@ -29,85 +29,72 @@ __license__ = "GPLv3"
 __maintainer__ = "Joaquin Bogado"
 __version__ = "1.0.0"
 
+import argparse
 import logging
-import pandas as pd
-
-from aip.data.access import data_path, project_dir
+from datetime import date
+from os import makedirs
+from os import path
+from aip.data.access import data_path
 from aip.models.alpha import Alpha
 from aip.models.prioritize import New
 from aip.models.prioritize import Consistent
 from aip.models.prioritize import RandomForest
-from pathlib import Path
-from os import makedirs, path, scandir
-from datetime import date, timedelta, datetime
-import sys
+from aip.utils.date_utils import validate_and_convert_date
+
+
+def run_model(aip_model_name, aip_model, date_day):
+    """
+    Run a given model with exception handling
+    """
+    blocklist=""
+    model_output_dir = path.join(data_path,'output',aip_model_name)
+    # Make sure output directory is created
+    if not path.exists(model_output_dir):
+        makedirs(model_output_dir)
 
-def validate_and_convert_date(date_str):
     try:
-        dateobj = datetime.strptime(date_str, '%Y-%m-%d')
-        return dateobj.date()
-    except ValueError as e:
-        print('Invalid date format. It should be YYYY-MM-DD')
-        raise e
+        blocklist = aip_model.run(date_day)
+        blocklist.to_csv(path.join(model_output_dir, f'AIP-{aip_model_name}-{str(date_day)}.csv.gz'), index=False, compression='gzip')
+        logging.info(f"{aip_model_name} model completed successfully.")
+    except Exception as e:
+        logging.error(f"Error running {aip_model_name} model: {e}", exc_info=True)
 
-#project_dir = Path(__file__).resolve().parents[1]
 
-if __name__ == '__main__':
-    if len(sys.argv) == 2:
-        datestr = sys.argv[1]
-        day = validate_and_convert_date(datestr)
-    else:
-        day = date.today()
+def main():
+    parser = argparse.ArgumentParser(description='Attacker IP Prioritization (AIP) Tool')
+    parser.add_argument('--date', type=str, help='The date for running the models in YYYY-MM-DD format. Defaults to today.', default=str(date.today()))
+    parser.add_argument('--model', type=str, choices=['Alpha', 'Alpha7', 'Prioritize_New', 'Prioritize_Consistent', 'Random_Forest', 'all'], default='all', help='Select AIP model to run. Defaults to all.')
+    parser.add_argument('-d', '--debug', required=False, help="Debugging mode.", action="store_const", dest="log_level", const=logging.DEBUG, default=logging.ERROR,)
+    parser.add_argument('-v', '--verbose', required=False, help="Verbose mode", action="store_const", dest="log_level", const=logging.INFO,)
 
+    args = parser.parse_args()
 
+    # Set up logging
     log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
-    #logging.basicConfig(level=logging.INFO, format=log_fmt)
-    logging.basicConfig(level=logging.DEBUG, format=log_fmt)
-
-
-    #Alpha Model
-    output_dir = path.join(project_dir, 'data', 'output', 'Alpha')
-    if not path.exists(output_dir):
-        makedirs(output_dir)
-    alpha = Alpha()
-    blocklist = alpha.run(day)
-    blocklist = blocklist.rename(columns={'ip':'attacker'})
-    pd.DataFrame(blocklist, columns=['attacker']).to_csv(path.join(output_dir,
-        f'AIP-Alpha-{str(day)}.csv.gz'), index=False, compression='gzip')
-
-    #Alpha 7 Model (seven days in the past)
-    output_dir = path.join(project_dir, 'data', 'output', 'Alpha7')
-    if not path.exists(output_dir):
-        makedirs(output_dir)
-    alpha7 = Alpha(lookback=7)
-    blocklist = alpha7.run(day)
-    blocklist = blocklist.rename(columns={'ip':'attacker'})
-    pd.DataFrame(blocklist, columns=['attacker']).to_csv(path.join(output_dir,
-        f'AIP-Alpha7-{str(day)}.csv.gz'), index=False, compression='gzip')
+    logging.basicConfig(level=args.log_level, format=log_fmt)
+
+    # Validate input date
+    run_date_day = validate_and_convert_date(args.date)
+
+    # Run Alpha Model
+    if args.model in ['Alpha', 'all']:
+        run_model('Alpha', Alpha(), run_date_day)
+
+    # Alpha 7 Model
+    if args.model in ['Alpha7', 'all']:
+        run_model('Alpha7', Alpha(lookback=7), run_date_day)
 
     # Prioritize New Model
-    output_dir = path.join(data_path, 'output', 'Prioritize_New')
-    if not path.exists(output_dir):
-        makedirs(output_dir)
-    pn = New()
-    blocklist = pn.run(day)
-    blocklist.to_csv(path.join(output_dir,
-        f'AIP-Prioritize_New-{str(day)}.csv.gz'), index=False, compression='gzip')
+    if args.model in ['Prioritize_New', 'all']:
+        run_model('Prioritize_New', New(), run_date_day)
 
     # Prioritize Consistent Model
-    output_dir = path.join(data_path, 'output', 'Prioritize_Consistent')
-    if not path.exists(output_dir):
-        makedirs(output_dir)
-    pc = Consistent()
-    blocklist = pc.run(day)
-    blocklist.to_csv(path.join(output_dir,
-        f'AIP-Prioritize_Consistent-{str(day)}.csv.gz'), index=False, compression='gzip')
-
+    if args.model in ['Prioritize_Consistent', 'all']:
+        run_model('Prioritize_Consistent', Consistent(), run_date_day)
+
     # Prioritize Random Forest Model
-    output_dir = path.join(data_path, 'output', 'random_forest')
-    if not path.exists(output_dir):
-        makedirs(output_dir)
-    rf = RandomForest()
-    blocklist = rf.run(day)
-    blocklist.to_csv(path.join(output_dir,
-        f'AIP-Random_Forest-{str(day)}.csv.gz'), index=False, compression='gzip')
+    if args.model in ['Random_Forest', 'all']:
+        run_model('Random_Forest', RandomForest(), run_date_day)
+
+if __name__ == '__main__':
+    main()
diff --git a/environment.yml b/environment.yml