Notebook branch (#173)

+ gzip example, cruft removal
simplymathematics · Mar 20, 2024 · 9cbae22
1 parent 1c6db5b
commit 9cbae22
Show file tree

Hide file tree

Showing 116 changed files with 27,060 additions and 5,223 deletions.
diff --git a/.github/workflows/black.yml b/.github/workflows/black.yml
@@ -7,4 +7,6 @@ jobs:
       - uses: actions/checkout@v2
       - uses: psf/black@stable
         with:
+          options: "--check --verbose"
           src: "deckard/"
+          jupyter: true
diff --git a/.gitignore b/.gitignore
@@ -126,3 +126,19 @@ deckard/deckard.egg-info/*
 
 *log.txt
 *.hydra
+
+
+# envs
+env/
+
+
+# random pdfs
+*.pdf
+# random pngs
+*.png
+
+# screenlog
+screenlog.*
+
+# tmp.py
+tmp.py
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -1,36 +1,36 @@
 repos:
-  - repo: https://github.com/asottile/add-trailing-comma
-    rev: v2.2.3
-    hooks:
-    -   id: add-trailing-comma
-  - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.3.0  # Use the ref you want to point at
-    hooks:
-    -   id: check-builtin-literals
-    -   id: check-case-conflict
-    -   id: check-symlinks
-    -   id: check-toml
-    -   id: detect-private-key
-    -   id: end-of-file-fixer
-    -   id: check-yaml
-        args : ['--unsafe']
-  - repo: https://github.com/hadialqattan/pycln
-    rev: v2.1.1 # Possible releases: https://github.com/hadialqattan/pycln/releases
-    hooks:
-      - id: pycln
-        args: [deckard/]
-  - repo: https://github.com/pycqa/flake8
-    rev: '5.0.4'  # pick a git hash / tag to point to
-    hooks:
-    -   id: flake8
-        exclude: __init__.py
-        args: [--ignore=E501 W503]
-  - repo: https://github.com/psf/black
-    rev: 22.8.0
-    hooks:
-      - id: black
+- repo: https://github.com/asottile/add-trailing-comma
+  rev: v3.1.0
+  hooks:
+  - id: add-trailing-comma
+- repo: https://github.com/pre-commit/pre-commit-hooks
+  rev: v4.5.0    # Use the ref you want to point at
+  hooks:
+  - id: check-builtin-literals
+  - id: check-case-conflict
+  - id: check-symlinks
+  - id: check-toml
+  - id: detect-private-key
+  - id: end-of-file-fixer
+  - id: check-yaml
+    args: [--unsafe]
+- repo: https://github.com/hadialqattan/pycln
+  rev: v2.4.0   # Possible releases: https://github.com/hadialqattan/pycln/releases
+  hooks:
+  - id: pycln
+    args: [deckard/]
+- repo: https://github.com/psf/black
+  rev: 24.2.0
+  hooks:
+  - id: black
         # It is recommended to specify the latest version of Python
         # supported by your project here, or alternatively use
         # pre-commit's default_language_version, see
         # https://pre-commit.com/#top_level-default_language_version
-        language_version: python3
+    language_version: python3
+- repo: https://github.com/pycqa/flake8
+  rev: 7.0.0      # pick a git hash / tag to point to
+  hooks:
+  - id: flake8
+    exclude: __init__.py
+    args: [--ignore=E501 W503]
diff --git a/Dockerfile b/Dockerfile
@@ -6,8 +6,6 @@ RUN python3 -m pip install nvidia-pyindex nvidia-cuda-runtime-cu11
 RUN git clone https://github.com/simplymathematics/deckard.git
 WORKDIR /deckard
 RUN python3 -m pip install --editable .
-RUN python3 -m pip install pytest torch torchvision tensorflow
 RUN git clone https://github.com/Trusted-AI/adversarial-robustness-toolbox.git
 RUN cd adversarial-robustness-toolbox && python3 -m pip install .
 RUN apt install python-is-python3
-RUN pytest test 
diff --git a/deckard/__init__.py b/deckard/__init__.py
@@ -46,7 +46,7 @@
         },
     },
     "loggers": {
-        "deckard": {"handlers": ["default"]},
+        "deckard": {"handlers": ["default"], "level": "INFO", "propagate": True},
         "tests": {"handlers": ["test"], "level": "DEBUG", "propagate": True},
     },
 }

diff --git a/deckard/__main__.py b/deckard/__main__.py
@@ -1,5 +1,4 @@
-""""Runs a submodule passed as an arg."""
-
+#!/usr/bin/env python3
 import argparse
 import subprocess
 import logging
@@ -44,9 +43,11 @@ def parse_and_repro(args, default_config="default.yaml", config_dir="conf"):
     if len(args) == 0:
         assert (
             save_params_file(
-                config_dir=Path(Path(), config_dir)
-                if not Path(config_dir).is_absolute()
-                else Path(config_dir),
+                config_dir=(
+                    Path(Path(), config_dir)
+                    if not Path(config_dir).is_absolute()
+                    else Path(config_dir)
+                ),
                 config_file=default_config,
             )
             is None

diff --git a/deckard/base/attack/attack.py b/deckard/base/attack/attack.py
@@ -8,6 +8,9 @@
 from omegaconf import DictConfig, OmegaConf
 from hydra.utils import instantiate
 from art.utils import to_categorical, compute_success
+from sklearn.utils.validation import check_is_fitted
+from sklearn.base import BaseEstimator
+from sklearn.exceptions import NotFittedError
 from random import randint
 from ..data import Data
 from ..model import Model
@@ -117,7 +120,13 @@ class EvasionAttack:
     kwargs: Union[dict, None] = field(default_factory=dict)
 
     def __init__(
-        self, name: str, data: Data, model: Model, init: dict, attack_size=-1, **kwargs
+        self,
+        name: str,
+        data: Data,
+        model: Model,
+        init: dict,
+        attack_size=-1,
+        **kwargs,
     ):
         self.name = name
         self.data = data
@@ -148,6 +157,10 @@ def __call__(
         if attack_file is not None and Path(attack_file).exists():
             samples = self.data.load(attack_file)
         else:
+            print(f"Type of self.init: {type(self.init)}")
+            print(f"Type of self.init.model: {type(self.init.model)}")
+            print(f"Type of model: {type(model)}")
+
             atk = self.init(model=model, attack_size=self.attack_size)
 
             if targeted is True:
@@ -466,7 +479,13 @@ class InferenceAttack:
     kwargs: Union[dict, None] = field(default_factory=dict)
 
     def __init__(
-        self, name: str, data: Data, model: Model, init: dict, attack_size=-1, **kwargs
+        self,
+        name: str,
+        data: Data,
+        model: Model,
+        init: dict,
+        attack_size=-1,
+        **kwargs,
     ):
         self.name = name
         self.data = data
@@ -577,7 +596,13 @@ class ExtractionAttack:
     kwargs: Union[dict, None] = field(default_factory=dict)
 
     def __init__(
-        self, name: str, data: Data, model: Model, init: dict, attack_size=-1, **kwargs
+        self,
+        name: str,
+        data: Data,
+        model: Model,
+        init: dict,
+        attack_size=-1,
+        **kwargs,
     ):
         self.name = name
         self.data = data
@@ -798,12 +823,21 @@ def __call__(
         adv_predictions_file=None,
         adv_probabilities_file=None,
         adv_losses_file=None,
+        **kwargs,
     ):
         name = self.init.name
         kwargs = deepcopy(self.kwargs)
         kwargs.update({"init": self.init.kwargs})
         data = self.data()
         data, model = self.model.initialize(data)
+        if isinstance(model, BaseEstimator):
+            try:
+                check_is_fitted(model), "Model must be fitted before calling attack."
+            except NotFittedError as e:
+                logger.warning(
+                    f"Model not fitted. Fitting model before attack. Error: {e}",
+                )
+                model, _ = self.model.fit(data=data, model=model)
         if "art" not in str(type(model)):
             model = self.model.art(model=model, data=data)
         if self.method == "evasion":

diff --git a/deckard/base/data/data.py b/deckard/base/data/data.py
@@ -4,10 +4,10 @@
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Union
-
 import numpy as np
 from pandas import DataFrame, read_csv, Series
-
+from omegaconf import OmegaConf
+from validators import url
 from ..utils import my_hash
 from .generator import DataGenerator
 from .sampler import SklearnDataSampler
@@ -28,6 +28,7 @@ class Data:
     )
     target: Union[str, None] = None
     name: Union[str, None] = None
+    drop: list = field(default_factory=list)
 
     def __init__(
         self,
@@ -36,6 +37,8 @@ def __init__(
         sample: SklearnDataSampler = None,
         sklearn_pipeline: SklearnDataPipeline = None,
         target: str = None,
+        drop: list = [],
+        **kwargs,
     ):
         """Initialize the data object. If the data is generated, then generate the data and sample it. If the data is loaded, then load the data and sample it.
 
@@ -46,9 +49,6 @@ def __init__(
             sklearn_pipeline (SklearnDataPipeline, optional): The sklearn pipeline. Defaults to None.
             target (str, optional): The target column. Defaults to None.
         """
-        logger.info(
-            f"Instantiating {self.__class__.__name__} with name={name} and generate={generate} and sample={sample} and sklearn_pipeline={sklearn_pipeline} and target={target}",
-        )
         if generate is not None:
             self.generate = (
                 generate
@@ -66,16 +66,19 @@ def __init__(
         else:
             self.sample = SklearnDataSampler()
         if sklearn_pipeline is not None:
+            sklearn_pipeline = OmegaConf.to_container(
+                OmegaConf.create(sklearn_pipeline),
+            )
             self.sklearn_pipeline = (
                 sklearn_pipeline
-                if isinstance(sklearn_pipeline, (SklearnDataPipeline, type(None)))
+                if isinstance(sklearn_pipeline, (SklearnDataPipeline))
                 else SklearnDataPipeline(**sklearn_pipeline)
             )
         else:
             self.sklearn_pipeline = None
+        self.drop = drop
         self.target = target
         self.name = name if name is not None else my_hash(self)
-        logger.debug(f"Instantiating Data with id: {self.get_name()}")
 
     def get_name(self):
         """Get the name of the data object."""
@@ -91,7 +94,6 @@ def initialize(self, filename=None):
         """
         if filename is not None and Path(filename).exists():
             result = self.load(filename)
-            assert len(result) == 4, f"Data is not generated: {self.name}"
         elif self.generate is not None:
             result = self.generate()
         else:
@@ -100,14 +102,23 @@ def initialize(self, filename=None):
             assert self.target is not None, "Target is not specified"
             y = result[self.target]
             X = result.drop(self.target, axis=1)
-            X = np.array(X)
-            y = np.array(y)
+            if self.drop != []:
+                X = X.drop(self.drop, axis=1)
+            X = X.to_numpy()
+            y = y.to_numpy()
             result = [X, y]
+        else:
+            if self.drop != []:
+                raise ValueError(
+                    f"Drop is not supported for non-DataFrame data. Data is type {type(result)}",
+                )
         if len(result) == 2:
             result = self.sample(*result)
         assert (
             len(result) == 4
         ), f"Data is not generated: {self.name} {result}. Length: {len(result)},"
+        if self.sklearn_pipeline is not None:
+            result = self.sklearn_pipeline(*result)
         return result
 
     def load(self, filename) -> DataFrame:
@@ -125,6 +136,8 @@ def load(self, filename) -> DataFrame:
         elif suffix in [".pkl", ".pickle"]:
             with open(filename, "rb") as f:
                 data = pickle.load(f)
+        elif suffix in [".npz"]:
+            data = np.load(filename)
         else:  # pragma: no cover
             raise ValueError(f"Unknown file type {suffix}")
         return data
@@ -138,6 +151,10 @@ def save(self, data, filename):
             logger.info(f"Saving data to {filename}")
             suffix = Path(filename).suffix
             Path(filename).parent.mkdir(parents=True, exist_ok=True)
+            if isinstance(data, dict):
+                for k, v in data.items():
+                    v = str(v)
+                    data[k] = v
             if suffix in [".json"]:
                 if isinstance(data, (Series, DataFrame)):
                     data = data.to_dict()
@@ -155,16 +172,20 @@ def save(self, data, filename):
                 else:  # pragma: no cover
                     raise ValueError(f"Unknown data type {type(data)} for {filename}.")
                 with open(filename, "w") as f:
-                    json.dump(data, f)
+                    json.dump(data, f, indent=4, sort_keys=True)
             elif suffix in [".csv"]:
                 assert isinstance(
                     data,
                     (Series, DataFrame, dict, np.ndarray),
                 ), f"Data must be a Series, DataFrame, or dict, not {type(data)} to save to {filename}"
-                DataFrame(data).to_csv(filename, index=False)
+                if isinstance(data, (np.ndarray)):
+                    data = DataFrame(data)
+                data.to_csv(filename, index=False)
             elif suffix in [".pkl", ".pickle"]:
                 with open(filename, "wb") as f:
                     pickle.dump(data, f)
+            elif suffix in [".npz"]:
+                np.savez(filename, data)
             else:  # pragma: no cover
                 raise ValueError(f"Unknown file type {type(suffix)} for {suffix}")
             assert Path(filename).exists()
@@ -174,19 +195,19 @@ def __call__(
         data_file=None,
         train_labels_file=None,
         test_labels_file=None,
+        **kwargs,
     ) -> list:
         """Loads data from file if it exists, otherwise generates data and saves it to file. Returns X_train, X_test, y_train, y_test as a list of arrays, typed according to the framework.
         :param filename: str
         :return: list
         """
-        result_dict = {}
-        if data_file is not None and Path(data_file).exists():
-            data = self.load(data_file)
-            assert len(data) == 4, f"Some data is missing: {self.name}"
+        if Path(self.name).is_file() or url(self.name):
+            new_data_file = data_file
+            data_file = self.name
         else:
-            data = self.initialize(filename=data_file)
-            assert len(data) == 4, f"Some data is missing: {self.name}"
-            data_file = self.save(data, data_file)
+            new_data_file = data_file
+        result_dict = {}
+        data = self.initialize(data_file)
         result_dict["data"] = data
         if train_labels_file is not None:
             self.save(data[2], train_labels_file)
@@ -198,4 +219,6 @@ def __call__(
             assert Path(
                 test_labels_file,
             ).exists(), f"Error saving test labels to {test_labels_file}"
+        if new_data_file is not None:
+            self.save(data, new_data_file)
         return data