protectai · seanpmorgan · Jan 5, 2024 · Jan 5, 2024 · Jan 5, 2024 · Jan 5, 2024
diff --git a/README.md b/README.md
@@ -82,15 +82,13 @@ This will be expanding continually, so look out for changes in our release notes
 
 At present, ModelScan supports any Pickle derived format and many others:
 
-| ML Library | API | Serialization Format                | modelscan support 
-| ----| ----|-------------------------------------| ----|
-| Pytorch | [torch.save() and torch.load()](https://pytorch.org/tutorials/beginner/saving_loading_models.html )| Pickle                              | Yes 
-| Tensorflow| [tf.saved_model.save()](https://www.tensorflow.org/guide/saved_model)| Protocol Buffer                     | Yes 
-| Keras| [keras.models.save(save_format= 'h5')](https://www.tensorflow.org/guide/keras/serialization_and_saving)| HD5 (Hierarchical Data Format)      | Yes 
-| | [keras.models.save(save_format= 'keras')](https://www.tensorflow.org/guide/keras/serialization_and_saving)| Keras V3 (Hierarchical Data Format) | Yes
-| Classic ML Libraries (Sklearn, XGBoost etc.)| pickle.dump(), dill.dump(), joblib.dump(), cloudpickle.dump() | Pickle, Cloudpickle, Dill, Joblib   | Yes 
-
-
+| ML Library                                   | API                                                                                                        | Serialization Format                | modelscan support |
+|----------------------------------------------|------------------------------------------------------------------------------------------------------------|-------------------------------------|-------------------|
+| PyTorch                                      | [torch.save() and torch.load()](https://pytorch.org/tutorials/beginner/saving_loading_models.html)         | Pickle                              | Yes               |
+| TensorFlow                                   | [tf.saved_model.save()](https://www.tensorflow.org/guide/saved_model)                                      | Protocol Buffer                     | Yes               |
+| Keras                                        | [keras.models.save(save_format= 'h5')](https://www.tensorflow.org/guide/keras/serialization_and_saving)    | HDF5 (Hierarchical Data Format)     | Yes               |
+|                                              | [keras.models.save(save_format= 'keras')](https://www.tensorflow.org/guide/keras/serialization_and_saving) | Keras V3 (Hierarchical Data Format) | Yes               |
+| Classic ML Libraries (Sklearn, XGBoost etc.) | pickle.dump(), dill.dump(), joblib.dump(), cloudpickle.dump()                                              | Pickle, Cloudpickle, Dill, Joblib   | Yes               |
 
 ### Installation 
 ModelScan is installed on your systems as a Python package(Python 3.8 to 3.11 supported). As shown from above you can install
@@ -111,18 +109,17 @@ modelscan = ">=0.1.1"
 
 ModelScan supports the following arguments via the CLI:
 
-| Usage                                 | Argument             | Explanation                                | 
-|---------------------------------------|----------------------| -------------------------------------------|
-| ```modelscan -h ```                   | -h or --help         | View usage help                            |
-| ```modelscan -v ```                   | -v or --version      | View version information                   |
-| ```modelscan -p /path/to/model_file```| -p or --path         | Scan a locally stored model                |
+| Usage                                                                            | Argument         | Explanation                                             | 
+|----------------------------------------------------------------------------------|------------------|---------------------------------------------------------|
+| ```modelscan -h ```                                                              | -h or --help     | View usage help                                         |
+| ```modelscan -v ```                                                              | -v or --version  | View version information                                |
+| ```modelscan -p /path/to/model_file```                                           | -p or --path     | Scan a locally stored model                             |
+| ```modelscan -p /path/to/model_file --settings-file ./modelscan-settings.toml``` | --settings-file  | Scan a locally stored model using custom configurations |
+| ```modelscan create-settings-file```                                             | -l or --location | Create a configurable settings file                     |
 
 
 Remember models are just like any other form of digital media, you should scan content from any untrusted source before use.
 
-**NOTE**: LLMs are large files, it can take a few minutes to download them before scanning. Expect the process
-to take just a few minutes to complete. 
-
 ##### CLI Exit Codes
 The CLI exit status codes are:
 - `0`: Scan completed successfully, no vulnerabilities found

diff --git a/modelscan/cli.py b/modelscan/cli.py
@@ -1,32 +1,48 @@
 import logging
 import sys
+import os
 from pathlib import Path
 from typing import Optional
+from tomlkit import parse
 
 import click
 
 from modelscan.modelscan import ModelScan
 from modelscan.reports import ConsoleReport
 from modelscan._version import __version__
+from modelscan.settings import SettingsUtils, DEFAULT_SETTINGS
+from modelscan.tools.cli_utils import DefaultGroup
 
 logger = logging.getLogger("modelscan")
 
 
 CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"])
 
 
-# redefine format_usage so the appropriate command name shows up
-class ModelscanCommand(click.Command):
-    def format_usage(self, ctx: click.Context, formatter: click.HelpFormatter) -> None:
-        pieces = self.collect_usage_pieces(ctx)
-        formatter.write_usage("modelscan", " ".join(pieces))
-
-
-@click.command(
+# Root command group for the `modelscan` CLI; subcommands register on it via
+# `@cli.command(...)`.
+# NOTE(review): `DefaultGroup` with `default="scan"` and `default_if_no_args=True`
+# presumably dispatches to the `scan` subcommand when none is named, matching the
+# "Scanning is the default action" help text below -- confirm against
+# modelscan.tools.cli_utils.
+@click.group(
+    "cli",
+    cls=DefaultGroup,
+    default="scan",
     context_settings=CONTEXT_SETTINGS,
-    cls=ModelscanCommand,
-    help="Modelscan detects machine learning model files that perform suspicious actions",
+    help="""
+    Modelscan detects machine learning model files that perform suspicious actions.
+
+    To scan a model file or directory, simply point toward your desired path:
+    `modelscan -p /path/to/model_file.h5` 
+
+    Scanning is the default action. If you'd like more information on configurations run:
+    `modelscan scan --help`
+
+    You can also create a configurable settings file using:
+    `modelscan create-settings-file`
+
+    """,
+    default_if_no_args=True,
 )
+def cli() -> None:
+    # The group callback itself does nothing; all work happens in subcommands.
+    pass
+
+
 @click.version_option(__version__, "-v", "--version")
 @click.option(
     "-p",
@@ -48,20 +64,45 @@ def format_usage(self, ctx: click.Context, formatter: click.HelpFormatter) -> No
     default=False,
     help="Print a list of files that were skipped during the scan",
 )
+@click.option(
+    "--settings-file",
+    type=click.Path(exists=True, dir_okay=False),
+    help="Specify a settings file to use for the scan. Defaults to ./modelscan-settings.toml.",
+)
+@cli.command(
+    help="[Default] Scan a model file or diretory for ability to execute suspicious actions. "
+)  # type: ignore
 @click.pass_context
-def cli(
+def scan(
     ctx: click.Context,
     log: str,
     path: Optional[str],
     show_skipped: bool,
+    settings_file: Optional[str],
 ) -> int:
     logger.setLevel(logging.INFO)
     logger.addHandler(logging.StreamHandler(stream=sys.stdout))
 
     if log is not None:
         logger.setLevel(getattr(logging, log))
 
-    modelscan = ModelScan()
+    settings_file_path = Path(
+        settings_file if settings_file else f"{os.getcwd()}/modelscan-settings.toml"
+    )
+
+    settings = DEFAULT_SETTINGS
+
+    if settings_file_path and settings_file_path.is_file():
+        with open(settings_file_path) as sf:
+            settings = parse(sf.read()).unwrap()
+            click.echo(f"Detected settings file. Using {settings_file_path}. \n")
+    else:
+        click.echo(
+            f"No settings file detected at {settings_file_path}. Using defaults. \n"
+        )
+
+    modelscan = ModelScan(settings=settings)
+
     if path is not None:
         pathlibPath = Path().cwd() if path == "." else Path(path).absolute()
         if not pathlibPath.exists():
@@ -91,6 +132,37 @@ def cli(
         return 0
 
 
+@cli.command("create-settings-file", help="Create a modelscan settings file")  # type: ignore
+@click.option(
+    "-f", "--force", is_flag=True, help="Overwrite existing settings file if it exists."
+)
+@click.option(
+    "-l",
+    "--location",
+    type=click.Path(dir_okay=False, writable=True),
+    help="The specific filepath to write the settings file.",
+)
+def create_settings(force: bool, location: Optional[str]) -> None:
+    working_dir = os.getcwd()
+    settings_path = os.path.join(working_dir, "modelscan-settings.toml")
+
+    if location:
+        settings_path = location
+
+    try:
+        open(settings_path)
+        if force:
+            with open(settings_path, "w") as settings_file:
+                settings_file.write(SettingsUtils.get_default_settings_as_toml())
+        else:
+            logger.warning(
+                f"{settings_path} file already exists. Please use `--force` flag if you intend to overwrite it."
+            )
+    except FileNotFoundError:
+        with open(settings_path, "w") as settings_file:
+            settings_file.write(SettingsUtils.get_default_settings_as_toml())
+
+
 def main() -> None:
     try:
         result = cli.main(standalone_mode=False)

diff --git a/modelscan/issues.py b/modelscan/issues.py
@@ -66,6 +66,9 @@ def __eq__(self, other: Any) -> bool:
             and str(self.details.source) == str(other.details.source)  # type: ignore[attr-defined]
         )
 
+    def __repr__(self) -> str:
+        return str(self.severity) + str(self.details)
+
     def __hash__(self) -> int:
         return hash(
             str(self.code)

diff --git a/modelscan/scanners/saved_model/scan.py b/modelscan/scanners/saved_model/scan.py
@@ -111,9 +111,9 @@ def _check_for_unsafe_tf_keras_operator(
         source: Union[str, Path],
         settings: Dict[str, Any],
     ) -> ScanResults:
-        unsafe_operators: Dict[str, IssueSeverity] = settings[
-            SavedModelScan.full_name()
-        ]["unsafe_tf_keras_operators"]
+        unsafe_operators: Dict[str, Any] = settings[SavedModelScan.full_name()][
+            "unsafe_tf_keras_operators"
+        ]
 
         issues: List[Issue] = []
         all_operators = tensorflow.raw_ops.__dict__.keys()
@@ -123,7 +123,7 @@ def _check_for_unsafe_tf_keras_operator(
 
         for op in raw_operator:
             if op in unsafe_operators:
-                severity = unsafe_operators[op]
+                severity = IssueSeverity[unsafe_operators[op]]
             elif op not in all_safe_operators:
                 severity = IssueSeverity.MEDIUM
             else:

diff --git a/modelscan/settings.py b/modelscan/settings.py
@@ -1,4 +1,7 @@
-from modelscan.issues import IssueSeverity
+import tomlkit
+
+from typing import Any
+
 
 DEFAULT_SCANNERS = [
     "modelscan.scanners.H5Scan",
@@ -24,9 +27,9 @@
             "enabled": True,
             "supported_extensions": [".pb"],
             "unsafe_tf_keras_operators": {
-                "ReadFile": IssueSeverity.HIGH,
-                "WriteFile": IssueSeverity.HIGH,
-                "Lambda": IssueSeverity.MEDIUM,
+                "ReadFile": "HIGH",
+                "WriteFile": "HIGH",
+                "Lambda": "MEDIUM",
             },
         },
         "modelscan.scanners.NumpyScan": {
@@ -50,24 +53,24 @@
         },
         "unsafe_globals": {
             "CRITICAL": {
-                "__builtin__": {
+                "__builtin__": [
                     "eval",
                     "compile",
                     "getattr",
                     "apply",
                     "exec",
                     "open",
                     "breakpoint",
-                },  # Pickle versions 0, 1, 2 have those function under '__builtin__'
-                "builtins": {
+                ],  # Pickle versions 0, 1, 2 have those function under '__builtin__'
+                "builtins": [
                     "eval",
                     "compile",
                     "getattr",
                     "apply",
                     "exec",
                     "open",
                     "breakpoint",
-                },  # Pickle versions 3, 4 have those function under 'builtins'
+                ],  # Pickle versions 3, 4 have those function under 'builtins'
                 "runpy": "*",
                 "os": "*",
                 "nt": "*",  # Alias for 'os' on Windows. Includes os.system()
@@ -87,3 +90,14 @@
         },
     },
 }
+
+
+class SettingsUtils:
+    @staticmethod
+    def get_default_settings_as_toml() -> Any:
+        toml_settings = tomlkit.dumps(DEFAULT_SETTINGS)
+
+        # Add settings file header
+        toml_settings = f"# ModelScan settings file\n\n{toml_settings}"
+
+        return toml_settings
diff --git a/modelscan/tools/LICENSE b/modelscan/tools/LICENSE
@@ -19,3 +19,34 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
+
+=========
+BSD 3-Clause
+
+Copyright (c) 2015-2023, Heungsub Lee
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+  Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+  Redistributions in binary form must reproduce the above copyright notice, this
+  list of conditions and the following disclaimer in the documentation and/or
+  other materials provided with the distribution.
+
+  Neither the name of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.