Skip to content

Commit

Permalink
Add the ruff linter to the pre-commit check (#214)
Browse files Browse the repository at this point in the history
I've been finding the [ruff linter](https://docs.astral.sh/ruff/rules/)
really useful in some other projects, so thought I'd see if you guys
were interested in adding it here.

My motivation was mainly to get the import formatting standardised
(which black doesn't do) for easier diffing/merging.
But it's got a bunch of other really useful checks, especially when
working with pandas (though I've turned these off for now so as not to
confuse the PR diff).

It's also found and fixed a bunch of style/safety things and I've
flagged a couple of logic errors to be fixed also.

What do you guys think?
  • Loading branch information
SamRWest authored Mar 13, 2024
1 parent 1812cf5 commit 8acd004
Show file tree
Hide file tree
Showing 12 changed files with 260 additions and 214 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,4 @@ docs/api/
*.log
/profile.*
xl2times/.cache/
*.log.zip
9 changes: 9 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,20 @@ repos:
- id: check-yaml
- id: end-of-file-fixer
- id: trailing-whitespace

- repo: https://github.com/psf/black
rev: 22.8.0
hooks:
- id: black

- repo: https://github.com/RobertCraigie/pyright-python
rev: v1.1.304
hooks:
- id: pyright

- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.3.2
hooks:
- id: ruff
types_or: [ python, pyi, jupyter ]
args: [ --fix, --exit-non-zero-on-fix ]
37 changes: 36 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,8 @@ dev = [
"tabulate",
"pytest",
"pytest-cov",
"poethepoet"
"poethepoet",
"ruff"
]

[project.urls]
Expand All @@ -61,3 +62,37 @@ benchmark = { cmd = "python utils/run_benchmarks.py benchmarks.yml --run", help
benchmark_all = { shell = "python utils/run_benchmarks.py benchmarks.yml --verbose | tee out.txt", help = "Run the project", interpreter = "posix" }
lint = { shell = "git add .pre-commit-config.yaml & pre-commit run", help = "Run pre-commit hooks", interpreter = "posix" }
test = { cmd = "pytest --cov-report term --cov-report html --cov=xl2times --cov=utils", help = "Run unit tests with pytest" }


# Config for various pre-commit checks are below
# Ruff linting rules - see https://github.com/charliermarsh/ruff and https://beta.ruff.rs/docs/rules/
[tool.ruff]
target-version = "py311"
line-length = 88

# Option 1: use basic rules only.
lint.select = [
"E", # pycodestyle errors
"W", # pycodestyle warnings
"F", # pyflakes
"UP", # pyupgrade
"N", # pep8 naming
"I", # isort
"TID", # tidy imports
"UP", # pyupgrade
"NPY", # numpy style
"PL", # pylint
# "PD", # pandas style # TODO enable later
# "C90", # code complexity # TODO enable later
]

# Add specific rule codes/groups here to ignore them, or add a '#noqa' comment to the line of code to skip all checks.
lint.ignore = [
"PLR", # complexity rules
"PD901", "PD011", # pandas 'df''
"E501", # line too long, handled by black
]

# Ruff rule-specific options:
[tool.ruff.mccabe]
max-complexity = 12 # increase max function 'complexity'
8 changes: 4 additions & 4 deletions tests/test_transforms.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,16 @@

import pandas as pd

from xl2times import transforms, utils, datatypes
from xl2times import datatypes, transforms, utils
from xl2times.transforms import (
_process_comm_groups_vectorised,
_count_comm_group_vectorised,
_match_wildcards,
_process_comm_groups_vectorised,
commodity_map,
expand_rows,
get_matching_commodities,
get_matching_processes,
_match_wildcards,
process_map,
commodity_map,
)

logger = utils.get_logger()
Expand Down
3 changes: 2 additions & 1 deletion tests/test_utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from xl2times import utils
import pandas as pd

from xl2times import utils


class TestUtils:
def test_explode(self):
Expand Down
31 changes: 15 additions & 16 deletions utils/dd_to_csv.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
import argparse
import sys
from collections import defaultdict
import json
import os
import sys
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Tuple, Union

import numpy as np
import pandas as pd
Expand All @@ -13,7 +12,7 @@

def parse_parameter_values_from_file(
path: Path,
) -> Tuple[Dict[str, List], Dict[str, set]]:
) -> tuple[dict[str, list], dict[str, set]]:
"""
Parse *.dd to turn it into CSV format
There are parameters and sets, and each has a slightly different format
Expand All @@ -35,11 +34,11 @@ def parse_parameter_values_from_file(
"""

data = list(open(path, "r"))
data = list(open(path))
data = [line.rstrip() for line in data]

param_value_dict: Dict[str, List] = dict()
set_data_dict: Dict[str, set] = dict()
param_value_dict: dict[str, list] = dict()
set_data_dict: dict[str, set] = dict()
index = 0
while index < len(data):
if data[index].startswith("PARAMETER"):
Expand Down Expand Up @@ -124,8 +123,8 @@ def parse_parameter_values_from_file(


def save_data_with_headers(
param_data_dict: Dict[str, Union[pd.DataFrame, List[str]]],
headers_data: Dict[str, List[str]],
param_data_dict: dict[str, pd.DataFrame | list[str]],
headers_data: dict[str, list[str]],
save_dir: str,
) -> None:
"""
Expand Down Expand Up @@ -157,7 +156,7 @@ def save_data_with_headers(
return


def generate_headers_by_attr() -> Dict[str, List[str]]:
def generate_headers_by_attr() -> dict[str, list[str]]:
with open("xl2times/config/times-info.json") as f:
attributes = json.load(f)

Expand All @@ -173,7 +172,7 @@ def generate_headers_by_attr() -> Dict[str, List[str]]:


def convert_dd_to_tabular(
basedir: str, output_dir: str, headers_by_attr: Dict[str, List[str]]
basedir: str, output_dir: str, headers_by_attr: dict[str, list[str]]
) -> None:
dd_files = [p for p in Path(basedir).rglob("*.dd")]

Expand Down Expand Up @@ -201,15 +200,15 @@ def convert_dd_to_tabular(
os.makedirs(set_path, exist_ok=True)

# Extract headers with key=param_name and value=List[attributes]
lines = list(open("xl2times/config/times_mapping.txt", "r"))
lines = list(open("xl2times/config/times_mapping.txt"))
headers_data = headers_by_attr
# The following will overwrite data obtained from headers_by_attr
# TODO: Remove once migration is done?
for line in lines:
line = line.strip()
if line != "":
param_name = line.split("[")[0]
attributes = line.split("[")[1].split("]")[0].split(",")
ln = line.strip()
if ln != "":
param_name = ln.split("[")[0]
attributes = ln.split("[")[1].split("]")[0].split(",")
headers_data[param_name] = [*attributes]

save_data_with_headers(all_parameters, headers_data, param_path)
Expand Down
19 changes: 12 additions & 7 deletions utils/run_benchmarks.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,22 +9,22 @@
from concurrent.futures import ProcessPoolExecutor
from functools import partial
from os import path, symlink
from typing import Any, Tuple
from typing import Any

import git
import pandas as pd
import yaml
from dd_to_csv import main
from tabulate import tabulate

from dd_to_csv import main
from xl2times import utils
from xl2times.__main__ import parse_args, run
from xl2times.utils import max_workers

logger = utils.get_logger()


def parse_result(output: str) -> Tuple[float, int, int]:
def parse_result(output: str) -> tuple[float, int, int]:
# find pattern in multiline string
m = re.findall(
r"(\d+\.\d)% of ground truth rows present in output \((\d+)/(\d+)\), (\d+) additional rows",
Expand Down Expand Up @@ -65,6 +65,7 @@ def run_gams_gdxdiff(
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
text=True,
check=False,
)
if res.returncode != 0:
logger.info(res.stdout)
Expand Down Expand Up @@ -96,6 +97,7 @@ def run_gams_gdxdiff(
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
text=True,
check=False,
)
if res.returncode != 0:
logger.info(res.stdout)
Expand All @@ -119,6 +121,7 @@ def run_gams_gdxdiff(
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
text=True,
check=False,
)
if verbose:
logger.info(res.stdout)
Expand All @@ -138,7 +141,7 @@ def run_benchmark(
out_folder: str = "out",
verbose: bool = False,
debug: bool = False,
) -> Tuple[str, float, str, float, int, int]:
) -> tuple[str, float, str, float, int, int]:
xl_folder = path.join(benchmarks_folder, "xlsx", benchmark["input_folder"])
dd_folder = path.join(benchmarks_folder, "dd", benchmark["dd_folder"])
csv_folder = path.join(benchmarks_folder, "csv", benchmark["name"])
Expand All @@ -160,6 +163,7 @@ def run_benchmark(
stderr=subprocess.STDOUT,
text=True,
shell=True if os.name == "nt" else False,
check=False,
)
if res.returncode != 0:
# Remove partial outputs
Expand Down Expand Up @@ -191,7 +195,7 @@ def run_benchmark(
if "regions" in benchmark:
args.extend(["--regions", benchmark["regions"]])
if "inputs" in benchmark:
args.extend((path.join(xl_folder, b) for b in benchmark["inputs"]))
args.extend(path.join(xl_folder, b) for b in benchmark["inputs"])
else:
args.append(xl_folder)
start = time.time()
Expand All @@ -203,6 +207,7 @@ def run_benchmark(
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
text=True,
check=False,
)
else:
# If debug option is set, run as a function call to allow stepping with a debugger.
Expand Down Expand Up @@ -295,7 +300,6 @@ def run_all_benchmarks(
for benchmark in benchmarks:
with open(
path.join(benchmarks_folder, "out-main", benchmark["name"], "stdout"),
"r",
) as f:
result = parse_result(f.readlines()[-1])
# Use a fake runtime and GAMS result
Expand Down Expand Up @@ -330,7 +334,8 @@ def run_all_benchmarks(
results_main = list(executor.map(run_a_benchmark, benchmarks))

# Print table with combined results to make comparison easier
trunc = lambda s: s[:10] + "\u2026" if len(s) > 10 else s
trunc = lambda s: s[:10] + "\u2026" if len(s) > 10 else s # noqa

combined_results = [
(
f"{b:<20}",
Expand Down
Loading

0 comments on commit 8acd004

Please sign in to comment.