Skip to content

Commit

Permalink
make release-tag: Merge branch 'main' into stable
Browse files Browse the repository at this point in the history
  • Loading branch information
amontanez24 committed Sep 26, 2024
2 parents 44224df + 93788f6 commit 1192d1a
Show file tree
Hide file tree
Showing 29 changed files with 1,682 additions and 34 deletions.
80 changes: 80 additions & 0 deletions .github/workflows/dtypes_benchmark.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
name: Data Types Benchmark

on:
  push:
    branches:
      - main

# Cancel superseded runs on rapid pushes to main, consistent with the
# concurrency blocks used by the other workflows in this repository.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  run_dtypes_benchmark:
    runs-on: ubuntu-latest

    strategy:
      matrix:
        python-version: ['3.8', '3.9', '3.10', '3.11', '3.12']

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          python -m pip install invoke .[test]

      # Pre-create the per-version JSON file the benchmark writes into.
      - name: Create folder and JSON file
        run: |
          mkdir -p results
          touch results/${{ matrix.python-version }}.json

      # Run the benchmarking
      - name: Benchmark Data Types
        env:
          PYDRIVE_CREDENTIALS: ${{ secrets.PYDRIVE_CREDENTIALS }}
        run: |
          invoke benchmark-dtypes

      # Upload the json files as artifacts. v3 of upload-artifact is
      # deprecated/shut down; v4 requires a unique artifact name per matrix
      # job, which results-<version> already satisfies.
      - name: Upload artifacts
        uses: actions/upload-artifact@v4
        with:
          name: results-${{ matrix.python-version }}
          path: results/*.json

  generate_dtypes_report:
    runs-on: ubuntu-latest
    needs: run_dtypes_benchmark

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      # Set up Python 3.10
      - name: Set up Python 3.10
        uses: actions/setup-python@v4
        with:
          python-version: '3.10'

      - name: Install dependencies for report
        run: |
          python -m pip install --upgrade pip
          python -m pip install .[test]

      # Download all results-* artifacts. With no `name` given, v4 (like v3)
      # places each artifact in results/<artifact-name>/, preserving the
      # layout the report script reads.
      - name: Download artifacts
        uses: actions/download-artifact@v4
        with:
          path: results/

      # Generate the report
      - name: Generate the report
        env:
          PYDRIVE_CREDENTIALS: ${{ secrets.PYDRIVE_CREDENTIALS }}
          SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}
        run: python -m tests.benchmark.utils
5 changes: 5 additions & 0 deletions .github/workflows/install.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,11 @@ on:
push:
branches:
- main

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
install:
name: ${{ matrix.python_version }} install
Expand Down
11 changes: 10 additions & 1 deletion .github/workflows/integration.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@ on:
pull_request:
types: [opened, reopened]

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
integration:
runs-on: ${{ matrix.os }}
Expand All @@ -29,4 +33,9 @@ jobs:
python -m pip install --upgrade pip
python -m pip install invoke .[test]
- name: Run integration tests
run: invoke integration
env:
PYDRIVE_CREDENTIALS: ${{ secrets.PYDRIVE_CREDENTIALS }}

run: |
invoke integration
invoke benchmark-dtypes
4 changes: 4 additions & 0 deletions .github/workflows/minimum.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@ on:
pull_request:
types: [opened, reopened]

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
minimum:
runs-on: ${{ matrix.os }}
Expand Down
4 changes: 4 additions & 0 deletions .github/workflows/unit.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@ on:
pull_request:
types: [opened, reopened]

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
unit:
runs-on: ${{ matrix.os }}
Expand Down
12 changes: 11 additions & 1 deletion HISTORY.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,16 @@
# Release Notes

### v1.16.1 - 2024-08-27
### v1.16.2 - 2024-09-25

### New Features

* Supported data types benchmark - Issue [#2200](https://github.com/sdv-dev/SDV/issues/2200) by @pvk-developer

### Bugs Fixed

* The `_validate_circular_relationships` method may fail to detect circular relationships - Issue [#2205](https://github.com/sdv-dev/SDV/issues/2205) by @fealho

## v1.16.1 - 2024-08-27

### Internal

Expand Down
6 changes: 3 additions & 3 deletions latest_requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@ ctgan==0.10.1
deepecho==0.6.0
graphviz==0.20.3
numpy==1.26.4
pandas==2.2.2
platformdirs==4.2.2
rdt==1.12.3
pandas==2.2.3
platformdirs==4.3.6
rdt==1.12.4
sdmetrics==0.15.1
tqdm==4.66.5
9 changes: 7 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ dependencies = [
'ctgan>=0.10.0',
'deepecho>=0.6.0',
'rdt>=1.12.3',
'sdmetrics>=0.14.0',
'sdmetrics>=0.16.0',
'platformdirs>=4.0',
'pyyaml>=6.0.1',
]
Expand All @@ -62,6 +62,10 @@ test = [
'rundoc>=0.4.3,<0.5',
'pytest-runner >= 2.11.1',
'tomli>=2.0.0,<3',
'pydrive',
'pyarrow',
'gitpython',
'slack-sdk>=3.23,<4.0',
]
pomegranate = ['pomegranate>=0.14.3,<0.15']
dev = [
Expand Down Expand Up @@ -132,7 +136,7 @@ namespaces = false
version = {attr = 'sdv.__version__'}

[tool.bumpversion]
current_version = "1.16.1"
current_version = "1.16.2.dev1"
parse = '(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?'
serialize = [
'{major}.{minor}.{patch}.{release}{candidate}',
Expand Down Expand Up @@ -181,6 +185,7 @@ exclude = [
".tox",
".git",
"__pycache__",
"*.ipynb",
".ipynb_checkpoints",
"tasks.py",
]
Expand Down
2 changes: 1 addition & 1 deletion sdv/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

__author__ = 'DataCebo, Inc.'
__email__ = '[email protected]'
__version__ = '1.16.1'
__version__ = '1.16.2.dev1'


import sys
Expand Down
10 changes: 10 additions & 0 deletions sdv/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from pathlib import Path

import pandas as pd
from pandas.api.types import is_float, is_integer
from pandas.core.tools.datetimes import _guess_datetime_format_for_array
from rdt.transformers.utils import _GENERATORS

Expand Down Expand Up @@ -81,6 +82,7 @@ def _is_datetime_type(value):
bool(_get_datetime_format([value]))
or isinstance(value, pd.Timestamp)
or isinstance(value, datetime)
or (isinstance(value, str) and pd.notna(pd.to_datetime(value, errors='coerce')))
):
return False

Expand Down Expand Up @@ -439,3 +441,11 @@ def get_possible_chars(regex, num_subpatterns=None):
possible_chars += _get_chars_for_option(option, params)

return possible_chars


def _is_numerical(value):
"""Determine if the input is a numerical type or not."""
try:
return is_integer(value) or is_float(value)
except Exception:
return False
20 changes: 7 additions & 13 deletions sdv/constraints/tabular.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
import numpy as np
import pandas as pd

from sdv._utils import _convert_to_timedelta, _create_unique_name, _is_datetime_type
from sdv._utils import _convert_to_timedelta, _create_unique_name, _is_datetime_type, _is_numerical
from sdv.constraints.base import Constraint
from sdv.constraints.errors import (
AggregateConstraintsError,
Expand Down Expand Up @@ -604,7 +604,7 @@ def _validate_metadata_specific_to_constraint(metadata, **kwargs):
sdtype = metadata.columns.get(column_name, {}).get('sdtype')
value = kwargs.get('value')
if sdtype == 'numerical':
if not isinstance(value, (int, float)):
if not _is_numerical(value):
raise ConstraintMetadataError("'value' must be an int or float.")

elif sdtype == 'datetime':
Expand Down Expand Up @@ -632,7 +632,7 @@ def _validate_init_inputs(column_name, value, relation):
if relation not in ['>', '>=', '<', '<=']:
raise ValueError('`relation` must be one of the following: `>`, `>=`, `<`, `<=`')

if not (isinstance(value, (int, float)) or value_is_datetime):
if not (_is_numerical(value) or value_is_datetime):
raise ValueError('`value` must be a number or a string that represents a datetime.')

if value_is_datetime and not isinstance(value, str):
Expand Down Expand Up @@ -1071,9 +1071,7 @@ def _validate_init_inputs(low_value, high_value):
if values_are_datetimes and not values_are_strings:
raise ValueError('Datetime must be represented as a string.')

values_are_numerical = bool(
isinstance(low_value, (int, float)) and isinstance(high_value, (int, float))
)
values_are_numerical = bool(_is_numerical(low_value) and _is_numerical(high_value))
if not (values_are_numerical or values_are_datetimes):
raise ValueError(
'``low_value`` and ``high_value`` must be a number or a string that '
Expand All @@ -1092,7 +1090,7 @@ def _validate_metadata_specific_to_constraint(metadata, **kwargs):
high_value = kwargs.get('high_value')
low_value = kwargs.get('low_value')
if sdtype == 'numerical':
if not isinstance(high_value, (int, float)) or not isinstance(low_value, (int, float)):
if not _is_numerical(high_value) or not _is_numerical(low_value):
raise ConstraintMetadataError(
"Both 'high_value' and 'low_value' must be ints or floats"
)
Expand Down Expand Up @@ -1187,11 +1185,7 @@ def is_valid(self, table_data):
self._operator(data, self._high_value),
pd.isna(self._high_value),
)

return np.logical_or(
np.logical_and(satisfy_low_bound, satisfy_high_bound),
pd.isna(data),
)
return (satisfy_low_bound & satisfy_high_bound) | pd.isna(data)

def _transform(self, table_data):
"""Transform the table data.
Expand Down Expand Up @@ -1250,7 +1244,7 @@ def _reverse_transform(self, table_data):
table_data[self._column_name] = data.round().astype(self._dtype)

else:
table_data[self._column_name] = data.astype(self._dtype)
table_data[self._column_name] = data.astype(self._dtype, errors='ignore')

table_data = table_data.drop(self._transformed_column, axis=1)
return table_data
Expand Down
12 changes: 6 additions & 6 deletions sdv/metadata/multi_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,26 +126,26 @@ def _validate_relationship_sdtypes(
)

def _validate_circular_relationships(
self, parent, children=None, parents=None, child_map=None, errors=None
self, parent, children=None, visited=None, child_map=None, errors=None
):
"""Validate that there is no circular relationship in the metadata."""
parents = set() if parents is None else parents
visited = set() if visited is None else visited
if children is None:
children = child_map[parent]

if parent in children:
errors.append(parent)

for child in children:
if child in parents:
break
if child in visited:
continue

parents.add(child)
visited.add(child)
self._validate_circular_relationships(
parent,
children=child_map.get(child, set()),
child_map=child_map,
parents=parents,
visited=visited,
errors=errors,
)

Expand Down
15 changes: 13 additions & 2 deletions sdv/multi_table/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,9 +70,10 @@ def _set_temp_numpy_seed(self):
def _initialize_models(self):
with disable_single_table_logger():
for table_name, table_metadata in self.metadata.tables.items():
synthesizer_parameters = self._table_parameters.get(table_name, {})
synthesizer_parameters = {'locales': self.locales}
synthesizer_parameters.update(self._table_parameters.get(table_name, {}))
self._table_synthesizers[table_name] = self._synthesizer(
metadata=table_metadata, locales=self.locales, **synthesizer_parameters
metadata=table_metadata, **synthesizer_parameters
)
self._table_synthesizers[table_name]._data_processor.table_name = table_name

Expand Down Expand Up @@ -340,6 +341,10 @@ def _store_and_convert_original_cols(self, data):
data[table] = dataframe
return list_of_changed_tables

def _transform_helper(self, data):
"""Stub method for transforming data patterns."""
return data

def preprocess(self, data):
"""Transform the raw data to numerical space.
Expand All @@ -353,6 +358,7 @@ def preprocess(self, data):
"""
list_of_changed_tables = self._store_and_convert_original_cols(data)

data = self._transform_helper(data)
self.validate(data)
if self._fitted:
warnings.warn(
Expand Down Expand Up @@ -471,6 +477,10 @@ def reset_sampling(self):
def _sample(self, scale):
raise NotImplementedError()

def _reverse_transform_helper(self, sampled_data):
"""Stub method for reverse transforming data patterns."""
return sampled_data

def sample(self, scale=1.0):
"""Generate synthetic data for the entire dataset.
Expand All @@ -495,6 +505,7 @@ def sample(self, scale=1.0):

with self._set_temp_numpy_seed(), disable_single_table_logger():
sampled_data = self._sample(scale=scale)
sampled_data = self._reverse_transform_helper(sampled_data)

total_rows = 0
total_columns = 0
Expand Down
Loading

0 comments on commit 1192d1a

Please sign in to comment.