Skip to content

Commit

Permalink
make release-tag: Merge branch 'main' into stable
Browse files Browse the repository at this point in the history
  • Loading branch information
amontanez24 committed Sep 26, 2024
2 parents 44224df + 93788f6 commit 1192d1a
Show file tree
Hide file tree
Showing 29 changed files with 1,682 additions and 34 deletions.
80 changes: 80 additions & 0 deletions .github/workflows/dtypes_benchmark.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
name: Data Types Benchmark

on:
  push:
    branches:
      - main

# Cancel superseded runs on rapid pushes to main, consistent with the
# concurrency blocks used by the other workflows in this repository.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  run_dtypes_benchmark:
    runs-on: ubuntu-latest

    strategy:
      matrix:
        python-version: ['3.8', '3.9', '3.10', '3.11', '3.12']

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          python -m pip install invoke .[test]

      # Pre-create the per-version JSON file the benchmark writes into.
      - name: Create folder and JSON file
        run: |
          mkdir -p results
          touch results/${{ matrix.python-version }}.json

      # Run the benchmarking
      - name: Benchmark Data Types
        env:
          PYDRIVE_CREDENTIALS: ${{ secrets.PYDRIVE_CREDENTIALS }}
        run: |
          invoke benchmark-dtypes

      # Upload the json files as artifacts. v3 of upload-artifact is
      # deprecated/shut down; v4 requires a unique artifact name per matrix
      # job, which results-<version> already satisfies.
      - name: Upload artifacts
        uses: actions/upload-artifact@v4
        with:
          name: results-${{ matrix.python-version }}
          path: results/*.json

  generate_dtypes_report:
    runs-on: ubuntu-latest
    needs: run_dtypes_benchmark

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      # Set up Python 3.10
      - name: Set up Python 3.10
        uses: actions/setup-python@v4
        with:
          python-version: '3.10'

      - name: Install dependencies for report
        run: |
          python -m pip install --upgrade pip
          python -m pip install .[test]

      # Download all results-* artifacts. With no `name` given, v4 (like v3)
      # places each artifact in results/<artifact-name>/, preserving the
      # layout the report script reads.
      - name: Download artifacts
        uses: actions/download-artifact@v4
        with:
          path: results/

      # Generate the report
      - name: Generate the report
        env:
          PYDRIVE_CREDENTIALS: ${{ secrets.PYDRIVE_CREDENTIALS }}
          SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}
        run: python -m tests.benchmark.utils
5 changes: 5 additions & 0 deletions .github/workflows/install.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,11 @@ on:
push:
branches:
- main

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
install:
name: ${{ matrix.python_version }} install
Expand Down
11 changes: 10 additions & 1 deletion .github/workflows/integration.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@ on:
pull_request:
types: [opened, reopened]

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
integration:
runs-on: ${{ matrix.os }}
Expand All @@ -29,4 +33,9 @@ jobs:
python -m pip install --upgrade pip
python -m pip install invoke .[test]
- name: Run integration tests
run: invoke integration
env:
PYDRIVE_CREDENTIALS: ${{ secrets.PYDRIVE_CREDENTIALS }}

run: |
invoke integration
invoke benchmark-dtypes
4 changes: 4 additions & 0 deletions .github/workflows/minimum.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@ on:
pull_request:
types: [opened, reopened]

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
minimum:
runs-on: ${{ matrix.os }}
Expand Down
4 changes: 4 additions & 0 deletions .github/workflows/unit.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@ on:
pull_request:
types: [opened, reopened]

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
unit:
runs-on: ${{ matrix.os }}
Expand Down
12 changes: 11 additions & 1 deletion HISTORY.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,16 @@
# Release Notes

### v1.16.1 - 2024-08-27
### v1.16.2 - 2024-09-25

### New Features

* Supported data types benchmark - Issue [#2200](https://github.com/sdv-dev/SDV/issues/2200) by @pvk-developer

### Bugs Fixed

* The `_validate_circular_relationships` method may fail to detect circular relationships - Issue [#2205](https://github.com/sdv-dev/SDV/issues/2205) by @fealho

## v1.16.1 - 2024-08-27

### Internal

Expand Down
6 changes: 3 additions & 3 deletions latest_requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@ ctgan==0.10.1
deepecho==0.6.0
graphviz==0.20.3
numpy==1.26.4
pandas==2.2.2
platformdirs==4.2.2
rdt==1.12.3
pandas==2.2.3
platformdirs==4.3.6
rdt==1.12.4
sdmetrics==0.15.1
tqdm==4.66.5
9 changes: 7 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ dependencies = [
'ctgan>=0.10.0',
'deepecho>=0.6.0',
'rdt>=1.12.3',
'sdmetrics>=0.14.0',
'sdmetrics>=0.16.0',
'platformdirs>=4.0',
'pyyaml>=6.0.1',
]
Expand All @@ -62,6 +62,10 @@ test = [
'rundoc>=0.4.3,<0.5',
'pytest-runner >= 2.11.1',
'tomli>=2.0.0,<3',
'pydrive',
'pyarrow',
'gitpython',
'slack-sdk>=3.23,<4.0',
]
pomegranate = ['pomegranate>=0.14.3,<0.15']
dev = [
Expand Down Expand Up @@ -132,7 +136,7 @@ namespaces = false
version = {attr = 'sdv.__version__'}

[tool.bumpversion]
current_version = "1.16.1"
current_version = "1.16.2.dev1"
parse = '(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?'
serialize = [
'{major}.{minor}.{patch}.{release}{candidate}',
Expand Down Expand Up @@ -181,6 +185,7 @@ exclude = [
".tox",
".git",
"__pycache__",
"*.ipynb",
".ipynb_checkpoints",
"tasks.py",
]
Expand Down
2 changes: 1 addition & 1 deletion sdv/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

__author__ = 'DataCebo, Inc.'
__email__ = '[email protected]'
__version__ = '1.16.1'
__version__ = '1.16.2.dev1'


import sys
Expand Down
10 changes: 10 additions & 0 deletions sdv/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from pathlib import Path

import pandas as pd
from pandas.api.types import is_float, is_integer
from pandas.core.tools.datetimes import _guess_datetime_format_for_array
from rdt.transformers.utils import _GENERATORS

Expand Down Expand Up @@ -81,6 +82,7 @@ def _is_datetime_type(value):
bool(_get_datetime_format([value]))
or isinstance(value, pd.Timestamp)
or isinstance(value, datetime)
or (isinstance(value, str) and pd.notna(pd.to_datetime(value, errors='coerce')))
):
return False

Expand Down Expand Up @@ -439,3 +441,11 @@ def get_possible_chars(regex, num_subpatterns=None):
possible_chars += _get_chars_for_option(option, params)

return possible_chars


def _is_numerical(value):
"""Determine if the input is a numerical type or not."""
try:
return is_integer(value) or is_float(value)
except Exception:
return False
20 changes: 7 additions & 13 deletions sdv/constraints/tabular.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
import numpy as np
import pandas as pd

from sdv._utils import _convert_to_timedelta, _create_unique_name, _is_datetime_type
from sdv._utils import _convert_to_timedelta, _create_unique_name, _is_datetime_type, _is_numerical
from sdv.constraints.base import Constraint
from sdv.constraints.errors import (
AggregateConstraintsError,
Expand Down Expand Up @@ -604,7 +604,7 @@ def _validate_metadata_specific_to_constraint(metadata, **kwargs):
sdtype = metadata.columns.get(column_name, {}).get('sdtype')
value = kwargs.get('value')
if sdtype == 'numerical':
if not isinstance(value, (int, float)):
if not _is_numerical(value):
raise ConstraintMetadataError("'value' must be an int or float.")

elif sdtype == 'datetime':
Expand Down Expand Up @@ -632,7 +632,7 @@ def _validate_init_inputs(column_name, value, relation):
if relation not in ['>', '>=', '<', '<=']:
raise ValueError('`relation` must be one of the following: `>`, `>=`, `<`, `<=`')

if not (isinstance(value, (int, float)) or value_is_datetime):
if not (_is_numerical(value) or value_is_datetime):
raise ValueError('`value` must be a number or a string that represents a datetime.')

if value_is_datetime and not isinstance(value, str):
Expand Down Expand Up @@ -1071,9 +1071,7 @@ def _validate_init_inputs(low_value, high_value):
if values_are_datetimes and not values_are_strings:
raise ValueError('Datetime must be represented as a string.')

values_are_numerical = bool(
isinstance(low_value, (int, float)) and isinstance(high_value, (int, float))
)
values_are_numerical = bool(_is_numerical(low_value) and _is_numerical(high_value))
if not (values_are_numerical or values_are_datetimes):
raise ValueError(
'``low_value`` and ``high_value`` must be a number or a string that '
Expand All @@ -1092,7 +1090,7 @@ def _validate_metadata_specific_to_constraint(metadata, **kwargs):
high_value = kwargs.get('high_value')
low_value = kwargs.get('low_value')
if sdtype == 'numerical':
if not isinstance(high_value, (int, float)) or not isinstance(low_value, (int, float)):
if not _is_numerical(high_value) or not _is_numerical(low_value):
raise ConstraintMetadataError(
"Both 'high_value' and 'low_value' must be ints or floats"
)
Expand Down Expand Up @@ -1187,11 +1185,7 @@ def is_valid(self, table_data):
self._operator(data, self._high_value),
pd.isna(self._high_value),
)

return np.logical_or(
np.logical_and(satisfy_low_bound, satisfy_high_bound),
pd.isna(data),
)
return (satisfy_low_bound & satisfy_high_bound) | pd.isna(data)

def _transform(self, table_data):
"""Transform the table data.
Expand Down Expand Up @@ -1250,7 +1244,7 @@ def _reverse_transform(self, table_data):
table_data[self._column_name] = data.round().astype(self._dtype)

else:
table_data[self._column_name] = data.astype(self._dtype)
table_data[self._column_name] = data.astype(self._dtype, errors='ignore')

table_data = table_data.drop(self._transformed_column, axis=1)
return table_data
Expand Down
12 changes: 6 additions & 6 deletions sdv/metadata/multi_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,26 +126,26 @@ def _validate_relationship_sdtypes(
)

def _validate_circular_relationships(
self, parent, children=None, parents=None, child_map=None, errors=None
self, parent, children=None, visited=None, child_map=None, errors=None
):
"""Validate that there is no circular relationship in the metadata."""
parents = set() if parents is None else parents
visited = set() if visited is None else visited
if children is None:
children = child_map[parent]

if parent in children:
errors.append(parent)

for child in children:
if child in parents:
break
if child in visited:
continue

parents.add(child)
visited.add(child)
self._validate_circular_relationships(
parent,
children=child_map.get(child, set()),
child_map=child_map,
parents=parents,
visited=visited,
errors=errors,
)

Expand Down
15 changes: 13 additions & 2 deletions sdv/multi_table/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,9 +70,10 @@ def _set_temp_numpy_seed(self):
def _initialize_models(self):
with disable_single_table_logger():
for table_name, table_metadata in self.metadata.tables.items():
synthesizer_parameters = self._table_parameters.get(table_name, {})
synthesizer_parameters = {'locales': self.locales}
synthesizer_parameters.update(self._table_parameters.get(table_name, {}))
self._table_synthesizers[table_name] = self._synthesizer(
metadata=table_metadata, locales=self.locales, **synthesizer_parameters
metadata=table_metadata, **synthesizer_parameters
)
self._table_synthesizers[table_name]._data_processor.table_name = table_name

Expand Down Expand Up @@ -340,6 +341,10 @@ def _store_and_convert_original_cols(self, data):
data[table] = dataframe
return list_of_changed_tables

def _transform_helper(self, data):
"""Stub method for transforming data patterns."""
return data

def preprocess(self, data):
"""Transform the raw data to numerical space.
Expand All @@ -353,6 +358,7 @@ def preprocess(self, data):
"""
list_of_changed_tables = self._store_and_convert_original_cols(data)

data = self._transform_helper(data)
self.validate(data)
if self._fitted:
warnings.warn(
Expand Down Expand Up @@ -471,6 +477,10 @@ def reset_sampling(self):
def _sample(self, scale):
raise NotImplementedError()

def _reverse_transform_helper(self, sampled_data):
"""Stub method for reverse transforming data patterns."""
return sampled_data

def sample(self, scale=1.0):
"""Generate synthetic data for the entire dataset.
Expand All @@ -495,6 +505,7 @@ def sample(self, scale=1.0):

with self._set_temp_numpy_seed(), disable_single_table_logger():
sampled_data = self._sample(scale=scale)
sampled_data = self._reverse_transform_helper(sampled_data)

total_rows = 0
total_columns = 0
Expand Down
Loading

0 comments on commit 1192d1a

Please sign in to comment.