Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement Kolmogorov Smirnov Test in SQL-only #28

Merged
merged 32 commits into from
Jun 30, 2022
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
8d80f9f
first integration: ks-test in database functionality
YYYasin19 Jun 23, 2022
8c7f25a
integrate sql query with data refs
YYYasin19 Jun 23, 2022
ba802d9
formatting
YYYasin19 Jun 24, 2022
1127da0
formatting
YYYasin19 Jun 24, 2022
61e06b0
refactoring: call `test` directly to access sql-result of KS test
YYYasin19 Jun 24, 2022
b847613
fix row count retrieval
YYYasin19 Jun 24, 2022
1688550
fix acceptance level domain error
YYYasin19 Jun 24, 2022
7b8b7dc
fix alpha adjustment
YYYasin19 Jun 24, 2022
806e2dd
fix type hints for python<3.10
YYYasin19 Jun 24, 2022
e5349ec
update sql query for postgres: all tables need to have an alias assig…
YYYasin19 Jun 24, 2022
c9bf5cb
fix: typo
YYYasin19 Jun 24, 2022
6866429
update query for mssql server
YYYasin19 Jun 24, 2022
b11fcf0
add check for column names
YYYasin19 Jun 25, 2022
a3ff0a6
alternative way of getting table name, incl. hot fix for mssql quotat…
YYYasin19 Jun 25, 2022
38f7dd6
don't accept zero alphas since in practice they don't make much sense
YYYasin19 Jun 27, 2022
b5307c4
update variable naming and doc-strings
YYYasin19 Jun 27, 2022
4ecf804
update data retrieval
YYYasin19 Jun 28, 2022
ecfbd8f
include query nesting brackets
YYYasin19 Jun 28, 2022
989dc99
better formatting for understandability
YYYasin19 Jun 28, 2022
27d7604
better formatting for understandability
YYYasin19 Jun 28, 2022
f43e69e
update query for better readability with more WITH statements
YYYasin19 Jun 28, 2022
370514f
new option of passing values to the TestResult to compare these
YYYasin19 Jun 28, 2022
7fb7106
separate implementation testing from use case testing
YYYasin19 Jun 29, 2022
395b411
make independent of numpy
YYYasin19 Jun 29, 2022
c1e01ab
update tests: new distributions, no scipy and numpy dependency, rando…
YYYasin19 Jun 29, 2022
8c12e83
update comment
YYYasin19 Jun 29, 2022
b0631f9
optional accuracy through scipy
YYYasin19 Jun 29, 2022
158ed7b
refactoring, clean up and formatting
YYYasin19 Jun 29, 2022
26594f8
update comment and type hints
YYYasin19 Jun 29, 2022
6524fb8
update type hints for older python versions
YYYasin19 Jun 29, 2022
10f689c
fix type hint: Tuple instead of tuple
YYYasin19 Jun 29, 2022
023bfad
update changelog and include comment about scipy calculation
YYYasin19 Jun 30, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 16 additions & 21 deletions src/datajudge/constraints/stats.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
import math
import warnings
from typing import Any, Optional, Tuple
from typing import Any, Optional

import sqlalchemy as sa

from .. import db_access
from ..db_access import DataReference
from .base import Constraint, OptionalSelections, TestResult
from .base import Constraint, TestResult


class KolmogorovSmirnov2Sample(Constraint):
Expand All @@ -16,13 +16,6 @@ def __init__(
self.significance_level = significance_level
super().__init__(ref, ref2=ref2)

def retrieve(
self, engine: sa.engine.Engine, ref: DataReference
) -> Tuple[Any, OptionalSelections]:
sel = ref.get_selection(engine) # table selection incl. WHERE condition
col = ref.get_column(engine) # column name
return sel, col

@staticmethod
def approximate_p_value(
d: float, n_samples: int, m_samples: int
Expand Down Expand Up @@ -56,12 +49,12 @@ def approximate_p_value(
@staticmethod
def check_acceptance(
d_statistic: float, n_samples: int, m_samples: int, accepted_level: float
):
) -> bool:
"""
For a given test statistic, d, and the respective sample sizes `n` and `m`, this function
checks whether the null hypothesis can be rejected for an accepted significance level.

For more information, check out the `Wikipedia entry <https://w.wiki/5May>`.
For more information, check out the `Wikipedia entry <https://w.wiki/5May>`_.
"""

def c(alpha: float):
Expand All @@ -72,24 +65,26 @@ def c(alpha: float):
)

@staticmethod
def calculate_statistic(engine, table1, table2) -> Any:
def calculate_statistic(engine, table1_def: tuple, table2_def: tuple) -> Any:
YYYasin19 marked this conversation as resolved.
Show resolved Hide resolved

# retrieve test statistic d, as well as sample sizes m and n
d_statistic, m, n = db_access.get_ks_2sample(
engine, table1=table1, table2=table2
d_statistic, n_samples, m_samples = db_access.get_ks_2sample(
engine, table1=table1_def, table2=table2_def
)

# calculate approximate p-value
p_value = KolmogorovSmirnov2Sample.approximate_p_value(d_statistic, m, n)
p_value = KolmogorovSmirnov2Sample.approximate_p_value(
d_statistic, n_samples, m_samples
)

return d_statistic, p_value, n, m
return d_statistic, p_value, n_samples, m_samples

def test(self, engine: sa.engine.Engine) -> TestResult:

# get query selections and column names for target columns
selection1 = str(self.ref.data_source.get_clause(engine))
selection1 = self.ref.data_source.get_clause(engine)
column1 = self.ref.get_column(engine)
selection2 = str(self.ref2.data_source.get_clause(engine))
selection2 = self.ref2.data_source.get_clause(engine)
column2 = self.ref2.get_column(engine)

d_statistic, p_value, n_samples, m_samples = self.calculate_statistic(
Expand All @@ -105,11 +100,11 @@ def test(self, engine: sa.engine.Engine) -> TestResult:
f"Null hypothesis (H0) for the 2-sample Kolmogorov-Smirnov test was rejected, i.e., "
f"the two samples ({self.ref.get_string()} and {self.target_prefix})"
f" do not originate from the same distribution."
f"The test results are d={d_statistic}"
)
if p_value:
assertion_text += f"\n p-value: {p_value}"
if p_value is not None:
assertion_text += f"and {p_value=}"

# store values s.t. they can be checked later
if not result:
return TestResult.failure(assertion_text)

Expand Down
18 changes: 8 additions & 10 deletions src/datajudge/db_access.py
Original file line number Diff line number Diff line change
Expand Up @@ -908,16 +908,14 @@ def get_ks_2sample(engine: sa.engine.Engine, table1: tuple, table2: tuple):
"""
Runs the query for the two-sample Kolmogorov-Smirnov test and returns the test statistic d.
"""
table1_selection, col1 = table1
table2_selection, col2 = table2

if is_mssql(engine):
table1_selection = str(table1_selection).replace(
'"', ""
) # tempdb.dbo.int_table
table2_selection = str(table2_selection).replace(
'"', ""
) # "tempdb.dbo".int_table

# make sure we have a string representation here
table1_selection, col1 = str(table1[0]), str(table1[1])
table2_selection, col2 = str(table2[0]), str(table2[1])

if is_mssql(engine): # "tempdb.dbo".table_name -> tempdb.dbo.table_name
table1_selection = table1_selection.replace('"', "")
table2_selection = table2_selection.replace('"', "")

# for RawQueryDataSource this could be a whole subquery and will therefore need to be wrapped
if "SELECT" in table1_selection:
Expand Down
9 changes: 2 additions & 7 deletions tests/integration/test_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import pytest

import datajudge.requirements as requirements
from datajudge import db_access
from datajudge.constraints.stats import KolmogorovSmirnov2Sample
from datajudge.db_access import (
Condition,
Expand Down Expand Up @@ -1856,9 +1855,9 @@ def test_ks_2sample_implementation(engine, random_normal_table, configuration):
ref2 = DataReference(tds, columns=[col_2])

# retrieve table selections from data references
selection1 = str(ref.data_source.get_clause(engine))
selection1 = ref.data_source.get_clause(engine)
column1 = ref.get_column(engine)
selection2 = str(ref2.data_source.get_clause(engine))
selection2 = ref2.data_source.get_clause(engine)
column2 = ref2.get_column(engine)

(
Expand All @@ -1870,10 +1869,6 @@ def test_ks_2sample_implementation(engine, random_normal_table, configuration):
engine, (selection1, column1), (selection2, column2)
)

# compare with scipy implementation
data1, _ = db_access.get_column(engine, ref)
data2, _ = db_access.get_column(engine, ref2)

assert (
abs(d_statistic - expected_d) <= 1e-10
), f"The test statistic does not match: {expected_d} vs {d_statistic}"
Expand Down