Implement Kolmogorov Smirnov Test in SQL-only #28

Merged
merged 32 commits into from
Jun 30, 2022
Changes from 25 commits
Commits
8d80f9f
first integration: ks-test in database functionality
YYYasin19 Jun 23, 2022
8c7f25a
integrate sql query with data refs
YYYasin19 Jun 23, 2022
ba802d9
formatting
YYYasin19 Jun 24, 2022
1127da0
formatting
YYYasin19 Jun 24, 2022
61e06b0
refactoring: call `test` directly to access sql-result of KS test
YYYasin19 Jun 24, 2022
b847613
fix row count retrieval
YYYasin19 Jun 24, 2022
1688550
fix acceptance level domain error
YYYasin19 Jun 24, 2022
7b8b7dc
fix alpha adjustment
YYYasin19 Jun 24, 2022
806e2dd
fix type hints for python<3.10
YYYasin19 Jun 24, 2022
e5349ec
update sql query for postgres: all tables need to have an alias assig…
YYYasin19 Jun 24, 2022
c9bf5cb
fix: typo
YYYasin19 Jun 24, 2022
6866429
update query for mssql server
YYYasin19 Jun 24, 2022
b11fcf0
add check for column names
YYYasin19 Jun 25, 2022
a3ff0a6
alternative way of getting table name, incl. hot fix for mssql quotat…
YYYasin19 Jun 25, 2022
38f7dd6
don't accept zero alphas since in practice they don't make much sense
YYYasin19 Jun 27, 2022
b5307c4
update variable naming and doc-strings
YYYasin19 Jun 27, 2022
4ecf804
update data retrieval
YYYasin19 Jun 28, 2022
ecfbd8f
include query nesting brackets
YYYasin19 Jun 28, 2022
989dc99
better formatting for understandability
YYYasin19 Jun 28, 2022
27d7604
better formatting for understandability
YYYasin19 Jun 28, 2022
f43e69e
update query for better readability with more WITH statements
YYYasin19 Jun 28, 2022
370514f
new option of passing values to the TestResult to compare these
YYYasin19 Jun 28, 2022
7fb7106
separate implementation testing from use case testing
YYYasin19 Jun 29, 2022
395b411
make independent of numpy
YYYasin19 Jun 29, 2022
c1e01ab
update tests: new distributions, no scipy and numpy dependency, rando…
YYYasin19 Jun 29, 2022
8c12e83
update comment
YYYasin19 Jun 29, 2022
b0631f9
optional accuracy through scipy
YYYasin19 Jun 29, 2022
158ed7b
refactoring, clean up and formatting
YYYasin19 Jun 29, 2022
26594f8
update comment and type hints
YYYasin19 Jun 29, 2022
6524fb8
update type hints for older python versions
YYYasin19 Jun 29, 2022
10f689c
fix type hint: Tuple instead of tuple
YYYasin19 Jun 29, 2022
023bfad
update changelog and include comment about scipy calculation
YYYasin19 Jun 30, 2022
114 changes: 83 additions & 31 deletions src/datajudge/constraints/stats.py
@@ -1,10 +1,12 @@
from typing import Any, Collection, Optional, Tuple
import math
import warnings
from typing import Any, Optional, Tuple

import sqlalchemy as sa

from .. import db_access
from ..db_access import DataReference
from .base import Constraint, OptionalSelections
from .base import Constraint, OptionalSelections, TestResult


class KolmogorovSmirnov2Sample(Constraint):
@@ -14,44 +16,94 @@ def __init__(
self.significance_level = significance_level
super().__init__(ref, ref2=ref2)

def retrieve(
self, engine: sa.engine.Engine, ref: DataReference
) -> Tuple[Any, OptionalSelections]:
sel = ref.get_selection(engine) # table selection incl. WHERE condition
col = ref.get_column(engine) # column name
return sel, col

@staticmethod
def calculate_2sample_ks_test(data: Collection, data2: Collection) -> float:
def approximate_p_value(
d: float, n_samples: int, m_samples: int
) -> Optional[float]:
"""
For two given lists of values calculates the Kolmogorov-Smirnov test.
Read more here: https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.kstest.html
Calculates the approximate p-value according to
'A procedure to find exact critical values of Kolmogorov-Smirnov Test', Silvia Facchinetti, 2009
"""
try:
from scipy.stats import ks_2samp
except ModuleNotFoundError:
raise ModuleNotFoundError(
"Calculating the Kolmogorov-Smirnov test relies on scipy."
"Therefore, please install scipy before using this test."

# approximation does not work for small sample sizes
samples = min(n_samples, m_samples)
if samples < 35:
warnings.warn(
"Approximating the p-value is not accurate enough for sample size < 35"
)
return None

# Currently, the calculation will be performed locally through scipy
# In future versions, an implementation where either the database engine
# (1) calculates the CDF
# or even (2) calculates the KS test
# can be expected
statistic, p_value = ks_2samp(data, data2)
d_alpha = d * math.sqrt(samples)
approx_p = 2 * math.exp(-(d_alpha**2))

return p_value
# clamp value to [0, 1]
return 1.0 if approx_p > 1.0 else 0.0 if approx_p < 0.0 else approx_p

def retrieve(
self, engine: sa.engine.Engine, ref: DataReference
) -> Tuple[Any, OptionalSelections]:
return db_access.get_column(engine, ref)
@staticmethod
def check_acceptance(
d_statistic: float, n_samples: int, m_samples: int, accepted_level: float
):
"""
For a given test statistic, d, and the respective sample sizes `n` and `m`, this function
checks whether the null hypothesis can be rejected for an accepted significance level.

For more information, check out the `Wikipedia entry <https://w.wiki/5May>`.
"""

def c(alpha: float):
return math.sqrt(-math.log(alpha / 2.0 + 1e-10) * 0.5)

def compare(
self, value_factual: Any, value_target: Any
) -> Tuple[bool, Optional[str]]:
return d_statistic <= c(accepted_level) * math.sqrt(
(n_samples + m_samples) / (n_samples * m_samples)
)

@staticmethod
def calculate_statistic(engine, table1, table2) -> Any:

# retrieve test statistic d, as well as sample sizes m and n
d_statistic, m, n = db_access.get_ks_2sample(
engine, table1=table1, table2=table2
)

# calculate approximate p-value
p_value = KolmogorovSmirnov2Sample.approximate_p_value(d_statistic, m, n)

return d_statistic, p_value, n, m

def test(self, engine: sa.engine.Engine) -> TestResult:

# get query selections and column names for target columns
selection1 = str(self.ref.data_source.get_clause(engine))
column1 = self.ref.get_column(engine)
selection2 = str(self.ref2.data_source.get_clause(engine))
column2 = self.ref2.get_column(engine)

d_statistic, p_value, n_samples, m_samples = self.calculate_statistic(
engine, (selection1, column1), (selection2, column2)
)

# calculate test acceptance
result = self.check_acceptance(
d_statistic, n_samples, m_samples, self.significance_level
)

p_value = self.calculate_2sample_ks_test(value_factual, value_target)
result = p_value >= self.significance_level
assertion_text = (
f"2-Sample Kolmogorov-Smirnov between {self.ref.get_string()} and {self.target_prefix}"
f"has p-value {p_value} < {self.significance_level}"
f"{self.condition_string}"
f"Null hypothesis (H0) for the 2-sample Kolmogorov-Smirnov test was rejected, i.e., "
f"the two samples ({self.ref.get_string()} and {self.target_prefix})"
f" do not originate from the same distribution."
)
if p_value:
assertion_text += f"\n p-value: {p_value}"

# store values s.t. they can be checked later
if not result:
return TestResult.failure(assertion_text)

return result, assertion_text
return TestResult.success()
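
For orientation, the acceptance check and the p-value approximation introduced above reduce to two closed-form expressions. The following minimal, self-contained Python sketch restates them outside the class; the statistic and sample sizes are made-up values for illustration only:

import math

def approx_p_value(d: float, n: int, m: int) -> float:
    # p ~ 2 * exp(-(d * sqrt(min(n, m)))**2), mirroring approximate_p_value above
    d_alpha = d * math.sqrt(min(n, m))
    p = 2 * math.exp(-(d_alpha**2))
    return min(max(p, 0.0), 1.0)  # clamp to [0, 1]

def accepted(d: float, n: int, m: int, alpha: float) -> bool:
    # H0 is retained if d <= c(alpha) * sqrt((n + m) / (n * m)), mirroring check_acceptance above
    c = math.sqrt(-math.log(alpha / 2.0 + 1e-10) * 0.5)
    return d <= c * math.sqrt((n + m) / (n * m))

print(approx_p_value(0.01, 10_000, 10_000))  # roughly 0.74
print(accepted(0.01, 10_000, 10_000, 0.05))  # True: d is below the critical distance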
82 changes: 82 additions & 0 deletions src/datajudge/db_access.py
@@ -902,3 +902,85 @@ def get_column_array_agg(
for t in result
]
return result, selections


def get_ks_2sample(engine: sa.engine.Engine, table1: tuple, table2: tuple):
"""
Runs the query for the two-sample Kolmogorov-Smirnov test and returns the test statistic d together with both sample sizes n and m.
"""
table1_selection, col1 = table1
table2_selection, col2 = table2

if is_mssql(engine):
table1_selection = str(table1_selection).replace(
'"', ""
) # tempdb.dbo.int_table
table2_selection = str(table2_selection).replace(
'"', ""
) # "tempdb.dbo".int_table

# for RawQueryDataSource this could be a whole subquery and will therefore need to be wrapped
if "SELECT" in table1_selection:
table1_selection = f"({table1_selection})"
table2_selection = f"({table2_selection})"

# for a more extensive explanation, see:
# https://github.com/Quantco/datajudge/pull/28#issuecomment-1165587929
ks_query_string = f"""
WITH
tab1 AS ( -- Step 0: Prepare data source and value column
SELECT {col1} as val FROM {table1_selection}
),
tab2 AS (
SELECT {col2} as val FROM {table2_selection}
),
tab1_cdf AS ( -- Step 1: Calculate the CDF over the value column
SELECT val, cume_dist() over (order by val) as cdf
FROM tab1
),
tab2_cdf AS (
SELECT val, cume_dist() over (order by val) as cdf
FROM tab2
),
tab1_grouped AS ( -- Step 2: Remove unnecessary values, s.t. we have (x, cdf(x)) rows only
SELECT val, MAX(cdf) as cdf
FROM tab1_cdf
GROUP BY val
),
tab2_grouped AS (
SELECT val, MAX(cdf) as cdf
FROM tab2_cdf
GROUP BY val
),
joined_cdf AS ( -- Step 3: combine the cdfs
SELECT coalesce(tab1_grouped.val, tab2_grouped.val) as v, tab1_grouped.cdf as cdf1, tab2_grouped.cdf as cdf2
FROM tab1_grouped FULL OUTER JOIN tab2_grouped ON tab1_grouped.val = tab2_grouped.val
),
-- Step 4: Create a grouper id based on the value count; this is just a helper for forward-filling
grouped_cdf AS (
SELECT v,
COUNT(cdf1) over (order by v) as _grp1,
cdf1,
COUNT(cdf2) over (order by v) as _grp2,
cdf2
FROM joined_cdf
),
-- Step 5: Forward-Filling: Select first non-null value per group (defined in the prev. step)
filled_cdf AS (
SELECT v,
first_value(cdf1) over (partition by _grp1 order by v) as cdf1_filled,
first_value(cdf2) over (partition by _grp2 order by v) as cdf2_filled
FROM grouped_cdf),
-- Step 6: Replace NULL values (at the beginning) with 0 to calculate difference
replaced_nulls AS (
SELECT coalesce(cdf1_filled, 0) as cdf1, coalesce(cdf2_filled, 0) as cdf2
FROM filled_cdf)
-- Step 7: Calculate final statistic as max. distance
SELECT MAX(ABS(cdf1 - cdf2)) FROM replaced_nulls;
"""

d_statistic = engine.execute(ks_query_string).scalar()
n = engine.execute(f"SELECT COUNT(*) FROM {table1_selection} as n_table").scalar()
m = engine.execute(f"SELECT COUNT(*) FROM {table2_selection} as m_table").scalar()

return d_statistic, n, m
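
As a plain-Python cross-check of the query above, the sketch below performs the same steps on two tiny, made-up samples: build each empirical CDF, outer-join them on the union of observed values, forward-fill the gaps starting from 0, and take the maximum absolute difference. It only illustrates the idea and is not code from this PR:

def ks_statistic(sample1, sample2):
    def cdf(sample):
        ordered = sorted(sample)
        # Steps 1-2: empirical CDF per distinct value; later duplicates overwrite
        # earlier ones, keeping the highest rank per value (MAX ... GROUP BY)
        return {v: (i + 1) / len(ordered) for i, v in enumerate(ordered)}

    cdf1, cdf2 = cdf(sample1), cdf(sample2)

    # Step 3: full outer join on the union of observed values
    values = sorted(set(cdf1) | set(cdf2))

    # Steps 4-6: forward-fill each CDF, starting at 0 before the first observation
    d = last1 = last2 = 0.0
    for v in values:
        last1 = cdf1.get(v, last1)
        last2 = cdf2.get(v, last2)
        # Step 7: running maximum of the absolute CDF difference
        d = max(d, abs(last1 - last2))
    return d

print(ks_statistic([1, 2, 3, 4], [2, 3, 5, 6]))  # 0.5 for these toy samples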
9 changes: 7 additions & 2 deletions src/datajudge/requirements.py
@@ -1268,9 +1268,14 @@ def add_ks_2sample_constraint(
The significance_level must be a value between 0.0 and 1.0.
"""

if significance_level < 0.0 or significance_level > 1.0:
if not column1 or not column2:
raise ValueError(
"The requested significance level has to be between 0.0 and 1.0. Default is 0.05."
"Column names have to be given for this test's functionality."
)

if significance_level <= 0.0 or significance_level > 1.0:
raise ValueError(
"The requested significance level has to be in `(0.0, 1.0]`. Default is 0.05."
)

ref = DataReference(self.data_source, [column1], condition=condition1)
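
From the user's perspective, the new constraint is added to a BetweenRequirement like any other constraint. A rough usage sketch follows, assuming the usual BetweenRequirement.from_tables entry point and hypothetical database, table, and column names:

from datajudge import BetweenRequirement

req = BetweenRequirement.from_tables(
    db_name1="tempdb", schema_name1="dbo", table_name1="table_a",
    db_name2="tempdb", schema_name2="dbo", table_name2="table_b",
)
# the constraint fails if H0 (both columns stem from the same distribution)
# is rejected at the 5% significance level
req.add_ks_2sample_constraint(
    column1="measurement", column2="measurement", significance_level=0.05
)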
32 changes: 32 additions & 0 deletions tests/integration/conftest.py
@@ -1,6 +1,7 @@
import datetime
import itertools
import os
import random
import urllib.parse

import pytest
@@ -661,6 +662,37 @@ def groupby_aggregation_table_incorrect(engine, metadata):
return TEST_DB_NAME, SCHEMA, table_name


@pytest.fixture(scope="module")
def random_normal_table(engine, metadata):
"""
Table containing 10_000 rows of normally distributed values per column, with means 0, 0.05, 0.2, and 1 and std. dev. 1.
"""
table_name = "random_normal_table"
columns = [
sa.Column("value_0_1", sa.Float()),
sa.Column("value_005_1", sa.Float()),
sa.Column("value_02_1", sa.Float()),
sa.Column("value_1_1", sa.Float()),
]
row_size = 10_000
random.seed(0)
rand1 = [random.gauss(0, 1) for _ in range(row_size)]
rand2 = [random.gauss(0.05, 1) for _ in range(row_size)]
rand3 = [random.gauss(0.2, 1) for _ in range(row_size)]
rand4 = [random.gauss(1, 1) for _ in range(row_size)]
data = [
{
"value_0_1": rand1[idx],
"value_005_1": rand2[idx],
"value_02_1": rand3[idx],
"value_1_1": rand4[idx],
}
for idx in range(row_size)
]
_handle_table(engine, metadata, table_name, columns, data)
return TEST_DB_NAME, SCHEMA, table_name


@pytest.fixture(scope="module")
def capitalization_table(engine, metadata):
table_name = "capitalization"