From 94616fecc5f9279d54db6a397effc1dfd493e0e5 Mon Sep 17 00:00:00 2001 From: Sukrit Jindal <132339317+sukjingitsit@users.noreply.github.com> Date: Sun, 24 Mar 2024 00:24:37 +0530 Subject: [PATCH 1/9] Create chi_squared.py --- skpro/distributions/chi_squared.py | 122 +++++++++++++++++++++++++++++ 1 file changed, 122 insertions(+) create mode 100644 skpro/distributions/chi_squared.py diff --git a/skpro/distributions/chi_squared.py b/skpro/distributions/chi_squared.py new file mode 100644 index 000000000..f2c76e805 --- /dev/null +++ b/skpro/distributions/chi_squared.py @@ -0,0 +1,122 @@ +# copyright: skpro developers, BSD-3-Clause License (see LICENSE file) +"""Chi-Squared probability distribution.""" + +__author__ = ["sukjingitsit"] + +import numpy as np +import pandas as pd +from scipy.special import gamma, chdtr, gammainc, chdtriv + +from skpro.distributions.base import BaseDistribution + + +class ChiSquared(BaseDistribution): + """Chi-Squared distribution (skpro native). + + Parameters + ---------- + dof : float or array of float (1D or 2D) + degrees of freedom of the chi-squared distribution + index : pd.Index, optional, default = RangeIndex + columns : pd.Index, optional, default = RangeIndex + + Example + ------- + >>> from skpro.distributions.normal import ChiSquared + + >>> chi = ChiSquared(dof=[[1, 2], [3, 4], [5, 6]]) + """ + + _tags = { + "capabilities:exact": ["mean", "var", "pdf", "log_pdf", "cdf", "ppf"], + "distr:measuretype": "continuous", + } + + def __init__(self, dof, index=None, columns=None): + self.dof = dof + self.index = index + self.columns = columns + + # todo: untangle index handling + # and broadcast of parameters. + # move this functionality to the base class + self._dof = self._get_bc_params(self.dof)[0] + shape = self._dof.shape + if index is None: + index = pd.RangeIndex(shape[0]) + if columns is None: + columns = pd.RangeIndex(shape[1]) + super().__init__(index=index, columns=columns) + + + # Working on maths of energy, have (mostly) finished for when + # x is a pandas Dataframe, but the math for self-energy is getting + # complicated, might have to use approx for self-energy case + + def mean(self): + r"""Return expected value of the distribution. + + Let :math:`X` be a random variable with the distribution of `self`. + Returns the expectation :math:`\mathbb{E}[X]` + + Returns + ------- + pd.DataFrame with same rows, columns as `self` + expected value of distribution (entry-wise) + """ + mean_arr = self._dof + return pd.DataFrame(mean_arr, index=self.index, columns=self.columns) + + def var(self): + r"""Return element/entry-wise variance of the distribution. + + Let :math:`X` be a random variable with the distribution of `self`. + Returns :math:`\mathbb{V}[X] = \mathbb{E}\left(X - \mathbb{E}[X]\right)^2` + + Returns + ------- + pd.DataFrame with same rows, columns as `self` + variance of distribution (entry-wise) + """ + sd_arr = 2*self._dof + return pd.DataFrame(sd_arr, index=self.index, columns=self.columns) + + def pdf(self, x): + """Probability density function.""" + d = self.loc[x.index, x.columns] + pdf_arr = np.exp(-x/2)*np.power(x,(d.dof-2)/2) + pdf_arr = pdf_arr/(np.power(2,d.dof/2)*gamma(d.dof/2)) + return pd.DataFrame(pdf_arr, index=x.index, columns=x.columns) + + def log_pdf(self, x): + """Logarithmic probability density function.""" + d = self.loc[x.index, x.columns] + lpdf_arr = -x/2 + (d.dof-2)*np.log(x)/2 + lpdf_arr = lpdf_arr - (d.dof*np.log(2)/2 + np.log(gamma(d.dof/2))) + return pd.DataFrame(lpdf_arr, index=x.index, columns=x.columns) + + def cdf(self, x): + """Cumulative distribution function.""" + d = self.loc[x.index, x.columns] + # cdf_arr = chdtr(d.dof, x) + cdf_arr = gammainc(d.dof/2, x/2) + cdf_arr = cdf_arr/(np.power(2,d.dof/2)*gamma(d.dof/2)) + return pd.DataFrame(cdf_arr, index=x.index, columns=x.columns) + + def ppf(self, p): + """Quantile function = percent point function = inverse cdf.""" + # Working on maths of native ppf + d = self.loc[p.index, p.columns] + icdf_arr = chdtriv(d.dof, p) + return pd.DataFrame(icdf_arr, index=p.index, columns=p.columns) + + @classmethod + def get_test_params(cls, parameter_set="default"): + """Return testing parameter settings for the estimator.""" + params1 = {"dof":[[1, 2], [3, 4], [5, 6]]} + params2 = { + "dof": 10, + "index": pd.Index([1, 2, 5]), + "columns": pd.Index(["a", "b"]), + } + return [params1, params2] From 00d62b54e4f8fd55c42967ebf26b869d4dbb7535 Mon Sep 17 00:00:00 2001 From: Sukrit Jindal <132339317+sukjingitsit@users.noreply.github.com> Date: Sun, 24 Mar 2024 00:25:27 +0530 Subject: [PATCH 2/9] Update __init__.py --- skpro/distributions/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/skpro/distributions/__init__.py b/skpro/distributions/__init__.py index 7fda17848..512c74293 100644 --- a/skpro/distributions/__init__.py +++ b/skpro/distributions/__init__.py @@ -11,6 +11,7 @@ "QPD_S", "QPD_B", "QPD_U", + "ChiSquared" ] from skpro.distributions.empirical import Empirical @@ -19,3 +20,4 @@ from skpro.distributions.normal import Normal from skpro.distributions.qpd import QPD_B, QPD_S, QPD_U from skpro.distributions.t import TDistribution +from skpro.distributions.chi_squared import ChiSquared From 3eb867d3a3daa20c31218fb96b372d809df5c449 Mon Sep 17 00:00:00 2001 From: Sukrit Jindal <132339317+sukjingitsit@users.noreply.github.com> Date: Sun, 24 Mar 2024 03:19:16 +0530 Subject: [PATCH 3/9] Update chi_squared.py --- skpro/distributions/chi_squared.py | 42 ++++++++++++++++++------------ 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/skpro/distributions/chi_squared.py b/skpro/distributions/chi_squared.py index f2c76e805..24d4c05a7 100644 --- a/skpro/distributions/chi_squared.py +++ b/skpro/distributions/chi_squared.py @@ -5,7 +5,7 @@ import numpy as np import pandas as pd -from scipy.special import gamma, chdtr, gammainc, chdtriv +from scipy.special import chdtriv, gamma, gammainc from skpro.distributions.base import BaseDistribution @@ -19,11 +19,9 @@ class ChiSquared(BaseDistribution): degrees of freedom of the chi-squared distribution index : pd.Index, optional, default = RangeIndex columns : pd.Index, optional, default = RangeIndex - Example ------- >>> from skpro.distributions.normal import ChiSquared - >>> chi = ChiSquared(dof=[[1, 2], [3, 4], [5, 6]]) """ @@ -48,17 +46,28 @@ def __init__(self, dof, index=None, columns=None): columns = pd.RangeIndex(shape[1]) super().__init__(index=index, columns=columns) + r"""Energy implementation issues: + + The self-energy is mathematically difficult to calculate due to + their being no proper closed form. As discussed with fkiraly, + using E(d.energy(x)) is one possible way, but the question arises + on how to approximate the integral. The other alternative is to use + sampling to estimate the self-energy. + + The closed form version for cross-energy can be framed as follows: + Here, :math:`k=dof` + :math:`x <= 0, \operatorname{energy}(x) = k + \vert x \vert` + :math:`x > 0, \operatorname{energy}(x) = + x*(2*\operatorname{CDF}(k,x)-1)+k-2k*\operatorname{CDF}(k+1,x)` + where :math:`\operatorname{CDF}(k,x)` represents the CDF of x + for a chi-square distribution with k degrees of freedom. + """ - # Working on maths of energy, have (mostly) finished for when - # x is a pandas Dataframe, but the math for self-energy is getting - # complicated, might have to use approx for self-energy case - def mean(self): r"""Return expected value of the distribution. Let :math:`X` be a random variable with the distribution of `self`. Returns the expectation :math:`\mathbb{E}[X]` - Returns ------- pd.DataFrame with same rows, columns as `self` @@ -72,35 +81,34 @@ def var(self): Let :math:`X` be a random variable with the distribution of `self`. Returns :math:`\mathbb{V}[X] = \mathbb{E}\left(X - \mathbb{E}[X]\right)^2` - Returns ------- pd.DataFrame with same rows, columns as `self` variance of distribution (entry-wise) """ - sd_arr = 2*self._dof + sd_arr = 2 * self._dof return pd.DataFrame(sd_arr, index=self.index, columns=self.columns) def pdf(self, x): """Probability density function.""" d = self.loc[x.index, x.columns] - pdf_arr = np.exp(-x/2)*np.power(x,(d.dof-2)/2) - pdf_arr = pdf_arr/(np.power(2,d.dof/2)*gamma(d.dof/2)) + pdf_arr = np.exp(-x / 2) * np.power(x, (d.dof - 2) / 2) + pdf_arr = pdf_arr / (np.power(2, d.dof / 2) * gamma(d.dof / 2)) return pd.DataFrame(pdf_arr, index=x.index, columns=x.columns) def log_pdf(self, x): """Logarithmic probability density function.""" d = self.loc[x.index, x.columns] - lpdf_arr = -x/2 + (d.dof-2)*np.log(x)/2 - lpdf_arr = lpdf_arr - (d.dof*np.log(2)/2 + np.log(gamma(d.dof/2))) + lpdf_arr = -x / 2 + (d.dof - 2) * np.log(x) / 2 + lpdf_arr = lpdf_arr - (d.dof * np.log(2) / 2 + np.log(gamma(d.dof / 2))) return pd.DataFrame(lpdf_arr, index=x.index, columns=x.columns) def cdf(self, x): """Cumulative distribution function.""" d = self.loc[x.index, x.columns] # cdf_arr = chdtr(d.dof, x) - cdf_arr = gammainc(d.dof/2, x/2) - cdf_arr = cdf_arr/(np.power(2,d.dof/2)*gamma(d.dof/2)) + cdf_arr = gammainc(d.dof / 2, x / 2) + cdf_arr = cdf_arr / (np.power(2, d.dof / 2) * gamma(d.dof / 2)) return pd.DataFrame(cdf_arr, index=x.index, columns=x.columns) def ppf(self, p): @@ -113,7 +121,7 @@ def ppf(self, p): @classmethod def get_test_params(cls, parameter_set="default"): """Return testing parameter settings for the estimator.""" - params1 = {"dof":[[1, 2], [3, 4], [5, 6]]} + params1 = {"dof": [[1, 2], [3, 4], [5, 6]]} params2 = { "dof": 10, "index": pd.Index([1, 2, 5]), From 3aef1ca230d9162206750e4d880ecf31d98c3d5b Mon Sep 17 00:00:00 2001 From: Sukrit Jindal <132339317+sukjingitsit@users.noreply.github.com> Date: Sun, 24 Mar 2024 03:19:49 +0530 Subject: [PATCH 4/9] Update __init__.py --- skpro/distributions/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/skpro/distributions/__init__.py b/skpro/distributions/__init__.py index 512c74293..23e66faee 100644 --- a/skpro/distributions/__init__.py +++ b/skpro/distributions/__init__.py @@ -3,6 +3,7 @@ # adapted from sktime __all__ = [ + "ChiSquared", "Empirical", "Laplace", "Mixture", @@ -11,13 +12,12 @@ "QPD_S", "QPD_B", "QPD_U", - "ChiSquared" ] +from skpro.distributions.chi_squared import ChiSquared from skpro.distributions.empirical import Empirical from skpro.distributions.laplace import Laplace from skpro.distributions.mixture import Mixture from skpro.distributions.normal import Normal from skpro.distributions.qpd import QPD_B, QPD_S, QPD_U from skpro.distributions.t import TDistribution -from skpro.distributions.chi_squared import ChiSquared From 4af21d88ef97d9e40881815ddae26a91a0be8116 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20Kir=C3=A1ly?= Date: Thu, 25 Apr 2024 22:14:31 +0100 Subject: [PATCH 5/9] update interface --- skpro/distributions/chi_squared.py | 148 ++++++++++++++++++----------- 1 file changed, 91 insertions(+), 57 deletions(-) diff --git a/skpro/distributions/chi_squared.py b/skpro/distributions/chi_squared.py index 24d4c05a7..2bcd09a6e 100644 --- a/skpro/distributions/chi_squared.py +++ b/skpro/distributions/chi_squared.py @@ -19,6 +19,7 @@ class ChiSquared(BaseDistribution): degrees of freedom of the chi-squared distribution index : pd.Index, optional, default = RangeIndex columns : pd.Index, optional, default = RangeIndex + Example ------- >>> from skpro.distributions.normal import ChiSquared @@ -26,24 +27,18 @@ class ChiSquared(BaseDistribution): """ _tags = { + # packaging info + # -------------- + "authors": "sukjingitsit", + # estimator tags + # -------------- "capabilities:exact": ["mean", "var", "pdf", "log_pdf", "cdf", "ppf"], "distr:measuretype": "continuous", } def __init__(self, dof, index=None, columns=None): self.dof = dof - self.index = index - self.columns = columns - - # todo: untangle index handling - # and broadcast of parameters. - # move this functionality to the base class - self._dof = self._get_bc_params(self.dof)[0] - shape = self._dof.shape - if index is None: - index = pd.RangeIndex(shape[0]) - if columns is None: - columns = pd.RangeIndex(shape[1]) + super().__init__(index=index, columns=columns) r"""Energy implementation issues: @@ -63,68 +58,107 @@ def __init__(self, dof, index=None, columns=None): for a chi-square distribution with k degrees of freedom. """ - def mean(self): - r"""Return expected value of the distribution. + def _mean(self): + """Return expected value of the distribution. - Let :math:`X` be a random variable with the distribution of `self`. - Returns the expectation :math:`\mathbb{E}[X]` Returns ------- - pd.DataFrame with same rows, columns as `self` - expected value of distribution (entry-wise) + 2D np.ndarray, same shape as ``self`` + expected value of distribution (entry-wise) """ - mean_arr = self._dof - return pd.DataFrame(mean_arr, index=self.index, columns=self.columns) + return self._bc_params["dof"] - def var(self): + def _var(self): r"""Return element/entry-wise variance of the distribution. - Let :math:`X` be a random variable with the distribution of `self`. - Returns :math:`\mathbb{V}[X] = \mathbb{E}\left(X - \mathbb{E}[X]\right)^2` Returns ------- - pd.DataFrame with same rows, columns as `self` - variance of distribution (entry-wise) + 2D np.ndarray, same shape as ``self`` + variance of the distribution (entry-wise) + """ + return 2 * self._bc_params["dof"] + + def _pdf(self, x): + """Probability density function. + + Parameters + ---------- + x : 2D np.ndarray, same shape as ``self`` + values to evaluate the pdf at + + Returns + ------- + 2D np.ndarray, same shape as ``self`` + pdf values at the given points + """ + dof = self._bc_params["dof"] + pdf_arr = np.exp(-x / 2) * np.power(x, (dof - 2) / 2) + pdf_arr = pdf_arr / (np.power(2, dof / 2) * gamma(dof / 2)) + return pdf_arr + + def _log_pdf(self, x): + """Logarithmic probability density function. + + Parameters + ---------- + x : 2D np.ndarray, same shape as ``self`` + values to evaluate the pdf at + + Returns + ------- + 2D np.ndarray, same shape as ``self`` + log pdf values at the given points + """ + dof = self._bc_params["dof"] + lpdf_arr = -x / 2 + (dof - 2) * np.log(x) / 2 + lpdf_arr = lpdf_arr - (dof * np.log(2) / 2 + np.log(gamma(dof / 2))) + return lpdf_arr + + def _cdf(self, x): + """Cumulative distribution function. + + Parameters + ---------- + x : 2D np.ndarray, same shape as ``self`` + values to evaluate the cdf at + + Returns + ------- + 2D np.ndarray, same shape as ``self`` + cdf values at the given points + """ + dof = self._bc_params["dof"] + cdf_arr = gammainc(dof / 2, x / 2) + cdf_arr = cdf_arr / (np.power(2, dof / 2) * gamma(dof / 2)) + return cdf_arr + + def _ppf(self, p): + """Quantile function = percent point function = inverse cdf. + + Parameters + ---------- + p : 2D np.ndarray, same shape as ``self`` + values to evaluate the ppf at + + Returns + ------- + 2D np.ndarray, same shape as ``self`` + ppf values at the given points """ - sd_arr = 2 * self._dof - return pd.DataFrame(sd_arr, index=self.index, columns=self.columns) - - def pdf(self, x): - """Probability density function.""" - d = self.loc[x.index, x.columns] - pdf_arr = np.exp(-x / 2) * np.power(x, (d.dof - 2) / 2) - pdf_arr = pdf_arr / (np.power(2, d.dof / 2) * gamma(d.dof / 2)) - return pd.DataFrame(pdf_arr, index=x.index, columns=x.columns) - - def log_pdf(self, x): - """Logarithmic probability density function.""" - d = self.loc[x.index, x.columns] - lpdf_arr = -x / 2 + (d.dof - 2) * np.log(x) / 2 - lpdf_arr = lpdf_arr - (d.dof * np.log(2) / 2 + np.log(gamma(d.dof / 2))) - return pd.DataFrame(lpdf_arr, index=x.index, columns=x.columns) - - def cdf(self, x): - """Cumulative distribution function.""" - d = self.loc[x.index, x.columns] - # cdf_arr = chdtr(d.dof, x) - cdf_arr = gammainc(d.dof / 2, x / 2) - cdf_arr = cdf_arr / (np.power(2, d.dof / 2) * gamma(d.dof / 2)) - return pd.DataFrame(cdf_arr, index=x.index, columns=x.columns) - - def ppf(self, p): - """Quantile function = percent point function = inverse cdf.""" - # Working on maths of native ppf - d = self.loc[p.index, p.columns] - icdf_arr = chdtriv(d.dof, p) - return pd.DataFrame(icdf_arr, index=p.index, columns=p.columns) + dof = self._bc_params["dof"] + icdf_arr = chdtriv(dof, p) + return icdf_arr @classmethod def get_test_params(cls, parameter_set="default"): """Return testing parameter settings for the estimator.""" + # array case examples params1 = {"dof": [[1, 2], [3, 4], [5, 6]]} params2 = { "dof": 10, "index": pd.Index([1, 2, 5]), "columns": pd.Index(["a", "b"]), } - return [params1, params2] + # scalar case examples + params3 = {"dof": 3} + return [params1, params2, params3] From c59f6265f1d1be5da1eed53fa612c1359e37046b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20Kir=C3=A1ly?= Date: Thu, 25 Apr 2024 22:17:53 +0100 Subject: [PATCH 6/9] Update distributions.rst --- docs/source/api_reference/distributions.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/api_reference/distributions.rst b/docs/source/api_reference/distributions.rst index 557b8a46f..4e4b9b11b 100644 --- a/docs/source/api_reference/distributions.rst +++ b/docs/source/api_reference/distributions.rst @@ -35,6 +35,7 @@ Continuous support :toctree: auto_generated/ :template: class.rst + ChiSquared Fisk Laplace Logistic From 09319c48a0d7df444e34ff310e05af3c641510ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20Kir=C3=A1ly?= Date: Thu, 25 Apr 2024 23:13:50 +0100 Subject: [PATCH 7/9] Update chi_squared.py --- skpro/distributions/chi_squared.py | 1 + 1 file changed, 1 insertion(+) diff --git a/skpro/distributions/chi_squared.py b/skpro/distributions/chi_squared.py index 2bcd09a6e..dcfb42a22 100644 --- a/skpro/distributions/chi_squared.py +++ b/skpro/distributions/chi_squared.py @@ -34,6 +34,7 @@ class ChiSquared(BaseDistribution): # -------------- "capabilities:exact": ["mean", "var", "pdf", "log_pdf", "cdf", "ppf"], "distr:measuretype": "continuous", + "broadcast_init": "on", } def __init__(self, dof, index=None, columns=None): From a59d026f6bdbfedf865ef1f4a8f0fcc3a9ff8d90 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20Kir=C3=A1ly?= Date: Thu, 25 Apr 2024 23:30:32 +0100 Subject: [PATCH 8/9] use scipy directpy for wrong methods --- skpro/distributions/chi_squared.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/skpro/distributions/chi_squared.py b/skpro/distributions/chi_squared.py index dcfb42a22..f061cf00b 100644 --- a/skpro/distributions/chi_squared.py +++ b/skpro/distributions/chi_squared.py @@ -3,9 +3,8 @@ __author__ = ["sukjingitsit"] -import numpy as np import pandas as pd -from scipy.special import chdtriv, gamma, gammainc +from scipy.stats.distributions import chi2 from skpro.distributions.base import BaseDistribution @@ -93,8 +92,7 @@ def _pdf(self, x): pdf values at the given points """ dof = self._bc_params["dof"] - pdf_arr = np.exp(-x / 2) * np.power(x, (dof - 2) / 2) - pdf_arr = pdf_arr / (np.power(2, dof / 2) * gamma(dof / 2)) + pdf_arr = chi2.pdf(x, dof) return pdf_arr def _log_pdf(self, x): @@ -111,8 +109,7 @@ def _log_pdf(self, x): log pdf values at the given points """ dof = self._bc_params["dof"] - lpdf_arr = -x / 2 + (dof - 2) * np.log(x) / 2 - lpdf_arr = lpdf_arr - (dof * np.log(2) / 2 + np.log(gamma(dof / 2))) + lpdf_arr = chi2.logpdf(x, dof) return lpdf_arr def _cdf(self, x): @@ -129,8 +126,7 @@ def _cdf(self, x): cdf values at the given points """ dof = self._bc_params["dof"] - cdf_arr = gammainc(dof / 2, x / 2) - cdf_arr = cdf_arr / (np.power(2, dof / 2) * gamma(dof / 2)) + cdf_arr = chi2.cdf(x, dof) return cdf_arr def _ppf(self, p): @@ -147,7 +143,7 @@ def _ppf(self, p): ppf values at the given points """ dof = self._bc_params["dof"] - icdf_arr = chdtriv(dof, p) + icdf_arr = chi2.ppf(p, dof) return icdf_arr @classmethod From 149fb83786f7a7bb79217d870d72570c5a57cb16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20Kir=C3=A1ly?= Date: Thu, 25 Apr 2024 23:42:55 +0100 Subject: [PATCH 9/9] Update chi_squared.py --- skpro/distributions/chi_squared.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skpro/distributions/chi_squared.py b/skpro/distributions/chi_squared.py index f061cf00b..372a39f99 100644 --- a/skpro/distributions/chi_squared.py +++ b/skpro/distributions/chi_squared.py @@ -21,7 +21,7 @@ class ChiSquared(BaseDistribution): Example ------- - >>> from skpro.distributions.normal import ChiSquared + >>> from skpro.distributions.chi_squared import ChiSquared >>> chi = ChiSquared(dof=[[1, 2], [3, 4], [5, 6]]) """