Skip to content

Commit

Permalink
[ENH] Lognormal probability distribution (#218)
Browse files Browse the repository at this point in the history
Towards #22 

#### What does this implement/fix? Explain your changes.
Lognormal probability distribution
  • Loading branch information
bhavikar04 authored Apr 18, 2024
1 parent e2862a5 commit 0f0248c
Show file tree
Hide file tree
Showing 2 changed files with 176 additions and 0 deletions.
2 changes: 2 additions & 0 deletions skpro/distributions/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
"Laplace",
"Mixture",
"Normal",
"LogNormal",
"Poisson",
"QPD_S",
"QPD_B",
Expand All @@ -17,6 +18,7 @@

from skpro.distributions.empirical import Empirical
from skpro.distributions.laplace import Laplace
from skpro.distributions.lognormal import LogNormal
from skpro.distributions.mixture import Mixture
from skpro.distributions.normal import Normal
from skpro.distributions.poisson import Poisson
Expand Down
174 changes: 174 additions & 0 deletions skpro/distributions/lognormal.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
# copyright: sktime developers, BSD-3-Clause License (see LICENSE file)
"""Log-Normal probability distribution."""

__author__ = ["bhavikar04", "fkiraly"]

import numpy as np
import pandas as pd
from scipy.special import erf, erfinv

from skpro.distributions.base import BaseDistribution


class LogNormal(BaseDistribution):
"""Log-Normal distribution (skpro native).
Parameters
----------
mean : float or array of float (1D or 2D)
mean of the logarithm of the distribution
sd : float or array of float (1D or 2D), must be positive
standard deviation the logarithm of the distribution
index : pd.Index, optional, default = RangeIndex
columns : pd.Index, optional, default = RangeIndex
Example
-------
>>> from skpro.distributions.lognormal import LogNormal
>>> n = LogNormal(mu=[[0, 1], [2, 3], [4, 5]], sigma=1)
"""

_tags = {
"authors": ["bhavikar04", "fkiraly"],
"capabilities:approx": ["pdflognorm"],
"capabilities:exact": ["mean", "var", "energy", "pdf", "log_pdf", "cdf", "ppf"],
"distr:measuretype": "continuous",
}

def __init__(self, mu, sigma, index=None, columns=None):
self.mu = mu
self.sigma = sigma
self.index = index
self.columns = columns

# todo: untangle index handling
# and broadcast of parameters.
# move this functionality to the base class
self._mu, self._sigma = self._get_bc_params()
shape = self._mu.shape

if index is None:
index = pd.RangeIndex(shape[0])

if columns is None:
columns = pd.RangeIndex(shape[1])

super().__init__(index=index, columns=columns)

def _get_bc_params(self):
"""Fully broadcast parameters of self, given param shapes and index, columns."""
to_broadcast = [self.mu, self.sigma]
if hasattr(self, "index") and self.index is not None:
to_broadcast += [self.index.to_numpy().reshape(-1, 1)]
if hasattr(self, "columns") and self.columns is not None:
to_broadcast += [self.columns.to_numpy()]
bc = np.broadcast_arrays(*to_broadcast)
return bc[0], bc[1]

# commented out, seems incorrect
# def energy(self, x=None):
# r"""Energy of self, w.r.t. self or a constant frame x.

# Let :math:`X, Y` be i.i.d. random variables with the distribution of `self`.

# If `x` is `None`, returns :math:`\mathbb{E}[|X-Y|]` (per row), "self-energy".
# If `x` is passed, returns :math:`\mathbb{E}[|X-x|]-0.5\mathbb{E}[|X-Y|]`
# (per row), "CRPS wrt x".

# Parameters
# ----------
# x : None or pd.DataFrame, optional, default=None
# if pd.DataFrame, must have same rows and columns as `self`

# Returns
# -------
# pd.DataFrame with same rows as `self`, single column `"energy"`
# each row contains one float, self-energy/energy as described above.
# """
# if x is None:
# return super().energy(x)
# # explicit formula for CRPS of log-normal cross-term
# # obtained by bhavikar04 via wolfram alpha
# else:
# d = self.loc[x.index, x.columns]
# mu_arr, sd_arr = d._mu, d._sigma
# c_arr = x * (2 * self.cdf(x) - 1)
# c_arr2 = -2 * np.exp((mu_arr + sd_arr**2) / 2)
# c_arr3 = self.cdf((np.log(x) - mu_arr - sd_arr**2) / sd_arr)
# c_arr3 = c_arr3 + self.cdf(sd_arr / mu_arr**0.5) - 1
# c_arr2 = c_arr2 * c_arr3
# c_arr = c_arr + c_arr2

# energy_arr = np.sum(c_arr, axis=1)
# energy = pd.DataFrame(energy_arr, index=self.index, columns=["energy"])
# return energy

def mean(self):
r"""Return expected value of the distribution.
Let :math:`X` be a random variable with the distribution of `self`.
Returns the expectation :math:`\mathbb{E}[X]`
Returns
-------
pd.DataFrame with same rows, columns as `self`
expected value of distribution (entry-wise)
"""
mean_arr = np.exp(self._mu + self._sigma**2 / 2)
return pd.DataFrame(mean_arr, index=self.index, columns=self.columns)

def var(self):
r"""Return element/entry-wise variance of the distribution.
Let :math:`X` be a random variable with the distribution of `self`.
Returns :math:`\mathbb{V}[X] = \mathbb{E}\left(X - \mathbb{E}[X]\right)^2`
Returns
-------
pd.DataFrame with same rows, columns as `self`
variance of distribution (entry-wise)
"""
mu = self._mu
sigma = self._sigma
sd_arr = np.exp(2 * mu + 2 * sigma**2) - np.exp(2 * mu + sigma**2)
return pd.DataFrame(sd_arr, index=self.index, columns=self.columns) ** 2

def pdf(self, x):
"""Probability density function."""
d = self.loc[x.index, x.columns]
pdf_arr = np.exp(-0.5 * ((np.log(x.values) - d.mu) / d.sigma) ** 2)
pdf_arr = pdf_arr / (x.values * d.sigma * np.sqrt(2 * np.pi))
return pd.DataFrame(pdf_arr, index=x.index, columns=x.columns)

def log_pdf(self, x):
"""Logarithmic probability density function."""
d = self.loc[x.index, x.columns]
lpdf_arr = -0.5 * ((np.log(x.values) - d.mu) / d.sigma) ** 2
lpdf_arr = lpdf_arr - np.log(x.values * d.sigma * np.sqrt(2 * np.pi))
return pd.DataFrame(lpdf_arr, index=x.index, columns=x.columns)

def cdf(self, x):
"""Cumulative distribution function."""
d = self.loc[x.index, x.columns]
cdf_arr = 0.5 + 0.5 * erf((np.log(x.values) - d.mu) / (d.sigma * np.sqrt(2)))
return pd.DataFrame(cdf_arr, index=x.index, columns=x.columns)

def ppf(self, p):
"""Quantile function = percent point function = inverse cdf."""
d = self.loc[p.index, p.columns]
icdf_arr = d.mu + d.sigma * np.sqrt(2) * erfinv(2 * p.values - 1)
icdf_arr = np.exp(self._sigma * icdf_arr)
return pd.DataFrame(icdf_arr, index=p.index, columns=p.columns)

@classmethod
def get_test_params(cls, parameter_set="default"):
"""Return testing parameter settings for the estimator."""
params1 = {"mu": [[0, 1], [2, 3], [4, 5]], "sigma": 1}
params2 = {
"mu": 0,
"sigma": 1,
"index": pd.Index([1, 2, 5]),
"columns": pd.Index(["a", "b"]),
}
return [params1, params2]

0 comments on commit 0f0248c

Please sign in to comment.