Skip to content

Commit

Permalink
ENH: Diversity of categorical values (#159)
Browse files Browse the repository at this point in the history
* ENH: Diversity of categorical values

* example

* more tests

* fix tests

* coverage
  • Loading branch information
martinfleis authored Mar 30, 2020
1 parent bd9b40e commit b30b595
Show file tree
Hide file tree
Showing 3 changed files with 200 additions and 35 deletions.
1 change: 1 addition & 0 deletions docs/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,7 @@ diversity
Shannon
Simpson
Theil
Unique

utilities
---------
Expand Down
191 changes: 156 additions & 35 deletions momepy/diversity.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import scipy as sp
from tqdm import tqdm # progress bar

__all__ = ["Range", "Theil", "Simpson", "Gini", "Shannon"]
__all__ = ["Range", "Theil", "Simpson", "Gini", "Shannon", "Unique"]


class Range:
Expand Down Expand Up @@ -222,6 +222,10 @@ class Simpson:
return Gini-Simpson index instead of Simpson index (1 - λ)
inverse : bool (default False)
return Inverse Simpson index instead of Simpson index (1 / λ)
categorical : bool (default False)
treat values as categories (will not use binning)
categories : list-like (default None)
list of categories. If None values.unique() is used.
**classification_kwds : dict
Keyword arguments for classification scheme
For details see mapclassify documentation:
Expand Down Expand Up @@ -267,26 +271,34 @@ def __init__(
binning="HeadTailBreaks",
gini_simpson=False,
inverse=False,
categorical=False,
categories=None,
**classification_kwds
):
try:
import mapclassify.classifiers as classifiers
except ImportError:
raise ImportError("The 'mapclassify' package is required")

schemes = {}
for classifier in classifiers.CLASSIFIERS:
schemes[classifier.lower()] = getattr(classifiers, classifier)
binning = binning.lower()
if binning not in schemes:
raise ValueError(
"Invalid binning. Binning must be in the" " set: %r" % schemes.keys()
)
if not categorical:
try:
import mapclassify.classifiers as classifiers
except ImportError:
raise ImportError("The 'mapclassify' package is required")

schemes = {}
for classifier in classifiers.CLASSIFIERS:
schemes[classifier.lower()] = getattr(classifiers, classifier)
binning = binning.lower()
if binning not in schemes:
raise ValueError(
"Invalid binning. Binning must be in the"
" set: %r" % schemes.keys()
)

self.gdf = gdf
self.sw = spatial_weights
self.id = gdf[unique_id]
self.binning = binning
self.gini_simpson = gini_simpson
self.inverse = inverse
self.categorical = categorical
self.categories = categories
self.classification_kwds = classification_kwds

data = gdf.copy()
Expand All @@ -296,8 +308,14 @@ def __init__(
values = "mm_v"
self.values = data[values]

self.bins = schemes[binning](data[values], **classification_kwds).bins
if not categorical:
self.bins = schemes[binning](data[values], **classification_kwds).bins

data = data.set_index(unique_id)[values]

if not categories:
categories = data.unique()

results_list = []
for index in tqdm(data.index, total=data.shape[0]):
if index in spatial_weights.neighbors.keys():
Expand All @@ -308,8 +326,15 @@ def __init__(
neighbours = [index]
values_list = data.loc[neighbours]

sample_bins = classifiers.UserDefined(values_list, self.bins)
counts = dict(zip(self.bins, sample_bins.counts))
if categorical:
counts = values_list.value_counts().to_dict()
for c in categories:
if c not in counts.keys():
counts[c] = 0
else:
sample_bins = classifiers.UserDefined(values_list, self.bins)
counts = dict(zip(self.bins, sample_bins.counts))

results_list.append(self._simpson_di(counts))
else:
results_list.append(np.nan)
Expand Down Expand Up @@ -463,6 +488,10 @@ class Shannon:
JenksCaspallForced, JenksCaspallSampled, MaxPClassifier,
MaximumBreaks, NaturalBreaks, Quantiles, Percentiles, StdMean,
UserDefined
categorical : bool (default False)
treat values as categories (will not use binning)
categories : list-like (default None)
list of categories. If None values.unique() is used.
**classification_kwds : dict
Keyword arguments for classification scheme
For details see mapclassify documentation:
Expand Down Expand Up @@ -505,28 +534,32 @@ def __init__(
spatial_weights,
unique_id,
binning="HeadTailBreaks",
gini_simpson=False,
inverse=False,
categorical=False,
categories=None,
**classification_kwds
):
try:
import mapclassify.classifiers as classifiers
except ImportError:
raise ImportError("The 'mapclassify' package is required")

schemes = {}
for classifier in classifiers.CLASSIFIERS:
schemes[classifier.lower()] = getattr(classifiers, classifier)
binning = binning.lower()
if binning not in schemes:
raise ValueError(
"Invalid binning. Binning must be in the" " set: %r" % schemes.keys()
)
if not categorical:
try:
import mapclassify.classifiers as classifiers
except ImportError:
raise ImportError("The 'mapclassify' package is required")

schemes = {}
for classifier in classifiers.CLASSIFIERS:
schemes[classifier.lower()] = getattr(classifiers, classifier)
binning = binning.lower()
if binning not in schemes:
raise ValueError(
"Invalid binning. Binning must be in the"
" set: %r" % schemes.keys()
)

self.gdf = gdf
self.sw = spatial_weights
self.id = gdf[unique_id]
self.binning = binning
self.categorical = categorical
self.categories = categories
self.classification_kwds = classification_kwds

data = gdf.copy()
Expand All @@ -536,8 +569,14 @@ def __init__(
values = "mm_v"
self.values = data[values]

self.bins = schemes[binning](data[values], **classification_kwds).bins
if not categorical:
self.bins = schemes[binning](data[values], **classification_kwds).bins

data = data.set_index(unique_id)[values]

if not categories:
categories = data.unique()

results_list = []
for index in tqdm(data.index, total=data.shape[0]):
if index in spatial_weights.neighbors.keys():
Expand All @@ -548,8 +587,15 @@ def __init__(
neighbours = [index]
values_list = data.loc[neighbours]

sample_bins = classifiers.UserDefined(values_list, self.bins)
counts = dict(zip(self.bins, sample_bins.counts))
if categorical:
counts = values_list.value_counts().to_dict()
for c in categories:
if c not in counts.keys():
counts[c] = 0
else:
sample_bins = classifiers.UserDefined(values_list, self.bins)
counts = dict(zip(self.bins, sample_bins.counts))

results_list.append(self._shannon(counts))
else:
results_list.append(np.nan)
Expand All @@ -576,3 +622,78 @@ def p(n, N):
N = sum(data.values())

return -sum(p(n, N) for n in data.values() if n != 0)


class Unique:
    """
    Calculates the number of unique values within neighbours defined in
    ``spatial_weights``.

    Parameters
    ----------
    gdf : GeoDataFrame
        GeoDataFrame containing morphological tessellation
    values : str, list, np.array, pd.Series
        the name of the dataframe column, ``np.array``, or ``pd.Series``
        where is stored character value.
    spatial_weights : libpysal.weights
        spatial weights matrix
    unique_id : str
        name of the column with unique id used as ``spatial_weights`` index

    Attributes
    ----------
    series : Series
        Series containing resulting values
    gdf : GeoDataFrame
        original GeoDataFrame
    values : Series
        Series containing used values
    sw : libpysal.weights
        spatial weights matrix
    id : Series
        Series containing used unique ID

    Examples
    --------
    >>> sw = momepy.sw_high(k=3, gdf=tessellation_df, ids='uID')
    >>> tessellation_df['cluster_unique'] = mm.Unique(tessellation_df, 'cluster', sw, 'uID').series
    100%|██████████| 144/144 [00:00<00:00, 722.50it/s]
    """

    def __init__(self, gdf, values, spatial_weights, unique_id):
        self.gdf = gdf
        self.sw = spatial_weights
        self.id = gdf[unique_id]

        data = gdf.copy()
        if values is not None:
            if not isinstance(values, str):
                # values passed as an array-like; attach as a temporary column
                data["mm_v"] = values
                values = "mm_v"
            self.values = data[values]

        # index by unique_id so the ids in the weights matrix map directly to rows
        data = data.set_index(unique_id)[values]

        results_list = []
        for index in tqdm(data.index, total=data.shape[0]):
            if index in spatial_weights.neighbors:
                neighbours = spatial_weights.neighbors[index].copy()
                # a feature is counted as part of its own neighbourhood;
                # an island (no neighbours) is represented by itself alone
                if neighbours:
                    neighbours.append(index)
                else:
                    neighbours = [index]

                # nunique(dropna=False) == len(unique()): NaN counts as a value
                results_list.append(data.loc[neighbours].nunique(dropna=False))
            else:
                # id absent from the weights matrix (e.g. dropped feature)
                results_list.append(np.nan)

        self.series = pd.Series(results_list, index=gdf.index)
43 changes: 43 additions & 0 deletions tests/test_diversity.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ def setup_method(self):
self.df_buildings["height"] = np.linspace(10.0, 30.0, 144)
self.df_tessellation["area"] = mm.Area(self.df_tessellation).series
self.sw = sw_high(k=3, gdf=self.df_tessellation, ids="uID")
self.sw.neighbors[100] = []
self.sw_drop = sw_high(k=3, gdf=self.df_tessellation[2:], ids="uID")

def test_Range(self):
Expand Down Expand Up @@ -98,6 +99,22 @@ def test_Simpson(self):
).series
assert inv[0] == 1 / 0.385

self.df_tessellation["cat"] = list(range(8)) * 18
cat = mm.Simpson(
self.df_tessellation, "cat", self.sw, "uID", categorical=True
).series
assert cat[0] == pytest.approx(0.15)

cat2 = mm.Simpson(
self.df_tessellation,
"cat",
self.sw,
"uID",
categorical=True,
categories=range(15),
).series
assert cat2[0] == pytest.approx(0.15)

def test_Gini(self):
full_sw = mm.Gini(self.df_tessellation, "area", self.sw, "uID").series
assert full_sw[0] == approx(0.3945388)
Expand Down Expand Up @@ -137,3 +154,29 @@ def test_Shannon(self):
.series.isna()
.any()
)

self.df_tessellation["cat"] = list(range(8)) * 18
cat = mm.Shannon(
self.df_tessellation, "cat", self.sw, "uID", categorical=True
).series
assert cat[0] == pytest.approx(1.973)

cat2 = mm.Shannon(
self.df_tessellation,
"cat",
self.sw,
"uID",
categorical=True,
categories=range(15),
).series
assert cat2[0] == pytest.approx(1.973)

def test_Unique(self):
    # categorical column with 8 distinct labels repeated over all 144 cells
    self.df_tessellation["cat"] = list(range(8)) * 18

    # values passed as a column name
    by_name = mm.Unique(self.df_tessellation, "cat", self.sw, "uID").series
    assert by_name[0] == 8

    # values passed directly as an array-like
    by_array = mm.Unique(
        self.df_tessellation, list(range(8)) * 18, self.sw, "uID"
    ).series
    assert by_array[0] == 8

    # weights built on a subset yield NaN for ids missing from the matrix
    dropped = mm.Unique(self.df_tessellation, "cat", self.sw_drop, "uID").series
    assert dropped.isna().any()
    assert dropped[5] == 8

0 comments on commit b30b595

Please sign in to comment.