Skip to content

Commit

Permalink
ENH: Diversity of categorical values (#159)
Browse files Browse the repository at this point in the history
* ENH: Diversity of categorical values

* example

* more tests

* fix tests

* coverage
  • Loading branch information
martinfleis authored Mar 30, 2020
1 parent bd9b40e commit b30b595
Show file tree
Hide file tree
Showing 3 changed files with 200 additions and 35 deletions.
1 change: 1 addition & 0 deletions docs/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,7 @@ diversity
Shannon
Simpson
Theil
Unique

utilities
---------
Expand Down
191 changes: 156 additions & 35 deletions momepy/diversity.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import scipy as sp
from tqdm import tqdm # progress bar

__all__ = ["Range", "Theil", "Simpson", "Gini", "Shannon"]
__all__ = ["Range", "Theil", "Simpson", "Gini", "Shannon", "Unique"]


class Range:
Expand Down Expand Up @@ -222,6 +222,10 @@ class Simpson:
return Gini-Simpson index instead of Simpson index (1 - λ)
inverse : bool (default False)
return Inverse Simpson index instead of Simpson index (1 / λ)
categorical : bool (default False)
treat values as categories (will not use binning)
categories : list-like (default None)
list of categories. If None values.unique() is used.
**classification_kwds : dict
Keyword arguments for classification scheme
For details see mapclassify documentation:
Expand Down Expand Up @@ -267,26 +271,34 @@ def __init__(
binning="HeadTailBreaks",
gini_simpson=False,
inverse=False,
categorical=False,
categories=None,
**classification_kwds
):
try:
import mapclassify.classifiers as classifiers
except ImportError:
raise ImportError("The 'mapclassify' package is required")

schemes = {}
for classifier in classifiers.CLASSIFIERS:
schemes[classifier.lower()] = getattr(classifiers, classifier)
binning = binning.lower()
if binning not in schemes:
raise ValueError(
"Invalid binning. Binning must be in the" " set: %r" % schemes.keys()
)
if not categorical:
try:
import mapclassify.classifiers as classifiers
except ImportError:
raise ImportError("The 'mapclassify' package is required")

schemes = {}
for classifier in classifiers.CLASSIFIERS:
schemes[classifier.lower()] = getattr(classifiers, classifier)
binning = binning.lower()
if binning not in schemes:
raise ValueError(
"Invalid binning. Binning must be in the"
" set: %r" % schemes.keys()
)

self.gdf = gdf
self.sw = spatial_weights
self.id = gdf[unique_id]
self.binning = binning
self.gini_simpson = gini_simpson
self.inverse = inverse
self.categorical = categorical
self.categories = categories
self.classification_kwds = classification_kwds

data = gdf.copy()
Expand All @@ -296,8 +308,14 @@ def __init__(
values = "mm_v"
self.values = data[values]

self.bins = schemes[binning](data[values], **classification_kwds).bins
if not categorical:
self.bins = schemes[binning](data[values], **classification_kwds).bins

data = data.set_index(unique_id)[values]

if not categories:
categories = data.unique()

results_list = []
for index in tqdm(data.index, total=data.shape[0]):
if index in spatial_weights.neighbors.keys():
Expand All @@ -308,8 +326,15 @@ def __init__(
neighbours = [index]
values_list = data.loc[neighbours]

sample_bins = classifiers.UserDefined(values_list, self.bins)
counts = dict(zip(self.bins, sample_bins.counts))
if categorical:
counts = values_list.value_counts().to_dict()
for c in categories:
if c not in counts.keys():
counts[c] = 0
else:
sample_bins = classifiers.UserDefined(values_list, self.bins)
counts = dict(zip(self.bins, sample_bins.counts))

results_list.append(self._simpson_di(counts))
else:
results_list.append(np.nan)
Expand Down Expand Up @@ -463,6 +488,10 @@ class Shannon:
JenksCaspallForced, JenksCaspallSampled, MaxPClassifier,
MaximumBreaks, NaturalBreaks, Quantiles, Percentiles, StdMean,
UserDefined
categorical : bool (default False)
treat values as categories (will not use binning)
categories : list-like (default None)
list of categories. If None values.unique() is used.
**classification_kwds : dict
Keyword arguments for classification scheme
For details see mapclassify documentation:
Expand Down Expand Up @@ -505,28 +534,32 @@ def __init__(
spatial_weights,
unique_id,
binning="HeadTailBreaks",
gini_simpson=False,
inverse=False,
categorical=False,
categories=None,
**classification_kwds
):
try:
import mapclassify.classifiers as classifiers
except ImportError:
raise ImportError("The 'mapclassify' package is required")

schemes = {}
for classifier in classifiers.CLASSIFIERS:
schemes[classifier.lower()] = getattr(classifiers, classifier)
binning = binning.lower()
if binning not in schemes:
raise ValueError(
"Invalid binning. Binning must be in the" " set: %r" % schemes.keys()
)
if not categorical:
try:
import mapclassify.classifiers as classifiers
except ImportError:
raise ImportError("The 'mapclassify' package is required")

schemes = {}
for classifier in classifiers.CLASSIFIERS:
schemes[classifier.lower()] = getattr(classifiers, classifier)
binning = binning.lower()
if binning not in schemes:
raise ValueError(
"Invalid binning. Binning must be in the"
" set: %r" % schemes.keys()
)

self.gdf = gdf
self.sw = spatial_weights
self.id = gdf[unique_id]
self.binning = binning
self.categorical = categorical
self.categories = categories
self.classification_kwds = classification_kwds

data = gdf.copy()
Expand All @@ -536,8 +569,14 @@ def __init__(
values = "mm_v"
self.values = data[values]

self.bins = schemes[binning](data[values], **classification_kwds).bins
if not categorical:
self.bins = schemes[binning](data[values], **classification_kwds).bins

data = data.set_index(unique_id)[values]

if not categories:
categories = data.unique()

results_list = []
for index in tqdm(data.index, total=data.shape[0]):
if index in spatial_weights.neighbors.keys():
Expand All @@ -548,8 +587,15 @@ def __init__(
neighbours = [index]
values_list = data.loc[neighbours]

sample_bins = classifiers.UserDefined(values_list, self.bins)
counts = dict(zip(self.bins, sample_bins.counts))
if categorical:
counts = values_list.value_counts().to_dict()
for c in categories:
if c not in counts.keys():
counts[c] = 0
else:
sample_bins = classifiers.UserDefined(values_list, self.bins)
counts = dict(zip(self.bins, sample_bins.counts))

results_list.append(self._shannon(counts))
else:
results_list.append(np.nan)
Expand All @@ -576,3 +622,78 @@ def p(n, N):
N = sum(data.values())

return -sum(p(n, N) for n in data.values() if n != 0)


class Unique:
    """
    Calculates the number of unique values within neighbours defined in
    ``spatial_weights``.

    Parameters
    ----------
    gdf : GeoDataFrame
        GeoDataFrame containing morphological tessellation
    values : str, list, np.array, pd.Series
        the name of the dataframe column, ``np.array``, or ``pd.Series``
        where is stored character value.
    spatial_weights : libpysal.weights
        spatial weights matrix
    unique_id : str
        name of the column with unique id used as ``spatial_weights`` index

    Attributes
    ----------
    series : Series
        Series containing resulting values
    gdf : GeoDataFrame
        original GeoDataFrame
    values : Series
        Series containing used values
    sw : libpysal.weights
        spatial weights matrix
    id : Series
        Series containing used unique ID

    Examples
    --------
    >>> sw = momepy.sw_high(k=3, gdf=tessellation_df, ids='uID')
    >>> tessellation_df['cluster_unique'] = mm.Unique(tessellation_df, 'cluster', sw, 'uID').series
    100%|██████████| 144/144 [00:00<00:00, 722.50it/s]
    """

    def __init__(self, gdf, values, spatial_weights, unique_id):
        self.gdf = gdf
        self.sw = spatial_weights
        self.id = gdf[unique_id]

        data = gdf.copy()
        if values is not None:
            if not isinstance(values, str):
                # values passed as an array-like; attach as a temporary column
                data["mm_v"] = values
                values = "mm_v"
            self.values = data[values]

        # index by unique_id so the ids in the weights matrix map directly to rows
        data = data.set_index(unique_id)[values]

        results_list = []
        for index in tqdm(data.index, total=data.shape[0]):
            if index in spatial_weights.neighbors:
                neighbours = spatial_weights.neighbors[index].copy()
                # a feature is counted as part of its own neighbourhood;
                # an island (no neighbours) is represented by itself alone
                if neighbours:
                    neighbours.append(index)
                else:
                    neighbours = [index]

                # nunique(dropna=False) == len(unique()): NaN counts as a value
                results_list.append(data.loc[neighbours].nunique(dropna=False))
            else:
                # id absent from the weights matrix (e.g. dropped feature)
                results_list.append(np.nan)

        self.series = pd.Series(results_list, index=gdf.index)
43 changes: 43 additions & 0 deletions tests/test_diversity.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ def setup_method(self):
self.df_buildings["height"] = np.linspace(10.0, 30.0, 144)
self.df_tessellation["area"] = mm.Area(self.df_tessellation).series
self.sw = sw_high(k=3, gdf=self.df_tessellation, ids="uID")
self.sw.neighbors[100] = []
self.sw_drop = sw_high(k=3, gdf=self.df_tessellation[2:], ids="uID")

def test_Range(self):
Expand Down Expand Up @@ -98,6 +99,22 @@ def test_Simpson(self):
).series
assert inv[0] == 1 / 0.385

self.df_tessellation["cat"] = list(range(8)) * 18
cat = mm.Simpson(
self.df_tessellation, "cat", self.sw, "uID", categorical=True
).series
assert cat[0] == pytest.approx(0.15)

cat2 = mm.Simpson(
self.df_tessellation,
"cat",
self.sw,
"uID",
categorical=True,
categories=range(15),
).series
assert cat2[0] == pytest.approx(0.15)

def test_Gini(self):
full_sw = mm.Gini(self.df_tessellation, "area", self.sw, "uID").series
assert full_sw[0] == approx(0.3945388)
Expand Down Expand Up @@ -137,3 +154,29 @@ def test_Shannon(self):
.series.isna()
.any()
)

self.df_tessellation["cat"] = list(range(8)) * 18
cat = mm.Shannon(
self.df_tessellation, "cat", self.sw, "uID", categorical=True
).series
assert cat[0] == pytest.approx(1.973)

cat2 = mm.Shannon(
self.df_tessellation,
"cat",
self.sw,
"uID",
categorical=True,
categories=range(15),
).series
assert cat2[0] == pytest.approx(1.973)

def test_Unique(self):
    # categorical column with 8 distinct labels repeated over all 144 cells
    self.df_tessellation["cat"] = list(range(8)) * 18

    # values passed as a column name
    by_name = mm.Unique(self.df_tessellation, "cat", self.sw, "uID").series
    assert by_name[0] == 8

    # values passed directly as an array-like
    by_array = mm.Unique(
        self.df_tessellation, list(range(8)) * 18, self.sw, "uID"
    ).series
    assert by_array[0] == 8

    # weights built on a subset yield NaN for ids missing from the matrix
    dropped = mm.Unique(self.df_tessellation, "cat", self.sw_drop, "uID").series
    assert dropped.isna().any()
    assert dropped[5] == 8

0 comments on commit b30b595

Please sign in to comment.