From 2743aaa12939ef878bb9ee45bd46b4619a17dc3c Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Thu, 7 Feb 2019 10:02:23 -0600 Subject: [PATCH 1/3] make the JaggedArray.zip function no longer broken --- awkward/array/jagged.py | 119 +++++++++++++++++++++++++++++++++++----- awkward/version.py | 2 +- 2 files changed, 105 insertions(+), 16 deletions(-) diff --git a/awkward/array/jagged.py b/awkward/array/jagged.py index d23b3e34..ca26f31c 100644 --- a/awkward/array/jagged.py +++ b/awkward/array/jagged.py @@ -32,6 +32,10 @@ import numbers import os from collections import OrderedDict +try: + from collections.abc import Iterable +except ImportError: + from collections import Iterable import awkward.array.base import awkward.persist @@ -1350,29 +1354,114 @@ def concatenate(isclassmethod, cls_or_self, arrays): def stack(cls, first, *rest): # each item in first followed by second, etc. raise NotImplementedError - @classmethod - def zip(cls, columns1={}, *columns2, **columns3): - # FIXME for 1.0: make an @awkward.util.bothmethod like concatenate - table = self.Table(columns1, *columns2, **columns3) - inputs = list(table._content.values()) + @awkward.util.bothmethod + def zip(isclassmethod, cls_or_self, columns1={}, *columns2, **columns3): + if isclassmethod: + cls = cls_or_self + else: + self = cls_or_self + cls = self.__class__ + if not (isinstance(columns1, dict) and len(columns1) == 0): + columns2 = (columns1,) + columns2 + columns1 = self first = None - for i in range(len(inputs)): - if isinstance(inputs[i], JaggedArray): + def ready(x): + starts, stops = x._starts.reshape(-1), x._stops.reshape(-1) + if (x._canuseoffset() and len(starts) != 0 and starts[0] == 0) or (len(starts) != 0 and len(stops) != 0 and len(stops) >= len(starts) and starts[0] == 0 and stops[len(starts) - 1] >= starts[len(starts) - 1] and (starts[1:] == stops[:-1]).all()): + return x + else: + offsets = x.counts2offsets(stops - starts) + starts, stops = offsets[:-1], offsets[1:] + starts = starts.reshape((-1,) + x._starts.shape[1:]) + stops = stops.reshape((-1,) + x._stops.shape[1:]) + return x._tojagged(starts, stops, copy=False) + + if isinstance(columns1, JaggedArray): + columns1 = first = ready(columns1) + + if isinstance(columns1, dict): + for n in columns1: + x = columns1[n] + if isinstance(x, JaggedArray): + if first is None: + columns1[n] = first = ready(x) + else: + columns1[n] = x._tojagged(first._starts, first._stops, copy=False) + + columns2 = list(columns2) + for i in range(len(columns2)): + x = columns2[i] + if isinstance(x, JaggedArray): + if first is None: + columns2[i] = first = ready(x) + else: + columns2[i] = x._tojagged(first._starts, first._stops, copy=False) + + for n in columns3: + x = columns3[n] + if isinstance(x, JaggedArray): if first is None: - first = inputs[i] = inputs[i]._tojagged(copy=False) + columns3[n] = first = ready(x) else: - inputs[i] = inputs[i]._tojagged(first._starts, first._stops, copy=False) + columns3[n] = x._tojagged(first._starts, first._stops, copy=False) if first is None: - return table + raise TypeError("at least one argument in JaggedArray.zip must be a JaggedArray") - for i in range(len(inputs)): - if not isinstance(inputs[i], JaggedArray): - inputs[i] = first._broadcast(inputs[i]) + if isclassmethod: + numpy = cls.numpy + else: + numpy = first.numpy + + if isinstance(columns1, JaggedArray): + columns1 = columns1._content + elif isinstance(columns1, dict): + for n in columns1: + x = columns1[n] + if isinstance(x, JaggedArray): + columns1[n] = x._content + elif isinstance(x, Iterable): + columns1[n] = first._broadcast(x)._content + elif isinstance(x, (numbers.Number, numpy.number, numpy.bool, numpy.bool_)): + columns1[n] = JaggedArray(first._starts, first._stops, numpy.full(first._stops.max(), columns1, dtype=type(columns1)))._content + else: + raise TypeError("unrecognized type for JaggedArray.zip: {0}".format(type(x))) + elif isinstance(columns1, Iterable): + columns1 = first._broadcast(columns1)._content + elif isinstance(columns1, (numbers.Number, numpy.number, numpy.bool, numpy.bool_)): + columns1 = JaggedArray(first._starts, first._stops, numpy.full(first._stops.max(), columns1, dtype=type(columns1)))._content + else: + raise TypeError("unrecognized type for JaggedArray.zip: {0}".format(type(columns1))) + + for i in range(len(columns2)): + x = columns2[i] + if isinstance(x, JaggedArray): + columns2[i] = x._content + elif not isinstance(x, dict) and isinstance(x, Iterable): + columns2[i] = first._broadcast(x)._content + elif isinstance(x, (numbers.Number, numpy.number, numpy.bool, numpy.bool_)): + columns2[i] = JaggedArray(first._starts, first._stops, numpy.full(first._stops.max(), x, dtype=type(x)))._content + else: + raise TypeError("unrecognized type for JaggedArray.zip: {0}".format(type(x))) + + for n in columns3: + x = columns3[n] + if isinstance(x, JaggedArray): + columns3[n] = x._content + elif not isinstance(x, dict) and isinstance(x, Iterable): + columns3[n] = first._broadcast(x)._content + elif isinstance(x, (numbers.Number, numpy.number, numpy.bool, numpy.bool_)): + columns3[n] = JaggedArray(first._starts, first._stops, numpy.full(first._stops.max(), x, dtype=type(x)))._content + else: + raise TypeError("unrecognized type for JaggedArray.zip: {0}".format(type(x))) - newtable = self.Table(OrderedDict(zip(table._content, [x._content for x in inputs]))) - return cls(first._starts, first._stops, newtable) + if isclassmethod: + table = cls.Table.fget(None)(columns1, *columns2, **columns3) + return cls.JaggedArray.fget(None)(first._starts, first._stops, table) + else: + table = first.Table(columns1, *columns2, **columns3) + return first.JaggedArray(first._starts, first._stops, table) def pandas(self): import pandas diff --git a/awkward/version.py b/awkward/version.py index 6ec4af9c..a05eacfb 100644 --- a/awkward/version.py +++ b/awkward/version.py @@ -30,7 +30,7 @@ import re -__version__ = "0.8.3" +__version__ = "0.8.4" version = __version__ version_info = tuple(re.split(r"[-\.]", __version__)) From 216af58ee9d3176b52807a07be97c5f3906cc95c Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Thu, 7 Feb 2019 10:10:43 -0600 Subject: [PATCH 2/3] add tests for JaggedArray.zip --- awkward/array/jagged.py | 10 ++++++++-- tests/test_jagged.py | 15 +++++++++++++++ 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/awkward/array/jagged.py b/awkward/array/jagged.py index ca26f31c..52893ebf 100644 --- a/awkward/array/jagged.py +++ b/awkward/array/jagged.py @@ -1457,10 +1457,16 @@ def ready(x): raise TypeError("unrecognized type for JaggedArray.zip: {0}".format(type(x))) if isclassmethod: - table = cls.Table.fget(None)(columns1, *columns2, **columns3) + if isinstance(columns1, dict) or len(columns3) > 0: + table = cls.Table.fget(None)(columns1, *columns2, **columns3) + else: + table = cls.Table.fget(None).named("tuple", columns1, *columns2) return cls.JaggedArray.fget(None)(first._starts, first._stops, table) else: - table = first.Table(columns1, *columns2, **columns3) + if isinstance(columns1, dict) or len(columns3) > 0: + table = first.Table(columns1, *columns2, **columns3) + else: + table = first.Table.named("tuple", columns1, *columns2, **columns3) return first.JaggedArray(first._starts, first._stops, table) def pandas(self): diff --git a/tests/test_jagged.py b/tests/test_jagged.py index 9eec8867..f222efbb 100644 --- a/tests/test_jagged.py +++ b/tests/test_jagged.py @@ -32,6 +32,7 @@ import numpy +import awkward from awkward import * from awkward.type import * @@ -420,3 +421,17 @@ def test_jagged_getstruct(self): assert [a[i : i + 2].tolist() for i in range(len(a) - 1)] == [[[(0.0, 0.0), (1.1, 1.1), (2.2, 2.2)], []], [[], [(3.3, 3.3), (4.4, 4.4), (5.5, 5.5), (6.6, 6.6), (7.7, 7.7)]], [[(3.3, 3.3), (4.4, 4.4), (5.5, 5.5), (6.6, 6.6), (7.7, 7.7)], [(8.8, 8.8), (9.9, 9.9)]], [[(8.8, 8.8), (9.9, 9.9)], []]] assert [x.tolist() for x in a[[2, 1, 0, -2]]] == [[(3.3, 3.3), (4.4, 4.4), (5.5, 5.5), (6.6, 6.6), (7.7, 7.7)], [], [(0.0, 0.0), (1.1, 1.1), (2.2, 2.2)], [(8.8, 8.8), (9.9, 9.9)]] assert [x.tolist() for x in a[[True, False, True, False, True]]] == [[(0.0, 0.0), (1.1, 1.1), (2.2, 2.2)], [(3.3, 3.3), (4.4, 4.4), (5.5, 5.5), (6.6, 6.6), (7.7, 7.7)], []] + + def test_jagged_zip(self): + a = awkward.fromiter([[1.1, 2.2, 3.3], [], [4.4, 5.5]]) + b = awkward.JaggedArray([1, 5, 5], [4, 5, 7], [999, 10, 20, 30, 999, 40, 50, 999]) + c = numpy.array([100, 200, 300]) + d = 1000 + assert awkward.JaggedArray.zip(one=a, two=b).tolist() == [[{"one": 1.1, "two": 10}, {"one": 2.2, "two": 20}, {"one": 3.3, "two": 30}], [], [{"one": 4.4, "two": 40}, {"one": 5.5, "two": 50}]] + assert awkward.JaggedArray.zip(one=b, two=a).tolist() == [[{"one": 10, "two": 1.1}, {"one": 20, "two": 2.2}, {"one": 30, "two": 3.3}], [], [{"one": 40, "two": 4.4}, {"one": 50, "two": 5.5}]] + assert awkward.JaggedArray.zip(one=b, two=c).tolist() == [[{"one": 10, "two": 100}, {"one": 20, "two": 100}, {"one": 30, "two": 100}], [], [{"one": 40, "two": 300}, {"one": 50, "two": 300}]] + assert awkward.JaggedArray.zip(one=b, two=d).tolist() == [[{"one": 10, "two": 1000}, {"one": 20, "two": 1000}, {"one": 30, "two": 1000}], [], [{"one": 40, "two": 1000}, {"one": 50, "two": 1000}]] + assert a.zip(b).tolist() == [[(1.1, 10), (2.2, 20), (3.3, 30)], [], [(4.4, 40), (5.5, 50)]] + assert b.zip(a).tolist() == [[(10, 1.1), (20, 2.2), (30, 3.3)], [], [(40, 4.4), (50, 5.5)]] + assert b.zip(c).tolist() == [[(10, 100), (20, 100), (30, 100)], [], [(40, 300), (50, 300)]] + assert b.zip(d).tolist() == [[(10, 1000), (20, 1000), (30, 1000)], [], [(40, 1000), (50, 1000)]] From 55f5ce5257f23054b46f4c8eb8438dd9d04b7d33 Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Thu, 7 Feb 2019 10:11:20 -0600 Subject: [PATCH 3/3] typo --- awkward/array/jagged.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/awkward/array/jagged.py b/awkward/array/jagged.py index 52893ebf..bbc82b76 100644 --- a/awkward/array/jagged.py +++ b/awkward/array/jagged.py @@ -1466,7 +1466,7 @@ def ready(x): if isinstance(columns1, dict) or len(columns3) > 0: table = first.Table(columns1, *columns2, **columns3) else: - table = first.Table.named("tuple", columns1, *columns2, **columns3) + table = first.Table.named("tuple", columns1, *columns2) return first.JaggedArray(first._starts, first._stops, table) def pandas(self):