From 806809e8a724085c2aba9f19d0498285c279394e Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Wed, 24 Oct 2018 17:23:33 -0500 Subject: [PATCH 01/27] create a suite of Arrow objects to test --- awkward/view/__init__.py | 29 +++++++ awkward/view/arrow.py | 31 ++++++++ tests/test_arrow.py | 167 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 227 insertions(+) create mode 100644 awkward/view/__init__.py create mode 100644 awkward/view/arrow.py create mode 100644 tests/test_arrow.py diff --git a/awkward/view/__init__.py b/awkward/view/__init__.py new file mode 100644 index 00000000..e831ea82 --- /dev/null +++ b/awkward/view/__init__.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python + +# Copyright (c) 2018, DIANA-HEP +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/awkward/view/arrow.py b/awkward/view/arrow.py new file mode 100644 index 00000000..6f8faa07 --- /dev/null +++ b/awkward/view/arrow.py @@ -0,0 +1,31 @@ +#!/usr/bin/env python + +# Copyright (c) 2018, DIANA-HEP +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + diff --git a/tests/test_arrow.py b/tests/test_arrow.py new file mode 100644 index 00000000..7cbf7d29 --- /dev/null +++ b/tests/test_arrow.py @@ -0,0 +1,167 @@ +#!/usr/bin/env python + +# Copyright (c) 2018, DIANA-HEP +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import unittest + +import numpy +try: + import pyarrow +except ImportError: + pyarrow = None + +from awkward import * + +class Test(unittest.TestCase): + def runTest(self): + pass + + def test_array(self): + if pyarrow is not None: + arr = pyarrow.array([1.1, 2.2, 3.3, 4.4, 5.5]) + + def test_array_null(self): + if pyarrow is not None: + arr = pyarrow.array([1.1, 2.2, 3.3, None, 4.4, 5.5]) + + def test_chunked_array(self): + if pyarrow is not None: + arr = pyarrow.chunked_array([pyarrow.array([1.1, 2.2, 3.3, 4.4, 5.5]), pyarrow.array([]), pyarrow.array([6.6, 7.7, 8.8])]) + + def test_struct(self): + if pyarrow is not None: + arr = pyarrow.array([{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}]) + + def test_struct_null(self): + if pyarrow is not None: + arr = pyarrow.array([{"x": 1, "y": 1.1}, {"x": 2, "y": None}, {"x": 3, "y": 3.3}]) + + def test_null_struct(self): + if pyarrow is not None: + arr = pyarrow.array([{"x": 1, "y": 1.1}, None, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}]) + + def test_null_struct_null(self): + if pyarrow is not None: + arr = pyarrow.array([{"x": 1, "y": 1.1}, None, {"x": 2, "y": None}, {"x": 3, "y": 3.3}]) + + def test_chunked_struct(self): + if pyarrow is not None: + arr = pyarrow.chunked_array([pyarrow.array([{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}]), pyarrow.array([]), pyarrow.array([{"x": 4, "y": 4.4}, {"x": 5, "y": 5.5}])]) + + def test_nested_struct(self): + if pyarrow is not None: + arr = pyarrow.array([[{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}], [], [{"x": 4, "y": 4.4}, {"x": 5, "y": 5.5}]]) + + def test_nested_struct_null(self): + if pyarrow is not None: + arr = pyarrow.array([[{"x": 1, "y": 1.1}, {"x": 2, "y": None}, {"x": 3, "y": 3.3}], [], [{"x": 4, "y": 4.4}, {"x": 5, "y": 5.5}]]) + + def test_null_nested_struct(self): + if pyarrow is not None: + arr = pyarrow.array([[{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}], None, [], [{"x": 4, "y": 4.4}, {"x": 5, "y": 5.5}]]) + + def test_null_nested_struct_null(self): + if pyarrow is not None: + arr = pyarrow.array([[{"x": 1, "y": 1.1}, {"x": 2, "y": None}, {"x": 3, "y": 3.3}], None, [], [{"x": 4, "y": 4.4}, {"x": 5, "y": 5.5}]]) + + def test_struct_nested(self): + if pyarrow is not None: + arr = pyarrow.array([{"x": [], "y": 1.1}, {"x": [2], "y": 2.2}, {"x": [3, 3], "y": 3.3}]) + + def test_struct_nested_null(self): + if pyarrow is not None: + arr = pyarrow.array([{"x": [], "y": 1.1}, {"x": [2], "y": 2.2}, {"x": [None, 3], "y": 3.3}]) + + def test_nested_struct_nested(self): + if pyarrow is not None: + arr = pyarrow.array([[{"x": [], "y": 1.1}, {"x": [2], "y": 2.2}, {"x": [3, 3], "y": 3.3}], [], [{"x": [4, 4, 4], "y": 4.4}, {"x": [5, 5, 5, 5], "y": 5.5}]]) + + def test_null_nested_struct_nested_null(self): + if pyarrow is not None: + arr = pyarrow.array([[{"x": [], "y": 1.1}, {"x": [2], "y": 2.2}, {"x": [None, 3], "y": 3.3}], None, [], [{"x": [4, 4, 4], "y": 4.4}, {"x": [5, 5, 5, 5], "y": 5.5}]]) + + def test_strings(self): + if pyarrow is not None: + arr = pyarrow.array(["one", "two", "three", "four", "five"]) + + def test_strings_null(self): + if pyarrow is not None: + arr = pyarrow.array(["one", "two", None, "four", "five"]) + + def test_chunked_strings(self): + if pyarrow is not None: + arr = pyarrow.chunked_array([pyarrow.array(["one", "two", "three", "four", "five"]), pyarrow.array(["six", "seven", "eight"])]) + + def test_nested_strings(self): + if pyarrow is not None: + arr = pyarrow.array([["one", "two", "three"], [], ["four", "five"]]) + + def test_nested_strings_null(self): + if pyarrow is not None: + arr = pyarrow.array([["one", "two", None], [], ["four", "five"]]) + + def test_null_nested_strings_null(self): + if pyarrow is not None: + arr = pyarrow.array([["one", "two", None], [], None, ["four", "five"]]) + + def test_dictarray(self): + if pyarrow is not None: + arr = pyarrow.DictionaryArray.from_arrays(pyarrow.array([0, 0, 2, 2, 1, 0, 2, 1, 1]), pyarrow.array(["one", "two", "three"])) + + def test_dictarray_null(self): + if pyarrow is not None: + arr = pyarrow.DictionaryArray.from_arrays(pyarrow.array([0, 0, 2, None, 1, None, 2, 1, 1]), pyarrow.array(["one", "two", "three"])) + + def test_batch(self): + if pyarrow is not None: + arr = pyarrow.RecordBatch.from_arrays( + [pyarrow.array([1.1, 2.2, 3.3, None, 5.5]), + pyarrow.array([[1, 2, 3], [], [4, 5], [None], [6]]), + pyarrow.array([{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}, {"x": 4, "y": None}, {"x": 5, "y": 5.5}]), + pyarrow.array([{"x": 1, "y": 1.1}, None, None, {"x": 4, "y": None}, {"x": 5, "y": 5.5}]), + pyarrow.array([[{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}], [], [{"x": 4, "y": None}, {"x": 5, "y": 5.5}], [None], [{"x": 6, "y": 6.6}]])], + ["a", "b", "c", "d", "e"]) + + def test_batch(self): + if pyarrow is not None: + arr = pyarrow.Table.from_batches([ + pyarrow.RecordBatch.from_arrays( + [pyarrow.array([1.1, 2.2, 3.3, None, 5.5]), + pyarrow.array([[1, 2, 3], [], [4, 5], [None], [6]]), + pyarrow.array([{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}, {"x": 4, "y": None}, {"x": 5, "y": 5.5}]), + pyarrow.array([{"x": 1, "y": 1.1}, None, None, {"x": 4, "y": None}, {"x": 5, "y": 5.5}]), + pyarrow.array([[{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}], [], [{"x": 4, "y": None}, {"x": 5, "y": 5.5}], [None], [{"x": 6, "y": 6.6}]])], + ["a", "b", "c", "d", "e"]), + pyarrow.RecordBatch.from_arrays( + [pyarrow.array([1.1, 2.2, 3.3, None, 5.5]), + pyarrow.array([[1, 2, 3], [], [4, 5], [None], [6]]), + pyarrow.array([{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}, {"x": 4, "y": None}, {"x": 5, "y": 5.5}]), + pyarrow.array([{"x": 1, "y": 1.1}, None, None, {"x": 4, "y": None}, {"x": 5, "y": 5.5}]), + pyarrow.array([[{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}], [], [{"x": 4, "y": None}, {"x": 5, "y": 5.5}], [None], [{"x": 6, "y": 6.6}]])], + ["a", "b", "c", "d", "e"])]) From 2a3ae450c3607f76900215416a26d00e7a2a6631 Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Thu, 25 Oct 2018 06:17:46 -0500 Subject: [PATCH 02/27] test unions as well --- tests/test_arrow.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tests/test_arrow.py b/tests/test_arrow.py index 7cbf7d29..b599dbab 100644 --- a/tests/test_arrow.py +++ b/tests/test_arrow.py @@ -130,6 +130,30 @@ def test_null_nested_strings_null(self): if pyarrow is not None: arr = pyarrow.array([["one", "two", None], [], None, ["four", "five"]]) + def test_union_sparse(self): + if pyarrow is not None: + arr = pyarrow.UnionArray.from_sparse(pyarrow.array([0, 1, 0, 0, 1], type=pyarrow.int8()), [pyarrow.array([0.0, 1.1, 2.2, 3.3, 4.4]), pyarrow.array([True, True, False, True, False])]) + + def test_union_sparse_null(self): + if pyarrow is not None: + arr = pyarrow.UnionArray.from_sparse(pyarrow.array([0, 1, 0, 0, 1], type=pyarrow.int8()), [pyarrow.array([0.0, 1.1, None, 3.3, 4.4]), pyarrow.array([True, True, False, True, False])]) + + def test_union_sparse_null_null(self): + if pyarrow is not None: + arr = pyarrow.UnionArray.from_sparse(pyarrow.array([0, 1, 0, 0, 1], type=pyarrow.int8()), [pyarrow.array([0.0, 1.1, None, 3.3, 4.4]), pyarrow.array([True, None, False, True, False])]) + + def test_union_dense(self): + if pyarrow is not None: + pyarrow.UnionArray.from_dense(pyarrow.array([0, 1, 0, 0, 0, 1, 1], type=pyarrow.int8()), pyarrow.array([0, 0, 1, 2, 3, 1, 2], type=pyarrow.int32()), [pyarrow.array([0.0, 1.1, 2.2, 3.3]), pyarrow.array([True, True, False])]) + + def test_union_dense_null(self): + if pyarrow is not None: + pyarrow.UnionArray.from_dense(pyarrow.array([0, 1, 0, 0, 0, 1, 1], type=pyarrow.int8()), pyarrow.array([0, 0, 1, 2, 3, 1, 2], type=pyarrow.int32()), [pyarrow.array([0.0, 1.1, None, 3.3]), pyarrow.array([True, True, False])]) + + def test_union_dense_null_null(self): + if pyarrow is not None: + pyarrow.UnionArray.from_dense(pyarrow.array([0, 1, 0, 0, 0, 1, 1], type=pyarrow.int8()), pyarrow.array([0, 0, 1, 2, 3, 1, 2], type=pyarrow.int32()), [pyarrow.array([0.0, 1.1, None, 3.3]), pyarrow.array([True, None, False])]) + def test_dictarray(self): if pyarrow is not None: arr = pyarrow.DictionaryArray.from_arrays(pyarrow.array([0, 0, 2, 2, 1, 0, 2, 1, 1]), pyarrow.array(["one", "two", "three"])) From af7aa6dc4cf9b03ef3e9f666af4e96bc5524c604 Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Thu, 25 Oct 2018 09:44:29 -0500 Subject: [PATCH 03/27] we need a StringArray (for Arrow) --- awkward/__init__.py | 4 +- awkward/array/jagged.py | 2 - awkward/array/objects.py | 11 +- awkward/derived/__init__.py | 29 ++++ awkward/derived/strings.py | 333 ++++++++++++++++++++++++++++++++++++ awkward/util.py | 1 + 6 files changed, 375 insertions(+), 5 deletions(-) create mode 100644 awkward/derived/__init__.py create mode 100644 awkward/derived/strings.py diff --git a/awkward/__init__.py b/awkward/__init__.py index d935910f..5494c502 100644 --- a/awkward/__init__.py +++ b/awkward/__init__.py @@ -37,6 +37,8 @@ from awkward.array.union import UnionArray from awkward.array.virtual import VirtualArray +from awkward.derived.strings import StringArray + from awkward.generate import fromiter from awkward.persist import serialize, deserialize, save, load, tohdf5, fromhdf5 @@ -44,4 +46,4 @@ # convenient access to the version number from awkward.version import __version__ -__all__ = ["ChunkedArray", "AppendableArray", "IndexedArray", "ByteIndexedArray", "SparseArray", "JaggedArray", "ByteJaggedArray", "MaskedArray", "BitMaskedArray", "IndexedMaskedArray", "Methods", "ObjectArray", "Table", "UnionArray", "VirtualArray", "fromiter", "serialize", "deserialize", "save", "load", "tohdf5", "fromhdf5", "__version__"] +__all__ = ["ChunkedArray", "AppendableArray", "IndexedArray", "ByteIndexedArray", "SparseArray", "JaggedArray", "ByteJaggedArray", "MaskedArray", "BitMaskedArray", "IndexedMaskedArray", "Methods", "ObjectArray", "Table", "UnionArray", "VirtualArray", "StringArray", "fromiter", "serialize", "deserialize", "save", "load", "tohdf5", "fromhdf5", "__version__"] diff --git a/awkward/array/jagged.py b/awkward/array/jagged.py index ed4a405e..9dd19d29 100644 --- a/awkward/array/jagged.py +++ b/awkward/array/jagged.py @@ -242,7 +242,6 @@ def __awkward_persist__(self, ident, fill, **kwargs): "call": ["awkward", n, "fromcounts"], "args": [fill(self.counts, n + ".counts", **kwargs), fill(self._content, n + ".content", **kwargs)]} - else: return {"id": ident, "call": ["awkward", n], @@ -1154,7 +1153,6 @@ def __awkward_persist__(self, ident, fill, **kwargs): "args": [fill(self.counts, n + ".counts", **kwargs), fill(self._content, n + ".content", **kwargs), {"call": ["awkward.persist", "json2dtype"], "args": [awkward.persist.dtype2json(self._subdtype)]}]} - else: return {"id": ident, "call": ["awkward", n], diff --git a/awkward/array/objects.py b/awkward/array/objects.py index f90ff978..088c9bf5 100644 --- a/awkward/array/objects.py +++ b/awkward/array/objects.py @@ -30,6 +30,7 @@ import importlib import pickle +import base64 import awkward.array.base import awkward.type @@ -54,6 +55,12 @@ def maybemixin(sample, awkwardtype): else: return awkwardtype +def frompython(obj): + return base64.b64encode(pickle.dumps(obj)).decode("ascii") + +def topython(string): + return pickle.loads(base64.b64decode(string.encode("ascii"))) + class ObjectArray(awkward.array.base.AwkwardArrayWithContent): def __init__(self, content, generator, *args, **kwargs): self.content = content @@ -133,9 +140,9 @@ def __awkward_persist__(self, ident, fill, **kwargs): "call": ["awkward", n], "args": [fill(self._content, n + ".content", **kwargs), {"function": spec}]} if len(self._args) > 0: - out["*"] = {"call": ["pickle", "loads"], "args": [pickle.dumps(self._args)]} + out["*"] = {"call": ["awkward.array.objects", "topython"], "args": [frompython(self._args)]} if len(self._kwargs) > 0: - out["**"] = {"call": ["pickle", "loads"], "args": [pickle.dumps(self._kwargs)]} + out["**"] = {"call": ["awkward.array.objects", "topython"], "args": [frompython(self._kwargs)]} return out diff --git a/awkward/derived/__init__.py b/awkward/derived/__init__.py new file mode 100644 index 00000000..e831ea82 --- /dev/null +++ b/awkward/derived/__init__.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python + +# Copyright (c) 2018, DIANA-HEP +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/awkward/derived/strings.py b/awkward/derived/strings.py new file mode 100644 index 00000000..21d7dc44 --- /dev/null +++ b/awkward/derived/strings.py @@ -0,0 +1,333 @@ +#!/usr/bin/env python + +# Copyright (c) 2018, DIANA-HEP +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import codecs + +import awkward.util +import awkward.array.jagged +import awkward.array.objects + +class StringMethods(object): + def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): + if method != "__call__": + raise NotImplemented + + if ufunc is awkward.util.numpy.equal or ufunc is awkward.util.numpy.not_equal: + if len(inputs) < 2: + raise ValueError("invalid number of arguments") + left, right = inputs[0], inputs[1] + + if isinstance(left, (str, bytes)): + left = StringArray.fromstr(len(right), left) + elif isinstance(left, awkward.util.numpy.ndarray) and (left.dtype.kind == "U" or left.dtype.kind == "S"): + left = StringArray.fromnumpy(left) + elif isinstance(left, awkward.util.numpy.ndarray) and left.dtype == awkward.util.numpy.dtype(object): + left = StringArray.fromiter(left) + elif not isinstance(left, StringMethods): + return awkward.util.numpy.zeros(len(right), dtype=awkward.util.BOOLTYPE) + + if isinstance(right, (str, bytes)): + right = StringArray.fromstr(len(left), right) + elif isinstance(right, awkward.util.numpy.ndarray) and (right.dtype.kind == "U" or right.dtype.kind == "S"): + right = StringArray.fromnumpy(right) + elif isinstance(right, awkward.util.numpy.ndarray) and right.dtype == awkward.util.numpy.dtype(object): + right = StringArray.fromiter(right) + elif not isinstance(right, StringMethods): + return awkward.util.numpy.zeros(len(left), dtype=awkward.util.BOOLTYPE) + + left = awkward.array.jagged.JaggedArray(left._starts, left._stops, left._content) + right = awkward.array.jagged.JaggedArray(right._starts, right._stops, right._content) + + maybeequal = (left.counts == right.counts) + + leftmask = left[maybeequal] + rightmask = right[maybeequal] + reallyequal = (leftmask == rightmask).count_nonzero() == leftmask.counts() + + out = awkward.util.numpy.zeros(len(left), dtype=awkward.util.BOOLTYPE) + out[maybeequal] = reallyequal + + if ufunc is awkward.util.numpy.equal: + return out + else: + return awkward.util.numpy.logical_not(out) + + else: + return super(StringMethods, self).__array_ufunc__(ufunc, method, *inputs, **kwargs) + +def tostring(x, decoder): + if decoder is None: + return x.tostring() + else: + return decoder(x, errors="replace")[0] + +class StringArray(StringMethods, awkward.array.objects.ObjectArray): + def __init__(self, starts, stops, content, encoding="utf-8"): + self._content = awkward.array.jagged.ByteJaggedArray(starts, stops, content, awkward.util.CHARTYPE) + self._generator = tostring + self._kwargs = {} + self.encoding = encoding + + @classmethod + def fromstr(cls, length, string, encoding="utf-8"): + if encoding is not None: + encoder = codecs.getencoder(encoding) + string = encoder(string, errors="replace")[0] + content = awkward.util.numpy.empty(length * len(string), dtype=awkward.util.CHARTYPE) + for i, x in string: + content[0::length] = ord(x) + counts = awkward.util.numpy.empty(length, dtype=awkward.util.INDEXTYPE) + counts[:] = length + return cls.fromcounts(counts, content, encoding) + + @classmethod + def fromnumpy(cls, array): + if array.dtype.kind == "S": + encoding = None + elif array.dtype.kind == "U": + encoding = "utf-32le" + else: + raise TypeError("not a string array") + + starts = awkward.util.numpy.arange( 0, len(array) * array.dtype.itemsize, array.dtype.itemsize) + stops = awkward.util.numpy.arange(array.dtype.itemsize, (len(array) + 1) * array.dtype.itemsize, array.dtype.itemsize) + content = array.view(awkward.util.CHARTYPE) + + shorter = awkward.util.numpy.ones(len(array), dtype=awkward.util.BOOLTYPE) + if array.dtype.kind == "S": + for checkat in range(array.dtype.itemsize - 1, -1, -1): + shorter &= (content[checkat::array.dtype.itemsize] == 0) + stops[shorter] -= 1 + if not shorter.any(): + break + + elif array.dtype.kind == "U": + content2 = content.view(awkward.util.numpy.uint32) + itemsize2 = array.dtype.itemsize >> 2 # itemsize // 4 + for checkat in range(itemsize2 - 1, -1, -1): + shorter &= (content2[checkat::itemsize2] == 0) # all four bytes are zero + stops[shorter] -= 4 + if not shorter.any(): + break + + out = cls.__new__(cls) + out._content = awkward.array.jagged.ByteJaggedArray(starts, stops, content, awkward.util.CHARTYPE) + out._generator = tostring + out._kwargs = {} + out.encoding = encoding + return out + + @classmethod + def fromiter(cls, iterable, encoding="utf-8"): + if encoding is None: + encoded = iterable + else: + encoder = codecs.getencoder(encoding) + encoded = [encoder(x, errors="replace")[0] for x in iterable] + counts = [len(x) for x in encoded] + content = awkward.util.numpy.empty(sum(counts), dtype=awkward.util.CHARTYPE) + return cls.fromcounts(counts, content, encoding) + + @classmethod + def fromoffsets(cls, offsets, content, encoding="utf-8"): + out = cls.__new__(cls) + out._content = awkward.array.jagged.ByteJaggedArray.fromoffsets(offsets, content, awkward.util.CHARTYPE) + out._generator = tostring + out._kwargs = {} + out.encoding = encoding + return out + + @classmethod + def fromcounts(cls, counts, content, encoding="utf-8"): + out = cls.__new__(cls) + out._content = awkward.array.jagged.ByteJaggedArray.fromcounts(counts, content, awkward.util.CHARTYPE) + out._generator = tostring + out._kwargs = {} + out.encoding = encoding + return out + + @classmethod + def fromparents(cls, parents, content, encoding="utf-8"): + out = cls.__new__(cls) + out._content = awkward.array.jagged.ByteJaggedArray.fromparents(parents, content, awkward.util.CHARTYPE) + out._generator = tostring + out._kwargs = {} + out.encoding = encoding + return out + + @classmethod + def fromuniques(cls, uniques, content, encoding="utf-8"): + out = cls.__new__(cls) + out._content = awkward.array.jagged.ByteJaggedArray.fromuniques(uniques, content, awkward.util.CHARTYPE) + out._generator = tostring + out._kwargs = {} + out.encoding = encoding + return out + + @classmethod + def fromjagged(cls, jagged, encoding="utf-8"): + if jagged.content.type.to != awkward.util.CHARTYPE: + raise TypeError("jagged array must have CHARTYPE ({0})".format(str(awkward.util.CHARTYPE))) + out = cls.__new__(cls) + out._content = jagged + out._generator = tostring + out._kwargs = {} + out.encoding = encoding + return out + + # def copy(self, content=None, generator=None, args=None, kwargs=None): + # out = self.__class__.__new__(self.__class__) + # out._content = self._content + # out._generator = self._generator + # out._args = self._args + # out._kwargs = self._kwargs + # if content is not None: + # out.content = content + # if generator is not None: + # out.generator = generator + # if args is not None: + # out.args = args + # if kwargs is not None: + # out.kwargs = kwargs + # return out + + # def deepcopy(self, content=None, generator=None, args=None, kwargs=None): + # out = self.copy(content=content, generator=generator, args=args, kwargs=kwargs) + # out._content = awkward.util.deepcopy(out._content) + # return out + + # def empty_like(self, **overrides): + # mine = {} + # mine["generator"] = overrides.pop("generator", self._generator) + # mine["args"] = overrides.pop("args", self._args) + # mine["kwargs"] = overrides.pop("kwargs", self._kwargs) + # if isinstance(self._content, awkward.util.numpy.ndarray): + # return self.copy(content=awkward.util.numpy.empty_like(self._content), **mine) + # else: + # return self.copy(content=self._content.empty_like(**overrides), **mine) + + # def zeros_like(self, **overrides): + # mine = {} + # mine["generator"] = overrides.pop("generator", self._generator) + # mine["args"] = overrides.pop("args", self._args) + # mine["kwargs"] = overrides.pop("kwargs", self._kwargs) + # if isinstance(self._content, awkward.util.numpy.ndarray): + # return self.copy(content=awkward.util.numpy.zeros_like(self._content), **mine) + # else: + # return self.copy(content=self._content.zeros_like(**overrides), **mine) + + # def ones_like(self, **overrides): + # mine = {} + # mine["generator"] = overrides.pop("generator", self._generator) + # mine["args"] = overrides.pop("args", self._args) + # mine["kwargs"] = overrides.pop("kwargs", self._kwargs) + # if isinstance(self._content, awkward.util.numpy.ndarray): + # return self.copy(content=awkward.util.numpy.ones_like(self._content), **mine) + # else: + # return self.copy(content=self._content.ones_like(**overrides), **mine) + + def __awkward_persist__(self, ident, fill, **kwargs): + self._valid() + n = self.__class__.__name__ + if awkward.array.jagged.offsetsaliased(self.starts, self.stops) and len(self.starts) > 0 and self.starts[0] == 0: + return {"id": ident, + "call": ["awkward", n, "fromcounts"], + "args": [fill(self.counts, n + ".counts", **kwargs), + fill(self.content, n + ".content", **kwargs), + self._encoding]} + else: + return {"id": ident, + "call": ["awkward", n], + "args": [fill(self.starts, n + ".starts", **kwargs), + fill(self.stops, n + ".stops", **kwargs), + fill(self.content, n + ".content", **kwargs), + self._encoding]} + + @property + def starts(self): + return self._content.starts + + @starts.setter + def starts(self, value): + self._content.starts = value + + @property + def stops(self): + return self._content.stops + + @stops.setter + def stops(self, value): + self._content.stops = value + + @property + def content(self): + return self._content.content + + @content.setter + def content(self, value): + self._content.content = value + + @property + def args(self): + return self._args + + @property + def kwargs(self): + return {} + + @property + def encoding(self): + return self._encoding + + @encoding.setter + def encoding(self, value): + if value is None: + decodefcn = None + else: + decodefcn = codecs.getdecoder(value) + self._encoding = value + self._args = (decodefcn,) + + @property + def offsets(self): + return self._content.offsets + + @property + def counts(self): + return self._content.counts + + @property + def parents(self): + return self._content.parents + + @property + def index(self): + return self._content.index + diff --git a/awkward/util.py b/awkward/util.py index a7a3477d..25f4d690 100644 --- a/awkward/util.py +++ b/awkward/util.py @@ -83,6 +83,7 @@ def is_intstring(x): TAGTYPE = numpy.dtype(numpy.uint8) MASKTYPE = numpy.dtype(numpy.bool_) BITMASKTYPE = numpy.dtype(numpy.uint8) +BOOLTYPE = numpy.dtype(numpy.bool_) def toarray(value, defaultdtype, passthrough=None): import awkward.array.base From 436e6b12060d8240ab90e9588883b5f690ec5fd4 Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Thu, 25 Oct 2018 10:20:49 -0500 Subject: [PATCH 04/27] StringArray does equality correctly (the most important ufunc for strings) --- awkward/array/jagged.py | 9 ++-- awkward/derived/strings.py | 106 ++++++++++++++++++------------------- 2 files changed, 56 insertions(+), 59 deletions(-) diff --git a/awkward/array/jagged.py b/awkward/array/jagged.py index 9dd19d29..19317640 100644 --- a/awkward/array/jagged.py +++ b/awkward/array/jagged.py @@ -663,18 +663,19 @@ def recurse(x): for i in range(len(inputs)): if isinstance(inputs[i], JaggedArray): if good is None: - inputs[i] = inputs[i].content + inputs[i] = inputs[i].flatten() else: - inputs[i] = inputs[i].content[good] + inputs[i] = inputs[i].flatten()[good] result = getattr(ufunc, method)(*inputs, **kwargs) + counts = stops - starts if isinstance(result, tuple): - return tuple(awkward.array.objects.Methods.maybemixin(type(x), JaggedArray)(starts, stops, x) if isinstance(x, (awkward.util.numpy.ndarray, awkward.array.base.AwkwardBase)) else x for x in result) + return tuple(awkward.array.objects.Methods.maybemixin(type(x), JaggedArray).fromcounts(counts, x) if isinstance(x, (awkward.util.numpy.ndarray, awkward.array.base.AwkwardBase)) else x for x in result) elif method == "at": return None else: - return awkward.array.objects.Methods.maybemixin(type(result), JaggedArray)(starts, stops, result) + return awkward.array.objects.Methods.maybemixin(type(result), JaggedArray).fromcounts(counts, result) @staticmethod def aligned(*jaggedarrays): diff --git a/awkward/derived/strings.py b/awkward/derived/strings.py index 21d7dc44..f24ce6c7 100644 --- a/awkward/derived/strings.py +++ b/awkward/derived/strings.py @@ -62,14 +62,15 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): elif not isinstance(right, StringMethods): return awkward.util.numpy.zeros(len(left), dtype=awkward.util.BOOLTYPE) - left = awkward.array.jagged.JaggedArray(left._starts, left._stops, left._content) - right = awkward.array.jagged.JaggedArray(right._starts, right._stops, right._content) + left = awkward.array.jagged.JaggedArray(left.starts, left.stops, left.content) + right = awkward.array.jagged.JaggedArray(right.starts, right.stops, right.content) maybeequal = (left.counts == right.counts) leftmask = left[maybeequal] rightmask = right[maybeequal] - reallyequal = (leftmask == rightmask).count_nonzero() == leftmask.counts() + + reallyequal = (leftmask == rightmask).count_nonzero() == leftmask.counts out = awkward.util.numpy.zeros(len(left), dtype=awkward.util.BOOLTYPE) out[maybeequal] = reallyequal @@ -99,7 +100,7 @@ def __init__(self, starts, stops, content, encoding="utf-8"): def fromstr(cls, length, string, encoding="utf-8"): if encoding is not None: encoder = codecs.getencoder(encoding) - string = encoder(string, errors="replace")[0] + string = encoder(string)[0] content = awkward.util.numpy.empty(length * len(string), dtype=awkward.util.CHARTYPE) for i, x in string: content[0::length] = ord(x) @@ -150,9 +151,13 @@ def fromiter(cls, iterable, encoding="utf-8"): encoded = iterable else: encoder = codecs.getencoder(encoding) - encoded = [encoder(x, errors="replace")[0] for x in iterable] + encoded = [encoder(x)[0] for x in iterable] counts = [len(x) for x in encoded] content = awkward.util.numpy.empty(sum(counts), dtype=awkward.util.CHARTYPE) + i = 0 + for x in encoded: + content[i : i + len(x)] = awkward.util.numpy.frombuffer(x, dtype=awkward.util.CHARTYPE) + i += len(x) return cls.fromcounts(counts, content, encoding) @classmethod @@ -202,56 +207,47 @@ def fromjagged(cls, jagged, encoding="utf-8"): out.encoding = encoding return out - # def copy(self, content=None, generator=None, args=None, kwargs=None): - # out = self.__class__.__new__(self.__class__) - # out._content = self._content - # out._generator = self._generator - # out._args = self._args - # out._kwargs = self._kwargs - # if content is not None: - # out.content = content - # if generator is not None: - # out.generator = generator - # if args is not None: - # out.args = args - # if kwargs is not None: - # out.kwargs = kwargs - # return out - - # def deepcopy(self, content=None, generator=None, args=None, kwargs=None): - # out = self.copy(content=content, generator=generator, args=args, kwargs=kwargs) - # out._content = awkward.util.deepcopy(out._content) - # return out - - # def empty_like(self, **overrides): - # mine = {} - # mine["generator"] = overrides.pop("generator", self._generator) - # mine["args"] = overrides.pop("args", self._args) - # mine["kwargs"] = overrides.pop("kwargs", self._kwargs) - # if isinstance(self._content, awkward.util.numpy.ndarray): - # return self.copy(content=awkward.util.numpy.empty_like(self._content), **mine) - # else: - # return self.copy(content=self._content.empty_like(**overrides), **mine) - - # def zeros_like(self, **overrides): - # mine = {} - # mine["generator"] = overrides.pop("generator", self._generator) - # mine["args"] = overrides.pop("args", self._args) - # mine["kwargs"] = overrides.pop("kwargs", self._kwargs) - # if isinstance(self._content, awkward.util.numpy.ndarray): - # return self.copy(content=awkward.util.numpy.zeros_like(self._content), **mine) - # else: - # return self.copy(content=self._content.zeros_like(**overrides), **mine) - - # def ones_like(self, **overrides): - # mine = {} - # mine["generator"] = overrides.pop("generator", self._generator) - # mine["args"] = overrides.pop("args", self._args) - # mine["kwargs"] = overrides.pop("kwargs", self._kwargs) - # if isinstance(self._content, awkward.util.numpy.ndarray): - # return self.copy(content=awkward.util.numpy.ones_like(self._content), **mine) - # else: - # return self.copy(content=self._content.ones_like(**overrides), **mine) + def copy(self, starts=None, stops=None, content=None, encoding=None): + out = self.__class__.__new__(self.__class__) + out._content = awkward.array.jagged.ByteJaggedArray(self.starts, self.stops, self.content, awkward.util.CHARTYPE) + out._generator = self._generator + out._args = self._args + out._kwargs = self._kwargs + out._encoding = self._encoding + if starts is not None: + out.starts = starts + if stops is not None: + out.stops = stops + if content is not None: + out.content = content + if encoding is not None: + out.encoding = encoding + return out + + def deepcopy(self, starts=None, stops=None, content=None, encoding=None): + out = self.copy(starts=starts, stops=stops, content=content, encoding=encoding) + out._content._starts = awkward.util.deepcopy(out._content._starts) + out._content._stops = awkward.util.deepcopy(out._content._stops) + out._content._content = awkward.util.deepcopy(out._content._content) + return out + + def empty_like(self, **overrides): + mine = {} + mine["encoding"] = overrides.pop("encoding", self._encoding) + jagged = self._content.empty_like(**overrides) + return self.copy(jagged.starts, jagged.stops, jagged.content, **mine) + + def zeros_like(self, **overrides): + mine = {} + mine["encoding"] = overrides.pop("encoding", self._encoding) + jagged = self._content.zeros_like(**overrides) + return self.copy(jagged.starts, jagged.stops, jagged.content, **mine) + + def ones_like(self, **overrides): + mine = {} + mine["encoding"] = overrides.pop("encoding", self._encoding) + jagged = self._content.ones_like(**overrides) + return self.copy(jagged.starts, jagged.stops, jagged.content, **mine) def __awkward_persist__(self, ident, fill, **kwargs): self._valid() From abb918e8921421cf33e8667b56ab9f6fcb9e02f8 Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Thu, 25 Oct 2018 10:24:30 -0500 Subject: [PATCH 05/27] applying 'good' after 'flatten' is wrong --- awkward/array/jagged.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/awkward/array/jagged.py b/awkward/array/jagged.py index 19317640..86b358a9 100644 --- a/awkward/array/jagged.py +++ b/awkward/array/jagged.py @@ -662,10 +662,7 @@ def recurse(x): for i in range(len(inputs)): if isinstance(inputs[i], JaggedArray): - if good is None: - inputs[i] = inputs[i].flatten() - else: - inputs[i] = inputs[i].flatten()[good] + inputs[i] = inputs[i].flatten() result = getattr(ufunc, method)(*inputs, **kwargs) From 7b1bbe856d31f12b90bfddf9895d76685aa5d3c3 Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Thu, 25 Oct 2018 10:38:01 -0500 Subject: [PATCH 06/27] pass through all 'fill' parameters so that schemas can look up schemas --- awkward/array/chunked.py | 10 +++++----- awkward/array/indexed.py | 20 ++++++++++---------- awkward/array/jagged.py | 24 ++++++++++++------------ awkward/array/masked.py | 18 +++++++++--------- awkward/array/objects.py | 4 ++-- awkward/array/table.py | 4 ++-- awkward/array/union.py | 12 ++++++------ awkward/array/virtual.py | 4 ++-- awkward/derived/strings.py | 12 ++++++------ awkward/persist.py | 6 +++--- 10 files changed, 57 insertions(+), 57 deletions(-) diff --git a/awkward/array/chunked.py b/awkward/array/chunked.py index 344303b2..91627517 100644 --- a/awkward/array/chunked.py +++ b/awkward/array/chunked.py @@ -77,14 +77,14 @@ def ones_like(self, **overrides): mine = self._mine(overrides) return self.copy([awkward.util.numpy.ones_like(x) if isinstance(x, awkward.util.numpy.ndarray) else x.ones_like(**overrides) for x in self._chunks], counts=list(self._counts), **mine) - def __awkward_persist__(self, ident, fill, **kwargs): + def __awkward_persist__(self, ident, fill, prefix, suffix, schemasuffix, storage, compression, **kwargs): self.knowcounts() self._valid() n = self.__class__.__name__ return {"id": ident, "call": ["awkward", n], - "args": [{"list": [fill(x, n + ".chunk", **kwargs) for c, x in zip(self._counts, self._chunks) if c > 0]}, - fill(awkward.util.numpy.array([c for c in self._counts if c > 0]), n + ".counts", **kwargs)]} + "args": [{"list": [fill(x, n + ".chunk", prefix, suffix, schemasuffix, storage, compression, **kwargs) for c, x in zip(self._counts, self._chunks) if c > 0]}, + fill(awkward.util.numpy.array([c for c in self._counts if c > 0]), n + ".counts", prefix, suffix, schemasuffix, storage, compression, **kwargs)]} @property def chunks(self): @@ -633,7 +633,7 @@ def _mine(self, overrides): mine["dtype"] = overrides.pop("dtype", self._dtype) return mine - def __awkward_persist__(self, ident, fill, **kwargs): + def __awkward_persist__(self, ident, fill, prefix, suffix, schemasuffix, storage, compression, **kwargs): self._valid() n = self.__class__.__name__ @@ -648,7 +648,7 @@ def __awkward_persist__(self, ident, fill, **kwargs): "call": ["awkward", n], "args": [{"tuple": list(self._chunkshape)}, {"call": ["awkward.persist", "json2dtype"], "args": [awkward.persist.dtype2json(self._dtype)]}, - {"list": [fill(x, n + ".chunk", **kwargs) for x in chunks]}]} + {"list": [fill(x, n + ".chunk", prefix, suffix, schemasuffix, storage, compression, **kwargs) for x in chunks]}]} @property def chunkshape(self): diff --git a/awkward/array/indexed.py b/awkward/array/indexed.py index c7cfa9d5..52f9e0af 100644 --- a/awkward/array/indexed.py +++ b/awkward/array/indexed.py @@ -87,13 +87,13 @@ def ones_like(self, **overrides): else: return self.copy(content=self._content.ones_like(**overrides)) - def __awkward_persist__(self, ident, fill, **kwargs): + def __awkward_persist__(self, ident, fill, prefix, suffix, schemasuffix, storage, compression, **kwargs): self._valid() n = self.__class__.__name__ return {"id": ident, "call": ["awkward", n], - "args": [fill(self._index, n + ".index", **kwargs), - fill(self._content, n + ".content", **kwargs)]} + "args": [fill(self._index, n + ".index", prefix, suffix, schemasuffix, storage, compression, **kwargs), + fill(self._content, n + ".content", prefix, suffix, schemasuffix, storage, compression, **kwargs)]} @property def index(self): @@ -258,13 +258,13 @@ def ones_like(self, **overrides): else: return self.copy(content=self._content.ones_like(**overrides), **mine) - def __awkward_persist__(self, ident, fill, **kwargs): + def __awkward_persist__(self, ident, fill, prefix, suffix, schemasuffix, storage, compression, **kwargs): self._valid() n = self.__class__.__name__ return {"id": ident, "call": ["awkward", n], - "args": [fill(self._index, n + ".index", **kwargs), - fill(self._content, n + ".content", **kwargs), + "args": [fill(self._index, n + ".index", prefix, suffix, schemasuffix, storage, compression, **kwargs), + fill(self._content, n + ".content", prefix, suffix, schemasuffix, storage, compression, **kwargs), {"call": ["awkward.persist", "json2dtype"], "args": [awkward.persist.dtype2json(self._dtype)]}]} @property @@ -423,22 +423,22 @@ def ones_like(self, **overrides): else: return self.copy(content=self._content.ones_like(**overrides), **mine) - def __awkward_persist__(self, ident, fill, **kwargs): + def __awkward_persist__(self, ident, fill, prefix, suffix, schemasuffix, storage, compression, **kwargs): self._valid() n = self.__class__.__name__ if self._default is None or isinstance(self._default, (numbers.Real, awkward.util.numpy.integer, awkward.util.numpy.floating)): default = self._default elif isinstance(self._default, awkward.util.numpy.ndarray): - default = fill(self._default, n + ".default") + default = fill(self._default, n + ".default", prefix, suffix, schemasuffix, storage, compression, **kwargs) else: default = {"call": ["pickle", "loads"], "args": pickle.dumps(self._default)} return {"id": ident, "call": ["awkward", n], "args": [self._length, - fill(self._index, n + ".index", **kwargs), - fill(self._content, n + ".content", **kwargs), + fill(self._index, n + ".index", prefix, suffix, schemasuffix, storage, compression, **kwargs), + fill(self._content, n + ".content", prefix, suffix, schemasuffix, storage, compression, **kwargs), default]} @property diff --git a/awkward/array/jagged.py b/awkward/array/jagged.py index 86b358a9..61cd4535 100644 --- a/awkward/array/jagged.py +++ b/awkward/array/jagged.py @@ -234,20 +234,20 @@ def ones_like(self, **overrides): else: return self.copy(content=self._content.ones_like(**overrides)) - def __awkward_persist__(self, ident, fill, **kwargs): + def __awkward_persist__(self, ident, fill, prefix, suffix, schemasuffix, storage, compression, **kwargs): self._valid() n = self.__class__.__name__ if offsetsaliased(self._starts, self._stops) and len(self._starts) > 0 and self._starts[0] == 0: return {"id": ident, "call": ["awkward", n, "fromcounts"], - "args": [fill(self.counts, n + ".counts", **kwargs), - fill(self._content, n + ".content", **kwargs)]} + "args": [fill(self.counts, n + ".counts", prefix, suffix, schemasuffix, storage, compression, **kwargs), + fill(self._content, n + ".content", prefix, suffix, schemasuffix, storage, compression, **kwargs)]} else: return {"id": ident, "call": ["awkward", n], - "args": [fill(self._starts, n + ".starts", **kwargs), - fill(self._stops, n + ".stops", **kwargs), - fill(self._content, n + ".content", **kwargs)]} + "args": [fill(self._starts, n + ".starts", prefix, suffix, schemasuffix, storage, compression, **kwargs), + fill(self._stops, n + ".stops", prefix, suffix, schemasuffix, storage, compression, **kwargs), + fill(self._content, n + ".content", prefix, suffix, schemasuffix, storage, compression, **kwargs)]} @property def starts(self): @@ -1142,21 +1142,21 @@ def deepcopy(self, starts=None, stops=None, content=None, subdtype=None): out.subdtype = subdtype return out - def __awkward_persist__(self, ident, fill, **kwargs): + def __awkward_persist__(self, ident, fill, prefix, suffix, schemasuffix, storage, compression, **kwargs): self._valid() n = self.__class__.__name__ if offsetsaliased(self._starts, self._stops) and len(self._starts) > 0 and self._starts[0] == 0: return {"id": ident, "call": ["awkward", n, "fromcounts"], - "args": [fill(self.counts, n + ".counts", **kwargs), - fill(self._content, n + ".content", **kwargs), + "args": [fill(self.counts, n + ".counts", prefix, suffix, schemasuffix, storage, compression, **kwargs), + fill(self._content, n + ".content", prefix, suffix, schemasuffix, storage, compression, **kwargs), {"call": ["awkward.persist", "json2dtype"], "args": [awkward.persist.dtype2json(self._subdtype)]}]} else: return {"id": ident, "call": ["awkward", n], - "args": [fill(self._starts, n + ".starts", **kwargs), - fill(self._stops, n + ".stops", **kwargs), - fill(self._content, n + ".content", **kwargs), + "args": [fill(self._starts, n + ".starts", prefix, suffix, schemasuffix, storage, compression, **kwargs), + fill(self._stops, n + ".stops", prefix, suffix, schemasuffix, storage, compression, **kwargs), + fill(self._content, n + ".content", prefix, suffix, schemasuffix, storage, compression, **kwargs), {"call": ["awkward.persist", "json2dtype"], "args": [awkward.persist.dtype2json(self._subdtype)]}]} @property diff --git a/awkward/array/masked.py b/awkward/array/masked.py index f09d57ec..0da1511e 100644 --- a/awkward/array/masked.py +++ b/awkward/array/masked.py @@ -106,13 +106,13 @@ def ones_like(self, **overrides): else: return self.copy(content=self._content.ones_like(**overrides), **mine) - def __awkward_persist__(self, ident, fill, **kwargs): + def __awkward_persist__(self, ident, fill, prefix, suffix, schemasuffix, storage, compression, **kwargs): self._valid() n = self.__class__.__name__ return {"id": ident, "call": ["awkward", n], - "args": [fill(self._mask, n + ".mask", **kwargs), - fill(self._content, n + ".content", **kwargs), + "args": [fill(self._mask, n + ".mask", prefix, suffix, schemasuffix, storage, compression, **kwargs), + fill(self._content, n + ".content", prefix, suffix, schemasuffix, storage, compression, **kwargs), self._maskedwhen]} @property @@ -300,13 +300,13 @@ def _mine(self, overrides): mine["lsborder"] = overrides.pop("lsborder", self._lsborder) return mine - def __awkward_persist__(self, ident, fill, **kwargs): + def __awkward_persist__(self, ident, fill, prefix, suffix, schemasuffix, storage, compression, **kwargs): self._valid() n = self.__class__.__name__ return {"id": ident, "call": ["awkward", n], - "args": [fill(self._mask, n + ".mask", **kwargs), - fill(self._content, n + ".content", **kwargs), + "args": [fill(self._mask, n + ".mask", prefix, suffix, schemasuffix, storage, compression, **kwargs), + fill(self._content, n + ".content", prefix, suffix, schemasuffix, storage, compression, **kwargs), self._maskedwhen, self._lsborder]} @@ -525,13 +525,13 @@ def copy(self, mask=None, content=None, maskedwhen=None): out._maskedwhen = maskedwhen return out - def __awkward_persist__(self, ident, fill, **kwargs): + def __awkward_persist__(self, ident, fill, prefix, suffix, schemasuffix, storage, compression, **kwargs): self._valid() n = self.__class__.__name__ return {"id": ident, "call": ["awkward", n], - "args": [fill(self._mask, n + ".mask", **kwargs), - fill(self._content, n + ".content", **kwargs), + "args": [fill(self._mask, n + ".mask", prefix, suffix, schemasuffix, storage, compression, **kwargs), + fill(self._content, n + ".content", prefix, suffix, schemasuffix, storage, compression, **kwargs), self._maskedwhen]} @property diff --git a/awkward/array/objects.py b/awkward/array/objects.py index 088c9bf5..6e2d4162 100644 --- a/awkward/array/objects.py +++ b/awkward/array/objects.py @@ -119,7 +119,7 @@ def ones_like(self, **overrides): else: return self.copy(content=self._content.ones_like(**overrides), **mine) - def __awkward_persist__(self, ident, fill, **kwargs): + def __awkward_persist__(self, ident, fill, prefix, suffix, schemasuffix, storage, compression, **kwargs): self._valid() if self._generator.__module__ == "__main__": @@ -138,7 +138,7 @@ def __awkward_persist__(self, ident, fill, **kwargs): n = self.__class__.__name__ out = {"id": ident, "call": ["awkward", n], - "args": [fill(self._content, n + ".content", **kwargs), {"function": spec}]} + "args": [fill(self._content, n + ".content", prefix, suffix, schemasuffix, storage, compression, **kwargs), {"function": spec}]} if len(self._args) > 0: out["*"] = {"call": ["awkward.array.objects", "topython"], "args": [frompython(self._args)]} if len(self._kwargs) > 0: diff --git a/awkward/array/table.py b/awkward/array/table.py index 32218e4c..5e4e690d 100644 --- a/awkward/array/table.py +++ b/awkward/array/table.py @@ -234,11 +234,11 @@ def ones_like(self, **overrides): out[n] = x.ones_like(**overrides) return out - def __awkward_persist__(self, ident, fill, **kwargs): + def __awkward_persist__(self, ident, fill, prefix, suffix, schemasuffix, storage, compression, **kwargs): self._valid() return {"id": ident, "call": ["awkward", self.__class__.__name__, "frompairs"], - "args": [{"pairs": [[n, fill(x, self.__class__.__name__ + ".content", **kwargs)] for n, x in self._content.items()]}]} + "args": [{"pairs": [[n, fill(x, self.__class__.__name__ + ".content", prefix, suffix, schemasuffix, storage, compression, **kwargs)] for n, x in self._content.items()]}]} @property def base(self): diff --git a/awkward/array/union.py b/awkward/array/union.py index 34eb0e5b..78fa9cd6 100644 --- a/awkward/array/union.py +++ b/awkward/array/union.py @@ -95,21 +95,21 @@ def issequential(self): return False return True - def __awkward_persist__(self, ident, fill, **kwargs): + def __awkward_persist__(self, ident, fill, prefix, suffix, schemasuffix, storage, compression, **kwargs): self._valid() n = self.__class__.__name__ if self.issequential: return {"id": ident, "call": ["awkward", n, "fromtags"], - "args": [fill(self._tags, n + ".tags", **kwargs), - {"list": [fill(x, n + ".contents", **kwargs) for x in self._contents]}]} + "args": [fill(self._tags, n + ".tags", prefix, suffix, schemasuffix, storage, compression, **kwargs), + {"list": [fill(x, n + ".contents", prefix, suffix, schemasuffix, storage, compression, **kwargs) for x in self._contents]}]} else: return {"id": ident, "call": ["awkward", n], - "args": [fill(self._tags, n + ".tags", **kwargs), - fill(self._index, n + ".index", **kwargs), - {"list": [fill(x, n + ".contents", **kwargs) for x in self._contents]}]} + "args": [fill(self._tags, n + ".tags", prefix, suffix, schemasuffix, storage, compression, **kwargs), + fill(self._index, n + ".index", prefix, suffix, schemasuffix, storage, compression, **kwargs), + {"list": [fill(x, n + ".contents", prefix, suffix, schemasuffix, storage, compression, **kwargs) for x in self._contents]}]} @property def tags(self): diff --git a/awkward/array/virtual.py b/awkward/array/virtual.py index e7de4a98..2764e7c4 100644 --- a/awkward/array/virtual.py +++ b/awkward/array/virtual.py @@ -114,7 +114,7 @@ def ones_like(self, **overrides): else: return self.array.ones_like(**overrides) - def __awkward_persist__(self, ident, fill, **kwargs): + def __awkward_persist__(self, ident, fill, prefix, suffix, schemasuffix, storage, compression, **kwargs): self._valid() n = self.__class__.__name__ @@ -146,7 +146,7 @@ def __awkward_persist__(self, ident, fill, **kwargs): return out else: - return fill(self.array, n + ".array", **kwargs) + return fill(self.array, n + ".array", prefix, suffix, schemasuffix, storage, compression, **kwargs) @property def generator(self): diff --git a/awkward/derived/strings.py b/awkward/derived/strings.py index f24ce6c7..9b7035bc 100644 --- a/awkward/derived/strings.py +++ b/awkward/derived/strings.py @@ -249,21 +249,21 @@ def ones_like(self, **overrides): jagged = self._content.ones_like(**overrides) return self.copy(jagged.starts, jagged.stops, jagged.content, **mine) - def __awkward_persist__(self, ident, fill, **kwargs): + def __awkward_persist__(self, ident, fill, prefix, suffix, schemasuffix, storage, compression, **kwargs): self._valid() n = self.__class__.__name__ if awkward.array.jagged.offsetsaliased(self.starts, self.stops) and len(self.starts) > 0 and self.starts[0] == 0: return {"id": ident, "call": ["awkward", n, "fromcounts"], - "args": [fill(self.counts, n + ".counts", **kwargs), - fill(self.content, n + ".content", **kwargs), + "args": [fill(self.counts, n + ".counts", prefix, suffix, schemasuffix, storage, compression, **kwargs), + fill(self.content, n + ".content", prefix, suffix, schemasuffix, storage, compression, **kwargs), self._encoding]} else: return {"id": ident, "call": ["awkward", n], - "args": [fill(self.starts, n + ".starts", **kwargs), - fill(self.stops, n + ".stops", **kwargs), - fill(self.content, n + ".content", **kwargs), + "args": [fill(self.starts, n + ".starts", prefix, suffix, schemasuffix, storage, compression, **kwargs), + fill(self.stops, n + ".stops", prefix, suffix, schemasuffix, storage, compression, **kwargs), + fill(self.content, n + ".content", prefix, suffix, schemasuffix, storage, compression, **kwargs), self._encoding]} @property diff --git a/awkward/persist.py b/awkward/persist.py index c167997d..82dd2785 100644 --- a/awkward/persist.py +++ b/awkward/persist.py @@ -250,7 +250,7 @@ def serialize(obj, storage, name=None, delimiter="-", suffix=None, schemasuffix= normalized.append({"minsize": minsize, "types": types, "contexts": contexts, "pair": pair}) seen = {} - def fill(obj, context, **kwargs): + def fill(obj, context, prefix, suffix, schemasuffix, storage, compression, **kwargs): if id(obj) in seen: return {"ref": seen[id(obj)]} @@ -284,13 +284,13 @@ def fill(obj, context, **kwargs): len(obj)]} elif hasattr(obj, "__awkward_persist__"): - return obj.__awkward_persist__(ident, fill, **kwargs) + return obj.__awkward_persist__(ident, fill, prefix, suffix, schemasuffix, storage, compression, **kwargs) else: raise TypeError("cannot serialize {0} instance (has no __awkward_persist__ method)".format(type(obj))) schema = {"awkward": awkward.version.__version__, - "schema": fill(obj, "", **kwargs)} + "schema": fill(obj, "", prefix, suffix, schemasuffix, storage, compression, **kwargs)} if prefix != "": schema["prefix"] = prefix From cce6f1d2902b674f148b3bebb2e7febc7887bd9f Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Thu, 25 Oct 2018 12:09:46 -0500 Subject: [PATCH 07/27] revamp save/load and hdf5 APIs --- awkward/__init__.py | 4 +- awkward/array/chunked.py | 2 +- awkward/persist.py | 157 +++++++++++++++++++++++++++++---------- 3 files changed, 120 insertions(+), 43 deletions(-) diff --git a/awkward/__init__.py b/awkward/__init__.py index 5494c502..6606ebe8 100644 --- a/awkward/__init__.py +++ b/awkward/__init__.py @@ -41,9 +41,9 @@ from awkward.generate import fromiter -from awkward.persist import serialize, deserialize, save, load, tohdf5, fromhdf5 +from awkward.persist import serialize, deserialize, save, load, hdf5 # convenient access to the version number from awkward.version import __version__ -__all__ = ["ChunkedArray", "AppendableArray", "IndexedArray", "ByteIndexedArray", "SparseArray", "JaggedArray", "ByteJaggedArray", "MaskedArray", "BitMaskedArray", "IndexedMaskedArray", "Methods", "ObjectArray", "Table", "UnionArray", "VirtualArray", "StringArray", "fromiter", "serialize", "deserialize", "save", "load", "tohdf5", "fromhdf5", "__version__"] +__all__ = ["ChunkedArray", "AppendableArray", "IndexedArray", "ByteIndexedArray", "SparseArray", "JaggedArray", "ByteJaggedArray", "MaskedArray", "BitMaskedArray", "IndexedMaskedArray", "Methods", "ObjectArray", "Table", "UnionArray", "VirtualArray", "StringArray", "fromiter", "serialize", "deserialize", "save", "load", "hdf5", "__version__"] diff --git a/awkward/array/chunked.py b/awkward/array/chunked.py index 91627517..354f062f 100644 --- a/awkward/array/chunked.py +++ b/awkward/array/chunked.py @@ -84,7 +84,7 @@ def __awkward_persist__(self, ident, fill, prefix, suffix, schemasuffix, storage return {"id": ident, "call": ["awkward", n], "args": [{"list": [fill(x, n + ".chunk", prefix, suffix, schemasuffix, storage, compression, **kwargs) for c, x in zip(self._counts, self._chunks) if c > 0]}, - fill(awkward.util.numpy.array([c for c in self._counts if c > 0]), n + ".counts", prefix, suffix, schemasuffix, storage, compression, **kwargs)]} + [c for c in self._counts if c > 0]]} @property def chunks(self): diff --git a/awkward/persist.py b/awkward/persist.py index 82dd2785..bed1018d 100644 --- a/awkward/persist.py +++ b/awkward/persist.py @@ -36,9 +36,9 @@ import zipfile import zlib try: - from collections.abc import Mapping + from collections.abc import Mapping, MutableMapping except ImportError: - from collections import Mapping + from collections import Mapping, MutableMapping import numpy @@ -371,7 +371,72 @@ def unfill(schema): return unfill(schema["schema"]) -def save(file, mode="a", options=None, **arrays): +def keys(storage, name="", subschemas=True): + schema = storage[name] + if isinstance(schema, numpy.ndarray): + schema = schema.tostring() + if isinstance(schema, bytes): + schema = schema.decode("ascii") + schema = json.loads(schema) + + prefix = schema.get("prefix", "") + + def recurse(schema): + if isinstance(schema, dict): + if "call" in schema and isinstance(schema["call"], list) and len(schema["call"]) > 0: + for x in schema.get("args", []): + for y in recurse(x): + yield y + for x in schema.get("kwargs", {}).values(): + for y in recurse(x): + yield y + for x in schema.get("*", []): + for y in recurse(x): + yield y + for x in schema.get("**", {}).values(): + for y in recurse(x): + yield y + + elif "read" in schema: + if schema.get("absolute", False): + yield schema["read"] + else: + yield prefix + schema["read"] + + elif "list" in schema: + for x in schema["list"]: + for y in recurse(x): + yield y + + elif "tuple" in schema: + for x in schema["tuple"]: + for y in recurse(x): + yield y + + elif "pairs" in schema: + for n, x in schema["pairs"]: + for y in recurse(x): + yield y + + elif "function" in schema: + pass + + elif "ref" in schema: + pass + + yield name + for x in recurse(schema["schema"]): + yield x + +def save(file, array, name=None, mode="a", **options): + if isinstance(array, dict): + arrays = array + else: + arrays = {"": array} + + if name is not None: + arrays = {prefix + n: x for n, x in arrays.items()} + arraynames = list(arrays) for i in range(len(arraynames)): for j in range(i + 1, len(arraynames)): @@ -391,8 +456,7 @@ def save(file, mode="a", options=None, **arrays): file = file + ".akd" alloptions = {"delimiter": "-", "suffix": ".raw", "schemasuffix": ".json", "compression": compression} - if options is not None: - alloptions.update(options) + alloptions.update(options) options = alloptions class Wrap(object): @@ -411,26 +475,32 @@ def __setitem__(self, where, what): for name, array in arrays.items(): serialize(array, wrapped, name=name, **options) -class load(Mapping): - def __init__(self, file, options=None): +def load(file, **options): + f = Load(file, **options) + if list(f) == [""]: + out = f[""] + f.close() + return out + else: + return f + +class Load(Mapping): + def __init__(self, file, **options): class Wrap(object): def __init__(self): self.f = zipfile.ZipFile(file, mode="r") def __getitem__(self, where): return self.f.read(where) - def close(self): - self.f.close() self._file = Wrap() alloptions = {"schemasuffix": ".json", "whitelist": whitelist, "cache": None} - if options is not None: - alloptions.update(options) + alloptions.update(options) self.schemasuffix = alloptions.pop("schemasuffix") self.options = alloptions def __getitem__(self, where): - return deserialize(self._file, name=where + self.schemasuffix, **self.options) + return deserialize(self._file, name=where + self.schemasuffix, whitelist=self.options["whitelist"], cache=self.options["cache"]) def __iter__(self): for n in self._file.f.namelist(): @@ -444,57 +514,64 @@ def __len__(self): count += 1 return count + def __repr__(self): + return "".format(len(self)) + + def close(self): + self._file.f.close() + def __del__(self): - self._file.close() + self.close() -def tohdf5(group, options=None, **arrays): - alloptions = {"compression": compression} - if options is not None: - alloptions.update(options) - options = alloptions - options["delimiter"] = "/" - options["schemasuffix"] = "/schema.json" + def __enter__(self, *args, **kwds): + return self - class Wrap(object): - def __init__(self): - self.g = group - def __setitem__(self, where, what): - self.g[where] = numpy.frombuffer(what, dtype=numpy.uint8) + def __exit__(self, *args, **kwds): + self.close() - f = Wrap() - for name, array in arrays.items(): - group.create_group(name) - serialize(array, f, name=name, **options) +class hdf5(MutableMapping): + def __init__(self, group, **options): + alloptions = {"compression": compression, "whitelist": whitelist, "cache": None} + alloptions.update(options) + self.options = alloptions + self.options["delimiter"] = "/" + self.options["schemasuffix"] = "/schema.json" -class fromhdf5(Mapping): - def __init__(self, group, options=None): class Wrap(object): def __init__(self): self.g = group def __getitem__(self, where): return self.g[where].value + def __setitem__(self, where, what): + self.g[where] = numpy.frombuffer(what, dtype=numpy.uint8) self._group = Wrap() - alloptions = {"whitelist": whitelist, "cache": None} - if options is not None: - alloptions.update(options) - self.options = alloptions - def __getitem__(self, where): - return deserialize(self._group, name=where + "/schema.json", **self.options) + return deserialize(self._group, name=where + self.options["schemasuffix"], whitelist=self.options["whitelist"], cache=self.options["cache"]) + + def __setitem__(self, where, what): + self._group.g.create_group(where) + serialize(what, self._group, name=where, **self.options) + + def __delitem__(self, where): + for subname in keys(self._group, name=where + self.options["schemasuffix"]): + del self._group.g[subname] + del self._group.g[where] def __iter__(self): + schemaname = self.options["schemasuffix"].split("/")[-1] for subname in self._group.g: - if "schema.json" in self._group.g[subname]: + if schemaname in self._group.g[subname]: yield subname def __len__(self): + schemaname = self.options["schemasuffix"].split("/")[-1] count = 0 for subname in self._group.g: - if "schema.json" in self._group.g[subname]: + if schemaname in self._group.g[subname]: count += 0 return count def __repr__(self): - return "".format(repr(self._group.g.name)) + return "".format(repr(self._group.g.name), len(self)) From 84f73fce01b3ce01ad6c9dc98be333d86ec40160 Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Thu, 25 Oct 2018 12:12:40 -0500 Subject: [PATCH 08/27] disallow arbitrary kwargs in serialize --- awkward/persist.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/awkward/persist.py b/awkward/persist.py index bed1018d..470cbd41 100644 --- a/awkward/persist.py +++ b/awkward/persist.py @@ -208,6 +208,10 @@ def recurse(obj): def serialize(obj, storage, name=None, delimiter="-", suffix=None, schemasuffix=None, compression=compression, **kwargs): import awkward.array.base + for n in kwargs: + if n not in (): + raise TypeError("unrecognized serialization option: {0}".format(repr(n))) + if name is None or name == "": name = "" prefix = "" From 440b8ff7a88d25e327948e96eea83376292a87d9 Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Thu, 25 Oct 2018 12:16:29 -0500 Subject: [PATCH 09/27] little bug --- awkward/persist.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/awkward/persist.py b/awkward/persist.py index 470cbd41..a56c4636 100644 --- a/awkward/persist.py +++ b/awkward/persist.py @@ -439,7 +439,7 @@ def save(file, array, name=None, mode="a", **options): arrays = {"": array} if name is not None: - arrays = {prefix + n: x for n, x in arrays.items()} + arrays = {name + n: x for n, x in arrays.items()} arraynames = list(arrays) for i in range(len(arraynames)): From 0b42ade37e94c53df5ee6279f3105f3083138ac1 Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Thu, 25 Oct 2018 12:24:32 -0500 Subject: [PATCH 10/27] avoid mixing serialization and deserialization options in HDF5 --- awkward/persist.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/awkward/persist.py b/awkward/persist.py index a56c4636..9820f5a6 100644 --- a/awkward/persist.py +++ b/awkward/persist.py @@ -555,8 +555,13 @@ def __getitem__(self, where): return deserialize(self._group, name=where + self.options["schemasuffix"], whitelist=self.options["whitelist"], cache=self.options["cache"]) def __setitem__(self, where, what): + options = dict(self.options) + if "whitelist" in options: + del options["whitelist"] + if "cache" in options: + del options["cache"] self._group.g.create_group(where) - serialize(what, self._group, name=where, **self.options) + serialize(what, self._group, name=where, **options) def __delitem__(self, where): for subname in keys(self._group, name=where + self.options["schemasuffix"]): From 3b70de8e1da595030e17881233b0fcf008ad6b43 Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Thu, 25 Oct 2018 12:27:59 -0500 Subject: [PATCH 11/27] add one, not zero! --- awkward/persist.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/awkward/persist.py b/awkward/persist.py index 9820f5a6..2ea38315 100644 --- a/awkward/persist.py +++ b/awkward/persist.py @@ -579,7 +579,7 @@ def __len__(self): count = 0 for subname in self._group.g: if schemaname in self._group.g[subname]: - count += 0 + count += 1 return count def __repr__(self): From 5ed208b801393e66baa368ce7d748a62c599c228 Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Thu, 25 Oct 2018 14:23:53 -0500 Subject: [PATCH 12/27] implemented most of Arrow --- awkward/arrow.py | 76 ++++++++++++++++++++++++++++++++++++++++ awkward/view/__init__.py | 29 --------------- awkward/view/arrow.py | 31 ---------------- tests/test_arrow.py | 12 +++++++ 4 files changed, 88 insertions(+), 60 deletions(-) create mode 100644 awkward/arrow.py delete mode 100644 awkward/view/__init__.py delete mode 100644 awkward/view/arrow.py diff --git a/awkward/arrow.py b/awkward/arrow.py new file mode 100644 index 00000000..a3d084c4 --- /dev/null +++ b/awkward/arrow.py @@ -0,0 +1,76 @@ +#!/usr/bin/env python + +# Copyright (c) 2018, DIANA-HEP +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import awkward.array.chunked +import awkward.array.jagged +import awkward.array.masked +import awkward.array.table +import awkward.util + +ARROWMASKTYPE = awkward.util.numpy.uint8 +ARROWINDEXTYPE = awkward.util.numpy.int32 + +def view(obj): + import pyarrow + + def recurse(tpe, buffers): + if isinstance(tpe, pyarrow.lib.StructType): + pairs = [] + for i in range(tpe.num_children - 1, -1, -1): + pairs.insert(0, (tpe[i].name, recurse(tpe[i].type, buffers))) + mask = awkward.util.numpy.frombuffer(buffers.pop(), dtype=ARROWMASKTYPE) + return awkward.array.masked.BitMaskedArray(mask, awkward.array.table.Table.frompairs(pairs), maskedwhen=False, lsborder=True) + + elif isinstance(tpe, pyarrow.lib.ListType): + content = recurse(tpe.value_type, buffers) + offsets = awkward.util.numpy.frombuffer(buffers.pop(), dtype=ARROWINDEXTYPE) + mask = awkward.util.numpy.frombuffer(buffers.pop(), dtype=ARROWMASKTYPE) + return awkward.array.masked.BitMaskedArray(mask, awkward.array.jagged.JaggedArray.fromoffsets(offsets, content), maskedwhen=False, lsborder=True) + + elif isinstance(tpe, pyarrow.lib.DataType): + content = awkward.util.numpy.frombuffer(buffers.pop(), dtype=tpe.to_pandas_dtype()) + mask = awkward.util.numpy.frombuffer(buffers.pop(), dtype=ARROWMASKTYPE) + return awkward.array.masked.BitMaskedArray(mask, content, maskedwhen=False, lsborder=True) + + else: + raise NotImplementedError(repr(tpe)) + + if isinstance(obj, pyarrow.lib.Array): + buffers = obj.buffers() + out = recurse(obj.type, buffers) + assert len(buffers) == 0 + return out + + elif isinstance(obj, pyarrow.lib.ChunkedArray): + chunks = [x for x in obj.chunks if len(x) > 0] + return awkward.array.chunked.ChunkedArray([view(x) for x in chunks], counts=[len(x) for x in chunks]) + + else: + raise NotImplementedError(type(obj)) diff --git a/awkward/view/__init__.py b/awkward/view/__init__.py deleted file mode 100644 index e831ea82..00000000 --- a/awkward/view/__init__.py +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env python - -# Copyright (c) 2018, DIANA-HEP -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/awkward/view/arrow.py b/awkward/view/arrow.py deleted file mode 100644 index 6f8faa07..00000000 --- a/awkward/view/arrow.py +++ /dev/null @@ -1,31 +0,0 @@ -#!/usr/bin/env python - -# Copyright (c) 2018, DIANA-HEP -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - diff --git a/tests/test_arrow.py b/tests/test_arrow.py index b599dbab..4b5f7ea0 100644 --- a/tests/test_arrow.py +++ b/tests/test_arrow.py @@ -50,6 +50,18 @@ def test_array_null(self): if pyarrow is not None: arr = pyarrow.array([1.1, 2.2, 3.3, None, 4.4, 5.5]) + def test_nested_array(self): + if pyarrow is not None: + arr = pyarrow.array([[1.1, 2.2, 3.3], [], [4.4, 5.5]]) + + def test_nested_array_null(self): + if pyarrow is not None: + arr = pyarrow.array([[1.1, 2.2, None], [], [4.4, 5.5]]) + + def test_null_nested_array_null(self): + if pyarrow is not None: + arr = pyarrow.array([[1.1, 2.2, None], [], None, [4.4, 5.5]]) + def test_chunked_array(self): if pyarrow is not None: arr = pyarrow.chunked_array([pyarrow.array([1.1, 2.2, 3.3, 4.4, 5.5]), pyarrow.array([]), pyarrow.array([6.6, 7.7, 8.8])]) From 30b062d50e8388715a9eef8fde1eb38f64a6c777 Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Thu, 25 Oct 2018 15:14:44 -0500 Subject: [PATCH 13/27] implemented through unions --- awkward/arrow.py | 54 +++++++++++++++++++++++++++++++++----- awkward/derived/strings.py | 6 +++++ tests/test_arrow.py | 22 ++++++++++++---- 3 files changed, 70 insertions(+), 12 deletions(-) diff --git a/awkward/arrow.py b/awkward/arrow.py index a3d084c4..bab00c56 100644 --- a/awkward/arrow.py +++ b/awkward/arrow.py @@ -32,10 +32,13 @@ import awkward.array.jagged import awkward.array.masked import awkward.array.table +import awkward.derived.strings import awkward.util -ARROWMASKTYPE = awkward.util.numpy.uint8 -ARROWINDEXTYPE = awkward.util.numpy.int32 +ARROW_BITMASKTYPE = awkward.util.numpy.uint8 +ARROW_INDEXTYPE = awkward.util.numpy.int32 +ARROW_TAGTYPE = awkward.util.numpy.uint8 +ARROW_CHARTYPE = awkward.util.numpy.uint8 def view(obj): import pyarrow @@ -45,18 +48,55 @@ def recurse(tpe, buffers): pairs = [] for i in range(tpe.num_children - 1, -1, -1): pairs.insert(0, (tpe[i].name, recurse(tpe[i].type, buffers))) - mask = awkward.util.numpy.frombuffer(buffers.pop(), dtype=ARROWMASKTYPE) + mask = awkward.util.numpy.frombuffer(buffers.pop(), dtype=ARROW_BITMASKTYPE) return awkward.array.masked.BitMaskedArray(mask, awkward.array.table.Table.frompairs(pairs), maskedwhen=False, lsborder=True) elif isinstance(tpe, pyarrow.lib.ListType): content = recurse(tpe.value_type, buffers) - offsets = awkward.util.numpy.frombuffer(buffers.pop(), dtype=ARROWINDEXTYPE) - mask = awkward.util.numpy.frombuffer(buffers.pop(), dtype=ARROWMASKTYPE) + offsets = awkward.util.numpy.frombuffer(buffers.pop(), dtype=ARROW_INDEXTYPE) + mask = awkward.util.numpy.frombuffer(buffers.pop(), dtype=ARROW_BITMASKTYPE) return awkward.array.masked.BitMaskedArray(mask, awkward.array.jagged.JaggedArray.fromoffsets(offsets, content), maskedwhen=False, lsborder=True) + elif isinstance(tpe, pyarrow.lib.UnionType) and tpe.mode == "sparse": + contents = [] + for i in range(tpe.num_children - 1, -1, -1): + contents.insert(0, recurse(tpe[i].type, buffers)) + assert buffers.pop() is None + tags = awkward.util.numpy.frombuffer(buffers.pop(), dtype=ARROW_TAGTYPE) + index = awkward.util.numpy.arange(len(tags), dtype=ARROW_INDEXTYPE) + mask = awkward.util.numpy.frombuffer(buffers.pop(), dtype=ARROW_BITMASKTYPE) + return awkward.array.masked.BitMaskedArray(mask, awkward.array.union.UnionArray(tags, index, contents), maskedwhen=False, lsborder=True) + + elif isinstance(tpe, pyarrow.lib.UnionType) and tpe.mode == "dense": + contents = [] + for i in range(tpe.num_children - 1, -1, -1): + contents.insert(0, recurse(tpe[i].type, buffers)) + index = awkward.util.numpy.frombuffer(buffers.pop(), dtype=ARROW_INDEXTYPE) + tags = awkward.util.numpy.frombuffer(buffers.pop(), dtype=ARROW_TAGTYPE) + mask = awkward.util.numpy.frombuffer(buffers.pop(), dtype=ARROW_BITMASKTYPE) + return awkward.array.masked.BitMaskedArray(mask, awkward.array.union.UnionArray(tags, index, contents), maskedwhen=False, lsborder=True) + + elif tpe == pyarrow.string(): + content = awkward.util.numpy.frombuffer(buffers.pop(), dtype=ARROW_CHARTYPE) + offsets = awkward.util.numpy.frombuffer(buffers.pop(), dtype=ARROW_INDEXTYPE) + mask = awkward.util.numpy.frombuffer(buffers.pop(), dtype=ARROW_BITMASKTYPE) + return awkward.array.masked.BitMaskedArray(mask, awkward.derived.strings.StringArray.fromoffsets(offsets, content, encoding="utf-8"), maskedwhen=False, lsborder=True) + + elif tpe == pyarrow.binary(): + content = awkward.util.numpy.frombuffer(buffers.pop(), dtype=ARROW_CHARTYPE) + offsets = awkward.util.numpy.frombuffer(buffers.pop(), dtype=ARROW_INDEXTYPE) + mask = awkward.util.numpy.frombuffer(buffers.pop(), dtype=ARROW_BITMASKTYPE) + return awkward.array.masked.BitMaskedArray(mask, awkward.derived.strings.StringArray.fromoffsets(offsets, content, encoding=None), maskedwhen=False, lsborder=True) + + elif tpe == pyarrow.bool_(): + content = awkward.util.numpy.unpackbits(awkward.util.numpy.frombuffer(buffers.pop(), dtype=ARROW_CHARTYPE)).view(awkward.util.BOOLTYPE) + content = content.reshape(-1, 8)[:,::-1].reshape(-1) # lsborder=True + mask = awkward.util.numpy.frombuffer(buffers.pop(), dtype=ARROW_BITMASKTYPE) + return awkward.array.masked.BitMaskedArray(mask, content, maskedwhen=False, lsborder=True) + elif isinstance(tpe, pyarrow.lib.DataType): content = awkward.util.numpy.frombuffer(buffers.pop(), dtype=tpe.to_pandas_dtype()) - mask = awkward.util.numpy.frombuffer(buffers.pop(), dtype=ARROWMASKTYPE) + mask = awkward.util.numpy.frombuffer(buffers.pop(), dtype=ARROW_BITMASKTYPE) return awkward.array.masked.BitMaskedArray(mask, content, maskedwhen=False, lsborder=True) else: @@ -64,7 +104,7 @@ def recurse(tpe, buffers): if isinstance(obj, pyarrow.lib.Array): buffers = obj.buffers() - out = recurse(obj.type, buffers) + out = recurse(obj.type, buffers)[:len(obj)] assert len(buffers) == 0 return out diff --git a/awkward/derived/strings.py b/awkward/derived/strings.py index 9b7035bc..8bb53d23 100644 --- a/awkward/derived/strings.py +++ b/awkward/derived/strings.py @@ -327,3 +327,9 @@ def parents(self): def index(self): return self._content.index + def __getitem__(self, where): + if isinstance(where, awkward.util.integer): + return super(StringArray, self).__getitem__(where) + else: + out = self._content[where] + return self.__class__(out.starts, out.stops, out.content, self.encoding) diff --git a/tests/test_arrow.py b/tests/test_arrow.py index 4b5f7ea0..f1fe7d34 100644 --- a/tests/test_arrow.py +++ b/tests/test_arrow.py @@ -46,6 +46,10 @@ def test_array(self): if pyarrow is not None: arr = pyarrow.array([1.1, 2.2, 3.3, 4.4, 5.5]) + def test_boolean(self): + if pyarrow is not None: + arr = pyarrow.array([True, True, False, False, True]) + def test_array_null(self): if pyarrow is not None: arr = pyarrow.array([1.1, 2.2, 3.3, None, 4.4, 5.5]) @@ -120,11 +124,19 @@ def test_null_nested_struct_nested_null(self): def test_strings(self): if pyarrow is not None: - arr = pyarrow.array(["one", "two", "three", "four", "five"]) + arr = pyarrow.array(["one", "two", "three", u"fo\u2014ur", "five"]) def test_strings_null(self): if pyarrow is not None: - arr = pyarrow.array(["one", "two", None, "four", "five"]) + arr = pyarrow.array(["one", "two", None, u"fo\u2014ur", "five"]) + + def test_binary(self): + if pyarrow is not None: + arr = pyarrow.array([b"one", b"two", b"three", b"four", b"five"]) + + def test_binary_null(self): + if pyarrow is not None: + arr = pyarrow.array([b"one", b"two", None, b"four", b"five"]) def test_chunked_strings(self): if pyarrow is not None: @@ -156,15 +168,15 @@ def test_union_sparse_null_null(self): def test_union_dense(self): if pyarrow is not None: - pyarrow.UnionArray.from_dense(pyarrow.array([0, 1, 0, 0, 0, 1, 1], type=pyarrow.int8()), pyarrow.array([0, 0, 1, 2, 3, 1, 2], type=pyarrow.int32()), [pyarrow.array([0.0, 1.1, 2.2, 3.3]), pyarrow.array([True, True, False])]) + arr = pyarrow.UnionArray.from_dense(pyarrow.array([0, 1, 0, 0, 0, 1, 1], type=pyarrow.int8()), pyarrow.array([0, 0, 1, 2, 3, 1, 2], type=pyarrow.int32()), [pyarrow.array([0.0, 1.1, 2.2, 3.3]), pyarrow.array([True, True, False])]) def test_union_dense_null(self): if pyarrow is not None: - pyarrow.UnionArray.from_dense(pyarrow.array([0, 1, 0, 0, 0, 1, 1], type=pyarrow.int8()), pyarrow.array([0, 0, 1, 2, 3, 1, 2], type=pyarrow.int32()), [pyarrow.array([0.0, 1.1, None, 3.3]), pyarrow.array([True, True, False])]) + arr = pyarrow.UnionArray.from_dense(pyarrow.array([0, 1, 0, 0, 0, 1, 1], type=pyarrow.int8()), pyarrow.array([0, 0, 1, 2, 3, 1, 2], type=pyarrow.int32()), [pyarrow.array([0.0, 1.1, None, 3.3]), pyarrow.array([True, True, False])]) def test_union_dense_null_null(self): if pyarrow is not None: - pyarrow.UnionArray.from_dense(pyarrow.array([0, 1, 0, 0, 0, 1, 1], type=pyarrow.int8()), pyarrow.array([0, 0, 1, 2, 3, 1, 2], type=pyarrow.int32()), [pyarrow.array([0.0, 1.1, None, 3.3]), pyarrow.array([True, None, False])]) + arr = pyarrow.UnionArray.from_dense(pyarrow.array([0, 1, 0, 0, 0, 1, 1], type=pyarrow.int8()), pyarrow.array([0, 0, 1, 2, 3, 1, 2], type=pyarrow.int32()), [pyarrow.array([0.0, 1.1, None, 3.3]), pyarrow.array([True, None, False])]) def test_dictarray(self): if pyarrow is not None: From 7cdd2fa7cb8b575ef299b21ed0787fec1e54bee7 Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Thu, 25 Oct 2018 15:47:32 -0500 Subject: [PATCH 14/27] all Arrow conversions done --- awkward/arrow.py | 25 ++++++++++++++++++++++++- awkward/derived/strings.py | 16 +++++++++++++++- tests/test_arrow.py | 6 +++++- 3 files changed, 44 insertions(+), 3 deletions(-) diff --git a/awkward/arrow.py b/awkward/arrow.py index bab00c56..1b4956b1 100644 --- a/awkward/arrow.py +++ b/awkward/arrow.py @@ -29,6 +29,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import awkward.array.chunked +import awkward.array.indexed import awkward.array.jagged import awkward.array.masked import awkward.array.table @@ -44,7 +45,13 @@ def view(obj): import pyarrow def recurse(tpe, buffers): - if isinstance(tpe, pyarrow.lib.StructType): + if isinstance(tpe, pyarrow.lib.DictionaryType): + content = view(tpe.dictionary) + index = recurse(tpe.index_type, buffers) + assert isinstance(index, awkward.array.masked.BitMaskedArray) + return awkward.array.masked.BitMaskedArray(index.mask, awkward.array.indexed.IndexedArray(index.content, content), maskedwhen=index.maskedwhen, lsborder=index.lsborder) + + elif isinstance(tpe, pyarrow.lib.StructType): pairs = [] for i in range(tpe.num_children - 1, -1, -1): pairs.insert(0, (tpe[i].name, recurse(tpe[i].type, buffers))) @@ -112,5 +119,21 @@ def recurse(tpe, buffers): chunks = [x for x in obj.chunks if len(x) > 0] return awkward.array.chunked.ChunkedArray([view(x) for x in chunks], counts=[len(x) for x in chunks]) + elif isinstance(obj, pyarrow.lib.RecordBatch): + out = awkward.array.table.Table() + for n, x in zip(obj.schema.names, obj.columns): + out[n] = view(x) + return out + + elif isinstance(obj, pyarrow.lib.Table): + chunks = [] + counts = [] + for batch in obj.to_batches(): + chunk = view(batch) + if len(chunk) > 0: + chunks.append(chunk) + counts.append(len(chunk)) + return awkward.array.chunked.ChunkedArray(chunks, counts=counts) + else: raise NotImplementedError(type(obj)) diff --git a/awkward/derived/strings.py b/awkward/derived/strings.py index 8bb53d23..af5ff07b 100644 --- a/awkward/derived/strings.py +++ b/awkward/derived/strings.py @@ -328,8 +328,22 @@ def index(self): return self._content.index def __getitem__(self, where): - if isinstance(where, awkward.util.integer): + if awkward.util.isstringslice(where): + raise IndexError("cannot index StringArray with string or sequence of strings") + + if isinstance(where, tuple) and len(where) == 0: + return self + if not isinstance(where, tuple): + where = (where,) + head, tail = where[0], where[1:] + + if isinstance(head, awkward.util.integer): return super(StringArray, self).__getitem__(where) + + elif tail == (): + out = self._content[where] + return self.__class__(out.starts, out.stops, out.content, self.encoding) + else: out = self._content[where] return self.__class__(out.starts, out.stops, out.content, self.encoding) diff --git a/tests/test_arrow.py b/tests/test_arrow.py index f1fe7d34..5f454e84 100644 --- a/tests/test_arrow.py +++ b/tests/test_arrow.py @@ -186,6 +186,10 @@ def test_dictarray_null(self): if pyarrow is not None: arr = pyarrow.DictionaryArray.from_arrays(pyarrow.array([0, 0, 2, None, 1, None, 2, 1, 1]), pyarrow.array(["one", "two", "three"])) + def test_null_dictarray(self): + if pyarrow is not None: + arr = pyarrow.DictionaryArray.from_arrays(pyarrow.array([0, 0, 2, 2, 1, 0, 2, 1, 1]), pyarrow.array(["one", None, "three"])) + def test_batch(self): if pyarrow is not None: arr = pyarrow.RecordBatch.from_arrays( @@ -196,7 +200,7 @@ def test_batch(self): pyarrow.array([[{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}], [], [{"x": 4, "y": None}, {"x": 5, "y": 5.5}], [None], [{"x": 6, "y": 6.6}]])], ["a", "b", "c", "d", "e"]) - def test_batch(self): + def test_table(self): if pyarrow is not None: arr = pyarrow.Table.from_batches([ pyarrow.RecordBatch.from_arrays( From 680a7f8aede85e3d044437c8b44054463a6036e9 Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Thu, 25 Oct 2018 16:00:53 -0500 Subject: [PATCH 15/27] Arrow test suite done, too --- tests/test_arrow.py | 118 +++++++++++++++++++++++++++++--------------- 1 file changed, 79 insertions(+), 39 deletions(-) diff --git a/tests/test_arrow.py b/tests/test_arrow.py index 5f454e84..d66a2cd0 100644 --- a/tests/test_arrow.py +++ b/tests/test_arrow.py @@ -36,6 +36,7 @@ except ImportError: pyarrow = None +import awkward.arrow from awkward import * class Test(unittest.TestCase): @@ -44,165 +45,203 @@ def runTest(self): def test_array(self): if pyarrow is not None: - arr = pyarrow.array([1.1, 2.2, 3.3, 4.4, 5.5]) + a = pyarrow.array([1.1, 2.2, 3.3, 4.4, 5.5]) + assert awkward.arrow.view(a).tolist() == [1.1, 2.2, 3.3, 4.4, 5.5] def test_boolean(self): if pyarrow is not None: - arr = pyarrow.array([True, True, False, False, True]) + a = pyarrow.array([True, True, False, False, True]) + assert awkward.arrow.view(a).tolist() == [True, True, False, False, True] def test_array_null(self): if pyarrow is not None: - arr = pyarrow.array([1.1, 2.2, 3.3, None, 4.4, 5.5]) + a = pyarrow.array([1.1, 2.2, 3.3, None, 4.4, 5.5]) + assert awkward.arrow.view(a).tolist() == [1.1, 2.2, 3.3, None, 4.4, 5.5] def test_nested_array(self): if pyarrow is not None: - arr = pyarrow.array([[1.1, 2.2, 3.3], [], [4.4, 5.5]]) + a = pyarrow.array([[1.1, 2.2, 3.3], [], [4.4, 5.5]]) + assert awkward.arrow.view(a).tolist() == [[1.1, 2.2, 3.3], [], [4.4, 5.5]] def test_nested_array_null(self): if pyarrow is not None: - arr = pyarrow.array([[1.1, 2.2, None], [], [4.4, 5.5]]) + a = pyarrow.array([[1.1, 2.2, None], [], [4.4, 5.5]]) + assert awkward.arrow.view(a).tolist() == [[1.1, 2.2, None], [], [4.4, 5.5]] def test_null_nested_array_null(self): if pyarrow is not None: - arr = pyarrow.array([[1.1, 2.2, None], [], None, [4.4, 5.5]]) + a = pyarrow.array([[1.1, 2.2, None], [], None, [4.4, 5.5]]) + assert awkward.arrow.view(a).tolist() == [[1.1, 2.2, None], [], None, [4.4, 5.5]] def test_chunked_array(self): if pyarrow is not None: - arr = pyarrow.chunked_array([pyarrow.array([1.1, 2.2, 3.3, 4.4, 5.5]), pyarrow.array([]), pyarrow.array([6.6, 7.7, 8.8])]) + a = pyarrow.chunked_array([pyarrow.array([1.1, 2.2, 3.3, 4.4, 5.5]), pyarrow.array([]), pyarrow.array([6.6, 7.7, 8.8])]) + assert awkward.arrow.view(a).tolist() == [1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8] def test_struct(self): if pyarrow is not None: - arr = pyarrow.array([{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}]) + a = pyarrow.array([{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}]) + assert awkward.arrow.view(a).tolist() == [{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}] def test_struct_null(self): if pyarrow is not None: - arr = pyarrow.array([{"x": 1, "y": 1.1}, {"x": 2, "y": None}, {"x": 3, "y": 3.3}]) + a = pyarrow.array([{"x": 1, "y": 1.1}, {"x": 2, "y": None}, {"x": 3, "y": 3.3}]) + assert awkward.arrow.view(a).tolist() == [{"x": 1, "y": 1.1}, {"x": 2, "y": None}, {"x": 3, "y": 3.3}] def test_null_struct(self): if pyarrow is not None: - arr = pyarrow.array([{"x": 1, "y": 1.1}, None, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}]) + a = pyarrow.array([{"x": 1, "y": 1.1}, None, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}]) + assert awkward.arrow.view(a).tolist() == [{"x": 1, "y": 1.1}, None, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}] def test_null_struct_null(self): if pyarrow is not None: - arr = pyarrow.array([{"x": 1, "y": 1.1}, None, {"x": 2, "y": None}, {"x": 3, "y": 3.3}]) + a = pyarrow.array([{"x": 1, "y": 1.1}, None, {"x": 2, "y": None}, {"x": 3, "y": 3.3}]) + assert awkward.arrow.view(a).tolist() == [{"x": 1, "y": 1.1}, None, {"x": 2, "y": None}, {"x": 3, "y": 3.3}] def test_chunked_struct(self): if pyarrow is not None: - arr = pyarrow.chunked_array([pyarrow.array([{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}]), pyarrow.array([]), pyarrow.array([{"x": 4, "y": 4.4}, {"x": 5, "y": 5.5}])]) + a = pyarrow.chunked_array([pyarrow.array([{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}]), pyarrow.array([]), pyarrow.array([{"x": 4, "y": 4.4}, {"x": 5, "y": 5.5}])]) + assert awkward.arrow.view(a).tolist() == [{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}, {"x": 4, "y": 4.4}, {"x": 5, "y": 5.5}] def test_nested_struct(self): if pyarrow is not None: - arr = pyarrow.array([[{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}], [], [{"x": 4, "y": 4.4}, {"x": 5, "y": 5.5}]]) + a = pyarrow.array([[{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}], [], [{"x": 4, "y": 4.4}, {"x": 5, "y": 5.5}]]) + assert awkward.arrow.view(a).tolist() == [[{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}], [], [{"x": 4, "y": 4.4}, {"x": 5, "y": 5.5}]] def test_nested_struct_null(self): if pyarrow is not None: - arr = pyarrow.array([[{"x": 1, "y": 1.1}, {"x": 2, "y": None}, {"x": 3, "y": 3.3}], [], [{"x": 4, "y": 4.4}, {"x": 5, "y": 5.5}]]) + a = pyarrow.array([[{"x": 1, "y": 1.1}, {"x": 2, "y": None}, {"x": 3, "y": 3.3}], [], [{"x": 4, "y": 4.4}, {"x": 5, "y": 5.5}]]) + assert awkward.arrow.view(a).tolist() == [[{"x": 1, "y": 1.1}, {"x": 2, "y": None}, {"x": 3, "y": 3.3}], [], [{"x": 4, "y": 4.4}, {"x": 5, "y": 5.5}]] def test_null_nested_struct(self): if pyarrow is not None: - arr = pyarrow.array([[{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}], None, [], [{"x": 4, "y": 4.4}, {"x": 5, "y": 5.5}]]) + a = pyarrow.array([[{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}], None, [], [{"x": 4, "y": 4.4}, {"x": 5, "y": 5.5}]]) + assert awkward.arrow.view(a).tolist() == [[{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}], None, [], [{"x": 4, "y": 4.4}, {"x": 5, "y": 5.5}]] def test_null_nested_struct_null(self): if pyarrow is not None: - arr = pyarrow.array([[{"x": 1, "y": 1.1}, {"x": 2, "y": None}, {"x": 3, "y": 3.3}], None, [], [{"x": 4, "y": 4.4}, {"x": 5, "y": 5.5}]]) + a = pyarrow.array([[{"x": 1, "y": 1.1}, {"x": 2, "y": None}, {"x": 3, "y": 3.3}], None, [], [{"x": 4, "y": 4.4}, {"x": 5, "y": 5.5}]]) + assert awkward.arrow.view(a).tolist() == [[{"x": 1, "y": 1.1}, {"x": 2, "y": None}, {"x": 3, "y": 3.3}], None, [], [{"x": 4, "y": 4.4}, {"x": 5, "y": 5.5}]] def test_struct_nested(self): if pyarrow is not None: - arr = pyarrow.array([{"x": [], "y": 1.1}, {"x": [2], "y": 2.2}, {"x": [3, 3], "y": 3.3}]) + a = pyarrow.array([{"x": [], "y": 1.1}, {"x": [2], "y": 2.2}, {"x": [3, 3], "y": 3.3}]) + assert awkward.arrow.view(a).tolist() == [{"x": [], "y": 1.1}, {"x": [2], "y": 2.2}, {"x": [3, 3], "y": 3.3}] def test_struct_nested_null(self): if pyarrow is not None: - arr = pyarrow.array([{"x": [], "y": 1.1}, {"x": [2], "y": 2.2}, {"x": [None, 3], "y": 3.3}]) + a = pyarrow.array([{"x": [], "y": 1.1}, {"x": [2], "y": 2.2}, {"x": [None, 3], "y": 3.3}]) + assert awkward.arrow.view(a).tolist() == [{"x": [], "y": 1.1}, {"x": [2], "y": 2.2}, {"x": [None, 3], "y": 3.3}] def test_nested_struct_nested(self): if pyarrow is not None: - arr = pyarrow.array([[{"x": [], "y": 1.1}, {"x": [2], "y": 2.2}, {"x": [3, 3], "y": 3.3}], [], [{"x": [4, 4, 4], "y": 4.4}, {"x": [5, 5, 5, 5], "y": 5.5}]]) + a = pyarrow.array([[{"x": [], "y": 1.1}, {"x": [2], "y": 2.2}, {"x": [3, 3], "y": 3.3}], [], [{"x": [4, 4, 4], "y": 4.4}, {"x": [5, 5, 5, 5], "y": 5.5}]]) + assert awkward.arrow.view(a).tolist() == [[{"x": [], "y": 1.1}, {"x": [2], "y": 2.2}, {"x": [3, 3], "y": 3.3}], [], [{"x": [4, 4, 4], "y": 4.4}, {"x": [5, 5, 5, 5], "y": 5.5}]] def test_null_nested_struct_nested_null(self): if pyarrow is not None: - arr = pyarrow.array([[{"x": [], "y": 1.1}, {"x": [2], "y": 2.2}, {"x": [None, 3], "y": 3.3}], None, [], [{"x": [4, 4, 4], "y": 4.4}, {"x": [5, 5, 5, 5], "y": 5.5}]]) + a = pyarrow.array([[{"x": [], "y": 1.1}, {"x": [2], "y": 2.2}, {"x": [None, 3], "y": 3.3}], None, [], [{"x": [4, 4, 4], "y": 4.4}, {"x": [5, 5, 5, 5], "y": 5.5}]]) + assert awkward.arrow.view(a).tolist() == [[{"x": [], "y": 1.1}, {"x": [2], "y": 2.2}, {"x": [None, 3], "y": 3.3}], None, [], [{"x": [4, 4, 4], "y": 4.4}, {"x": [5, 5, 5, 5], "y": 5.5}]] def test_strings(self): if pyarrow is not None: - arr = pyarrow.array(["one", "two", "three", u"fo\u2014ur", "five"]) + a = pyarrow.array(["one", "two", "three", u"fo\u2014ur", "five"]) + assert awkward.arrow.view(a).tolist() == ["one", "two", "three", u"fo\u2014ur", "five"] def test_strings_null(self): if pyarrow is not None: - arr = pyarrow.array(["one", "two", None, u"fo\u2014ur", "five"]) + a = pyarrow.array(["one", "two", None, u"fo\u2014ur", "five"]) + assert awkward.arrow.view(a).tolist() == ["one", "two", None, u"fo\u2014ur", "five"] def test_binary(self): if pyarrow is not None: - arr = pyarrow.array([b"one", b"two", b"three", b"four", b"five"]) + a = pyarrow.array([b"one", b"two", b"three", b"four", b"five"]) + assert awkward.arrow.view(a).tolist() == [b"one", b"two", b"three", b"four", b"five"] def test_binary_null(self): if pyarrow is not None: - arr = pyarrow.array([b"one", b"two", None, b"four", b"five"]) + a = pyarrow.array([b"one", b"two", None, b"four", b"five"]) + assert awkward.arrow.view(a).tolist() == [b"one", b"two", None, b"four", b"five"] def test_chunked_strings(self): if pyarrow is not None: - arr = pyarrow.chunked_array([pyarrow.array(["one", "two", "three", "four", "five"]), pyarrow.array(["six", "seven", "eight"])]) + a = pyarrow.chunked_array([pyarrow.array(["one", "two", "three", "four", "five"]), pyarrow.array(["six", "seven", "eight"])]) + assert awkward.arrow.view(a).tolist() == ["one", "two", "three", "four", "five", "six", "seven", "eight"] def test_nested_strings(self): if pyarrow is not None: - arr = pyarrow.array([["one", "two", "three"], [], ["four", "five"]]) + a = pyarrow.array([["one", "two", "three"], [], ["four", "five"]]) + assert awkward.arrow.view(a).tolist() == [["one", "two", "three"], [], ["four", "five"]] def test_nested_strings_null(self): if pyarrow is not None: - arr = pyarrow.array([["one", "two", None], [], ["four", "five"]]) + a = pyarrow.array([["one", "two", None], [], ["four", "five"]]) + assert awkward.arrow.view(a).tolist() == [["one", "two", None], [], ["four", "five"]] def test_null_nested_strings_null(self): if pyarrow is not None: - arr = pyarrow.array([["one", "two", None], [], None, ["four", "five"]]) + a = pyarrow.array([["one", "two", None], [], None, ["four", "five"]]) + assert awkward.arrow.view(a).tolist() == [["one", "two", None], [], None, ["four", "five"]] def test_union_sparse(self): if pyarrow is not None: - arr = pyarrow.UnionArray.from_sparse(pyarrow.array([0, 1, 0, 0, 1], type=pyarrow.int8()), [pyarrow.array([0.0, 1.1, 2.2, 3.3, 4.4]), pyarrow.array([True, True, False, True, False])]) + a = pyarrow.UnionArray.from_sparse(pyarrow.array([0, 1, 0, 0, 1], type=pyarrow.int8()), [pyarrow.array([0.0, 1.1, 2.2, 3.3, 4.4]), pyarrow.array([True, True, False, True, False])]) + assert awkward.arrow.view(a).tolist() == [0.0, True, 2.2, 3.3, False] def test_union_sparse_null(self): if pyarrow is not None: - arr = pyarrow.UnionArray.from_sparse(pyarrow.array([0, 1, 0, 0, 1], type=pyarrow.int8()), [pyarrow.array([0.0, 1.1, None, 3.3, 4.4]), pyarrow.array([True, True, False, True, False])]) + a = pyarrow.UnionArray.from_sparse(pyarrow.array([0, 1, 0, 0, 1], type=pyarrow.int8()), [pyarrow.array([0.0, 1.1, None, 3.3, 4.4]), pyarrow.array([True, True, False, True, False])]) + assert awkward.arrow.view(a).tolist() == [0.0, True, None, 3.3, False] def test_union_sparse_null_null(self): if pyarrow is not None: - arr = pyarrow.UnionArray.from_sparse(pyarrow.array([0, 1, 0, 0, 1], type=pyarrow.int8()), [pyarrow.array([0.0, 1.1, None, 3.3, 4.4]), pyarrow.array([True, None, False, True, False])]) + a = pyarrow.UnionArray.from_sparse(pyarrow.array([0, 1, 0, 0, 1], type=pyarrow.int8()), [pyarrow.array([0.0, 1.1, None, 3.3, 4.4]), pyarrow.array([True, None, False, True, False])]) + assert awkward.arrow.view(a).tolist() == [0.0, None, None, 3.3, False] def test_union_dense(self): if pyarrow is not None: - arr = pyarrow.UnionArray.from_dense(pyarrow.array([0, 1, 0, 0, 0, 1, 1], type=pyarrow.int8()), pyarrow.array([0, 0, 1, 2, 3, 1, 2], type=pyarrow.int32()), [pyarrow.array([0.0, 1.1, 2.2, 3.3]), pyarrow.array([True, True, False])]) + a = pyarrow.UnionArray.from_dense(pyarrow.array([0, 1, 0, 0, 0, 1, 1], type=pyarrow.int8()), pyarrow.array([0, 0, 1, 2, 3, 1, 2], type=pyarrow.int32()), [pyarrow.array([0.0, 1.1, 2.2, 3.3]), pyarrow.array([True, True, False])]) + assert awkward.arrow.view(a).tolist() == [0.0, True, 1.1, 2.2, 3.3, True, False] def test_union_dense_null(self): if pyarrow is not None: - arr = pyarrow.UnionArray.from_dense(pyarrow.array([0, 1, 0, 0, 0, 1, 1], type=pyarrow.int8()), pyarrow.array([0, 0, 1, 2, 3, 1, 2], type=pyarrow.int32()), [pyarrow.array([0.0, 1.1, None, 3.3]), pyarrow.array([True, True, False])]) + a = pyarrow.UnionArray.from_dense(pyarrow.array([0, 1, 0, 0, 0, 1, 1], type=pyarrow.int8()), pyarrow.array([0, 0, 1, 2, 3, 1, 2], type=pyarrow.int32()), [pyarrow.array([0.0, 1.1, None, 3.3]), pyarrow.array([True, True, False])]) + assert awkward.arrow.view(a).tolist() == [0.0, True, 1.1, None, 3.3, True, False] def test_union_dense_null_null(self): if pyarrow is not None: - arr = pyarrow.UnionArray.from_dense(pyarrow.array([0, 1, 0, 0, 0, 1, 1], type=pyarrow.int8()), pyarrow.array([0, 0, 1, 2, 3, 1, 2], type=pyarrow.int32()), [pyarrow.array([0.0, 1.1, None, 3.3]), pyarrow.array([True, None, False])]) + a = pyarrow.UnionArray.from_dense(pyarrow.array([0, 1, 0, 0, 0, 1, 1], type=pyarrow.int8()), pyarrow.array([0, 0, 1, 2, 3, 1, 2], type=pyarrow.int32()), [pyarrow.array([0.0, 1.1, None, 3.3]), pyarrow.array([True, None, False])]) + assert awkward.arrow.view(a).tolist() == [0.0, True, 1.1, None, 3.3, None, False] def test_dictarray(self): if pyarrow is not None: - arr = pyarrow.DictionaryArray.from_arrays(pyarrow.array([0, 0, 2, 2, 1, 0, 2, 1, 1]), pyarrow.array(["one", "two", "three"])) + a = pyarrow.DictionaryArray.from_arrays(pyarrow.array([0, 0, 2, 2, 1, 0, 2, 1, 1]), pyarrow.array(["one", "two", "three"])) + assert awkward.arrow.view(a).tolist() == ["one", "one", "three", "three", "two", "one", "three", "two", "two"] def test_dictarray_null(self): if pyarrow is not None: - arr = pyarrow.DictionaryArray.from_arrays(pyarrow.array([0, 0, 2, None, 1, None, 2, 1, 1]), pyarrow.array(["one", "two", "three"])) + a = pyarrow.DictionaryArray.from_arrays(pyarrow.array([0, 0, 2, None, 1, None, 2, 1, 1]), pyarrow.array(["one", "two", "three"])) + assert awkward.arrow.view(a).tolist() == ["one", "one", "three", None, "two", None, "three", "two", "two"] def test_null_dictarray(self): if pyarrow is not None: - arr = pyarrow.DictionaryArray.from_arrays(pyarrow.array([0, 0, 2, 2, 1, 0, 2, 1, 1]), pyarrow.array(["one", None, "three"])) + a = pyarrow.DictionaryArray.from_arrays(pyarrow.array([0, 0, 2, 2, 1, 0, 2, 1, 1]), pyarrow.array(["one", None, "three"])) + assert awkward.arrow.view(a).tolist() == ["one", "one", "three", "three", None, "one", "three", None, None] def test_batch(self): if pyarrow is not None: - arr = pyarrow.RecordBatch.from_arrays( + a = pyarrow.RecordBatch.from_arrays( [pyarrow.array([1.1, 2.2, 3.3, None, 5.5]), pyarrow.array([[1, 2, 3], [], [4, 5], [None], [6]]), pyarrow.array([{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}, {"x": 4, "y": None}, {"x": 5, "y": 5.5}]), pyarrow.array([{"x": 1, "y": 1.1}, None, None, {"x": 4, "y": None}, {"x": 5, "y": 5.5}]), pyarrow.array([[{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}], [], [{"x": 4, "y": None}, {"x": 5, "y": 5.5}], [None], [{"x": 6, "y": 6.6}]])], ["a", "b", "c", "d", "e"]) + assert awkward.arrow.view(a).tolist() == [{"a": 1.1, "b": [1, 2, 3], "c": {"x": 1, "y": 1.1}, "d": {"x": 1, "y": 1.1}, "e": [{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}]}, {"a": 2.2, "b": [], "c": {"x": 2, "y": 2.2}, "d": None, "e": []}, {"a": 3.3, "b": [4, 5], "c": {"x": 3, "y": 3.3}, "d": None, "e": [{"x": 4, "y": None}, {"x": 5, "y": 5.5}]}, {"a": None, "b": [None], "c": {"x": 4, "y": None}, "d":{"x": 4, "y": None}, "e": [None]}, {"a": 5.5, "b": [6], "c": {"x": 5, "y": 5.5}, "d": {"x": 5, "y": 5.5}, "e": [{"x": 6, "y": 6.6}]}] def test_table(self): if pyarrow is not None: - arr = pyarrow.Table.from_batches([ + a = pyarrow.Table.from_batches([ pyarrow.RecordBatch.from_arrays( [pyarrow.array([1.1, 2.2, 3.3, None, 5.5]), pyarrow.array([[1, 2, 3], [], [4, 5], [None], [6]]), @@ -217,3 +256,4 @@ def test_table(self): pyarrow.array([{"x": 1, "y": 1.1}, None, None, {"x": 4, "y": None}, {"x": 5, "y": 5.5}]), pyarrow.array([[{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}], [], [{"x": 4, "y": None}, {"x": 5, "y": 5.5}], [None], [{"x": 6, "y": 6.6}]])], ["a", "b", "c", "d", "e"])]) + assert awkward.arrow.view(a).tolist() == [{"a": 1.1, "b": [1, 2, 3], "c": {"x": 1, "y": 1.1}, "d": {"x": 1, "y": 1.1}, "e": [{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}]}, {"a": 2.2, "b": [], "c": {"x": 2, "y": 2.2}, "d": None, "e": []}, {"a": 3.3, "b": [4, 5], "c": {"x": 3, "y": 3.3}, "d": None, "e": [{"x": 4, "y": None}, {"x": 5, "y": 5.5}]}, {"a": None, "b": [None], "c": {"x": 4, "y": None}, "d": {"x": 4, "y": None}, "e": [None]}, {"a": 5.5, "b": [6], "c": {"x": 5, "y": 5.5}, "d": {"x": 5, "y": 5.5}, "e": [{"x": 6, "y": 6.6}]}, {"a": 1.1, "b": [1, 2, 3], "c": {"x": 1, "y": 1.1}, "d": {"x": 1, "y": 1.1}, "e": [{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}]}, {"a": 2.2, "b": [], "c": {"x": 2, "y": 2.2}, "d": None, "e": []}, {"a": 3.3, "b": [4, 5], "c": {"x": 3, "y": 3.3}, "d": None, "e": [{"x": 4, "y": None}, {"x": 5, "y": 5.5}]}, {"a": None, "b": [None], "c": {"x": 4, "y": None}, "d": {"x": 4, "y": None}, "e": [None]}, {"a": 5.5, "b": [6], "c": {"x": 5, "y": 5.5}, "d": {"x": 5, "y": 5.5}, "e": [{"x": 6, "y": 6.6}]}] From 01c015eebfcbc4d22212122b150308742347550e Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Thu, 25 Oct 2018 17:29:39 -0500 Subject: [PATCH 16/27] handle non-nullable cases --- awkward/arrow.py | 96 +++++++++++++++++++++++++++++------------ tests/test_arrow.py | 101 +++++++++++++++++++++++++++----------------- 2 files changed, 130 insertions(+), 67 deletions(-) diff --git a/awkward/arrow.py b/awkward/arrow.py index 1b4956b1..da5d9e34 100644 --- a/awkward/arrow.py +++ b/awkward/arrow.py @@ -44,74 +44,114 @@ def view(obj): import pyarrow - def recurse(tpe, buffers): + def popbuffers(tpe, buffers): if isinstance(tpe, pyarrow.lib.DictionaryType): content = view(tpe.dictionary) - index = recurse(tpe.index_type, buffers) - assert isinstance(index, awkward.array.masked.BitMaskedArray) - return awkward.array.masked.BitMaskedArray(index.mask, awkward.array.indexed.IndexedArray(index.content, content), maskedwhen=index.maskedwhen, lsborder=index.lsborder) + index = popbuffers(tpe.index_type, buffers) + if isinstance(index, awkward.array.masked.BitMaskedArray): + return awkward.array.masked.BitMaskedArray(index.mask, awkward.array.indexed.IndexedArray(index.content, content), maskedwhen=index.maskedwhen, lsborder=index.lsborder) + else: + return awkward.array.indexed.IndexedArray(index, content) elif isinstance(tpe, pyarrow.lib.StructType): pairs = [] for i in range(tpe.num_children - 1, -1, -1): - pairs.insert(0, (tpe[i].name, recurse(tpe[i].type, buffers))) - mask = awkward.util.numpy.frombuffer(buffers.pop(), dtype=ARROW_BITMASKTYPE) - return awkward.array.masked.BitMaskedArray(mask, awkward.array.table.Table.frompairs(pairs), maskedwhen=False, lsborder=True) + pairs.insert(0, (tpe[i].name, popbuffers(tpe[i].type, buffers))) + out = awkward.array.table.Table.frompairs(pairs) + mask = buffers.pop() + if mask is not None: + mask = awkward.util.numpy.frombuffer(mask, dtype=ARROW_BITMASKTYPE) + return awkward.array.masked.BitMaskedArray(mask, out, maskedwhen=False, lsborder=True) + else: + return out elif isinstance(tpe, pyarrow.lib.ListType): - content = recurse(tpe.value_type, buffers) + content = popbuffers(tpe.value_type, buffers) offsets = awkward.util.numpy.frombuffer(buffers.pop(), dtype=ARROW_INDEXTYPE) - mask = awkward.util.numpy.frombuffer(buffers.pop(), dtype=ARROW_BITMASKTYPE) - return awkward.array.masked.BitMaskedArray(mask, awkward.array.jagged.JaggedArray.fromoffsets(offsets, content), maskedwhen=False, lsborder=True) + out = awkward.array.jagged.JaggedArray.fromoffsets(offsets, content) + mask = buffers.pop() + if mask is not None: + mask = awkward.util.numpy.frombuffer(mask, dtype=ARROW_BITMASKTYPE) + return awkward.array.masked.BitMaskedArray(mask, out, maskedwhen=False, lsborder=True) + else: + return out elif isinstance(tpe, pyarrow.lib.UnionType) and tpe.mode == "sparse": contents = [] for i in range(tpe.num_children - 1, -1, -1): - contents.insert(0, recurse(tpe[i].type, buffers)) + contents.insert(0, popbuffers(tpe[i].type, buffers)) assert buffers.pop() is None tags = awkward.util.numpy.frombuffer(buffers.pop(), dtype=ARROW_TAGTYPE) index = awkward.util.numpy.arange(len(tags), dtype=ARROW_INDEXTYPE) - mask = awkward.util.numpy.frombuffer(buffers.pop(), dtype=ARROW_BITMASKTYPE) - return awkward.array.masked.BitMaskedArray(mask, awkward.array.union.UnionArray(tags, index, contents), maskedwhen=False, lsborder=True) + out = awkward.array.union.UnionArray(tags, index, contents) + mask = buffers.pop() + if mask is not None: + mask = awkward.util.numpy.frombuffer(mask, dtype=ARROW_BITMASKTYPE) + return awkward.array.masked.BitMaskedArray(mask, out, maskedwhen=False, lsborder=True) + else: + return out elif isinstance(tpe, pyarrow.lib.UnionType) and tpe.mode == "dense": contents = [] for i in range(tpe.num_children - 1, -1, -1): - contents.insert(0, recurse(tpe[i].type, buffers)) + contents.insert(0, popbuffers(tpe[i].type, buffers)) index = awkward.util.numpy.frombuffer(buffers.pop(), dtype=ARROW_INDEXTYPE) tags = awkward.util.numpy.frombuffer(buffers.pop(), dtype=ARROW_TAGTYPE) - mask = awkward.util.numpy.frombuffer(buffers.pop(), dtype=ARROW_BITMASKTYPE) - return awkward.array.masked.BitMaskedArray(mask, awkward.array.union.UnionArray(tags, index, contents), maskedwhen=False, lsborder=True) + out = awkward.array.union.UnionArray(tags, index, contents) + mask = buffers.pop() + if mask is not None: + mask = awkward.util.numpy.frombuffer(mask, dtype=ARROW_BITMASKTYPE) + return awkward.array.masked.BitMaskedArray(mask, out, maskedwhen=False, lsborder=True) + else: + return out elif tpe == pyarrow.string(): content = awkward.util.numpy.frombuffer(buffers.pop(), dtype=ARROW_CHARTYPE) offsets = awkward.util.numpy.frombuffer(buffers.pop(), dtype=ARROW_INDEXTYPE) - mask = awkward.util.numpy.frombuffer(buffers.pop(), dtype=ARROW_BITMASKTYPE) - return awkward.array.masked.BitMaskedArray(mask, awkward.derived.strings.StringArray.fromoffsets(offsets, content, encoding="utf-8"), maskedwhen=False, lsborder=True) + out = awkward.derived.strings.StringArray.fromoffsets(offsets, content, encoding="utf-8") + mask = buffers.pop() + if mask is not None: + mask = awkward.util.numpy.frombuffer(mask, dtype=ARROW_BITMASKTYPE) + return awkward.array.masked.BitMaskedArray(mask, out, maskedwhen=False, lsborder=True) + else: + return out elif tpe == pyarrow.binary(): content = awkward.util.numpy.frombuffer(buffers.pop(), dtype=ARROW_CHARTYPE) offsets = awkward.util.numpy.frombuffer(buffers.pop(), dtype=ARROW_INDEXTYPE) - mask = awkward.util.numpy.frombuffer(buffers.pop(), dtype=ARROW_BITMASKTYPE) - return awkward.array.masked.BitMaskedArray(mask, awkward.derived.strings.StringArray.fromoffsets(offsets, content, encoding=None), maskedwhen=False, lsborder=True) + out = awkward.derived.strings.StringArray.fromoffsets(offsets, content, encoding=None) + mask = buffers.pop() + if mask is not None: + mask = awkward.util.numpy.frombuffer(mask, dtype=ARROW_BITMASKTYPE) + return awkward.array.masked.BitMaskedArray(mask, out, maskedwhen=False, lsborder=True) + else: + return out elif tpe == pyarrow.bool_(): - content = awkward.util.numpy.unpackbits(awkward.util.numpy.frombuffer(buffers.pop(), dtype=ARROW_CHARTYPE)).view(awkward.util.BOOLTYPE) - content = content.reshape(-1, 8)[:,::-1].reshape(-1) # lsborder=True - mask = awkward.util.numpy.frombuffer(buffers.pop(), dtype=ARROW_BITMASKTYPE) - return awkward.array.masked.BitMaskedArray(mask, content, maskedwhen=False, lsborder=True) + out = awkward.util.numpy.unpackbits(awkward.util.numpy.frombuffer(buffers.pop(), dtype=ARROW_CHARTYPE)).view(awkward.util.BOOLTYPE) + out = out.reshape(-1, 8)[:,::-1].reshape(-1) # lsborder=True + mask = buffers.pop() + if mask is not None: + mask = awkward.util.numpy.frombuffer(mask, dtype=ARROW_BITMASKTYPE) + return awkward.array.masked.BitMaskedArray(mask, out, maskedwhen=False, lsborder=True) + else: + return out elif isinstance(tpe, pyarrow.lib.DataType): - content = awkward.util.numpy.frombuffer(buffers.pop(), dtype=tpe.to_pandas_dtype()) - mask = awkward.util.numpy.frombuffer(buffers.pop(), dtype=ARROW_BITMASKTYPE) - return awkward.array.masked.BitMaskedArray(mask, content, maskedwhen=False, lsborder=True) + out = awkward.util.numpy.frombuffer(buffers.pop(), dtype=tpe.to_pandas_dtype()) + mask = buffers.pop() + if mask is not None: + mask = awkward.util.numpy.frombuffer(mask, dtype=ARROW_BITMASKTYPE) + return awkward.array.masked.BitMaskedArray(mask, out, maskedwhen=False, lsborder=True) + else: + return out else: raise NotImplementedError(repr(tpe)) if isinstance(obj, pyarrow.lib.Array): buffers = obj.buffers() - out = recurse(obj.type, buffers)[:len(obj)] + out = popbuffers(obj.type, buffers)[:len(obj)] assert len(buffers) == 0 return out diff --git a/tests/test_arrow.py b/tests/test_arrow.py index d66a2cd0..b7bf5110 100644 --- a/tests/test_arrow.py +++ b/tests/test_arrow.py @@ -33,6 +33,7 @@ import numpy try: import pyarrow + import pyarrow.parquet except ImportError: pyarrow = None @@ -43,192 +44,192 @@ class Test(unittest.TestCase): def runTest(self): pass - def test_array(self): + def test_arrow_array(self): if pyarrow is not None: a = pyarrow.array([1.1, 2.2, 3.3, 4.4, 5.5]) assert awkward.arrow.view(a).tolist() == [1.1, 2.2, 3.3, 4.4, 5.5] - def test_boolean(self): + def test_arrow_boolean(self): if pyarrow is not None: a = pyarrow.array([True, True, False, False, True]) assert awkward.arrow.view(a).tolist() == [True, True, False, False, True] - def test_array_null(self): + def test_arrow_array_null(self): if pyarrow is not None: a = pyarrow.array([1.1, 2.2, 3.3, None, 4.4, 5.5]) assert awkward.arrow.view(a).tolist() == [1.1, 2.2, 3.3, None, 4.4, 5.5] - def test_nested_array(self): + def test_arrow_nested_array(self): if pyarrow is not None: a = pyarrow.array([[1.1, 2.2, 3.3], [], [4.4, 5.5]]) assert awkward.arrow.view(a).tolist() == [[1.1, 2.2, 3.3], [], [4.4, 5.5]] - def test_nested_array_null(self): + def test_arrow_nested_array_null(self): if pyarrow is not None: a = pyarrow.array([[1.1, 2.2, None], [], [4.4, 5.5]]) assert awkward.arrow.view(a).tolist() == [[1.1, 2.2, None], [], [4.4, 5.5]] - def test_null_nested_array_null(self): + def test_arrow_null_nested_array_null(self): if pyarrow is not None: a = pyarrow.array([[1.1, 2.2, None], [], None, [4.4, 5.5]]) assert awkward.arrow.view(a).tolist() == [[1.1, 2.2, None], [], None, [4.4, 5.5]] - def test_chunked_array(self): + def test_arrow_chunked_array(self): if pyarrow is not None: a = pyarrow.chunked_array([pyarrow.array([1.1, 2.2, 3.3, 4.4, 5.5]), pyarrow.array([]), pyarrow.array([6.6, 7.7, 8.8])]) assert awkward.arrow.view(a).tolist() == [1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8] - def test_struct(self): + def test_arrow_struct(self): if pyarrow is not None: a = pyarrow.array([{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}]) assert awkward.arrow.view(a).tolist() == [{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}] - def test_struct_null(self): + def test_arrow_struct_null(self): if pyarrow is not None: a = pyarrow.array([{"x": 1, "y": 1.1}, {"x": 2, "y": None}, {"x": 3, "y": 3.3}]) assert awkward.arrow.view(a).tolist() == [{"x": 1, "y": 1.1}, {"x": 2, "y": None}, {"x": 3, "y": 3.3}] - def test_null_struct(self): + def test_arrow_null_struct(self): if pyarrow is not None: a = pyarrow.array([{"x": 1, "y": 1.1}, None, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}]) assert awkward.arrow.view(a).tolist() == [{"x": 1, "y": 1.1}, None, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}] - def test_null_struct_null(self): + def test_arrow_null_struct_null(self): if pyarrow is not None: a = pyarrow.array([{"x": 1, "y": 1.1}, None, {"x": 2, "y": None}, {"x": 3, "y": 3.3}]) assert awkward.arrow.view(a).tolist() == [{"x": 1, "y": 1.1}, None, {"x": 2, "y": None}, {"x": 3, "y": 3.3}] - def test_chunked_struct(self): + def test_arrow_chunked_struct(self): if pyarrow is not None: a = pyarrow.chunked_array([pyarrow.array([{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}]), pyarrow.array([]), pyarrow.array([{"x": 4, "y": 4.4}, {"x": 5, "y": 5.5}])]) assert awkward.arrow.view(a).tolist() == [{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}, {"x": 4, "y": 4.4}, {"x": 5, "y": 5.5}] - def test_nested_struct(self): + def test_arrow_nested_struct(self): if pyarrow is not None: a = pyarrow.array([[{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}], [], [{"x": 4, "y": 4.4}, {"x": 5, "y": 5.5}]]) assert awkward.arrow.view(a).tolist() == [[{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}], [], [{"x": 4, "y": 4.4}, {"x": 5, "y": 5.5}]] - def test_nested_struct_null(self): + def test_arrow_nested_struct_null(self): if pyarrow is not None: a = pyarrow.array([[{"x": 1, "y": 1.1}, {"x": 2, "y": None}, {"x": 3, "y": 3.3}], [], [{"x": 4, "y": 4.4}, {"x": 5, "y": 5.5}]]) assert awkward.arrow.view(a).tolist() == [[{"x": 1, "y": 1.1}, {"x": 2, "y": None}, {"x": 3, "y": 3.3}], [], [{"x": 4, "y": 4.4}, {"x": 5, "y": 5.5}]] - def test_null_nested_struct(self): + def test_arrow_null_nested_struct(self): if pyarrow is not None: a = pyarrow.array([[{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}], None, [], [{"x": 4, "y": 4.4}, {"x": 5, "y": 5.5}]]) assert awkward.arrow.view(a).tolist() == [[{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}], None, [], [{"x": 4, "y": 4.4}, {"x": 5, "y": 5.5}]] - def test_null_nested_struct_null(self): + def test_arrow_null_nested_struct_null(self): if pyarrow is not None: a = pyarrow.array([[{"x": 1, "y": 1.1}, {"x": 2, "y": None}, {"x": 3, "y": 3.3}], None, [], [{"x": 4, "y": 4.4}, {"x": 5, "y": 5.5}]]) assert awkward.arrow.view(a).tolist() == [[{"x": 1, "y": 1.1}, {"x": 2, "y": None}, {"x": 3, "y": 3.3}], None, [], [{"x": 4, "y": 4.4}, {"x": 5, "y": 5.5}]] - def test_struct_nested(self): + def test_arrow_struct_nested(self): if pyarrow is not None: a = pyarrow.array([{"x": [], "y": 1.1}, {"x": [2], "y": 2.2}, {"x": [3, 3], "y": 3.3}]) assert awkward.arrow.view(a).tolist() == [{"x": [], "y": 1.1}, {"x": [2], "y": 2.2}, {"x": [3, 3], "y": 3.3}] - def test_struct_nested_null(self): + def test_arrow_struct_nested_null(self): if pyarrow is not None: a = pyarrow.array([{"x": [], "y": 1.1}, {"x": [2], "y": 2.2}, {"x": [None, 3], "y": 3.3}]) assert awkward.arrow.view(a).tolist() == [{"x": [], "y": 1.1}, {"x": [2], "y": 2.2}, {"x": [None, 3], "y": 3.3}] - def test_nested_struct_nested(self): + def test_arrow_nested_struct_nested(self): if pyarrow is not None: a = pyarrow.array([[{"x": [], "y": 1.1}, {"x": [2], "y": 2.2}, {"x": [3, 3], "y": 3.3}], [], [{"x": [4, 4, 4], "y": 4.4}, {"x": [5, 5, 5, 5], "y": 5.5}]]) assert awkward.arrow.view(a).tolist() == [[{"x": [], "y": 1.1}, {"x": [2], "y": 2.2}, {"x": [3, 3], "y": 3.3}], [], [{"x": [4, 4, 4], "y": 4.4}, {"x": [5, 5, 5, 5], "y": 5.5}]] - def test_null_nested_struct_nested_null(self): + def test_arrow_null_nested_struct_nested_null(self): if pyarrow is not None: a = pyarrow.array([[{"x": [], "y": 1.1}, {"x": [2], "y": 2.2}, {"x": [None, 3], "y": 3.3}], None, [], [{"x": [4, 4, 4], "y": 4.4}, {"x": [5, 5, 5, 5], "y": 5.5}]]) assert awkward.arrow.view(a).tolist() == [[{"x": [], "y": 1.1}, {"x": [2], "y": 2.2}, {"x": [None, 3], "y": 3.3}], None, [], [{"x": [4, 4, 4], "y": 4.4}, {"x": [5, 5, 5, 5], "y": 5.5}]] - def test_strings(self): + def test_arrow_strings(self): if pyarrow is not None: a = pyarrow.array(["one", "two", "three", u"fo\u2014ur", "five"]) assert awkward.arrow.view(a).tolist() == ["one", "two", "three", u"fo\u2014ur", "five"] - def test_strings_null(self): + def test_arrow_strings_null(self): if pyarrow is not None: a = pyarrow.array(["one", "two", None, u"fo\u2014ur", "five"]) assert awkward.arrow.view(a).tolist() == ["one", "two", None, u"fo\u2014ur", "five"] - def test_binary(self): + def test_arrow_binary(self): if pyarrow is not None: a = pyarrow.array([b"one", b"two", b"three", b"four", b"five"]) assert awkward.arrow.view(a).tolist() == [b"one", b"two", b"three", b"four", b"five"] - def test_binary_null(self): + def test_arrow_binary_null(self): if pyarrow is not None: a = pyarrow.array([b"one", b"two", None, b"four", b"five"]) assert awkward.arrow.view(a).tolist() == [b"one", b"two", None, b"four", b"five"] - def test_chunked_strings(self): + def test_arrow_chunked_strings(self): if pyarrow is not None: a = pyarrow.chunked_array([pyarrow.array(["one", "two", "three", "four", "five"]), pyarrow.array(["six", "seven", "eight"])]) assert awkward.arrow.view(a).tolist() == ["one", "two", "three", "four", "five", "six", "seven", "eight"] - def test_nested_strings(self): + def test_arrow_nested_strings(self): if pyarrow is not None: a = pyarrow.array([["one", "two", "three"], [], ["four", "five"]]) assert awkward.arrow.view(a).tolist() == [["one", "two", "three"], [], ["four", "five"]] - def test_nested_strings_null(self): + def test_arrow_nested_strings_null(self): if pyarrow is not None: a = pyarrow.array([["one", "two", None], [], ["four", "five"]]) assert awkward.arrow.view(a).tolist() == [["one", "two", None], [], ["four", "five"]] - def test_null_nested_strings_null(self): + def test_arrow_null_nested_strings_null(self): if pyarrow is not None: a = pyarrow.array([["one", "two", None], [], None, ["four", "five"]]) assert awkward.arrow.view(a).tolist() == [["one", "two", None], [], None, ["four", "five"]] - def test_union_sparse(self): + def test_arrow_union_sparse(self): if pyarrow is not None: a = pyarrow.UnionArray.from_sparse(pyarrow.array([0, 1, 0, 0, 1], type=pyarrow.int8()), [pyarrow.array([0.0, 1.1, 2.2, 3.3, 4.4]), pyarrow.array([True, True, False, True, False])]) assert awkward.arrow.view(a).tolist() == [0.0, True, 2.2, 3.3, False] - def test_union_sparse_null(self): + def test_arrow_union_sparse_null(self): if pyarrow is not None: a = pyarrow.UnionArray.from_sparse(pyarrow.array([0, 1, 0, 0, 1], type=pyarrow.int8()), [pyarrow.array([0.0, 1.1, None, 3.3, 4.4]), pyarrow.array([True, True, False, True, False])]) assert awkward.arrow.view(a).tolist() == [0.0, True, None, 3.3, False] - def test_union_sparse_null_null(self): + def test_arrow_union_sparse_null_null(self): if pyarrow is not None: a = pyarrow.UnionArray.from_sparse(pyarrow.array([0, 1, 0, 0, 1], type=pyarrow.int8()), [pyarrow.array([0.0, 1.1, None, 3.3, 4.4]), pyarrow.array([True, None, False, True, False])]) assert awkward.arrow.view(a).tolist() == [0.0, None, None, 3.3, False] - def test_union_dense(self): + def test_arrow_union_dense(self): if pyarrow is not None: a = pyarrow.UnionArray.from_dense(pyarrow.array([0, 1, 0, 0, 0, 1, 1], type=pyarrow.int8()), pyarrow.array([0, 0, 1, 2, 3, 1, 2], type=pyarrow.int32()), [pyarrow.array([0.0, 1.1, 2.2, 3.3]), pyarrow.array([True, True, False])]) assert awkward.arrow.view(a).tolist() == [0.0, True, 1.1, 2.2, 3.3, True, False] - def test_union_dense_null(self): + def test_arrow_union_dense_null(self): if pyarrow is not None: a = pyarrow.UnionArray.from_dense(pyarrow.array([0, 1, 0, 0, 0, 1, 1], type=pyarrow.int8()), pyarrow.array([0, 0, 1, 2, 3, 1, 2], type=pyarrow.int32()), [pyarrow.array([0.0, 1.1, None, 3.3]), pyarrow.array([True, True, False])]) assert awkward.arrow.view(a).tolist() == [0.0, True, 1.1, None, 3.3, True, False] - def test_union_dense_null_null(self): + def test_arrow_union_dense_null_null(self): if pyarrow is not None: a = pyarrow.UnionArray.from_dense(pyarrow.array([0, 1, 0, 0, 0, 1, 1], type=pyarrow.int8()), pyarrow.array([0, 0, 1, 2, 3, 1, 2], type=pyarrow.int32()), [pyarrow.array([0.0, 1.1, None, 3.3]), pyarrow.array([True, None, False])]) assert awkward.arrow.view(a).tolist() == [0.0, True, 1.1, None, 3.3, None, False] - def test_dictarray(self): + def test_arrow_dictarray(self): if pyarrow is not None: a = pyarrow.DictionaryArray.from_arrays(pyarrow.array([0, 0, 2, 2, 1, 0, 2, 1, 1]), pyarrow.array(["one", "two", "three"])) assert awkward.arrow.view(a).tolist() == ["one", "one", "three", "three", "two", "one", "three", "two", "two"] - def test_dictarray_null(self): + def test_arrow_dictarray_null(self): if pyarrow is not None: a = pyarrow.DictionaryArray.from_arrays(pyarrow.array([0, 0, 2, None, 1, None, 2, 1, 1]), pyarrow.array(["one", "two", "three"])) assert awkward.arrow.view(a).tolist() == ["one", "one", "three", None, "two", None, "three", "two", "two"] - def test_null_dictarray(self): + def test_arrow_null_dictarray(self): if pyarrow is not None: a = pyarrow.DictionaryArray.from_arrays(pyarrow.array([0, 0, 2, 2, 1, 0, 2, 1, 1]), pyarrow.array(["one", None, "three"])) assert awkward.arrow.view(a).tolist() == ["one", "one", "three", "three", None, "one", "three", None, None] - def test_batch(self): + def test_arrow_batch(self): if pyarrow is not None: a = pyarrow.RecordBatch.from_arrays( [pyarrow.array([1.1, 2.2, 3.3, None, 5.5]), @@ -239,7 +240,7 @@ def test_batch(self): ["a", "b", "c", "d", "e"]) assert awkward.arrow.view(a).tolist() == [{"a": 1.1, "b": [1, 2, 3], "c": {"x": 1, "y": 1.1}, "d": {"x": 1, "y": 1.1}, "e": [{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}]}, {"a": 2.2, "b": [], "c": {"x": 2, "y": 2.2}, "d": None, "e": []}, {"a": 3.3, "b": [4, 5], "c": {"x": 3, "y": 3.3}, "d": None, "e": [{"x": 4, "y": None}, {"x": 5, "y": 5.5}]}, {"a": None, "b": [None], "c": {"x": 4, "y": None}, "d":{"x": 4, "y": None}, "e": [None]}, {"a": 5.5, "b": [6], "c": {"x": 5, "y": 5.5}, "d": {"x": 5, "y": 5.5}, "e": [{"x": 6, "y": 6.6}]}] - def test_table(self): + def test_arrow_table(self): if pyarrow is not None: a = pyarrow.Table.from_batches([ pyarrow.RecordBatch.from_arrays( @@ -257,3 +258,25 @@ def test_table(self): pyarrow.array([[{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}], [], [{"x": 4, "y": None}, {"x": 5, "y": 5.5}], [None], [{"x": 6, "y": 6.6}]])], ["a", "b", "c", "d", "e"])]) assert awkward.arrow.view(a).tolist() == [{"a": 1.1, "b": [1, 2, 3], "c": {"x": 1, "y": 1.1}, "d": {"x": 1, "y": 1.1}, "e": [{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}]}, {"a": 2.2, "b": [], "c": {"x": 2, "y": 2.2}, "d": None, "e": []}, {"a": 3.3, "b": [4, 5], "c": {"x": 3, "y": 3.3}, "d": None, "e": [{"x": 4, "y": None}, {"x": 5, "y": 5.5}]}, {"a": None, "b": [None], "c": {"x": 4, "y": None}, "d": {"x": 4, "y": None}, "e": [None]}, {"a": 5.5, "b": [6], "c": {"x": 5, "y": 5.5}, "d": {"x": 5, "y": 5.5}, "e": [{"x": 6, "y": 6.6}]}, {"a": 1.1, "b": [1, 2, 3], "c": {"x": 1, "y": 1.1}, "d": {"x": 1, "y": 1.1}, "e": [{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}]}, {"a": 2.2, "b": [], "c": {"x": 2, "y": 2.2}, "d": None, "e": []}, {"a": 3.3, "b": [4, 5], "c": {"x": 3, "y": 3.3}, "d": None, "e": [{"x": 4, "y": None}, {"x": 5, "y": 5.5}]}, {"a": None, "b": [None], "c": {"x": 4, "y": None}, "d": {"x": 4, "y": None}, "e": [None]}, {"a": 5.5, "b": [6], "c": {"x": 5, "y": 5.5}, "d": {"x": 5, "y": 5.5}, "e": [{"x": 6, "y": 6.6}]}] + + def test_arrow_nonnullable_table(self): + if pyarrow is not None: + x = pyarrow.array([1, 2, 3]) + y = pyarrow.array([1.1, 2.2, 3.3]) + table = pyarrow.Table.from_arrays([x], ["x"]) + table2 = table.add_column(1, pyarrow.column(pyarrow.field("y", y.type, False), numpy.array([1.1, 2.2, 3.3]))) + assert awkward.arrow.view(table2).tolist() == [{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}] + + # def test_arrow_writeparquet(self): + # if pyarrow is not None: + # a = pyarrow.array([True, True, None, False, True]) + # b = pyarrow.array([0.0, 1.1, None, 3.3, 4.4]) + # c = pyarrow.array([None, [1.1, 2.2, None], [], None, [4.4, 5.5]]) + + + + # table = pyarrow.Table.from_arrays + + # writer = pyarrow.parquet.ParquetWriter("tests/samples/features-0_11_1.parquet", table.schema) + # writer.write_table(table) + # writer.close() From bf8a8c3f390017992f9452e09273bdb383cb34ef Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Thu, 25 Oct 2018 17:53:12 -0500 Subject: [PATCH 17/27] ready to read parquet --- tests/samples/features-0_11_1.parquet | Bin 0 -> 1042 bytes tests/test_arrow.py | 33 ++++++++++++++++++-------- 2 files changed, 23 insertions(+), 10 deletions(-) create mode 100644 tests/samples/features-0_11_1.parquet diff --git a/tests/samples/features-0_11_1.parquet b/tests/samples/features-0_11_1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..730b7689376aa5dcc3171a31230eff8a9e047f19 GIT binary patch literal 1042 zcmcIk&ubGw6n-;1J3HAlXl389TlWw(UG|U^1FrNC=(beoA!xK7JSGhh1E$6#dewu7 z;>BzG2YB+}A0Y+BgMWY*#k0^NcvD{{zoeu$`xs`2_s#q9eQ$Sb>$l1%FvUt06^e5J zd9lC8m^fU^s7;pMHRh&ZOs5E_ajA$N1xb{luO^7E_V zlUlxtt3@nKTb>}kV&gk{fez>M!^WNGIm=<{%9Eo%AI^Yfzc`RfS%^N15XFeWD%%nF z+x>xP4_Y1KQqa%1{=uRHEBeNX&AjyS94{54mTI_BMc1&2XVNQ@-e$s0GiM8nDcvM9 z>Tlp?5#uS93G$En9{Da59oiiwU)EoP%d% Date: Thu, 25 Oct 2018 19:40:15 -0500 Subject: [PATCH 18/27] started schema2type --- awkward/arrow.py | 61 +++++++++++++++++++++++++- tests/samples/features-0_11_1.parquet | Bin 1042 -> 1911 bytes tests/test_arrow.py | 5 ++- 3 files changed, 62 insertions(+), 4 deletions(-) diff --git a/awkward/arrow.py b/awkward/arrow.py index da5d9e34..bdfe58ad 100644 --- a/awkward/arrow.py +++ b/awkward/arrow.py @@ -34,6 +34,7 @@ import awkward.array.masked import awkward.array.table import awkward.derived.strings +import awkward.type import awkward.util ARROW_BITMASKTYPE = awkward.util.numpy.uint8 @@ -41,6 +42,56 @@ ARROW_TAGTYPE = awkward.util.numpy.uint8 ARROW_CHARTYPE = awkward.util.numpy.uint8 +def schema2type(schema): + + + def recurse(tpe): + if isinstance(tpe, pyarrow.lib.DictionaryType): + return recurse(tpe.dictionary.type) + + elif isinstance(tpe, pyarrow.lib.StructType): + out = None + for i in range(tpe.num_children): + x = awkward.type.ArrayType(tpe[i].name, recurse(tpe[i].type)) + if out is None: + out = x + else: + out = out & x + return out + + elif isinstance(tpe, pyarrow.lib.ListType): + return awkward.type.ArrayType(float("inf"), recurse(tpe.value_type)) + + elif isinstance(tpe, pyarrow.lib.UnionType): + out = None + for i in range(tpe.num_children): + x = recurse(tpe[i].type) + if out is None: + out = x + else: + out = out | x + return out + + elif tpe == pyarrow.string(): + raise NotImplementedError + + elif tpe == pyarrow.binary(): + raise NotImplementedError + + elif tpe == pyarrow.bool_(): + raise NotImplementedError + + elif isinstance(tpe, pyarrow.lib.DataType): + tpe.to_pandas_dtype() + raise NotImplementedError + + else: + raise NotImplementedError(repr(tpe)) + + + + + def view(obj): import pyarrow @@ -157,7 +208,10 @@ def popbuffers(tpe, buffers): elif isinstance(obj, pyarrow.lib.ChunkedArray): chunks = [x for x in obj.chunks if len(x) > 0] - return awkward.array.chunked.ChunkedArray([view(x) for x in chunks], counts=[len(x) for x in chunks]) + if len(chunks) == 1: + return chunks[0] + else: + return awkward.array.chunked.ChunkedArray([view(x) for x in chunks], counts=[len(x) for x in chunks]) elif isinstance(obj, pyarrow.lib.RecordBatch): out = awkward.array.table.Table() @@ -173,7 +227,10 @@ def popbuffers(tpe, buffers): if len(chunk) > 0: chunks.append(chunk) counts.append(len(chunk)) - return awkward.array.chunked.ChunkedArray(chunks, counts=counts) + if len(chunks) == 1: + return chunks[0] + else: + return awkward.array.chunked.ChunkedArray(chunks, counts=counts) else: raise NotImplementedError(type(obj)) diff --git a/tests/samples/features-0_11_1.parquet b/tests/samples/features-0_11_1.parquet index 730b7689376aa5dcc3171a31230eff8a9e047f19..ca8e09aab914b7db37148613b37f30d8e2303584 100644 GIT binary patch delta 185 zcmbQl@tto&G}A=ZB)K14GNN3PrYsC>k`j!GqD*2UVn-N(NUeuUZ42i_d+~|0`k5N| zCd)G`vVP-Ho5V9YkZ~o`BEiXj7`0h@1k~2?PqY`CyoXt0auib$vxcP3=JQP4jEa&r cYDAbU&a8%Jv>vk>rrCZRdzly*0vv-30dGt%@Bjb+ delta 44 zcmey)H;H3IG}GkGEQK5*k}{GuYG)WH?`IO9*kCxhgmvF!dv*hk4U7y70gge207NYf AP5=M^ diff --git a/tests/test_arrow.py b/tests/test_arrow.py index 150d3068..78b9f42c 100644 --- a/tests/test_arrow.py +++ b/tests/test_arrow.py @@ -287,9 +287,10 @@ def test_arrow_nonnullable_table(self): # ["a", "b", "c"])]) # writer = pyarrow.parquet.ParquetWriter("tests/samples/features-0_11_1.parquet", a.schema) # writer.write_table(a) + # writer.write_table(a) # writer.close() def test_arrow_readparquet(self): if pyarrow is not None: - a = pyarrow.parquet.read_table("tests/samples/features-0_11_1.parquet") - print(a) + file = pyarrow.parquet.ParquetFile("tests/samples/features-0_11_1.parquet") + From 54c413ad7a39900991d3f08e3b9f886847237c27 Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Fri, 26 Oct 2018 08:15:05 -0500 Subject: [PATCH 19/27] VirtualArray must have args and kwargs --- awkward/array/chunked.py | 6 +-- awkward/array/indexed.py | 16 +++--- awkward/array/jagged.py | 4 +- awkward/array/masked.py | 8 +-- awkward/array/objects.py | 33 ++++-------- awkward/array/virtual.py | 58 ++++++++++++++++----- awkward/arrow.py | 84 +++++++++++++++++++++++------- awkward/derived/strings.py | 6 +++ awkward/persist.py | 101 ++++++++++++++++++++++++++++++------- awkward/type.py | 4 ++ 10 files changed, 234 insertions(+), 86 deletions(-) diff --git a/awkward/array/chunked.py b/awkward/array/chunked.py index 354f062f..c58cc656 100644 --- a/awkward/array/chunked.py +++ b/awkward/array/chunked.py @@ -84,7 +84,7 @@ def __awkward_persist__(self, ident, fill, prefix, suffix, schemasuffix, storage return {"id": ident, "call": ["awkward", n], "args": [{"list": [fill(x, n + ".chunk", prefix, suffix, schemasuffix, storage, compression, **kwargs) for c, x in zip(self._counts, self._chunks) if c > 0]}, - [c for c in self._counts if c > 0]]} + {"json": [int(c) for c in self._counts if c > 0]}]} @property def chunks(self): @@ -646,8 +646,8 @@ def __awkward_persist__(self, ident, fill, prefix, suffix, schemasuffix, storage return {"id": ident, "call": ["awkward", n], - "args": [{"tuple": list(self._chunkshape)}, - {"call": ["awkward.persist", "json2dtype"], "args": [awkward.persist.dtype2json(self._dtype)]}, + "args": [{"tuple": [{"json": int(x)} for x in self._chunkshape]}, + {"dtype": awkward.persist.dtype2json(self._dtype)}, {"list": [fill(x, n + ".chunk", prefix, suffix, schemasuffix, storage, compression, **kwargs) for x in chunks]}]} @property diff --git a/awkward/array/indexed.py b/awkward/array/indexed.py index 52f9e0af..11b6cf59 100644 --- a/awkward/array/indexed.py +++ b/awkward/array/indexed.py @@ -265,7 +265,7 @@ def __awkward_persist__(self, ident, fill, prefix, suffix, schemasuffix, storage "call": ["awkward", n], "args": [fill(self._index, n + ".index", prefix, suffix, schemasuffix, storage, compression, **kwargs), fill(self._content, n + ".content", prefix, suffix, schemasuffix, storage, compression, **kwargs), - {"call": ["awkward.persist", "json2dtype"], "args": [awkward.persist.dtype2json(self._dtype)]}]} + {"dtype": awkward.persist.dtype2json(self._dtype)}]} @property def content(self): @@ -427,16 +427,18 @@ def __awkward_persist__(self, ident, fill, prefix, suffix, schemasuffix, storage self._valid() n = self.__class__.__name__ - if self._default is None or isinstance(self._default, (numbers.Real, awkward.util.numpy.integer, awkward.util.numpy.floating)): - default = self._default - elif isinstance(self._default, awkward.util.numpy.ndarray): - default = fill(self._default, n + ".default", prefix, suffix, schemasuffix, storage, compression, **kwargs) + if self._default is None: + default = {"json": self._default} + elif isinstance(self._default, (numbers.Integral, awkward.util.numpy.integer)): + default = {"json": int(self._default)} + elif isinstance(self._default, (numbers.Real, awkward.util.numpy.floating)) and awkward.util.numpy.isfinite(self._default): + default = {"json": float(self._default)} else: - default = {"call": ["pickle", "loads"], "args": pickle.dumps(self._default)} + default = fill(self._default, n + ".default", prefix, suffix, schemasuffix, storage, compression, **kwargs) return {"id": ident, "call": ["awkward", n], - "args": [self._length, + "args": [{"json": int(self._length)}, fill(self._index, n + ".index", prefix, suffix, schemasuffix, storage, compression, **kwargs), fill(self._content, n + ".content", prefix, suffix, schemasuffix, storage, compression, **kwargs), default]} diff --git a/awkward/array/jagged.py b/awkward/array/jagged.py index 61cd4535..43e044b9 100644 --- a/awkward/array/jagged.py +++ b/awkward/array/jagged.py @@ -1150,14 +1150,14 @@ def __awkward_persist__(self, ident, fill, prefix, suffix, schemasuffix, storage "call": ["awkward", n, "fromcounts"], "args": [fill(self.counts, n + ".counts", prefix, suffix, schemasuffix, storage, compression, **kwargs), fill(self._content, n + ".content", prefix, suffix, schemasuffix, storage, compression, **kwargs), - {"call": ["awkward.persist", "json2dtype"], "args": [awkward.persist.dtype2json(self._subdtype)]}]} + {"dtype": awkward.persist.dtype2json(self._subdtype)}]} else: return {"id": ident, "call": ["awkward", n], "args": [fill(self._starts, n + ".starts", prefix, suffix, schemasuffix, storage, compression, **kwargs), fill(self._stops, n + ".stops", prefix, suffix, schemasuffix, storage, compression, **kwargs), fill(self._content, n + ".content", prefix, suffix, schemasuffix, storage, compression, **kwargs), - {"call": ["awkward.persist", "json2dtype"], "args": [awkward.persist.dtype2json(self._subdtype)]}]} + {"dtype": awkward.persist.dtype2json(self._subdtype)}]} @property def content(self): diff --git a/awkward/array/masked.py b/awkward/array/masked.py index 0da1511e..9f47428a 100644 --- a/awkward/array/masked.py +++ b/awkward/array/masked.py @@ -113,7 +113,7 @@ def __awkward_persist__(self, ident, fill, prefix, suffix, schemasuffix, storage "call": ["awkward", n], "args": [fill(self._mask, n + ".mask", prefix, suffix, schemasuffix, storage, compression, **kwargs), fill(self._content, n + ".content", prefix, suffix, schemasuffix, storage, compression, **kwargs), - self._maskedwhen]} + {"json": bool(self._maskedwhen)}]} @property def mask(self): @@ -307,8 +307,8 @@ def __awkward_persist__(self, ident, fill, prefix, suffix, schemasuffix, storage "call": ["awkward", n], "args": [fill(self._mask, n + ".mask", prefix, suffix, schemasuffix, storage, compression, **kwargs), fill(self._content, n + ".content", prefix, suffix, schemasuffix, storage, compression, **kwargs), - self._maskedwhen, - self._lsborder]} + {"json": bool(self._maskedwhen)}, + {"json": bool(self._lsborder)}]} @property def mask(self): @@ -532,7 +532,7 @@ def __awkward_persist__(self, ident, fill, prefix, suffix, schemasuffix, storage "call": ["awkward", n], "args": [fill(self._mask, n + ".mask", prefix, suffix, schemasuffix, storage, compression, **kwargs), fill(self._content, n + ".content", prefix, suffix, schemasuffix, storage, compression, **kwargs), - self._maskedwhen]} + {"json": int(self._maskedwhen)}]} @property def mask(self): diff --git a/awkward/array/objects.py b/awkward/array/objects.py index 6e2d4162..7d625c5d 100644 --- a/awkward/array/objects.py +++ b/awkward/array/objects.py @@ -29,10 +29,9 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import importlib -import pickle -import base64 import awkward.array.base +import awkward.persist import awkward.type import awkward.util @@ -55,14 +54,8 @@ def maybemixin(sample, awkwardtype): else: return awkwardtype -def frompython(obj): - return base64.b64encode(pickle.dumps(obj)).decode("ascii") - -def topython(string): - return pickle.loads(base64.b64decode(string.encode("ascii"))) - class ObjectArray(awkward.array.base.AwkwardArrayWithContent): - def __init__(self, content, generator, *args, **kwargs): + def __init__(self, content, generator, args=(), kwargs={}): self.content = content self.generator = generator self.args = args @@ -135,15 +128,14 @@ def __awkward_persist__(self, ident, fill, prefix, suffix, schemasuffix, storage if gen is not self._generator: raise TypeError("cannot persist ObjectArray: its generator cannot be found via its __name__ (Python 2) or __qualname__ (Python 3)") - n = self.__class__.__name__ + name = self.__class__.__name__ out = {"id": ident, - "call": ["awkward", n], - "args": [fill(self._content, n + ".content", prefix, suffix, schemasuffix, storage, compression, **kwargs), {"function": spec}]} - if len(self._args) > 0: - out["*"] = {"call": ["awkward.array.objects", "topython"], "args": [frompython(self._args)]} - if len(self._kwargs) > 0: - out["**"] = {"call": ["awkward.array.objects", "topython"], "args": [frompython(self._kwargs)]} - + "call": ["awkward", name], + "args": [fill(self._content, name + ".content", prefix, suffix, schemasuffix, storage, compression, **kwargs), + {"function": spec}, + {"tuple": [fill(x, name + ".args", prefix, suffix, schemasuffix, storage, compression, **kwargs) for x in self._args]}, + {"dict": {n: fill(x, name + ".kwargs", prefix, suffix, schemasuffix, storage, compression, **kwargs) for n, x in self._kwargs.items()}}]} + return out @property @@ -182,16 +174,13 @@ def kwargs(self): def kwargs(self, value): if not isinstance(value, dict): raise TypeError("kwargs must be a dict") - self._kwargs = value + self._kwargs = dict(value) def __len__(self): return len(self._content) def _gettype(self, seen): - if len(self._content.shape) == 1: - return self._generator - else: - return awkward.type.ArrayType(*(self._content.shape[1:] + (self._generator,))) + return self._generator def _getshape(self): return (len(self._content),) diff --git a/awkward/array/virtual.py b/awkward/array/virtual.py index 2764e7c4..b0db8a19 100644 --- a/awkward/array/virtual.py +++ b/awkward/array/virtual.py @@ -50,8 +50,10 @@ def __ne__(self, other): def __getstate__(self): raise RuntimeError("VirtualArray.TransientKeys are not unique across processes, and hence should not be serialized") - def __init__(self, generator, cache=None, persistentkey=None, type=None, persistvirtual=True): + def __init__(self, generator, args=(), kwargs={}, cache=None, persistentkey=None, type=None, persistvirtual=True): self.generator = generator + self.args = args + self.kwargs = kwargs self.cache = cache self.persistentkey = persistentkey self.type = type @@ -60,9 +62,11 @@ def __init__(self, generator, cache=None, persistentkey=None, type=None, persist self._setitem = None self._delitem = None - def copy(self, generator=None, cache=None, persistentkey=None, type=None, persistvirtual=None): + def copy(self, generator=None, args=None, kwargs=None, cache=None, persistentkey=None, type=None, persistvirtual=None): out = self.__class__.__new__(self.__class__) out._generator = self._generator + out._args = self._args + out._kwargs = self._kwargs out._cache = self._cache out._persistentkey = self._persistentkey out._type = self._type @@ -78,6 +82,10 @@ def copy(self, generator=None, cache=None, persistentkey=None, type=None, persis out._delitem = list(self._delitem) if generator is not None: out.generator = generator + if args is not None: + out.args = args + if kwargs is not None: + out.kwargs = kwargs if cache is not None: out.cache = cache if persistentkey is not None: @@ -88,8 +96,8 @@ def copy(self, generator=None, cache=None, persistentkey=None, type=None, persis out.persistvirtual = persistvirtual return out - def deepcopy(self, generator=None, cache=None, persistentkey=None, type=None, persistvirtual=None): - out = self.copy(generator=generator, cache=cache, persistentkey=persistentkey, type=type, persistvirtual=persistvirtual) + def deepcopy(self, generator=None, args=None, kwargs=None, cache=None, persistentkey=None, type=None, persistvirtual=None): + out = self.copy(generator=generator, args=arge, kwargs=kwargs, cache=cache, persistentkey=persistentkey, type=type, persistvirtual=persistvirtual) out._array = awkward.util.deepcopy(out._array) if out._setitem is not None: for n in list(out._setitem): @@ -116,7 +124,7 @@ def ones_like(self, **overrides): def __awkward_persist__(self, ident, fill, prefix, suffix, schemasuffix, storage, compression, **kwargs): self._valid() - n = self.__class__.__name__ + name = self.__class__.__name__ if self._persistvirtual: if self._generator.__module__ == "__main__": @@ -133,20 +141,25 @@ def __awkward_persist__(self, ident, fill, prefix, suffix, schemasuffix, storage raise TypeError("cannot persist VirtualArray: its generator cannot be found via its __name__ (Python 2) or __qualname__ (Python 3)") out = {"id": ident, - "call": ["awkward", n], - "args": [{"function": spec}], + "call": ["awkward", name], + "args": [{"function": spec}, + {"tuple": [fill(x, name + ".args", prefix, suffix, schemasuffix, storage, compression, **kwargs) for x in self._args]}, + {"dict": {n: fill(x, name + ".kwargs", prefix, suffix, schemasuffix, storage, compression, **kwargs) for n, x in self._kwargs.items()}}], "cacheable": True} others = {} if self._persistentkey is not None: - others["persistentkey"] = self._persistentkey + try: + others["persistentkey"] = {"json": awkward.persist.jsonable(self._persistentkey)} + except TypeError: + others["persistentkey"] = {"call": ["awkward.persist", "topython"], "args": [awkward.persist.frompython(self._persistentkey)]} if self._type is not None: - others["type"] = {"call": ["awkward.persist", "json2type"], "args": [awkward.persist.type2json(self._type)], "whitelistable": True} + others["type"] = {"call": ["awkward.persist", "json2type"], "args": [{"json": awkward.persist.type2json(self._type)}], "whitelistable": True} if len(others) > 0: out["kwargs"] = others return out else: - return fill(self.array, n + ".array", prefix, suffix, schemasuffix, storage, compression, **kwargs) + return fill(self.array, name + ".array", prefix, suffix, schemasuffix, storage, compression, **kwargs) @property def generator(self): @@ -155,9 +168,30 @@ def generator(self): @generator.setter def generator(self, value): if not callable(value): - raise TypeError("generator must be a callable (of zero arguments)") + raise TypeError("generator must be a callable") self._generator = value + + @property + def args(self): + return self._args + + @args.setter + def args(self, value): + if not isinstance(value, tuple): + value = (value,) + self._args = value + + @property + def kwargs(self): + return self._kwargs + + @kwargs.setter + def kwargs(self, value): + if not isinstance(value, dict): + raise TypeError("kwargs must be a dict") + self._kwargs = dict(value) + @property def cache(self): return self._cache @@ -269,7 +303,7 @@ def ismaterialized(self): return self._array is not None and self._array in self._cache def materialize(self): - array = awkward.util.toarray(self._generator(), awkward.util.DEFAULTTYPE) + array = awkward.util.toarray(self._generator(*self._args, **self._kwargs), awkward.util.DEFAULTTYPE) if self._setitem is not None: for n, x in self._setitem.items(): array[n] = x diff --git a/awkward/arrow.py b/awkward/arrow.py index bdfe58ad..53ee069b 100644 --- a/awkward/arrow.py +++ b/awkward/arrow.py @@ -43,54 +43,85 @@ ARROW_CHARTYPE = awkward.util.numpy.uint8 def schema2type(schema): - - - def recurse(tpe): + def recurse(tpe, nullable): if isinstance(tpe, pyarrow.lib.DictionaryType): - return recurse(tpe.dictionary.type) + out = recurse(tpe.dictionary.type, nullable) + if nullable: + return awkward.type.OptionType(out) + else: + return out elif isinstance(tpe, pyarrow.lib.StructType): out = None for i in range(tpe.num_children): - x = awkward.type.ArrayType(tpe[i].name, recurse(tpe[i].type)) + x = awkward.type.ArrayType(tpe[i].name, recurse(tpe[i].type, tpe[i].nullable)) if out is None: out = x else: out = out & x - return out + if nullable: + return awkward.type.OptionType(out) + else: + return out elif isinstance(tpe, pyarrow.lib.ListType): - return awkward.type.ArrayType(float("inf"), recurse(tpe.value_type)) + out = awkward.type.ArrayType(float("inf"), recurse(tpe.value_type, nullable)) + if nullable: + return awkward.type.OptionType(out) + else: + return out elif isinstance(tpe, pyarrow.lib.UnionType): out = None for i in range(tpe.num_children): - x = recurse(tpe[i].type) + x = recurse(tpe[i].type, nullable) if out is None: out = x else: out = out | x - return out + if nullable: + return awkward.type.OptionType(out) + else: + return out elif tpe == pyarrow.string(): - raise NotImplementedError + if nullable: + return awkward.type.OptionType(str) + else: + return str elif tpe == pyarrow.binary(): - raise NotImplementedError + if nullable: + return awkward.type.OptionType(bytes) + else: + return bytes elif tpe == pyarrow.bool_(): - raise NotImplementedError - + out = awkward.util.numpy.dtype(bool) + if nullable: + return awkward.type.OptionType(out) + else: + return out + elif isinstance(tpe, pyarrow.lib.DataType): - tpe.to_pandas_dtype() - raise NotImplementedError + if nullable: + return awkward.type.OptionType(tpe.to_pandas_dtype()) + else: + return tpe.to_pandas_dtype() else: raise NotImplementedError(repr(tpe)) + out = None + for name in schema.names: + field = schema.field_by_name(name) + mytype = awkward.type.ArrayType(name, recurse(field.type, field.nullable)) + if out is None: + out = mytype + else: + out = out & mytype - - + return out def view(obj): import pyarrow @@ -234,3 +265,22 @@ def popbuffers(tpe, buffers): else: raise NotImplementedError(type(obj)) + + + +def fromparquet(file, cache=None, metadata=None, common_metadata=None): + import pyarrow.parquet + + pqfile = pyarrow.parquet.ParquetFile(file, metadata=metadata, common_metadata=common_metadata) + tpe = schema2type(pqfile.schema.to_arrow_schema()) + names = tpe.columns + + chunks = [] + counts = [] + for i in range(pqfile.num_row_groups): + numrows = pqfile.metadata.row_group(i).num_rows + + + + + return awkward.array.chunked.ChunkedArray(chunks, counts) diff --git a/awkward/derived/strings.py b/awkward/derived/strings.py index af5ff07b..d3a0c276 100644 --- a/awkward/derived/strings.py +++ b/awkward/derived/strings.py @@ -327,6 +327,12 @@ def parents(self): def index(self): return self._content.index + def _gettype(self, seen): + if self._encoding is None: + return bytes + else: + return str + def __getitem__(self, where): if awkward.util.isstringslice(where): raise IndexError("cannot index StringArray with string or sequence of strings") diff --git a/awkward/persist.py b/awkward/persist.py index 2ea38315..3daa0437 100644 --- a/awkward/persist.py +++ b/awkward/persist.py @@ -28,11 +28,13 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import base64 import fnmatch import importlib import json import numbers import os +import pickle import zipfile import zlib try: @@ -40,14 +42,12 @@ except ImportError: from collections import Mapping, MutableMapping -import numpy - import awkward.type import awkward.util import awkward.version compression = [ - {"minsize": 8192, "types": [numpy.bool_, numpy.bool, numpy.integer], "contexts": "*", "pair": (zlib.compress, ("zlib", "decompress"))}, + {"minsize": 8192, "types": [awkward.util.numpy.bool_, awkward.util.numpy.bool, awkward.util.numpy.integer], "contexts": "*", "pair": (zlib.compress, ("zlib", "decompress"))}, ] partner = { @@ -60,6 +60,12 @@ ["awkward", "Table"], ["awkward.persist", "*"]] +def frompython(obj): + return base64.b64encode(pickle.dumps(obj)).decode("ascii") + +def topython(string): + return pickle.loads(base64.b64decode(string.encode("ascii"))) + def spec2function(obj, whitelist=whitelist): for white in whitelist: for n, p in zip(obj, white): @@ -91,7 +97,7 @@ def recurse(obj): return [recurse(x) for x in obj] else: return obj - return numpy.dtype(recurse(obj)) + return awkward.util.numpy.dtype(recurse(obj)) def type2json(obj): if isinstance(obj, awkward.type.Type): @@ -134,7 +140,7 @@ def recurse(obj): else: return out - elif isinstance(obj, numpy.dtype): + elif isinstance(obj, awkward.util.numpy.dtype): return {"dtype": dtype2json(obj)} elif callable(obj): @@ -205,8 +211,34 @@ def recurse(obj): return awkward.type._resolve(recurse(obj), {}) +def jsonable(obj): + if obj is None: + return obj + + elif isinstance(obj, dict) and all(isinstance(n, str) for n in obj): + return {n: jsonable(x) for n, x in obj.items()} + + elif isinstance(obj, list): + return [jsonable(x) for x in obj] + + elif isinstance(obj, str): + return str(obj) + + elif isinstance(obj, (bool, awkward.util.numpy.bool_, awkward.util.numpy.bool)): + return bool(obj) # policy: eliminate Numpy types + + elif isinstance(obj, (numbers.Integral, awkward.util.numpy.integer)): + return int(obj) # policy: eliminate Numpy types + + elif isinstance(obj, (numbers.Real, awkward.util.numpy.floating)) and awkward.util.numpy.finite(obj): + return float(obj) # policy: eliminate Numpy types + + else: + raise TypeError("object cannot be losslessly serialized as JSON") + def serialize(obj, storage, name=None, delimiter="-", suffix=None, schemasuffix=None, compression=compression, **kwargs): import awkward.array.base + import awkward.array.virtual for n in kwargs: if n not in (): @@ -261,11 +293,14 @@ def fill(obj, context, prefix, suffix, schemasuffix, storage, compression, **kwa ident = len(seen) seen[id(obj)] = ident - if type(obj) is numpy.ndarray and len(obj.shape) != 0: + if type(obj) is awkward.util.numpy.dtype: + return {"dtype": dtype2json(obj)} + + elif type(obj) is awkward.util.numpy.ndarray and len(obj.shape) != 0: if len(obj.shape) > 1: - dtype = dtype2json(numpy.dtype((obj.dtype, obj.shape[1:]))) + dtype = awkward.util.numpy.dtype((obj.dtype, obj.shape[1:])) else: - dtype = dtype2json(obj.dtype) + dtype = obj.dtype for policy in normalized: minsize, types, contexts, pair = policy["minsize"], policy["types"], policy["contexts"], policy["pair"] @@ -276,22 +311,27 @@ def fill(obj, context, prefix, suffix, schemasuffix, storage, compression, **kwa return {"id": ident, "call": ["numpy", "frombuffer"], "args": [{"call": decompress, "args": [{"read": str(ident) + suffix}]}, - {"call": ["awkward.persist", "json2dtype"], "args": [dtype]}, - len(obj)]} + {"dtype": dtype2json(dtype)}, + {"json": len(obj)}]} else: storage[prefix + str(ident) + suffix] = obj.tostring() return {"id": ident, "call": ["numpy", "frombuffer"], "args": [{"read": str(ident) + suffix}, - {"call": ["awkward.persist", "json2dtype"], "args": [dtype]}, - len(obj)]} + {"dtype": dtype2json(dtype)}, + {"json": len(obj)}]} elif hasattr(obj, "__awkward_persist__"): return obj.__awkward_persist__(ident, fill, prefix, suffix, schemasuffix, storage, compression, **kwargs) else: - raise TypeError("cannot serialize {0} instance (has no __awkward_persist__ method)".format(type(obj))) + try: + obj = jsonable(obj) + except TypeError: + return {"call": ["awkward.persist", "topython"], "args": [awkward.persist.frompython(obj)]} + else: + return {"json": obj} schema = {"awkward": awkward.version.__version__, "schema": fill(obj, "", prefix, suffix, schemasuffix, storage, compression, **kwargs)} @@ -305,7 +345,7 @@ def deserialize(storage, name="", whitelist=whitelist, cache=None): import awkward.array.virtual schema = storage[name] - if isinstance(schema, numpy.ndarray): + if isinstance(schema, awkward.util.numpy.ndarray): schema = schema.tostring() if isinstance(schema, bytes): schema = schema.decode("ascii") @@ -358,26 +398,38 @@ def unfill(schema): elif "pairs" in schema: return [(n, unfill(x)) for n, x in schema["pairs"]] + elif "dict" in schema: + return {n: unfill(x) for n, x in schema["dict"].items()} + + elif "dtype" in schema: + return json2dtype(schema["dtype"]) + elif "function" in schema: return spec2function(schema["function"], whitelist=whitelist) + elif "json" in schema: + return schema["json"] + elif "ref" in schema: if schema["ref"] in seen: return seen[schema["ref"]] else: return awkward.array.virtual.VirtualArray(lambda: seen[schema["ref"]]) - + else: - return schema + raise ValueError("unrecognized JSON object with fields {0}".format(", ".join(repr(x) for x in schema))) + + elif isinstance(schema, list): + raise ValueError("unrecognized JSON list with length {0}".format(len(schema))) else: - return schema + raise ValueError("unrecognized JSON object: {0}".format(repr(schema))) return unfill(schema["schema"]) def keys(storage, name="", subschemas=True): schema = storage[name] - if isinstance(schema, numpy.ndarray): + if isinstance(schema, awkward.util.numpy.ndarray): schema = schema.tostring() if isinstance(schema, bytes): schema = schema.decode("ascii") @@ -422,9 +474,20 @@ def recurse(schema): for y in recurse(x): yield y + elif "dict" in schema: + for x in schema["dict"].values(): + for y in recurse(x): + yield y + + elif "dtype" in schema: + pass + elif "function" in schema: pass + elif "json" in schema: + pass + elif "ref" in schema: pass @@ -547,7 +610,7 @@ def __init__(self): def __getitem__(self, where): return self.g[where].value def __setitem__(self, where, what): - self.g[where] = numpy.frombuffer(what, dtype=numpy.uint8) + self.g[where] = awkward.util.numpy.frombuffer(what, dtype=awkward.util.numpy.uint8) self._group = Wrap() diff --git a/awkward/type.py b/awkward/type.py index b0092ed2..24fc85d3 100644 --- a/awkward/type.py +++ b/awkward/type.py @@ -363,6 +363,10 @@ def dtype(self): out.append((n, x.dtype)) return awkward.util.numpy.dtype(out) + @property + def columns(self): + return list(self._fields) + def _isnumpy(self, seen): if id(self) in seen: return False From 8f031405040a0b89db32e2b853fc47c0d1239f5a Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Fri, 26 Oct 2018 08:19:54 -0500 Subject: [PATCH 20/27] the 'n' for self.__class__.__name__ thing was stupid and dangerous --- awkward/array/chunked.py | 12 +++++------- awkward/array/indexed.py | 23 ++++++++++------------- awkward/array/jagged.py | 30 ++++++++++++++---------------- awkward/array/masked.py | 21 +++++++++------------ awkward/array/objects.py | 11 +++++------ awkward/array/union.py | 15 +++++++-------- awkward/array/virtual.py | 11 +++++------ 7 files changed, 55 insertions(+), 68 deletions(-) diff --git a/awkward/array/chunked.py b/awkward/array/chunked.py index c58cc656..fecde3e3 100644 --- a/awkward/array/chunked.py +++ b/awkward/array/chunked.py @@ -80,10 +80,9 @@ def ones_like(self, **overrides): def __awkward_persist__(self, ident, fill, prefix, suffix, schemasuffix, storage, compression, **kwargs): self.knowcounts() self._valid() - n = self.__class__.__name__ return {"id": ident, - "call": ["awkward", n], - "args": [{"list": [fill(x, n + ".chunk", prefix, suffix, schemasuffix, storage, compression, **kwargs) for c, x in zip(self._counts, self._chunks) if c > 0]}, + "call": ["awkward", self.__class__.__name__], + "args": [{"list": [fill(x, self.__class__.__name__ + ".chunk", prefix, suffix, schemasuffix, storage, compression, **kwargs) for c, x in zip(self._counts, self._chunks) if c > 0]}, {"json": [int(c) for c in self._counts if c > 0]}]} @property @@ -635,8 +634,7 @@ def _mine(self, overrides): def __awkward_persist__(self, ident, fill, prefix, suffix, schemasuffix, storage, compression, **kwargs): self._valid() - n = self.__class__.__name__ - + chunks = [] for c, x in zip(self._counts, self._chunks): if 0 < c < len(x): @@ -645,10 +643,10 @@ def __awkward_persist__(self, ident, fill, prefix, suffix, schemasuffix, storage chunks.append(x) return {"id": ident, - "call": ["awkward", n], + "call": ["awkward", self.__class__.__name__], "args": [{"tuple": [{"json": int(x)} for x in self._chunkshape]}, {"dtype": awkward.persist.dtype2json(self._dtype)}, - {"list": [fill(x, n + ".chunk", prefix, suffix, schemasuffix, storage, compression, **kwargs) for x in chunks]}]} + {"list": [fill(x, self.__class__.__name__ + ".chunk", prefix, suffix, schemasuffix, storage, compression, **kwargs) for x in chunks]}]} @property def chunkshape(self): diff --git a/awkward/array/indexed.py b/awkward/array/indexed.py index 11b6cf59..efa8f425 100644 --- a/awkward/array/indexed.py +++ b/awkward/array/indexed.py @@ -89,11 +89,10 @@ def ones_like(self, **overrides): def __awkward_persist__(self, ident, fill, prefix, suffix, schemasuffix, storage, compression, **kwargs): self._valid() - n = self.__class__.__name__ return {"id": ident, - "call": ["awkward", n], - "args": [fill(self._index, n + ".index", prefix, suffix, schemasuffix, storage, compression, **kwargs), - fill(self._content, n + ".content", prefix, suffix, schemasuffix, storage, compression, **kwargs)]} + "call": ["awkward", self.__class__.__name__], + "args": [fill(self._index, self.__class__.__name__ + ".index", prefix, suffix, schemasuffix, storage, compression, **kwargs), + fill(self._content, self.__class__.__name__ + ".content", prefix, suffix, schemasuffix, storage, compression, **kwargs)]} @property def index(self): @@ -260,11 +259,10 @@ def ones_like(self, **overrides): def __awkward_persist__(self, ident, fill, prefix, suffix, schemasuffix, storage, compression, **kwargs): self._valid() - n = self.__class__.__name__ return {"id": ident, - "call": ["awkward", n], - "args": [fill(self._index, n + ".index", prefix, suffix, schemasuffix, storage, compression, **kwargs), - fill(self._content, n + ".content", prefix, suffix, schemasuffix, storage, compression, **kwargs), + "call": ["awkward", self.__class__.__name__], + "args": [fill(self._index, self.__class__.__name__ + ".index", prefix, suffix, schemasuffix, storage, compression, **kwargs), + fill(self._content, self.__class__.__name__ + ".content", prefix, suffix, schemasuffix, storage, compression, **kwargs), {"dtype": awkward.persist.dtype2json(self._dtype)}]} @property @@ -425,7 +423,6 @@ def ones_like(self, **overrides): def __awkward_persist__(self, ident, fill, prefix, suffix, schemasuffix, storage, compression, **kwargs): self._valid() - n = self.__class__.__name__ if self._default is None: default = {"json": self._default} @@ -434,13 +431,13 @@ def __awkward_persist__(self, ident, fill, prefix, suffix, schemasuffix, storage elif isinstance(self._default, (numbers.Real, awkward.util.numpy.floating)) and awkward.util.numpy.isfinite(self._default): default = {"json": float(self._default)} else: - default = fill(self._default, n + ".default", prefix, suffix, schemasuffix, storage, compression, **kwargs) + default = fill(self._default, self.__class__.__name__ + ".default", prefix, suffix, schemasuffix, storage, compression, **kwargs) return {"id": ident, - "call": ["awkward", n], + "call": ["awkward", self.__class__.__name__], "args": [{"json": int(self._length)}, - fill(self._index, n + ".index", prefix, suffix, schemasuffix, storage, compression, **kwargs), - fill(self._content, n + ".content", prefix, suffix, schemasuffix, storage, compression, **kwargs), + fill(self._index, self.__class__.__name__ + ".index", prefix, suffix, schemasuffix, storage, compression, **kwargs), + fill(self._content, self.__class__.__name__ + ".content", prefix, suffix, schemasuffix, storage, compression, **kwargs), default]} @property diff --git a/awkward/array/jagged.py b/awkward/array/jagged.py index 43e044b9..b08f1e4b 100644 --- a/awkward/array/jagged.py +++ b/awkward/array/jagged.py @@ -236,18 +236,17 @@ def ones_like(self, **overrides): def __awkward_persist__(self, ident, fill, prefix, suffix, schemasuffix, storage, compression, **kwargs): self._valid() - n = self.__class__.__name__ if offsetsaliased(self._starts, self._stops) and len(self._starts) > 0 and self._starts[0] == 0: return {"id": ident, - "call": ["awkward", n, "fromcounts"], - "args": [fill(self.counts, n + ".counts", prefix, suffix, schemasuffix, storage, compression, **kwargs), - fill(self._content, n + ".content", prefix, suffix, schemasuffix, storage, compression, **kwargs)]} + "call": ["awkward", self.__class__.__name__, "fromcounts"], + "args": [fill(self.counts, self.__class__.__name__ + ".counts", prefix, suffix, schemasuffix, storage, compression, **kwargs), + fill(self._content, self.__class__.__name__ + ".content", prefix, suffix, schemasuffix, storage, compression, **kwargs)]} else: return {"id": ident, - "call": ["awkward", n], - "args": [fill(self._starts, n + ".starts", prefix, suffix, schemasuffix, storage, compression, **kwargs), - fill(self._stops, n + ".stops", prefix, suffix, schemasuffix, storage, compression, **kwargs), - fill(self._content, n + ".content", prefix, suffix, schemasuffix, storage, compression, **kwargs)]} + "call": ["awkward", self.__class__.__name__], + "args": [fill(self._starts, self.__class__.__name__ + ".starts", prefix, suffix, schemasuffix, storage, compression, **kwargs), + fill(self._stops, self.__class__.__name__ + ".stops", prefix, suffix, schemasuffix, storage, compression, **kwargs), + fill(self._content, self.__class__.__name__ + ".content", prefix, suffix, schemasuffix, storage, compression, **kwargs)]} @property def starts(self): @@ -1144,19 +1143,18 @@ def deepcopy(self, starts=None, stops=None, content=None, subdtype=None): def __awkward_persist__(self, ident, fill, prefix, suffix, schemasuffix, storage, compression, **kwargs): self._valid() - n = self.__class__.__name__ if offsetsaliased(self._starts, self._stops) and len(self._starts) > 0 and self._starts[0] == 0: return {"id": ident, - "call": ["awkward", n, "fromcounts"], - "args": [fill(self.counts, n + ".counts", prefix, suffix, schemasuffix, storage, compression, **kwargs), - fill(self._content, n + ".content", prefix, suffix, schemasuffix, storage, compression, **kwargs), + "call": ["awkward", self.__class__.__name__, "fromcounts"], + "args": [fill(self.counts, self.__class__.__name__ + ".counts", prefix, suffix, schemasuffix, storage, compression, **kwargs), + fill(self._content, self.__class__.__name__ + ".content", prefix, suffix, schemasuffix, storage, compression, **kwargs), {"dtype": awkward.persist.dtype2json(self._subdtype)}]} else: return {"id": ident, - "call": ["awkward", n], - "args": [fill(self._starts, n + ".starts", prefix, suffix, schemasuffix, storage, compression, **kwargs), - fill(self._stops, n + ".stops", prefix, suffix, schemasuffix, storage, compression, **kwargs), - fill(self._content, n + ".content", prefix, suffix, schemasuffix, storage, compression, **kwargs), + "call": ["awkward", self.__class__.__name__], + "args": [fill(self._starts, self.__class__.__name__ + ".starts", prefix, suffix, schemasuffix, storage, compression, **kwargs), + fill(self._stops, self.__class__.__name__ + ".stops", prefix, suffix, schemasuffix, storage, compression, **kwargs), + fill(self._content, self.__class__.__name__ + ".content", prefix, suffix, schemasuffix, storage, compression, **kwargs), {"dtype": awkward.persist.dtype2json(self._subdtype)}]} @property diff --git a/awkward/array/masked.py b/awkward/array/masked.py index 9f47428a..5d225d05 100644 --- a/awkward/array/masked.py +++ b/awkward/array/masked.py @@ -108,11 +108,10 @@ def ones_like(self, **overrides): def __awkward_persist__(self, ident, fill, prefix, suffix, schemasuffix, storage, compression, **kwargs): self._valid() - n = self.__class__.__name__ return {"id": ident, - "call": ["awkward", n], - "args": [fill(self._mask, n + ".mask", prefix, suffix, schemasuffix, storage, compression, **kwargs), - fill(self._content, n + ".content", prefix, suffix, schemasuffix, storage, compression, **kwargs), + "call": ["awkward", self.__class__.__name__], + "args": [fill(self._mask, self.__class__.__name__ + ".mask", prefix, suffix, schemasuffix, storage, compression, **kwargs), + fill(self._content, self.__class__.__name__ + ".content", prefix, suffix, schemasuffix, storage, compression, **kwargs), {"json": bool(self._maskedwhen)}]} @property @@ -302,11 +301,10 @@ def _mine(self, overrides): def __awkward_persist__(self, ident, fill, prefix, suffix, schemasuffix, storage, compression, **kwargs): self._valid() - n = self.__class__.__name__ return {"id": ident, - "call": ["awkward", n], - "args": [fill(self._mask, n + ".mask", prefix, suffix, schemasuffix, storage, compression, **kwargs), - fill(self._content, n + ".content", prefix, suffix, schemasuffix, storage, compression, **kwargs), + "call": ["awkward", self.__class__.__name__], + "args": [fill(self._mask, self.__class__.__name__ + ".mask", prefix, suffix, schemasuffix, storage, compression, **kwargs), + fill(self._content, self.__class__.__name__ + ".content", prefix, suffix, schemasuffix, storage, compression, **kwargs), {"json": bool(self._maskedwhen)}, {"json": bool(self._lsborder)}]} @@ -527,11 +525,10 @@ def copy(self, mask=None, content=None, maskedwhen=None): def __awkward_persist__(self, ident, fill, prefix, suffix, schemasuffix, storage, compression, **kwargs): self._valid() - n = self.__class__.__name__ return {"id": ident, - "call": ["awkward", n], - "args": [fill(self._mask, n + ".mask", prefix, suffix, schemasuffix, storage, compression, **kwargs), - fill(self._content, n + ".content", prefix, suffix, schemasuffix, storage, compression, **kwargs), + "call": ["awkward", self.__class__.__name__], + "args": [fill(self._mask, self.__class__.__name__ + ".mask", prefix, suffix, schemasuffix, storage, compression, **kwargs), + fill(self._content, self.__class__.__name__ + ".content", prefix, suffix, schemasuffix, storage, compression, **kwargs), {"json": int(self._maskedwhen)}]} @property diff --git a/awkward/array/objects.py b/awkward/array/objects.py index 7d625c5d..38af3c44 100644 --- a/awkward/array/objects.py +++ b/awkward/array/objects.py @@ -127,14 +127,13 @@ def __awkward_persist__(self, ident, fill, prefix, suffix, schemasuffix, storage gen, genname = getattr(gen, genname[0]), genname[1:] if gen is not self._generator: raise TypeError("cannot persist ObjectArray: its generator cannot be found via its __name__ (Python 2) or __qualname__ (Python 3)") - - name = self.__class__.__name__ + out = {"id": ident, - "call": ["awkward", name], - "args": [fill(self._content, name + ".content", prefix, suffix, schemasuffix, storage, compression, **kwargs), + "call": ["awkward", self.__class__.__name__], + "args": [fill(self._content, self.__class__.__name__ + ".content", prefix, suffix, schemasuffix, storage, compression, **kwargs), {"function": spec}, - {"tuple": [fill(x, name + ".args", prefix, suffix, schemasuffix, storage, compression, **kwargs) for x in self._args]}, - {"dict": {n: fill(x, name + ".kwargs", prefix, suffix, schemasuffix, storage, compression, **kwargs) for n, x in self._kwargs.items()}}]} + {"tuple": [fill(x, self.__class__.__name__ + ".args", prefix, suffix, schemasuffix, storage, compression, **kwargs) for x in self._args]}, + {"dict": {n: fill(x, self.__class__.__name__ + ".kwargs", prefix, suffix, schemasuffix, storage, compression, **kwargs) for n, x in self._kwargs.items()}}]} return out diff --git a/awkward/array/union.py b/awkward/array/union.py index 78fa9cd6..e4c4c735 100644 --- a/awkward/array/union.py +++ b/awkward/array/union.py @@ -97,19 +97,18 @@ def issequential(self): def __awkward_persist__(self, ident, fill, prefix, suffix, schemasuffix, storage, compression, **kwargs): self._valid() - n = self.__class__.__name__ if self.issequential: return {"id": ident, - "call": ["awkward", n, "fromtags"], - "args": [fill(self._tags, n + ".tags", prefix, suffix, schemasuffix, storage, compression, **kwargs), - {"list": [fill(x, n + ".contents", prefix, suffix, schemasuffix, storage, compression, **kwargs) for x in self._contents]}]} + "call": ["awkward", self.__class__.__name__, "fromtags"], + "args": [fill(self._tags, self.__class__.__name__ + ".tags", prefix, suffix, schemasuffix, storage, compression, **kwargs), + {"list": [fill(x, self.__class__.__name__ + ".contents", prefix, suffix, schemasuffix, storage, compression, **kwargs) for x in self._contents]}]} else: return {"id": ident, - "call": ["awkward", n], - "args": [fill(self._tags, n + ".tags", prefix, suffix, schemasuffix, storage, compression, **kwargs), - fill(self._index, n + ".index", prefix, suffix, schemasuffix, storage, compression, **kwargs), - {"list": [fill(x, n + ".contents", prefix, suffix, schemasuffix, storage, compression, **kwargs) for x in self._contents]}]} + "call": ["awkward", self.__class__.__name__], + "args": [fill(self._tags, self.__class__.__name__ + ".tags", prefix, suffix, schemasuffix, storage, compression, **kwargs), + fill(self._index, self.__class__.__name__ + ".index", prefix, suffix, schemasuffix, storage, compression, **kwargs), + {"list": [fill(x, self.__class__.__name__ + ".contents", prefix, suffix, schemasuffix, storage, compression, **kwargs) for x in self._contents]}]} @property def tags(self): diff --git a/awkward/array/virtual.py b/awkward/array/virtual.py index b0db8a19..88a35d19 100644 --- a/awkward/array/virtual.py +++ b/awkward/array/virtual.py @@ -124,8 +124,7 @@ def ones_like(self, **overrides): def __awkward_persist__(self, ident, fill, prefix, suffix, schemasuffix, storage, compression, **kwargs): self._valid() - name = self.__class__.__name__ - + if self._persistvirtual: if self._generator.__module__ == "__main__": raise TypeError("cannot persist VirtualArray: its generator is defined in __main__, which won't be available in a subsequent session") @@ -141,10 +140,10 @@ def __awkward_persist__(self, ident, fill, prefix, suffix, schemasuffix, storage raise TypeError("cannot persist VirtualArray: its generator cannot be found via its __name__ (Python 2) or __qualname__ (Python 3)") out = {"id": ident, - "call": ["awkward", name], + "call": ["awkward", self.__class__.__name__], "args": [{"function": spec}, - {"tuple": [fill(x, name + ".args", prefix, suffix, schemasuffix, storage, compression, **kwargs) for x in self._args]}, - {"dict": {n: fill(x, name + ".kwargs", prefix, suffix, schemasuffix, storage, compression, **kwargs) for n, x in self._kwargs.items()}}], + {"tuple": [fill(x, self.__class__.__name__ + ".args", prefix, suffix, schemasuffix, storage, compression, **kwargs) for x in self._args]}, + {"dict": {n: fill(x, self.__class__.__name__ + ".kwargs", prefix, suffix, schemasuffix, storage, compression, **kwargs) for n, x in self._kwargs.items()}}], "cacheable": True} others = {} if self._persistentkey is not None: @@ -159,7 +158,7 @@ def __awkward_persist__(self, ident, fill, prefix, suffix, schemasuffix, storage return out else: - return fill(self.array, name + ".array", prefix, suffix, schemasuffix, storage, compression, **kwargs) + return fill(self.array, self.__class__.__name__ + ".array", prefix, suffix, schemasuffix, storage, compression, **kwargs) @property def generator(self): From f28011f2414f5dc18514f640bddaff081847679f Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Fri, 26 Oct 2018 08:37:26 -0500 Subject: [PATCH 21/27] consolidate serialization of functions --- awkward/array/objects.py | 28 ++++++---------------------- awkward/array/virtual.py | 15 +-------------- awkward/persist.py | 29 +++++++++++++++++++++++------ 3 files changed, 30 insertions(+), 42 deletions(-) diff --git a/awkward/array/objects.py b/awkward/array/objects.py index 38af3c44..53566883 100644 --- a/awkward/array/objects.py +++ b/awkward/array/objects.py @@ -114,28 +114,12 @@ def ones_like(self, **overrides): def __awkward_persist__(self, ident, fill, prefix, suffix, schemasuffix, storage, compression, **kwargs): self._valid() - - if self._generator.__module__ == "__main__": - raise TypeError("cannot persist ObjectArray: its generator is defined in __main__, which won't be available in a subsequent session") - if hasattr(self._generator, "__qualname__"): - spec = [self._generator.__module__] + self._generator.__qualname__.split(".") - else: - spec = [self._generator.__module__, self._generator.__name__] - - gen, genname = importlib.import_module(spec[0]), spec[1:] - while len(genname) > 0: - gen, genname = getattr(gen, genname[0]), genname[1:] - if gen is not self._generator: - raise TypeError("cannot persist ObjectArray: its generator cannot be found via its __name__ (Python 2) or __qualname__ (Python 3)") - - out = {"id": ident, - "call": ["awkward", self.__class__.__name__], - "args": [fill(self._content, self.__class__.__name__ + ".content", prefix, suffix, schemasuffix, storage, compression, **kwargs), - {"function": spec}, - {"tuple": [fill(x, self.__class__.__name__ + ".args", prefix, suffix, schemasuffix, storage, compression, **kwargs) for x in self._args]}, - {"dict": {n: fill(x, self.__class__.__name__ + ".kwargs", prefix, suffix, schemasuffix, storage, compression, **kwargs) for n, x in self._kwargs.items()}}]} - - return out + return {"id": ident, + "call": ["awkward", self.__class__.__name__], + "args": [fill(self._content, self.__class__.__name__ + ".content", prefix, suffix, schemasuffix, storage, compression, **kwargs), + fill(self._generator, self.__class__.__name__ + ".generator", prefix, suffix, schemasuffix, storage, compression, **kwargs), + {"tuple": [fill(x, self.__class__.__name__ + ".args", prefix, suffix, schemasuffix, storage, compression, **kwargs) for x in self._args]}, + {"dict": {n: fill(x, self.__class__.__name__ + ".kwargs", prefix, suffix, schemasuffix, storage, compression, **kwargs) for n, x in self._kwargs.items()}}]} @property def content(self): diff --git a/awkward/array/virtual.py b/awkward/array/virtual.py index 88a35d19..78ed378b 100644 --- a/awkward/array/virtual.py +++ b/awkward/array/virtual.py @@ -126,22 +126,9 @@ def __awkward_persist__(self, ident, fill, prefix, suffix, schemasuffix, storage self._valid() if self._persistvirtual: - if self._generator.__module__ == "__main__": - raise TypeError("cannot persist VirtualArray: its generator is defined in __main__, which won't be available in a subsequent session") - if hasattr(self._generator, "__qualname__"): - spec = [self._generator.__module__] + self._generator.__qualname__.split(".") - else: - spec = [self._generator.__module__, self._generator.__name__] - - gen, genname = importlib.import_module(spec[0]), spec[1:] - while len(genname) > 0: - gen, genname = getattr(gen, genname[0]), genname[1:] - if gen is not self._generator: - raise TypeError("cannot persist VirtualArray: its generator cannot be found via its __name__ (Python 2) or __qualname__ (Python 3)") - out = {"id": ident, "call": ["awkward", self.__class__.__name__], - "args": [{"function": spec}, + "args": [fill(self._generator, self.__class__.__name__ + ".generator", prefix, suffix, schemasuffix, storage, compression, **kwargs), {"tuple": [fill(x, self.__class__.__name__ + ".args", prefix, suffix, schemasuffix, storage, compression, **kwargs) for x in self._args]}, {"dict": {n: fill(x, self.__class__.__name__ + ".kwargs", prefix, suffix, schemasuffix, storage, compression, **kwargs) for n, x in self._kwargs.items()}}], "cacheable": True} diff --git a/awkward/persist.py b/awkward/persist.py index 3daa0437..d4af23f2 100644 --- a/awkward/persist.py +++ b/awkward/persist.py @@ -35,6 +35,7 @@ import numbers import os import pickle +import types import zipfile import zlib try: @@ -277,13 +278,13 @@ def serialize(obj, storage, name=None, delimiter="-", suffix=None, schemasuffix= x = {"pair": x} minsize = x.get("minsize", 0) - types = x.get("types", (object,)) - if not isinstance(types, tuple): - types = (types,) + tpes = x.get("types", (object,)) + if not isinstance(tpes, tuple): + tpes = (tpes,) contexts = x.get("contexts", "*") pair = x["pair"] - normalized.append({"minsize": minsize, "types": types, "contexts": contexts, "pair": pair}) + normalized.append({"minsize": minsize, "types": tpes, "contexts": contexts, "pair": pair}) seen = {} def fill(obj, context, prefix, suffix, schemasuffix, storage, compression, **kwargs): @@ -303,8 +304,8 @@ def fill(obj, context, prefix, suffix, schemasuffix, storage, compression, **kwa dtype = obj.dtype for policy in normalized: - minsize, types, contexts, pair = policy["minsize"], policy["types"], policy["contexts"], policy["pair"] - if obj.nbytes >= minsize and issubclass(obj.dtype.type, tuple(types)) and any(fnmatch.fnmatchcase(context, p) for p in contexts): + minsize, tpes, contexts, pair = policy["minsize"], policy["types"], policy["contexts"], policy["pair"] + if obj.nbytes >= minsize and issubclass(obj.dtype.type, tuple(tpes)) and any(fnmatch.fnmatchcase(context, p) for p in contexts): compress, decompress = pair storage[prefix + str(ident) + suffix] = compress(obj) @@ -325,6 +326,22 @@ def fill(obj, context, prefix, suffix, schemasuffix, storage, compression, **kwa elif hasattr(obj, "__awkward_persist__"): return obj.__awkward_persist__(ident, fill, prefix, suffix, schemasuffix, storage, compression, **kwargs) + elif isinstance(obj, (types.FunctionType, type)): + if obj.__module__ == "__main__": + raise TypeError("cannot persist an object containing functions defined on the console (__main__)") + if hasattr(obj, "__qualname__"): + spec = [obj.__module__] + obj.__qualname__.split(".") + else: + spec = [obj.__module__, obj.__name__] + + gen, genname = importlib.import_module(spec[0]), spec[1:] + while len(genname) > 0: + gen, genname = getattr(gen, genname[0]), genname[1:] + if gen is not obj: + raise TypeError("cannot persist a function or class that cannot be found via its __name__ (Python 2) or __qualname__ (Python 3)") + + return {"function": spec} + else: try: obj = jsonable(obj) From 1aaa4b36ca962a5a1a008a38098932de19ecc97f Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Fri, 26 Oct 2018 08:50:48 -0500 Subject: [PATCH 22/27] more consistent about checking for id --- awkward/persist.py | 62 +++++++++++++++++++++++----------------------- 1 file changed, 31 insertions(+), 31 deletions(-) diff --git a/awkward/persist.py b/awkward/persist.py index d4af23f2..be765e25 100644 --- a/awkward/persist.py +++ b/awkward/persist.py @@ -35,7 +35,6 @@ import numbers import os import pickle -import types import zipfile import zlib try: @@ -326,29 +325,29 @@ def fill(obj, context, prefix, suffix, schemasuffix, storage, compression, **kwa elif hasattr(obj, "__awkward_persist__"): return obj.__awkward_persist__(ident, fill, prefix, suffix, schemasuffix, storage, compression, **kwargs) - elif isinstance(obj, (types.FunctionType, type)): - if obj.__module__ == "__main__": - raise TypeError("cannot persist an object containing functions defined on the console (__main__)") - if hasattr(obj, "__qualname__"): - spec = [obj.__module__] + obj.__qualname__.split(".") - else: - spec = [obj.__module__, obj.__name__] + else: + if hasattr(obj, "__module__") and (hasattr(obj, "__qualname__") or hasattr(obj, "__name__")) and obj.__module__ != "__main__": + if hasattr(obj, "__qualname__"): + spec = [obj.__module__] + obj.__qualname__.split(".") + else: + spec = [obj.__module__, obj.__name__] - gen, genname = importlib.import_module(spec[0]), spec[1:] - while len(genname) > 0: - gen, genname = getattr(gen, genname[0]), genname[1:] - if gen is not obj: - raise TypeError("cannot persist a function or class that cannot be found via its __name__ (Python 2) or __qualname__ (Python 3)") + gen, genname = importlib.import_module(spec[0]), spec[1:] + while len(genname) > 0: + gen, genname = getattr(gen, genname[0]), genname[1:] - return {"function": spec} + if gen is obj: + return {"id": ident, "function": spec} - else: try: obj = jsonable(obj) except TypeError: - return {"call": ["awkward.persist", "topython"], "args": [awkward.persist.frompython(obj)]} + try: + return {"id": ident, "call": ["awkward.persist", "topython"], "args": [awkward.persist.frompython(obj)]} + except Exception as err: + raise TypeError("could not persist component as an array, awkward-array, importable function/class, JSON, or pickle; pickle error is\n\n {0}: {1}".format(err.__class__.__name__, str(err))) else: - return {"json": obj} + return {"id": ident, "json": obj} schema = {"awkward": awkward.version.__version__, "schema": fill(obj, "", prefix, suffix, schemasuffix, storage, compression, **kwargs)} @@ -396,46 +395,47 @@ def unfill(schema): kwargs.update({n: unfill(x) for n, x in schema["**"].items()}) out = gen(*args, **kwargs) - if "id" in schema: - seen[schema["id"]] = out - return out elif "read" in schema: if schema.get("absolute", False): - return storage[schema["read"]] + out = storage[schema["read"]] else: - return storage[prefix + schema["read"]] + out = storage[prefix + schema["read"]] elif "list" in schema: - return [unfill(x) for x in schema["list"]] + out = [unfill(x) for x in schema["list"]] elif "tuple" in schema: - return tuple(unfill(x) for x in schema["tuple"]) + out = tuple(unfill(x) for x in schema["tuple"]) elif "pairs" in schema: - return [(n, unfill(x)) for n, x in schema["pairs"]] + out = [(n, unfill(x)) for n, x in schema["pairs"]] elif "dict" in schema: - return {n: unfill(x) for n, x in schema["dict"].items()} + out = {n: unfill(x) for n, x in schema["dict"].items()} elif "dtype" in schema: - return json2dtype(schema["dtype"]) + out = json2dtype(schema["dtype"]) elif "function" in schema: - return spec2function(schema["function"], whitelist=whitelist) + out = spec2function(schema["function"], whitelist=whitelist) elif "json" in schema: - return schema["json"] + out = schema["json"] elif "ref" in schema: if schema["ref"] in seen: - return seen[schema["ref"]] + out = seen[schema["ref"]] else: - return awkward.array.virtual.VirtualArray(lambda: seen[schema["ref"]]) + out = awkward.array.virtual.VirtualArray(lambda: seen[schema["ref"]]) else: raise ValueError("unrecognized JSON object with fields {0}".format(", ".join(repr(x) for x in schema))) + if "id" in schema: + seen[schema["id"]] = out + return out + elif isinstance(schema, list): raise ValueError("unrecognized JSON list with length {0}".format(len(schema))) From 1855276c9f8dea3302725384d12d2ad36bdab5fe Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Fri, 26 Oct 2018 10:25:51 -0500 Subject: [PATCH 23/27] much of the infrastructure is there, but VirtualArrays will have to ignore missing masks --- awkward/array/virtual.py | 7 +++-- awkward/arrow.py | 54 +++++++++++++++++++++++++-------- awkward/type.py | 64 +++++++++++++++++++++++++++++++++------- 3 files changed, 99 insertions(+), 26 deletions(-) diff --git a/awkward/array/virtual.py b/awkward/array/virtual.py index 78ed378b..c70de355 100644 --- a/awkward/array/virtual.py +++ b/awkward/array/virtual.py @@ -297,8 +297,11 @@ def materialize(self): for n in self._delitem: del array[n] - if self._type is not None and self._type != awkward.type.fromarray(array): - raise TypeError("materialized array has type\n\n{0}\n\nexpected type\n\n{1}".format(awkward.type._str(awkward.type.fromarray(array), indent=" "), awkward.type._str(self._type, indent=" "))) + if self._type is not None: + materializedtype = awkward.type.fromarray(array) + if ((isinstance(self._type, awkward.type.Type) and not self._type._eq(materializedtype, set(), ignoremask=True)) or + (not isinstance(self._type, awkward.type.Type) and not self._type == materializedtype)): + raise TypeError("materialized array has type\n\n{0}\n\nexpected type\n\n{1}".format(awkward.type._str(awkward.type.fromarray(array), indent=" "), awkward.type._str(self._type, indent=" "))) if self._cache is None: # states (1), (2), and (6) diff --git a/awkward/arrow.py b/awkward/arrow.py index 53ee069b..53f7d786 100644 --- a/awkward/arrow.py +++ b/awkward/arrow.py @@ -33,6 +33,7 @@ import awkward.array.jagged import awkward.array.masked import awkward.array.table +import awkward.array.virtual import awkward.derived.strings import awkward.type import awkward.util @@ -43,6 +44,8 @@ ARROW_CHARTYPE = awkward.util.numpy.uint8 def schema2type(schema): + import pyarrow + def recurse(tpe, nullable): if isinstance(tpe, pyarrow.lib.DictionaryType): out = recurse(tpe.dictionary.type, nullable) @@ -266,21 +269,46 @@ def popbuffers(tpe, buffers): else: raise NotImplementedError(type(obj)) - - -def fromparquet(file, cache=None, metadata=None, common_metadata=None): - import pyarrow.parquet - - pqfile = pyarrow.parquet.ParquetFile(file, metadata=metadata, common_metadata=common_metadata) - tpe = schema2type(pqfile.schema.to_arrow_schema()) - names = tpe.columns +class ParquetFile(object): + def __init__(self, file, cache=None, metadata=None, common_metadata=None): + self.file = file + self.cache = cache + self.metadata = metadata + self.common_metadata = common_metadata + self._init() + + def _init(self): + import pyarrow.parquet + self.parquetfile = pyarrow.parquet.ParquetFile(self.file, metadata=self.metadata, common_metadata=self.common_metadata) + self.type = schema2type(self.parquetfile.schema.to_arrow_schema()) + + def __getstate__(self): + return {"file": self.file, "metadata": self.metadata, "common_metadata": self.common_metadata} + + def __setstate__(self, state): + self.file = state["file"] + self.cache = None + self.metadata = state["metadata"] + self.common_metadata = state["common_metadata"] + self._init() + + def __call__(self, rowgroup, column): + print("read", rowgroup, column) + return view(self.parquetfile.read_row_group(rowgroup, columns=[column]))[column] + +def fromparquet(file, cache=None, persistvirtual=False, metadata=None, common_metadata=None): + parquetfile = ParquetFile(file, cache=cache, metadata=metadata, common_metadata=common_metadata) + columns = parquetfile.type.columns chunks = [] counts = [] - for i in range(pqfile.num_row_groups): - numrows = pqfile.metadata.row_group(i).num_rows - - + for i in range(parquetfile.parquetfile.num_row_groups): + numrows = parquetfile.parquetfile.metadata.row_group(i).num_rows + if numrows > 0: + chunk = awkward.array.table.Table() + for n in columns: + chunk[n] = awkward.array.virtual.VirtualArray(parquetfile, (i, n), type=awkward.type.ArrayType(numrows, parquetfile.type[n])) + chunks.append(chunk) + counts.append(numrows) - return awkward.array.chunked.ChunkedArray(chunks, counts) diff --git a/awkward/type.py b/awkward/type.py index 24fc85d3..ce103b0d 100644 --- a/awkward/type.py +++ b/awkward/type.py @@ -206,7 +206,7 @@ def __eq__(self, other): else: one = Type._canonical(Type._copy(self, {}), set()) two = Type._canonical(Type._copy(other, {}), set()) - return one._eq(two, set()) + return one._eq(two, set(), ignoremask=False) def __ne__(self, other): return not self.__eq__(other) @@ -329,15 +329,23 @@ def _substr(self, labeled, seen, indent): to = str(self._to) return takes + to - def _eq(self, other, seen): + def _eq(self, other, seen, ignoremask=False): if self is other: return True elif id(self) in seen: return False else: seen.add(id(self)) - return isinstance(other, ArrayType) and self._takes == other._takes and self._to == other._to - + if isinstance(other, ArrayType) and self._takes == other._takes: + if isinstance(self._to, Type) and isinstance(other._to, Type): + return self._to._eq(other._to, seen, ignoremask=ignoremask) + elif not isinstance(self._to, Type) and not isinstance(other._to, Type): + return self._to == other._to + else: + return False + else: + return False + def __hash__(self): return hash((ArrayType, self._takes, self._to)) @@ -417,14 +425,27 @@ def _substr(self, labeled, seen, indent): out.append(("{0}{1:%ds} -> {2}" % width).format(indent, repr(n), to)) return "\n".join(out).lstrip(" ") - def _eq(self, other, seen): + def _eq(self, other, seen, ignoremask=False): if self is other: return True elif id(self) in seen: return False else: seen.add(id(self)) - return isinstance(other, TableType) and [(n, self._fields[n]) for n in sorted(self._fields)] == [(n, other._fields[n]) for n in sorted(other._fields)] + if isinstance(other, TableType) and sorted(self._fields) == sorted(other._fields): + for n in self._fields: + if isinstance(self._fields[n], Type) and isinstance(other._fields[n], Type): + if not self._fields[n]._eq(other._fields[n], seen, ignoremask=ignoremask): + return False + elif not isinstance(self._fields[n], Type) and not isinstance(other._fields[n], Type): + if not self._fields[n] == other._fields[n]: + return False + else: + return False + else: + return True # nothing failed in the loop over fields + else: + return False def __hash__(self): return hash((TableType, tuple((n, self._fields[n]) for n in sorted(self._fields)))) @@ -487,14 +508,27 @@ def lstrip(x): out = [x + " " * (width - len(lstrip(x.split("\n")[-1]))) for x in subs] return "(" + (" |\n" + indent + " ").join(out) + " )" - def _eq(self, other, seen): + def _eq(self, other, seen, ignoremask=False): if self is other: return True elif id(self) in seen: return False else: seen.add(id(self)) - return isinstance(other, UnionType) and set(self._possibilities) == set(other._possibilities) + if isinstance(other, UnionType) and len(self._possibilities) == len(other._possibilities): + for x, y in zip(sorted(self._possibilities), sorted(self._possibilities)): + if isinstance(x, Type) and isinstance(y, Type): + if not x._eq(y, seen, ignoremask=ignoremask): + return False + elif not isinstance(x, Type) and not isinstance(y, Type): + if not x == y: + return False + else: + return False + else: + return True # nothing failed in the loop over possibilities + else: + return False def __hash__(self): return hash((UnionType, tuple(self._possibilities))) @@ -512,7 +546,7 @@ def type(self, value): if isinstance(value, Type): self._type = value else: - self._type = self._finaltype(self._type) + self._type = self._finaltype(value) @property def shape(self): @@ -547,14 +581,22 @@ def _substr(self, labeled, seen, indent): type = str(self._type) return "?({0})".format(type) - def _eq(self, other, seen): + def _eq(self, other, seen, ignoremask=False): if self is other: return True elif id(self) in seen: return False else: seen.add(id(self)) - return isinstance(other, OptionType) and self._type == other._type + if isinstance(other, OptionType) and self._type._eq(other._type, seen, ignoremask=ignoremask): + return True + if ignoremask: # applied asymmetrically; only the left can ignore mask + if isinstance(other, Type): + return self._type._eq(other, seen, ignoremask=ignoremask) + else: + return self._type == other + else: + return False def __hash__(self): return hash((OptionType, self._type)) From c844feb1e0e2f6f339a6c8fb6872a7c73bd533cc Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Fri, 26 Oct 2018 10:53:25 -0500 Subject: [PATCH 24/27] reading Parquet lazily --- awkward/arrow.py | 1 - awkward/type.py | 22 ++++++++-------------- 2 files changed, 8 insertions(+), 15 deletions(-) diff --git a/awkward/arrow.py b/awkward/arrow.py index 53f7d786..fae54cdd 100644 --- a/awkward/arrow.py +++ b/awkward/arrow.py @@ -293,7 +293,6 @@ def __setstate__(self, state): self._init() def __call__(self, rowgroup, column): - print("read", rowgroup, column) return view(self.parquetfile.read_row_group(rowgroup, columns=[column]))[column] def fromparquet(file, cache=None, persistvirtual=False, metadata=None, common_metadata=None): diff --git a/awkward/type.py b/awkward/type.py index ce103b0d..74064335 100644 --- a/awkward/type.py +++ b/awkward/type.py @@ -337,15 +337,13 @@ def _eq(self, other, seen, ignoremask=False): else: seen.add(id(self)) if isinstance(other, ArrayType) and self._takes == other._takes: - if isinstance(self._to, Type) and isinstance(other._to, Type): + if isinstance(self._to, Type): return self._to._eq(other._to, seen, ignoremask=ignoremask) - elif not isinstance(self._to, Type) and not isinstance(other._to, Type): - return self._to == other._to else: - return False + return self._to == other._to else: return False - + def __hash__(self): return hash((ArrayType, self._takes, self._to)) @@ -434,14 +432,12 @@ def _eq(self, other, seen, ignoremask=False): seen.add(id(self)) if isinstance(other, TableType) and sorted(self._fields) == sorted(other._fields): for n in self._fields: - if isinstance(self._fields[n], Type) and isinstance(other._fields[n], Type): + if isinstance(self._fields[n], Type): if not self._fields[n]._eq(other._fields[n], seen, ignoremask=ignoremask): return False - elif not isinstance(self._fields[n], Type) and not isinstance(other._fields[n], Type): + else: if not self._fields[n] == other._fields[n]: return False - else: - return False else: return True # nothing failed in the loop over fields else: @@ -517,14 +513,12 @@ def _eq(self, other, seen, ignoremask=False): seen.add(id(self)) if isinstance(other, UnionType) and len(self._possibilities) == len(other._possibilities): for x, y in zip(sorted(self._possibilities), sorted(self._possibilities)): - if isinstance(x, Type) and isinstance(y, Type): + if isinstance(x, Type): if not x._eq(y, seen, ignoremask=ignoremask): return False - elif not isinstance(x, Type) and not isinstance(y, Type): + else: if not x == y: return False - else: - return False else: return True # nothing failed in the loop over possibilities else: @@ -591,7 +585,7 @@ def _eq(self, other, seen, ignoremask=False): if isinstance(other, OptionType) and self._type._eq(other._type, seen, ignoremask=ignoremask): return True if ignoremask: # applied asymmetrically; only the left can ignore mask - if isinstance(other, Type): + if isinstance(self._type, Type): return self._type._eq(other, seen, ignoremask=ignoremask) else: return self._type == other From 91faba26027cfdc18e9f1f966dd392c0f3044e00 Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Fri, 26 Oct 2018 11:17:35 -0500 Subject: [PATCH 25/27] handle Python as a JSON type --- awkward/array/virtual.py | 3 ++- awkward/arrow.py | 2 +- awkward/persist.py | 9 ++++++++- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/awkward/array/virtual.py b/awkward/array/virtual.py index c70de355..38be4711 100644 --- a/awkward/array/virtual.py +++ b/awkward/array/virtual.py @@ -137,7 +137,8 @@ def __awkward_persist__(self, ident, fill, prefix, suffix, schemasuffix, storage try: others["persistentkey"] = {"json": awkward.persist.jsonable(self._persistentkey)} except TypeError: - others["persistentkey"] = {"call": ["awkward.persist", "topython"], "args": [awkward.persist.frompython(self._persistentkey)]} + others["persistentkey"] = {"python": awkward.persist.frompython(self._persistentkey)} + if self._type is not None: others["type"] = {"call": ["awkward.persist", "json2type"], "args": [{"json": awkward.persist.type2json(self._type)}], "whitelistable": True} if len(others) > 0: diff --git a/awkward/arrow.py b/awkward/arrow.py index fae54cdd..4c3bcc58 100644 --- a/awkward/arrow.py +++ b/awkward/arrow.py @@ -306,7 +306,7 @@ def fromparquet(file, cache=None, persistvirtual=False, metadata=None, common_me if numrows > 0: chunk = awkward.array.table.Table() for n in columns: - chunk[n] = awkward.array.virtual.VirtualArray(parquetfile, (i, n), type=awkward.type.ArrayType(numrows, parquetfile.type[n])) + chunk[n] = awkward.array.virtual.VirtualArray(parquetfile, (i, n), cache=cache, type=awkward.type.ArrayType(numrows, parquetfile.type[n]), persistvirtual=persistvirtual) chunks.append(chunk) counts.append(numrows) diff --git a/awkward/persist.py b/awkward/persist.py index be765e25..4224715b 100644 --- a/awkward/persist.py +++ b/awkward/persist.py @@ -343,7 +343,8 @@ def fill(obj, context, prefix, suffix, schemasuffix, storage, compression, **kwa obj = jsonable(obj) except TypeError: try: - return {"id": ident, "call": ["awkward.persist", "topython"], "args": [awkward.persist.frompython(obj)]} + return {"id": ident, "python": awkward.persist.frompython(obj)} + except Exception as err: raise TypeError("could not persist component as an array, awkward-array, importable function/class, JSON, or pickle; pickle error is\n\n {0}: {1}".format(err.__class__.__name__, str(err))) else: @@ -423,6 +424,9 @@ def unfill(schema): elif "json" in schema: out = schema["json"] + elif "python" in schema: + out = topython(schema["python"]) + elif "ref" in schema: if schema["ref"] in seen: out = seen[schema["ref"]] @@ -505,6 +509,9 @@ def recurse(schema): elif "json" in schema: pass + elif "python" in schema: + pass + elif "ref" in schema: pass From 05f85a4b76c099535fde5a480d7725856293c700 Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Fri, 26 Oct 2018 11:20:19 -0500 Subject: [PATCH 26/27] bump version number; this is a release --- awkward/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/awkward/version.py b/awkward/version.py index 3ce216aa..af1a48e2 100644 --- a/awkward/version.py +++ b/awkward/version.py @@ -30,7 +30,7 @@ import re -__version__ = "0.3.0" +__version__ = "0.4.0" version = __version__ version_info = tuple(re.split(r"[-\.]", __version__)) From 28e4862218682d84c1b80a218805bdae0a6c2c1c Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Fri, 26 Oct 2018 11:25:39 -0500 Subject: [PATCH 27/27] addresses #16: put in empty help messages for all arrays to clobber the wrong one --- awkward/array/base.py | 8 ++++++++ awkward/array/chunked.py | 8 ++++++++ awkward/array/indexed.py | 12 ++++++++++++ awkward/array/jagged.py | 8 ++++++++ awkward/array/masked.py | 12 ++++++++++++ awkward/array/objects.py | 8 ++++++++ awkward/array/table.py | 8 ++++++++ awkward/array/union.py | 4 ++++ awkward/array/virtual.py | 4 ++++ awkward/derived/strings.py | 8 ++++++++ 10 files changed, 80 insertions(+) diff --git a/awkward/array/base.py b/awkward/array/base.py index 532243e9..fed473f9 100644 --- a/awkward/array/base.py +++ b/awkward/array/base.py @@ -35,6 +35,10 @@ import awkward.util class AwkwardArray(awkward.util.NDArrayOperatorsMixin): + """ + AwkwardArray: abstract base class + """ + def __array__(self, *args, **kwargs): # hitting this function is usually undesirable; uncomment to search for performance bugs # raise Exception("{0} {1}".format(args, kwargs)) @@ -187,6 +191,10 @@ def minby(self, function): return self[function(*args, **kwargs).argmin()] class AwkwardArrayWithContent(AwkwardArray): + """ + AwkwardArrayWithContent: abstract base class + """ + def __setitem__(self, where, what): if isinstance(where, awkward.util.string): self._content[where] = what diff --git a/awkward/array/chunked.py b/awkward/array/chunked.py index fecde3e3..9fa37caf 100644 --- a/awkward/array/chunked.py +++ b/awkward/array/chunked.py @@ -34,6 +34,10 @@ import awkward.util class ChunkedArray(awkward.array.base.AwkwardArray): + """ + ChunkedArray + """ + def __init__(self, chunks, counts=[]): self.chunks = chunks self.counts = counts @@ -610,6 +614,10 @@ def pandas(self): raise NotImplementedError class AppendableArray(ChunkedArray): + """ + AppendableArray + """ + def __init__(self, chunkshape, dtype, chunks=[]): self.chunkshape = chunkshape self.dtype = dtype diff --git a/awkward/array/indexed.py b/awkward/array/indexed.py index efa8f425..a90702fa 100644 --- a/awkward/array/indexed.py +++ b/awkward/array/indexed.py @@ -46,6 +46,10 @@ def invert(permutation): return out class IndexedArray(awkward.array.base.AwkwardArrayWithContent): + """ + IndexedArray + """ + def __init__(self, index, content): self.index = index self.content = content @@ -204,6 +208,10 @@ def pandas(self): return self._content[self._index].pandas() class ByteIndexedArray(IndexedArray): + """ + ByteIndexedArray + """ + def __init__(self, index, content, dtype): super(ByteIndexedArray, self).__init__(index, content) self.dtype = dtype @@ -363,6 +371,10 @@ def pandas(self): raise NotImplementedError class SparseArray(awkward.array.base.AwkwardArrayWithContent): + """ + SparseArray + """ + def __init__(self, length, index, content, default=None): self.length = length self.index = index diff --git a/awkward/array/jagged.py b/awkward/array/jagged.py index b08f1e4b..888adce8 100644 --- a/awkward/array/jagged.py +++ b/awkward/array/jagged.py @@ -102,6 +102,10 @@ def uniques2offsetsparents(uniques): return offsets, parents class JaggedArray(awkward.array.base.AwkwardArrayWithContent): + """ + JaggedArray + """ + def __init__(self, starts, stops, content): self.starts = starts self.stops = stops @@ -1089,6 +1093,10 @@ def pandas(self): return out class ByteJaggedArray(JaggedArray): + """ + ByteJaggedArray + """ + def __init__(self, starts, stops, content, subdtype): super(ByteJaggedArray, self).__init__(starts, stops, content) self.subdtype = subdtype diff --git a/awkward/array/masked.py b/awkward/array/masked.py index 5d225d05..e470100b 100644 --- a/awkward/array/masked.py +++ b/awkward/array/masked.py @@ -36,6 +36,10 @@ import awkward.util class MaskedArray(awkward.array.base.AwkwardArrayWithContent): + """ + MaskedArray + """ + ### WTF were the designers of numpy.ma thinking? # @staticmethod # def is_masked(x): @@ -278,6 +282,10 @@ def pandas(self): raise NotImplementedError class BitMaskedArray(MaskedArray): + """ + BitMaskedArray + """ + def __init__(self, mask, content, maskedwhen=True, lsborder=False): super(BitMaskedArray, self).__init__(mask, content, maskedwhen=maskedwhen) self.lsborder = lsborder @@ -505,6 +513,10 @@ def pandas(self): raise NotImplementedError class IndexedMaskedArray(MaskedArray): + """ + IndexedMaskedArray + """ + def __init__(self, mask, content, maskedwhen=-1): super(IndexedMaskedArray, self).__init__(mask, content, maskedwhen=maskedwhen) self._isvalid = False diff --git a/awkward/array/objects.py b/awkward/array/objects.py index 53566883..31fd10ce 100644 --- a/awkward/array/objects.py +++ b/awkward/array/objects.py @@ -36,6 +36,10 @@ import awkward.util class Methods(object): + """ + Methods: abstract mix-in + """ + @staticmethod def mixin(methods, awkwardtype): assert issubclass(methods, Methods) @@ -55,6 +59,10 @@ def maybemixin(sample, awkwardtype): return awkwardtype class ObjectArray(awkward.array.base.AwkwardArrayWithContent): + """ + ObjectArray + """ + def __init__(self, content, generator, args=(), kwargs={}): self.content = content self.generator = generator diff --git a/awkward/array/table.py b/awkward/array/table.py index 5e4e690d..753accec 100644 --- a/awkward/array/table.py +++ b/awkward/array/table.py @@ -35,9 +35,17 @@ import awkward.util class Table(awkward.array.base.AwkwardArray): + """ + Table + """ + ##################### class Row class Row(object): + """ + Table.Row + """ + __slots__ = ["_table", "_index"] def __init__(self, table, index): diff --git a/awkward/array/union.py b/awkward/array/union.py index e4c4c735..0be9f873 100644 --- a/awkward/array/union.py +++ b/awkward/array/union.py @@ -33,6 +33,10 @@ import awkward.util class UnionArray(awkward.array.base.AwkwardArray): + """ + UnionArray + """ + def __init__(self, tags, index, contents): self.tags = tags self.index = index diff --git a/awkward/array/virtual.py b/awkward/array/virtual.py index 38be4711..78febb5e 100644 --- a/awkward/array/virtual.py +++ b/awkward/array/virtual.py @@ -36,6 +36,10 @@ import awkward.util class VirtualArray(awkward.array.base.AwkwardArray): + """ + VirtualArray + """ + class TransientKey(object): def __init__(self, id): self._id = id diff --git a/awkward/derived/strings.py b/awkward/derived/strings.py index d3a0c276..ab1798c1 100644 --- a/awkward/derived/strings.py +++ b/awkward/derived/strings.py @@ -35,6 +35,10 @@ import awkward.array.objects class StringMethods(object): + """ + StringMethods + """ + def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): if method != "__call__": raise NotImplemented @@ -90,6 +94,10 @@ def tostring(x, decoder): return decoder(x, errors="replace")[0] class StringArray(StringMethods, awkward.array.objects.ObjectArray): + """ + StringArray + """ + def __init__(self, starts, stops, content, encoding="utf-8"): self._content = awkward.array.jagged.ByteJaggedArray(starts, stops, content, awkward.util.CHARTYPE) self._generator = tostring