Merge pull request #50 from scikit-hep/issue-49

Issue 49
scikit-hep · Nov 30, 2018 · f0c3f7d · f0c3f7d
2 parents 2b5a069 + ae97261
commit f0c3f7d
Show file tree

Hide file tree

Showing 12 changed files with 142 additions and 53 deletions.
diff --git a/awkward/array/base.py b/awkward/array/base.py
@@ -56,9 +56,15 @@ class AwkwardArray(awkward.util.NDArrayOperatorsMixin):
     def at(self):
         return At(self)
 
+    allow_tonumpy = True
+    allow_iter = True
+
+    def _checktonumpy(self):
+        if not self.allow_tonumpy:
+            raise RuntimeError("awkward.array.base.AwkwardArray.allow_tonumpy is False; refusing to convert to Numpy")
+
     def __array__(self, dtype=None):
-        # hitting this function is usually undesirable; uncomment to search for performance bugs
-        # raise Exception
+        self._checktonumpy()
 
         if dtype is None:
             dtype = self.dtype
@@ -78,15 +84,21 @@ def __setstate__(self, state):
         self.__dict__.update(out.__dict__)
         self.__class__ = out.__class__
 
-    def __iter__(self):
+    def _checkiter(self):
+        if not self.allow_iter:
+            raise RuntimeError("awkward.array.base.AwkwardArray.allow_iter is False; refusing to iterate")
+
+    def __iter__(self, checkiter=True):
+        if checkiter:
+            self._checkiter()
         for i in range(len(self)):
             yield self[i]
 
     def __str__(self):
         if len(self) <= 6:
-            return "[{0}]".format(" ".join(awkward.util.array_str(x) for x in self))
+            return "[{0}]".format(" ".join(awkward.util.array_str(x) for x in self.__iter__(checkiter=False)))
         else:
-            return "[{0} ... {1}]".format(" ".join(awkward.util.array_str(x) for x in self[:3]), " ".join(awkward.util.array_str(x) for x in self[-3:]))
+            return "[{0} ... {1}]".format(" ".join(awkward.util.array_str(x) for x in self[:3].__iter__(checkiter=False)), " ".join(awkward.util.array_str(x) for x in self[-3:].__iter__(checkiter=False)))
 
     def __repr__(self):
         return "<{0} {1} at {2:012x}>".format(self.__class__.__name__, str(self), id(self))

diff --git a/awkward/array/chunked.py b/awkward/array/chunked.py
@@ -279,20 +279,24 @@ def __str__(self):
         if self.countsknown:
             return super(ChunkedArray, self).__str__()
         else:
-            strs = [awkward.util.array_str(x) for x in self[:7]]
+            strs = [awkward.util.array_str(x) for x in self[:7].__iter__(checkiter=False)]
             if len(strs) < 7:
                 return super(ChunkedArray, self).__str__()
             else:
                 return "[{0} ...]".format(" ".join(strs))
 
-    def __iter__(self):
+    def __iter__(self, checkiter=True):
+        if checkiter:
+            self._checkiter()
         for i, chunk in enumerate(self._chunks):
             if i >= len(self._counts):
                 self._counts.append(len(chunk))
             for x in chunk[:self._counts[i]]:
                 yield x
 
     def __array__(self, *args, **kwargs):
+        self._checktonumpy()
+
         if isinstance(self.type.to, awkward.util.numpy.dtype):
             if len(self) == 0:
                 return awkward.util.numpy.empty(0, dtype=awkward.util.DEFAULTTYPE)

diff --git a/awkward/array/indexed.py b/awkward/array/indexed.py
@@ -138,7 +138,9 @@ def _valid(self):
 
             self._isvalid = True
 
-    def __iter__(self):
+    def __iter__(self, checkiter=True):
+        if checkiter:
+            self._checkiter()
         self._valid()
         for i in self._index:
             yield self._content[i]
@@ -303,7 +305,9 @@ def _valid(self):
 
             self._isvalid = True
 
-    def __iter__(self):
+    def __iter__(self, checkiter=True):
+        if checkiter:
+            self._checkiter()
         self._valid()
         itemsize = self._dtype.itemsize
         for i in self._index:
@@ -530,7 +534,9 @@ def _valid(self):
 
             self._isvalid = True
 
-    def __iter__(self):
+    def __iter__(self, checkiter=True):
+        if checkiter:
+            self._checkiter()
         self._valid()
 
         length = self._length

diff --git a/awkward/array/jagged.py b/awkward/array/jagged.py
@@ -55,6 +55,8 @@ def offsets2parents(offsets):
     out = awkward.util.numpy.zeros(offsets[-1], dtype=awkward.util.INDEXTYPE)
     awkward.util.numpy.add.at(out, offsets[offsets != offsets[-1]][1:], 1)
     awkward.util.numpy.cumsum(out, out=out)
+    if offsets[0] > 0:
+        out[:offsets[0]] = -1
     return out
 
 def startsstops2parents(starts, stops):
@@ -416,10 +418,12 @@ def _validstartsstops(starts, stops):
         if starts.shape[1:] != stops.shape[1:]:
             raise ValueError("starts and stops must have the same dimensionality (shape[1:])")
 
-    def __iter__(self):
+    def __iter__(self, checkiter=True):
+        if checkiter:
+            self._checkiter()
         self._valid()
         if len(self._starts.shape) != 1:
-            for x in super(JaggedArray, self).__iter__():
+            for x in super(JaggedArray, self).__iter__(checkiter=checkiter):
                 yield x
         else:
             stops = self._stops
@@ -481,14 +485,13 @@ def __getitem__(self, where):
 
                 offsets = counts2offsets(intheadsum)
 
-                return self.copy(starts=offsets[:-1].reshape(intheadsum.shape), stops=offsets[1:].reshape(intheadsum.shape), content=thyself._content[head._content])
+                headcontent = awkward.util.numpy.array(head._content, dtype=awkward.util.BOOLTYPE)
+                headcontent[head.parents < 0] = False
 
-            elif len(head.shape) == 1:
-                raise TypeError("jagged index must be boolean (mask) or integer (fancy indexing)")
+                return self.copy(starts=offsets[:-1].reshape(intheadsum.shape), stops=offsets[1:].reshape(intheadsum.shape), content=thyself._content[headcontent])
 
             else:
-                # the other cases are possible, but complicated; the first sets the form
-                raise NotImplementedError("jagged index content type: {0}".format(head._content.dtype))
+                raise TypeError("jagged index must be boolean (mask) or integer (fancy indexing)")
 
         else:
             starts = self._starts[head]
@@ -666,24 +669,24 @@ def _tojagged(self, starts=None, stops=None, copy=True):
             return self.copy(starts=starts, stops=stops, content=(awkward.util.deepcopy(self._content) if copy else self._content))
 
         else:
-            out = self.copy(starts=starts, stops=stops, content=awkward.util.numpy.empty(stops.max(), dtype=self._content.dtype))
-
-            if offsetsaliased(self._starts, self._stops) or awkward.util.numpy.array_equal(self._starts[1:], self._stops[:-1]):
-                content = self._content[self._starts[0]:self._stops[-1]]
-            elif (self._starts[:-1] < self._starts[1:]).all():
-                content = self._content[awkward.util.numpy.arange(len(self.parents), dtype=awkward.util.INDEXTYPE)[self.parents >= 0]]
-            else:
-                order = awkward.util.numpy.argsort(self.parents, kind="mergesort")
-                content = self._content[order[self.parents[order] >= 0]]
-
-            if offsetsaliased(starts, stops) or awkward.util.numpy.array_equal(starts[1:], stops[:-1]):
-                out._content[starts[0]:stops[-1]] = content
-            elif (starts[:-1] < starts[1:]).all():
-                out._content[awkward.util.numpy.arange(len(out.parents), dtype=awkward.util.INDEXTYPE)[out.parents >= 0]] = content
+            if offsetsaliased(starts, stops):
+                parents = offsets2parents(starts.base)
+            elif len(starts.shape) == 1 and awkward.util.numpy.array_equal(starts[1:], stops[:-1]):
+                if len(self._stops) == 0:
+                    offsets = awkward.util.numpy.array([0], dtype=awkward.util.INDEXTYPE)
+                else:
+                    offsets = awkward.util.numpy.append(starts, stops[-1])
+                parents = offsets2parents(offsets)
             else:
-                order = awkward.util.numpy.argsort(out.parents, kind="mergesort")
-                out._content[order[out.parents[order] >= 0]] = content
-
+                parents = startsstops2parents(starts, stops)
+
+            good = (parents >= 0)
+            increase = awkward.util.numpy.arange(len(parents), dtype=awkward.util.INDEXTYPE)
+            increase[good] -= increase[starts[parents[good]]]
+            index = self._starts[parents]
+            index += increase
+            out = self.copy(starts=starts, stops=stops, content=self._content[index])
+            out._parents = parents
             return out
 
     def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
@@ -1352,10 +1355,12 @@ def subdtype(self, value):
     def _gettype(self, seen):
         return awkward.type.ArrayType(awkward.util.numpy.inf, self._subdtype)
 
-    def __iter__(self):
+    def __iter__(self, checkiter=True):
+        if checkiter:
+            self._checkiter()
         self._valid()
         if len(self._starts.shape) != 1:
-            for x in super(JaggedArray, self).__iter__():
+            for x in super(JaggedArray, self).__iter__(checkiter=checkiter):
                 yield x.view(self._subdtype)
         else:
             stops = self._stops

diff --git a/awkward/array/masked.py b/awkward/array/masked.py
@@ -171,7 +171,9 @@ def _valid(self):
 
             self._isvalid = True
 
-    def __iter__(self):
+    def __iter__(self, checkiter=True):
+        if checkiter:
+            self._checkiter()
         self._valid()
 
         mask = self._mask
@@ -394,7 +396,9 @@ def _valid(self):
 
             self._isvalid = True
 
-    def __iter__(self):
+    def __iter__(self, checkiter=True):
+        if checkiter:
+            self._checkiter()
         self._valid()
 
         one = awkward.util.numpy.uint8(1)
@@ -594,7 +598,9 @@ def _valid(self):
 
             self._isvalid = True
 
-    def __iter__(self):
+    def __iter__(self, checkiter=True):
+        if checkiter:
+            self._checkiter()
         self._valid()
 
         mask = self._mask

diff --git a/awkward/array/objects.py b/awkward/array/objects.py
@@ -42,17 +42,14 @@ class Methods(object):
 
     @staticmethod
     def mixin(methods, awkwardtype):
-        assert issubclass(methods, Methods)
         assert not issubclass(methods, awkward.array.base.AwkwardArray)
-        assert issubclass(awkwardtype, awkward.array.base.AwkwardArray)
         assert not issubclass(awkwardtype, Methods)
         return type(awkwardtype.__name__ + "Methods", (methods, awkwardtype), {})
 
     @staticmethod
     def maybemixin(sample, awkwardtype):
         if issubclass(sample, Methods):
             assert issubclass(sample, awkward.array.base.AwkwardArray)
-            assert issubclass(awkwardtype, awkward.array.base.AwkwardArray)
             allbases = tuple(x for x in sample.__bases__ if not issubclass(x, awkward.array.base.AwkwardArray)) + (awkwardtype,)
             return type(awkwardtype.__name__ + "Methods", allbases, {})
         else:
@@ -179,7 +176,9 @@ def _getshape(self):
     def _valid(self):
         pass
 
-    def __iter__(self):
+    def __iter__(self, checkiter=True):
+        if checkiter:
+            self._checkiter()
         for x in self._content:
             yield self.generator(x, *self._args, **self._kwargs)
 

diff --git a/awkward/array/table.py b/awkward/array/table.py
@@ -93,7 +93,9 @@ def __getitem__(self, where):
         def __dir__(self):
             return ["_" + x for x in self._table._content if x.isnumeric() or x.isidentifier()] + ["tolist"]
 
-        def __iter__(self):
+        def __iter__(self, checkiter=True):
+            if checkiter:
+                self._checkiter()
             i = 0
             while str(i) in self._table._content:
                 yield self._table._content[str(i)]
@@ -423,7 +425,10 @@ def _newslice(self, head):
     def _valid(self):
         pass
 
-    def __iter__(self):
+    def __iter__(self, checkiter=True):
+        if checkiter:
+            self._checkiter()
+
         if self._view is None:
             length = self._length()
             i = 0

diff --git a/awkward/array/union.py b/awkward/array/union.py
@@ -252,7 +252,9 @@ def _valid(self):
 
             self._isvalid = True
 
-    def __iter__(self):
+    def __iter__(self, checkiter=True):
+        if checkiter:
+            self._checkiter()
         self._valid()
 
         tags = self._tags

diff --git a/awkward/array/virtual.py b/awkward/array/virtual.py
@@ -327,10 +327,13 @@ def __del__(self):
             except:
                 pass
 
-    def __iter__(self):
+    def __iter__(self, checkiter=True):
+        if checkiter:
+            self._checkiter()
         return iter(self.array)
 
     def __array__(self, *args, **kwargs):
+        self._checktonumpy()
         return awkward.util.numpy.array(self.array, *args, **kwargs)
 
     def __getitem__(self, where):

diff --git a/awkward/version.py b/awkward/version.py
@@ -30,7 +30,7 @@
 
 import re
 
-__version__ = "0.5.1"
+__version__ = "0.5.2"
 version = __version__
 version_info = tuple(re.split(r"[-\.]", __version__))
 

diff --git a/specification.adoc b/specification.adoc
@@ -33,11 +33,13 @@ Necessary features of the underlying array library are described in this section
 
 The underlying array library must have the following features or some translation thereof.
 
-  * It must be possible to represent ordered sequences of primitive values, known as *arrays*. An array is homogeneous: all values in an array have the same type. Primitive types include integers, floating point numbers, and boolean (true/false) values. An awkward-array library inherits the primitives of the underlying library -- if a base array library doesn't support complex numbers (for instance), then neither does its awkward-array extension. Arrays need not be fixed-size, contiguous in memory, with only fixed-bytesize primitives, but these are the simplest cases.
-  * Arrays may be _N_-dimensional for any positive integer _N_, though only _N = 1_ is required. We will refer to an array's size in all dimensions as its *shape*, a tuple of positive integers whose product is the total number of elements in the array. The *length* of an array is the array's size in its first dimension.
-  * It must be possible to *extract* an array element by integer index or a tuple of _N_ integer indexes for an _N_-dimensional array. In this document, we will use 0-indexing: `0` extracts the first element, `1` extracts the second, etc. If an index is greater than or equal to _N_, it is out of bounds. An array library with 1-indexing (`1` extracts the first element) would correspondingly have 1-indexing in its awkward-array extensions. We will use a number between square brackets after the array, as in `myarray[5]`, or a comma-separated tuple like `myarray[3, 1, 4]`, to represent extraction.
-  * The array library may provide a mechanism for counting from the end of an array dimension. In Python, `-1` refers to the last element, `-2` to the second-to-last element, etc. We will use that convention. Given a negative index `i` and a dimension size `d`, the standard index is `i + d`. If the standard index is still negative, the index is out of bounds.
-  * It must be possible to *slice* an array to get a subsequence of that array along any dimension. A slice is defined by a *start* index (lower bound element number), a *stop* index (upper bound element number), and an optional *step* (stride length, for skipping a regular number of elements between each return element). In this document, we will use inclusive start values (the lower bound is included in the resulting subsequence) and exclusive stop values (the upper bound is not included). A step of `1` is equivalent to no step, and the step must not be `0`, though negative values may be allowed (to reverse order). If either start or stop is not provided, they may be assumed to be `0` and the size of the dimension (respectively) if step is positive, or one less than the size of the dimension and one less than `0` (respectively) if step is negative. If the difference between step and start is not an integer multiple of step, we take that the subsequence to be truncated at the last element before stop. If either start or step are beyond the bounds of the array, we take them to be truncated to the nearest legal values, which may result in an empty subsequence, but not an error. If negative indexes are allowed for element extraction, they may be allowed for slicing as well.
+   * It must be possible to represent ordered sequences of primitive values, known as *arrays*. An array is homogeneous: all values in an array have the same *dtype*, or primitive data type. Primitive types include integers, floating point numbers, and boolean (true/false) values. An awkward-array library inherits the primitives of the underlying library -- if a base array library doesn't support complex numbers (for instance), then neither does its awkward-array extension. Arrays need not be fixed-size, contiguous in memory, with only fixed-bytesize primitives, but these are the simplest cases.
+   * It must be possible to construct _N_-dimensional arrays for any positive integer _N_. We will refer to an array's size in all dimensions as its *shape*, a tuple of positive integers whose product is the total number of elements in the array. The *length* of an array is the array's size in its first dimension.
+   * It must be possible to *extract* an array element by integer index or a tuple of _N_ integer indexes for an _N_-dimensional array. In this document, we will use 0-indexing: `0` extracts the first element, `1` extracts the second, etc. If an index is greater than or equal to the size of the dimension it indexes, it is out of bounds. An array library with 1-indexing (`1` extracts the first element) would correspondingly have 1-indexing in its awkward-array extensions. We will use a number between square brackets after the array, as in `myarray[5]`, or a comma-separated tuple like `myarray[3, 1, 4]`, to represent extraction.
+   * The array library may provide a mechanism for counting from the end of an array dimension. In Python, `-1` refers to the last element, `-2` to the second-to-last element, etc. We will use that convention. Given a negative index `i` and a dimension size `d`, the standard index is `i + d`. If the standard index is still negative, the index is out of bounds.
+   * It must be possible to *slice* an array dimension to get a subsequence of that array along any dimension. A slice is defined by a *start* index (lower bound element number), a *stop* index (upper bound element number), and an optional *step* (stride length, for skipping a regular number of input elements between each returned element). In this document, we will use inclusive start values (the lower bound is included in the resulting subsequence) and exclusive stop values (the upper bound is not included). A step of `1` is equivalent to no step, and the step must not be `0`, though negative values may be allowed (to reverse order). If either start or stop is not provided, they may be assumed to be `0` and the size of the dimension (respectively) if step is positive, or one less than the size of the dimension and one less than `0` (respectively) if step is negative. If the difference between stop and start is not an integer multiple of step, we take the subsequence to be truncated at the last element before stop. If either start or stop is beyond the bounds of the array, we take it to be truncated to the nearest legal value, which may result in an empty subsequence, but not an error. If negative indexes are allowed for element extraction, they may be allowed for slicing as well. The Python syntax for this operation is `myarray[start:stop:step]` (in which any of `start`, `stop`, or `step` may be missing or `None`, and the second colon may be omitted if there is no `step`). We will use this syntax in this document.
+   * It must be possible to *mask* elements in an array dimension by a 1-dimensional boolean array of the same size as that array dimension. The result of such an operation is a sequence in the same order containing only the elements for which the matching boolean value is true. The NumPy syntax for this operation is to put the boolean `mymask` array in square brackets after the array: `myarray[mymask]`, but in another array library it may be a named function call.
+   * It must be possible to *gather* elements in an array dimension by a 1-dimensional integer array, mapping . The result of such an operation is a sequence