Skip to content

Commit

Permalink
Merge pull request #116 from SvoONs/contribution/add_date_as_object_flag
Browse files Browse the repository at this point in the history
Add date_as_object flag in core/index.py as_flat_series
  • Loading branch information
fjetter authored Aug 13, 2019
2 parents e8e9a71 + 134f928 commit c199c10
Show file tree
Hide file tree
Showing 4 changed files with 19 additions and 6 deletions.
1 change: 1 addition & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ Version 3.2.1 (2019-XX-XX)
``None``
- Streamline behavior of ``store_dataset_from_ddf`` when passing an empty ddf.
- Fix an issue where a segmentation fault could occur when comparing ``MetaPartition`` instances
- Expose a ``date_as_object`` flag in ``kartothek.core.index.as_flat_series``


Version 3.2.0 (2019-07-25)
Expand Down
8 changes: 6 additions & 2 deletions kartothek/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -340,7 +340,7 @@ def load_partition_indices(self):
indices.update(self.indices)
return self.copy(indices=indices)

def get_indices_as_dataframe(self, columns=None):
def get_indices_as_dataframe(self, columns=None, date_as_object=True):
"""
Converts the dataset indices to a pandas dataframe.
Expand All @@ -359,6 +359,8 @@ def get_indices_as_dataframe(self, columns=None):
columns: list of str
If provided, the dataframe will only be constructed for the provided columns/indices.
If `None` is given, all indices are included.
date_as_object: bool, optional
Cast dates to objects.
"""
if columns is None:
columns = sorted(self.indices.keys())
Expand All @@ -373,7 +375,9 @@ def get_indices_as_dataframe(self, columns=None):
)
raise ValueError("Index `{}` unknown.")
df = pd.DataFrame(
self.indices[col].as_flat_series(partitions_as_index=True)
self.indices[col].as_flat_series(
partitions_as_index=True, date_as_object=date_as_object
)
)
dfs.append(df)

Expand Down
8 changes: 6 additions & 2 deletions kartothek/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -407,7 +407,9 @@ def __eq__(self, other):
def __ne__(self, other):
return not (self == other)

def as_flat_series(self, compact=False, partitions_as_index=False):
def as_flat_series(
self, compact=False, partitions_as_index=False, date_as_object=True
):
"""
Convert the Index object to a pandas.Series
Expand All @@ -417,9 +419,11 @@ def as_flat_series(self, compact=False, partitions_as_index=False):
If True, the index will be unique and the Series.values will be a list of partitions/values
partitions_as_index: bool, optional
If True, the relation between index values and partitions will be reverted for the output
date_as_object: bool, optional
Cast dates to objects.
"""
table = _index_dct_to_table(self.index_dct, column=self.column)
df = table.to_pandas(date_as_object=True)
df = table.to_pandas(date_as_object=date_as_object)
result_column = _PARTITION_COLUMN_NAME
# This is the way the dictionary is directly translated
# value: [partition]
Expand Down
8 changes: 6 additions & 2 deletions tests/core/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -343,7 +343,10 @@ def test_index_as_flat_series_highly_degenerated_asym():
assert_series_equal(ser_inv, expected_inv)


def test_index_as_flat_series_date():
@pytest.mark.parametrize(
"dtype, date_as_object", [(None, True), ("datetime64[ns]", False)]
)
def test_index_as_flat_series_date(dtype, date_as_object):
index1 = ExplicitSecondaryIndex(
column="col",
index_dct={
Expand All @@ -352,7 +355,7 @@ def test_index_as_flat_series_date():
},
dtype=pa.date32(),
)
ser = index1.as_flat_series()
ser = index1.as_flat_series(date_as_object=date_as_object)
ser = ser.sort_index()
expected = pd.Series(
["part_1", "part_2", "part_1"],
Expand All @@ -362,6 +365,7 @@ def test_index_as_flat_series_date():
datetime.date(2017, 1, 2),
datetime.date(2018, 2, 3),
],
dtype=dtype,
name="col",
),
name="partition",
Expand Down

0 comments on commit c199c10

Please sign in to comment.