Skip to content

Commit

Permalink
Merge pull request #386 from Crunch-io/fill-support
Browse files Browse the repository at this point in the history
Add `fill` support
  • Loading branch information
xbito authored Jul 24, 2020
2 parents d87e9ea + d7e89e3 commit ecee6a2
Show file tree
Hide file tree
Showing 3 changed files with 533 additions and 1 deletion.
175 changes: 174 additions & 1 deletion integration/test_recodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,17 @@
from fixtures import NEWS_DATASET, NEWS_DATASET_ROWS, mr_in, RECODES_CSV_OUTPUT
from scrunch import connect
from scrunch.streaming_dataset import get_streaming_dataset
from scrunch.mutable_dataset import get_mutable_dataset
from pycrunch.importing import Importer


# Connection settings are read from the environment; a missing variable
# raises KeyError as soon as this module is imported.
HOST, username, password = (
    os.environ['SCRUNCH_HOST'],
    os.environ['SCRUNCH_USER'],
    os.environ['SCRUNCH_PASS'],
)


# Session object used by the tests below to create and query datasets.
site = connect(username, password, HOST)
assert site is not None, "Unable to connect to %s" % HOST


class TestRecodes(TestCase):
Expand Down Expand Up @@ -125,7 +129,6 @@ def test_recodes(self):
reader = csv.reader(StringIO(RECODES_CSV_OUTPUT))
headers = reader.next()


# rewrite the actual csv in the same order as the expected csv
actualf = StringIO()
writer = csv.writer(actualf)
Expand All @@ -141,3 +144,173 @@ def test_recodes(self):

output.close()
ds.delete()


class TestFill(TestCase):
    """Integration tests for ``create_fill_values``.

    Every test builds a fresh `test_fill` dataset on the server, derives a
    `pop_freq` variable from it and compares the first six rows of the
    derived column with the expected category ids.
    """

    def prepare_ds(self):
        """Create the `test_fill` dataset and append six rows to it.

        The dataset holds three categorical variables: `coke_freq`,
        `pepsi_freq` and `pop_pref`.  Rows 1-3 have `pop_pref == 1` (Coke)
        and rows 4-6 have `pop_pref == 2` (Pepsi).

        :return: tuple ``(dataset, ds)`` where `dataset` is the object
            returned by `get_mutable_dataset` and `ds` is the entity
            returned by `site.datasets.create(...).refresh()`.
        """
        freq_categories = [
            {"id": 1, "name": "Daily", "missing": False,
             "numeric_value": None},
            {"id": 2, "name": "Weekly", "missing": False,
             "numeric_value": None},
            {"id": 3, "name": "Monthly", "missing": False,
             "numeric_value": None},
            {"id": -1, "name": "No Data", "missing": True,
             "numeric_value": None},
        ]
        metadata = {
            "coke_freq": {
                "name": "frequency coke",
                "type": "categorical",
                "categories": freq_categories
            },
            "pepsi_freq": {
                "name": "frequency pepsi",
                "type": "categorical",
                "categories": freq_categories
            },
            "pop_pref": {
                "name": "Soda preference",
                "type": "categorical",
                "categories": [
                    {"id": 1, "name": "Coke", "missing": False,
                     "numeric_value": None},
                    {"id": 2, "name": "Pepsi", "missing": False,
                     "numeric_value": None},
                    {"id": -1, "name": "No Data", "missing": True,
                     "numeric_value": None},
                ]
            }
        }
        ds_payload = {
            'element': 'shoji:entity',
            'body': {
                'name': 'test_fill',
                'table': {
                    'element': 'crunch:table',
                    'metadata': metadata
                },
            }
        }

        rows = [
            ["coke_freq", "pepsi_freq", "pop_pref"],
            [1, 3, 1],
            [2, 2, 1],
            [3, 1, 1],
            [1, 3, 2],
            [2, 2, 2],
            [3, 1, 2],
        ]
        ds = site.datasets.create(ds_payload).refresh()
        try:
            dataset = get_mutable_dataset(ds.body.id, site)
            Importer().append_rows(ds, rows)
        except Exception:
            # Do not leave a half-initialised dataset behind on the server.
            ds.delete()
            raise
        return dataset, ds

    def _check_fill(self, cases, expected_from_cats):
        """Derive `pop_freq` from `cases` and verify its first six values.

        :param cases: list of case dictionaries handed to
            `create_fill_values`.
        :param expected_from_cats: callable receiving a ``{name: id}``
            mapping of the derived variable's categories and returning the
            expected list of row values.
        """
        dataset, ds = self.prepare_ds()
        try:
            dataset.create_fill_values(
                cases, alias="pop_freq", name="Pop frequency")

            variables = ds.variables.by("alias")
            new_id = variables["pop_freq"]["id"]
            new_var = variables["pop_freq"].entity
            self.assertTrue(new_var.body.derived)
            self.assertEqual(new_var.body.name, "Pop frequency")

            data = ds.follow("table", "limit=6")
            cats = {c["name"]: c["id"]
                    for c in data["metadata"][new_id]["categories"]}
            self.assertEqual(data["data"][new_id], expected_from_cats(cats))
        finally:
            # Always remove the remote dataset, even when an assertion or
            # an API call above fails.
            ds.delete()

    def test_fill(self):
        self._check_fill(
            [
                {"case": "pop_pref == 1", "variable": "coke_freq"},
                {"case": "pop_pref == 2", "variable": "pepsi_freq"},
            ],
            lambda cats: [
                # Coke chunk
                cats["Daily"],
                cats["Weekly"],
                cats["Monthly"],
                # Pepsi chunk
                cats["Monthly"],
                cats["Weekly"],
                cats["Daily"],
            ],
        )

    def test_fill_w_else(self):
        self._check_fill(
            [
                {"case": "pop_pref == 1", "variable": "coke_freq"},
                {"case": "else", "variable": "pepsi_freq"},
            ],
            lambda cats: [
                # Coke chunk
                cats["Daily"],
                cats["Weekly"],
                cats["Monthly"],
                # Pepsi chunk - Default case
                cats["Monthly"],
                cats["Weekly"],
                cats["Daily"],
            ],
        )

    def test_fill_w_else_code(self):
        self._check_fill(
            [
                {"case": "pop_pref == 1", "variable": "coke_freq"},
                {"case": "else", "name": "Not Asked", "id": 99,
                 "missing": False},
            ],
            lambda cats: [
                # Coke chunk
                cats["Daily"],
                cats["Weekly"],
                cats["Monthly"],
                # Default value
                cats["Not Asked"],
                cats["Not Asked"],
                cats["Not Asked"],
            ],
        )

    def test_fill_w_else_default(self):
        self._check_fill(
            [
                {"case": "pop_pref == 1", "variable": "coke_freq"},
            ],
            lambda cats: [
                # Coke chunk
                cats["Daily"],
                cats["Weekly"],
                cats["Monthly"],
                # Default value
                {"?": -1},
                {"?": -1},
                {"?": -1},
            ],
        )


95 changes: 95 additions & 0 deletions scrunch/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -1205,6 +1205,101 @@ def add_user(self, user, edit=False):
}
self.resource.permissions.patch(payload)

def create_fill_values(self, variables, name, alias, description=''):
    """
    Create a derived variable that takes its values from other variables.

    This function is similar to create_single_categorical in the sense
    that the output is a 1D variable.

    Will create a derived variable using a combination of Crunch's `fill`
    and `case` functions, to create a new variable using the values from
    the specified variables according to each expression.

        dataset.create_fill_values([
            {"case": "pop_pref == 1", "variable": "coke_freq"},
            {"case": "pop_pref == 2", "variable": "pepsi_freq"},
            # {"case": "else", "variable": "any_freq"},
            {"case": "else", "missing": True, "name": "Not Asked", "id": 99},
        ], alias="pop_freq", name="Pop frequency")

    The `else` case can be either a variable or a default category.

    * In the case of variable it should have the shape:
        {"case": "else", "variable": "<alias>"}
    * In the case of a default category, it should indicate:
        {"case": "else", "name": "Cat Name", "missing": <bool>, "id": <int cat code>}

    :param variables: list of dictionaries, each with a `case` expression
        and a `variable` alias (plus at most one `else` entry as above)
    :param name: Name of the new variable
    :param alias: Alias of the new variable
    :param description: Description of the new variable
    :return: the result of `self._var_create_reload_return` for the new
        variable payload (presumably the created variable)
    :raises ValueError: if more than one `else` case is given, if the
        `else` case mixes `variable` and `name`, if there is no
        non-else case, if any referenced variable (including the `else`
        variable) is not categorical, or if the `else` category id
        clashes with one of the generated case ids (1..N)
    """
    if not hasattr(self.resource, 'variables'):
        self.resource.refresh()

    # Pluck `else` case out.
    else_cases = [c for c in variables if c["case"] == "else"]
    if len(else_cases) > 1:
        # Previously the extra entries were silently discarded.
        raise ValueError("Only one `else` case is allowed")
    else_case = else_cases[0] if else_cases else {}
    variables = [c for c in variables if c["case"] != "else"]

    if "variable" in else_case and "name" in else_case:
        raise ValueError("Else case can be either variable or category not both")

    if not variables:
        raise ValueError("At least one non-else case is required")

    # The `else` variable feeds values into the result just like the
    # others, so it has to pass the same type check.
    aliases = {c["variable"] for c in variables}
    if "variable" in else_case:
        aliases.add(else_case["variable"])
    vars_by_alias = self.resource.variables.by("alias")
    types = {vars_by_alias[al]["type"] for al in aliases}
    if types != {"categorical"}:
        raise ValueError("All variables must be of type `categorical`")

    # One synthetic category (ids 1..N) per conditional case.
    cat_ids = list(range(1, len(variables) + 1))
    column = list(cat_ids)  # separate copy: may gain the else id below
    categories = [
        {"id": c, "name": str(c), "missing": False, "numeric_value": None}
        for c in cat_ids
    ]

    if "name" in else_case:
        # We are in the else_case of a category. Add there the extra default
        else_id = else_case["id"]
        if else_id in cat_ids:
            raise ValueError(
                "Else category id %r clashes with the generated case ids "
                "1..%d" % (else_id, len(cat_ids))
            )
        column.append(else_id)
        categories.append({
            "name": else_case["name"],
            "missing": else_case.get("missing", False),
            "id": else_id,
            "numeric_value": else_case.get("numeric_value", None),
        })

    args = [{
        "column": column,
        "type": {
            "class": "categorical",
            "ordinal": False,
            "categories": categories
        }
    }]
    exprs = [parse_expr(c["case"]) for c in variables]
    exprs = process_expr(exprs, self.resource)
    args.extend(exprs)

    expr = {"function": "case", "args": args}
    fill_map = {str(cid): {"variable": vars_by_alias[v["variable"]]["id"]}
                for cid, v in zip(cat_ids, variables)}

    if "variable" in else_case:
        # We are in the case of a default fill, replace the -1 with the new
        # variable
        fill_map["-1"] = {"variable": vars_by_alias[else_case["variable"]]["id"]}

    fill_expr = {
        "function": "fill",
        "args": [
            expr,
            {"map": fill_map}
        ]
    }
    payload = shoji_entity_wrapper({
        "alias": alias,
        "name": name,
        "description": description,
        "derivation": fill_expr
    })
    return self._var_create_reload_return(payload)

def create_single_response(self, categories, name, alias, description='',
missing=True, notes=''):
"""
Expand Down
Loading

0 comments on commit ecee6a2

Please sign in to comment.