Data prep improvements
Darren Edge committed Nov 4, 2024
1 parent ece3454 commit 46550ab
Showing 3 changed files with 145 additions and 103 deletions.
242 changes: 141 additions & 101 deletions app/util/ui_components.py
@@ -370,39 +370,45 @@ def prepare_input_df(
'input': input_df_var.value.copy(deep=True)
}

if "input" in st.session_state[f"{workflow}_intermediate_dfs"] and st.session_state[f"{workflow}_intermediate_dfs"]["input"].shape != input_df_var.value.shape:
del st.session_state[f"{workflow}_suppress_zeros"]

reload = False
df_sequence = ["input", "selected", "datetime_bin", "numeric_bin", "expanded", "suppress_count", "suppress_null"]

def df_updated(df_name):
def df_updated(df_name, reset):
# ensure columns are propagated forward
index = df_sequence.index(df_name)
input_df = st.session_state[f"{workflow}_intermediate_dfs"]["input"]
last_df = st.session_state[f"{workflow}_intermediate_dfs"][df_name]
prior_df = st.session_state[f"{workflow}_intermediate_dfs"][df_sequence[index-1]] if index > 0 else input_df
for col in last_df.columns:
# add to all subsequent dataframes if not present
for df in df_sequence[df_sequence.index(df_name) + 1:]:
if df in st.session_state[f"{workflow}_intermediate_dfs"]:
st.session_state[f"{workflow}_intermediate_dfs"][df][col] = last_df[
col
]
# for all subsequent dataframes, remove columns that are not in the current dataframe
for df in df_sequence[df_sequence.index(df_name) + 1:]:
if df in st.session_state[f"{workflow}_intermediate_dfs"]:
for col in st.session_state[f"{workflow}_intermediate_dfs"][df].columns:
if col in input_df.columns and col not in last_df.columns:
# Don't remove expanded columns
# TODO: How to remove expanded columns when the original column is removed?
st.session_state[f"{workflow}_intermediate_dfs"][df].drop(columns=[col], inplace=True)
if reset:
for df in df_sequence[index:]:
st.session_state[f"{workflow}_intermediate_dfs"][df] = prior_df.copy(deep=True)
else:
# for all subsequent dataframes, remove columns that are not in the current dataframe
for df in df_sequence[index+1:]:
if df in st.session_state[f"{workflow}_intermediate_dfs"]:
for col in st.session_state[f"{workflow}_intermediate_dfs"][df].columns:
if col in input_df.columns and col not in last_df.columns:
st.session_state[f"{workflow}_intermediate_dfs"][df].drop(columns=[col], inplace=True)

def prepare_stage(df_name):
last_df_name = df_sequence[df_sequence.index(df_name)-1]
last_df = st.session_state[f"{workflow}_intermediate_dfs"][last_df_name]
if df_name not in st.session_state[f"{workflow}_intermediate_dfs"]:
st.session_state[f"{workflow}_intermediate_dfs"][df_name] = last_df.copy(deep=True)
this_df = st.session_state[f"{workflow}_intermediate_dfs"][df_name]
return last_df, this_df
index = df_sequence.index(df_name)
if index > 0:
last_df_name = df_sequence[index-1]
last_df = st.session_state[f"{workflow}_intermediate_dfs"][last_df_name]
if df_name not in st.session_state[f"{workflow}_intermediate_dfs"]:
st.session_state[f"{workflow}_intermediate_dfs"][df_name] = last_df.copy(deep=True)
this_df = st.session_state[f"{workflow}_intermediate_dfs"][df_name]
return last_df, this_df
else:
input_df = st.session_state[f"{workflow}_intermediate_dfs"]["input"]
return input_df, input_df

input_df = st.session_state[f"{workflow}_intermediate_dfs"]["input"]
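For orientation, the staging pattern introduced above can be sketched in isolation, a minimal illustration with a plain dict standing in for st.session_state and toy frames in place of the real workflow data:

import pandas as pd

# Toy stand-in for st.session_state[f"{workflow}_intermediate_dfs"]
intermediate_dfs = {"input": pd.DataFrame({"age": [30, 41], "city": ["NY", "LA"]})}
df_sequence = ["input", "selected", "datetime_bin", "numeric_bin",
               "expanded", "suppress_count", "suppress_null"]

def prepare_stage(df_name):
    index = df_sequence.index(df_name)
    if index == 0:
        input_df = intermediate_dfs["input"]
        return input_df, input_df
    last_df = intermediate_dfs[df_sequence[index - 1]]
    # seed this stage from a deep copy of the previous stage the first time it is requested
    if df_name not in intermediate_dfs:
        intermediate_dfs[df_name] = last_df.copy(deep=True)
    return last_df, intermediate_dfs[df_name]

last_df, this_df = prepare_stage("selected")
this_df.drop(columns=["city"], inplace=True)          # edits touch only the "selected" stage
print(intermediate_dfs["input"].columns.tolist())     # ['age', 'city'] is untouched
print(intermediate_dfs["selected"].columns.tolist())  # ['age']

Because each stage is seeded from a deep copy of the previous one, df_updated only has to propagate or reset columns forward; earlier stages are never mutated.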

@@ -447,7 +453,7 @@ def prepare_stage(df_name):
"nan", ""
)
st.session_state[f"{workflow}_last_attributes"] = selected_cols
df_updated("selected")
df_updated("selected", False)

# print(f'selected df: {st.session_state[f"{workflow}_intermediate_dfs"]["selected"]}')

@@ -475,21 +481,26 @@ def prepare_stage(df_name):
options=bin_size_options,
help="Select the bin size for the datetime columns you want to quantize. Quantizing datetime columns into bins makes it easier to synthesize data, but reduces the amount of information in the data. If you do not select any columns, no binning will be performed.",
)

if st.button("Quantize selected columns", key="quantize_date"):
reload = True
st.session_state[f"{workflow}_selected_binned_cols"] = selected_date_cols
st.session_state[f"{workflow}_selected_binned_size"] = bin_size

for col in selected_date_cols:
result = quantize_datetime(
this_df, col, bin_size
)
this_df[col] = result
this_df[col].replace(
"nan", ""
)
df_updated("datetime_bin")
c1, c2 = st.columns([1, 1])
with c1:
if st.button("Quantize selected columns", key="quantize_date"):
reload = True
st.session_state[f"{workflow}_selected_binned_cols"] = selected_date_cols
st.session_state[f"{workflow}_selected_binned_size"] = bin_size
print(this_df)
for col in selected_date_cols:
result = quantize_datetime(
this_df, col, bin_size
)
this_df[col] = result
this_df[col].replace(
"nan", ""
)
df_updated("datetime_bin", False)
with c2:
if st.button("Reset", key="reset_date"):
reload = True
df_updated("datetime_bin", True)

# print(f'datetime_bin_df: {st.session_state[f"{workflow}_intermediate_dfs"]["datetime_bin"]}')
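As context for the quantize button above, datetime binning of the kind quantize_datetime performs might look roughly like the following. This is an illustrative pandas sketch; the helper's actual bin labels and the UI's bin-size option names are assumptions here:

import pandas as pd

df = pd.DataFrame({"joined": ["2021-01-15", "2021-07-03", "2022-02-20"]})

def quantize_datetime_sketch(df, col, bin_size):
    # hypothetical mapping from the UI bin-size label to a pandas period alias
    freq = {"Year": "Y", "Quarter": "Q", "Month": "M", "Day": "D"}[bin_size]
    return pd.to_datetime(df[col]).dt.to_period(freq).astype(str)

df["joined"] = quantize_datetime_sketch(df, "joined", "Quarter")
print(df["joined"].tolist())  # ['2021Q1', '2021Q3', '2022Q1']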

@@ -516,19 +527,24 @@ def prepare_stage(df_name):
value=float(st.session_state[f"{workflow}_selected_trim_percent"]),
help="Percent of values to trim from the top and bottom of each column before binning. This helps to reduce the impact of outliers on the binning process. For example, if trim percent is 0.05, the top and bottom 5% of values will be trimmed from each column before binning. If 0, no trimming will be performed.",
)
c1, c2 = st.columns([1, 1])
with c1:
if st.button("Quantize selected columns", key="quantize_numeric"):
reload = True
if num_bins > 0:
for col in selected_numeric_cols:
qd = quantize_numeric(
last_df, col, num_bins, trim_percent
)
this_df[col] = qd

if st.button("Quantize selected columns", key="quantize_numeric"):
reload = True
if num_bins > 0:
for col in selected_numeric_cols:
qd = quantize_numeric(
last_df, col, num_bins, trim_percent
)
this_df[col] = qd

st.session_state[f"{workflow}_selected_num_bins"] = num_bins
st.session_state[f"{workflow}_selected_trim_percent"] = trim_percent
df_updated("numeric_bin")
st.session_state[f"{workflow}_selected_num_bins"] = num_bins
st.session_state[f"{workflow}_selected_trim_percent"] = trim_percent
df_updated("numeric_bin", False)
with c2:
if st.button("Reset", key="reset_quantize"):
reload = True
df_updated("numeric_bin", True)

# print(f'numeric bin df: {st.session_state[f"{workflow}_intermediate_dfs"]["numeric_bin"]}')
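Similarly, the numeric binning with outlier trimming described by the help text could be sketched as follows. This is illustrative only; quantize_numeric's real bin labelling may differ, and the quantile-based clipping is an assumption about how the trim percent is applied:

import pandas as pd

def quantize_numeric_sketch(df, col, num_bins, trim_percent):
    s = pd.to_numeric(df[col], errors="coerce")
    # clip outliers at the trim_percent / (1 - trim_percent) quantiles before binning
    if trim_percent > 0:
        s = s.clip(s.quantile(trim_percent), s.quantile(1 - trim_percent))
    return pd.cut(s, bins=num_bins).astype(str)

df = pd.DataFrame({"income": [10, 12, 13, 15, 900]})
print(quantize_numeric_sketch(df, "income", num_bins=3, trim_percent=0.05).tolist())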

@@ -550,37 +566,68 @@ def prepare_stage(df_name):
col_delimiter = st.text_input(
"Column delimiter",
value="",
help="The character used to separate values in compound columns. If the delimiter is not present in a cell, the cell will be left unchanged.",
help="The character used to separate values in compound columns. If the delimiter is not present in a cell, the cell will be left unchanged. Any quotes around the entire list or individual values will be removed before processing, as will any enclosing square brackets.",
)
prefix_type = st.radio(
"Expanded column prefix type",
options=["Source column", "Custom"],
index=0,
help="Select the type of prefix to add to each new column created from the compound column. If 'Source column' is selected, the prefix will be the name of the source column. If 'Custom' is selected, you can enter a custom prefix.",
)
if prefix_type == "Source column":
prefix = ""
else:
prefix = st.text_input(
"Custom prefix",
value="",
help="Prefix to add to each new column created from the compound column. If no prefix is provided, no prefix will be added.",
)
c1, c2 = st.columns([1, 1])
with c1:
if st.button("Expand selected columns", key="expand_compound"):
reload = True
to_add = (selected_compound_cols, col_delimiter)
if (
col_delimiter != ""
and to_add not in st.session_state[f"{workflow}_selected_compound_cols"]
):
st.session_state[f"{workflow}_selected_compound_cols"].append(to_add)
for cols, delim in st.session_state[
f"{workflow}_selected_compound_cols"
]:
for col in cols:
if prefix_type == "Source column":
prefix = col
def convert_to_list(x):
print(x)
if type(x) != str:
print('not a string')
return []
if x[0] == "[" and x[-1] == "]":
x = x[1:-1]
vals = [y.strip() for y in x.split(delim)]
vals = [y[1:-1] if y[0] == '"' and y[-1] == '"' else y for y in vals if len(y) > 1]
vals = [y[1:-1] if y[0] == '\'' and y[-1] == '\'' else y for y in vals if len(y) > 1]
if prefix != "":
vals = [f"{prefix}{y}" for y in vals]
return vals
# add each value as a separate column with a 1 if the value is present in the compound column and None otherwise
values = last_df[col].apply(convert_to_list)
unique_values = {v for vals in values for v in vals}
unique_values = [x for x in unique_values if x != ""]
for val in unique_values:
st.session_state[f"{workflow}_{val}"] = False
this_df[val] = values.apply(
lambda x: "1" if val in x and val != "nan" else ""
)
if col in this_df.columns:
this_df.drop(columns=[col], inplace=True)
df_updated("expanded", False)
with c2:
if st.button("Reset", key="reset_expand"):
reload = True
df_updated("expanded", True)

if st.button("Expand selected columns", key="expand_compound"):
reload = True
to_add = (selected_compound_cols, col_delimiter)
if (
col_delimiter != ""
and to_add not in st.session_state[f"{workflow}_selected_compound_cols"]
):
st.session_state[f"{workflow}_selected_compound_cols"].append(to_add)
for cols, delim in st.session_state[
f"{workflow}_selected_compound_cols"
]:
for col in cols:
# add each value as a separate column with a 1 if the value is present in the compound column and None otherwise
values = last_df.apply(
lambda x: [y.strip() for y in x.split(delim)]
if type(x) == str
else []
)
unique_values = {v for vals in values for v in vals}
unique_values = [x for x in unique_values if x != ""]
for val in unique_values:
st.session_state[f"{workflow}_{val}"] = False
this_df[val] = values.apply(
lambda x: "1" if val in x and val != "nan" else ""
)
this_df.drop(columns=[col], inplace=True)
df_updated("expanded")

# print(f'expanded df: {st.session_state[f"{workflow}_intermediate_dfs"]["expanded"]}')
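To make the compound-column expansion concrete, this is roughly what the parsing above does to a single cell, shown outside Streamlit. It is a simplified sketch of convert_to_list, with the quote handling condensed relative to the diff:

import pandas as pd

def convert_to_list(x, delim=",", prefix=""):
    # non-strings (e.g. NaN in the column) expand to nothing
    if not isinstance(x, str):
        return []
    # strip enclosing square brackets, then split on the delimiter
    if x.startswith("[") and x.endswith("]"):
        x = x[1:-1]
    vals = [y.strip() for y in x.split(delim)]
    # drop surrounding single or double quotes around each value
    vals = [y.strip("\"'") for y in vals if y]
    return [f"{prefix}{y}" for y in vals]

cells = pd.Series(['["red", "blue"]', "green", None])
values = cells.apply(convert_to_list, prefix="color_")
unique_values = sorted({v for vals in values for v in vals})
# one indicator column per value, "1" when present and "" otherwise
expanded = pd.DataFrame(
    {val: values.apply(lambda vs, v=val: "1" if v in vs else "") for val in unique_values}
)
print(expanded.columns.tolist())  # ['color_blue', 'color_green', 'color_red']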

last_df, this_df = prepare_stage("suppress_count")
@@ -633,44 +680,37 @@ def prepare_stage(df_name):
if str(x) in value_counts and value_counts[str(x)] < min_value
else str(x)
)
df_updated("suppress_count")
df_updated("suppress_count", False)
# print(f'suppress count: {st.session_state[f"{workflow}_intermediate_dfs"]["suppress_count"]}')
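The rare-value suppression above is partly collapsed in this hunk. The usual shape of the logic, with any value whose count falls below the chosen minimum blanked, would be roughly as follows; the visible else branch returns str(x), but the blank-string replacement for suppressed values is an assumption:

import pandas as pd

def suppress_rare_values(df, min_value):
    out = df.copy(deep=True)
    for col in out.columns:
        value_counts = out[col].astype(str).value_counts().to_dict()
        out[col] = out[col].apply(
            lambda x: ""
            if str(x) in value_counts and value_counts[str(x)] < min_value
            else str(x)
        )
    return out

df = pd.DataFrame({"city": ["NY", "NY", "LA", "SF"]})
print(suppress_rare_values(df, min_value=2)["city"].tolist())  # ['NY', 'NY', '', '']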


initialized = True
if f"{workflow}_suppress_zeros" not in st.session_state:
print('Not initialized')
st.session_state[f"{workflow}_suppress_zeros"] = len(this_df) > 0
initialized = False

last_suppress_zeros = st.session_state[f"{workflow}_last_suppress_zeros"]
suppress_zeros = st.checkbox(
"Suppress boolean False / binary 0",
key=f"{workflow}_suppress_zeros_input",
value=st.session_state[f"{workflow}_suppress_zeros"] if initialized else True,
help="For boolean columns, maps the value False to None. For binary columns, maps the number 0 to None. This is useful when only the presence of an attribute is important, not the absence.",
value=True,
help="For boolean columns, maps the value False to None. For binary columns, maps the number 0 to None. This is useful when only the presence of an attribute is important, not the absence."
)

print(f'suppress zeros: {suppress_zeros}, last suppress zeros: {last_suppress_zeros}')
last_df, this_df = prepare_stage("suppress_null")
if suppress_zeros or suppress_zeros != last_suppress_zeros:
print(f'Changed with suppress_zeros: {suppress_zeros} and last_suppress_zeros: {last_suppress_zeros}')
st.session_state[f"{workflow}_last_suppress_zeros"] = suppress_zeros
if suppress_zeros:
print('suppressing zeros')
this_df = df_functions.suppress_boolean_binary(last_df, this_df)
else:
print('leaving zeros')
this_df = last_df.copy(deep=True)
if suppress_zeros != last_suppress_zeros:
reload = True


if not initialized or suppress_zeros:
st.session_state[f"{workflow}_suppress_zeros"] = suppress_zeros
this_df = df_functions.supress_boolean_binary(last_df, this_df)
df_updated("suppress_null")
if not suppress_zeros:
for col in this_df.columns:
unique_values = this_df[col].unique()
is_three_with_none = (
len(unique_values) == 3 and this_df[col].isna().any()
)
if len(unique_values) <= 2 or is_three_with_none:
this_df[col] = last_df[col]
df_updated("suppress_null")
df_updated("suppress_null", False)


# print(f'suppress null: {st.session_state[f"{workflow}_intermediate_dfs"]["suppress_null"]}')




processed_df = this_df.copy(deep=True)
processed_df.replace({"<NA>": np.nan}, inplace=True)
processed_df.replace({"nan": ""}, inplace=True)
3 changes: 2 additions & 1 deletion intelligence_toolkit/helpers/df_functions.py
@@ -31,11 +31,12 @@ def fix_null_ints(in_df: pd.DataFrame) -> pd.DataFrame:
def get_current_time() -> str:
return pd.Timestamp.now().strftime("%Y%m%d%H%M%S")

def supress_boolean_binary(
def suppress_boolean_binary(
input_df: pd.DataFrame, output_df: pd.DataFrame | None = None
) -> pd.DataFrame:
if output_df is None:
output_df = input_df.copy()

for col in input_df.columns:
unique_values = [str(x) for x in input_df[col].unique()]
is_three_with_none = len(unique_values) == 3 and input_df[col].isna().any()
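For reference, the behaviour the renamed helper is documented to have in the UI help text, mapping boolean False and binary 0 to None, can be sketched like this. It is illustrative only; the shipped suppress_boolean_binary also handles the three-unique-values-with-None case shown in the hunk, which this sketch omits:

import pandas as pd

def suppress_boolean_binary_sketch(input_df, output_df=None):
    if output_df is None:
        output_df = input_df.copy()
    for col in input_df.columns:
        unique_values = {str(x) for x in input_df[col].dropna().unique()}
        # boolean columns keep only True; binary columns keep only 1; other columns are left alone
        if unique_values <= {"True", "False"} or unique_values <= {"0", "1"}:
            output_df[col] = input_df[col].where(input_df[col].astype(bool))
    return output_df

df = pd.DataFrame({"flag": [True, False, True], "bit": [1, 0, 0], "age": [30, 41, 29]})
print(suppress_boolean_binary_sketch(df))  # False and 0 become NaN; "age" is untouched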
3 changes: 2 additions & 1 deletion intelligence_toolkit/query_text_data/graph_builder.py
@@ -28,7 +28,8 @@ def update_concept_graph_edges(node_to_period_counts, edge_to_period_counts, per
nps = sorted(set(TextBlob(chunk).noun_phrases))
filtered_nps = []
for np in nps:
parts = np.split()
# split on space or newline
parts = re.split(r"[\s\n]+", np)
if all([re.match(r"[a-zA-Z0-9\-]+", part) for part in parts]):
filtered_nps.append(np)
filtered_nps = sorted(filtered_nps)
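The graph_builder change swaps the plain str.split for an explicit whitespace regex. A self-contained check of the new splitting and filtering step, with TextBlob's noun_phrases replaced by a fixed list so the snippet runs without the corpora download:

import re

# stand-ins for TextBlob(chunk).noun_phrases, including one phrase containing a newline
nps = sorted({"data preparation", "graph\nbuilder", "noun (phrases)"})

filtered_nps = []
for np_ in nps:
    # split on any run of whitespace, including newlines
    parts = re.split(r"[\s\n]+", np_)
    # keep the phrase only if every part starts like a plain alphanumeric/hyphenated token
    if all(re.match(r"[a-zA-Z0-9\-]+", part) for part in parts):
        filtered_nps.append(np_)

print(filtered_nps)  # ['data preparation', 'graph\nbuilder']; 'noun (phrases)' is filtered out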
