Skip to content

Commit

Permalink
add column header formatter to streamlit (#70)
Browse files Browse the repository at this point in the history
add column header formatter to streamlit
  • Loading branch information
dayesouza authored Oct 28, 2024
1 parent 57d6082 commit 576cfc7
Show file tree
Hide file tree
Showing 5 changed files with 119 additions and 3 deletions.
7 changes: 6 additions & 1 deletion app/util/ui_components.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from toolkit.AI.classes import LLMCallback
from toolkit.AI.client import OpenAIClient
from toolkit.AI.defaults import DEFAULT_MAX_INPUT_TOKENS
from toolkit.helpers.texts import clean_for_column_name


def return_token_count(text: str) -> int:
Expand Down Expand Up @@ -174,7 +175,6 @@ def generative_batch_ai_component(
file_options = ["unicode-escape", "utf-8", "utf-8-sig"]
file_encoding_default = "unicode-escape"


def single_csv_uploader(
workflow,
upload_label,
Expand Down Expand Up @@ -211,6 +211,7 @@ def single_csv_uploader(
df = pd.read_csv(
file, encoding=encoding, encoding_errors="ignore", low_memory=False
)
df.columns = [clean_for_column_name(col) for col in df.columns]
input_df_var.value = df
processed_df_var.value = pd.DataFrame()
if f"{workflow}_intermediate_dfs" in st.session_state:
Expand Down Expand Up @@ -319,6 +320,9 @@ def multi_csv_uploader(
low_memory=False,
)
)
selected_df.columns = [
clean_for_column_name(col) for col in selected_df.columns
]
break
st.dataframe(
selected_df[:show_rows],
Expand Down Expand Up @@ -685,6 +689,7 @@ def prepare_stage(df_name):
processed_df_var.value = processed_df
for col, rename in st.session_state[f"{workflow}_rename_map"].items():
processed_df_var.value.rename(columns={col: rename}, inplace=True)
st.session_state[f"{workflow}_{rename}"] = st.session_state[f"{workflow}_{col}"]
if reload and len(input_df) > 0 and len(processed_df) > 0:
st.rerun()

Expand Down
4 changes: 2 additions & 2 deletions toolkit/detect_entity_networks/prepare_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,13 @@

from toolkit.detect_entity_networks.config import ENTITY_LABEL, FlagAggregatorType
from toolkit.helpers.constants import ATTRIBUTE_VALUE_SEPARATOR
from toolkit.helpers.texts import clean_text_for_csv


def clean_text(text: str | int) -> str:
    """Normalize a value by removing punctuation and collapsing whitespace.

    The value is passed through ``clean_text_for_csv``, which removes
    punctuation but retains characters and digits in any language. Leading
    and trailing whitespace is then stripped and every run of whitespace is
    compressed to a single space.

    Args:
        text: The value to clean; it is converted to a string by the helper.

    Returns:
        The cleaned, whitespace-normalized string.
    """
    # The previous inline re.sub result was overwritten before use, so the
    # shared helper is now the single source of the character filter.
    cleaned_text = clean_text_for_csv(text).strip()
    # compress whitespace to single space
    return re.sub(r"\s+", " ", cleaned_text)


Expand Down
14 changes: 14 additions & 0 deletions toolkit/helpers/texts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Copyright (c) 2024 Microsoft Corporation. All rights reserved.
# Licensed under the MIT license. See LICENSE file in the project.
#
import re


def clean_text_for_csv(text: str | int) -> str:
# Replace non-alphanumeric characters
return re.sub(r"[^\w\s&@\+]", "", str(text))


def clean_for_column_name(text: str | int) -> str:
# Replace non-alphanumeric characters
return re.sub(r"[^\w\s&()\-_\+]", "", str(text))
3 changes: 3 additions & 0 deletions toolkit/tests/helpers/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Copyright (c) 2024 Microsoft Corporation. All rights reserved.
# Licensed under the MIT license. See LICENSE file in the project.
#
94 changes: 94 additions & 0 deletions toolkit/tests/helpers/test_texts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
# Copyright (c) 2024 Microsoft Corporation. All rights reserved.
# Licensed under the MIT license. See LICENSE file in the project.
#


from toolkit.helpers.texts import clean_for_column_name, clean_text_for_csv


class TestCleanTextForColumnName:
    """Behavioral checks for ``clean_for_column_name``.

    The helper keeps word characters, whitespace, ``&``, parentheses, ``-``,
    ``_`` and ``+`` and removes everything else (including ``@`` and ``.``).
    """

    # NOTE: the method names keep their historical ``test_clean_text_for_csv_``
    # prefix; every test in this class exercises ``clean_for_column_name``.

    def test_clean_text_for_csv_empty(self) -> None:
        assert clean_for_column_name("") == ""

    def test_clean_text_for_csv_alpha_numeric(self) -> None:
        assert clean_for_column_name("Hello123") == "Hello123"

    def test_clean_text_for_csv_with_spaces(self) -> None:
        assert clean_for_column_name("Hello World 123") == "Hello World 123"

    def test_clean_text_for_csv_special_characters(self) -> None:
        assert clean_for_column_name("Hello, World! 123") == "Hello World 123"

    def test_clean_text_for_csv_email(self) -> None:
        # Both "@" and "." are stripped from column names.
        assert clean_for_column_name("user@example.com") == "userexamplecom"

    def test_clean_text_for_csv_ampersand(self) -> None:
        assert clean_for_column_name("R&D") == "R&D"

    def test_clean_text_for_csv_plus_sign(self) -> None:
        assert clean_for_column_name("C++") == "C++"

    def test_clean_text_for_csv_at_symbol(self) -> None:
        assert clean_for_column_name("user@domain") == "userdomain"

    def test_clean_text_for_csv_numbers(self) -> None:
        # Non-string input is converted with str() before cleaning.
        assert clean_for_column_name(123456) == "123456"

    def test_clean_text_for_csv_mixed_characters(self) -> None:
        assert clean_for_column_name("Hello@World&123+") == "HelloWorld&123+"

    def test_clean_text_for_csv_only_special_characters(self) -> None:
        assert clean_for_column_name("!@#$%^&*()_+={}-[]:\";'<>?,./") == "&()_+-"

    def test_clean_text_for_csv_unicode_characters(self) -> None:
        assert clean_for_column_name("你好,世界") == "你好世界"

    def test_clean_text_for_csv_underscore(self) -> None:
        assert clean_for_column_name("file_name") == "file_name"

    def test_clean_text_for_csv_dash(self) -> None:
        assert clean_for_column_name("file-name") == "file-name"


class TestCleanTextForCsv:
    """Behavioral checks for ``clean_text_for_csv``.

    The helper keeps word characters, whitespace, ``&``, ``@`` and ``+`` and
    removes everything else (including ``-`` and ``.``).
    """

    def test_clean_text_for_csv_empty(self) -> None:
        assert clean_text_for_csv("") == ""

    def test_clean_text_for_csv_alpha_numeric(self) -> None:
        assert clean_text_for_csv("Hello123") == "Hello123"

    def test_clean_text_for_csv_with_spaces(self) -> None:
        assert clean_text_for_csv("Hello World 123") == "Hello World 123"

    def test_clean_text_for_csv_special_characters(self) -> None:
        assert clean_text_for_csv("Hello, World! 123") == "Hello World 123"

    def test_clean_text_for_csv_email(self) -> None:
        # "@" is retained for CSV text, while "." is stripped.
        assert clean_text_for_csv("user@example.com") == "user@examplecom"

    def test_clean_text_for_csv_ampersand(self) -> None:
        assert clean_text_for_csv("R&D") == "R&D"

    def test_clean_text_for_csv_plus_sign(self) -> None:
        assert clean_text_for_csv("C++") == "C++"

    def test_clean_text_for_csv_at_symbol(self) -> None:
        assert clean_text_for_csv("user@domain") == "user@domain"

    def test_clean_text_for_csv_numbers(self) -> None:
        # Non-string input is converted with str() before cleaning.
        assert clean_text_for_csv(123456) == "123456"

    def test_clean_text_for_csv_mixed_characters(self) -> None:
        assert clean_text_for_csv("Hello@World&123+") == "Hello@World&123+"

    def test_clean_text_for_csv_only_special_characters(self) -> None:
        assert clean_text_for_csv("!@#$%^&*()_+={}[]:\";'<>?,./") == "@&_+"

    def test_clean_text_for_csv_unicode_characters(self) -> None:
        assert clean_text_for_csv("你好,世界") == "你好世界"

    def test_clean_text_for_csv_underscore(self) -> None:
        assert clean_text_for_csv("file_name") == "file_name"

    def test_clean_text_for_csv_dash(self) -> None:
        assert clean_text_for_csv("file-name") == "filename"

0 comments on commit 576cfc7

Please sign in to comment.