Skip to content

Commit

Permalink
#1 Fixing UX Bettina's fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
woodthom2 committed Nov 8, 2022
1 parent e20d89c commit 5960482
Show file tree
Hide file tree
Showing 8 changed files with 50 additions and 32 deletions.
8 changes: 6 additions & 2 deletions front_end/dash_callbacks/callbacks_gettext.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,8 @@ def add_gettext_callbacks(dash_app):
Output("please_wait_message", "children"),
Output("filter_by_cat", "children"),
Output("harmony_graphic", "src"),
Output("dropdown-edge", "options")
Output("dropdown-edge", "options"),
Output("add_row", "children")
],
inputs=[Input("select_language", "value")
]
Expand Down Expand Up @@ -112,6 +113,8 @@ def find_if_tooltip_cookie_present(language):
The AI converts the text of each question into a vector in 1600 dimensions using a neural network called GPT-2. This technique is called a *document embedding*.
The distance between any two questions is measured according to the cosine similarity metric between the two vectors. Two questions which are similar in meaning, even if worded differently or in different languages, will have a high degree of similarity between their vector representations. Questions which are very different tend to be far apart in the vector space.
You can read more about how Harmony works [in this blog post](https://harmonydata.org/how-does-harmony-work/).
"""),
_("Hide tip"),
_("AI tool built by "),
Expand All @@ -122,5 +125,6 @@ def find_if_tooltip_cookie_present(language):
_("Filter questions by category:"),
dash_app.get_asset_url(_('harmony_flowchart_en.png')),
[{"value": 1, "label": _("positive")}, {"value": -1, "label": _("negative")},
{"value": 0, "label": _("no connection")}]
{"value": 0, "label": _("no connection")}],
_("Add row")
]
9 changes: 2 additions & 7 deletions front_end/dash_callbacks/callbacks_view_1.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,8 +98,7 @@ def wake_up_tika_web_app_on_page_load(location):
Output("please_wait_message", "style"),
]
,
inputs=[Input("url", "href"), # on startup
Input("file_table", "data"), # Commented out temporarily to remove circular dependency warning
inputs=[Input("file_table", "data"), # Commented out temporarily to remove circular dependency warning
Input("dataset", "value"),
Input('upload-data', 'contents'),
State('upload-data', 'filename'),
Expand All @@ -108,7 +107,7 @@ def wake_up_tika_web_app_on_page_load(location):
],
prevent_initial_call=True
)
def user_uploaded_files(href, file_table,
def user_uploaded_files(file_table,
selected_datasets, all_file_contents, file_names, file_date, parsed_documents):

print("file_names", file_names)
Expand Down Expand Up @@ -243,7 +242,6 @@ def display_questions(document_content, language, add_row, old_cols, old_data):
old_data.append({c['id']: '' for c in old_cols})
return [old_cols, old_data]


if language == "pt":
from application import pt_lang
_ = pt_lang.gettext
Expand All @@ -252,9 +250,6 @@ def display_questions(document_content, language, add_row, old_cols, old_data):

dfs = []




for file_name, pages in document_content.items():
if file_name.endswith("pdf"):
text = "\n".join(pages)
Expand Down
22 changes: 12 additions & 10 deletions front_end/dash_callbacks/callbacks_view_2.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,12 +33,12 @@ def add_view_2_callbacks(dash_app):
State("excerpt_table", "columns"),
State("excerpt_table", "data"),
State("excerpt_table", "derived_virtual_indices"),
State("excerpt_table", "derived_virtual_selected_rows"),
State("excerpt_table", "selected_rows"),
State("similaritystore", "data"),
],
)
def find_similarity(_, tab, columns, data, filtered_rows, selected_rows, old_similarity_store):
print ("SEL", selected_rows)
print("SEL", selected_rows)
"""
This function does the heavy lifting.
Expand All @@ -58,9 +58,9 @@ def find_similarity(_, tab, columns, data, filtered_rows, selected_rows, old_sim

# If user has selected some rows with the checkbox, use only them.
if selected_rows is not None and len(selected_rows) > 0:
data=[data[r] for r in selected_rows]
elif filtered_rows is not None and len(filtered_rows) > 0:
data = [data[r] for r in filtered_rows]
data = [data[r] for r in selected_rows]
# elif filtered_rows is not None and len(filtered_rows) > 0:
# data = [data[r] for r in filtered_rows]

df_questions = deserialise_questions_dataframe(columns, data)

Expand Down Expand Up @@ -170,12 +170,14 @@ def display_value_of_edge(selection, elements):
State("excerpt_table", "columns"),
State("excerpt_table", "data"),
State("excerpt_table", "derived_virtual_indices"),
State("excerpt_table", "derived_virtual_selected_rows"),
State("excerpt_table", "selected_rows"),
Input("manual_edges", "data"),
Input("select_language", "value")
],
prevent_initial_call=True
)
def display_similarity_graph(pickled, sensitivity, categories_to_display, columns, data,filtered_rows, selected_rows, manual_edges_serialisable, language):
def display_similarity_graph(pickled, sensitivity, categories_to_display, columns, data, filtered_rows,
selected_rows, manual_edges_serialisable, language):
if language == "pt":
from application import pt_lang
_ = pt_lang.gettext
Expand All @@ -184,9 +186,9 @@ def display_similarity_graph(pickled, sensitivity, categories_to_display, column

# If user has selected some rows with the checkbox, use only them.
if selected_rows is not None and len(selected_rows) > 0:
data=[data[r] for r in selected_rows]
elif filtered_rows is not None and len(filtered_rows) > 0:
data = [data[r] for r in filtered_rows]
data = [data[r] for r in selected_rows]
# elif filtered_rows is not None and len(filtered_rows) > 0:
# data = [data[r] for r in filtered_rows]

matches = pkl.loads(codecs.decode(pickled.encode(), "base64"))

Expand Down
2 changes: 1 addition & 1 deletion front_end/dash_layout/body.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@
dcc.Dropdown(id="filter_questions", options=[], value=None, multi=False,
style={'float': 'left', 'width': '50%', "margin-left": "20px"})],
style={"display": "flex", "width": "100%"}),
html.Button("+", id="add_row"),
html.Button(id="add_row"),

dcc.Loading([
dash_table.DataTable(
Expand Down
4 changes: 2 additions & 2 deletions front_end/locale/pt/LC_MESSAGES/body.mo
Git LFS file not shown
11 changes: 10 additions & 1 deletion front_end/locale/pt/LC_MESSAGES/body.po
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ msgstr ""
"Project-Id-Version: \n"
"Report-Msgid-Bugs-To: \n"
"POT-Creation-Date: 2022-10-28 15:14-0300\n"
"PO-Revision-Date: 2022-11-02 16:46+0000\n"
"PO-Revision-Date: 2022-11-08 12:02+0000\n"
"Last-Translator: \n"
"Language-Team: \n"
"Language: pt_BR\n"
Expand Down Expand Up @@ -188,6 +188,9 @@ msgid ""
"differently or in different languages, will have a high degree of similarity between their "
"vector representations. Questions which are very different tend to be far apart in the vector "
"space.\n"
"\n"
"You can read more about how Harmony works [in this blog post](https://harmonydata.org/how-"
"does-harmony-work/).\n"
msgstr ""
"## Uso do gráfico\n"
"\n"
Expand All @@ -213,6 +216,9 @@ msgstr ""
"formuladas de forma diferente ou em idiomas diferentes, terão um alto grau de similaridade "
"entre suas representações vetoriais. Perguntas muito diferentes, por outro lado, tendem a "
"estar distanciadas no espaço vetorial.\n"
"\n"
"Você pode ler mais sobre como o Harmony funciona [nesta postagem do blog](https://harmonydata."
"org/how-does-harmony-work/).\n"

#: body.py:258
msgid "➋ Check the matches"
Expand Down Expand Up @@ -327,3 +333,6 @@ msgstr "Categoria da Pergunta"

msgid "File"
msgstr "Arquivo"

msgid "Add row"
msgstr "Nova linha"
2 changes: 1 addition & 1 deletion front_end/utils/excel_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def convert_jsonified_excel_to_questions_dataframe(excel_in_json_format: str) ->
# Check if header row present, in which case remove it
rows_to_delete = []
for i in range(len(df_questions)):
if df_questions.question.iloc[i] is None or df_questions.question.iloc[i].lower() in ["question", "text",
if df_questions.question.iloc[i] is None or type(df_questions.question.iloc[i]) is not str or df_questions.question.iloc[i].lower() in ["question", "text",
"pergunta", "texto"]:
rows_to_delete.append(i)

Expand Down
24 changes: 16 additions & 8 deletions front_end/utils/question_category_classifier.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
import bz2
import pickle as pkl
import re
import traceback
import sys


from utils.language_utils import get_clean_language_code
from utils.pt_en_dict import pt_en_map
Expand Down Expand Up @@ -35,11 +38,16 @@ def categorise_questions(self, df):

parsed_questions = df.apply(lambda r: parse_questions(get_spacy_model(get_clean_language_code(r.language)), r.question), axis=1)

categories = self.model.predict(parsed_questions)
# Override if empty strings
for i in range(len(df)):
lc = df.question.iloc[i].strip().lower()
if lc == "":
categories[i] = ""

df["question_category"] = categories
try:
categories = self.model.predict(parsed_questions)
# Override if empty strings
for i in range(len(df)):
lc = df.question.iloc[i].strip().lower()
if lc == "":
categories[i] = ""

df["question_category"] = categories
except:
print ("Exception categorising questions")
traceback.print_exception(*sys.exc_info())
df["question_category"] = "N/A"

0 comments on commit 5960482

Please sign in to comment.