From 59604827614cec03b68e16dc3f96c394feff0418 Mon Sep 17 00:00:00 2001 From: Thomas Wood Date: Tue, 8 Nov 2022 13:08:19 +0000 Subject: [PATCH] #1 Fixing UX Bettina's fixes --- front_end/dash_callbacks/callbacks_gettext.py | 8 +++++-- front_end/dash_callbacks/callbacks_view_1.py | 9 ++----- front_end/dash_callbacks/callbacks_view_2.py | 22 +++++++++-------- front_end/dash_layout/body.py | 2 +- front_end/locale/pt/LC_MESSAGES/body.mo | 4 ++-- front_end/locale/pt/LC_MESSAGES/body.po | 11 ++++++++- front_end/utils/excel_processor.py | 2 +- .../utils/question_category_classifier.py | 24 ++++++++++++------- 8 files changed, 50 insertions(+), 32 deletions(-) diff --git a/front_end/dash_callbacks/callbacks_gettext.py b/front_end/dash_callbacks/callbacks_gettext.py index bdae360..d39d463 100644 --- a/front_end/dash_callbacks/callbacks_gettext.py +++ b/front_end/dash_callbacks/callbacks_gettext.py @@ -35,7 +35,8 @@ def add_gettext_callbacks(dash_app): Output("please_wait_message", "children"), Output("filter_by_cat", "children"), Output("harmony_graphic", "src"), - Output("dropdown-edge", "options") + Output("dropdown-edge", "options"), + Output("add_row", "children") ], inputs=[Input("select_language", "value") ] @@ -112,6 +113,8 @@ def find_if_tooltip_cookie_present(language): The AI converts the text of each question into a vector in 1600 dimensions using a neural network called GPT-2. This technique is called a *document embedding*. The distance between any two questions is measured according to the cosine similarity metric between the two vectors. Two questions which are similar in meaning, even if worded differently or in different languages, will have a high degree of similarity between their vector representations. Questions which are very different tend to be far apart in the vector space. + +You can read more about how Harmony works [in this blog post](https://harmonydata.org/how-does-harmony-work/). """), _("Hide tip"), _("AI tool built by "), @@ -122,5 +125,6 @@ def find_if_tooltip_cookie_present(language): _("Filter questions by category:"), dash_app.get_asset_url(_('harmony_flowchart_en.png')), [{"value": 1, "label": _("positive")}, {"value": -1, "label": _("negative")}, - {"value": 0, "label": _("no connection")}] + {"value": 0, "label": _("no connection")}], + _("Add row") ] diff --git a/front_end/dash_callbacks/callbacks_view_1.py b/front_end/dash_callbacks/callbacks_view_1.py index eab6209..d1d4812 100644 --- a/front_end/dash_callbacks/callbacks_view_1.py +++ b/front_end/dash_callbacks/callbacks_view_1.py @@ -98,8 +98,7 @@ def wake_up_tika_web_app_on_page_load(location): Output("please_wait_message", "style"), ] , - inputs=[Input("url", "href"), # on startup - Input("file_table", "data"), # Commented out temporarily to remove circular dependency warning + inputs=[Input("file_table", "data"), # Commented out temporarily to remove circular dependency warning Input("dataset", "value"), Input('upload-data', 'contents'), State('upload-data', 'filename'), @@ -108,7 +107,7 @@ def wake_up_tika_web_app_on_page_load(location): ], prevent_initial_call=True ) - def user_uploaded_files(href, file_table, + def user_uploaded_files(file_table, selected_datasets, all_file_contents, file_names, file_date, parsed_documents): print("file_names", file_names) @@ -243,7 +242,6 @@ def display_questions(document_content, language, add_row, old_cols, old_data): old_data.append({c['id']: '' for c in old_cols}) return [old_cols, old_data] - if language == "pt": from application import pt_lang _ = pt_lang.gettext @@ -252,9 +250,6 @@ def display_questions(document_content, language, add_row, old_cols, old_data): dfs = [] - - - for file_name, pages in document_content.items(): if file_name.endswith("pdf"): text = "\n".join(pages) diff --git a/front_end/dash_callbacks/callbacks_view_2.py b/front_end/dash_callbacks/callbacks_view_2.py index 97bf0ef..1f3cb44 100644 --- a/front_end/dash_callbacks/callbacks_view_2.py +++ b/front_end/dash_callbacks/callbacks_view_2.py @@ -33,12 +33,12 @@ def add_view_2_callbacks(dash_app): State("excerpt_table", "columns"), State("excerpt_table", "data"), State("excerpt_table", "derived_virtual_indices"), - State("excerpt_table", "derived_virtual_selected_rows"), + State("excerpt_table", "selected_rows"), State("similaritystore", "data"), ], ) def find_similarity(_, tab, columns, data, filtered_rows, selected_rows, old_similarity_store): - print ("SEL", selected_rows) + print("SEL", selected_rows) """ This function does the heavy lifting. @@ -58,9 +58,9 @@ def find_similarity(_, tab, columns, data, filtered_rows, selected_rows, old_sim # If user has selected some rows with the checkbox, use only them. if selected_rows is not None and len(selected_rows) > 0: - data=[data[r] for r in selected_rows] - elif filtered_rows is not None and len(filtered_rows) > 0: - data = [data[r] for r in filtered_rows] + data = [data[r] for r in selected_rows] + # elif filtered_rows is not None and len(filtered_rows) > 0: + # data = [data[r] for r in filtered_rows] df_questions = deserialise_questions_dataframe(columns, data) @@ -170,12 +170,14 @@ def display_value_of_edge(selection, elements): State("excerpt_table", "columns"), State("excerpt_table", "data"), State("excerpt_table", "derived_virtual_indices"), - State("excerpt_table", "derived_virtual_selected_rows"), + State("excerpt_table", "selected_rows"), Input("manual_edges", "data"), Input("select_language", "value") ], + prevent_initial_call=True ) - def display_similarity_graph(pickled, sensitivity, categories_to_display, columns, data,filtered_rows, selected_rows, manual_edges_serialisable, language): + def display_similarity_graph(pickled, sensitivity, categories_to_display, columns, data, filtered_rows, + selected_rows, manual_edges_serialisable, language): if language == "pt": from application import pt_lang _ = pt_lang.gettext @@ -184,9 +186,9 @@ def display_similarity_graph(pickled, sensitivity, categories_to_display, column # If user has selected some rows with the checkbox, use only them. if selected_rows is not None and len(selected_rows) > 0: - data=[data[r] for r in selected_rows] - elif filtered_rows is not None and len(filtered_rows) > 0: - data = [data[r] for r in filtered_rows] + data = [data[r] for r in selected_rows] + # elif filtered_rows is not None and len(filtered_rows) > 0: + # data = [data[r] for r in filtered_rows] matches = pkl.loads(codecs.decode(pickled.encode(), "base64")) diff --git a/front_end/dash_layout/body.py b/front_end/dash_layout/body.py index 7126cb6..a0029f7 100644 --- a/front_end/dash_layout/body.py +++ b/front_end/dash_layout/body.py @@ -119,7 +119,7 @@ dcc.Dropdown(id="filter_questions", options=[], value=None, multi=False, style={'float': 'left', 'width': '50%', "margin-left": "20px"})], style={"display": "flex", "width": "100%"}), - html.Button("+", id="add_row"), + html.Button(id="add_row"), dcc.Loading([ dash_table.DataTable( diff --git a/front_end/locale/pt/LC_MESSAGES/body.mo b/front_end/locale/pt/LC_MESSAGES/body.mo index 4d1e7d9..e46e50c 100644 --- a/front_end/locale/pt/LC_MESSAGES/body.mo +++ b/front_end/locale/pt/LC_MESSAGES/body.mo @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:aa5003edd8d4f3b858b2a6a9c40a0dcfda9ecb88f6ff07ee8aa97bd234413db0 -size 9293 +oid sha256:59c9c2c4a3e4f0b9720b94c3b272d5898bb1e1aaa726349367492752801cf68d +size 9589 diff --git a/front_end/locale/pt/LC_MESSAGES/body.po b/front_end/locale/pt/LC_MESSAGES/body.po index b260473..74aa8ba 100644 --- a/front_end/locale/pt/LC_MESSAGES/body.po +++ b/front_end/locale/pt/LC_MESSAGES/body.po @@ -8,7 +8,7 @@ msgstr "" "Project-Id-Version: \n" "Report-Msgid-Bugs-To: \n" "POT-Creation-Date: 2022-10-28 15:14-0300\n" -"PO-Revision-Date: 2022-11-02 16:46+0000\n" +"PO-Revision-Date: 2022-11-08 12:02+0000\n" "Last-Translator: \n" "Language-Team: \n" "Language: pt_BR\n" @@ -188,6 +188,9 @@ msgid "" "differently or in different languages, will have a high degree of similarity between their " "vector representations. Questions which are very different tend to be far apart in the vector " "space.\n" +"\n" +"You can read more about how Harmony works [in this blog post](https://harmonydata.org/how-" +"does-harmony-work/).\n" msgstr "" "## Uso do gráfico\n" "\n" @@ -213,6 +216,9 @@ msgstr "" "formuladas de forma diferente ou em idiomas diferentes, terão um alto grau de similaridade " "entre suas representações vetoriais. Perguntas muito diferentes, por outro lado, tendem a " "estar distanciadas no espaço vetorial.\n" +"\n" +"Você pode ler mais sobre como o Harmony funciona [nesta postagem do blog](https://harmonydata." +"org/how-does-harmony-work/).\n" #: body.py:258 msgid "➋ Check the matches" @@ -327,3 +333,6 @@ msgstr "Categoria da Pergunta" msgid "File" msgstr "Arquivo" + +msgid "Add row" +msgstr "Nova linha" diff --git a/front_end/utils/excel_processor.py b/front_end/utils/excel_processor.py index 779f3b8..6f9e059 100644 --- a/front_end/utils/excel_processor.py +++ b/front_end/utils/excel_processor.py @@ -49,7 +49,7 @@ def convert_jsonified_excel_to_questions_dataframe(excel_in_json_format: str) -> # Check if header row present, in which case remove it rows_to_delete = [] for i in range(len(df_questions)): - if df_questions.question.iloc[i] is None or df_questions.question.iloc[i].lower() in ["question", "text", + if df_questions.question.iloc[i] is None or type(df_questions.question.iloc[i]) is not str or df_questions.question.iloc[i].lower() in ["question", "text", "pergunta", "texto"]: rows_to_delete.append(i) diff --git a/front_end/utils/question_category_classifier.py b/front_end/utils/question_category_classifier.py index 2a31bff..f970295 100644 --- a/front_end/utils/question_category_classifier.py +++ b/front_end/utils/question_category_classifier.py @@ -1,6 +1,9 @@ import bz2 import pickle as pkl import re +import traceback +import sys + from utils.language_utils import get_clean_language_code from utils.pt_en_dict import pt_en_map @@ -35,11 +38,16 @@ def categorise_questions(self, df): parsed_questions = df.apply(lambda r: parse_questions(get_spacy_model(get_clean_language_code(r.language)), r.question), axis=1) - categories = self.model.predict(parsed_questions) - # Override if empty strings - for i in range(len(df)): - lc = df.question.iloc[i].strip().lower() - if lc == "": - categories[i] = "" - - df["question_category"] = categories \ No newline at end of file + try: + categories = self.model.predict(parsed_questions) + # Override if empty strings + for i in range(len(df)): + lc = df.question.iloc[i].strip().lower() + if lc == "": + categories[i] = "" + + df["question_category"] = categories + except: + print ("Exception categorising questions") + traceback.print_exception(*sys.exc_info()) + df["question_category"] = "N/A"