WIP: Integrate Document Categorization to Frequency Analysis #68

Open: wants to merge 72 commits into base branch master.

Commits (72), oldest to newest:
1e5c149  created the spring log for the documentation part of our tasks (solisa986, Mar 30, 2021)
3f5ac7b  finished the spring log for issue#51 (solisa986, Mar 30, 2021)
2a5a4e4  Writing word frequencies to csv (Mar 31, 2021)
3233a42  Merge branch 'issue#51' of github.com:Allegheny-Ethical-CS/GatorMiner… (Mar 31, 2021)
225ce2f  Putting different run's results into separate files (Mar 31, 2021)
446215a  Update textmining.py (hadenwIV, Mar 31, 2021)
b638b98  Categorization of words (Mar 31, 2021)
902e704  Additional elaboration on functions of tasks completed (Mar 31, 2021)
a065da8  Fixed name spelling (Mar 31, 2021)
c208849  Added docstrings (Mar 31, 2021)
9f9733b  moving all of our code files to a folder called categorize_words (donizk, Apr 1, 2021)
25abe2e  created interface file, began implementation for interface (donizk, Apr 1, 2021)
b816c40  added notes (as comments) to myself onto the __main__.py file to keep… (donizk, Apr 1, 2021)
60c36e2  added some test cases (solisa986, Apr 5, 2021)
9455433  classifying categories of files inputted (Apr 5, 2021)
1b63ffd  Merge branch 'issue#51' of github.com:Allegheny-Ethical-CS/GatorMiner… (Apr 5, 2021)
f994779  Sorting assignment categories (Apr 5, 2021)
48f9fae  finished documenting sprint 2 log and moved the categories_words.py file (solisa986, Apr 5, 2021)
013b9db  formatting (solisa986, Apr 5, 2021)
8b5347d  Merge branch 'issue#51' of https://github.com/Allegheny-Ethical-CS/Ga… (hadenwIV, Apr 6, 2021)
b95ebee  Revert "Merge branch 'issue#51' of https://github.com/Allegheny-Ethic… (enpuyou, Apr 6, 2021)
2e79165  Word categorization program (Apr 7, 2021)
56c8689  Word categorization using training data and Scikit (Apr 7, 2021)
ccd2cba  Start of the interface pipeline (Apr 7, 2021)
85c9ce6  Beginning of interface page to for category frequency analysis (Apr 7, 2021)
5dec4f7  Removed category classification model training data (Apr 7, 2021)
5a9168e  Merge branch 'master' into issue#51 (enpuyou, Apr 10, 2021)
eb71d6e  Development on categorization (Apr 14, 2021)
4594407  Merge branch 'issue#51' of github.com:Allegheny-Ethical-CS/GatorMiner… (Apr 14, 2021)
99e2dd1  Removed sample_md_reflections training data (Apr 14, 2021)
c4d689b  Readded existing sample_md_reflections (Apr 15, 2021)
598e3c1  Restored original sample_md_reflections (Apr 15, 2021)
3f47837  fixing (favourojo, Apr 15, 2021)
cee57d9  Merge branch 'issue#51' of github.com:Allegheny-Ethical-CS/GatorMiner… (favourojo, Apr 15, 2021)
16bdf5d  fixed (favourojo, Apr 15, 2021)
8ab4693  Category data classification of user input (Apr 19, 2021)
4ba42cc  System for category classification of student responses (Apr 19, 2021)
4f68b49  Replacing Pipfile lock (Apr 19, 2021)
46063bb  Start of visualization interface (Apr 20, 2021)
c9b67b4  Visualization interface with graph of overall categories (Apr 20, 2021)
ba92938  Removed print statements (Apr 20, 2021)
763f72f  finished documentation (favourojo, Apr 21, 2021)
114b815  finished (favourojo, Apr 21, 2021)
7cefb9a  Update test_analyzer.py (hadenwIV, Apr 21, 2021)
19778ba  Merge branch 'issue#51' of https://github.com/Allegheny-Ethical-CS/Ga… (hadenwIV, Apr 21, 2021)
f6f87a7  Delete test_word_cloud.py (hewittk, Apr 21, 2021)
afcd302  Removed irrelevant additions made (Apr 21, 2021)
eccccde  Merge branch 'issue#51' of github.com:Allegheny-Ethical-CS/GatorMiner… (Apr 21, 2021)
7d5eb8a  Removed unneccessary line in category_frequency (Apr 21, 2021)
573082f  Fixed test_category_frequency test cases (Apr 21, 2021)
1bcc4a1  Colored barplot broken down by category (Apr 26, 2021)
e71adef  New pipfile.lock copied from master branch with dependencies installed (Apr 26, 2021)
ddba12b  Installed importlib.metadata to pipfile.lock (Apr 26, 2021)
dce826b  Update pipfile to the master branch (enpuyou, Apr 26, 2021)
fca0da9  Added to match master branch (Apr 26, 2021)
16992bd  Merge branch 'issue#51' of github.com:Allegheny-Ethical-CS/GatorMiner… (Apr 26, 2021)
f6373a6  Deleted sprint log (Apr 26, 2021)
c337883  Fixed visualization flake8 errors (Apr 26, 2021)
c8886d2  Reset textmining (Apr 26, 2021)
d2afa09  Fixed flake8 issues (Apr 26, 2021)
45efa7f  Deleted no longer used word cloud generator (Apr 26, 2021)
6d4059d  Black reformatting (Apr 26, 2021)
4c8e35e  Specification about bar plot type in docstring (Apr 26, 2021)
3b39e54  Fix flake8 line length errors (Apr 27, 2021)
f36f0e1  Fixed flake8 errors in test_analyzer (Apr 27, 2021)
32cc38b  Removed plots_per_row argument (Apr 27, 2021)
2fa124c  Remove non used dictionaries (Apr 27, 2021)
90949eb  changes made to git-standup (solisa986, Apr 28, 2021)
6332292  Merge branch 'master' into issue#51 (favourojo, Apr 28, 2021)
4748eb8  Merge branch 'master' into issue#51 (corlettim, Apr 29, 2021)
6184a77  Remove git standup folder (May 3, 2021)
f74def8  Fix flake8 spacing errors (May 3, 2021)
1 change: 1 addition & 0 deletions git-standup
Submodule git-standup added at 5a707b
45 changes: 43 additions & 2 deletions src/analyzer.py
@@ -1,14 +1,21 @@
"""Text Preprocessing"""
from collections import Counter

import pickle
from . import markdown as md

from textblob import TextBlob
import pandas as pd

import re
import string
from typing import List, Tuple
import spacy
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import nltk

from . import markdown as md
nltk.download("wordnet")
nltk.download("stopwords")

PARSER = spacy.load("en_core_web_sm")

@@ -55,7 +62,7 @@ def tokenize(normalized_text: str) -> List[str]:


def compute_frequency(
token_lst: List[str], amount=50
token_lst: List[str], amount=50
) -> List[Tuple[str, int]]: # noqa: E501
"""Compute word frequency from a list of tokens"""
word_freq = Counter(token_lst)
@@ -68,6 +75,40 @@ def word_frequency(text: str, amount=50) -> List[Tuple[str, int]]:
return compute_frequency(tokenize(normalize(text)), amount)


def category_frequency(responses: List[str]) -> dict:
    """A pipeline to normalize, tokenize, and
    find category frequency of raw text"""

    for i in range(len(responses)):
        responses[i] = normalize(responses[i])
    if "" in responses:
        responses.remove("")

    with open("text_classifier", "rb") as training_model:
        model = pickle.load(training_model)

    with open("vectorizer", "rb") as training_vectorizer:
        vectorizer = pickle.load(training_vectorizer)

    category_dict = {
        "Ethics": 0,
        "Professional Skills": 0,
        "Technical Skills": 0
    }

    for element in responses:
        element = vectorizer.transform([element]).toarray()
        label = model.predict(element)[0]
        if label == 0:
            category_dict["Ethics"] += 1
        elif label == 1:
            category_dict["Professional Skills"] += 1
        elif label == 2:
            category_dict["Technical Skills"] += 1

    return category_dict
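The PR ships `text_classifier` and `vectorizer` as opaque pickled binaries, so the training step is not in the diff. Below is a minimal sketch of how such a pair could be produced. The label scheme (0 = Ethics, 1 = Professional Skills, 2 = Technical Skills) and the output file names come from `category_frequency` itself; the choice of estimator (`MultinomialNB`) and the toy training texts are assumptions.

```python
import pickle

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# Hypothetical labeled reflections, one per category in the scheme
# category_frequency expects: 0=Ethics, 1=Professional Skills, 2=Technical Skills.
texts = [
    "we discussed the ethical impact of releasing user data",
    "I practiced communicating with my team and resolving conflicts",
    "I installed Python packages and debugged the test suite",
]
labels = [0, 1, 2]

# Fit the bag-of-words vectorizer and a simple classifier on top of it.
vectorizer = CountVectorizer()
features = vectorizer.fit_transform(texts)

model = MultinomialNB()
model.fit(features, labels)

# Serialize both artifacts under the file names category_frequency loads.
with open("text_classifier", "wb") as f:
    pickle.dump(model, f)
with open("vectorizer", "wb") as f:
    pickle.dump(vectorizer, f)
```

Keeping the vectorizer and the model as a matched pair matters: `category_frequency` must transform new responses with the exact vocabulary the classifier was trained on.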


def dir_frequency(dirname: str, amount=50) -> List[Tuple[str, int]]:
"""A pipeline to normalize, tokenize, and
find word frequency of a directory of raw input file"""
18 changes: 18 additions & 0 deletions src/visualization.py
@@ -62,6 +62,24 @@ def facet_freq_barplot(
return grid


def facet_category_barplot(category_df):
    """facet colored bar plot for category frequencies"""

    base = (
        alt.Chart(category_df)
        .mark_bar()
        .encode(
            x="Student:N",
            y="Frequency:Q",
            color="Category:N",
            order=alt.Order("Category", sort="descending")
        )
        .properties(width=570,)
    ).interactive()

    return base
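`facet_category_barplot` encodes `Student`, `Frequency`, and `Category` columns, so it expects a long-format frame with one row per student-and-category pair. A minimal sketch of such an input (the student names and counts here are made up):

```python
import pandas as pd

# Long-format frame matching the columns facet_category_barplot encodes:
# one row per (student, category) pair with that pair's response count.
simple_df = pd.DataFrame({
    "Student": ["amara", "amara", "amara", "jo", "jo", "jo"],
    "Category": ["Ethics", "Professional Skills", "Technical Skills"] * 2,
    "Frequency": [1, 3, 2, 0, 1, 4],
})
```

Passing a frame shaped like this to `facet_category_barplot` yields one colored bar segment per category, stacked per student.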


def facet_senti_barplot(senti_df, options, column_name, plots_per_row=3):
"""facet bar plot for word frequencies"""
base = (
65 changes: 61 additions & 4 deletions streamlit_web.py
@@ -85,7 +85,12 @@ def main():
interactive()
success_msg.empty()



def readme():

def landing_src():

"""function to load and configure readme source"""

with open("docs/LANDING_PAGE.md") as landing_file:
@@ -99,6 +104,7 @@ def landing_src():

st.markdown(landing_src, unsafe_allow_html=True)


def landing_pg():
"""landing page"""
landing = st.sidebar.selectbox("Welcome", ["Home", "Interactive"])
@@ -169,9 +175,14 @@ def load_model(name):
@st.cache(allow_output_mutation=True, suppress_st_warning=True)
def import_data(data_retreive_method, paths):
"""pipeline to import data from local or aws"""

if data_retreive_method == "Local file system":
json_lst = []

json_lst = []
global main_md_dict
if data_retreive_method == "Path input":

try:
for path in paths:
json_lst.append(md.collect_md(path))
@@ -229,7 +240,7 @@ def df_preprocess(df):
def frequency():
"""main function for frequency analysis"""
freq_type = st.sidebar.selectbox(
"Type of frequency analysis", ["Overall", "Student", "Question"]
"Type of frequency analysis", ["Overall", "Student", "Question", "Category"]
)
if freq_type == "Overall":
freq_range = st.sidebar.slider(
@@ -256,10 +267,15 @@ def frequency():
f"Most frequent words in individual questions in **{assign_text}**"
)
question_freq(freq_range)
elif freq_type == "Category":
st.header(
f"Frequency of responses focused on ethics, technical skills, and professional skills in **{assign_text}**"
)
category_freq()


def overall_freq(freq_range):
"""page fore overall word frequency"""
"""page for overall word frequency"""
plots_range = st.sidebar.slider(
"Select the number of plots per row", 1, 5, value=3
)
@@ -282,7 +298,7 @@ def overall_freq(freq_range):
freq_df, assignments, "assignments", plots_per_row=plots_range
)
)

freq_df.to_csv('frequency_archives/' + str(item) + '.csv')

def student_freq(freq_range):
"""page for individual student's word frequency"""
@@ -326,7 +342,6 @@ def student_freq(freq_range):
)
)


def question_freq(freq_range):
"""page for individual question's word frequency"""
# drop columns with all na
@@ -373,6 +388,48 @@ def question_freq(freq_range):
)
)

def category_freq():
    """page for word category frequency"""

    questions_end = len(main_df.columns) - 3
    question_df = main_df[main_df.columns[1:questions_end]]
    category_df = pd.DataFrame(
        columns=["Ethics", "Professional Skills", "Technical Skills", "Student"]
    )
    simple_df = pd.DataFrame(columns=["Student", "Category"])
    user_responses = []
    categories = {}
    row_number = 0
    id = 0
    ordered_student_ids = []
    ordered_categories = []
    ordered_frequencies = []

    for i, row in question_df.iterrows():
        # add each user's responses to a list to pass in to dataframe
        for col in range(len(question_df.columns)):
            if col == 0:  # append student ID
                id = (str(main_df.iloc[row_number]["reflection by"]))
            else:  # append categories of response
                response = row[col]
                user_responses.append(response)
        row_number += 1
        categories = az.category_frequency(user_responses)
        for element in categories:
            ordered_student_ids.append(id)
            ordered_categories.append(element)
            ordered_frequencies.append(categories[element])
        categories["Student"] = id
        category_df = category_df.append(categories, ignore_index=True)
        user_responses.clear()
    simple_df["Student"] = ordered_student_ids
    simple_df["Category"] = ordered_categories
    simple_df["Frequency"] = ordered_frequencies

    st.altair_chart(
        vis.facet_category_barplot(
            simple_df,
        )
    )
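The bookkeeping in `category_freq` (three parallel `ordered_*` lists plus a manual row counter) can be sketched more compactly as a single long-format construction once the per-student counts exist. The helper name and the sample counts below are hypothetical; only the column names and the shape of `az.category_frequency`'s output come from the code above.

```python
import pandas as pd

def build_category_rows(per_student_counts):
    """Flatten {student_id: {category: count}} into the long-format
    rows that category_freq accumulates by hand."""
    rows = [
        {"Student": student, "Category": category, "Frequency": count}
        for student, counts in per_student_counts.items()
        for category, count in counts.items()
    ]
    return pd.DataFrame(rows, columns=["Student", "Category", "Frequency"])

# Hypothetical per-student output of az.category_frequency:
counts = {
    "amara": {"Ethics": 1, "Professional Skills": 2, "Technical Skills": 0},
    "jo": {"Ethics": 0, "Professional Skills": 1, "Technical Skills": 3},
}
simple_df = build_category_rows(counts)
```

Building the rows in one pass also avoids the risk of the three parallel lists drifting out of sync.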


def sentiment():
"""main function for sentiment analysis"""
20 changes: 20 additions & 0 deletions tests/test_analyzer.py
@@ -161,6 +161,26 @@ def test_tfidf():
assert vector is not None



def test_category_frequency():
    "test that professional skills, technical skills, and ethics are properly \
    classified "
    text = ["One professional skill that I practiced was communicating \
    independently with a team. I did this by atttending all meetings, using \
    Zenhub, and including everyone in the major decision making process. I \
    also practiced the professional skill of resolving conflicts by talking \
    through the conflict with my group members, coming to a resolution, and \
    apologizing for the mishap that I caused."]
    output = az.category_frequency(text)
    print(output)
    assert output["Professional Skills"] == 1

    text = ["One technical skill that I practiced was installing Python \
    packages and integrating these packages with my code."]
    output = az.category_frequency(text)
    print(output)
    assert output["Technical Skills"] == 1

def test_top_polarized_word():
"""Tests if the positive/negative words columns are created"""
df = pd.DataFrame(columns=[cts.TOKEN, cts.POSITIVE, cts.NEGATIVE])
Binary file added text_classifier
Binary file added vectorizer