From 7378f4e394099461f843840c0ddb8baecffe8fb2 Mon Sep 17 00:00:00 2001 From: Shikhar Gupta <53982388+SG115@users.noreply.github.com> Date: Sun, 15 May 2022 19:04:51 +0530 Subject: [PATCH] Add files via upload --- app.py | 88 ++++ helper.py | 587 +++++++++++++++++++++++++ preprocessor.py | 144 +++++++ report.py | 96 +++++ requirements.txt | 13 + stop_hinglish.txt | 1056 +++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 1984 insertions(+) create mode 100644 app.py create mode 100644 helper.py create mode 100644 preprocessor.py create mode 100644 report.py create mode 100644 requirements.txt create mode 100644 stop_hinglish.txt diff --git a/app.py b/app.py new file mode 100644 index 0000000..e4beeb6 --- /dev/null +++ b/app.py @@ -0,0 +1,88 @@ +from time import sleep +import streamlit as st +import preprocessor +import os +# os.system('cls||clear') +import helper +import base64 +from fpdf import FPDF +import matplotlib.pyplot as plt +import report +from tempfile import NamedTemporaryFile +# from sklearn.datasets import load_iris +st.title("Whatsapp Chat Analyzer") + +## adding bootstrap +st.markdown("""""", unsafe_allow_html=True) + + + +uploaded_file = st.sidebar.file_uploader("Choose a file",type=['txt']) +if uploaded_file is not None: + raw_text = str(uploaded_file.read(),"utf-8") + df = preprocessor.preprocess(raw_text) + # st.text(raw_text) + + #fetch unique users + + user_list = df['Author'].unique().tolist() + if None in user_list: + user_list.remove(None) + user_list.sort() + user_list.insert(0, "Overall") + selected_user = st.sidebar.selectbox("Show analysis wrt", user_list) + report_pdf = report.create_report(df) + if st.sidebar.button("Show Analysis"): + + # TOP STATS CARD + temp_html,links = helper.topstats_table(df,selected_user,0) + # st.write(df) + # LINKS CARD + with st.expander(f"Show links sent by {selected_user}"): + helper.links_table(links,selected_user) + + # EMOJI CARD + with st.expander(f"Show Emoji Analysis of {selected_user}"): + helper.emoji_table(df,selected_user,0) + + # Common Words Card + with st.expander(f"Show Commonly used words by {selected_user}"): + helper.most_common_words(df,selected_user,0) + + # Monthly timeline + with st.expander(f"Show Monthly Timeline Stats of {selected_user}"): + helper.monthly_timeline(df,selected_user) + + # Week Activity Map + with st.expander(f"Show Week Activity of {selected_user}"): + helper.week_activity_map(df,selected_user,0) + + # Daily timeline + with st.expander(f"Show Daily Timeline Stats of {selected_user}"): + helper.daily_timeline(df,selected_user,0) + + # Month Activity Map + # with st.expander(f"Show Monthy Activity Map of {selected_user}"): + # helper.month_activity_map(df,selected_user) + # # Activity heatmap + # with st.expander(f"Show Activity Heat Map of {selected_user}"): + # helper.activity_heatmap(df,selected_user) + + + # with st.expander(f"Sentiment Report"): + # helper.find_sentimentreport(df) + + + # Group Icon Changed + with st.expander(f"Show some facts"): + helper.group_icon_changed(df) + + + + + + # with open("Download.csv",encoding="utf8") as f: + # st.download_button('Download Chat Report', f) # Defaults to 'text/plain' + + + diff --git a/helper.py b/helper.py new file mode 100644 index 0000000..96414a6 --- /dev/null +++ b/helper.py @@ -0,0 +1,587 @@ +import emoji +import streamlit as st +import re +import pandas as pd +from collections import Counter +from matplotlib import pyplot as plt +from matplotlib.font_manager import FontProperties +from plotnine import * +import itertools 
+import altair as alt +import datetime +from datetime import date +import calendar +import seaborn as sns +import json +from json import dumps +import numpy as np +import pdfkit +from altair_saver import save +from fpdf import FPDF +import base64 +# from fpdf import FPDF + +# from wordcloud import WordCloud + +# import streamlit_wordcloud as WordCloud + +# Load Apple Color Emoji font +prop = FontProperties(fname='/System/Library/Fonts/Apple Color Emoji.ttc') + +def findDay(date): + date = str(date) + year, month, day = (int(i) for i in date.split('-')) + born = datetime.date(year, month, day) + return born.strftime("%A") + + +def fetch_stats(selected_user, df): + + if selected_user != 'Overall': + df = df[df['Author'] == selected_user] + + num_messages = df.shape[0] + words = [] + for message in df['Message']: + words.extend(message.split()) + + num_media_msgs = df[df['Message'] == ' '].shape[0] + + links = [] + URLPATTERN = re.compile(r'https:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,}') + for message in df['Message']: + link = URLPATTERN.search(message) + if link is None: + continue + links.append(link.group()) + # for message in df['Message']: + # links.extend(URLExtract.find_urls(message)) + return num_messages, len(words), num_media_msgs, links + +def links_table(links,author): + st.write("Total links shared: ",len(links)) + if len(links): + st.markdown(f""" + +
Links shared by {author}
+ + """,unsafe_allow_html=True) + + for link in links: + if link[0]=='w': + link = "https://" + link + st.markdown(f""" +
+ <a href="{link}" target="_blank">{link}</a>
+ """,unsafe_allow_html=True) + + +def topstats_table(df,author,report): + total_messages, total_words,total_media,links = fetch_stats(author, df) + total_links = len(links) + + temp_html = f""" +
Top Stats of {author}
<table class="table">
  <tr><th>Total Messages</th><th>Total Words</th><th>Total Media</th><th>Total Links</th></tr>
  <tr><td>{total_messages}</td><td>{total_words}</td><td>{total_media}</td><td>{total_links}</td></tr>
</table>
+
+ """ + st.markdown(temp_html, unsafe_allow_html=True) + temp_html2 = f""" +
+ +
+ """ + return temp_html2,links + +# def create_word_cloud(selected_user,df): + +# f = open('stop_hinglish.txt', 'r') +# stop_words = f.read() + +# if selected_user != 'Overall': +# df = df[df['Author'] == selected_user] + +# temp = df[df['Author'] != 'group_notification'] +# temp = temp[temp['Message'] != ' \n'] + +# def remove_stop_words(message): +# y = [] +# for word in message.lower().split(): +# if word not in stop_words: +# y.append(word) +# return " ".join(y) + +# wc = WordCloud(width=500,height=500,min_font_size=10,background_color='white') +# temp['Message'] = temp['Message'].apply(remove_stop_words) +# df_wc = wc.generate(temp['Message'].str.cat(sep=" ")) +# return df_wc + + +def monthly_timeline(df,selected_user): + + if selected_user != 'Overall': + df = df[df['Author'] == selected_user] + + timeline = df.groupby(['Year', 'Month_num', 'Month']).count()['Message'].reset_index() + + time = [] + for i in range(timeline.shape[0]): + time.append(timeline['Month'][i] + "-" + str(timeline['Year'][i])) + + timeline['Time'] = time + # st.write(timeline) + ### Naive code to find the max mess month + # idx = timeline['Message'].idxmax() + # st.write(df.iloc[idx]) + + maxx_mssg = 0 + maxx_idx = 0 + + minn_mssg = 400000 + minn_idx = 0 + for idx in timeline.index: + if timeline['Message'][idx] > maxx_mssg: + maxx_mssg = timeline['Message'][idx] + maxx_idx = idx + if timeline['Message'][idx] < minn_mssg: + minn_mssg = timeline['Message'][idx] + minn_idx = idx + + # st.write(maxx_idx) + mostactive_month = timeline['Month'][maxx_idx] + mostactive_year = timeline['Year'][maxx_idx] + + leastactive_month = timeline['Month'][minn_idx] + leastactive_year = timeline['Year'][minn_idx] + + # st.header(f"Least Active Month was {leastactive_month} in {leastactive_year} with a total of {minn_mssg} messages.") + # Most Active Month with Year + st.markdown(f""" +
Here are some important facts:
  • Most Active Month of {selected_user} was {mostactive_month} in {mostactive_year} with a total of {maxx_mssg} messages.
  • Least Active Month of {selected_user} was {leastactive_month} in {leastactive_year} with a total of {minn_mssg} messages.
+ """,unsafe_allow_html=True) + + st.write(timeline) + st.write(f"Monthly Timeline of {selected_user}") + fig,ax = plt.subplots() + ax.plot(timeline['Time'], timeline['Message'],color='green') + plt.xticks(rotation='vertical') + st.pyplot(fig) + # return timeline + + +def create_download_link(val, filename): + b64 = base64.b64encode(val) # val looks like b'...' + return f'Download file' + +def daily_timeline(df,selected_user,report): + + if selected_user != 'Overall': + df = df[df['Author'] == selected_user] + + daily_timeline = df.groupby('Only_Date').count()['Message'].reset_index() + # st.write(daily_timeline) + + maxx_mssg = 0 + maxx_idx = 0 + for idx in daily_timeline.index: + if daily_timeline['Message'][idx] > maxx_mssg: + maxx_mssg = daily_timeline['Message'][idx] + maxx_idx = idx + + mostactive_day = daily_timeline['Only_Date'][maxx_idx] + + minn_mssg = 400000 + minn_idx = 0 + for idx in daily_timeline.index: + if daily_timeline['Message'][idx] < minn_mssg: + minn_mssg = daily_timeline['Message'][idx] + minn_idx = idx + + leastactive_day = daily_timeline['Only_Date'][minn_idx] + mostactive_weekday = findDay(mostactive_day) + leastactive_weekday = findDay(leastactive_day) + if report==0: + st.markdown(f""" +
Here are some interesting facts:
  • Most Active Day of {selected_user} was {mostactive_day} ({mostactive_weekday}) with a total of {maxx_mssg} messages.
  • Least Active Day of {selected_user} was {leastactive_day} ({leastactive_weekday}) with a total of {minn_mssg} messages.
+ """,unsafe_allow_html=True) + # daily timeline + daily_timeline.columns = ['Date','Messages'] + + daily_timeline['Date'] = daily_timeline['Date'].astype(str) + + + + c = alt.Chart(daily_timeline).mark_circle().encode( + color='Messages', + y=alt.Y('Date',sort='-x'), + x='Messages', + tooltip = ['Date','Messages'] + ).properties( + title = 'Daily Activity' + ).configure_mark( + opacity=1, + # color='red' + ) + if report==0: + st.write("Below is the Week Activity chart of ",selected_user) + st.altair_chart(c, use_container_width=True) + + c.save('chart.png') + + + # pdfkit.from_file('simple.html', 'out2.pdf') + # st.write("\n") + # st.write(f"Daily Timeline of {selected_user}") + # fig, ax = plt.subplots() + # ax.plot(daily_timeline['Only_Date'], daily_timeline['Message'], color='orange') + # plt.xticks(rotation='vertical') + # st.pyplot(fig) + # return daily_timeline + +def week_activity_map(df,selected_user,report): + + if selected_user != 'Overall': + df = df[df['Author'] == selected_user] + + busy_day = df['Day_Name'].value_counts() + # st.write(busy_day['Monday']) + maxx_mssg = 0 + most_busyday = "" + minn_mssg = 400000 + least_busyday = "" + + f = 0 # temp + for day in busy_day.index: + if f==0: + most_busyday = day + maxx_mssg = busy_day[day] + + f = f + 1 + least_busyday = day + minn_mssg = busy_day[day] + + + if report==0: + st.markdown(f""" +
Here are some important facts:
  • Most active day of the week for {selected_user} was {most_busyday}, with a collective total of {maxx_mssg} messages.
  • Least active day of the week for {selected_user} was {least_busyday}, with a collective total of {minn_mssg} messages.
+ """,unsafe_allow_html=True) + + + # st.write("Most busy day") + # fig,ax = plt.subplots() + # ax.bar(busy_day.index,busy_day.values,color='purple') + # plt.xticks(rotation='vertical') + # st.pyplot(fig) + + # st.write(df.head(5))\ + busy_day = busy_day.reset_index(level=0) + busy_day.columns = ['Day','Message'] + # st.write(busy_day) + + + c = alt.Chart(busy_day).mark_bar().encode( + color='Message', + y=alt.Y('Day',sort='-x'), + x='Message', + tooltip = ['Day','Message'] + ).properties( + title = 'Weekly Activity' + ).configure_mark( + opacity=1, + # color='red' + ) + if f==0: + st.write("Below is the Week Activity chart of ",selected_user) + st.altair_chart(c, use_container_width=True) + c.save('week_activity_map.png') + # st.write(busy_day) + # c = alt.Chart(busy_day.reset_index()).mark_line().encode( + # x='index:T', + # y='Day_Name:Q', + # ) + # st.altair_chart(c, use_container_width=True) + # return df['Day_Name'].value_counts() + +def month_activity_map(df,selected_user): + + if selected_user != 'Overall': + df = df[df['Author'] == selected_user] + + st.write("Most busy Month") + busy_day = df['Month'].value_counts() + fig,ax = plt.subplots() + ax.bar(busy_day.index,busy_day.values,color='purple') + plt.xticks(rotation='vertical') + st.pyplot(fig) + # return df['month'].value_counts() + +def activity_heatmap(df,selected_user): + + if selected_user != 'Overall': + df = df[df['Author'] == selected_user] + + user_heatmap = df.pivot_table(index='Day_Name', columns='Period', values='Message', aggfunc='count').fillna(0) + st.write("Weekly Activity Map") + fig,ax = plt.subplots() + ax = sns.heatmap(user_heatmap) + st.pyplot(fig) + # return user_heatmap + + +def most_common_words(df,selected_user,report): + + f = open('stop_hinglish.txt','r') + stop_words = f.read() + + if selected_user != 'Overall': + df = df[df['Author'] == selected_user] + + temp = df[df['Author'] != 'group_notification'] + temp = temp[temp['Message'] != ' '] + + words = [] + for message in temp['Message']: + for word in message.lower().split(): + if word not in stop_words and len(word)>2: + words.append(word) + + + + # most_common_df = pd.DataFrame(Counter(words).most_common(20)) + # # st.title("Most Common Words") + # # # most_common_df = helper.most_common_words(selected_user, df) + # # fig, ax = plt.subplots() + # # ax.barh(most_common_df[0], most_common_df[1]) + # # plt.xticks(rotation='vertical') + # # st.pyplot(fig) + # st.write(most_common_df) + # most_common_df[0] = most_common_df[0].astype(str) + # most_common_df[1] = most_common_df[1].astype(str) + + unique_words = set(words) + count_words = dict.fromkeys(unique_words, 0) + + for w in words: + count_words[w] = count_words[w] + 1 + + # for e in unique_words: + # print(e,count_words[e]) + count_words = {key: value for key, value in sorted(count_words.items(), key=lambda item: item[1],reverse=True)} + # count_words = dict((v,k) for k,v in count_words.items()) + # count_words = dict(itertools.islice(count_words.items(), 20)) + words_df = pd.DataFrame.from_dict(count_words,orient ='index',columns=['Word Count']) + words_df['Word'] = words_df.index + + + # st.write(words_df) + # st.bar_chart(words_df) + # alt.Chart(words_df).mark_point().encode( + # color='msg_count', + # y='index', + # x='msg_count' + # ) + + if report==0: + st.markdown(f""" +
Here are some important facts:
Top 5 most used words by {selected_user} are:
  • {words_df['Word'].iloc[0]} used {words_df['Word Count'].iloc[0]} times.
  • {words_df['Word'].iloc[1]} used {words_df['Word Count'].iloc[1]} times.
  • {words_df['Word'].iloc[2]} used {words_df['Word Count'].iloc[2]} times.
  • {words_df['Word'].iloc[3]} used {words_df['Word Count'].iloc[3]} times.
  • {words_df['Word'].iloc[4]} used {words_df['Word Count'].iloc[4]} times.
+ """,unsafe_allow_html=True) + + + c = alt.Chart(words_df.head(20)).mark_bar().encode( + color='Word Count', + y=alt.Y('Word',sort='-x'), + x='Word Count', + tooltip = ['Word','Word Count'] + ).properties( + title = 'Word Frequency Chart' + ).configure_mark( + opacity=1, + # color='red' + ) + if report==0: + st.write("Below is the word frequency chart of ",selected_user) + st.altair_chart(c, use_container_width=True) + c.save('word_chart.png') + # return most_common_df + +def emoji_table(df,selected_user,report): + if selected_user != 'Overall': + df = df[df['Author'] == selected_user] + emojis = [] + for message in df['Message']: + for c in message: + if c in emoji.UNICODE_EMOJI['en']: + emojis.append(c) + # if any(char in emoji.UNICODE_EMOJI['en'] for char in message): + # print(message) + # emojis.append(message) + # emoji_df = pd.DataFrame(Counter(emojis).most_common(20)) + # print(emojis) + unique_emojis = set(emojis) + count_emojis = dict.fromkeys(unique_emojis, 0) + + for e in emojis: + count_emojis[e] = count_emojis[e] + 1 + + # for e in unique_emojis: + # print(e,count_emojis[e]) + count_emojis = {key: value for key, value in sorted(count_emojis.items(), key=lambda item: item[1],reverse=True)} + count_emojis = dict(itertools.islice(count_emojis.items(), 10)) + emoji_df = pd.DataFrame.from_dict(count_emojis,orient ='index',columns=['Emoji Count']) + # st.bar_chart(emoji_df) + # st.write(emoji_df) + emoji_df['Emoji'] = emoji_df.index + + c = alt.Chart(emoji_df.head(20)).mark_bar().encode( + color='Emoji Count', + # y='Emoji', + y=alt.Y('Emoji', sort='-x'), + x='Emoji Count', + tooltip = ['Emoji','Emoji Count'] + ).properties( + title = 'Emoji Frequency Chart' + ).configure_mark( + opacity=1, + # color='red' + ) + if report==0: + st.write("Below is the emoji frequency chart of ",selected_user) + st.altair_chart(c, use_container_width=True) + + c.save('emoji_chart.png') + # Horizontal stacked bar chart + # data = pd.melt(emoji_df.reset_index(), id_vars=["index"]) + # chart = ( + # alt.Chart(data) + # .mark_bar() + # .encode( + # x=alt.X("value", type="quantitative", title=""), + # y=alt.Y("index", type="nominal", title=""), + # color=alt.Color("variable", type="nominal", title=""), + # order=alt.Order("variable", sort="descending"), + # ) + # ) + # st.altair_chart(chart, use_container_width=True) + + + #emoji analysis + # emoji_df = helper.emoji_helper(selected_user, df) + # st.title("Emojis Analysis") + # cnt = 0 + # for key,value in count_emojis.items(): + # if cnt>=5: + # break + # cnt = cnt + 1 + # st.markdown(f""" + #
 #   • {key} : {value}
+ # """,unsafe_allow_html=True) + # st.write(emoji_df) + # col1, col2 = st.columns(2) + + # with col1: + # st.dataframe(emoji_df) + # with col2: + # fig, ax = plt.subplots() + # ax.pie(emoji_df[1].head(), labels = emoji_df[0].head(), autopct = "%0.2f") + # st.pyplot(fig) + # return emoji_df +# def top_emoji(df,author): + + + + + + + +### For Top Stats Card, use the below code: + +""" +
Top Statistics of {author}
Total Messages --> {total_messages}
Total Words --> {total_words}
+ +""" + + +# def find_sentimentreport(df): + + + +def group_icon_changed(df): + cnt = 0 + s = "changed this group's icon" + for message in df['Message']: + cnt = cnt + message.count(s) + + st.write("Group Icon changed a total of",cnt,"times.") \ No newline at end of file diff --git a/preprocessor.py b/preprocessor.py new file mode 100644 index 0000000..fc28060 --- /dev/null +++ b/preprocessor.py @@ -0,0 +1,144 @@ +import pandas as pd +import re +import streamlit as st + + +def startsWithDateAndTime(s): + pattern = '^([0-9]+)(/)([0-9]+)(/)([0-9][0-9]), ([0-9]+):([0-9][0-9]) [AaPp][Mm] -' + result = re.match(pattern, s) + if result: + return True + return False + +def FindAuthor(s): + patterns = [ + '([\w]+):', # First Name + '([\w]+[\s]+[\w]+):', # First Name + Last Name + '([\w]+[\s]+[\w]+[\s]+[\w]+):', # First Name + Middle Name + Last Name + '([\w]+)[\u263a-\U0001f999]+:', # Name and Emoji + '([+]d{2} d{5} d{5}):', # Mobile No. (India) + '([+]d{2} d{3} d{3} d{4}):' # Mobile No. (US) + ] + pattern = '^' + '|'.join(patterns) + result = re.match(pattern, s) + if result: + return True + return False + +def getDataPoint(line): + splitLine = line.split(' - ') + dateTime = splitLine[0] + date, time = dateTime.split(', ') + message = ' '.join(splitLine[1:]) + # print(line) + # print(date) + if FindAuthor(message): + splitMessage = message.split(':') + author = splitMessage[0] + message = ' '.join(splitMessage[1:]) + else: + author = None + return date, time, author, message + +def check_dataframe(df): + ### Checking shape of dataset. + df.shape + ### Checking basic information of dataset + df.info() + ### Checking no. of null values in dataset + df.isnull().sum() + ### Checking head part of dataset + df.head(50) + ### Checking tail part of dataset + df.tail(50) + ### Droping Nan values from dataset + df = df.dropna() + df = df.reset_index(drop=True) + df.shape + ### Checking no. of authors of group + df['Author'].nunique() + ### Checking authors of group + df['Author'].unique() + + +def preprocess(data): + parsedData = [] # List to keep track of data so it can be used by a Pandas dataframe + messageBuffer = [] + date, time, author = None, None, None + f = 0 + for line in data.splitlines(): + f = f + 1 + ### Skipping first line of the file because contains information related to something about end-to-end encryption + if f==1: + continue + # line = line.strip() + if startsWithDateAndTime(line): + if len(messageBuffer) > 0: + parsedData.append([date, time, author, ' '.join(messageBuffer)]) + messageBuffer.clear() + date, time, author, message = getDataPoint(line) + + messageBuffer.append(message) + else: + messageBuffer.append(line) + + + df = pd.DataFrame(parsedData, columns=['Date', 'Time', 'Author', 'Message']) # Initialising a pandas Dataframe. + # print(df['Author'].unique()) + ### To download the created Dataframe + temp_df = df + temp_df.to_csv('Download.csv') + # st.download_button('Download CSV','Download.csv') # Defaults to 'text/plain' + # with open("Download.csv",encoding="utf8") as f: + # st.download_button('Download Chat CSV', f) # Defaults to 'text/plain' + + # check_dataframe(df) + ## changing datatype of "Date" column. + df["Date"] = pd.to_datetime(df["Date"],dayfirst=True) + + # ### Adding one more column of "Day" for better analysis, here we use datetime library which help us to do this task easily. 
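+    # For example, pd.Timestamp('2022-05-15').weekday() returns 6 (Monday == 0),
+    # which the mapping below would turn into 'Sunday'.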
+    # weeks = {
+    #     0: 'Monday',
+    #     1: 'Tuesday',
+    #     2: 'Wednesday',
+    #     3: 'Thursday',
+    #     4: 'Friday',
+    #     5: 'Saturday',
+    #     6: 'Sunday'
+    # }
+    # df['Day'] = df['Date'].dt.weekday.map(weeks)
+    # ### Rearranging the columns for better understanding
+    # df = df[['Date','Day','Time','Author','Message']]
+    # ### Changing the datatype of column "Day".
+    # df['Day'] = df['Day'].astype('category')
+    # ### Looking at the new dataset.
+    # df.head()
+    df['Only_Date'] = df['Date'].dt.date
+    df['Year'] = df['Date'].dt.year
+    df['Month_num'] = df['Date'].dt.month
+    df['Month'] = df['Date'].dt.month_name()
+    df['Day'] = df['Date'].dt.day
+    df['Day_Name'] = df['Date'].dt.day_name()
+    df['Hour'] = df['Date'].dt.hour
+    df['Minute'] = df['Date'].dt.minute
+
+    period = []
+    for hour in df['Hour']:
+        if hour == 23:
+            period.append(str(hour) + "-" + str('00'))
+        elif hour == 0:
+            period.append(str('00') + "-" + str(hour + 1))
+        else:
+            period.append(str(hour) + "-" + str(hour + 1))
+
+    df['Period'] = period
+
+    return df
+
+
+    ### The message buffer is used for messages with line gaps. Long messages like:
+    # Chip rate: The number of chips (bits) in the spreading signal is significantly greater than the data bits. Chip rate is measured in "megachips per second" (Mcps), which is millions of chips per second.
+    # Hint: related to Bit Error Rate (BER) and Signal-to-Noise Ratio (SNR), file attachment allowed in this question.
+    ## The above two messages were sent by the same user but are separated by a '\n', so the
+    ## message buffer stores the earlier lines and they all get attributed to the same user.
diff --git a/report.py b/report.py
new file mode 100644
index 0000000..989db13
--- /dev/null
+++ b/report.py
@@ -0,0 +1,96 @@
+from time import sleep
+import streamlit as st
+import preprocessor
+import helper
+import base64
+from fpdf import FPDF
+import matplotlib.pyplot as plt
+import PIL
+from tempfile import NamedTemporaryFile
+# from sklearn.datasets import load_iris
+from fpdf import FPDF, HTMLMixin
+
+class PDF(FPDF, HTMLMixin):
+    pass
+
+
+def create_download_link(val, filename):
+
+    b64 = base64.b64encode(val)  # val looks like b'...'
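+    # Why base64: pdf.output(dest="S") hands the PDF over as an in-memory bytes-like
+    # blob; base64-encoding it lets the download link below embed the file in a
+    # data: URI, so the browser can save the report without a temp file on disk.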
+ return f'Download Chat Report PDF' + + +def show_pdf(file_path): + with open(file_path,"rb") as f: + base64_pdf = base64.b64encode(f.read()).decode('utf-8') + pdf_display = f'' + st.markdown(pdf_display, unsafe_allow_html=True) + + +def create_report(df): + + helper.daily_timeline(df,'Overall',1) + pdf = FPDF() + pdf.add_page() + pdf.set_font("Arial", size = 12) + pdf.cell(180, 10, txt = "Daily Activity Time Series Graph of Overall", + ln = 1, align = 'C') + pdf.image('chart.png', 50, 25, 100, 200) + + + + + helper.week_activity_map(df,'Overall',1) + pdf.add_page() + pdf.cell(180, 10, txt = "Week Activity Time Series Graph of Overall", + ln = 1, align = 'C') + # image = PIL.Image.open("week_activity_map.png") + # width, height = image.size + pdf.image('week_activity_map.png', 50, 25, 100, 50) + + + + + helper.most_common_words(df,'Overall',1) + pdf.cell(180, 150, txt = "Word Frequency Graph of Overall", + ln = 1, align = 'C') + pdf.image('word_chart.png', 50, 100, 80, 50) + + + pdf.add_page() + helper.emoji_table(df,'Overall',1) + pdf.cell(180, 10, txt = "Emoji Frequency Graph of Overall", + ln = 1, align = 'C') + pdf.image('emoji_chart.png', 50, 20, 100, 50) + + + html = create_download_link(pdf.output(dest="S"), "testfile") + st.markdown(html, unsafe_allow_html=True) + + + # show_pdf('testfile.pdf') + + +# df = load_iris(as_frame=True)["data"] + + +# figs = [] + +# for col in df.columns: +# fig, ax = plt.subplots() +# ax.plot(df[col]) +# st.pyplot(fig) +# figs.append(fig) + +# export_as_pdf = st.button("Export Report") + + +# if export_as_pdf: +# pdf = FPDF() +# for fig in figs: +# pdf.add_page() +# with NamedTemporaryFile(delete=False, suffix=".png") as tmpfile: +# fig.savefig(tmpfile.name) +# pdf.image(tmpfile.name, 10, 10, 200, 100) +# html = create_download_link(pdf.output(dest="S"), "testfile") +# st.markdown(html, unsafe_allow_html=True) \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..caf6835 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,13 @@ +streamlit~=1.9.0 +matplotlib~=3.5.2 +seaborn~=0.11.2 +urlextract +pandas~=1.4.2 +emoji~=1.7.0 +plotnine~=0.8.0 +altair~=4.2.0 +image~=1.5.33 +pandas +numpy +fpdf +altair_saver \ No newline at end of file diff --git a/stop_hinglish.txt b/stop_hinglish.txt new file mode 100644 index 0000000..dd8dc0c --- /dev/null +++ b/stop_hinglish.txt @@ -0,0 +1,1056 @@ +. +.. +... +? 
+- +-- +1 +2 +3 +4 +5 +6 +7 +8 +9 +0 +a +aadi +aaj +aap +aapne +aata +aati +aaya +aaye +ab +abbe +abbey +abe +abhi +able +about +above +accha +according +accordingly +acha +achcha +across +actually +after +afterwards +again +against +agar +ain +aint +ain't +aisa +aise +aisi +alag +all +allow +allows +almost +alone +along +already +also +although +always +am +among +amongst +an +and +andar +another +any +anybody +anyhow +anyone +anything +anyway +anyways +anywhere +ap +apan +apart +apna +apnaa +apne +apni +appear +are +aren +arent +aren't +around +arre +as +aside +ask +asking +at +aur +avum +aya +aye +baad +baar +bad +bahut +bana +banae +banai +banao +banaya +banaye +banayi +banda +bande +bandi +bane +bani +bas +bata +batao +bc +be +became +because +become +becomes +becoming +been +before +beforehand +behind +being +below +beside +besides +best +better +between +beyond +bhai +bheetar +bhi +bhitar +bht +bilkul +bohot +bol +bola +bole +boli +bolo +bolta +bolte +bolti +both +brief +bro +btw +but +by +came +can +cannot +cant +can't +cause +causes +certain +certainly +chahiye +chaiye +chal +chalega +chhaiye +clearly +c'mon +com +come +comes +could +couldn +couldnt +couldn't +d +de +dede +dega +degi +dekh +dekha +dekhe +dekhi +dekho +denge +dhang +di +did +didn +didnt +didn't +dijiye +diya +diyaa +diye +diyo +do +does +doesn +doesnt +doesn't +doing +done +dono +dont +don't +doosra +doosre +down +downwards +dude +dunga +dungi +during +dusra +dusre +dusri +dvaara +dvara +dwaara +dwara +each +edu +eg +eight +either +ek +else +elsewhere +enough +etc +even +ever +every +everybody +everyone +everything +everywhere +ex +exactly +example +except +far +few +fifth +fir +first +five +followed +following +follows +for +forth +four +from +further +furthermore +gaya +gaye +gayi +get +gets +getting +ghar +given +gives +go +goes +going +gone +good +got +gotten +greetings +guys +haan +had +hadd +hadn +hadnt +hadn't +hai +hain +hamara +hamare +hamari +hamne +han +happens +har +hardly +has +hasn +hasnt +hasn't +have +haven +havent +haven't +having +he +hello +help +hence +her +here +hereafter +hereby +herein +here's +hereupon +hers +herself +he's +hi +him +himself +his +hither +hm +hmm +ho +hoga +hoge +hogi +hona +honaa +hone +honge +hongi +honi +hopefully +hota +hotaa +hote +hoti +how +howbeit +however +hoyenge +hoyengi +hu +hua +hue +huh +hui +hum +humein +humne +hun +huye +huyi +i +i'd +idk +ie +if +i'll +i'm +imo +in +inasmuch +inc +inhe +inhi +inho +inka +inkaa +inke +inki +inn +inner +inse +insofar +into +inward +is +ise +isi +iska +iskaa +iske +iski +isme +isn +isne +isnt +isn't +iss +isse +issi +isski +it +it'd +it'll +itna +itne +itni +itno +its +it's +itself +ityaadi +ityadi +i've +ja +jaa +jab +jabh +jaha +jahaan +jahan +jaisa +jaise +jaisi +jata +jayega +jidhar +jin +jinhe +jinhi +jinho +jinhone +jinka +jinke +jinki +jinn +jis +jise +jiska +jiske +jiski +jisme +jiss +jisse +jitna +jitne +jitni +jo +just +jyaada +jyada +k +ka +kaafi +kab +kabhi +kafi +kaha +kahaa +kahaan +kahan +kahi +kahin +kahte +kaisa +kaise +kaisi +kal +kam +kar +kara +kare +karega +karegi +karen +karenge +kari +karke +karna +karne +karni +karo +karta +karte +karti +karu +karun +karunga +karungi +kaun +kaunsa +kayi +kch +ke +keep +keeps +keh +kehte +kept +khud +ki +kin +kine +kinhe +kinho +kinka +kinke +kinki +kinko +kinn +kino +kis +kise +kisi +kiska +kiske +kiski +kisko +kisliye +kisne +kitna +kitne +kitni +kitno +kiya +kiye +know +known +knows +ko +koi +kon +konsa +koyi +krna +krne +kuch +kuchch +kuchh +kul +kull +kya +kyaa +kyu 
+kyuki +kyun +kyunki +lagta +lagte +lagti +last +lately +later +le +least +lekar +lekin +less +lest +let +let's +li +like +liked +likely +little +liya +liye +ll +lo +log +logon +lol +look +looking +looks +ltd +lunga +m +maan +maana +maane +maani +maano +magar +mai +main +maine +mainly +mana +mane +mani +mano +many +mat +may +maybe +me +mean +meanwhile +mein +mera +mere +merely +meri +message +might +mightn +mightnt +mightn't +mil +mjhe +more +moreover +most +mostly +much +mujhe +must +mustn +mustnt +mustn't +my +myself +na +naa +naah +nahi +nahin +nai +name +namely +nd +ne +near +nearly +necessary +neeche +need +needn +neednt +needn't +needs +neither +never +nevertheless +new +next +nhi +nine +no +nobody +non +none +noone +nope +nor +normally +not +nothing +novel +now +nowhere +o +obviously +of +off +often +oh +ok +okay +old +on +once +one +ones +only +onto +or +other +others +otherwise +ought +our +ours +ourselves +out +outside +over +overall +own +par +pata +pe +pehla +pehle +pehli +people +per +perhaps +phla +phle +phli +placed +please +plus +poora +poori +provides +pura +puri +q +que +quite +raha +rahaa +rahe +rahi +rakh +rakha +rakhe +rakhen +rakhi +rakho +rather +re +really +reasonably +regarding +regardless +regards +rehte +rha +rhaa +rhe +rhi +ri +right +s +sa +saara +saare +saath +sab +sabhi +sabse +sahi +said +sakta +saktaa +sakte +sakti +same +sang +sara +sath +saw +say +saying +says +se +second +secondly +see +seeing +seem +seemed +seeming +seems +seen +self +selves +sensible +sent +serious +seriously +seven +several +shall +shan +shant +shan't +she +she's +should +shouldn +shouldnt +shouldn't +should've +si +sir +sir. +since +six +so +soch +some +somebody +somehow +someone +something +sometime +sometimes +somewhat +somewhere +soon +still +sub +such +sup +sure +t +tab +tabh +tak +take +taken +tarah +teen +teeno +teesra +teesre +teesri +tell +tends +tera +tere +teri +th +tha +than +thank +thanks +thanx +that +that'll +thats +that's +the +theek +their +theirs +them +themselves +then +thence +there +thereafter +thereby +therefore +therein +theres +there's +thereupon +these +they +they'd +they'll +they're +they've +thi +thik +thing +think +thinking +third +this +tho +thoda +thodi +thorough +thoroughly +those +though +thought +three +through +throughout +thru +thus +tjhe +to +together +toh +too +took +toward +towards +tried +tries +true +truly +try +trying +tu +tujhe +tum +tumhara +tumhare +tumhari +tune +twice +two +um +umm +un +under +unhe +unhi +unho +unhone +unka +unkaa +unke +unki +unko +unless +unlikely +unn +unse +until +unto +up +upar +upon +us +use +used +useful +uses +usi +using +uska +uske +usne +uss +usse +ussi +usually +vaala +vaale +vaali +vahaan +vahan +vahi +vahin +vaisa +vaise +vaisi +vala +vale +vali +various +ve +very +via +viz +vo +waala +waale +waali +wagaira +wagairah +wagerah +waha +wahaan +wahan +wahi +wahin +waisa +waise +waisi +wala +wale +wali +want +wants +was +wasn +wasnt +wasn't +way +we +we'd +well +we'll +went +were +we're +weren +werent +weren't +we've +what +whatever +what's +when +whence +whenever +where +whereafter +whereas +whereby +wherein +where's +whereupon +wherever +whether +which +while +who +whoever +whole +whom +who's +whose +why +will +willing +with +within +without +wo +woh +wohi +won +wont +won't +would +wouldn +wouldnt +wouldn't +y +ya +yadi +yah +yaha +yahaan +yahan +yahi +yahin +ye +yeah +yeh +yehi +yes +yet +you +you'd +you'll +your +you're +yours +yourself +yourselves +you've +yup \ No newline at end of file
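
# Usage sketch: assuming the dependencies in requirements.txt are installed (plus
# pdfkit, which helper.py imports but requirements.txt omits) and 'chat.txt' is an
# exported WhatsApp chat, the parsing pipeline can be exercised without the UI:
#
#     import preprocessor, helper
#     raw_text = open('chat.txt', encoding='utf-8').read()
#     df = preprocessor.preprocess(raw_text)
#     msgs, words, media, links = helper.fetch_stats('Overall', df)
#     print(msgs, words, media, len(links))
#
# The app itself runs with: streamlit run app.py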