diff --git a/.dockerignore b/.dockerignore index bee8a64..4316f38 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1 +1,3 @@ __pycache__ +.dockerignore +stanford-corenlp-full-2018-10-05/ diff --git a/.github/ISSUE_TEMPLATE/bug_report_template.md b/.github/ISSUE_TEMPLATE/bug_report_template.md new file mode 100644 index 0000000..82a5f2a --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report_template.md @@ -0,0 +1,24 @@ +--- +name: Bug report +about: Create a report to help us improve +title: '' +labels: 'bug' +assignees: '' + +--- + +**Describe the bug** +A clear and concise description of what the bug is. + +**To Reproduce** +Steps to reproduce the behavior: +1. Go to '...' +2. Click on '....' +3. Scroll down to '....' +4. See error + +**Expected behavior** +A clear and concise description of what you expected to happen. + +**Additional context** +Add any other context about the problem here. diff --git a/.github/ISSUE_TEMPLATE/feature_request_template.md b/.github/ISSUE_TEMPLATE/feature_request_template.md new file mode 100644 index 0000000..a1e43a6 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request_template.md @@ -0,0 +1,20 @@ +--- +name: Feature request +about: Suggest an idea for this project +title: '' +labels: 'enhancement' +assignees: '' + +--- + +**Is your feature request related to a problem? Please describe.** +A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] + +**Describe the solution you'd like** +A clear and concise description of what you want to happen. + +**Describe alternatives you've considered** +A clear and concise description of any alternative solutions or features you've considered. + +**Additional context** +Add any other context or screenshots about the feature request here. diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md new file mode 100644 index 0000000..c4d0dde --- /dev/null +++ b/.github/pull_request_template.md @@ -0,0 +1,30 @@ + +Fixes # + + +## Description: + +- Files changed: +- Dependencies if any: +- Basic tests done to validate: +- Conflicts if any: + + +## Checklist: + +- [ ] All coding conventions are followed +- [ ] Style guidelines for this project have been followed +- [ ] 4 space indentation is used +- [ ] Relevant comments are added +- [ ] The code has been tested +- [ ] No new warnings are generated + + +## Screenshots: + + + +## Other Information: + + +@ diff --git a/.gitignore b/.gitignore index 163abbb..2dd07cd 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,127 @@ stanford-corenlp-full-2018-10-05/ __pycache__ -config.py +# Editors +.vscode/ +.idea/ + +# Vagrant +.vagrant/ + +# Mac/OSX +.DS_Store + +# Windows +Thumbs.db + +# Source for the following rules: https://raw.githubusercontent.com/github/gitignore/master/Python.gitignore +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..538ea1a --- /dev/null +++ b/.travis.yml @@ -0,0 +1,12 @@ +language: python +python: + - "3.6" +cache: pip +install: + - pip install -r requirements.txt +before_script: + - black --check . || true +script: + - flake8 . --count --select=E101,E722,E9,F4,F63,F7,F82,W191 --show-source --statistics + - flake8 . --count --exit-zero --max-line-length=127 --statistics + - pytest diff --git a/Dockerfile b/Dockerfile index 890b4db..4ac074a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,9 +1,12 @@ -FROM python:3.6 +FROM python:3.6-slim-stretch + +RUN apt-get update && apt-get install -y git wget unzip COPY requirements.txt ./requirements.txt RUN pip install -r requirements.txt -COPY . /usr/var/ -WORKDIR /usr/var/ +COPY docker-entrypoint.sh /usr/local/bin/ +RUN chmod +x /usr/local/bin/docker-entrypoint.sh -CMD ["python", "./init.py"] +WORKDIR /usr/var/MapBot +ENTRYPOINT ["/bin/sh","/usr/local/bin/docker-entrypoint.sh"] diff --git a/ENV/.env b/ENV/.env new file mode 100644 index 0000000..ea53ab8 --- /dev/null +++ b/ENV/.env @@ -0,0 +1,10 @@ +DB_USER=root +DB_PASSWORD= +DB_HOST=localhost +DATABASE=mapbot +DB_PORT=3306 +GCLOUD_API_KEY= +JAVAHOME= +STANFORD_PATH_TO_JAR= +STANFORD_PATH_TO_MODELS_JAR= +TELEGRAM_BOT_TOKEN= diff --git a/ENV/docker.env b/ENV/docker.env new file mode 100644 index 0000000..21eb755 --- /dev/null +++ b/ENV/docker.env @@ -0,0 +1,8 @@ +DB_USER=root +DB_PASSWORD=root +DB_HOST=db +DATABASE=mapbot +DB_PORT=3306 +JAVAHOME=/usr/local/openjdk-11/bin/java +STANFORD_PATH_TO_JAR=./stanford-corenlp-full-2018-10-05/stanford-corenlp-3.9.2.jar +STANFORD_PATH_TO_MODELS_JAR=./stanford-corenlp-full-2018-10-05/stanford-corenlp-3.9.2-models.jar diff --git a/README.md b/README.md index 3980ecb..395566a 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,8 @@ -# MapBot +

+ +

+ +# MapBot [![Build Status](https://travis-ci.com/vishakha-lall/MapBot.svg?branch=gssoc-master)](https://travis-ci.com/vishakha-lall/MapBot) #### Hey! I'm your friendly navigator bot! Try me out, not to brag but I'm FUN! @@ -48,18 +52,9 @@ Check out all related information [here](GSSoC.md) - Enter root password when prompted - `create database mapbot;` - Verify creation of the database `show databases;` -- Unzip the StanfordCoreNLP package in the repository and keep the file names `stanford-corenlp-x.x.x.jar` and `stanford-corenlp-x.x.x-models.jar` handy. -- Add config.py file to .gitignore to avoid pushing changes made to config -- Run `git rm --cached config.py` -- Edit the `config.py` file with the corresponding values - - user = "root" - - password = - - host = "localhost" - - database = "mapbot" - - key = - - stanford_path_to_jar = - - stanford_path_to_models_jar = - - javahome = +- Unzip the StanfordCoreNLP package in the repository and keep the file paths `stanford-corenlp-x.x.x.jar` and `stanford-corenlp-x.x.x-models.jar` handy. +- Run `git update-index --assume-unchanged ENV/.env` +- Fill the existing template in `ENV/.env` with the corresponding values following the `KEY=VALUE` format - Install dependencies from `requirements.txt` file. Run `pip install -r requirements.txt` - You're all set up, run the `init.py` file. `python init.py` - It is recommended that you set this project up in a virtual environment to keep the dependencies separated and for easier debugging. Here's how you can do that - @@ -70,28 +65,53 @@ Check out all related information [here](GSSoC.md) #### What are some pre-requisites? (with Docker) -- StanfordCoreNLP - - StanfordCoreNLP has a dependency on Java 8. `java -version` should complete successfully with version 1.8 or higher. - - Windows- Download as a .zip file from [here](https://stanfordnlp.github.io/CoreNLP/download.html). - - Linux and MacOS- Follow the instructions to download the file from [here](https://stanfordnlp.github.io/CoreNLP/download.html). - Docker - Take a look at [this](https://docs.docker.com/install/) for detailed installation instructions for Docker on Windows, Linux and Mac systems. - Verify the installations by `docker --version` and `docker-compose --version` -- You won't need to download MySQL locally to make Docker work, but it's recommended as a pre-requisite to be able to debug programs outside Docker. #### How to set me up Docker style? -- Clone the repository -- Unzip the StanfordCoreNLP package in the repository. Make sure the StanfordCoreNLP folder you downloaded in the prerequisite steps are in the cloned repository folder. -- Add config.py file to .gitignore to avoid pushing changes made to config -- Run `git rm --cached config.py` -- Modify *only* the following fields in `config.py` file with the corresponding values: - - key = +- Download the `start.sh` and modify it appropriately: + - `git clone -b ` + - `export GCLOUD_API_KEY=` - You're all set up, kick off with `start.sh` file by running `bash start.sh`. ------ + +#### What are some pre-requisites? (with Telegram Bot) + +- MySQL + - Install the community version of mySQL from the [official mySQL documentation page](https://dev.mysql.com/doc/mysql-installation-excerpt/5.7/en/). + - Create root user credentials during installation. + - Verify the installation, running the command `mysql -uroot -p -hlocalhost` should open the mySQL monitor. (Enter the root password when prompted) +- StanfordCoreNLP + - StanfordCoreNLP has a dependency on Java 8. 
`java -version` should complete successfully with version 1.8 or higher. + - Windows- Download as a .zip file from [here](https://stanfordnlp.github.io/CoreNLP/download.html). + - Linux and MacOS- Follow the instructions to download the file from [here](https://stanfordnlp.github.io/CoreNLP/download.html). +- Telegram + - Download the [Telegram](https://telegram.org/apps) for your chosen platform. + +#### How to set me up on Telegram? + +- Clone the repository +- Create the **mapbot** database in mySQL + - `mysql -uroot -p -hlocalhost` + - Enter root password when prompted + - `create database mapbot;` + - Verify creation of the database `show databases;` +- Unzip the StanfordCoreNLP package in the repository and keep the file paths `stanford-corenlp-x.x.x.jar` and `stanford-corenlp-x.x.x-models.jar` handy. +- Run `git update-index --assume-unchanged ENV/.env` +- Fill the existing template in `ENV/.env` with the corresponding values following the `KEY=VALUE` format +- For `TELEGRAM_BOT_TOKEN=`, open your Telegram app and follow [this](https://core.telegram.org/bots#creating-a-new-bot) tutorial on how to create a new bot on Telegram and get your own bot token. Once your token is generated, update the `.env` file in `/ENV` with it. +- Find your bot on Telegram using `@bot_username` that you chose, and send the first text to your new bot. Nothing is supposed to happen for now. No worries. +- Install dependencies from `requirements.txt` file. Run `pip install -r requirements.txt` +- You're all set up, run the `telegram.py` file. `python telegram.py` and converse with your bot in real time. +- It is recommended that you set this project up in a virtual environment to keep the dependencies separated and for easier debugging. Here's how you can do that - + 1. [Python](https://realpython.com/python-virtual-environments-a-primer/#why-the-need-for-virtual-environments) + 2. [Conda](https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html) +------ #### How do I work? -The analysis folder contains data files for the project. The sentences.csv contains the base training dataset which is used to classify the user's input into three classes - Statement, Question, and Chat. Going through some examples would clarify the difference between statement and chat. The featuresDump.csv is the result of text pre-processing done using the code in features.py and featuresDump.py. +The `/analysis` folder contains data files for the project. The `sentences.csv` contains the base training dataset which is used to classify the user's input into three classes - *Statement*, *Question*, and *Chat*. Going through some examples would clarify the difference between statement and chat. The `featuresDump.csv` is the result of text pre-processing done using the code in `features.py` and `featuresDump.py`. ------ #### Want to see me in action? 
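For orientation, every front end touched by this patch (the CLI in `init.py`, the Flask app added in `app.py` below, and the Telegram poller in `telegram.py`) drives the same two functions from `chatbot.py`. A minimal interactive sketch, assuming the prerequisites above are satisfied (MySQL running, `ENV/.env` filled in, StanfordCoreNLP unzipped in the repository) and using a hypothetical user message:

```python
from chatbot import setup, message_to_bot

# One-time setup: fetches NLTK data, builds the sentence classifier,
# and connects to the MySQL database configured in ENV/.env
clf, learn_response = setup()

# Each turn returns the bot's reply plus the updated conversation state
# (a LearnResponse name such as MESSAGE, TRAIN_ME, ORIGIN or DESTINATION)
reply, learn_response = message_to_bot(
    "How far is Paris from Brussels?", clf, learn_response
)
print(reply)
```

`app.py` wraps this same pair behind a `GET /chatbot/...` route, and `telegram.py` behind long-polling of the Telegram Bot API.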
diff --git a/app.py b/app.py new file mode 100644 index 0000000..6ee2735 --- /dev/null +++ b/app.py @@ -0,0 +1,19 @@ +from flask import Flask, jsonify +from chatbot import message_to_bot, setup + +app = Flask(__name__) + + +@app.route("/chatbot/", methods=["GET"]) +def chat(user_input): + try: + response = message_to_bot(user_input, clf, learn_response) + except Exception: + return jsonify({"message": ("Unable to get response", learn_response)}, 500) + + return jsonify({"message": response}, 200) + + +if __name__ == "__main__": + clf, learn_response = setup() + app.run(debug=False) diff --git a/chatbot.py b/chatbot.py index c06caa8..16b02c8 100644 --- a/chatbot.py +++ b/chatbot.py @@ -1,103 +1,144 @@ -from utilities import parse_sentence -from utilities import classify_model -from utilities import classify_sentence -from utilities import setup_nltk +import utilities import databaseconnect -from googleMapsApiModule import direction -from googleMapsApiModule import geocoding +import googleMapsApiModule +from enum import Enum, auto import logging -import logger_config -location_dict={"origin":"null","destination":"null"} +import logger_config + +location_dict = {"origin": "null", "destination": "null"} log = logging.getLogger(__name__) -log.info('Entered module: %s' % __name__) +log.info("Entered module: %s" % __name__) + + +class LearnResponse(Enum): + MESSAGE = auto() + TRAIN_ME = auto() + ORIGIN = auto() + DESTINATION = auto() + @logger_config.logger def setup(): - setup_nltk() - logging.debug('NLTK setup completed') - clf = classify_model() - logging.debug('Classification model ready') + utilities.setup_nltk() + logging.debug("NLTK setup completed") + clf = utilities.classify_model_adv(model="rf") + logging.debug("Classification model ready") databaseconnect.setup_database() - logging.debug('Database setup completed, database connected') - learn_response = 0 + logging.debug("Database setup completed, database connected") + learn_response = LearnResponse.MESSAGE.name return clf, learn_response + @logger_config.logger -def message_to_bot(H,clf,learn_response): - if learn_response == 2: - location_dict["origin"]=H +def message_to_bot(H, clf, learn_response): + if learn_response == LearnResponse.ORIGIN.name: + location_dict["origin"] = H B = "Can you help me with the destination location?" - learn_response = 3 - return B,learn_response - if learn_response == 3: - location_dict["destination"]=H - origin, destination = location_dict["origin"], location_dict["destination"] - direction(origin,destination) + learn_response = LearnResponse.DESTINATION.name + return B, learn_response + if learn_response == LearnResponse.DESTINATION.name: + location_dict["destination"] = H + origin, destination = ( + location_dict["origin"], + location_dict["destination"], + ) + googleMapsApiModule.direction(origin, destination) B = "I will certainly help you with that." - learn_response = 0 - return B,learn_response - if H.lower() == "bye" or H.lower() == "bye." or H.lower() == "bye!": #empty input + learn_response = LearnResponse.MESSAGE.name + return B, learn_response + if "bye" in H.lower().split(" "): # check in words within H B = "Bye! I'll miss you!" - return B,learn_response #exit loop - #grammar parsing + return B, learn_response # exit loop + if not H: + B = "Please say something!" 
+ return B, learn_response # empty input + # grammar parsing subj = set() obj = set() verb = set() - triples,root = parse_sentence(H) + triples, root = utilities.parse_sentence(H) triples = list(triples) for t in triples: - if t[0][1][:2] == 'VB': + if t[0][1][:2] == "VB": verb.add(t[0][0]) relation = t[1] - if relation[-4:] == 'subj': + if relation[-4:] == "subj": subj.add(t[2][0]) - if relation[-3:] == 'obj': + if relation[-3:] == "obj": obj.add(t[2][0]) - logging.debug("\t"+"Subject: "+str(subj)+"\n"+"\t"+"Object: "+str(obj)+"\n"+"\t"+"Topic: "+str(root)+"\n"+"\t"+"Verb: "+str(verb)) + logging.debug( + "\t" + + "Subject: " + + str(subj) + + "\n" + + "\t" + + "Object: " + + str(obj) + + "\n" + + "\t" + + "Topic: " + + str(root) + + "\n" + + "\t" + + "Verb: " + + str(verb) + ) subj = list(subj) obj = list(obj) verb = list(verb) proper_nouns = set() for t in triples: - if t[0][1] == 'NNP': + if t[0][1] == "NNP": proper_nouns.add(t[0][0]) - if t[2][1] == 'NNP': + if t[2][1] == "NNP": proper_nouns.add(t[2][0]) proper_nouns == list(proper_nouns) - logging.debug("\t"+"Proper Nouns: "+str(proper_nouns)) - #classification - classification = classify_sentence(clf,H) - #logging.debug(classification) - if learn_response == 0: - databaseconnect.add_to_database(classification,subj,root,verb,H) - if (classification == 'C'): + logging.debug("\t" + "Proper Nouns: " + str(proper_nouns)) + # classification + classification = utilities.classify_sentence(clf, H) + # logging.debug(classification) + if learn_response == LearnResponse.MESSAGE.name: + databaseconnect.add_to_database(classification, subj, root, verb, H) + if classification == "C": B = databaseconnect.get_chat_response() - elif (classification == 'Q'): - B,learn_response = databaseconnect.get_question_response(subj,root,verb) - if learn_response == 1 and (len(proper_nouns) == 0 or (len(proper_nouns) == 1 and H.split(" ",1)[0] != "Where")): - databaseconnect.add_learnt_statement_to_database(subj,root,verb) - if learn_response == 1 and (len(proper_nouns) >= 2 or (len(proper_nouns) == 1 and H.split(" ",1)[0] == "Where")): - learn_response = 0 + elif classification == "Q": + B, learn_response = databaseconnect.get_question_response(subj, root, verb) + if learn_response == LearnResponse.TRAIN_ME.name and ( + len(proper_nouns) == 0 + or (len(proper_nouns) == 1 and H.split(" ", 1)[0] != "Where") + ): + databaseconnect.add_learnt_statement_to_database(subj, root, verb) + if learn_response == LearnResponse.TRAIN_ME.name and ( + len(proper_nouns) >= 2 + or (len(proper_nouns) == 1 and H.split(" ", 1)[0] == "Where") + ): + learn_response = LearnResponse.MESSAGE.name B = "I will certainly help you with that." else: B = "Oops! I'm not trained for this yet." 
else: - B,learn_response = databaseconnect.learn_question_response(H) - if (len(proper_nouns) >= 2 or (len(proper_nouns) >= 1 and H.split(" ",1)[0] == "Where")) and len(subj) != 0: + B, learn_response = databaseconnect.learn_question_response(H) + if ( + len(proper_nouns) >= 2 + or (len(proper_nouns) >= 1 and H.split(" ", 1)[0] == "Where") + ) and len(subj) != 0: if subj[0] == "distance": if len(proper_nouns) == 2: - location_dict["origin"]=proper_nouns.pop() - location_dict["destination"]=proper_nouns.pop() - origin, destination = location_dict["origin"], location_dict["destination"] - direction(origin,destination) + location_dict["origin"] = proper_nouns.pop() + location_dict["destination"] = proper_nouns.pop() + origin, destination = ( + location_dict["origin"], + location_dict["destination"], + ) + googleMapsApiModule.direction(origin, destination) else: B = "I didn't get that. Can you please give me the origin location?" - learn_response = 2 + learn_response = LearnResponse.ORIGIN.name if len(proper_nouns) == 1: location = proper_nouns.pop() if subj[0] == "geocoding" or subj[0] == location: - geocoding(location) - learn_response = 0 + googleMapsApiModule.geocoding(location) + learn_response = LearnResponse.MESSAGE.name B = "I will certainly help you with that." - return B,learn_response + return B, learn_response diff --git a/config.py b/config.py index eb79b70..d17b399 100644 --- a/config.py +++ b/config.py @@ -1,18 +1,26 @@ -user = "root" -password = "root" -host = "localhost" -database = "mapbot" -port = "3306" -key = "*Google_Cloud_API_key*" #Will be provided by mentors -stanford_path_to_jar = "./stanford-corenlp-full-2018-10-05/stanford-corenlp-3.9.2.jar" #"*your_path_to_stanford-corenlp-x.x.x.jar*" -stanford_path_to_models_jar = "./stanford-corenlp-full-2018-10-05/stanford-corenlp-3.9.2-models.jar" #"*your_path_to_stanford-corenlp-x.x.x-models.jar*" -javahome = '*your_path_to_jdk_bin_java.exe*'#for eg. 'C:\\Program\ Files\\Java\\jdk1.8.0_201\\bin\\java.exe' or '/usr/local/openjdk-11/bin/java' - -# DONOT CHANGE THE VALUES BELOW DURING INITIAL CONFIGURATION SET UP import os -if os.getenv("DOCKER"): - # print("Inside Docker") - user = "root" - password = "root" - host = "db" - javahome = '/usr/local/openjdk-11/bin/java' \ No newline at end of file +from dotenv import load_dotenv + +if os.getenv("DOCKER") == "Y": + load_dotenv("ENV/docker.env") +else: + load_dotenv("ENV/.env") + +""" MAKE SURE you have filled environment variables in `.env` files in `./ENV/` folder""" + +user = os.getenv("DB_USER") +password = os.getenv("DB_PASSWORD") +host = os.getenv("DB_HOST") +database = os.getenv("DATABASE") +port = os.getenv("DB_PORT") +key = os.getenv("GCLOUD_API_KEY") # Will be provided by mentors +tbot_token = os.getenv("TELEGRAM_BOT_TOKEN") + +# your_path_to_stanford-corenlp-x.x.x.jar +stanford_path_to_jar = os.getenv("STANFORD_PATH_TO_JAR") + +# your_path_to_stanford-corenlp-x.x.x-models.jar +stanford_path_to_models_jar = os.getenv("STANFORD_PATH_TO_MODELS_JAR") + +# for eg. 
'C:\\Program\ Files\\Java\\jdk1.8.0_201\\bin\\java.exe' or '/usr/local/openjdk-11/bin/java' +javahome = os.getenv("JAVAHOME") diff --git a/constants.py b/constants.py index 223ac4a..ecc1e36 100644 --- a/constants.py +++ b/constants.py @@ -1 +1,9 @@ -BASE_URL={'direction':"https://www.google.com/maps/dir/?api=1" , 'geocoding':"https://www.google.com/maps/search/?api=1&query"} +BASE_URL = { + "direction": "https://www.google.com/maps/dir/?api=1", + "geocoding": "https://www.google.com/maps/search/?api=1&query", + "latlng": "https://maps.googleapis.com/maps/api/geocode/json", + "timezone": "https://maps.googleapis.com/maps/api/timezone/json", + "mapsstatic": "https://maps.googleapis.com/maps/api/staticmap", + "elevation": "https://maps.googleapis.com/maps/api/elevation/json", + "places": "https://maps.googleapis.com/maps/api/place", +} diff --git a/databaseconnect.py b/databaseconnect.py index b916a21..2ec8eed 100644 --- a/databaseconnect.py +++ b/databaseconnect.py @@ -47,7 +47,6 @@ def connection_to_database(): def setup_database(): db = connection_to_database() cur = db.cursor() - cur.execute( "CREATE TABLE IF NOT EXISTS chat_table(id INTEGER PRIMARY KEY AUTO_INCREMENT, root_word VARCHAR(40), subject VARCHAR(40), verb VARCHAR(40), sentence VARCHAR(200))" # noqa: E501 ) @@ -63,7 +62,6 @@ def setup_database(): return db - @logger_config.logger # add classified sentences to database def add_to_database(classification, subject, root, verb, H): @@ -97,7 +95,6 @@ def add_to_database(classification, subject, root, verb, H): if r[-1] == H: exist = 1 break - if exist == 0: # do not add if question already exists cur.execute( "INSERT INTO statement_table(subject,root_word,verb,sentence) VALUES (%s,%s,%s,%s)", (str(subject), str(root), str(verb), H,) @@ -128,7 +125,7 @@ def get_chat_response(): def get_question_response(subject, root, verb): db = connection_to_database() cur = db.cursor(prepared=True) - if str(subject) == '[]': + if str(subject) == "[]": cur.execute("SELECT verb FROM statement_table") res = cur.fetchall() found = 0 @@ -161,7 +158,7 @@ def get_question_response(subject, root, verb): res = cur.fetchone() checkVerb = res[0] # checkVerb is a string while verb is a list. checkVerb ['verb'] - if checkVerb == '[]': + if checkVerb == "[]": cur.execute( "SELECT sentence FROM statement_table WHERE subject= %s", (str(subject[0]),) ) @@ -179,7 +176,6 @@ def get_question_response(subject, root, verb): else: B = "Sorry I don't know the response to this. Please train me." return B, chatbot.LearnResponse.TRAIN_ME.name - else: B = "Sorry I don't know the response to this. Please train me." 
return B, chatbot.LearnResponse.TRAIN_ME.name @@ -222,7 +218,7 @@ def clear_table(table_name): if input("Enter 'Y' to confirm cleaning of BOTH tables: ") in ("Y", "y",): for table in tables_to_be_cleaned: - cur.execute("DELETE FROM %s",(table,)) + cur.execute("DELETE FROM %s",(table,)) db.commit() logging.debug("Tables cleaned successfully") else: @@ -242,7 +238,6 @@ def clear_table(table_name): return db - @logger_config.logger def describe_table(cur, table_name): cur.execute("DESC %s",(table_name,)) @@ -259,4 +254,4 @@ def describe_table(cur, table_name): logging.debug() return records_no - + diff --git a/docker-compose.yml b/docker-compose.yml index 3425cc0..7b7924b 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -17,19 +17,17 @@ services: - java_storage:/usr/local/openjdk-11/ bot: - image: python:3.6 + image: chttrje/chatbot container_name: mapbot_bot environment: - - DOCKER=true - build: - context: . - dockerfile: Dockerfile + - DOCKER=Y + - GCLOUD_API_KEY depends_on: - db - java volumes: - java_storage:/usr/local/openjdk-11/:ro - - .:/usr/var + - .:/usr/var/MapBot stdin_open: true tty: true diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh new file mode 100644 index 0000000..96847c6 --- /dev/null +++ b/docker-entrypoint.sh @@ -0,0 +1,14 @@ +set -ex + +git pull + +if [ ! -d ./stanford-corenlp-full-2018-10-05 ]; then + wget http://nlp.stanford.edu/software/stanford-corenlp-full-2018-10-05.zip -nc -c +fi + +if [ -f stanford-corenlp-full-2018-10-05.zip ]; then + unzip -n stanford-corenlp-full-2018-10-05.zip + rm stanford-corenlp-full-2018-10-05.zip +fi + +python init.py diff --git a/features.py b/features.py index 987e946..c40e972 100644 --- a/features.py +++ b/features.py @@ -1,7 +1,6 @@ # pass in a sentence, pass out it's features import nltk import pandas as pd -import csv import sys import hashlib import re @@ -13,160 +12,166 @@ import logger_config log = logging.getLogger(__name__) -log.info('Entered module: %s' % __name__) +log.info("Entered module: %s" % __name__) lemma = nltk.wordnet.WordNetLemmatizer() -sno = nltk.stem.SnowballStemmer('english') - -line = ["xxx","Oracle 12.2 will be released for on-premises users on 15 March 2017",0,"S"] - -pos = [] #list of PartsOfSpeech -output = "" #comma separated string -header = "" #string for describing features header - -VerbCombos = ['VB', - 'VBD', - 'VBG', - 'VBN', - 'VBP', - 'VBZ', - 'WDT', - 'WP', - 'WP$', - 'WRB', - 'MD'] - -questionTriples = ['CD-VB-VBN', - 'MD-PRP-VB' , - 'MD-VB-CD' , - 'NN-IN-DT' , - 'PRP-VB-PRP' , - 'PRP-WP-NNP' , - 'VB-CD-VB' , - 'VB-PRP-WP' , - 'VBZ-DT-NN' , - 'WP-VBZ-DT' , - 'WP-VBZ-NNP' , - 'WRB-MD-VB'] - -statementTriples = ['DT-JJ-NN', - 'DT-NN-VBZ', - 'DT-NNP-NNP', - 'IN-DT-NN', - 'IN-NN-NNS', - 'MD-VB-VBN', - 'NNP-IN-NNP', - 'NNP-NNP-NNP', - 'NNP-VBZ-DT', - 'NNP-VBZ-NNP', - 'NNS-IN-DT', - 'VB-VBN-IN', - 'VBZ-DT-JJ'] - - -startTuples = ['NNS-DT', - 'WP-VBZ', - 'WRB-MD'] - -endTuples = ['IN-NN', - 'VB-VBN', - 'VBZ-NNP'] - -# Because python dict's return key-vals in random order, provide ordered list to pass to ML models -feature_keys = ["id", -"wordCount", -"stemmedCount", -"stemmedEndNN", -"CD", -"NN", -"NNP", -"NNPS", -"NNS", -"PRP", -"VBG", -"VBZ", -"startTuple0", -"endTuple0", -"endTuple1", -"endTuple2", -"verbBeforeNoun", -"qMark", -"qVerbCombo", -"qTripleScore", -"sTripleScore", -"class"] +sno = nltk.stem.SnowballStemmer("english") + +line = [ + "xxx", + "Oracle 12.2 will be released for on-premises users on 15 March 2017", + 0, + "S", +] + +pos = [] # list of PartsOfSpeech +output 
= "" # comma separated string +header = "" # string for describing features header + +VerbCombos = ["VB", "VBD", "VBG", "VBN", "VBP", "VBZ", "WDT", "WP", "WP$", "WRB", "MD"] + +questionTriples = [ + "CD-VB-VBN", + "MD-PRP-VB", + "MD-VB-CD", + "NN-IN-DT", + "PRP-VB-PRP", + "PRP-WP-NNP", + "VB-CD-VB", + "VB-PRP-WP", + "VBZ-DT-NN", + "WP-VBZ-DT", + "WP-VBZ-NNP", + "WRB-MD-VB", +] + +statementTriples = [ + "DT-JJ-NN", + "DT-NN-VBZ", + "DT-NNP-NNP", + "IN-DT-NN", + "IN-NN-NNS", + "MD-VB-VBN", + "NNP-IN-NNP", + "NNP-NNP-NNP", + "NNP-VBZ-DT", + "NNP-VBZ-NNP", + "NNS-IN-DT", + "VB-VBN-IN", + "VBZ-DT-JJ", +] + + +startTuples = ["NNS-DT", "WP-VBZ", "WRB-MD"] + +endTuples = ["IN-NN", "VB-VBN", "VBZ-NNP"] + +"""Because python dict's return key-vals in random order, provide ordered + list to pass to ML models""" +feature_keys = [ + "id", + "wordCount", + "stemmedCount", + "stemmedEndNN", + "CD", + "NN", + "NNP", + "NNPS", + "NNS", + "PRP", + "VBG", + "VBZ", + "startTuple0", + "endTuple0", + "endTuple1", + "endTuple2", + "verbBeforeNoun", + "qMark", + "qVerbCombo", + "qTripleScore", + "sTripleScore", + "class", +] + @logger_config.logger def strip_sentence(sentence): sentence = sentence.strip(",") - sentence = ''.join(filter(lambda x: x in string.printable, sentence)) #strip out non-alpha-numerix - sentence = sentence.translate(str.maketrans('','',string.punctuation)) #strip punctuation - return(sentence) + sentence = "".join(filter(lambda x: x in string.printable, sentence)) + # strip out non-alpha-numerix + sentence = sentence.translate(str.maketrans("", "", string.punctuation)) + # strip punctuation + return sentence + @logger_config.logger def exists_pair_combos(comboCheckList, sentence): pos = get_pos(sentence) - tag_string = "-".join([ i[1] for i in pos ]) + tag_string = "-".join([i[1] for i in pos]) combo_list = [] - for pair in itertools.permutations(comboCheckList,2): - if(pair[0] == "MD"): # * Kludge - strip off leading MD * - pair = ["",""] + for pair in itertools.permutations(comboCheckList, 2): + if pair[0] == "MD": # Kludge - strip off leading MD + pair = ["", ""] combo_list.append("-".join(pair)) if any(code in tag_string for code in combo_list): - return 1 + return 1 else: return 0 + @logger_config.logger # Parts Of Speech def get_pos(sentence): sentenceParsed = word_tokenize(sentence) - return(nltk.pos_tag(sentenceParsed)) + return nltk.pos_tag(sentenceParsed) + @logger_config.logger # Count Q-Marks def count_qmark(sentence): - return(sentence.count("?") ) + return sentence.count("?") + @logger_config.logger # Count a specific POS-Type -#VBG = count_POSType(pos,'VBG') +# VBG = count_POSType(pos,'VBG') def count_POSType(pos, ptype): - count = 0 - tags = [ i[1] for i in pos ] - return(tags.count(ptype)) - #if ptype in tags: + tags = [i[1] for i in pos] + return tags.count(ptype) + # if ptype in tags: # VBG = 1 - #return(VBG) + # return(VBG) + @logger_config.logger # Does Verb occur before first Noun def exists_vb_before_nn(pos): - pos_tags = [ i[1] for i in pos ] - #Strip the Verbs to all just "V" - pos_tags = [ re.sub(r'V.*','V', str) for str in pos_tags ] - #Strip the Nouns to all just "NN" - pos_tags = [ re.sub(r'NN.*','NN', str) for str in pos_tags ] + pos_tags = [i[1] for i in pos] + # Strip the Verbs to all just "V" + pos_tags = [re.sub(r"V.*", "V", str) for str in pos_tags] + # Strip the Nouns to all just "NN" + pos_tags = [re.sub(r"NN.*", "NN", str) for str in pos_tags] - vi =99 - ni =99 - mi =99 + vi = 99 + ni = 99 + mi = 99 - #Get first NN index + # Get first NN index if "NN" in 
pos_tags: ni = pos_tags.index("NN") - #Get first V index + # Get first V index if "V" in pos_tags: vi = pos_tags.index("V") - #get Modal Index + # get Modal Index if "MD" in pos_tags: mi = pos_tags.index("MD") - if vi < ni or mi < ni : - return(1) + if vi < ni or mi < ni: + return 1 else: - return(0) + return 0 + @logger_config.logger # Stemmed sentence ends in "NN-NN"? @@ -175,32 +180,35 @@ def exists_stemmed_end_NN(stemmed): stemmed_end = get_first_last_tuples(" ".join(stemmed))[1] if stemmed_end == "NN-NN": stemmedEndNN = 1 - return(stemmedEndNN) + return stemmedEndNN + @logger_config.logger # Go through the predefined list of start-tuples, 1 / 0 if given startTuple occurs in the list def exists_startTuple(startTuple): exists_startTuples = [] - for tstring in startTuples: #startTuples defined as global var + for tstring in startTuples: # startTuples defined as global var if startTuple in tstring: exists_startTuples.append(1) else: exists_startTuples.append(0) - return(exists_startTuples) + return exists_startTuples + @logger_config.logger # Go through the predefined list of end-tuples, 1 / 0 if given Tuple occurs in the list def exists_endTuple(endTuple): exists_endTuples = [] - for tstring in endTuples: #endTuples defined as global var + for tstring in endTuples: # endTuples defined as global var if endTuple in tstring: exists_endTuples.append(1) else: exists_endTuples.append(0) - return(exists_endTuples) + return exists_endTuples + @logger_config.logger -#loop round list of triples and construct a list of binary 1/0 vals if triples occur in list +# loop round list of triples and construct a list of binary 1/0 vals if triples occur in list def exists_triples(triples, tripleSet): exists = [] for tstring in tripleSet: @@ -208,47 +216,54 @@ def exists_triples(triples, tripleSet): exists.append(1) else: exists.append(0) - return(exists) + return exists + @logger_config.logger # Get a sentence and spit out the POS triples def get_triples(pos): list_of_triple_strings = [] - pos = [ i[1] for i in pos ] # extract the 2nd element of the POS tuples in list + pos = [i[1] for i in pos] # extract the 2nd element of the POS tuples in list n = len(pos) if n > 2: # need to have three items - for i in range(0,n-2): - t = "-".join(pos[i:i+3]) # pull out 3 list item from counter, convert to string + for i in range(0, n - 2): + t = "-".join( + pos[i : i + 3] # noqa: E203 + ) # pull out 3 list item from counter, convert to string list_of_triple_strings.append(t) return list_of_triple_strings + @logger_config.logger def get_first_last_tuples(sentence): first_last_tuples = [] sentenceParsed = word_tokenize(sentence) - pos = nltk.pos_tag(sentenceParsed) #Parts Of Speech - pos = [ i[1] for i in pos ] # extract the 2nd element of the POS tuples in list + pos = nltk.pos_tag(sentenceParsed) # Parts Of Speech + pos = [i[1] for i in pos] # extract the 2nd element of the POS tuples in list n = len(pos) first = "" last = "" if n > 1: # need to have three items - first = "-".join(pos[0:2]) # pull out first 2 list items - last = "-".join(pos[-2:]) # pull out last 2 list items + first = "-".join(pos[0:2]) # pull out first 2 list items + last = "-".join(pos[-2:]) # pull out last 2 list items first_last_tuples = [first, last] return first_last_tuples + @logger_config.logger def lemmatize(sentence): """ - pass in a sentence as a string, return just core text that has been "lematised" - stop words are removed - could effect ability to detect if this is a question or answer - - depends on import lemma = 
nltk.wordnet.WordNetLemmatizer() and from nltk.corpus import stopwords + pass in a sentence as a string, return just core text that has + been "lematised" stop words are removed - could effect ability to detect if + this is a question or answer - depends on import + lemma = nltk.wordnet.WordNetLemmatizer() + and from nltk.corpus import stopwords """ - stop_words = set(stopwords.words('english')) + stop_words = set(stopwords.words("english")) word_tokens = word_tokenize(sentence) filtered_sentence = [] @@ -261,14 +276,17 @@ def lemmatize(sentence): return lem + @logger_config.logger def stematize(sentence): """ pass in a sentence as a string, return just core text stemmed - stop words are removed - could effect ability to detect if this is a question or answer - - depends on import sno = nltk.stem.SnowballStemmer('english') and from nltk.corpus import stopwords + stop words are removed - could effect ability to detect if this is a + question or answer - depends on import + sno = nltk.stem.SnowballStemmer('english') + and from nltk.corpus import stopwords """ - stop_words = set(stopwords.words('english')) + stop_words = set(stopwords.words("english")) word_tokens = word_tokenize(sentence) filtered_sentence = [] @@ -281,114 +299,143 @@ def stematize(sentence): return stemmed + ######################################################################### # A wrapper function to put it all together - build a csv line to return # A header string is also returned for optional use -def get_string(id,sentence,c="X"): - header,output = "","" + + +def get_string(id, sentence, c="X"): + header, output = "", "" pos = get_pos(sentence) - qMark = count_qmark(sentence) #count Qmarks before stripping punctuation + qMark = count_qmark(sentence) # count Qmarks before stripping punctuation sentence = strip_sentence(sentence) - #lemmed = lemmatize(sentence) + # lemmed = lemmatize(sentence) stemmed = stematize(sentence) wordCount = len(sentence.split()) stemmedCount = len(stemmed) - qVerbCombo = exists_pair_combos(VerbCombos,sentence) + qVerbCombo = exists_pair_combos(VerbCombos, sentence) verbBeforeNoun = exists_vb_before_nn(pos) - output = id + "," + str(wordCount) + "," + str(stemmedCount) + "," + str(qVerbCombo)+ "," + str(qMark) + "," + str(verbBeforeNoun) + output = ( + id + + "," + + str(wordCount) + + "," + + str(stemmedCount) + + "," + + str(qVerbCombo) + + "," + + str(qMark) + + "," + + str(verbBeforeNoun) + ) header = header + "id,wordCount,stemmedCount,qVerbCombo,qMark,verbBeforeNoun" # list of POS-TYPES to count , generate a list of counts in the CSV line - for ptype in ["VBG", "VBZ", "NNP", "NN", "NNS", "NNPS","PRP", "CD" ]: - output = output + "," + str( count_POSType(pos,ptype) ) + for ptype in ["VBG", "VBZ", "NNP", "NN", "NNS", "NNPS", "PRP", "CD"]: + output = output + "," + str(count_POSType(pos, ptype)) header = header + "," + ptype output = output + "," + str(exists_stemmed_end_NN(stemmed)) header = header + ",StemmedEndNN," - ## get Start Tuples and End Tuples Features ## - startTuple,endTuple = get_first_last_tuples(sentence) + # get Start Tuples and End Tuples Features ## + startTuple, endTuple = get_first_last_tuples(sentence) - l = exists_startTuple(startTuple) #list [1/0] for exists / not exists - output = output + "," + ",".join(str(i) for i in l) - for i in range(0,len(l)): - header = header + "startTuple" + str(i+1) + "," + list1 = exists_startTuple(startTuple) # list [1/0] for exists / not exists + output = output + "," + ",".join(str(i) for i in list1) + for i in range(0, len(list1)): 
+ header = header + "startTuple" + str(i + 1) + "," - l = exists_endTuple(endTuple) #list [1/0] for exists / not exists - output = output + "," + ",".join(str(i) for i in l) - for i in range(0,len(l)): - header = header + "endTuple" + str(i+1) + "," + list1 = exists_endTuple(endTuple) # list [1/0] for exists / not exists + output = output + "," + ",".join(str(i) for i in list1) + for i in range(0, len(list1)): + header = header + "endTuple" + str(i + 1) + "," - ## look for special Triple Combinations ## + # look for special Triple Combinations ## triples = get_triples(pos) # all the triple sequences in the sentence POS list - l = exists_triples(triples, questionTriples) - total = sum(l) + list1 = exists_triples(triples, questionTriples) + total = sum(list1) output = output + "," + str(total) header = header + "qTripleScore" + "," - l = exists_triples(triples, statementTriples) - total = sum(l) + list1 = exists_triples(triples, statementTriples) + total = sum(list1) output = output + "," + str(total) header = header + "sTripleScore" + "," - output = output + "," + c #Class Type on end + output = output + "," + c # Class Type on end header = header + "class" - return output,header + return output, header + + # End of Get String wrapper + @logger_config.logger # Build a dictionary of features -def features_dict(id,sentence,c="X"): +def features_dict(id, sentence, c="X"): features = {} pos = get_pos(sentence) features["id"] = id - features["qMark"] = count_qmark(sentence) #count Qmarks before stripping punctuation + features["qMark"] = count_qmark( + sentence + ) # count Qmarks before stripping punctuation sentence = strip_sentence(sentence) stemmed = stematize(sentence) - startTuple,endTuple = get_first_last_tuples(sentence) + startTuple, endTuple = get_first_last_tuples(sentence) features["wordCount"] = len(sentence.split()) features["stemmedCount"] = len(stemmed) - features["qVerbCombo"] = exists_pair_combos(VerbCombos,sentence) + features["qVerbCombo"] = exists_pair_combos(VerbCombos, sentence) features["verbBeforeNoun"] = exists_vb_before_nn(pos) - for ptype in ["VBG", "VBZ", "NNP", "NN", "NNS", "NNPS","PRP", "CD" ]: - features[ptype] = count_POSType(pos,ptype) + for ptype in ["VBG", "VBZ", "NNP", "NN", "NNS", "NNPS", "PRP", "CD"]: + features[ptype] = count_POSType(pos, ptype) features["stemmedEndNN"] = exists_stemmed_end_NN(stemmed) - l = exists_startTuple(startTuple) #list [1/0] for exists / not exists - for i in range(0,len(l)): - features["startTuple" + str(i)] = l[i] + list1 = exists_startTuple(startTuple) # list [1/0] for exists / not exists + for i in range(0, len(list1)): + features["startTuple" + str(i)] = list1[i] - l = exists_endTuple(endTuple) #list [1/0] for exists / not exists - for i in range(0,len(l)): - features["endTuple" + str(i)] = l[i] + list1 = exists_endTuple(endTuple) # list [1/0] for exists / not exists + for i in range(0, len(list1)): + features["endTuple" + str(i)] = list1[i] - ## look for special Triple Combinations ## + # look for special Triple Combinations ## triples = get_triples(pos) # all the triple sequences in the sentence POS list - l = exists_triples(triples, questionTriples) # a list of 1/0 for hits on this triple-set - features["qTripleScore"] = sum(l) # add all the triple matches up to get a score + list1 = exists_triples( + triples, questionTriples + ) # a list of 1/0 for hits on this triple-set + features["qTripleScore"] = sum( + list1 + ) # add all the triple matches up to get a score - l = exists_triples(triples, statementTriples) # Do same 
check for the Statement t-set - features["sTripleScore"] = sum(l) # add all the triple matches up to get a score + list1 = exists_triples( + triples, statementTriples + ) # Do same check for the Statement t-set + features["sTripleScore"] = sum( + list1 + ) # add all the triple matches up to get a score - features["class"] = c #Class Type on end + features["class"] = c # Class Type on end return features + @logger_config.logger # pass in dict, get back series def features_series(features_dict): - values=[] + values = [] for key in feature_keys: values.append(features_dict[key]) @@ -396,15 +443,17 @@ def features_series(features_dict): return features_series -## MAIN ## -if __name__ == '__main__': - # ID, WordCount, StemmedCount, Qmark, VBG, StemmedEnd, StartTuples, EndTuples, QuestionTriples, StatementTriples, Class - # [1/0] [NN-NN?] [3 x binary] [3 x binary] [10 x binary] [10 x binary] +# MAIN ## +if __name__ == "__main__": + + """ID, WordCount, StemmedCount, Qmark, VBG, StemmedEnd, StartTuples, + EndTuples, QuestionTriples, StatementTriples, Class + [1/0] [NN-NN?] [3 x binary] [3 x binary] [10 x binary] [10 x binary]""" logging.debug("Starting...") - c = "X" # Dummy class + c = "X" # Dummy class header = "" output = "" @@ -413,21 +462,21 @@ def features_series(features_dict): else: sentence = line[1] - id = hashlib.md5(str(sentence).encode('utf-8')).hexdigest()[:16] + id = hashlib.md5(str(sentence).encode("utf-8")).hexdigest()[:16] - features = features_dict(id,sentence, c) - pos = get_pos(sentence) #NLTK Parts Of Speech, duplicated just for the printout + features = features_dict(id, sentence, c) + pos = get_pos(sentence) # NLTK Parts Of Speech, duplicated just for the printout logging.debug(pos) logging.debug(features) - for key,value in features.items(): + for key, value in features.items(): logging.debug(key, value) - #header string + # header string for key, value in features.items(): - header = header + ", " + key #keys come out in a random order - output = output + ", " + str(value) - header = header[1:] #strip the first ","" off - output = output[1:] #strip the first ","" off + header = header + ", " + key # keys come out in a random order + output = output + ", " + str(value) + header = header[1:] # strip the first ","" off + output = output[1:] # strip the first ","" off logging.debug("HEADER:", header) logging.debug("VALUES:", output) diff --git a/featuresDump.py b/featuresDump.py index 51b40f6..3ff726d 100644 --- a/featuresDump.py +++ b/featuresDump.py @@ -1,88 +1,90 @@ # Use the features.py module to dump out features # read in a CSV of sentences and bulk-dump to dump.csv of features -#Input CSV fmt: 1st field is sentence ID, 2nd field is text to process, 3rd field is class +# Input CSV fmt: 1st field is sentence ID, 2nd field is text to process, 3rd field is class import csv import sys import hashlib from pathlib import Path -import features # features.py is bepoke util to extract NLTK POS features from sentences +import features # features.py is bepoke util to extract NLTK POS features from sentences import logging log = logging.getLogger(__name__) -log.info('Entered module: %s' % __name__) +log.info("Entered module: %s" % __name__) if len(sys.argv) > 1: FNAME = Path(sys.argv[1]) else: - FNAME = Path('./analysis/sentences.csv') + FNAME = Path("./analysis/sentences.csv") logging.debug("reading input from ", FNAME) - if len(sys.argv) > 2: FOUT = Path(sys.argv[2]) else: - FOUT = Path('./analysis/featuresDump.csv') + FOUT = Path("./analysis/featuresDump.csv") 
logging.debug("Writing output to ", FOUT) -fin = open(FNAME, 'rt') -fout = open(FOUT, 'wt', newline='') - -keys = ["id", -"wordCount", -"stemmedCount", -"stemmedEndNN", -"CD", -"NN", -"NNP", -"NNPS", -"NNS", -"PRP", -"VBG", -"VBZ", -"startTuple0", -"endTuple0", -"endTuple1", -"endTuple2", -"verbBeforeNoun", -"qMark", -"qVerbCombo", -"qTripleScore", -"sTripleScore", -"class"] +fin = open(FNAME, "rt") +fout = open(FOUT, "wt", newline="") + +keys = [ + "id", + "wordCount", + "stemmedCount", + "stemmedEndNN", + "CD", + "NN", + "NNP", + "NNPS", + "NNS", + "PRP", + "VBG", + "VBZ", + "startTuple0", + "endTuple0", + "endTuple1", + "endTuple2", + "verbBeforeNoun", + "qMark", + "qVerbCombo", + "qTripleScore", + "sTripleScore", + "class", +] reader = csv.reader(fin) loopCount = 0 -next(reader) #Assume we have a header +next(reader) # Assume we have a header for line in reader: sentence = line[0] - c = line[1] #class-label - id = hashlib.md5(str(sentence).encode('utf-8')).hexdigest()[:16] # generate a unique ID + c = line[1] # class-label + id = hashlib.md5(str(sentence).encode("utf-8")).hexdigest()[:16] + # generate a unique ID output = "" header = "" - #get header and string output - #output, header = features.get_string(id,sentence,c) - f = features.features_dict(id,sentence, c) + # get header and string output + # output, header = features.get_string(id,sentence,c) + f = features.features_dict(id, sentence, c) for key in keys: value = f[key] header = header + ", " + key output = output + ", " + str(value) - if loopCount == 0: # only extract and print header for first dict item - header = header[1:] #strip the first ","" off - logging.debug(header) - fout.writelines(header + '\n') + if loopCount == 0: # only extract and print header for first dict item + header = header[1:] # strip the first ","" off + logging.debug(header) + fout.writelines(header + "\n") - output = output[1:] #strip the first ","" off + output = output[1:] # strip the first ","" off loopCount = loopCount + 1 logging.debug(output) - fout.writelines(output + '\n') + fout.writelines(output + "\n") fin.close() fout.close() diff --git a/googleMapsApiModule.py b/googleMapsApiModule.py index 17867d6..7daa638 100644 --- a/googleMapsApiModule.py +++ b/googleMapsApiModule.py @@ -1,54 +1,120 @@ import googlemaps import webbrowser import config -import mysql.connector from constants import BASE_URL import requests import logging import logger_config +from datetime import datetime +import calendar -gmaps = googlemaps.Client(config.key) #global variable gmaps + +gmaps = googlemaps.Client(config.key) # global variable gmaps log = logging.getLogger(__name__) -log.info('Entered module: %s' % __name__) +log.info("Entered module: %s" % __name__) + -@logger_config.logger -def direction(origin,destination): - result = gmaps.directions(origin,destination) - address = f'origin={origin}&destination={destination}' +@logger_config.logger +def direction(origin, destination): + result = gmaps.directions(origin, destination) + logging.debug("Summary: " + result[0]["summary"]) + address = f"origin={origin}&destination={destination}" result_url = f'{BASE_URL["direction"]}&{address.lower().replace(" ", "+")}' logging.debug(result_url) webbrowser.open_new(result_url) + return result_url + + +@logger_config.logger +def get_timestamp(date_time): + yr, mon, day, hr, mi = map(int, date_time.split()) + d = datetime(yr, mon, day, hr, mi) + timestamp = calendar.timegm(d.timetuple()) + return timestamp + + +@logger_config.logger +def get_lat_lng(place): + response = 
requests.get(f'{BASE_URL["latlng"]}?address={place}&key={config.key}') + resp_json_payload = response.json() + lat_lng = resp_json_payload["results"][0]["geometry"]["location"] + return lat_lng + + +@logger_config.logger +def timezone(place, date_time): + # format of datetime should be YYYY MM DD Hrs Mins and place should be a normal string + lat_lng = get_lat_lng(place) + timestamp = get_timestamp(date_time) + response = requests.get( + f'{BASE_URL["timezone"]}?location={lat_lng["lat"]},{lat_lng["lng"]}×tamp={timestamp}&key={config.key}' + ) + resp_dict = response.json() + for key in resp_dict: + print(f"{key} : {resp_dict[key]}") + return resp_dict["timeZoneId"] -@logger_config.logger + +@logger_config.logger def geocoding(search_location): result = gmaps.geocode(search_location) - logging.debug("Formatted Address: "+result[0]['formatted_address']) - logging.debug("Latitude: "+str(result[0]['geometry']['location']['lat'])+" "+"Longitude: "+str(result[0]['geometry']['location']['lng'])) + logging.debug("Formatted Address: " + result[0]["formatted_address"]) + logging.debug( + "Latitude: " + + str(result[0]["geometry"]["location"]["lat"]) + + " " + + "Longitude: " + + str(result[0]["geometry"]["location"]["lng"]) + ) address = search_location result_url = f'{BASE_URL["geocoding"]}={address.lower().replace(" ", "+")}' webbrowser.open_new(result_url) + return result_url + -@logger_config.logger +@logger_config.logger def mapsstatic(search_location): address = search_location - result_url = f'https://maps.googleapis.com/maps/api/staticmap?center={address.lower().replace(" ", "+")}&zoom=13&scale=1&size=600x350&maptype=roadmap&key={config.key}&format=png&visual_refresh=true&markers=size:mid%7Ccolor:0xff0000%7Clabel:L%7C{address.lower().replace(" ", "+")}' + result_url = f'{BASE_URL["mapsstatic"]}?center={address.lower().replace(" ", "+")}&zoom=13&scale=1&size=600x350&maptype=roadmap&key={config.key}&format=png&visual_refresh=true&markers=size:mid%7Ccolor:0xff0000%7Clabel:L%7C{address.lower().replace(" ", "+")}' # noqa: E501 logging.debug(result_url) webbrowser.open_new(result_url) + return result_url + """Summary or Description of the Function Parameters: - search_location(str): The location entered by user + search_location(str): The location entered by user Returns: result_value(int): elevation(in metres) above/below sea level """ + + +@logger_config.logger def elevation(search_location): result = gmaps.geocode(search_location) - json = requests.get(f'https://maps.googleapis.com/maps/api/elevation/json?locations={result[0]["geometry"]["location"]["lat"]},{result[0]["geometry"]["location"]["lng"]}&key={config.key}').json() - result_value = json['results'][0]['elevation'] - position = "above" if result_value>0 else "below" - print(f'{search_location} is {round(result_value,2)} metres {position} sea level') - + json = requests.get( + f'{BASE_URL["elevation"]}?locations={result[0]["geometry"]["location"]["lat"]},{result[0]["geometry"]["location"]["lng"]}&key={config.key}' # noqa: E501 + ).json() + result_value = json["results"][0]["elevation"] + position = "above" if result_value > 0 else "below" + print(f"{search_location} is {round(result_value,2)} metres {position} sea level") + return result_value +@logger_config.logger +def places(search_location): + address = search_location + json = requests.get( + f'{BASE_URL["places"]}/findplacefromtext/json?input={address.lower().replace(" ", "+")}&inputtype=textquery&fields=photos,formatted_address,place_id&key={config.key}' # noqa: E501 + ).json() + 
logging.debug("Address:" + json["candidates"][0]["formatted_address"]) + details = requests.get( + f'{BASE_URL["places"]}/details/json?place_id={json["candidates"][0]["place_id"]}&fields=rating,formatted_phone_number&key={config.key}' # noqa: E501 + ).json() + logging.debug("Rating:" + str(details["result"]["rating"])) + logging.debug("Phone:" + details["result"]["formatted_phone_number"]) + photo = f'{BASE_URL["places"]}/photo?maxwidth=400&photoreference={json["candidates"][0]["photos"][0]["photo_reference"]}&key={config.key}' # noqa: E501 + webbrowser.open_new(photo) + return json["candidates"][0]["place_id"] diff --git a/init.py b/init.py index a208485..47954b9 100644 --- a/init.py +++ b/init.py @@ -1,7 +1,11 @@ from chatbot import setup from chatbot import message_to_bot -clf, learn_response = setup() -while(True): - received_message = input("You: ") - send_message, learn_response = message_to_bot(received_message,clf,learn_response) - print("MapBot: "+send_message) \ No newline at end of file + +clf, learn_response = setup() +EXIT_CONVERSATION = "Bye! I'll miss you!" +while True: + received_message = input("You: ") + send_message, learn_response = message_to_bot(received_message, clf, learn_response) + print("MapBot: " + send_message) + if send_message == EXIT_CONVERSATION: + break diff --git a/logger_config.py b/logger_config.py index 7747aa1..b410f33 100644 --- a/logger_config.py +++ b/logger_config.py @@ -1,17 +1,19 @@ +from functools import wraps import logging -FORMAT = '%(asctime)s - %(name)-s - %(levelname)-s - %(message)s' + +FORMAT = "%(asctime)s - %(name)-s - %(levelname)-s - %(message)s" LEVEL = logging.DEBUG logging.basicConfig(format=FORMAT, level=LEVEL) log = logging.getLogger(__name__) + def logger(fn): - from functools import wraps - import inspect @wraps(fn) def wrapper(*args, **kwargs): log = logging.getLogger(fn.__name__) - log.info('About to run %s' % fn.__name__) + log.info("About to run %s" % fn.__name__) out = fn(*args, **kwargs) - log.info('Done running %s' % fn.__name__) + log.info("Done running %s" % fn.__name__) return out + return wrapper diff --git a/requirements.txt b/requirements.txt index 3f3ef82..98cef71 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,3 +4,8 @@ pandas==0.25.1 mysql-connector-python==8.0.18 numpy==1.16.5 scikit-learn==0.21.3 +flake8==3.7.9 +pytest==5.4.1 +python-dotenv==0.12.0 +black==19.10b0 +xgboost==1.0.0 \ No newline at end of file diff --git a/start.sh b/start.sh index 0b5b20d..31cf261 100644 --- a/start.sh +++ b/start.sh @@ -1,3 +1,9 @@ #!/usr/bin/env bash -docker-compose up -d && docker attach mapbot_bot +if [ ! 
-d MapBot ]; then + git clone https://github.com/vishakha-lall/MapBot.git -b gssoc-master +fi + +export GCLOUD_API_KEY= + +docker-compose -f ./MapBot/docker-compose.yml up -d && docker attach mapbot_bot diff --git a/telegram.py b/telegram.py new file mode 100644 index 0000000..e1be808 --- /dev/null +++ b/telegram.py @@ -0,0 +1,107 @@ +import requests +import json +import time +import logging +import logger_config + +log = logging.getLogger(__name__) +log.info("Entered module: %s" % __name__) + + +class TelegramBot(object): + """Class for TelegramBot housing functions required to interact with Telegram.""" + + @logger_config.logger + def __init__(self, TOKEN: str) -> None: + """Initiates a TelegramBot object with unique Telegram BOT_TOKEN and creates the base URL.""" + super(TelegramBot, self).__init__() + self.TOKEN = TOKEN + self.URL = f"https://api.telegram.org/bot{TOKEN}/" + logging.debug("Telegram Bot ready") + + @logger_config.logger + def send_message(self, text: str, chat_id: int) -> None: + """Combine :text: and :chat_id:, create message and perform requests to Telegram Bot API.""" + url = self.URL + f"sendMessage?text={text}&chat_id={chat_id}" + self.get_url(url) + + @logger_config.logger + def get_url(self, url: str) -> str: + """Gather response from :url: and decode using 'utf8'.""" + response = requests.get(url) + content = response.content.decode("utf8") + return content + + @logger_config.logger + def get_json_from_url(self, url: str) -> dict: + """Takes :url: and returns json-like object of response.""" + content = self.get_url(url) + js = json.loads(content) + return js + + @logger_config.logger + def get_updates(self) -> dict: + """Gets json-like object of message Updates to bot.""" + url = self.URL + "getUpdates" + js = self.get_json_from_url(url) + return js + + @logger_config.logger + def get_last_chat_id_and_text(self) -> (str, int): + """Fetches :updates: and returns last update's :text: and :chat_id:.""" + updates = self.get_updates() + try: + text = updates["result"][-1]["message"]["text"] + except Exception: + logging.debug("Message not text") + text = None + chat_id = updates["result"][-1]["message"]["chat"]["id"] + return (text, chat_id) + + +if __name__ == "__main__": + import config + + TOKEN = config.tbot_token + # Creates a TelegramBot object with tbot_token present in `config.py` + tbot = TelegramBot(TOKEN) + + from chatbot import setup + from chatbot import message_to_bot + + clf, learn_response = setup() + EXIT_CONVERSATION = "Bye! I'll miss you!" + CONFUSED_CONVERSATION = "Sorry, I didn't get you. Could you try again?" + logging.debug("MapBot ready") + + last_textchat = (None, None) + # initialized to continously check for new messages + try: + while True: + received_message, chat_id = tbot.get_last_chat_id_and_text() + logging.debug(received_message) + logging.debug(chat_id) + + if (received_message, chat_id) != last_textchat: + # checking if any new messages have arrived since the last message + + if received_message is None: + # if latest message to bot is not of text format + print(chat_id) + tbot.send_message(CONFUSED_CONVERSATION, chat_id) + else: + logging.debug("Received: " + received_message) + reply_message, learn_response = message_to_bot( + received_message, clf, learn_response + ) + tbot.send_message(reply_message, chat_id) + if reply_message == EXIT_CONVERSATION: + break + + last_textchat = (received_message, chat_id) + # Wait for 0.5 secs before rechecking for new messages. 
+            time.sleep(0.5)
+
+    except Exception as e:
+        logging.debug("EXCEPTION OCCURRED")
+        logging.debug(e)
diff --git a/test_databaseconnect.py b/test_databaseconnect.py
new file mode 100644
index 0000000..25ad8ee
--- /dev/null
+++ b/test_databaseconnect.py
@@ -0,0 +1,138 @@
+import databaseconnect
+import config
+import pytest
+import mysql.connector as mysql
+
+
+class TestClass:
+    @pytest.fixture(scope="session", autouse=True)
+    def setup_init(self):
+        # Will be executed before the first test
+        main_database = config.database
+        config.database = "test"
+        try:
+            test_db = mysql.connect(
+                host=config.host,
+                user=config.user,
+                passwd=config.password,
+                database=config.database,
+            )
+
+            cursor = test_db.cursor()
+            cursor.execute("CREATE DATABASE {}".format(config.database))
+            print("test database created")
+
+        except Exception:
+            print("Failed to create test database")
+            # rolling back to main db
+            config.database = main_database
+            pytest.exit("Exiting test!")
+
+        yield test_db
+
+        # Will be executed after the last test is executed
+        try:
+            mycursor = test_db.cursor()
+            mycursor.execute("DROP DATABASE {}".format(config.database))
+            mycursor.close()
+            print("test database deleted.")
+
+        except Exception:
+            print("Failed to delete test database.")
+
+        config.database = main_database
+
+    def test_setup_database(self):
+        db = databaseconnect.setup_database()
+        cursor = db.cursor()
+        cursor.execute("SHOW TABLES")
+        tables = [row[0] for row in cursor.fetchall()]
+        expected_tables = [
+            "chat_table",
+            "statement_table",
+            "question_table",
+            "directions_table",
+        ]
+        assert sorted(tables) == sorted(expected_tables)
+
+    def test_add_to_database_of_chat_table(self):
+        db = databaseconnect.add_to_database("C", "subject", "root", "verb", "H")
+        cursor = db.cursor()
+        cursor.execute(
+            "select * from chat_table where root_word='root' and verb='verb' and sentence='H'"
+        )
+        res = cursor.fetchone()[0]
+        assert res == 1
+
+    def test_add_to_database_of_question_table(self):
+        db = databaseconnect.add_to_database("Q", "subject", "root", "verb", "H")
+        cursor = db.cursor()
+        cursor.execute(
+            "select * from question_table where subject='subject' and root_word='root' and verb='verb' and sentence='H'"
+        )
+        res = cursor.fetchone()[0]
+        assert res == 1
+
+    def test_add_to_database_of_statement_table(self):
+        db = databaseconnect.add_to_database("O", "subject", "root", "verb", "H")
+        cursor = db.cursor()
+        cursor.execute(
+            "select * from statement_table where subject='subject' and root_word='root' and verb='verb' and sentence='H'"
+        )
+        res = cursor.fetchone()[0]
+        assert res == 1
+
+    def test_get_chat_response(self):
+        response = databaseconnect.get_chat_response()
+        assert type(response) is str
+
+    def test_get_question_response_without_subject(self):
+        response = databaseconnect.get_question_response("[]", "root", "verb")
+        assert type(response) is tuple
+
+    def test_get_question_response_with_subject(self):
+        response = databaseconnect.get_question_response("subject", "root", "verb")
+        assert type(response) is tuple
+
+    def test_add_learnt_statement_to_database(self):
+        db = databaseconnect.add_learnt_statement_to_database("subject", "root", "verb")
+        cursor = db.cursor()
+        cursor.execute(
+            "select * from question_table where subject='subject' and root_word='root' and verb='verb'"
+        )
+        res = cursor.fetchone()[0]
+        assert res == 1
+
+    def test_learn_question_response(self):
+        response = databaseconnect.learn_question_response("H")
+        assert type(response) is tuple
+
+    def test_clear_table_with_chat_table(self, monkeypatch):
+        from io import StringIO
+
+        yes = StringIO("y\n")
+        monkeypatch.setattr("sys.stdin", yes)
+
+        db = databaseconnect.clear_table("chat_table")
+
+        cursor = db.cursor()
+        cursor.execute("select * from chat_table")
+        entries = cursor.fetchone()
+        assert entries is None
+
+    def test_clear_table_with_statement_or_question_table(self, monkeypatch):
+        from io import StringIO
+
+        yes = StringIO("y\n")
+        monkeypatch.setattr("sys.stdin", yes)
+
+        db = databaseconnect.clear_table("statement_table")
+
+        cursor = db.cursor()
+        cursor.execute("select * from statement_table")
+        entries_1 = cursor.fetchone()
+
+        cursor.execute("select * from question_table")
+        entries_2 = cursor.fetchone()
+
+        assert entries_1 is None and entries_2 is None
diff --git a/test_googleMapsApiModule.py b/test_googleMapsApiModule.py
new file mode 100644
index 0000000..ea80c9c
--- /dev/null
+++ b/test_googleMapsApiModule.py
@@ -0,0 +1,70 @@
+import googleMapsApiModule
+import config
+import pytest
+from googlemaps.exceptions import ApiError
+
+
+class TestClass:
+    def test_direction_with_valid_input(self):
+        result = googleMapsApiModule.direction("paris", "brussels")
+        assert (
+            result
+            == "https://www.google.com/maps/dir/?api=1&origin=paris&destination=brussels"
+        )
+
+    def test_direction_with_invalid_input(self):
+        with pytest.raises(ApiError):
+            result = googleMapsApiModule.direction("kjajw", "qwiuq")
+            assert (
+                result
+                == "https://www.google.com/maps/dir/?api=1&origin=kjajw&destination=qwiuq"
+            )
+
+    def test_geocoding_with_valid_input(self):
+        result = googleMapsApiModule.geocoding("denver")
+        assert result == "https://www.google.com/maps/search/?api=1&query=denver"
+
+    def test_geocoding_with_invalid_input(self):
+        with pytest.raises(IndexError):
+            result = googleMapsApiModule.geocoding("kahakd...")
+            assert result == "https://www.google.com/maps/search/?api=1&query=kahakd..."
+
+    def test_mapsstatic_with_valid_input(self):
+        result = googleMapsApiModule.mapsstatic("sydney")
+        assert (
+            result
+            == "https://maps.googleapis.com/maps/api/staticmap?center=sydney&zoom=13&scale=1&size=600x350&maptype=roadmap&key="
+            + config.key
+            + "&format=png&visual_refresh=true&markers=size:mid%7Ccolor:0xff0000%7Clabel:L%7Csydney"
+        )
+
+    def test_elevation_with_valid_input(self):
+        result = googleMapsApiModule.elevation("moscow")
+        assert type(result) is float
+
+    def test_elevation_with_invalid_input(self):
+        with pytest.raises(IndexError):
+            result = googleMapsApiModule.elevation("hihih")
+            assert type(result) is float
+
+    def test_places_with_valid_input(self):
+        result = googleMapsApiModule.places("princeton university")
+        assert result == "ChIJ6baYzdjmw4kRTwKQ-tZ-ugI"
+
+    def test_places_with_invalid_input(self):
+        with pytest.raises(IndexError):
+            result = googleMapsApiModule.places("esffsf")
+            assert (
+                result
+                == "https://maps.googleapis.com/maps/api/place/photo?maxwidth=400&photoreference=CmRaAAAA8o1VGVvds8zkqh745Pa6t2KcBbMA&key="  # noqa: E501
+                + config.key
+            )
+
+    def test_timezone_with_valid_input(self):
+        result = googleMapsApiModule.timezone("ohio", "2000 11 21 11 41")
+        assert result == "America/New_York"
+
+    def test_timezone_with_invalid_input(self):
+        with pytest.raises(ValueError):
+            result = googleMapsApiModule.timezone("wijd..", "2000 18 21 11 41")
+            assert result == "America/New_York"
diff --git a/test_utilities.py b/test_utilities.py
new file mode 100644
index 0000000..f462f71
--- /dev/null
+++ b/test_utilities.py
@@ -0,0 +1,57 @@
+import utilities
+
+
+class TestClass:
+    test_input = "The quick brown fox jumps over the lazy dog."
+    clf = utilities.classify_model()
+
+    def test_setup_nltk(self):
+        result = utilities.setup_nltk()
+        assert result
+
+    def test_parse_sentence(self):
+        triples, root = utilities.parse_sentence(self.test_input)
+        triples = list(triples)
+        assert (("jumps", "VBZ"), "nsubj", ("fox", "NN")) in triples
+        assert (("jumps", "VBZ"), "nmod", ("dog", "NN")) in triples
+        assert root == "jumps"
+
+    def test_classify_model(self):
+        from features import features_dict
+        import hashlib
+        import numpy as np
+
+        keys = [
+            "id",
+            "wordCount",
+            "stemmedCount",
+            "stemmedEndNN",
+            "CD",
+            "NN",
+            "NNP",
+            "NNPS",
+            "NNS",
+            "PRP",
+            "VBG",
+            "VBZ",
+            "startTuple0",
+            "endTuple0",
+            "endTuple1",
+            "endTuple2",
+            "verbBeforeNoun",
+            "qMark",
+            "qVerbCombo",
+            "qTripleScore",
+            "sTripleScore",
+            "class",
+        ]
+        id = hashlib.md5(str(self.test_input).encode("utf-8")).hexdigest()[:16]
+        f = features_dict(id, self.test_input)
+        features = [f[k] for k in keys][1:-1]
+        features = np.array(features).reshape(1, -1)
+
+        assert self.clf.predict(features)[0] == "S"
+
+    def test_classify_sentence(self):
+        result = utilities.classify_sentence(self.clf, self.test_input)
+        assert result == "S"
diff --git a/utilities.py b/utilities.py
index 75255ee..b21e8b3 100644
--- a/utilities.py
+++ b/utilities.py
@@ -1,87 +1,149 @@
 from pathlib import Path
 import logging
-import logger_config
+import logger_config
 log = logging.getLogger(__name__)
-log.info('Entered module: %s' % __name__)
+log.info("Entered module: %s" % __name__)
+
 @logger_config.logger
 def setup_nltk():
     import nltk
-    nltk.download('punkt')
-    nltk.download('averaged_perceptron_tagger')
-    nltk.download('stopwords')
-@logger_config.logger
-#grammar parsing
-def parse_sentence(user_input): #returns root word, triples of StanfordDependencyParser
+    punkt = nltk.download("punkt")
+    averaged_perceptron_tagger = nltk.download("averaged_perceptron_tagger")
nltk.download("averaged_perceptron_tagger") + stopwords = nltk.download("stopwords") + + return all((punkt, averaged_perceptron_tagger, stopwords)) + + +@logger_config.logger +# grammar parsing +def parse_sentence(user_input): + # returns root word, triples of StanfordDependencyParser import os from nltk.parse.stanford import StanfordDependencyParser import config + path_to_jar = config.stanford_path_to_jar path_to_models_jar = config.stanford_path_to_models_jar - dependency_parser = StanfordDependencyParser(path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar) - os.environ['JAVAHOME'] = config.javahome + dependency_parser = StanfordDependencyParser( + path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar + ) + os.environ["JAVAHOME"] = config.javahome result = dependency_parser.raw_parse(user_input) - dep = next(result) # get next item from the iterator result - return dep.triples(),dep.root["word"] + dep = next(result) # get next item from the iterator result + return dep.triples(), dep.root["word"] + @logger_config.logger -#classification into statements questions and chat +# classification into statements questions and chat def classify_model(): import numpy as np import pandas as pd from sklearn.ensemble import RandomForestClassifier + + FNAME = Path("analysis/featuresDump.csv") + df = pd.read_csv(filepath_or_buffer=FNAME,) + df.columns = df.columns[:].str.strip() # Strip any leading spaces from col names + df["class"] = df["class"].map(lambda x: x.strip()) + width = df.shape[1] + # split into test and training (is_train: True / False col) + np.random.seed(seed=1) + df["is_train"] = np.random.uniform(0, 1, len(df)) <= 0.75 + train, test = ( + df[df["is_train"] == True], + df[df["is_train"] == False], + ) # noqa: E712 + features = df.columns[1 : width - 1] # noqa: E203 + # remove the first ID col and last col=classifier + # Fit an RF Model for "class" given features + clf = RandomForestClassifier(n_jobs=2, n_estimators=100) + clf.fit(train[features], train["class"]) + # Predict against test set + preds = clf.predict(test[features]) + predout = ( # noqa: F841 + pd.DataFrame({"id": test["id"], "predicted": preds, "actual": test["class"]}), + ) + # Creates F841 in flake8. Needs to be addressed. `predout` is never used. + return clf + +@logger_config.logger +# classification into statements questions and chat with more choice of ml algorithms parameter tuned by Gridsearchcv. 
+# By default this uses random forest if no argument passed.by @eaglewarrior +def classify_model_adv(model="rf"): + import numpy as np + import pandas as pd + from sklearn.ensemble import RandomForestClassifier + from sklearn.svm import SVC + from xgboost import XGBClassifier + from sklearn.ensemble import AdaBoostClassifier + from sklearn.tree import DecisionTreeClassifier + from sklearn.naive_bayes import MultinomialNB FNAME = Path('analysis/featuresDump.csv') - df = pd.read_csv(filepath_or_buffer = FNAME, ) + df = pd.read_csv(filepath_or_buffer=FNAME, ) df.columns = df.columns[:].str.strip() # Strip any leading spaces from col names df['class'] = df['class'].map(lambda x: x.strip()) width = df.shape[1] - #split into test and training (is_train: True / False col) + # split into test and training (is_train: True / False col) np.random.seed(seed=1) df['is_train'] = np.random.uniform(0, 1, len(df)) <= .75 - train, test = df[df['is_train']==True], df[df['is_train']==False] - features = df.columns[1:width-1] #remove the first ID col and last col=classifier - # Fit an RF Model for "class" given features - clf = RandomForestClassifier(n_jobs=2, n_estimators = 100) + train, test = df[df['is_train'] == True], df[df['is_train'] == False] + features = df.columns[1:width-1] # remove the first ID col and last col=classifier + # Fit an Model for "class" given features rf:random forest,xgb:xgboost,nb:naive bayes,ada:adaboost + if model=="svm": + clf = SVC(C=1000, cache_size=200, class_weight=None, coef0=0.0,decision_function_shape='ovr', degree=3, gamma=0.0001, kernel='rbf',max_iter=-1, probability=False, random_state=None, shrinking=True,tol=0.001, verbose=False) + elif model=="rf": + clf= RandomForestClassifier(n_jobs=2, n_estimators=100) + elif model=="xgb": + clf= XGBClassifier(learning_rate= 0.01, n_estimators= 500) + elif model=="nb": + clf= MultinomialNB() + ## best performance by adaboost + elif model=="ada": + clf=AdaBoostClassifier(learning_rate=0.001,n_estimators=2000) clf.fit(train[features], train['class']) # Predict against test set preds = clf.predict(test[features]) - predout = pd.DataFrame({ 'id' : test['id'], 'predicted' : preds, 'actual' : test['class'] }) + predout = pd.DataFrame({'id': test['id'], 'predicted': preds, 'actual': test['class']}) return clf @logger_config.logger -def classify_sentence(clf,user_input): +def classify_sentence(clf, user_input): import features import pandas as pd - keys = ["id", - "wordCount", - "stemmedCount", - "stemmedEndNN", - "CD", - "NN", - "NNP", - "NNPS", - "NNS", - "PRP", - "VBG", - "VBZ", - "startTuple0", - "endTuple0", - "endTuple1", - "endTuple2", - "verbBeforeNoun", - "qMark", - "qVerbCombo", - "qTripleScore", - "sTripleScore", - "class"] - myFeatures = features.features_dict('1',user_input, 'X') - values=[] + + keys = [ + "id", + "wordCount", + "stemmedCount", + "stemmedEndNN", + "CD", + "NN", + "NNP", + "NNPS", + "NNS", + "PRP", + "VBG", + "VBZ", + "startTuple0", + "endTuple0", + "endTuple1", + "endTuple2", + "verbBeforeNoun", + "qMark", + "qVerbCombo", + "qTripleScore", + "sTripleScore", + "class", + ] + myFeatures = features.features_dict("1", user_input, "X") + values = [] for key in keys: values.append(myFeatures[key]) s = pd.Series(values) width = len(s) - myFeatures = s[1:width-1] #All but the last item (this is the class for supervised learning mode) + myFeatures = s[1 : width - 1] # noqa: E203 + # All but the last item (this is the class for supervised learning mode) predict = clf.predict([myFeatures]) return predict[0].strip()
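A minimal usage sketch for the new classifier helpers in utilities.py, assuming analysis/featuresDump.csv is present, the pinned xgboost and scikit-learn versions from requirements.txt are installed, and the NLTK data fetched by setup_nltk() is available:

    from utilities import classify_model_adv, classify_sentence, setup_nltk

    setup_nltk()                     # downloads punkt, tagger and stopwords data once
    clf = classify_model_adv("ada")  # "ada" (AdaBoost) is noted above as the best performer; "rf" is the default

    # Classify a raw user message; test_utilities.py expects "S" (statement) for this sentence.
    label = classify_sentence(clf, "The quick brown fox jumps over the lazy dog.")
    print(label)

Note that classify_model_adv has no fallback branch, so passing a key other than the ones handled above ("rf", "svm", "xgb", "nb", "ada") leaves clf unbound and raises an error at fit time.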