From c865634d75619972ce00a1b1a3283d024b14a49e Mon Sep 17 00:00:00 2001 From: Gabriele Venturi Date: Tue, 6 Jun 2023 00:35:31 +0200 Subject: [PATCH] feat: use conversational=False by default As data analysts need a non-conversational answer most of the time, we make it so it returns the actual dataframe instead --- README.md | 11 +++++-- docs/getting-started.md | 46 +++++++++++++++++++++++++--- examples/from_csv.py | 2 +- examples/from_dataframe.py | 2 +- examples/with_multiple_dataframes.py | 13 ++++---- examples/with_privacy_enforced.py | 2 +- pandasai/__init__.py | 16 +++++----- tests/test_pandasai.py | 4 +-- 8 files changed, 70 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index 8a67e0bd6..3b1fc9af4 100644 --- a/README.md +++ b/README.md @@ -34,7 +34,7 @@ pip install pandasai > Disclaimer: GDP data was collected from [this source](https://ourworldindata.org/grapher/gross-domestic-product?tab=table), published by World Development Indicators - World Bank (2022.05.26) and collected at National accounts data - World Bank / OECD. It relates to the year of 2020. Happiness indexes were extracted from [the World Happiness Report](https://ftnnews.com/images/stories/documents/2020/WHR20.pdf). Another useful [link](https://data.world/makeovermonday/2020w19-world-happiness-report-2020). -PandasAI is designed to be used in conjunction with [pandas](https://github.com/pandas-dev/pandas). It makes Pandas conversational, allowing you to ask questions about your data and get answers back, in the form of pandas DataFrames. +PandasAI is designed to be used in conjunction with [pandas](https://github.com/pandas-dev/pandas). It makes Pandas conversational, allowing you to ask questions about your data and get answers back, in the form of pandas DataFrames. 
### Queries @@ -55,7 +55,7 @@ df = pd.DataFrame({ from pandasai.llm.openai import OpenAI llm = OpenAI(api_token="YOUR_API_TOKEN") -pandas_ai = PandasAI(llm, conversational=False) +pandas_ai = PandasAI(llm) pandas_ai(df, prompt='Which are the 5 happiest countries?') ``` @@ -136,19 +136,24 @@ You can find more examples in the [examples](examples) directory. ## Command-Line Tool Pai is the command line tool designed to provide a convenient way to interact with PandasAI through a command line interface (CLI). In order to access the CLI tool, make sure to create a virtualenv for testing purpose and to install project dependencies in your local virtual environment using `pip` by running the following command: + ``` pip install -e . ``` + Alternatively, you can use `poetry` to create and activate the virtual environment by running the following command: + ``` poetry shell ``` + Inside the activated virtual environment, install the project dependencies by running the following command: + ``` poetry install ``` -By following these steps, you will now have the necessary environment to access the CLI tool. +By following these steps, you will now have the necessary environment to access the CLI tool. 
``` pai [OPTIONS] diff --git a/docs/getting-started.md b/docs/getting-started.md index ce1188ba9..02f670418 100644 --- a/docs/getting-started.md +++ b/docs/getting-started.md @@ -34,7 +34,7 @@ df = pd.DataFrame({ from pandasai.llm.openai import OpenAI llm = OpenAI(api_token="YOUR_API_TOKEN") -pandas_ai = PandasAI(llm, conversational=False) +pandas_ai = PandasAI(llm) pandas_ai.run(df, prompt='Which are the 5 happiest countries?') ``` @@ -73,7 +73,7 @@ from pandasai.llm.openai import OpenAI df = pd.read_csv("data/Loan payments data.csv") llm = OpenAI() -pandas_ai = PandasAI(llm, verbose=True) +pandas_ai = PandasAI(llm, verbose=True, conversational=True) response = pandas_ai.run(df, "How many loans are from men and have been paid off?") print(response) # Output: 247 loans have been paid off by men. @@ -93,7 +93,7 @@ from pandasai.llm.openai import OpenAI df = pd.DataFrame(dataframe) llm = OpenAI() -pandas_ai = PandasAI(llm, verbose=True, conversational=False) +pandas_ai = PandasAI(llm, verbose=True) response = pandas_ai.run(df, "Calculate the sum of the gdp of north american countries") print(response) # Output: 20901884461056 @@ -148,7 +148,7 @@ employees_df = pd.DataFrame(employees_data) salaries_df = pd.DataFrame(salaries_data) llm = OpenAI() -pandas_ai = PandasAI(llm, verbose=True, conversational=False) +pandas_ai = PandasAI(llm, verbose=True) response = pandas_ai.run( [employees_df, salaries_df], "Who gets paid the most?", @@ -156,3 +156,41 @@ response = pandas_ai.run( print(response) # Output: Olivia gets paid the most. ``` + +### Chain of commands + +You can chain commands by passing the output of one command to the next one. In the example, we first filter the original +dataframe by gender and then by loans that have been paid off. 
+ +```python +import pandas as pd + +from pandasai import PandasAI +from pandasai.llm.openai import OpenAI + +df = pd.read_csv("examples/data/Loan payments data.csv") + +llm = OpenAI() +pandas_ai = PandasAI(llm, verbose=True) + +# We filter by males only +from_males_df = pandas_ai(df, "Filter the dataframe by males") +paid_from_males_df = pandas_ai(from_males_df, "Filter the dataframe by loans that have been paid off") +print(paid_from_males_df) +# Output: +# [247 rows x 11 columns] +# Loan_ID loan_status Principal terms effective_date due_date paid_off_time past_due_days age education Gender +# 0 xqd20166231 PAIDOFF 1000 30 9/8/2016 10/7/2016 9/14/2016 19:31 NaN 45 High School or Below male +# 3 xqd20160004 PAIDOFF 1000 15 9/8/2016 9/22/2016 9/22/2016 20:00 NaN 27 college male +# 5 xqd20160706 PAIDOFF 300 7 9/9/2016 9/15/2016 9/9/2016 13:45 NaN 35 Master or Above male +# 6 xqd20160007 PAIDOFF 1000 30 9/9/2016 10/8/2016 10/7/2016 23:07 NaN 29 college male +# 7 xqd20160008 PAIDOFF 1000 30 9/9/2016 10/8/2016 10/5/2016 20:33 NaN 36 college male +# .. ... ... ... ... ... ... ... ... ... ... ... 
+# 294 xqd20160295 PAIDOFF 1000 30 9/14/2016 10/13/2016 10/13/2016 13:00 NaN 36 Bechalor male +# 296 xqd20160297 PAIDOFF 800 15 9/14/2016 9/28/2016 9/21/2016 4:42 NaN 27 college male +# 297 xqd20160298 PAIDOFF 1000 30 9/14/2016 10/13/2016 10/13/2016 9:00 NaN 29 High School or Below male +# 298 xqd20160299 PAIDOFF 1000 30 9/14/2016 10/13/2016 10/13/2016 9:00 NaN 40 High School or Below male +# 299 xqd20160300 PAIDOFF 1000 30 9/14/2016 10/13/2016 10/13/2016 11:00 NaN 28 college male + +# [247 rows x 11 columns] +``` diff --git a/examples/from_csv.py b/examples/from_csv.py index fcef70726..3307e3f57 100644 --- a/examples/from_csv.py +++ b/examples/from_csv.py @@ -8,7 +8,7 @@ df = pd.read_csv("examples/data/Loan payments data.csv") llm = OpenAI() -pandas_ai = PandasAI(llm, verbose=True) +pandas_ai = PandasAI(llm, verbose=True, conversational=True) response = pandas_ai(df, "How many loans are from men and have been paid off?") print(response) # Output: 247 loans have been paid off by men. diff --git a/examples/from_dataframe.py b/examples/from_dataframe.py index c70aab669..1f8567b3d 100644 --- a/examples/from_dataframe.py +++ b/examples/from_dataframe.py @@ -10,7 +10,7 @@ df = pd.DataFrame(dataframe) llm = OpenAI() -pandas_ai = PandasAI(llm, verbose=True, conversational=False) +pandas_ai = PandasAI(llm, verbose=True) response = pandas_ai(df, "Calculate the sum of the gdp of north american countries") print(response) # Output: 20901884461056 diff --git a/examples/with_multiple_dataframes.py b/examples/with_multiple_dataframes.py index 6b4653682..5bad66842 100644 --- a/examples/with_multiple_dataframes.py +++ b/examples/with_multiple_dataframes.py @@ -1,18 +1,19 @@ """Example of using PandasAI on multiple Pandas DataFrame""" import pandas as pd + from pandasai import PandasAI from pandasai.llm.openai import OpenAI employees_data = { - 'EmployeeID': [1, 2, 3, 4, 5], - 'Name': ['John', 'Emma', 'Liam', 'Olivia', 'William'], - 'Department': ['HR', 'Sales', 'IT', 'Marketing', 
'Finance'] + "EmployeeID": [1, 2, 3, 4, 5], + "Name": ["John", "Emma", "Liam", "Olivia", "William"], + "Department": ["HR", "Sales", "IT", "Marketing", "Finance"], } salaries_data = { - 'EmployeeID': [1, 2, 3, 4, 5], - 'Salary': [5000, 6000, 4500, 7000, 5500] + "EmployeeID": [1, 2, 3, 4, 5], + "Salary": [5000, 6000, 4500, 7000, 5500], } employees_df = pd.DataFrame(employees_data) @@ -20,7 +21,7 @@ llm = OpenAI() -pandas_ai = PandasAI(llm, verbose=True) +pandas_ai = PandasAI(llm, verbose=True, conversational=True) response = pandas_ai([employees_df, salaries_df], "Who gets paid the most?") print(response) # Output: Olivia diff --git a/examples/with_privacy_enforced.py b/examples/with_privacy_enforced.py index 67b64a77f..eca6845d1 100644 --- a/examples/with_privacy_enforced.py +++ b/examples/with_privacy_enforced.py @@ -10,7 +10,7 @@ df = pd.DataFrame(dataframe) llm = OpenAI() -pandas_ai = PandasAI(llm, verbose=True, conversational=False, enforce_privacy=True) +pandas_ai = PandasAI(llm, verbose=True, enforce_privacy=True) response = pandas_ai( df, "Calculate the sum of the gdp of north american countries", diff --git a/pandasai/__init__.py b/pandasai/__init__.py index 8228cc03c..c4222b2c7 100644 --- a/pandasai/__init__.py +++ b/pandasai/__init__.py @@ -27,7 +27,7 @@ from pandasai.llm.openai import OpenAI llm = OpenAI(api_token="YOUR_API_TOKEN") - pandas_ai = PandasAI(llm, conversational=False) + pandas_ai = PandasAI(llm) pandas_ai(df, prompt='Which are the 5 happiest countries?') ``` @@ -37,7 +37,7 @@ import re import sys from contextlib import redirect_stdout -from typing import Optional +from typing import Optional, Union import astor import matplotlib.pyplot as plt @@ -81,7 +81,7 @@ class PandasAI: _verbose (bool, optional): To show the intermediate outputs e.g. python code generated and execution step on the prompt. Default to False _is_conversational_answer (bool, optional): Whether to return answer in conversational - form. Default to True + form. 
Default to False _enforce_privacy (bool, optional): Do not display the data on prompt in case of Sensitive data. Default to False _max_retries (int, optional): max no. of tries to generate code on failure. Default to 3 @@ -99,7 +99,7 @@ class PandasAI: _llm: LLM _verbose: bool = False - _is_conversational_answer: bool = True + _is_conversational_answer: bool = False _enforce_privacy: bool = False _max_retries: int = 3 _is_notebook: bool = False @@ -118,7 +118,7 @@ class PandasAI: def __init__( self, llm=None, - conversational=True, + conversational=False, verbose=False, enforce_privacy=False, save_charts=False, @@ -129,7 +129,7 @@ def __init__( Args: llm (object): LLMs option to be used for API access. Default is None - conversational (bool): Whether to return answer in conversational form. Default to True + conversational (bool): Whether to return answer in conversational form. Default to False verbose (bool): To show the intermediate outputs e.g. python code generated and execution step on the prompt. Default to False enforce_privacy (bool): Execute the codes with Privacy Mode ON. Default to False @@ -176,7 +176,7 @@ def run( show_code: bool = False, anonymize_df: bool = True, use_error_correction_framework: bool = True, - ) -> str: + ) -> Union[str, pd.DataFrame]: """ Run the PandasAI to make Dataframes Conversational. @@ -287,7 +287,7 @@ def __call__( show_code: bool = False, anonymize_df: bool = True, use_error_correction_framework: bool = True, - ) -> str: + ) -> Union[str, pd.DataFrame]: """ __call__ method of PandasAI class. It calls the `run` method. 
diff --git a/tests/test_pandasai.py b/tests/test_pandasai.py index 9f76eeb30..93a0690a0 100644 --- a/tests/test_pandasai.py +++ b/tests/test_pandasai.py @@ -29,7 +29,7 @@ def pandasai(self, llm): def test_init(self, pandasai): assert pandasai._llm is not None - assert pandasai._is_conversational_answer is True + assert pandasai._is_conversational_answer is False assert pandasai._verbose is False def test_init_without_llm(self): @@ -44,7 +44,7 @@ def test_conversational_answer(self, pandasai, llm): def test_run(self, pandasai, llm): df = pd.DataFrame() llm._output = "1" - assert pandasai.run(df, "What number comes before 2?") == "1" + assert pandasai.run(df, "What number comes before 2?") == 1 def test_run_with_conversational_answer(self, pandasai, llm): df = pd.DataFrame()