diff --git a/coml/core.py b/coml/core.py index 07e3631..8e677e6 100644 --- a/coml/core.py +++ b/coml/core.py @@ -1,19 +1,23 @@ from __future__ import annotations import copy +import random import re import warnings -from typing import Any, cast +from typing import Any, Callable, Literal, TypeVar, cast import colorama from langchain.chat_models.base import BaseChatModel +from langchain.embeddings.base import Embeddings from langchain.schema import AIMessage, BaseMessage, HumanMessage, SystemMessage +from scipy.spatial.distance import cosine as cosine_distance from .prompt_utils import ( CHECK_INSTRUCTION, EXPLAIN_INSTRUCTION, FIX_INSTRUCTION, GENERATE_INSTRUCTION, + GENERATE_INSTRUCTION_COT, SANITY_CHECK_INSTRUCTION, SUGGEST_INSTRUCTION, FixContext, @@ -33,6 +37,8 @@ _debug_mode: bool = False +_Type = TypeVar("_Type") + def debug_messages(*messages: BaseMessage) -> None: if not _debug_mode: @@ -91,8 +97,58 @@ def parse_code(response: str) -> str: class CoMLAgent: + """ + CoML agent that accepts data science requests and generates code. + + Attributes: + llm: The language model that generates responses. + prompt_version: The version of the prompt to use (one of ``v1``, ``v2``, + ``kaggle``, or ``leetcode``). + prompt_validation: A function that takes a list of messages and returns + whether the prompt is valid, which is useful for limiting the number of + tokens in the prompt. + num_examples: The number of examples to show in the prompt. It can be a + float between 0 and 1, interpreted as the fraction of examples to show. + It can also be an integer, interpreted as the number of examples to show. + message_style: Can be ``chatgpt``, in which system messages are used, or + ``gemini``, in which only human and AI messages are used. + chain_of_thought: Whether to use chain of thought (CoT) in the prompt. + context_order: The order of the context in the prompt. Defaults to ``vcr``. + ``v`` stands for variable descriptions, ``c`` for codes, ``r`` for request. + ensemble: The number of LLM calls to make; the response with the highest + mean log-probability is kept. + ensemble_shuffle: Whether to shuffle the few-shot examples in the prompt + before each ensemble call. + example_ranking: An embedding model used to rank the examples. If provided, + the examples are ranked by similarity to the request before selection. + intact_instruction: Whether to instruct the LLM to keep the variables + unmodified. For experimentation purposes only.
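+ + Example: + A minimal usage sketch; the ``ChatOpenAI`` model below is an assumption + for illustration, and any configured ``BaseChatModel`` works: + + >>> from langchain.chat_models import ChatOpenAI + >>> agent = CoMLAgent(ChatOpenAI(model="gpt-4"), num_examples=0.5, ensemble=3) + >>> context = agent.generate_code( + ... "Compute the mean of column 'a'.", + ... {"df": "pandas.DataFrame(shape=(3, 1), columns=['a'])"}, + ... [], + ... ) + >>> print(context["answer"])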
+ """ + + def __init__( + self, + llm: BaseChatModel, + prompt_version: Literal["v1", "v2", "kaggle", "leetcode"] = "v2", + prompt_validation: Callable[[list[BaseMessage]], bool] | None = None, + num_examples: float | int = 1.0, + message_style: Literal["chatgpt", "gemini"] = "chatgpt", + chain_of_thought: bool = False, + context_order: Literal[ + "vcr", "cvr", "rvc", "rcv", "vr", "rv", "cr", "rc", "r" + ] = "vcr", + ensemble: int | None = None, + ensemble_shuffle: bool = True, + example_ranking: Embeddings | None = None, + intact_instruction: bool = True, + ): self.llm = llm + self.prompt_version = prompt_version + self.prompt_validation = prompt_validation + self.num_examples = num_examples + self.message_style = message_style + self.chain_of_thought = chain_of_thought + self.context_order = context_order + self.ensemble = ensemble + self.ensemble_shuffle = ensemble_shuffle + self.example_ranking = example_ranking + self.intact_instruction = intact_instruction def _fix_context_from_any_context( self, context: GenerateContext | FixContext, **kwargs: Any @@ -110,27 +166,162 @@ def _fix_context_from_any_context( context["interactions"].append(InteractionIncomplete(**kwargs)) return context + def _pre_generation(self, messages: list[BaseMessage]) -> list[BaseMessage]: + if self.message_style == "gemini": + # Merge the first two messages. + if len(messages) > 1 and isinstance(messages[1], HumanMessage): + messages1_content = cast(str, messages[1].content) + if not messages1_content.startswith("### Task Start ###"): + messages1_content = "### Task Start ###\n\n" + messages1_content + messages[1] = HumanMessage( + content=cast(str, messages[0].content) + "\n\n" + messages1_content + ) + messages = messages[1:] + else: + messages[0] = HumanMessage(content=cast(str, messages[0].content)) + + if self.prompt_validation is not None and not self.prompt_validation(messages): + raise ValueError("Prompt validation failed.") + + return messages + + def _ensemble_generate(self, messages: list[BaseMessage]) -> BaseMessage: + """Ensemble the result from multiple LLM calls.""" + + if not self.ensemble: + return self._generate(messages) + + results: list[tuple[float, BaseMessage]] = [] + for _ in range(self.ensemble): + if self.ensemble_shuffle and len(messages) > 2: + # Shuffle the examples + first_message = messages[0] + interactions = messages[1:] + + start_indices = [] + # Can be [Task A Human, AI, Human AI, Task B Human, AI] + for index, message in enumerate(interactions): + if isinstance(message, HumanMessage) and cast( + str, message.content + ).startswith("### Task Start ###"): + start_indices.append(index) + + # Can be [Human, AI, Human AI] + if not start_indices: + # Loosen the constraint and find all human messages + start_indices = [ + index + for index, message in enumerate(interactions) + if isinstance(message, HumanMessage) + ] + + groups = [ + interactions[index:index_next] + for index, index_next in zip(start_indices, start_indices[1:]) + ] + + # Shuffle the groups and combine them + random.shuffle(groups) + messages = ( + [first_message] + + [message for group in groups for message in group] + + interactions[start_indices[-1] :] + ) + + messages = self._pre_generation(messages) + result = self.llm.generate([messages], logprobs=True) + message = result.generations[0][0].message # type: ignore + generation_info = result.generations[0][0].generation_info + if generation_info is None or "logprobs" not in generation_info: + raise ValueError("Logprobs not found in generation_info.") + logprobs = [ 
+ content["logprob"] for content in generation_info["logprobs"]["content"] + ] + if not logprobs: + mean_logprobs = float("-inf") + else: + mean_logprobs = sum(logprobs) / len(logprobs) + + results.append((mean_logprobs, message)) + + results.sort(key=lambda x: x[0], reverse=True) + + return results[0][1] + + def _generate(self, messages: list[BaseMessage]) -> BaseMessage: + """Generate a response from the LLM.""" + messages = self._pre_generation(messages) + return self.llm(messages) + + def _select_examples(self, query: str, fewshots: list[_Type]) -> list[_Type]: + """Select examples from the fewshots.""" + if self.num_examples == 0: + return [] + + if self.example_ranking is not None: + documents = [cast(Any, shot).get("request", "N/A") for shot in fewshots] + embeddings = self.example_ranking.embed_documents(documents) + # Use embed_documents instead of embed_query because the latter has cache + query_embedding = self.example_ranking.embed_documents([query])[0] + similarities = [ + (cosine_distance(query_embedding, embedding), shot) + for embedding, shot in zip(embeddings, fewshots) + ] + similarities.sort(key=lambda x: x[0]) + fewshots = [shot for _, shot in similarities] + + if isinstance(self.num_examples, int): + return fewshots[: self.num_examples] + else: + num_shots = max(int(len(fewshots) * self.num_examples), 1) + return fewshots[:num_shots] + def generate_code( - self, request: str, variable_descriptions: dict[str, str], codes: list[str] + self, + request: str, + variable_descriptions: dict[str, str], + codes: list[str], ) -> GenerateContext: - fewshots = cached_generate_fewshots() - messages: list[BaseMessage] = [ - SystemMessage(content=GENERATE_INSTRUCTION), - ] - for shot in fewshots: - question, answer = render_generate_context(shot) + fewshots = cached_generate_fewshots(self.prompt_version) + messages: list[BaseMessage] = [] + + if self.chain_of_thought: + generate_instruction = GENERATE_INSTRUCTION_COT + else: + generate_instruction = GENERATE_INSTRUCTION + if not self.intact_instruction: + generate_instruction = re.sub( + r"- Do not overwrite or modify.*\n", "", generate_instruction + ) + for shot in fewshots: + if "answer_wo_intact" in shot: + shot["answer"] = shot.pop("answer_wo_intact") + if "rationale_wo_intact" in shot: + shot["rationale"] = shot.pop("rationale_wo_intact") + messages.append(SystemMessage(content=generate_instruction)) + + for shot in self._select_examples(request, fewshots): + question, answer = render_generate_context( + shot, cot=self.chain_of_thought, context_order=self.context_order + ) messages.append(HumanMessage(content=question)) if answer is not None: messages.append(AIMessage(content=answer)) context = GenerateContextIncomplete( variables=variable_descriptions, codes=codes, request=request ) - question, _ = render_generate_context(context) + question, _ = render_generate_context( + context, cot=self.chain_of_thought, context_order=self.context_order + ) messages.append(HumanMessage(content=question)) + debug_messages(*messages) - response = self.llm(messages) + response = self._ensemble_generate(messages) debug_messages(response) + + if not isinstance(response.content, str): + raise ValueError(f"Response is not a string: {response.content}") code = parse_code(response.content) return {**context, "answer": code} @@ -142,6 +333,7 @@ def fix_code( prev_context: GenerateContext | FixContext, ) -> FixContext | None: fewshots = cached_fix_fewshots() + fewshots = self._select_examples(prev_context["request"] or "N/A", fewshots) messages: 
list[BaseMessage] = [ SystemMessage(content=FIX_INSTRUCTION), ] @@ -149,15 +341,16 @@ def fix_code( prev_context, error=error, output=output, hint=hint ) for shot in fewshots + [context]: - interactions = render_fix_context(shot) + interactions = render_fix_context(shot, context_order=self.context_order) for index, interaction in enumerate(interactions): if index % 2 == 0: messages.append(HumanMessage(content=interaction)) else: messages.append(AIMessage(content=interaction)) - debug_messages(*messages[-2:]) - response = self.llm(messages) + debug_messages(*messages) + + response = self._ensemble_generate(messages) debug_messages(response) explanation, observation, code = parse_fix(response.content) if "THE CODE IS CORRECT." in observation: @@ -186,7 +379,7 @@ def suggest(self, codes: list[str]) -> list[str]: HumanMessage(content=human_message), ] debug_messages(*messages) - response = self.llm(messages) + response = self._generate(messages) suggestions = re.split(r"\d+\.\s+", response.content) suggestions = [s.strip().replace("\n", " ") for s in suggestions if s.strip()] debug_messages(response) @@ -199,7 +392,7 @@ def explain(self, code: str) -> str: HumanMessage(content=code), ] debug_messages(*messages) - response = self.llm(messages) + response = self._generate(messages) debug_messages(response) return response.content @@ -212,7 +405,7 @@ def static_check( HumanMessage(content=render_check_context(code, context)), ] debug_messages(*messages) - response = self.llm(messages) + response = self._generate(messages) debug_messages(response) reason, last_line = response.content.rstrip().rsplit("\n", 1) if "INCORRECT" in last_line.upper(): @@ -236,7 +429,7 @@ def output_sanity_check( ), ] debug_messages(*messages) - response = self.llm(messages) + response = self._generate(messages) debug_messages(response) reason, last_line = response.content.rstrip().rsplit("\n", 1) if "INCORRECT" in last_line.upper(): diff --git a/coml/magics.py b/coml/magics.py index af12d1a..0f163c5 100644 --- a/coml/magics.py +++ b/coml/magics.py @@ -285,9 +285,11 @@ def display_statuses(statuses): ) html += message_template.format( display_names[name], - loading - if name not in statuses - else VERIFY_STATUS_ICON[statuses[name]["result"]], + ( + loading + if name not in statuses + else VERIFY_STATUS_ICON[statuses[name]["result"]] + ), detail_message, ) diff --git a/coml/prompt_utils.py b/coml/prompt_utils.py index 5965d21..b27e999 100644 --- a/coml/prompt_utils.py +++ b/coml/prompt_utils.py @@ -3,14 +3,19 @@ import json import re import types +import warnings from pathlib import Path -from typing import Any, TypedDict, cast +from typing import Any, Literal, TypedDict, cast + +import pandas as pd +from typing_extensions import NotRequired class GenerateContextIncomplete(TypedDict): variables: dict[str, str] codes: list[str] request: str + rationale: NotRequired[str] class GenerateContext(GenerateContextIncomplete): @@ -37,36 +42,132 @@ class FixContext(TypedDict): interactions: list[InteractionIncomplete | Interaction] +def lida_dataframe_describe(df: pd.DataFrame, n_samples: int) -> list[dict]: + """Get the properties of each column in a pandas DataFrame, in the same way as LIDA does.""" + + def check_type(dtype: str, value): + """Cast value to the right type to ensure it is JSON serializable.""" + if "float" in str(dtype): + return float(value) + elif "int" in str(dtype): + return int(value) + else: + return value + + properties_list = [] + for column in df.columns: + try: + dtype = df[column].dtype + except AttributeError: + # This can sometimes happen when the column is itself a DataFrame + properties_list.append( + {"column": column, "properties": {"dtype": "unknown"}} + ) + continue + + properties = {} + if dtype in [int, float, complex]: + properties["dtype"] = "number" + properties["std"] = check_type(dtype, df[column].std()) + properties["min"] = check_type(dtype, df[column].min()) + properties["max"] = check_type(dtype, df[column].max()) + + elif dtype == bool: + properties["dtype"] = "boolean" + elif dtype == object: + # Check if the string column can be cast to a valid datetime + try: + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + pd.to_datetime(df[column], errors="raise") + properties["dtype"] = "date" + except ValueError: + # Check if the string column has a limited number of values + if df[column].nunique() / len(df[column]) < 0.5: + properties["dtype"] = "category" + else: + properties["dtype"] = "string" + elif pd.api.types.is_categorical_dtype(df[column]): + properties["dtype"] = "category" + elif pd.api.types.is_datetime64_any_dtype(df[column]): + properties["dtype"] = "date" + else: + properties["dtype"] = str(dtype) + + # add min max if dtype is date + if properties["dtype"] == "date": + try: + properties["min"] = df[column].min() + properties["max"] = df[column].max() + except TypeError: + cast_date_col = pd.to_datetime(df[column], errors="coerce") + properties["min"] = cast_date_col.min() + properties["max"] = cast_date_col.max() + # Add additional properties to the output dictionary + nunique = df[column].nunique() + if "samples" not in properties: + non_null_values = df[column][df[column].notnull()].unique() + # Use a local count so the n_samples parameter is not rebound, which would under-sample later columns. + num_samples = min(n_samples, len(non_null_values)) + samples = ( + pd.Series(non_null_values).sample(num_samples, random_state=42).tolist() + ) + properties["samples"] = samples + properties["num_unique_values"] = nunique + # properties["semantic_type"] = "" + # properties["description"] = "" + properties_list.append({"column": column, "properties": properties}) + + return properties_list + + PANDAS_DESCRIPTION_CONFIG: Any = dict(max_cols=10, max_colwidth=20, max_rows=10) MAXIMUM_LIST_ITEMS = 30 -def describe_variable(value: Any) -> str: +def describe_variable( + value: Any, + pandas_description_config: Any | None = None, + maximum_list_items: int | None = None, + dataframe_format: Literal["coml", "lida"] = "coml", +) -> str: import numpy import pandas + if pandas_description_config is None: + pandas_description_config = PANDAS_DESCRIPTION_CONFIG + if maximum_list_items is None: + maximum_list_items = MAXIMUM_LIST_ITEMS + if isinstance(value, numpy.ndarray): return "numpy.ndarray(shape={}, dtype={})".format(value.shape, value.dtype) elif isinstance(value, pandas.DataFrame): - return "pandas.DataFrame(shape={}, columns={})\n{}".format( - value.shape, - describe_variable(value.columns.tolist()), - add_indent(value.to_string(**PANDAS_DESCRIPTION_CONFIG).rstrip()), - ) + if dataframe_format == "coml": + return "pandas.DataFrame(shape={}, columns={})\n{}".format( + value.shape, + describe_variable(value.columns.tolist()), + add_indent(value.to_string(**pandas_description_config).rstrip()), + ) + elif dataframe_format == "lida": + return "pandas.DataFrame(shape={}, columns={})".format( + value.shape, + lida_dataframe_describe( + value, n_samples=pandas_description_config.get("max_rows", 10) + ), + ) elif isinstance(value, pandas.Series): return "pandas.Series(shape={})".format(value.shape) elif isinstance(value, list): - if len(value) > MAXIMUM_LIST_ITEMS:
+ if len(value) > maximum_list_items: return "[{}, ...]".format( - ", ".join(describe_variable(v) for v in value[:MAXIMUM_LIST_ITEMS]) + ", ".join(describe_variable(v) for v in value[:maximum_list_items]) ) return "[{}]".format(", ".join(describe_variable(v) for v in value)) elif isinstance(value, dict): - if len(value) > MAXIMUM_LIST_ITEMS: + if len(value) > maximum_list_items: return "{{{}, ...}}".format( ", ".join( f"{k}: {describe_variable(v)}" - for k, v in list(value.items())[:MAXIMUM_LIST_ITEMS] + for k, v in list(value.items())[:maximum_list_items] ) ) return "{{{}}}".format( @@ -161,10 +262,12 @@ def render_ipython_cells(codes: list[str]) -> str: def render_generate_context( context: GenerateContext | GenerateContextIncomplete, + cot: bool = False, + context_order: str = "vcr", ) -> tuple[str, str | None]: if context["variables"]: variables = ( - "Variables:\n\n" + "### Variables\n\n" + "".join( f"{name}: {desc}\n" for name, desc in context["variables"].items() ) @@ -173,24 +276,36 @@ else: variables = "No variables available currently.\n\n" if context["codes"]: - code = "Executed code:\n\n" + render_ipython_cells(context["codes"]) + "\n" + code = "### Executed code\n\n" + render_ipython_cells(context["codes"]) + "\n" else: code = "No code has been executed yet.\n\n" if context["request"]: - request = "Request:\n" + context["request"].rstrip() + request = "### Request\n\n" + context["request"].rstrip() else: request = "User request is unclear." + if cot: + request += "\nLet's think step by step." + + contexts = { + "v": variables, + "c": code, + "r": request, + } + contexts = "\n\n".join([contexts[c].rstrip() for c in list(context_order)]) + if "answer" in context: answer = render_code(context["answer"]) + if cot and "rationale" in context: + answer = context["rationale"].rstrip() + "\n\n" + answer else: answer = None - return code + variables + request, answer + return contexts, answer -def render_fix_context(context: FixContext) -> list[str]: +def render_fix_context(context: FixContext, context_order: str = "vcr") -> list[str]: all_interactions: list[str] = [] task_begin = "### Task Start ###\n\n" if context["request"] is None: @@ -200,7 +315,7 @@ + render_code(context["first_attempt"]) ) else: - first_request, _ = render_generate_context(context) # type: ignore + first_request, _ = render_generate_context(context, context_order=context_order) # type: ignore first_request = task_begin + first_request all_interactions += [first_request, render_code(context["first_attempt"])] interaction_prefix = "" @@ -244,7 +359,7 @@ else: hint = "- The user did not provide any hint.\n\n" - post_instruction = 'With the information above, please first explain the code line-by-line, and then observe what might be wrong. Finally, you should provide the fixed code. If you think the code is correct, you can simply write "THE CODE IS CORRECT." in the observation section.' + post_instruction = 'With the information above, please first explain the code line-by-line, and then observe what might be wrong. Finally, you should provide the fixed code. If you think the code is correct, please write "THE CODE IS CORRECT." in the observation section.'
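+ # Render the trailing incomplete interaction as one human turn: the fix instruction, the error/output/hint context, and the post-instruction above.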
all_interactions.append(instruction + error + output + hint + post_instruction) @@ -263,12 +378,12 @@ def render_fix_context(context: FixContext) -> list[str]: + interaction["observation"].rstrip() + "\n\n" ) - all_interactions.append( - explanation - + observation - + "The fixed code:\n\n" - + render_code(interaction["code"]) + code_section = ( + "The fixed code:\n\n" + render_code(interaction["code"]) + if interaction["code"] + else "" ) + all_interactions.append(explanation + observation + code_section) return all_interactions @@ -300,14 +415,19 @@ def render_sanity_check_context( Instructions: -- The generated code should be wrapped by ``` before and after it. +- The generated code should be *one single code block* wrapped by ``` before and after it. - Import necessary libraries at the beginning. You can leverage the Python libraries such as `pandas`, `sklearn`, `matplotlib`, `seaborn`, and etc. to achieve user's request. - The output of a cell is the last statement of the code. Do not use `print` to output the result or `return` to return the result. - Do not overwrite or modify the variables provided by the user, unless the user has explicitly asked for it. For example, if the user has provided a DataFrame `df`, you should not reassign `df`, unless the user asks to modify `df` in-place. """ +GENERATE_INSTRUCTION_COT = f"""{GENERATE_INSTRUCTION.rstrip()} +- Think before you write. You should first understand the user's request, think about how to achieve it, and then write the code. +""" + FIX_INSTRUCTION = f"""{GENERATE_INSTRUCTION.rstrip()} - If the user thinks the generated code is problematic, you should help fix it. The user will provide you with the exception message (if any), the output of the code (if any), and a hint (if any). You should provide a line-by-line explanation of the code, and point out what is wrong with the code. You should also provide the fixed code. +- If you think the provided problematic code is actually correct, you should first explain the code, and write "THE CODE IS CORRECT." (in upper case) in the observation section. The fixed code can be omitted. """ SUGGEST_INSTRUCTION = """You're a data scientist. Given the code that has already been written, suggest three things that can be done next. 
Write the response in the following format: @@ -335,13 +455,15 @@ def render_sanity_check_context( """ -def cached_generate_fewshots() -> list[GenerateContext]: - with open(Path(__file__).parent / "prompts/generate_fewshots.json") as f: +def cached_generate_fewshots(prompt_version: str) -> list[GenerateContext]: + with open( + Path(__file__).parent / f"prompts/generate_fewshots_{prompt_version}.json" + ) as f: return json.load(f) def cached_fix_fewshots() -> list[FixContext]: - with open(Path(__file__).parent / "prompts/fix_fewshots.json") as f: + with open(Path(__file__).parent / "prompts/fix_fewshots_v2.json") as f: return json.load(f) @@ -411,9 +533,9 @@ def update_fix_fewshots(): ] shot1 = extract_shot("../pandas_exercises/outputs/03-grouping-occupation.py", 3) - shot1[ - "first_attempt" - ] = "(users[users['gender'] == 'M'].groupby('occupation').gender.count() / users.groupby('occupation').gender.count()).sort_values(ascending=False)" + shot1["first_attempt"] = ( + "(users[users['gender'] == 'M'].groupby('occupation').gender.count() / users.groupby('occupation').gender.count()).sort_values(ascending=False)" + ) shot1["interactions"] = [ { "error": None, diff --git a/coml/prompts/fix_fewshots_v2.json b/coml/prompts/fix_fewshots_v2.json new file mode 100644 index 0000000..413a1e6 --- /dev/null +++ b/coml/prompts/fix_fewshots_v2.json @@ -0,0 +1,131 @@ +[ + { + "variables": { + "flowers": "pandas.DataFrame(shape=(5, 2), columns=[\"name\", \"count\"])\n name count\n 0 rose 10\n 1 tulip 20\n 2 daisy 30\n 3 lily 40\n 4 orchid 50" + }, + "codes": [ + "flowers['count'].sum()", + "flowers = flowers.sort_values('count', ascending=True)" + ], + "request": "Plot a bar chart of the number of flowers for each type.", + "first_attempt": "sns.barplot(x='name', y='count', data=flowers)", + "interactions": [ + { + "error": "Traceback (most recent call last):\n File \"/home/coml/coml.py\", line 36, in seaborn_example\n sns.barplot(x='name', y='count', data=flowers)\nNameError: name 'sns' is not defined", + "output": null, + "hint": null, + "explanation": "`sns.barplot(x='name', y='count', data=flowers)`: This line of code is using the seaborn library's barplot function to create a bar chart. 'name' and 'count' are the column names from the DataFrame 'flowers' that will be used for the x and y axes, respectively.", + "observation": "The seaborn library, represented as `sns`, hasn't been imported, which is why the error is occurring.", + "code": "import seaborn as sns\nsns.barplot(x='name', y='count', data=flowers)" + } + ] + }, + { + "variables": { + "X_train": "numpy.ndarray(shape=(2, 3), dtype=float32)", + "y_train": "numpy.ndarray(shape=(2,), dtype=float32)" + }, + "codes": [ + "from sklearn.linear_model import LinearRegression" + ], + "request": "Fit a linear regression model to the training data.", + "first_attempt": "model = LinearRegression()\nmodel.fit(X_train, y_train)", + "interactions": [ + { + "error": null, + "output": "LinearRegression()", + "hint": null, + "explanation": "1. `model = LinearRegression()`: This line of code creates a new linear regression model.\n2. `model.fit(X_train, y_train)`: This line of code fits the model to the training data.", + "observation": "THE CODE IS CORRECT.", + "code": "" + } + ] + }, + { + "variables": { + "a": "numpy.ndarray(shape=(5,), dtype=int64)", + "b": "numpy.ndarray(shape=(5,), dtype=int64)" + }, + "codes": [], + "request": "Run a t-test to test whether the mean of a is significantly different from the mean of b. 
Output the p-value.", + "first_attempt": "from scipy.stats import ttest_ind\nresult = ttest_ind(a, b)\nresult.pvalue", + "interactions": [ + { + "error": null, + "output": "0.5191867712416913", + "hint": null, + "explanation": "1. `from scipy.stats import ttest_ind`: This imports the function `ttest_ind` from the `scipy.stats` module. The function `ttest_ind` performs a two-sample t-test, which tests whether the means of two independent samples are significantly different.\n2. `result = ttest_ind(a, b)`: This line performs the two-sample t-test on the arrays `a` and `b`. The result is an object that contains the t-statistic and the p-value of the test.\n3. `print(result.pvalue)`: This prints the p-value of the test result.", + "observation": "The code prints the result directly to the console. It should instead put the result directly in the last line of code.", + "code": "from scipy.stats import ttest_ind\nresult = ttest_ind(a, b)\nresult.pvalue" + } + ] + }, + { + "variables": { + "fruits": "pandas.DataFrame(shape=(5, 3), columns=[\"name\", \"price\", \"color\"])\n name price color\n 0 apple $1.20 red\n 1 banana $0.50 yellow\n 2 cherry $0.75 red\n 3 durian $2.50 green\n 4 elderberry $1.00 purple" + }, + "codes": [ + "fruits[\"name\"].nunique()", + "fruits[\"color\"].value_counts()" + ], + "request": "What is the average price of red fruits?", + "first_attempt": "fruits[fruits['color'] == 'red']['price'].apply(lambda x: float(x.replace('$', ''))).mean()", + "interactions": [ + { + "error": null, + "output": "0.975", + "hint": null, + "explanation": "1. `fruits[\"price_float\"] = fruits[\"price\"].apply(lambda x: float(x.replace(\"$\", \"\")))`: This line of code creates a new column \"price_float\" in the \"fruits\" DataFrame. This column is created by applying a function that removes the dollar sign from each value in the \"price\" column and converts the resulting string to a float.\n2. `fruits[fruits[\"color\"] == \"red\"][\"price_float\"].mean()`: This line of code first filters the \"fruits\" DataFrame to include only rows where the \"color\" column is \"red\". It then selects the \"price_float\" column from the filtered DataFrame and calculates the mean (average) of this column's values.", + "observation": "The code creates a new column in the \"fruits\" DataFrame, which is actually not necessary and violating the principle that no overwriting or modification should be done unless specified.", + "code": "fruits[fruits['color'] == 'red']['price'].apply(lambda x: float(x.replace('$', ''))).mean()" + } + ] + }, + { + "variables": { + "people": "pandas.DataFrame(shape=(6, 3), columns=[\"name\", \"gender\", \"age\"])\n name gender age\n 0 Alice Female 25\n 1 Bob Male 30\n 2 Charlie Male 20\n 3 David Male 35\n 4 Eve Female 40\n 5 Frank Other 45" + }, + "codes": [], + "request": "What is the average age of people for each gender?", + "first_attempt": "people.groupby('gender').age.mean().reset_index()", + "interactions": [ + { + "error": null, + "output": " gender age\n0 Male 28.333333\n1 Female 32.500000", + "hint": null, + "explanation": "1. The code first creates a dictionary with two keys: \"gender\" and \"age\".\n2. For each key, it assigns a list as value. The list for \"gender\" includes two strings: \"Male\" and \"Female\". The list for \"age\" includes two function calls which compute the mean age for male and female people, respectively.\n3. 
The code then creates a DataFrame using this dictionary.", + "observation": "The code only computes the average ages for \"Male\" and \"Female\" genders. However, there is another gender \"Other\" in the dataset which is not included in the computation.", + "code": "people.groupby('gender').age.mean().reset_index()" + } + ] + }, + { + "variables": { + "admissions": "pandas.DataFrame(shape=(5, 2), columns=[\"Date\", \"Applicants\"])\n Date Applicants\n 0 2016-9-30 1,536\n 1 2016-12-31 2,204\n 2 2017-3-15 11,385\n 3 2017-6-9 250\n 4 2017-6-15 7,359" + }, + "codes": [ + "import pandas as pd", + "admissions = pd.read_csv('admissions.csv')" + ], + "request": "Count the number of applicants between 2016-10-1 and 2017-6-10 per day.", + "first_attempt": "days = pd.Timestamp(\"2017-6-10\") - pd.Timestamp(\"2016-10-1\") + pd.Timedelta(days=1)\nadmissions[(admissions.Date >= \"2016-10-1\") & (admissions.Date <= \"2017-6-10\")].Applicants.sum() / days.days", + "interactions": [ + { + "error": "Traceback (most recent call last):\n File \"/home/coml/coml.py\", line 175, in days_example\n admissions[(admissions.Date >= \"2016-10-1\") & (admissions.Date <= \"2017-6-10\")].Applicants.sum() / days.days\nTypeError: unsupported operand type(s) for /: 'str' and 'int'", + "output": null, + "hint": null, + "explanation": "1. `days = pd.Timestamp(\"2017-6-10\") - pd.Timestamp(\"2016-10-1\") + pd.Timedelta(days=1)`: calculates the total number of days between 2016-10-1 and 2017-6-10.\n2. `admissions[(admissions.Date >= \"2016-10-1\") & (admissions.Date <= \"2017-6-10\")].Applicants.sum() / days.days`: filters the admissions dataframe for the specified date range, sums the applicants, and divides it by the total number of days.", + "observation": "The error message indicates that there's a type mismatch when performing an operation. Looking at the code, it seems that the 'Applicants' column is being treated as a string, not a number. This is probably due to the comma in the numbers, which needs to be removed. The date column should also be converted to a datetime object for comparison.", + "code": "admissions.Date = pd.to_datetime(admissions.Date)\nadmissions.Applicants = admissions.Applicants.str.replace(',', '').astype(int)\ndays = pd.Timestamp(\"2017-6-10\") - pd.Timestamp(\"2016-10-1\") + pd.Timedelta(days=1)\nadmissions[(admissions.Date >= \"2016-10-1\") & (admissions.Date <= \"2017-6-10\")].Applicants.sum() / days.days" + }, + { + "error": null, + "output": "54.6996", + "hint": null, + "explanation": "1. `admissions.Date = pd.to_datetime(admissions.Date)`: This line converts the 'Date' column of the DataFrame to a datetime object, and writes it back to the DataFrame.\n2. `admissions.Applicants = admissions.Applicants.str.replace(',', '').astype(int)`: This line removes the comma from the 'Applicants' column values and converts the column to integer type.\n3. `days = pd.Timestamp(\"2017-6-10\") - pd.Timestamp(\"2016-10-1\") + pd.Timedelta(days=1)`: This line calculates the total number of days between the two given dates.\n4. `admissions[(admissions.Date >= \"2016-10-1\") & (admissions.Date <= \"2017-6-10\")].Applicants.sum() / days.days`: This line filters the DataFrame to include only the rows where the date is between the two given dates. 
It then sums up the 'Applicants' column values and divides it by the total number of days calculated in the previous step.", + "observation": "The code contains inplace changes to the original DataFrame, but the request did not specify inplace modifications to the DataFrame. The request can be fulfilled by creating a new DataFrame instead of overwriting the original one.", + "code": "admissions_cvt = pd.DataFrame({\n \"Date\": pd.to_datetime(admissions.Date),\n \"Applicants\": admissions.Applicants.str.replace(',', '').astype(int),\n})\ndays = pd.Timestamp(\"2017-6-10\") - pd.Timestamp(\"2016-10-1\") + pd.Timedelta(days=1)\nadmissions_cvt[(admissions_cvt.Date >= \"2016-10-1\") & (admissions_cvt.Date <= \"2017-6-10\")].Applicants.sum() / days.days" + } + ] + } +] \ No newline at end of file diff --git a/coml/prompts/generate_fewshots_kaggle.json b/coml/prompts/generate_fewshots_kaggle.json new file mode 100644 index 0000000..77d3e81 --- /dev/null +++ b/coml/prompts/generate_fewshots_kaggle.json @@ -0,0 +1,70 @@ +[ + { + "variables": { + "dataset": "pandas.DataFrame(shape=(5043, 28), columns=[\"color\", \"director_name\", \"num_critic_for_reviews\", \"duration\", \"director_facebook_likes\", \"actor_3_facebook_likes\", \"actor_2_name\", \"actor_1_facebook_likes\", \"gross\", \"genres\", \"actor_1_name\", \"movie_title\", \"num_voted_users\", \"cast_total_facebook_likes\", \"actor_3_name\", \"facenumber_in_poster\", \"plot_keywords\", \"movie_imdb_link\", \"num_user_for_reviews\", \"language\", \"country\", \"content_rating\", \"budget\", \"title_year\", \"actor_2_facebook_likes\", \"imdb_score\", \"aspect_ratio\", \"movie_facebook_likes\"])\n color director_name num_critic_for_reviews duration director_facebook_likes actor_3_facebook_likes actor_2_name ... content_rating budget title_year actor_2_facebook_likes imdb_score aspect_ratio movie_facebook_likes\n 0 Color James Cameron 723.0 178.0 0.0 855.0 Joel David ... ... PG-13 237000000.0 2009.0 936.0 7.9 1.78 33000 \n 1 Color Gore Verbinski 302.0 169.0 563.0 1000.0 Orlando Bloom ... PG-13 300000000.0 2007.0 5000.0 7.1 2.35 0 \n ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... \n 5041 Color Daniel Hsia 14.0 100.0 0.0 489.0 Daniel Henney ... PG-13 NaN 2012.0 719.0 6.3 2.35 660 \n 5042 Color Jon Gunn 43.0 90.0 16.0 16.0 Brian Herzl... ... PG 1100.0 2004.0 23.0 6.6 1.85 456" + }, + "codes": [ + "import pandas as pd\n\ndataset = pd.read_csv(\"data/movie_metadata.csv\")", + "dataset.isnull().sum()" + ], + "request": "List the major genres in the dataset. Show the result in a DataFrame with columns `genre` and `count` (number of movies of that genre).\n", + "answer": "# Create a new variable to avoid unintentional modifications to raw DataFrame.\ngenres = pd.DataFrame({\n \"movie\": dataset[\"movie_title\"],\n \"genre\": dataset[\"genres\"].apply(lambda x: x.split(\"|\")[0]),\n})\n\n# Count the number of movies in each genre.\ngenres.groupby(\"genre\").count()[[\"movie\"]].rename(columns={\"movie\": \"count\"}).reset_index()" + }, + { + "variables": { + "iris": "pandas.DataFrame(shape=(150, 5), columns=[\"SepalLengthCm\", \"SepalWidthCm\", \"PetalLengthCm\", \"PetalWidthCm\", \"Species\"])\n SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species\n Id \n 1 5.1 3.5 1.4 0.2 Iris-setosa\n 2 4.9 3.0 1.4 0.2 Iris-setosa\n .. ... ... ... 
...\n 149 6.2 3.4 5.4 2.3 Iris-virginica\n 150 5.9 3.0 5.1 1.8 Iris-virginica", + "z": "5.735584236235733", + "pval": "0.05682424941067306" + }, + "codes": [ + "import pandas as pd\nimport numpy as np", + "iris = pd.read_csv(\"data/iris.csv\", index_col=\"Id\")", + "from scipy.stats import normaltest\nz, pval = normaltest(iris[\"SepalLengthCm\"])\npval" + ], + "request": "Analyze the correlations of the dataset. Eliminate upper triangle for readability.\n", + "answer": "corr = iris.select_dtypes(\"number\").corr(method=\"pearson\")\ncorr.where(np.tril(np.ones(corr.shape)).astype(bool))" + }, + { + "variables": { + "train_data": "pandas.DataFrame(shape=(891, 12), columns=[\"PassengerId\", \"Survived\", \"Pclass\", \"Name\", \"Sex\", \"Age\", \"SibSp\", \"Parch\", \"Ticket\", \"Fare\", \"Cabin\", \"Embarked\"])\n PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked\n 0 1 0 3 Braund, Mr.... male 22.0 1 0 A/5 21171 7.2500 NaN S\n 1 2 1 1 Cumings, Mr... female 38.0 1 0 PC 17599 71.2833 C85 C\n .. ... ... ... ... ... ... ... ... ... ... ... ...\n 889 890 1 1 Behr, Mr. K... male 26.0 0 0 111369 30.0000 C148 C\n 890 891 0 3 Dooley, Mr.... male 32.0 0 0 370376 7.7500 NaN Q", + "test_data": "pandas.DataFrame(shape=(418, 11), columns=[\"PassengerId\", \"Pclass\", \"Name\", \"Sex\", \"Age\", \"SibSp\", \"Parch\", \"Ticket\", \"Fare\", \"Cabin\", \"Embarked\"])\n PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked\n 0 892 3 Kelly, Mr. ... male 34.5 0 0 330911 7.8292 NaN Q\n 1 893 3 Wilkes, Mrs... female 47.0 1 0 363272 7.0000 NaN S\n .. ... ... ... ... ... ... ... ... ... ... ...\n 416 1308 3 Ware, Mr. F... male NaN 0 0 359309 8.0500 NaN S\n 417 1309 3 Peter, Mast... male NaN 1 1 2668 22.3583 NaN C", + "women": "pandas.Series(shape=(314,))" + }, + "codes": [ + "import numpy as np\nimport pandas as pd", + "train_data = pd.read_csv(\"data/titanic_train.csv\")\ntrain_data.shape", + "test_data = pd.read_csv(\"data/titanic_test.csv\")\ntest_data.shape", + "women = train_data.loc[train_data.Sex == \"female\"][\"Survived\"]\nsum(women) / len(women) * 100" + ], + "request": "List out the names with age 42.\n", + "answer": "train_data.loc[train_data.Age == 42, \"Name\"].tolist()" + }, + { + "variables": { + "twc": "pandas.DataFrame(shape=(93, 8), columns=[\"tweet_id\", \"author_id\", \"inbound\", \"created_at\", \"text\", \"response_tweet_id\", \"in_response_to_tweet_id\", \"text_wo_punct\"])\n tweet_id author_id inbound created_at text response_tweet_id in_response_to_tweet_id text_wo_punct\n 0 119237 105834 True Wed Oct 11 ... @AppleSuppo... 119236 NaN applesuppor...\n 1 119238 ChaseSupport False Wed Oct 11 ... @105835 You... NaN 119239.0 105835 your...\n .. ... ... ... ... ... ... ... ...\n 91 119333 105861 True Wed Oct 11 ... @Tesco bit ... 119335,119336 119332.0 tesco bit o...\n 92 119335 Tesco False Wed Oct 11 ... @105861 If ... NaN 119333.0 105861 if t..." + }, + "codes": [ + "import numpy as np\nimport pandas as pd", + "twc = pd.read_csv(\"data/twc-sample.csv\")", + "import string\n\ndef remove_punctuation(text: str):\n return text.lower().translate(str.maketrans(\"\", \"\", string.punctuation))\n\ntwc[\"text_wo_punct\"] = twc[\"text\"].apply(remove_punctuation)" + ], + "request": "Remove English stopwords from the `text_wo_punct` column. 
Save the result in a new column `text_wo_stopwords`.\n", + "answer": "from nltk.corpus import stopwords\n\nSTOPWORDS = set(stopwords.words(\"english\"))\n\ntwc[\"text_wo_stopwords\"] = twc[\"text_wo_punct\"].apply(\n lambda text: \" \".join([word for word in str(text).split() if word not in STOPWORDS])\n)" + }, + { + "variables": { + "df": "pandas.DataFrame(shape=(1599, 12), columns=[\"fixed acidity\", \"volatile acidity\", \"citric acid\", \"residual sugar\", \"chlorides\", \"free sulfur dioxide\", \"total sulfur dioxide\", \"density\", \"pH\", \"sulphates\", \"alcohol\", \"quality\"])\n fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol quality\n 0 7.4 0.700 0.00 1.9 0.076 11.0 34.0 0.99780 3.51 0.56 9.4 5\n 1 7.8 0.880 0.00 2.6 0.098 25.0 67.0 0.99680 3.20 0.68 9.8 5\n ... ... ... ... ... ... ... ... ... ... ... ... ...\n 1597 5.9 0.645 0.12 2.0 0.075 32.0 44.0 0.99547 3.57 0.71 10.2 5\n 1598 6.0 0.310 0.47 3.6 0.067 18.0 42.0 0.99549 3.39 0.66 11.0 6", + "z": "pandas.DataFrame(shape=(1599, 12), columns=[\"fixed acidity\", \"volatile acidity\", \"citric acid\", \"residual sugar\", \"chlorides\", \"free sulfur dioxide\", \"total sulfur dioxide\", \"density\", \"pH\", \"sulphates\", \"alcohol\", \"quality\"])\n fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol quality\n 0 0.528360 0.961877 1.391472 0.453218 0.243707 0.466193 0.379133 0.558274 1.288643 0.579207 0.960246 0.787823\n 1 0.298547 1.967442 1.391472 0.043416 0.223875 0.872638 0.624363 0.028261 0.719933 0.128950 0.584777 0.787823\n ... ... ... ... ... ... ... ... ... ... ... ... ...\n 1597 1.390155 0.654620 0.775267 0.382271 0.264960 1.542054 0.075043 0.676657 1.677400 0.305990 0.209308 0.787823\n 1598 1.332702 1.216849 1.021999 0.752894 0.434990 0.203223 0.135861 0.666057 0.511130 0.010924 0.541630 0.450848", + "X": "pandas.DataFrame(shape=(1599, 11), columns=[\"fixed acidity\", \"volatile acidity\", \"citric acid\", \"residual sugar\", \"chlorides\", \"free sulfur dioxide\", \"total sulfur dioxide\", \"density\", \"pH\", \"sulphates\", \"alcohol\"])\n fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol\n 0 7.4 0.700 0.00 1.9 0.076 11.0 34.0 0.99780 3.51 0.56 9.4\n 1 7.8 0.880 0.00 2.6 0.098 25.0 67.0 0.99680 3.20 0.68 9.8\n ... ... ... ... ... ... ... ... ... ... ... ...\n 1597 5.9 0.645 0.12 2.0 0.075 32.0 44.0 0.99547 3.57 0.71 10.2\n 1598 6.0 0.310 0.47 3.6 0.067 18.0 42.0 0.99549 3.39 0.66 11.0", + "y": "pandas.Series(shape=(1599,))" + }, + "codes": [ + "import pandas as pd", + "df = pd.read_csv(\"data/winequality-red.csv\")", + "import numpy as np\nfrom scipy import stats\n\nz = np.abs(stats.zscore(df))\ndf[(z < 3).all(axis=1)]", + "from sklearn.model_selection import train_test_split\n\nX = df.drop(columns=\"quality\")\ny = df[\"quality\"]" + ], + "request": "Split X and y into train and test sets. 
Use 20% of the data for testing.\nSet random state to 42.\nFit a RandomForestClassifier `clf` with 100 estimators on the training data, also using random state 42.\n", + "answer": "from sklearn.ensemble import RandomForestClassifier\n\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\nclf = RandomForestClassifier(n_estimators=100, random_state=42)\nclf.fit(X_train, y_train)" + } +] \ No newline at end of file diff --git a/coml/prompts/generate_fewshots_leetcode.json b/coml/prompts/generate_fewshots_leetcode.json new file mode 100644 index 0000000..d4bda23 --- /dev/null +++ b/coml/prompts/generate_fewshots_leetcode.json @@ -0,0 +1,42 @@ +[ + { + "variables": {}, + "codes": [ + "import pandas as pd" + ], + "request": "Write a function `def capital_gainloss(stocks: pd.DataFrame) -> pd.DataFrame`.\n\n`stocks` is a DataFrame with the following columns:\n- stock_name: str\n- operation: str\n- operation_day: int\n- price: int\nEach record indicates that the stock which has `stock_name` had an operation on the day `operation_day` with the price. The `operation` must be one of `Sell`, `Buy`. It is guaranteed that each 'Sell' operation for a stock has a corresponding 'Buy' operation in a previous day. It is also guaranteed that each 'Buy' operation for a stock has a corresponding 'Sell' operation in an upcoming day.\n\nThe function should report the **Capital gain/loss** for each stock. The **Capital gain/loss** of a stock is the total gain or loss after buying and selling the stock one or many times. Return the result table in **any order**.\n\nThe result format is in the following example.\n\nExample input:\n```\nstocks:\n+---------------+-----------+---------------+--------+\n| stock_name | operation | operation_day | price |\n+---------------+-----------+---------------+--------+\n| Leetcode | Buy | 1 | 1000 |\n| Corona Masks | Buy | 2 | 10 |\n| Leetcode | Sell | 5 | 9000 |\n| Handbags | Buy | 17 | 30000 |\n| Corona Masks | Sell | 3 | 1010 |\n| Corona Masks | Buy | 4 | 1000 |\n| Corona Masks | Sell | 5 | 500 |\n| Corona Masks | Buy | 6 | 1000 |\n| Handbags | Sell | 29 | 7000 |\n| Corona Masks | Sell | 10 | 10000 |\n+---------------+-----------+---------------+--------+\n```\n\nExample output:\n```\n+---------------+-------------------+\n| stock_name | capital_gain_loss |\n+---------------+-------------------+\n| Corona Masks | 9500 |\n| Leetcode | 8000 |\n| Handbags | -23000 |\n+---------------+-------------------+\n```\n\nExample explanation:\n- Leetcode stock was bought at day 1 for 1000$ and was sold at day 5 for 9000$. Capital gain = 9000 - 1000 = 8000$.\n- Handbags stock was bought at day 17 for 30000$ and was sold at day 29 for 7000$. Capital loss = 7000 - 30000 = -23000$.\n- Corona Masks stock was bought at day 2 for 10$ and was sold at day 3 for 1010$. It was bought again at day 4 for 1000$ and was sold at day 5 for 500$. At last, it was bought at day 6 for 1000$ and was sold at day 10 for 10000$. 
Capital gain/loss is the sum of capital gains/losses for each ('Buy' --> 'Sell') operation = (1010 - 10) + (500 - 1000) + (10000 - 1000) = 1000 - 500 + 9000 = 9500$.\n", + "answer": "def capital_gainloss(stocks: pd.DataFrame) -> pd.DataFrame:\n # Create a copy of the dataframe to avoid modifying the original one\n df = stocks.copy()\n\n # Calculate the capital gain/loss for each stock\n df[\"price\"] = df.apply(\n lambda x: -x[\"price\"] if x[\"operation\"] == \"Buy\" else x[\"price\"], axis=1\n )\n result = df.groupby(\"stock_name\")[\"price\"].sum().reset_index()\n result.columns = [\"stock_name\", \"capital_gain_loss\"]\n\n return result" + }, + { + "variables": {}, + "codes": [ + "import pandas as pd" + ], + "request": "Write a function `def consecutive_numbers(logs: pd.DataFrame) -> pd.DataFrame`.\n\n`logs` is a DataFrame with the following columns:\n- id: int\n- num: int\n`id` is an autoincrement column and serves as the primary key.\n\nThe function should find all numbers that appear at least three times consecutively and return the result in a DataFrame with a single column `ConsecutiveNums`. The result should be returned in **any order**.\n\nExample input:\n```\nlogs:\n+----+-----+\n| id | num |\n+----+-----+\n| 1 | 1 |\n| 2 | 1 |\n| 3 | 1 |\n| 4 | 2 |\n| 5 | 1 |\n| 6 | 2 |\n| 7 | 2 |\n+----+-----+\n```\n\nExample output:\n```\n+-----------------+\n| ConsecutiveNums |\n+-----------------+\n| 1 |\n+-----------------+\n```\n\nExample explanation:\n- 1 is the only number that appears consecutively for at least three times.\n", + "answer": "def consecutive_numbers(logs: pd.DataFrame) -> pd.DataFrame:\n # Create a new column to identify consecutive numbers\n logs[\"shifted_num\"] = logs[\"num\"].shift(2)\n logs[\"is_consecutive\"] = logs[\"num\"].eq(logs[\"num\"].shift()) & logs[\"num\"].eq(logs[\"shifted_num\"])\n\n # Filter out the rows where numbers are not consecutive\n consecutive = logs[logs[\"is_consecutive\"]]\n\n # Get the unique numbers that are consecutive\n result = pd.DataFrame(consecutive[\"num\"].unique(), columns=[\"ConsecutiveNums\"])\n\n return result" + }, + { + "variables": {}, + "codes": [ + "import pandas as pd" + ], + "request": "Write a function `def employee_bonus(employee: pd.DataFrame, bonus: pd.DataFrame) -> pd.DataFrame`.\n\n`employee` is a DataFrame with the following columns:\n- empId: int\n- name: str\n- supervisor: int\n- salary: int\nEach record indicates the name and the ID of an employee in addition to their salary and the id of their manager.\n\n`bonus` is a DataFrame with the following columns:\n- empId: int\n- bonus: int\nEach record contains the id of an employee and their respective bonus.\n\nThe function should report the name and bonus amount of each employee with a bonus **less than** `1000`.\n\nReturn the result table in **any order**.\n\nThe result format is in the following example.\n\nExample input:\n```\nemployee:\n+-------+--------+------------+--------+\n| empId | name | supervisor | salary |\n+-------+--------+------------+--------+\n| 3 | Brad | null | 4000 |\n| 1 | John | 3 | 1000 |\n| 2 | Dan | 3 | 2000 |\n| 4 | Thomas | 3 | 4000 |\n+-------+--------+------------+--------+\nbonus:\n+-------+-------+\n| empId | bonus |\n+-------+-------+\n| 2 | 500 |\n| 4 | 2000 |\n+-------+-------+\n```\n\nExample output:\n```\n+------+-------+\n| name | bonus |\n+------+-------+\n| Brad | null |\n| John | null |\n| Dan | 500 |\n+------+-------+\n```\n", + "answer": "def employee_bonus(employee: pd.DataFrame, bonus: pd.DataFrame) -> pd.DataFrame:\n 
# Merge the employee and bonus dataframes on empId.\n merged_df = pd.merge(employee[[\"empId\", \"name\"]], bonus, on=\"empId\", how=\"left\")\n\n # Filter out the employees with bonus >= 1000 or without a bonus.\n result_df = merged_df[merged_df[\"bonus\"].isna() | (merged_df[\"bonus\"] < 1000)]\n\n # Select only the name and bonus columns for the final output.\n return result_df[[\"name\", \"bonus\"]]" + }, + { + "variables": {}, + "codes": [ + "import pandas as pd" + ], + "request": "Write a function `def price_at_given_date(products: pd.DataFrame) -> pd.DataFrame`.\n\n`products` is a DataFrame with the following columns:\n- product_id: int\n- new_price: int\n- change_date: datetime\nEach record indicates that the price of some product was changed to a new price at some date. Assume the price of all products before any change is `10`.\n\nThe function should return the prices of all products on `2019-08-16`. Return the result table in **any order**.\n\nThe result format is in the following example.\n\nExample input:\n```\nproducts:\n+------------+-----------+-------------+\n| product_id | new_price | change_date |\n+------------+-----------+-------------+\n| 1 | 20 | 2019-08-14 |\n| 2 | 50 | 2019-08-14 |\n| 1 | 30 | 2019-08-15 |\n| 1 | 35 | 2019-08-16 |\n| 2 | 65 | 2019-08-17 |\n| 3 | 20 | 2019-08-18 |\n+------------+-----------+-------------+\n```\n\nExample output:\n```\n+------------+-------+\n| product_id | price |\n+------------+-------+\n| 2 | 50 |\n| 1 | 35 |\n| 3 | 10 |\n+------------+-------+\n```\n", + "answer": "def price_at_given_date(products: pd.DataFrame) -> pd.DataFrame:\n # Filter out the prices that are set after the given date\n filtered_products = products[products[\"change_date\"] <= \"2019-08-16\"]\n\n # Sort by product_id and change_date to get the latest price before or on the given date\n sorted_products = filtered_products.sort_values(\n by=[\"product_id\", \"change_date\"], ascending=[True, False]\n )\n\n # Drop duplicates to keep only the latest price for each product\n latest_prices = sorted_products.drop_duplicates(subset=[\"product_id\"])\n\n # If a product has no price change before the given date, its price is assumed to be 10\n all_products = pd.DataFrame({\"product_id\": products[\"product_id\"].unique()})\n result = pd.merge(\n all_products,\n latest_prices[[\"product_id\", \"new_price\"]],\n on=\"product_id\",\n how=\"left\",\n )\n result[\"price\"] = result[\"new_price\"].fillna(10).astype(int)\n\n # Select the required columns and return the result\n return result[[\"product_id\", \"price\"]]" + }, + { + "variables": {}, + "codes": [ + "import pandas as pd" + ], + "request": "Write a function `def latest_login(logins: pd.DataFrame) -> pd.DataFrame`.\n\n`logins` is a DataFrame with the following columns:\n- user_id: int\n- time_stamp: datetime\nEach record contains information about the login time for the user with ID `user_id`. The combination of `user_id` and `time_stamp` is unique for this table.\n\nThe function should report the **latest** login for all users in the year `2020`. Do **not** include the users who did not login in `2020`. 
Return the result table **in any order**.\n\nThe result format is in the following example.\n\nExample input:\n```\nlogins:\n+---------+---------------------+\n| user_id | time_stamp |\n+---------+---------------------+\n| 6 | 2020-06-30 15:06:07 |\n| 6 | 2021-04-21 14:06:06 |\n| 6 | 2019-03-07 00:18:15 |\n| 8 | 2020-02-01 05:10:53 |\n| 8 | 2020-12-30 00:46:50 |\n| 2 | 2020-01-16 02:49:50 |\n| 2 | 2019-08-25 07:59:08 |\n| 14 | 2019-07-14 09:00:00 |\n| 14 | 2021-01-06 11:59:59 |\n+---------+---------------------+\n```\n\nExample output:\n```\n+---------+---------------------+\n| user_id | last_stamp |\n+---------+---------------------+\n| 6 | 2020-06-30 15:06:07 |\n| 8 | 2020-12-30 00:46:50 |\n| 2 | 2020-01-16 02:49:50 |\n+---------+---------------------+\n```\n\nExample explanation:\n- User 6 logged into their account 3 times but only once in 2020, so we include this login in the result table.\n- User 8 logged into their account 2 times in 2020, once in February and once in December. We include only the latest one (December) in the result table.\n- User 2 logged into their account 2 times but only once in 2020, so we include this login in the result table.\n- User 14 did not login in 2020, so we do not include them in the result table.\n", + "answer": "def latest_login(logins: pd.DataFrame) -> pd.DataFrame:\n # Filter logins for the year 2020\n logins_2020 = logins[logins[\"time_stamp\"].dt.year == 2020]\n\n # Find the latest login for each user in 2020\n latest_logins = logins_2020.groupby(\"user_id\")[\"time_stamp\"].max().reset_index()\n\n # Rename the column for the output\n latest_logins = latest_logins.rename(columns={\"time_stamp\": \"last_stamp\"})\n\n return latest_logins" + } +] \ No newline at end of file diff --git a/coml/prompts/generate_fewshots_v2.json b/coml/prompts/generate_fewshots_v2.json new file mode 100644 index 0000000..4ce1c45 --- /dev/null +++ b/coml/prompts/generate_fewshots_v2.json @@ -0,0 +1,59 @@ +[ + { + "variables": { + "df": "pandas.DataFrame(shape=(5, 2), columns=[\"width\", \"height\"])\n width height\n 0 35 20\n 1 42 12\n 2 38 30\n 3 45 25\n 4 50 15" + }, + "codes": [ + "# Compute the area\ndf[\"area\"] = df[\"width\"] * df[\"height\"]" + ], + "request": "Output the correlation between width and height.", + "answer": "df[\"width\"].corr(df[\"height\"])", + "rationale": "To find the correlation between the 'width' and 'height' columns in the DataFrame `df`, we can use the `corr()` method provided by pandas, which computes pairwise correlations of columns, excluding NA/null values. Since we are only interested in the correlation between two specific columns, we can isolate those two columns and then call the `corr()` method.\n\nThe steps to do this are as follows:\n\n1. Select the 'width' and 'height' columns from the DataFrame `df`.\n2. Call the `corr()` method on 'width' column, with 'height' column as its parameter.\n\nHere is the code to do this:" + }, + { + "variables": { + "df": "pandas.DataFrame(shape=(10, 4), columns=[\"students\", \"score\", \"grade\", \"pass\"])\n students score grade pass\n 0 Alice 53 F False\n 1 Bob 68 D False\n .. ... ... ... 
...\n 8 Ivan 45 F False\n 9 Judy 80 B True" + }, + "codes": [ + "# Add a column \"grade\" based on the score\ndf[\"grade\"] = df[\"score\"].apply(lambda x: \"A\" if x >= 90 else \"B\" if x >= 80 else \"C\" if x >= 70 else \"D\" if x >= 60 else \"F\")", + "# Add a column \"pass\" based on the grade\ndf[\"pass\"] = df[\"grade\"].apply(lambda x: x in [\"A\", \"B\", \"C\"])" + ], + "request": "Who are the top 3 students in the class?", + "answer": "df.sort_values(by=\"score\", ascending=False).head(3)[\"students\"].tolist()", + "rationale": "To identify the top 3 students in the class, we need to follow these steps:\n\n1. **Sort the DataFrame:** We need to sort the DataFrame based on the 'score' column in descending order because the students with the highest scores are considered the top students.\n2. **Select the Top 3:** After sorting, we select the first three rows of the DataFrame, which will represent the top 3 students.\n3. **Extract Student Names:** Finally, we extract the 'students' column from these top 3 rows to get the names of the top students, and convert it to a list.\n\nGiven that the DataFrame is named `df`, here is how you can perform these steps in code:" + }, + { + "variables": { + "df": "pandas.DataFrame(shape=(6, 4), columns=[\"product\", \"price\", \"weight\", \"screen\"])\n product price weight screen\n 0 Macbook Pro $1,299 3.0 lbs 13.3\"\n 1 Macbook Air $999 2.8 lbs 13.3\"\n 2 Surface Pro $799 1.7 lbs 12.3\"\n 3 Surface Laptop $999 2.8 lbs 13.5\"\n 4 Surface Book $1,499 3.3 lbs 13.5\"\n 5 Surface Studio $3,499 21 lbs 28\"" + }, + "codes": [ + "# Convert the price to float inplace\ndf[\"price\"] = df[\"price\"].apply(lambda x: float(x.replace(\"$\", \"\").replace(\",\", \"\")))" + ], + "request": "Compare the average weight of surface products and macbook products?", + "answer": "\n# Import pandas as it seems unimported.\nimport pandas as pd\n\n# Create a new variable to avoid unintentional modifications to raw DataFrame.\ndf_product_weight = pd.DataFrame({\n \"product\": df[\"product\"].apply(lambda x: \"Macbook\" if \"Macbook\" in x else \"Surface\"),\n \"weight\": df[\"weight\"].apply(lambda x: float(x.replace(\" lbs\", \"\"))),\n})\ndf_product_weight.groupby(\"product\").mean()", + "rationale": "To compare the average weight of Surface products and MacBook products, we'll need to:\n\n1. Extract the weight as a numerical value from the \"weight\" column.\n2. Create a new column to classify each product as either 'Surface' or 'Macbook'.\n3. Use the `groupby` function to calculate the average weight for each product group.\n\nWe need to avoid unintentional modifications to the raw DataFrame, as the request didn't ask for in-place changes to the data.", + "answer_wo_intact": "# Import pandas as it seems unimported.\nimport pandas as pd\n\ndf[\"product\"] = df[\"product\"].apply(lambda x: \"Macbook\" if \"Macbook\" in x else \"Surface\")\ndf[\"weight\"] = df[\"weight\"].apply(lambda x: float(x.replace(\" lbs\", \"\")))\ndf.groupby(\"product\").mean()", + "rationale_wo_intact": "To compare the average weight of Surface products and MacBook products, we'll need to:\n\n1. Extract the weight as a numerical value from the \"weight\" column.\n2. Classify each product as either 'Surface' or 'Macbook'.\n3. Use the `groupby` function to calculate the average weight for each product group." 
+ }, + { + "variables": { + "cars": "pandas.DataFrame(shape=(5, 3), columns=[\"brand\", \"model\", \"price\"])\n brand model price\n 0 BMW X5 60000\n 1 Toyota Camry 25000\n 2 Honda Civic 20000\n 3 Ford Mustang 30000\n 4 Tesla Model 3 40000", + "owners": "pandas.DataFrame(shape=(6, 2), columns=[\"model\", \"owner\"])\n model owner\n 0 Civic Tom\n 1 X5 Tom\n 2 X5 Bob\n 3 Mustang Alice\n 4 Model 3 Alice\n 5 Model 3 Bob" + }, + "codes": [ + "# Compute the average price of each brand\ncars.groupby(\"brand\").mean()" + ], + "request": "Compute the total price of each owner's cars. Return a series with \"Owner\" as index and \"Price\" as value.", + "answer": "cars.merge(owners, on=\"model\").groupby(\"owner\").sum()[\"price\"].rename(\"Price\").rename_axis(\"Owner\")", + "rationale": "To compute the total price of each owner's cars, we need to follow these steps:\n\n1. Merge the `cars` DataFrame with the `owners` DataFrame on the \"model\" column to link each car's price with its owner.\n2. Group the merged DataFrame by the \"owner\" column.\n3. Sum the \"price\" column within each group to get the total price per owner.\n4. Return the resulting series with \"owner\" as index and the total \"price\" as value. The `price` column should be renamed to `Price` and the index should be renamed to `Owner` as required.\n\nLet's write the code to perform these steps:" + }, + { + "variables": { + "df": "pandas.DataFrame(shape=(6, 2), columns=[\"year\", \"sales\"])\n year sales\n 0 2015 100\n 1 2016 120\n 2 2017 150\n 3 2018 180\n 4 2019 200\n 5 2020 250" + }, + "codes": [], + "request": "Visualize the sales trend over years.", + "answer": "import matplotlib.pyplot as plt\nplt.plot(df[\"year\"], df[\"sales\"])", + "rationale": "To visualize the sales trend over the years, we will create a line plot using `matplotlib.pyplot`. This type of plot is ideal for showing how sales figures have changed over time. Here's how we can do it:\n\n1. Import `matplotlib.pyplot` for plotting.\n2. Use the `plot` function to create a line plot with 'year' on the x-axis and 'sales' on the y-axis.\n\nLet me write the code for you." + } +] \ No newline at end of file diff --git a/package.json b/package.json index 35ae82e..6c1fdf9 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "coml", - "version": "0.0.6", + "version": "0.0.7", "description": "JupyterLab extension for CoML.", "keywords": [ "jupyter", diff --git a/pyproject.toml b/pyproject.toml index df3b633..2614f17 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "mlcopilot" -version = "0.0.6" +version = "0.0.7" dependencies = [ "click", "colorama", @@ -109,7 +109,6 @@ before-build-python = ["jlpm clean:all"] line-length = 88 target-version = ['py310'] include = '\.pyi?$' -packages = ["coml"] extend-exclude = '.+/(dist|.venv|venv|build)/.+' [tool.isort]