HuggingFace support and June updates (#2)
* Improve SQL select experience implementation

* support pgvector

* Feature: Add huggingface model/hyperparameters space suggestion

* Improve packaging

* Minor fix

* Generalize prompt template and knowledge

* Minor fix
zhanglei1172 authored Jul 4, 2023
1 parent b3c0e6a commit e432607
Showing 18 changed files with 421 additions and 301 deletions.
8 changes: 4 additions & 4 deletions .github/workflows/ci.yml
@@ -2,9 +2,9 @@ name: Python CI

on:
push:
branches: [ demo_orphan ]
branches: [ main, dev ]
pull_request_target:
branches: [ demo_orphan ]
branches: [ main, dev ]

concurrency:
group: ${{ format('ci-{0}', github.head_ref && format('pr-{0}', github.event.pull_request.number) || github.sha) }}
@@ -32,7 +32,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
pip install -e .[dev]
- name: Lint with flake8
run: flake8
@@ -61,7 +61,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
pip install -e .[dev]
- name: Test with pytest
run: |
pytest
3 changes: 1 addition & 2 deletions README.md
@@ -20,8 +20,7 @@ MLCopilot is a tool to help you find the best models/hyperparameters for your tas
0. Clone this repo: `git clone REPO_URL; cd mlcopilot`
1. Put assets/mlcopilot.db in your home directory: `cp assets/mlcopilot.db ~/.mlcopilot/mlcopilot.db`
2. Install Python 3.8 or higher
3. Build: `hatch build`. (May need to install [hatch](https://hatch.pypa.io/latest/install/) first)
4. Install: `pip install ./dist/*.whl`
3. Install: `pip install .`. If you want to develop, use `pip install -e .[dev]` instead.

### Run

Binary file modified assets/mlcopilot.db
Binary file not shown.
16 changes: 14 additions & 2 deletions mlcopilot/.env.template
@@ -3,5 +3,17 @@
OPENAI_API_KEY=your-openai-api-key

### DB
## MLCOPILOT_DB_PATH - Path to database file (Example: ~/.mlcopilot/mlcopilot.db)
MLCOPILOT_DB_PATH=~/.mlcopilot/mlcopilot.db
## MLCOPILOT_DB_BACKEND - Database backend (Example: sqlite)
MLCOPILOT_DB_BACKEND=sqlite
## MLCOPILOT_DB_PATH - Path to database file (Example: ~/.mlcopilot/mlcopilot.db) - Only for sqlite
MLCOPILOT_DB_PATH=~/.mlcopilot/mlcopilot.db
## MLCOPILOT_DB_NAME - Database name (Example: mlcopilot)
MLCOPILOT_DB_NAME=mlcopilot
## MLCOPILOT_DB_HOST - Database host (Example: localhost)
MLCOPILOT_DB_HOST=localhost
## MLCOPILOT_DB_PORT - Database port (Example: 5432)
MLCOPILOT_DB_PORT=5432
## MLCOPILOT_DB_USER - Database user (Example: postgres)
MLCOPILOT_DB_USER=postgres
## MLCOPILOT_DB_PASSWORD - Database password (Example: '')
MLCOPILOT_DB_PASSWORD=''
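
These settings back the new pgvector/PostgreSQL support. As a rough sketch only (the actual initialization lives in mlcopilot/orm.py, which is not shown in this commit excerpt, so the function name and exact peewee calls below are assumptions), a backend switch driven by these variables could look like:

```python
# Hypothetical sketch, not part of this commit: selecting a peewee backend
# from the MLCOPILOT_DB_* settings above.
from peewee import PostgresqlDatabase, SqliteDatabase

from mlcopilot.constants import (
    MLCOPILOT_DB_BACKEND,
    MLCOPILOT_DB_HOST,
    MLCOPILOT_DB_NAME,
    MLCOPILOT_DB_PASSWORD,
    MLCOPILOT_DB_PATH,
    MLCOPILOT_DB_PORT,
    MLCOPILOT_DB_USER,
)


def init_database():
    """Return a peewee database handle for the configured backend."""
    if MLCOPILOT_DB_BACKEND == "sqlite":
        # Single-file database; only MLCOPILOT_DB_PATH is used.
        return SqliteDatabase(str(MLCOPILOT_DB_PATH))
    # PostgreSQL backend; the pgvector extension must be installed in this
    # database for the embedding-similarity queries used in experience.py.
    return PostgresqlDatabase(
        MLCOPILOT_DB_NAME,
        host=MLCOPILOT_DB_HOST,
        port=int(MLCOPILOT_DB_PORT),
        user=MLCOPILOT_DB_USER,
        password=MLCOPILOT_DB_PASSWORD,
    )
```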
36 changes: 36 additions & 0 deletions mlcopilot/constants.py
@@ -8,16 +8,38 @@
"bin_map",
"inverse_bin_map",
"q_num",
"MLCOPILOT_DB_BACKEND",
"MLCOPILOT_DB_NAME",
"MLCOPILOT_DB_HOST",
"MLCOPILOT_DB_PORT",
"MLCOPILOT_DB_USER",
"MLCOPILOT_DB_PASSWORD",
"PROMPT_FORMATS",
"DEFAULT_PROMPT_PREFIX",
"DEFAULT_PROMPT_SUFFIX",
"TOKEN_LIMIT",
"TOKEN_COMPLETION_LIMIT",
"RELAX_TOKEN",
]

TOP_K = 3
EMBED_DIM = 1536
TOKEN_LIMIT = 4096
TOKEN_COMPLETION_LIMIT = 800
RELAX_TOKEN = 500  # RELAX_TOKEN is the number of tokens reserved to avoid exceeding the token limit

MLCOPILOT_DB_BACKEND = os.environ.get("MLCOPILOT_DB_BACKEND", "sqlite")

MLCOPILOT_DB_PATH = Path(
os.environ.get("MLCOPILOT_DB_PATH", Path.home() / ".mlcopilot" / "mlcopilot.db")
).expanduser()

MLCOPILOT_DB_NAME = os.environ.get("MLCOPILOT_DB_NAME", "mlcopilot")
MLCOPILOT_DB_HOST = os.environ.get("MLCOPILOT_DB_HOST", "localhost")
MLCOPILOT_DB_PORT = os.environ.get("MLCOPILOT_DB_PORT", 5432)
MLCOPILOT_DB_USER = os.environ.get("MLCOPILOT_DB_USER", "postgres")
MLCOPILOT_DB_PASSWORD = os.environ.get("MLCOPILOT_DB_PASSWORD", "")

bin_map = {
0.1: "very small",
0.3: "small",
@@ -46,3 +68,17 @@
)

q_num = sorted(list(bin_map.keys()))

PROMPT_FORMATS = {
"TOP_K",
"knowledge",
"space_desc",
"new_task_desc",
}

DEFAULT_PROMPT_PREFIX = """{space_desc}\nRecommend best configurations to train a model for a new task. Format strictly follows this template: ```Configuration 1: {{parameter_1_name}} is {{parameter_1_value}}. {{parameter_2_name}} is {{parameter_2_value}}...{{parameter_n_name}} is {{parameter_n_value}}.
Configuration 2: {{parameter_1_name}} is {{parameter_1_value}}. {{parameter_2_name}} is {{parameter_2_value}}...{{parameter_n_name}} is {{parameter_n_value}}.
Configuration 3: {{parameter_1_name}} is {{parameter_1_value}}. {{parameter_2_name}} is {{parameter_2_value}}...{{parameter_n_name}} is {{parameter_n_value}}.
```\nHere are some tasks along with best hyper-parameter configurations to train a model on them.\n"""

DEFAULT_PROMPT_SUFFIX = """\nGuidelines:{knowledge}\n\n\nBased on the examples(if provided) and guidelines(if provided) above, recommend {TOP_K} hyper-parameter configurations for a new classification dataset.\n\n{new_task_desc}"""
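
PROMPT_FORMATS lists the only placeholders the two templates accept; the doubled braces in the prefix survive formatting as literal single braces, acting as the output template the LLM is asked to follow. A minimal illustration (all values below are invented for the example):

```python
# Illustrative only: filling the new prompt templates with placeholder values.
from mlcopilot.constants import DEFAULT_PROMPT_PREFIX, DEFAULT_PROMPT_SUFFIX

prefix = DEFAULT_PROMPT_PREFIX.format(space_desc="Space: XGBoost on tabular data.")
suffix = DEFAULT_PROMPT_SUFFIX.format(
    knowledge="\n1. Prefer smaller learning rates on larger datasets.",
    TOP_K=3,
    new_task_desc="Task: 10k rows, 20 numeric features, binary label.",
)
# The retrieved example blocks from gen_experience() go between the two parts.
prompt = prefix + "<example tasks and configurations>" + suffix
print(prompt)
```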
34 changes: 20 additions & 14 deletions mlcopilot/experience.py
@@ -1,7 +1,7 @@
from __future__ import annotations

from collections import OrderedDict
from typing import Any, Dict, List, Optional
from typing import Any, Dict, List, Optional, Tuple

import langchain
import numpy as np
@@ -330,13 +330,14 @@ def _get_best_relevant_solutions(space: Space, task_desc: str) -> ModelSelect:
The best relevant solution.
"""
SolutionAlias = Solution.alias()
order_key = Task.embedding.cosine_distance(task_desc)
subquery = (
SolutionAlias.select(
SolutionAlias.demo,
Task.task_id,
Task.desc,
Task.embedding,
fn.RANK()
fn.ROW_NUMBER()
.over(
partition_by=[SolutionAlias.space, SolutionAlias.task],
order_by=[SolutionAlias.metric.desc()],
@@ -345,11 +346,13 @@ def _get_best_relevant_solutions(space: Space, task_desc: str) -> ModelSelect:
)
.where(SolutionAlias.space == space)
.join(Task, on=(SolutionAlias.task == Task.task_id))
.order_by(fn.cosine_similarity(task_desc, Task.embedding).desc())
.order_by(order_key)
.alias("subq")
)
query = Solution.select(subquery.c.task_id, subquery.c.demo, subquery.c.desc).from_(
subquery
query = (
Solution.select(subquery.c.task_id, subquery.c.demo, subquery.c.desc)
.from_(subquery)
.where(subquery.c.rnk <= TOP_K)
)
return query

@@ -375,7 +378,7 @@ def _get_best_solutions(space: Space) -> ModelSelect:
Task.task_id,
Task.desc,
Task.embedding,
fn.RANK()
fn.ROW_NUMBER()
.over(
partition_by=[SolutionAlias.space, SolutionAlias.task],
order_by=[SolutionAlias.metric.desc()],
@@ -386,13 +389,17 @@
.join(Task, on=(SolutionAlias.task == Task.task_id))
.alias("subq")
)
query = Solution.select(subquery.c.task_id, subquery.c.demo, subquery.c.desc).from_(
subquery
query = (
Solution.select(subquery.c.task_id, subquery.c.demo, subquery.c.desc)
.from_(subquery)
.where(subquery.c.rnk <= TOP_K)
)
return query


def gen_experience(space: Space, task_desc: Optional[str] = None) -> List[str]:
def gen_experience(
space: Space, task_desc: Optional[str] = None
) -> Tuple[List[str], List[str]]:
"""
Generate experience content from space and optional task description.
@@ -417,8 +424,7 @@ def gen_experience(space: Space, task_desc: Optional[str] = None) -> List[str]:
for solution in query:
if solution.task_id not in examples:
examples[solution.task_id] = [solution.desc]
if len(examples[solution.task_id]) <= TOP_K:
examples[solution.task_id].append(
f"Configuration {len(examples[solution.task_id])}: {solution.demo}"
)
return ["\n".join(e) for e in examples.values()]
examples[solution.task_id].append(
f"Configuration {len(examples[solution.task_id])}: {solution.demo}"
)
return list(examples.keys()), ["\n".join(e) for e in examples.values()]
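
Two things change here: the per-task ranking now uses ROW_NUMBER() with a `rnk <= TOP_K` filter pushed into the SQL query (instead of trimming in Python), and gen_experience() now returns a pair of task ids and formatted example blocks. A minimal consumption sketch (the `space` argument is assumed to be an existing Space record; the task description is invented):

```python
# Sketch of consuming the new gen_experience() return value (a tuple),
# mirroring how knowledge.py unpacks it later in this commit.
from mlcopilot.experience import gen_experience


def show_examples(space) -> None:
    retrieved_tasks, examples = gen_experience(
        space, task_desc="a small tabular binary-classification dataset"
    )
    for task_id, example in zip(retrieved_tasks, examples):
        print(f"--- {task_id} ---")
        # Each block is the task description followed by up to TOP_K
        # "Configuration i: ..." demo lines.
        print(example)
```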
98 changes: 82 additions & 16 deletions mlcopilot/knowledge.py
@@ -1,15 +1,17 @@
import random
import re
from typing import Any, Callable, Dict, List, Optional

import orjson
from langchain.prompts import FewShotPromptTemplate, PromptTemplate
from langchain.prompts.example_selector import LengthBasedExampleSelector

from mlcopilot.constants import *
from mlcopilot.constants import RELAX_TOKEN, TOKEN_COMPLETION_LIMIT, TOKEN_LIMIT, TOP_K
from mlcopilot.experience import gen_experience
from mlcopilot.orm import Knowledge, Solution, Space, Task, database_proxy
from mlcopilot.surrogate_utils import evaluate_configs
from mlcopilot.utils import get_llm, parse_configs
from mlcopilot.utils import get_llm, get_token_count_func, parse_configs

prefix_sep = "__DUMM_SEP__"

@@ -28,6 +30,12 @@ def gen_knowledge_candidate(examples: List[str]) -> str:
str
The generated knowledge candidate.
"""
prefix_token = get_token_count_func()(
"Here are some tasks along with best hyper-parameter configurations to train a model on them.\n"
)
suffix_token = get_token_count_func()(
"\nQ: From the examples above, what patterns can we observe about the relationship between dataset characteristics and the best hyper-parameter configurations? (Answer MUST be concise, critical, point-by-point, line-by-line, and brief. Only include relevant observations without unnecessary elaboration.)\n\nA: 1."
)
example_prompt = PromptTemplate(
input_variables=["input"],
template="{input}",
@@ -36,6 +44,12 @@ def gen_knowledge_candidate(examples: List[str]) -> str:
example_selector = LengthBasedExampleSelector(
examples=[{"input": example} for example in examples],
example_prompt=example_prompt,
max_length=TOKEN_LIMIT
- prefix_token
- suffix_token
- TOKEN_COMPLETION_LIMIT
- RELAX_TOKEN,
get_text_length=get_token_count_func(),
)

dynamic_prompt = FewShotPromptTemplate(
@@ -76,6 +90,18 @@ def suggest_with_knowledge(
List[Dict[str, Any]]
The list of suggested configurations.
"""
prefix_token = get_token_count_func()(
"Here are some tasks along with best hyper-parameter configurations to train a model on them.\n"
)
suffix_token = get_token_count_func()(
"\nGuidelines:{knowledge}\n\n\nBased on the examples and guidelines above, recommend {TOP_K} hyper-parameter configurations for a new classification dataset.\n\n{output}".format(
knowledge=knowledge,
TOP_K=str(TOP_K),
output=(
valid_example[: valid_example.index("\nConfiguration 1:")] + "\n\n"
),
)
)
example_prompt = PromptTemplate(
input_variables=["input"],
template="{input}",
@@ -84,6 +110,12 @@ def suggest_with_knowledge(
example_selector = LengthBasedExampleSelector(
examples=[{"input": example} for example in examples],
example_prompt=example_prompt,
max_length=TOKEN_LIMIT
- prefix_token
- suffix_token
- TOKEN_COMPLETION_LIMIT
- RELAX_TOKEN,
get_text_length=get_token_count_func(),
)

dynamic_prompt = FewShotPromptTemplate(
@@ -117,7 +149,7 @@ def suggest_with_knowledge(

def post_validation(
space: Space, surrogate_fn: Callable, config_names: List[str]
) -> str:
) -> List[str]:
"""
Post validation to generate knowledge.
@@ -132,17 +164,17 @@ def post_validation(
Returns
-------
str
The generated knowledge.
List[str]
The list of generated knowledge.
"""
knowledge = get_knowledge(space.space_id)
if knowledge is not None:
knowledges = get_knowledge(space)
if knowledges != "":
print("Knowledge already exists.")
return knowledge
return knowledges
quantile_infos = orjson.loads(space.quantile_info)
examples = gen_experience(space)
retrieved_tasks, examples = gen_experience(space)
best_score = float("-inf")
knowledge = None
knowledges = None
for _ in range(3):
random.shuffle(examples)
knowledge_candidate = gen_knowledge_candidate(examples)
@@ -168,15 +200,49 @@ def post_validation(
score += _score
if best_score < score:
best_score = score
knowledge = knowledge_candidate
assert knowledge is not None, "Knowledge is not generated."
knowledges = knowledge_candidate
assert knowledges is not None, "Knowledge is not generated."

return knowledge
knowledges = split_knowledge(knowledges)
return knowledges


def get_knowledge(space: Space):
def get_knowledge(space: Space, task=None):
try:
knowledge = Knowledge.get(Knowledge.space_id == space.space_id).knowledge
return knowledge
knowledges = Knowledge.select().where(
(Knowledge.space_id == space.space_id)
& ((Knowledge.task == task) | (Knowledge.task == None))
)
knowledge_str = ""
for i, knowledge in enumerate(knowledges):
knowledge_str += f"{i+1}. {knowledge.knowledge}\n\n"
return knowledge_str
except:
return None
return ""


def split_knowledge(knowledges: str) -> List[str]:
"""
Split a numbered knowledge string into a list of knowledge items.
Parameters
----------
knowledges: str
The knowledge.
Returns
-------
List[str]
The list of knowledge.
Examples
--------
>>> split_knowledge("1. A\n2. B\n3. C\n")
["A", "B", "C"]
"""
return [
k.strip()
for k in re.findall(
r"\n\d+\.([\s\S]+?)(?=\n+\d+\.)", "\n" + knowledges + "\n999."
)
]
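
The LengthBasedExampleSelector changes in this file all implement the same token budget: few-shot examples may only consume what remains of TOKEN_LIMIT after the prompt prefix, the suffix, the expected completion, and the RELAX_TOKEN safety margin. Below is a self-contained sketch of that arithmetic, using a crude character-based stand-in for get_token_count_func() (the real counter lives in mlcopilot/utils.py and is not shown in this diff):

```python
# Standalone sketch of the token-budgeting pattern used above; the token
# counter here is a rough stand-in, not the project's real implementation.
from langchain.prompts import PromptTemplate
from langchain.prompts.example_selector import LengthBasedExampleSelector

TOKEN_LIMIT = 4096            # model context window (constants.py)
TOKEN_COMPLETION_LIMIT = 800  # tokens reserved for the model's answer
RELAX_TOKEN = 500             # extra safety margin


def count_tokens(text: str) -> int:
    # Crude approximation: ~4 characters per token.
    return max(1, len(text) // 4)


prefix = "Here are some tasks along with best hyper-parameter configurations to train a model on them.\n"
suffix = "\nGuidelines: ...\n\nRecommend 3 hyper-parameter configurations for a new classification dataset.\n"

selector = LengthBasedExampleSelector(
    examples=[{"input": f"Task {i}: ...\nConfiguration 1: ..."} for i in range(50)],
    example_prompt=PromptTemplate(input_variables=["input"], template="{input}"),
    max_length=TOKEN_LIMIT
    - count_tokens(prefix)
    - count_tokens(suffix)
    - TOKEN_COMPLETION_LIMIT
    - RELAX_TOKEN,
    get_text_length=count_tokens,
)
# Only as many examples as fit in the remaining budget are kept.
print(len(selector.select_examples({"input": ""})))
```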