HuggingFace support and June updates (#2)
* Improve SQL select experience implementation

* support pgvector

* Feature: Add huggingface model/hyperparameters space suggestion

* Improve packaging

* Minor fix

* Generalize prompt template and knowledge

* Minor fix
zhanglei1172 authored Jul 4, 2023
1 parent b3c0e6a commit e432607
Showing 18 changed files with 421 additions and 301 deletions.
8 changes: 4 additions & 4 deletions .github/workflows/ci.yml
@@ -2,9 +2,9 @@ name: Python CI

on:
push:
branches: [ demo_orphan ]
branches: [ main, dev ]
pull_request_target:
branches: [ demo_orphan ]
branches: [ main, dev ]

concurrency:
group: ${{ format('ci-{0}', github.head_ref && format('pr-{0}', github.event.pull_request.number) || github.sha) }}
@@ -32,7 +32,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
pip install -e .[dev]
- name: Lint with flake8
run: flake8
@@ -61,7 +61,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
pip install -e .[dev]
- name: Test with pytest
run: |
pytest
3 changes: 1 addition & 2 deletions README.md
@@ -20,8 +20,7 @@ MLCopilot is a tool to help you find the best models/hyperparameters for your tas
0. Clone this repo: `git clone REPO_URL; cd mlcopilot`
1. Put assets/mlcopilot.db in your home directory: `cp assets/mlcopilot.db ~/.mlcopilot/mlcopilot.db`
2. Install Python 3.8 or higher
3. Build: `hatch build`. (May need to install [hatch](https://hatch.pypa.io/latest/install/) first)
4. Install: `pip install ./dist/*.whl`
3. Install: `pip install .`. If you want to develop, use `pip install -e .[dev]` instead.

### Run

Binary file modified assets/mlcopilot.db
Binary file not shown.
16 changes: 14 additions & 2 deletions mlcopilot/.env.template
@@ -3,5 +3,17 @@
OPENAI_API_KEY=your-openai-api-key

### DB
## MLCOPILOT_DB_PATH - Path to database file (Example: ~/.mlcopilot/mlcopilot.db)
MLCOPILOT_DB_PATH=~/.mlcopilot/mlcopilot.db
## MLCOPILOT_DB_BACKEND - Database backend (Example: sqlite)
MLCOPILOT_DB_BACKEND=sqlite
## MLCOPILOT_DB_PATH - Path to database file (Example: ~/.mlcopilot/mlcopilot.db) - Only for sqlite
MLCOPILOT_DB_PATH=~/.mlcopilot/mlcopilot.db
## MLCOPILOT_DB_NAME - Database name (Example: mlcopilot)
MLCOPILOT_DB_NAME=mlcopilot
## MLCOPILOT_DB_HOST - Database host (Example: localhost)
MLCOPILOT_DB_HOST=localhost
## MLCOPILOT_DB_PORT - Database port (Example: 5432)
MLCOPILOT_DB_PORT=5432
## MLCOPILOT_DB_USER - Database user (Example: postgres)
MLCOPILOT_DB_USER=postgres
## MLCOPILOT_DB_PASSWORD - Database password (Example: '')
MLCOPILOT_DB_PASSWORD=''
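
These settings back the new pgvector/PostgreSQL support. As a rough sketch only (the actual initialization lives in mlcopilot/orm.py, which is not shown in this commit excerpt, so the function name and exact peewee calls below are assumptions), a backend switch driven by these variables could look like:

```python
# Hypothetical sketch, not part of this commit: selecting a peewee backend
# from the MLCOPILOT_DB_* settings above.
from peewee import PostgresqlDatabase, SqliteDatabase

from mlcopilot.constants import (
    MLCOPILOT_DB_BACKEND,
    MLCOPILOT_DB_HOST,
    MLCOPILOT_DB_NAME,
    MLCOPILOT_DB_PASSWORD,
    MLCOPILOT_DB_PATH,
    MLCOPILOT_DB_PORT,
    MLCOPILOT_DB_USER,
)


def init_database():
    """Return a peewee database handle for the configured backend."""
    if MLCOPILOT_DB_BACKEND == "sqlite":
        # Single-file database; only MLCOPILOT_DB_PATH is used.
        return SqliteDatabase(str(MLCOPILOT_DB_PATH))
    # PostgreSQL backend; the pgvector extension must be installed in this
    # database for the embedding-similarity queries used in experience.py.
    return PostgresqlDatabase(
        MLCOPILOT_DB_NAME,
        host=MLCOPILOT_DB_HOST,
        port=int(MLCOPILOT_DB_PORT),
        user=MLCOPILOT_DB_USER,
        password=MLCOPILOT_DB_PASSWORD,
    )
```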
36 changes: 36 additions & 0 deletions mlcopilot/constants.py
@@ -8,16 +8,38 @@
"bin_map",
"inverse_bin_map",
"q_num",
"MLCOPILOT_DB_BACKEND",
"MLCOPILOT_DB_NAME",
"MLCOPILOT_DB_HOST",
"MLCOPILOT_DB_PORT",
"MLCOPILOT_DB_USER",
"MLCOPILOT_DB_PASSWORD",
"PROMPT_FORMATS",
"DEFAULT_PROMPT_PREFIX",
"DEFAULT_PROMPT_SUFFIX",
"TOKEN_LIMIT",
"TOKEN_COMPLETION_LIMIT",
"RELAX_TOKEN",
]

TOP_K = 3
EMBED_DIM = 1536
TOKEN_LIMIT = 4096
TOKEN_COMPLETION_LIMIT = 800
RELAX_TOKEN = 500  # RELAX_TOKEN is the number of tokens reserved to avoid exceeding the token limit

MLCOPILOT_DB_BACKEND = os.environ.get("MLCOPILOT_DB_BACKEND", "sqlite")

MLCOPILOT_DB_PATH = Path(
os.environ.get("MLCOPILOT_DB_PATH", Path.home() / ".mlcopilot" / "mlcopilot.db")
).expanduser()

MLCOPILOT_DB_NAME = os.environ.get("MLCOPILOT_DB_NAME", "mlcopilot")
MLCOPILOT_DB_HOST = os.environ.get("MLCOPILOT_DB_HOST", "localhost")
MLCOPILOT_DB_PORT = os.environ.get("MLCOPILOT_DB_PORT", 5432)
MLCOPILOT_DB_USER = os.environ.get("MLCOPILOT_DB_USER", "postgres")
MLCOPILOT_DB_PASSWORD = os.environ.get("MLCOPILOT_DB_PASSWORD", "")

bin_map = {
0.1: "very small",
0.3: "small",
@@ -46,3 +68,17 @@
)

q_num = sorted(list(bin_map.keys()))

PROMPT_FORMATS = {
"TOP_K",
"knowledge",
"space_desc",
"new_task_desc",
}

DEFAULT_PROMPT_PREFIX = """{space_desc}\nRecommend best configurations to train a model for a new task. Format strictly follows this template: ```Configuration 1: {{parameter_1_name}} is {{parameter_1_value}}. {{parameter_2_name}} is {{parameter_2_value}}...{{parameter_n_name}} is {{parameter_n_value}}.
Configuration 2: {{parameter_1_name}} is {{parameter_1_value}}. {{parameter_2_name}} is {{parameter_2_value}}...{{parameter_n_name}} is {{parameter_n_value}}.
Configuration 3: {{parameter_1_name}} is {{parameter_1_value}}. {{parameter_2_name}} is {{parameter_2_value}}...{{parameter_n_name}} is {{parameter_n_value}}.
```\nHere are some tasks along with best hyper-parameter configurations to train a model on them.\n"""

DEFAULT_PROMPT_SUFFIX = """\nGuidelines:{knowledge}\n\n\nBased on the examples(if provided) and guidelines(if provided) above, recommend {TOP_K} hyper-parameter configurations for a new classification dataset.\n\n{new_task_desc}"""
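
PROMPT_FORMATS lists the only placeholders the two templates accept; the doubled braces in the prefix survive formatting as literal single braces, acting as the output template the LLM is asked to follow. A minimal illustration (all values below are invented for the example):

```python
# Illustrative only: filling the new prompt templates with placeholder values.
from mlcopilot.constants import DEFAULT_PROMPT_PREFIX, DEFAULT_PROMPT_SUFFIX

prefix = DEFAULT_PROMPT_PREFIX.format(space_desc="Space: XGBoost on tabular data.")
suffix = DEFAULT_PROMPT_SUFFIX.format(
    knowledge="\n1. Prefer smaller learning rates on larger datasets.",
    TOP_K=3,
    new_task_desc="Task: 10k rows, 20 numeric features, binary label.",
)
# The retrieved example blocks from gen_experience() go between the two parts.
prompt = prefix + "<example tasks and configurations>" + suffix
print(prompt)
```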
34 changes: 20 additions & 14 deletions mlcopilot/experience.py
@@ -1,7 +1,7 @@
from __future__ import annotations

from collections import OrderedDict
from typing import Any, Dict, List, Optional
from typing import Any, Dict, List, Optional, Tuple

import langchain
import numpy as np
@@ -330,13 +330,14 @@ def _get_best_relevant_solutions(space: Space, task_desc: str) -> ModelSelect:
The best relevant solution.
"""
SolutionAlias = Solution.alias()
order_key = Task.embedding.cosine_distance(task_desc)
subquery = (
SolutionAlias.select(
SolutionAlias.demo,
Task.task_id,
Task.desc,
Task.embedding,
fn.RANK()
fn.ROW_NUMBER()
.over(
partition_by=[SolutionAlias.space, SolutionAlias.task],
order_by=[SolutionAlias.metric.desc()],
@@ -345,11 +346,13 @@ def _get_best_relevant_solutions(space: Space, task_desc: str) -> ModelSelect:
)
.where(SolutionAlias.space == space)
.join(Task, on=(SolutionAlias.task == Task.task_id))
.order_by(fn.cosine_similarity(task_desc, Task.embedding).desc())
.order_by(order_key)
.alias("subq")
)
query = Solution.select(subquery.c.task_id, subquery.c.demo, subquery.c.desc).from_(
subquery
query = (
Solution.select(subquery.c.task_id, subquery.c.demo, subquery.c.desc)
.from_(subquery)
.where(subquery.c.rnk <= TOP_K)
)
return query

@@ -375,7 +378,7 @@ def _get_best_solutions(space: Space) -> ModelSelect:
Task.task_id,
Task.desc,
Task.embedding,
fn.RANK()
fn.ROW_NUMBER()
.over(
partition_by=[SolutionAlias.space, SolutionAlias.task],
order_by=[SolutionAlias.metric.desc()],
@@ -386,13 +389,17 @@
.join(Task, on=(SolutionAlias.task == Task.task_id))
.alias("subq")
)
query = Solution.select(subquery.c.task_id, subquery.c.demo, subquery.c.desc).from_(
subquery
query = (
Solution.select(subquery.c.task_id, subquery.c.demo, subquery.c.desc)
.from_(subquery)
.where(subquery.c.rnk <= TOP_K)
)
return query


def gen_experience(space: Space, task_desc: Optional[str] = None) -> List[str]:
def gen_experience(
space: Space, task_desc: Optional[str] = None
) -> Tuple[List[str], List[str]]:
"""
Generate experience content from space and optional task description.
@@ -417,8 +424,7 @@ def gen_experience(space: Space, task_desc: Optional[str] = None) -> List[str]:
for solution in query:
if solution.task_id not in examples:
examples[solution.task_id] = [solution.desc]
if len(examples[solution.task_id]) <= TOP_K:
examples[solution.task_id].append(
f"Configuration {len(examples[solution.task_id])}: {solution.demo}"
)
return ["\n".join(e) for e in examples.values()]
examples[solution.task_id].append(
f"Configuration {len(examples[solution.task_id])}: {solution.demo}"
)
return list(examples.keys()), ["\n".join(e) for e in examples.values()]
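
Two things change here: the per-task ranking now uses ROW_NUMBER() with a `rnk <= TOP_K` filter pushed into the SQL query (instead of trimming in Python), and gen_experience() now returns a pair of task ids and formatted example blocks. A minimal consumption sketch (the `space` argument is assumed to be an existing Space record; the task description is invented):

```python
# Sketch of consuming the new gen_experience() return value (a tuple),
# mirroring how knowledge.py unpacks it later in this commit.
from mlcopilot.experience import gen_experience


def show_examples(space) -> None:
    retrieved_tasks, examples = gen_experience(
        space, task_desc="a small tabular binary-classification dataset"
    )
    for task_id, example in zip(retrieved_tasks, examples):
        print(f"--- {task_id} ---")
        # Each block is the task description followed by up to TOP_K
        # "Configuration i: ..." demo lines.
        print(example)
```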
98 changes: 82 additions & 16 deletions mlcopilot/knowledge.py
@@ -1,15 +1,17 @@
import random
import re
from typing import Any, Callable, Dict, List, Optional

import orjson
from langchain.prompts import FewShotPromptTemplate, PromptTemplate
from langchain.prompts.example_selector import LengthBasedExampleSelector

from mlcopilot.constants import *
from mlcopilot.constants import RELAX_TOKEN, TOKEN_COMPLETION_LIMIT, TOKEN_LIMIT, TOP_K
from mlcopilot.experience import gen_experience
from mlcopilot.orm import Knowledge, Solution, Space, Task, database_proxy
from mlcopilot.surrogate_utils import evaluate_configs
from mlcopilot.utils import get_llm, parse_configs
from mlcopilot.utils import get_llm, get_token_count_func, parse_configs

prefix_sep = "__DUMM_SEP__"

@@ -28,6 +30,12 @@ def gen_knowledge_candidate(examples: List[str]) -> str:
str
The generated knowledge candidate.
"""
prefix_token = get_token_count_func()(
"Here are some tasks along with best hyper-parameter configurations to train a model on them.\n"
)
suffix_token = get_token_count_func()(
"\nQ: From the examples above, what patterns can we observe about the relationship between dataset characteristics and the best hyper-parameter configurations? (Answer MUST be concise, critical, point-by-point, line-by-line, and brief. Only include relevant observations without unnecessary elaboration.)\n\nA: 1."
)
example_prompt = PromptTemplate(
input_variables=["input"],
template="{input}",
@@ -36,6 +44,12 @@ def gen_knowledge_candidate(examples: List[str]) -> str:
example_selector = LengthBasedExampleSelector(
examples=[{"input": example} for example in examples],
example_prompt=example_prompt,
max_length=TOKEN_LIMIT
- prefix_token
- suffix_token
- TOKEN_COMPLETION_LIMIT
- RELAX_TOKEN,
get_text_length=get_token_count_func(),
)

dynamic_prompt = FewShotPromptTemplate(
@@ -76,6 +90,18 @@ def suggest_with_knowledge(
List[Dict[str, Any]]
The list of suggested configurations.
"""
prefix_token = get_token_count_func()(
"Here are some tasks along with best hyper-parameter configurations to train a model on them.\n"
)
suffix_token = get_token_count_func()(
"\nGuidelines:{knowledge}\n\n\nBased on the examples and guidelines above, recommend {TOP_K} hyper-parameter configurations for a new classification dataset.\n\n{output}".format(
knowledge=knowledge,
TOP_K=str(TOP_K),
output=(
valid_example[: valid_example.index("\nConfiguration 1:")] + "\n\n"
),
)
)
example_prompt = PromptTemplate(
input_variables=["input"],
template="{input}",
@@ -84,6 +110,12 @@ def suggest_with_knowledge(
example_selector = LengthBasedExampleSelector(
examples=[{"input": example} for example in examples],
example_prompt=example_prompt,
max_length=TOKEN_LIMIT
- prefix_token
- suffix_token
- TOKEN_COMPLETION_LIMIT
- RELAX_TOKEN,
get_text_length=get_token_count_func(),
)

dynamic_prompt = FewShotPromptTemplate(
@@ -117,7 +149,7 @@ def suggest_with_knowledge(

def post_validation(
space: Space, surrogate_fn: Callable, config_names: List[str]
) -> str:
) -> List[str]:
"""
Post validation to generate knowledge.
@@ -132,17 +164,17 @@ def post_validation(
Returns
-------
str
The generated knowledge.
List[str]
The list of generated knowledge.
"""
knowledge = get_knowledge(space.space_id)
if knowledge is not None:
knowledges = get_knowledge(space)
if knowledges != "":
print("Knowledge already exists.")
return knowledge
return knowledges
quantile_infos = orjson.loads(space.quantile_info)
examples = gen_experience(space)
retrieved_tasks, examples = gen_experience(space)
best_score = float("-inf")
knowledge = None
knowledges = None
for _ in range(3):
random.shuffle(examples)
knowledge_candidate = gen_knowledge_candidate(examples)
@@ -168,15 +200,49 @@ def post_validation(
score += _score
if best_score < score:
best_score = score
knowledge = knowledge_candidate
assert knowledge is not None, "Knowledge is not generated."
knowledges = knowledge_candidate
assert knowledges is not None, "Knowledge is not generated."

return knowledge
knowledges = split_knowledge(knowledges)
return knowledges


def get_knowledge(space: Space):
def get_knowledge(space: Space, task=None):
try:
knowledge = Knowledge.get(Knowledge.space_id == space.space_id).knowledge
return knowledge
knowledges = Knowledge.select().where(
(Knowledge.space_id == space.space_id)
& ((Knowledge.task == task) | (Knowledge.task == None))
)
knowledge_str = ""
for i, knowledge in enumerate(knowledges):
knowledge_str += f"{i+1}. {knowledge.knowledge}\n\n"
return knowledge_str
except:
return None
return ""


def split_knowledge(knowledges: str) -> List[str]:
"""
Split a numbered knowledge string into a list of knowledge items.
Parameters
----------
knowledges: str
The knowledge.
Returns
-------
List[str]
The list of knowledge.
Examples
--------
>>> split_knowledge("1. A\n2. B\n3. C\n")
["A", "B", "C"]
"""
return [
k.strip()
for k in re.findall(
r"\n\d+\.([\s\S]+?)(?=\n+\d+\.)", "\n" + knowledges + "\n999."
)
]
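
The LengthBasedExampleSelector changes in this file all implement the same token budget: few-shot examples may only consume what remains of TOKEN_LIMIT after the prompt prefix, the suffix, the expected completion, and the RELAX_TOKEN safety margin. Below is a self-contained sketch of that arithmetic, using a crude character-based stand-in for get_token_count_func() (the real counter lives in mlcopilot/utils.py and is not shown in this diff):

```python
# Standalone sketch of the token-budgeting pattern used above; the token
# counter here is a rough stand-in, not the project's real implementation.
from langchain.prompts import PromptTemplate
from langchain.prompts.example_selector import LengthBasedExampleSelector

TOKEN_LIMIT = 4096            # model context window (constants.py)
TOKEN_COMPLETION_LIMIT = 800  # tokens reserved for the model's answer
RELAX_TOKEN = 500             # extra safety margin


def count_tokens(text: str) -> int:
    # Crude approximation: ~4 characters per token.
    return max(1, len(text) // 4)


prefix = "Here are some tasks along with best hyper-parameter configurations to train a model on them.\n"
suffix = "\nGuidelines: ...\n\nRecommend 3 hyper-parameter configurations for a new classification dataset.\n"

selector = LengthBasedExampleSelector(
    examples=[{"input": f"Task {i}: ...\nConfiguration 1: ..."} for i in range(50)],
    example_prompt=PromptTemplate(input_variables=["input"], template="{input}"),
    max_length=TOKEN_LIMIT
    - count_tokens(prefix)
    - count_tokens(suffix)
    - TOKEN_COMPLETION_LIMIT
    - RELAX_TOKEN,
    get_text_length=count_tokens,
)
# Only as many examples as fit in the remaining budget are kept.
print(len(selector.select_examples({"input": ""})))
```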