Merge pull request ucbepic#143 from ucbepic/optimizerui
feat: add optimizer in the UI
shreyashankar authored Nov 3, 2024
2 parents afddb5f + 7ff7d64 commit 169b601
Showing 27 changed files with 1,135 additions and 506 deletions.
14 changes: 11 additions & 3 deletions README.md
@@ -33,17 +33,25 @@ DocETL is the ideal choice when you're looking to maximize correctness and outpu

## Installation

See the documentation for installing from PyPI.
You can install DocETL using either PyPI or from source. We recommend installing from source for the latest features and bug fixes.

### Prerequisites

Before installing DocETL, ensure you have Python 3.10 or later installed on your system. You can check your Python version by running:

```bash
python --version
```

### Install from PyPI

```bash
pip install docetl
```

### Installation Steps (from Source)
### Install from Source

1. Clone the DocETL repository:
1. Clone the DocETL repository (or your fork):

```bash
git clone https://github.com/ucbepic/docetl.git
```
34 changes: 15 additions & 19 deletions docetl/builder.py
@@ -83,7 +83,6 @@ class Optimizer:
def __init__(
self,
runner: "DSLRunner",
max_threads: Optional[int] = None,
model: str = "gpt-4o",
resume: bool = False,
timeout: int = 60,
@@ -980,6 +979,9 @@ def _get_sample_data(
return self._get_reduce_sample(
data, op_config.get("reduce_key"), sample_size
)

if not self.config.get("optimizer_config", {}).get("random_sample", False):
return data[:sample_size]

# Take the random 500 examples or all if less than 500
initial_data = random.sample(data, min(500, len(data)))
@@ -1038,7 +1040,13 @@ def _get_reduce_sample(
group_sample_size = int(sample_size * group_proportion)

# Sample from the group
group_sample = random.sample(items, min(group_sample_size, len(items)))
if not self.config.get("optimizer_config", {}).get("random_sample", False):
group_sample = items[:group_sample_size]
else:
group_sample = random.sample(
items, min(group_sample_size, len(items))
)

sample.extend(group_sample)

# If we haven't reached the desired sample size, add more items randomly
@@ -1051,22 +1059,10 @@
]
additional_sample = random.sample(
remaining_items,
min(sample_size - len(sample), len(remaining_items)),
)
sample.extend(additional_sample)

# Add items randomly from non-top groups to meet the sample size
if len(sample) < sample_size:
remaining_items = [
item
for _, items in grouped_data.items()
for item in items
if item not in sample
]
additional_sample = random.sample(
remaining_items,
min(sample_size - len(sample), len(remaining_items)),
)
min(
sample_size - len(sample), len(remaining_items)
),
) if self.config.get("optimizer_config", {}).get("random_sample", False) else remaining_items[:sample_size - len(sample)]
sample.extend(additional_sample)

# Create a histogram of group sizes
@@ -1201,7 +1197,7 @@ def _optimize_equijoin(
if map_operation["optimize"]:
dataset_to_transform_sample = random.sample(
dataset_to_transform, self.sample_size_map.get("map")
)
) if self.config.get("optimizer_config", {}).get("random_sample", False) else dataset_to_transform[:self.sample_size_map.get("map")]
optimized_map_operations = self._optimize_map(
map_operation, dataset_to_transform_sample
)
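The recurring change in this file is a new `optimizer_config.random_sample` flag: every sampling site now falls back to a deterministic prefix of the data unless the flag is explicitly enabled, which makes optimizer runs reproducible. A minimal standalone sketch of the gate (the helper name and config shape are illustrative, not the DocETL API):

```python
import random
from typing import Any, Dict, List

def take_sample(
    data: List[Dict[str, Any]],
    sample_size: int,
    config: Dict[str, Any],
) -> List[Dict[str, Any]]:
    """Deterministic prefix by default; random sampling only when
    optimizer_config.random_sample is explicitly set to true."""
    k = min(sample_size, len(data))
    if config.get("optimizer_config", {}).get("random_sample", False):
        return random.sample(data, k)
    return data[:k]

# With the flag unset, repeated optimizer runs see the same rows.
pipeline_config = {"optimizer_config": {"random_sample": False}}
docs = [{"id": i} for i in range(10)]
print(take_sample(docs, 3, pipeline_config))  # [{'id': 0}, {'id': 1}, {'id': 2}]
```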
46 changes: 44 additions & 2 deletions docetl/console.py
@@ -1,10 +1,11 @@
import os
from typing import Any, Optional
import time
from typing import Any, Optional, Tuple
from rich.console import Console
from io import StringIO
import threading
import queue

from docetl.utils import StageType, get_stage_description

class ThreadSafeConsole(Console):
def __init__(self, *args, **kwargs):
@@ -13,6 +14,47 @@ def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.input_event = threading.Event()
self.input_value = None
self.optimizer_statuses = []
self.optimizer_rationale = None

def status(
self,
status: "RenderableType",
*,
spinner: str = "dots",
spinner_style: "StyleType" = "status.spinner",
speed: float = 1.0,
refresh_per_second: float = 12.5,
) -> "Status":
from rich.status import Status

status_renderable = Status(
status,
console=None,
spinner=spinner,
spinner_style=spinner_style,
speed=speed,
refresh_per_second=refresh_per_second,
)
return status_renderable

def post_optimizer_rationale(self, should_optimize: bool, rationale: str, validator_prompt: str):
self.optimizer_rationale = (should_optimize, rationale, validator_prompt)

def post_optimizer_status(self, stage: StageType):
self.optimizer_statuses.append((stage, time.time()))

def get_optimizer_progress(self) -> Tuple[str, float]:
if len(self.optimizer_statuses) == 0:
return ("Optimization starting...", 0)

if len(self.optimizer_statuses) > 0 and self.optimizer_statuses[-1][0] == StageType.END:
return (get_stage_description(StageType.END), 1)

num_stages = len(StageType) - 1
num_completed = len([s for s in self.optimizer_statuses if s[1]]) - 1
current_stage = self.optimizer_statuses[-1][0]
return (get_stage_description(current_stage), num_completed / num_stages)

def print(self, *args, **kwargs):
super().print(*args, **kwargs)
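`get_optimizer_progress` is what lets the UI turn the posted stage events into a label and a completion fraction. A self-contained sketch of the same computation, using a stand-in enum (the real `StageType` and `get_stage_description` live in `docetl.utils`, and the member set here is assumed from the stages referenced elsewhere in this commit):

```python
import time
from enum import Enum
from typing import List, Tuple

class StageType(Enum):
    # Hypothetical stand-in for docetl.utils.StageType.
    SAMPLE_RUN = "Running the operation on sample data"
    SHOULD_OPTIMIZE = "Assessing whether to optimize"
    CANDIDATE_PLANS = "Generating candidate plans"
    EVALUATION_RESULTS = "Evaluating candidate plans"
    END = "Optimization complete"

def get_optimizer_progress(
    statuses: List[Tuple[StageType, float]]
) -> Tuple[str, float]:
    """Map recorded (stage, timestamp) events to a label and 0..1 fraction."""
    if not statuses:
        return ("Optimization starting...", 0.0)
    if statuses[-1][0] == StageType.END:
        return (StageType.END.value, 1.0)
    num_stages = len(StageType) - 1    # END marks completion, not a working stage
    num_completed = len(statuses) - 1  # stages entered before the current one
    return (statuses[-1][0].value, num_completed / num_stages)

events = [(StageType.SAMPLE_RUN, time.time()),
          (StageType.SHOULD_OPTIMIZE, time.time())]
print(get_optimizer_progress(events))  # ('Assessing whether to optimize', 0.25)
```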
2 changes: 2 additions & 0 deletions docetl/operations/base.py
@@ -38,6 +38,7 @@ def __init__(
max_threads: int,
console: Optional[Console] = None,
status: Optional[Status] = None,
is_build: bool = False,
**kwargs,
):
"""
Expand All @@ -62,6 +63,7 @@ def __init__(
self.num_retries_on_validate_failure = self.config.get(
"num_retries_on_validate_failure", 0
)
self.is_build = is_build
self.syntax_check()

# This must be overridden in a subclass
2 changes: 1 addition & 1 deletion docetl/optimizers/join_optimizer.py
@@ -11,7 +11,7 @@

from docetl.operations.equijoin import EquijoinOperation
from docetl.operations.resolve import ResolveOperation
from docetl.utils import completion_cost, extract_jinja_variables
from docetl.utils import completion_cost, extract_jinja_variables, StageType


class JoinOptimizer:
5 changes: 2 additions & 3 deletions docetl/optimizers/map_optimizer/config_generators.py
@@ -252,9 +252,8 @@ def _check_metadata_necessity(
Determine if metadata is needed to perform the subtask.
Consider:
1. Does the subtask require information that might be present in metadata?
2. Is the sample chunk or full input missing any crucial information that could be in metadata?
3. Would having metadata significantly improve the performance or accuracy of the subtask?
1. Does the input sample have any structural metadata that might be relevant to the subtask?
2. Is the sample chunk or full input missing any crucial information that could be in this metadata?
Provide your response in the following format:
"""
29 changes: 25 additions & 4 deletions docetl/optimizers/map_optimizer/evaluator.py
@@ -267,7 +267,7 @@ def _assess_operation(
# Extract input variables from the prompt
variables_in_prompt = extract_jinja_variables(op_config["prompt"])
variables_in_prompt = [v.replace("input.", "") for v in variables_in_prompt]
input_sample = input_data[:2]
input_sample = input_data[:3]
output_sample = [
next(
(
Expand All @@ -291,7 +291,7 @@ def _assess_operation(
)
available_tokens = (
model_input_context_length - prompt_tokens - 100
) // 4 # 100 token buffer, divide by 4 for each sample
) // 6 # 100 token buffer, divide by 6 for each sample

# Prepare and truncate sample data
input_1 = truncate_sample_data(
@@ -336,22 +336,43 @@
{json.dumps({"input": input_2, "output": output_2}, indent=2)}
"""

if len(input_sample) > 2:
input_3 = truncate_sample_data(
{key: input_sample[2].get(key, "N/A") for key in variables_in_prompt},
available_tokens,
[variables_in_prompt],
self.llm_client.model,
)
output_3 = truncate_sample_data(
{key: output_sample[2].get(key, "N/A") for key in output_schema.keys()},
available_tokens,
[list(output_schema.keys())],
self.llm_client.model,
)
prompt += f"""
---Pair 3---
{json.dumps({"input": input_3, "output": output_3}, indent=2)}
"""

prompt += f"""
Custom Validator Prompt:
{validator_prompt}
Based on the above information, please assess the operation's performance. Provide your assessment in the following format:
Based on the above information, please assess the operation's performance.
If it needs improvement, provide specific examples in your assessment.
Be very detailed in your reasons for improvements, if any.
Provide your assessment in the following format:
"""

parameters = {
"type": "object",
"properties": {
"needs_improvement": {"type": "boolean"},
"reasons": {"type": "array", "items": {"type": "string"}},
"improvements": {
"type": "array",
"items": {"type": "string"},
},
"needs_improvement": {"type": "boolean"},
},
"required": ["needs_improvement", "reasons", "improvements"],
}
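The divisor change above pairs with the move from two to three sample pairs: the model's leftover context is now split six ways, one share for the input and one for the output of each pair. A worked instance with hypothetical numbers:

```python
# Hypothetical numbers, for illustration only.
model_input_context_length = 128_000  # context window of the judge model
prompt_tokens = 2_000                 # tokens already consumed by the prompt

# 100-token safety buffer, then an equal share for each of the
# 6 truncated blobs: (input, output) x 3 sample pairs.
available_tokens = (model_input_context_length - prompt_tokens - 100) // 6
print(available_tokens)  # 20983
```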
12 changes: 12 additions & 0 deletions docetl/optimizers/map_optimizer/optimizer.py
@@ -133,6 +133,7 @@ def optimize(
The cost is the cost of the optimizer (from possibly synthesizing resolves).
"""
self.console.post_optimizer_status(StageType.SAMPLE_RUN)
input_data = copy.deepcopy(input_data)
# Add id to each input_data
for i in range(len(input_data)):
@@ -184,7 +185,9 @@
},
)


# Generate custom validator prompt
self.console.post_optimizer_status(StageType.SHOULD_OPTIMIZE)
validator_prompt = self.prompt_generator._generate_validator_prompt(
op_config, input_data, output_data
)
@@ -218,6 +221,11 @@
"improvements": assessment.get("improvements", []),
},
)
self.console.post_optimizer_rationale(
assessment.get("needs_improvement", True),
"\n".join(assessment.get("reasons", [])),
validator_prompt
)

# Check if improvement is needed based on the assessment
if not data_exceeds_limit and not assessment.get("needs_improvement", True):
@@ -237,6 +245,7 @@
candidate_plans["no_change"] = [op_config]

# Generate chunk size plans
self.console.post_optimizer_status(StageType.CANDIDATE_PLANS)
self.console.log("[bold magenta]Generating chunking plans...[/bold magenta]")
chunk_size_plans = self.plan_generator._generate_chunk_size_plans(
op_config, input_data, validator_prompt, model_input_context_length
@@ -290,6 +299,7 @@
output=candidate_plans,
)

self.console.post_optimizer_status(StageType.EVALUATION_RESULTS)
self.console.log(
f"[bold magenta]Evaluating {len(plans_list)} plans...[/bold magenta]"
)
@@ -349,6 +359,7 @@

# Check if there are no top plans
if len(top_plans) == 0:
self.console.post_optimizer_status(StageType.END)
raise ValueError(
"Agent did not generate any plans. Unable to proceed with optimization. Try again."
)
@@ -422,6 +433,7 @@
},
)

self.console.post_optimizer_status(StageType.END)
return (
candidate_plans[best_plan_name],
best_output,
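Stitching the new `post_optimizer_status` calls together, `optimize()` now reports a fixed stage sequence that the console (and hence the UI) can translate into progress. A sketch of the order, as read from this diff:

```python
# Stage order posted by MapOptimizer.optimize in this commit.
# END is also posted early on the no-plans error path.
OPTIMIZE_STAGES = [
    "SAMPLE_RUN",          # run the original op on the sample data
    "SHOULD_OPTIMIZE",     # synthesize a validator prompt and assess output
    "CANDIDATE_PLANS",     # generate chunking and other candidate plans
    "EVALUATION_RESULTS",  # compare the candidate plans
    "END",                 # best plan chosen and returned
]
```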
5 changes: 0 additions & 5 deletions docetl/optimizers/map_optimizer/plan_generators.py
@@ -774,16 +774,11 @@ def _generate_chain_plans(
Note:
- This method is most effective when the original task has multiple output keys
with dependencies between them.
- If the output schema has only one key, an empty dictionary is returned as
chain decomposition is not necessary.
- The method uses the LLM to generate the chain of subtasks, ensuring that
all output keys from the original task are covered.
"""

output_schema = op_config["output"]["schema"]
if len(output_schema) <= 1:
return {} # No need for chain decomposition if there's only one output key

variables_in_prompt = extract_jinja_variables(op_config["prompt"])
variables_in_prompt = [v.replace("input.", "") for v in variables_in_prompt]

30 changes: 28 additions & 2 deletions docetl/optimizers/map_optimizer/prompt_generators.py
@@ -192,7 +192,7 @@ def _get_header_extraction_prompt(

header_extraction_prompt = f"""Analyze the following chunk of a document and extract any headers you see.
{{ input.{split_key}_chunk }}
{{{{ input.{split_key}_chunk }}}}
Examples of headers and their levels based on the document structure:
{chr(10).join(header_examples)}
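The change above is an f-string escaping fix rather than a template change: inside an f-string, `{{` collapses to a single literal `{`, so emitting the Jinja2 delimiters `{{ ... }}` requires quadrupled braces. A quick illustration:

```python
split_key = "content"

# Old version: the f-string collapses {{ }} to single braces, yielding
# "{ input.content_chunk }" -- not a valid Jinja2 variable.
broken = f"{{ input.{split_key}_chunk }}"

# Fixed version: quadrupled braces survive as the Jinja2 delimiters.
fixed = f"{{{{ input.{split_key}_chunk }}}}"

print(broken)  # { input.content_chunk }
print(fixed)   # {{ input.content_chunk }}
```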
@@ -331,15 +331,41 @@ def _get_combine_prompt(
{sample_inputs}
Modify the original prompt to be a prompt that will combine these chunk results to accomplish the original task.
This prompt will be submitted to an LLM, so it must be a valid Jinja2 template, with natural language instructions.
Guidelines for your prompt template:
- The only variable you are allowed to use is the `inputs` variable, which contains all chunk results. Each value is a dictionary with the keys {', '.join(schema_keys)}
- Avoid using filters or complex logic, even though Jinja technically supports it
- Avoid using filters or complex logic like `do` statements, even though Jinja technically supports it
- The prompt template must be a valid Jinja2 template
- You must use the {{{{ inputs }}}} variable somehow, in a for loop. You must access specific keys in each item in the loop.
- The prompt template must also contain natural language instructions so the LLM knows what to do with the data
Provide your prompt template as a single string.
"""
# Add example for combining themes
base_prompt += """
Example of a good combine prompt for combining themes:
```
You are tasked with combining themes extracted from different chunks of text.
Here are the themes extracted from each chunk:
{% for item in inputs %}
Themes for chunk {loop.index}:
{{ item.themes }}
{% endfor %}
Analyze all the themes above and create a consolidated list that:
1. Combines similar or related themes
2. Preserves unique themes that appear in only one chunk
3. Prioritizes themes that appear multiple times across chunks
4. Maintains the original wording where possible
Provide the final consolidated list of themes, ensuring each theme is distinct and meaningful.
```
Now generate a combine prompt for the current task.
"""

parameters = {
"type": "object",
"properties": {"combine_prompt": {"type": "string"}},
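As a sanity check on those guidelines, a combine template of the recommended shape renders with plain Jinja2. A sketch, where `themes` stands in for whatever keys the operation's output schema defines:

```python
from jinja2 import Template

combine_template = Template(
    "You are combining themes extracted from chunks of one document.\n"
    "{% for item in inputs %}"
    "Themes for chunk {{ loop.index }}:\n{{ item.themes }}\n"
    "{% endfor %}"
    "Consolidate the themes above into a single list of distinct themes."
)

chunk_results = [
    {"themes": "pricing, onboarding friction"},
    {"themes": "onboarding friction, support latency"},
]
print(combine_template.render(inputs=chunk_results))
```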
[Diffs for the remaining changed files are not shown.]