Merge pull request ucbepic#143 from ucbepic/optimizerui
feat: add optimizer in the UI
shreyashankar authored Nov 3, 2024
2 parents afddb5f + 7ff7d64 commit 169b601
Showing 27 changed files with 1,135 additions and 506 deletions.
14 changes: 11 additions & 3 deletions README.md
@@ -33,17 +33,25 @@ DocETL is the ideal choice when you're looking to maximize correctness and outpu

## Installation

See the documentation for installing from PyPI.
You can install DocETL using either PyPI or from source. We recommend installing from source for the latest features and bug fixes.

### Prerequisites

Before installing DocETL, ensure you have Python 3.10 or later installed on your system. You can check your Python version by running:

```bash
python --version
```

### Install from PyPI

```bash
pip install docetl
```

### Installation Steps (from Source)
### Install from Source

1. Clone the DocETL repository:
1. Clone the DocETL repository (or your fork):

```bash
git clone https://github.com/ucbepic/docetl.git
```
34 changes: 15 additions & 19 deletions docetl/builder.py
@@ -83,7 +83,6 @@ class Optimizer:
def __init__(
self,
runner: "DSLRunner",
max_threads: Optional[int] = None,
model: str = "gpt-4o",
resume: bool = False,
timeout: int = 60,
@@ -980,6 +979,9 @@ def _get_sample_data(
return self._get_reduce_sample(
data, op_config.get("reduce_key"), sample_size
)

if not self.config.get("optimizer_config", {}).get("random_sample", False):
return data[:sample_size]

# Take the random 500 examples or all if less than 500
initial_data = random.sample(data, min(500, len(data)))
@@ -1038,7 +1040,13 @@ def _get_reduce_sample(
group_sample_size = int(sample_size * group_proportion)

# Sample from the group
group_sample = random.sample(items, min(group_sample_size, len(items)))
if not self.config.get("optimizer_config", {}).get("random_sample", False):
group_sample = items[:group_sample_size]
else:
group_sample = random.sample(
items, min(group_sample_size, len(items))
)

sample.extend(group_sample)

# If we haven't reached the desired sample size, add more items randomly
@@ -1051,22 +1059,10 @@
]
additional_sample = random.sample(
remaining_items,
min(sample_size - len(sample), len(remaining_items)),
)
sample.extend(additional_sample)

# Add items randomly from non-top groups to meet the sample size
if len(sample) < sample_size:
remaining_items = [
item
for _, items in grouped_data.items()
for item in items
if item not in sample
]
additional_sample = random.sample(
remaining_items,
min(sample_size - len(sample), len(remaining_items)),
)
min(
sample_size - len(sample), len(remaining_items)
),
) if self.config.get("optimizer_config", {}).get("random_sample", False) else remaining_items[:sample_size - len(sample)]
sample.extend(additional_sample)

# Create a histogram of group sizes
@@ -1201,7 +1197,7 @@ def _optimize_equijoin(
if map_operation["optimize"]:
dataset_to_transform_sample = random.sample(
dataset_to_transform, self.sample_size_map.get("map")
)
) if self.config.get("optimizer_config", {}).get("random_sample", False) else dataset_to_transform[:self.sample_size_map.get("map")]
optimized_map_operations = self._optimize_map(
map_operation, dataset_to_transform_sample
)
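The recurring change in this file is a new `optimizer_config.random_sample` flag: every sampling site now falls back to a deterministic prefix of the data unless the flag is explicitly enabled, which makes optimizer runs reproducible. A minimal standalone sketch of the gate (the helper name and config shape are illustrative, not the DocETL API):

```python
import random
from typing import Any, Dict, List

def take_sample(
    data: List[Dict[str, Any]],
    sample_size: int,
    config: Dict[str, Any],
) -> List[Dict[str, Any]]:
    """Deterministic prefix by default; random sampling only when
    optimizer_config.random_sample is explicitly set to true."""
    k = min(sample_size, len(data))
    if config.get("optimizer_config", {}).get("random_sample", False):
        return random.sample(data, k)
    return data[:k]

# With the flag unset, repeated optimizer runs see the same rows.
pipeline_config = {"optimizer_config": {"random_sample": False}}
docs = [{"id": i} for i in range(10)]
print(take_sample(docs, 3, pipeline_config))  # [{'id': 0}, {'id': 1}, {'id': 2}]
```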
46 changes: 44 additions & 2 deletions docetl/console.py
@@ -1,10 +1,11 @@
import os
from typing import Any, Optional
import time
from typing import Any, Optional, Tuple
from rich.console import Console
from io import StringIO
import threading
import queue

from docetl.utils import StageType, get_stage_description

class ThreadSafeConsole(Console):
def __init__(self, *args, **kwargs):
@@ -13,6 +14,47 @@ def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.input_event = threading.Event()
self.input_value = None
self.optimizer_statuses = []
self.optimizer_rationale = None

def status(
self,
status: "RenderableType",
*,
spinner: str = "dots",
spinner_style: "StyleType" = "status.spinner",
speed: float = 1.0,
refresh_per_second: float = 12.5,
) -> "Status":
from rich.status import Status

status_renderable = Status(
status,
console=None,
spinner=spinner,
spinner_style=spinner_style,
speed=speed,
refresh_per_second=refresh_per_second,
)
return status_renderable

def post_optimizer_rationale(self, should_optimize: bool, rationale: str, validator_prompt: str):
self.optimizer_rationale = (should_optimize, rationale, validator_prompt)

def post_optimizer_status(self, stage: StageType):
self.optimizer_statuses.append((stage, time.time()))

def get_optimizer_progress(self) -> Tuple[str, float]:
if len(self.optimizer_statuses) == 0:
return ("Optimization starting...", 0)

if len(self.optimizer_statuses) > 0 and self.optimizer_statuses[-1][0] == StageType.END:
return (get_stage_description(StageType.END), 1)

num_stages = len(StageType) - 1
num_completed = len([s for s in self.optimizer_statuses if s[1]]) - 1
current_stage = self.optimizer_statuses[-1][0]
return (get_stage_description(current_stage), num_completed / num_stages)

def print(self, *args, **kwargs):
super().print(*args, **kwargs)
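`get_optimizer_progress` is what lets the UI turn the posted stage events into a label and a completion fraction. A self-contained sketch of the same computation, using a stand-in enum (the real `StageType` and `get_stage_description` live in `docetl.utils`, and the member set here is assumed from the stages referenced elsewhere in this commit):

```python
import time
from enum import Enum
from typing import List, Tuple

class StageType(Enum):
    # Hypothetical stand-in for docetl.utils.StageType.
    SAMPLE_RUN = "Running the operation on sample data"
    SHOULD_OPTIMIZE = "Assessing whether to optimize"
    CANDIDATE_PLANS = "Generating candidate plans"
    EVALUATION_RESULTS = "Evaluating candidate plans"
    END = "Optimization complete"

def get_optimizer_progress(
    statuses: List[Tuple[StageType, float]]
) -> Tuple[str, float]:
    """Map recorded (stage, timestamp) events to a label and 0..1 fraction."""
    if not statuses:
        return ("Optimization starting...", 0.0)
    if statuses[-1][0] == StageType.END:
        return (StageType.END.value, 1.0)
    num_stages = len(StageType) - 1    # END marks completion, not a working stage
    num_completed = len(statuses) - 1  # stages entered before the current one
    return (statuses[-1][0].value, num_completed / num_stages)

events = [(StageType.SAMPLE_RUN, time.time()),
          (StageType.SHOULD_OPTIMIZE, time.time())]
print(get_optimizer_progress(events))  # ('Assessing whether to optimize', 0.25)
```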
2 changes: 2 additions & 0 deletions docetl/operations/base.py
@@ -38,6 +38,7 @@ def __init__(
max_threads: int,
console: Optional[Console] = None,
status: Optional[Status] = None,
is_build: bool = False,
**kwargs,
):
"""
Expand All @@ -62,6 +63,7 @@ def __init__(
self.num_retries_on_validate_failure = self.config.get(
"num_retries_on_validate_failure", 0
)
self.is_build = is_build
self.syntax_check()

# This must be overridden in a subclass
2 changes: 1 addition & 1 deletion docetl/optimizers/join_optimizer.py
@@ -11,7 +11,7 @@

from docetl.operations.equijoin import EquijoinOperation
from docetl.operations.resolve import ResolveOperation
from docetl.utils import completion_cost, extract_jinja_variables
from docetl.utils import completion_cost, extract_jinja_variables, StageType


class JoinOptimizer:
5 changes: 2 additions & 3 deletions docetl/optimizers/map_optimizer/config_generators.py
@@ -252,9 +252,8 @@ def _check_metadata_necessity(
Determine if metadata is needed to perform the subtask.
Consider:
1. Does the subtask require information that might be present in metadata?
2. Is the sample chunk or full input missing any crucial information that could be in metadata?
3. Would having metadata significantly improve the performance or accuracy of the subtask?
1. Does the input sample have any structural metadata that might be relevant to the subtask?
2. Is the sample chunk or full input missing any crucial information that could be in this metadata?
Provide your response in the following format:
"""
29 changes: 25 additions & 4 deletions docetl/optimizers/map_optimizer/evaluator.py
@@ -267,7 +267,7 @@ def _assess_operation(
# Extract input variables from the prompt
variables_in_prompt = extract_jinja_variables(op_config["prompt"])
variables_in_prompt = [v.replace("input.", "") for v in variables_in_prompt]
input_sample = input_data[:2]
input_sample = input_data[:3]
output_sample = [
next(
(
Expand All @@ -291,7 +291,7 @@ def _assess_operation(
)
available_tokens = (
model_input_context_length - prompt_tokens - 100
) // 4 # 100 token buffer, divide by 4 for each sample
) // 6 # 100 token buffer, divide by 6 for each sample

# Prepare and truncate sample data
input_1 = truncate_sample_data(
@@ -336,22 +336,43 @@
{json.dumps({"input": input_2, "output": output_2}, indent=2)}
"""

if len(input_sample) > 2:
input_3 = truncate_sample_data(
{key: input_sample[2].get(key, "N/A") for key in variables_in_prompt},
available_tokens,
[variables_in_prompt],
self.llm_client.model,
)
output_3 = truncate_sample_data(
{key: output_sample[2].get(key, "N/A") for key in output_schema.keys()},
available_tokens,
[list(output_schema.keys())],
self.llm_client.model,
)
prompt += f"""
---Pair 3---
{json.dumps({"input": input_3, "output": output_3}, indent=2)}
"""

prompt += f"""
Custom Validator Prompt:
{validator_prompt}
Based on the above information, please assess the operation's performance. Provide your assessment in the following format:
Based on the above information, please assess the operation's performance.
If it needs improvement, provide specific examples in your assessment.
Be very detailed in your reasons for improvements, if any.
Provide your assessment in the following format:
"""

parameters = {
"type": "object",
"properties": {
"needs_improvement": {"type": "boolean"},
"reasons": {"type": "array", "items": {"type": "string"}},
"improvements": {
"type": "array",
"items": {"type": "string"},
},
"needs_improvement": {"type": "boolean"},
},
"required": ["needs_improvement", "reasons", "improvements"],
}
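The divisor change above pairs with the move from two to three sample pairs: the model's leftover context is now split six ways, one share for the input and one for the output of each pair. A worked instance with hypothetical numbers:

```python
# Hypothetical numbers, for illustration only.
model_input_context_length = 128_000  # context window of the judge model
prompt_tokens = 2_000                 # tokens already consumed by the prompt

# 100-token safety buffer, then an equal share for each of the
# 6 truncated blobs: (input, output) x 3 sample pairs.
available_tokens = (model_input_context_length - prompt_tokens - 100) // 6
print(available_tokens)  # 20983
```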
12 changes: 12 additions & 0 deletions docetl/optimizers/map_optimizer/optimizer.py
@@ -133,6 +133,7 @@ def optimize(
The cost is the cost of the optimizer (from possibly synthesizing resolves).
"""
self.console.post_optimizer_status(StageType.SAMPLE_RUN)
input_data = copy.deepcopy(input_data)
# Add id to each input_data
for i in range(len(input_data)):
@@ -184,7 +185,9 @@
},
)


# Generate custom validator prompt
self.console.post_optimizer_status(StageType.SHOULD_OPTIMIZE)
validator_prompt = self.prompt_generator._generate_validator_prompt(
op_config, input_data, output_data
)
@@ -218,6 +221,11 @@
"improvements": assessment.get("improvements", []),
},
)
self.console.post_optimizer_rationale(
assessment.get("needs_improvement", True),
"\n".join(assessment.get("reasons", [])),
validator_prompt
)

# Check if improvement is needed based on the assessment
if not data_exceeds_limit and not assessment.get("needs_improvement", True):
@@ -237,6 +245,7 @@
candidate_plans["no_change"] = [op_config]

# Generate chunk size plans
self.console.post_optimizer_status(StageType.CANDIDATE_PLANS)
self.console.log("[bold magenta]Generating chunking plans...[/bold magenta]")
chunk_size_plans = self.plan_generator._generate_chunk_size_plans(
op_config, input_data, validator_prompt, model_input_context_length
@@ -290,6 +299,7 @@
output=candidate_plans,
)

self.console.post_optimizer_status(StageType.EVALUATION_RESULTS)
self.console.log(
f"[bold magenta]Evaluating {len(plans_list)} plans...[/bold magenta]"
)
@@ -349,6 +359,7 @@

# Check if there are no top plans
if len(top_plans) == 0:
self.console.post_optimizer_status(StageType.END)
raise ValueError(
"Agent did not generate any plans. Unable to proceed with optimization. Try again."
)
@@ -422,6 +433,7 @@
},
)

self.console.post_optimizer_status(StageType.END)
return (
candidate_plans[best_plan_name],
best_output,
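Stitching the new `post_optimizer_status` calls together, `optimize()` now reports a fixed stage sequence that the console (and hence the UI) can translate into progress. A sketch of the order, as read from this diff:

```python
# Stage order posted by MapOptimizer.optimize in this commit.
# END is also posted early on the no-plans error path.
OPTIMIZE_STAGES = [
    "SAMPLE_RUN",          # run the original op on the sample data
    "SHOULD_OPTIMIZE",     # synthesize a validator prompt and assess output
    "CANDIDATE_PLANS",     # generate chunking and other candidate plans
    "EVALUATION_RESULTS",  # compare the candidate plans
    "END",                 # best plan chosen and returned
]
```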
5 changes: 0 additions & 5 deletions docetl/optimizers/map_optimizer/plan_generators.py
@@ -774,16 +774,11 @@ def _generate_chain_plans(
Note:
- This method is most effective when the original task has multiple output keys
with dependencies between them.
- If the output schema has only one key, an empty dictionary is returned as
chain decomposition is not necessary.
- The method uses the LLM to generate the chain of subtasks, ensuring that
all output keys from the original task are covered.
"""

output_schema = op_config["output"]["schema"]
if len(output_schema) <= 1:
return {} # No need for chain decomposition if there's only one output key

variables_in_prompt = extract_jinja_variables(op_config["prompt"])
variables_in_prompt = [v.replace("input.", "") for v in variables_in_prompt]

30 changes: 28 additions & 2 deletions docetl/optimizers/map_optimizer/prompt_generators.py
@@ -192,7 +192,7 @@ def _get_header_extraction_prompt(

header_extraction_prompt = f"""Analyze the following chunk of a document and extract any headers you see.
{{ input.{split_key}_chunk }}
{{{{ input.{split_key}_chunk }}}}
Examples of headers and their levels based on the document structure:
{chr(10).join(header_examples)}
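The change above is an f-string escaping fix rather than a template change: inside an f-string, `{{` collapses to a single literal `{`, so emitting the Jinja2 delimiters `{{ ... }}` requires quadrupled braces. A quick illustration:

```python
split_key = "content"

# Old version: the f-string collapses {{ }} to single braces, yielding
# "{ input.content_chunk }" -- not a valid Jinja2 variable.
broken = f"{{ input.{split_key}_chunk }}"

# Fixed version: quadrupled braces survive as the Jinja2 delimiters.
fixed = f"{{{{ input.{split_key}_chunk }}}}"

print(broken)  # { input.content_chunk }
print(fixed)   # {{ input.content_chunk }}
```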
@@ -331,15 +331,41 @@ def _get_combine_prompt(
{sample_inputs}
Modify the original prompt to be a prompt that will combine these chunk results to accomplish the original task.
This prompt will be submitted to an LLM, so it must be a valid Jinja2 template, with natural language instructions.
Guidelines for your prompt template:
- The only variable you are allowed to use is the `inputs` variable, which contains all chunk results. Each value is a dictionary with the keys {', '.join(schema_keys)}
- Avoid using filters or complex logic, even though Jinja technically supports it
- Avoid using filters or complex logic like `do` statements, even though Jinja technically supports it
- The prompt template must be a valid Jinja2 template
- You must use the {{{{ inputs }}}} variable somehow, in a for loop. You must access specific keys in each item in the loop.
- The prompt template must also contain natural language instructions so the LLM knows what to do with the data
Provide your prompt template as a single string.
"""
# Add example for combining themes
base_prompt += """
Example of a good combine prompt for combining themes:
```
You are tasked with combining themes extracted from different chunks of text.
Here are the themes extracted from each chunk:
{% for item in inputs %}
Themes for chunk {loop.index}:
{{ item.themes }}
{% endfor %}
Analyze all the themes above and create a consolidated list that:
1. Combines similar or related themes
2. Preserves unique themes that appear in only one chunk
3. Prioritizes themes that appear multiple times across chunks
4. Maintains the original wording where possible
Provide the final consolidated list of themes, ensuring each theme is distinct and meaningful.
```
Now generate a combine prompt for the current task.
"""

parameters = {
"type": "object",
"properties": {"combine_prompt": {"type": "string"}},
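As a sanity check on those guidelines, a combine template of the recommended shape renders with plain Jinja2. A sketch, where `themes` stands in for whatever keys the operation's output schema defines:

```python
from jinja2 import Template

combine_template = Template(
    "You are combining themes extracted from chunks of one document.\n"
    "{% for item in inputs %}"
    "Themes for chunk {{ loop.index }}:\n{{ item.themes }}\n"
    "{% endfor %}"
    "Consolidate the themes above into a single list of distinct themes."
)

chunk_results = [
    {"themes": "pricing, onboarding friction"},
    {"themes": "onboarding friction, support latency"},
]
print(combine_template.render(inputs=chunk_results))
```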
[Diffs for the remaining changed files are not shown.]