Skip to content
This repository has been archived by the owner on Feb 12, 2024. It is now read-only.

Commit

Permalink
cleanup generation pipeline
Browse files Browse the repository at this point in the history
  • Loading branch information
emrgnt-cmplxty committed Sep 30, 2023
1 parent 917ee71 commit 6d45fc0
Show file tree
Hide file tree
Showing 8 changed files with 402 additions and 252 deletions.
51 changes: 24 additions & 27 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,57 +53,54 @@ poetry install -E <extra_name>

## Usage

### Dataset Generation
### Library of Phi Generation

To help with the Library of Phi generation, you can run the following commands below:
The Library of Phi is a project sponsored by SciPhi which aims to democratize access to high-quality textbooks. The Library is automatically generated by a pipeline which goes from Grounded Syllabi -> Table of Contents -> Textbooks. This multi-step process involves scraping the MIT OCW website, processing the scraped data, generating configuration YAMLs, and performing RAG over Wikipedia.

To contribute to the Library of Phi generation, run the commands below:

#### Step 1: Scraping the MIT OCW Website

This step involves scraping the MIT OCW website to obtain course details like course number, title, URL, resource level, page contents, and more.
Scrape the MIT OCW website to obtain course details like course number, title, URL, resource level, page contents, etc., for each course.

```bash
poetry run python sciphi/examples/library_of_phi/raw_data/ocw_scraper.py scrape
# Optional arguments
# --input_file_name=ocw_courses.html \
# --output_file_name=scraped_ocw.jsonl \
# --sleep_time=1 \
# --log_level=INFO
```

#### Step 2: Processing the Scraped Data to Generate YAML
#### Step 2: Process the scraped data to Generate 'Draft' syllabi YAMLs:

Converts the scraped data into 'draft' syllabi in YAML format with the help of a specified LLM (defaults to OpenAI).

```bash
poetry run python sciphi/examples/library_of_phi/gen_step_1_draft_syllabi.py run
```

#### Step 3: Process the draft YAML into the final YAML syllabi:

This step converts the scraped data into a YAML rough draft with the help of the specified LLM.
Cleans the drafted syllabi and saves them into ready-to-run YAML outputs. TODO - Add LLM support for failing drafts.

```bash
poetry run python sciphi/examples/library_of_phi/yaml_step_1.py run
# Optional arguments
# --output_dir=my_output_directory \
# --input_filename=my_input_file.jsonl \
# --log_level=DEBUG \
# --provider_name=openai \
# --model_name=gpt-4
poetry run python sciphi/examples/library_of_phi/gen_step_2_clean_syllabi.py
```

#### Step 3: Process the draft YAML to generate the final YAML
#### Step 4: Convert the syllabi into a table of contents

This step converts the drafted YAML into a final YAML output

```bash
poetry run python sciphi/examples/library_of_phi/yaml_step_2.py
# Optional arguments
# --input_rel_dir="yaml_step_1" \
# --output_rel_dir="yaml_step_2" \
# --data_directory=None \
# --log_level=DEBUG \
poetry run python sciphi/examples/library_of_phi/gen_table_of_contents.py

```

For broad application, you can use SciPhi for dataset generation by executing the relevant `runner.py` file with various command-line arguments.
### Customizable Runner Script

For customizable applications, you can use SciPhi for dataset generation by executing the relevant `runner.py` file with various command-line arguments.

```bash
poetry run python sciphi/examples/basic_data_gen/runner.py --provider_name=openai --model_name=gpt-4 --log_level=INFO --batch_size=1 --num_samples=1 --output_file_name=example_output.jsonl --example_config=textbooks_are_all_you_need_basic_split
```

### Key Command-Line Arguments
### Command-Line Arguments

- `--provider`: Which provider to use for completions (default: "openai").
- `--model_name`: The name of the model to load from the provider (default: "gpt-3.5-turbo").
Expand Down Expand Up @@ -131,7 +128,7 @@ The code snippet below shows how to use SciPhi to generate synthetic data for a
# Build an LLM and provider interface
llm_config = LLMConfigManager.get_config_for_provider(
provider_name
).create(**build_llm_config(args))
).create(**gen_llm_config(args))
llm_provider = InterfaceManager.get_provider(
provider_name,
model_name,
Expand Down
4 changes: 2 additions & 2 deletions sciphi/examples/basic_data_gen/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
get_root_dir,
)
from sciphi.examples.helpers import (
build_llm_config,
gen_llm_config,
parse_arguments,
prep_for_file_path,
)
Expand Down Expand Up @@ -96,7 +96,7 @@ def generate_random_hash() -> str:
# Build an LLM and provider interface
llm_config = LLMConfigManager.get_config_for_provider(
provider_name
).create(**build_llm_config(args))
).create(**gen_llm_config(args))
llm_provider = InterfaceManager.get_provider(
provider_name,
model_name,
Expand Down
110 changes: 109 additions & 1 deletion sciphi/examples/helpers.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
"""Helper functions to be used across example scripts"""
import argparse
import json
import logging
import os

import yaml

from sciphi.interface import ProviderName


def build_llm_config(args: argparse.Namespace) -> dict:
def gen_llm_config(args: argparse.Namespace) -> dict:
"""Constructs the LLM config based on provided arguments."""

config_args = {
Expand Down Expand Up @@ -243,3 +247,107 @@ def parse_arguments() -> argparse.Namespace:
help="Directory to write generated output to.",
)
return parser.parse_args()


def setup_logging(log_level: str = "INFO") -> logging.Logger:
    """Initialise basic logging and return this module's logger.

    The textual *log_level* (e.g. "debug", "INFO") is upper-cased before
    being handed to ``logging.basicConfig``.
    """
    level_name = log_level.upper()
    logging.basicConfig(level=level_name)
    return logging.getLogger(__name__)


def load_yaml(yml_file_path: str) -> dict:
    """Read *yml_file_path* and return its parsed YAML content.

    Parsing uses ``yaml.safe_load``, so only plain YAML tags are accepted —
    arbitrary-object tags from untrusted files are rejected.
    """
    with open(yml_file_path, "r", encoding="utf-8") as yml_file:
        raw_text = yml_file.read()
    return yaml.safe_load(raw_text)


def save_yaml(content: object, filename: str) -> None:
    """Serialize *content* to *filename* as YAML.

    ``content`` may be any YAML-serializable object (the pipeline passes
    parsed dicts as well as strings); the previous ``str`` annotation was
    misleadingly narrow for what ``yaml.dump`` actually accepts.
    ``allow_unicode=True`` keeps non-ASCII characters readable instead of
    escaping them.
    """
    with open(filename, "w", encoding="utf-8") as file:
        yaml.dump(content, file, allow_unicode=True)


# Translation table for cleaning scraped/LLM text before YAML parsing: smart
# quotes become plain quotes, em-dashes and stray control bytes are dropped,
# and a few mis-decoded letters are normalised.  NOTE(review): the
# "é" -> "é" and "œ" -> "œ" entries are kept from the original mapping but
# are no-ops — presumably intended mojibake fixes; confirm and drop if so.
_YAML_CHAR_FIXES = str.maketrans(
    {
        "\u201C": '"',
        "\u201D": '"',
        "é": "é",
        "œ": "œ",
        "—": "",
        "Ω": "I",
        "Ï": "I",
        "\x83": "",
        "\x88": "",
        "\x89": "",
        "\x90": "",
        "\x91": "",
        "\x92": "",
        "\x9b": "",
        "\x8f": "",
    }
)


def prep_yaml_line(line: str) -> str:
    """Replace special characters in one line of YAML text.

    Character fixes are applied in a single ``str.translate`` pass instead of
    the former chain of per-character ``str.replace`` calls.  Afterwards,
    backslashes are doubled (so they survive a later YAML parse) and any
    remaining double quotes are converted to single quotes, exactly as before.
    """
    line = line.translate(_YAML_CHAR_FIXES)
    # Escape backslashes before the quote swap, matching the original order.
    line = line.replace("\\", "\\\\")
    return line.replace('"', "'")


def format_yaml_line(line: str, index: int, split_lines: list[str]) -> str:
    """Format a specific line in the YAML content.

    Bullet entries ("- item") get their text wrapped in double quotes, with a
    trailing ":" preserved — or appended when the following line opens a
    "subtopics:" block.  Line index 2 is quoted as a mapping key.

    :param line: a single (already character-cleaned) line of YAML.
    :param index: position of *line* within *split_lines*.
    :param split_lines: every line of the YAML body, used for lookahead.
    :return: the formatted line.
    """
    line_cut = line.find("- ")
    if line_cut != -1:
        # Quote the bullet text; if the line already ends with ":", keep the
        # colon outside the closing quote.
        end = len(line) if line[-1] != ":" else len(line) - 1
        line = (
            line[: line_cut + 2]
            + '"'
            + line[line_cut + 2 : end]
            + '"'
            + ("" if end == len(line) else ":")
        )
        # A bullet followed by a "subtopics:" line introduces a nested
        # mapping, so it needs a trailing colon of its own.
        if line[-1] != ":" and index != len(split_lines) - 1:
            if "subtopics:" in split_lines[index + 1]:
                line += ":"
    elif index == 2:
        # NOTE(review): index 2 appears to be the course-title line, and this
        # branch assumes it ends with ":" (the final character is sliced off
        # before '":' is re-appended) — confirm against the prompt format.
        first_non_blank_char = next(
            (i for i, char in enumerate(line) if char != " "), 0
        )
        if first_non_blank_char != 0:
            line = (
                line[:first_non_blank_char]
                + '"'
                + line[first_non_blank_char:-1]
                + '":'
            )
    return line


def ensure_directory_exists(directory_path: str) -> None:
    """Create *directory_path* (and any missing parents) if absent.

    Uses ``os.makedirs(..., exist_ok=True)`` instead of the original
    check-then-create pair, which was racy: another process could create the
    directory between ``os.path.exists`` and ``os.makedirs``, raising
    ``FileExistsError``.
    """
    os.makedirs(directory_path, exist_ok=True)


def prase_yaml_completion(yml_content: dict) -> str:
    """Extract and clean the fenced YAML payload from an LLM completion.

    NOTE(review): the name keeps the original "prase" typo so existing
    callers keep working; prefer the ``parse_yaml_completion`` alias below.

    :param yml_content: mapping with a "completion" key whose value should
        contain a fenced ```yaml ... ``` block.
    :return: the cleaned YAML text, one newline-terminated line per input
        line (empty string for an empty fenced block).
    :raises ValueError: if no ```yaml fence is present in the completion.
    """
    completion = yml_content.get("completion", "")
    if "```yaml" not in completion:
        raise ValueError("YML not found in completion")

    # Take the text between the opening ```yaml fence and the next ``` fence.
    yml_str = completion.split("```yaml")[1].split("```")[0]

    # Clean and re-quote each line.  "".join avoids the original quadratic
    # string += accumulation, and the per-call nested helper is gone.
    split_lines = yml_str.splitlines()
    return "".join(
        format_yaml_line(prep_yaml_line(raw_line), it, split_lines) + "\n"
        for it, raw_line in enumerate(split_lines)
    )


# Correctly spelled, backward-compatible alias for new call sites.
parse_yaml_completion = prase_yaml_completion
Original file line number Diff line number Diff line change
@@ -1,41 +1,42 @@
# type: ignore
"""
MIT OCW Course Data Scraper to YAML
MIT OCW Course Data Scraper to Syllabi 'Draft' YAML Generator
Description:
This script is designed to scrape course data from the MIT OpenCourseWare (OCW) website and
generate YAML files suitable for LLM (Language Learning Model).
generate input YAML files with the help of an LLM.
Usage:
Command-line interface:
$ python sciphi/examples/library_of_phi/yaml_step_1.py run \
--output_dir=my_output_directory \
--input_filename=my_input_file.jsonl \
$ python sciphi/examples/library_of_phi/gen_step_1_draft_syllabi.py run \
--output_rel_dir=my_output_directory \
--input_jsonl_filename=my_input_file.jsonl \
--log_level=DEBUG
Parameters:
output_dir (str):
The directory where the generated YAML files will be saved.
Default value is derived based on the relative path 'raw_data/yaml_step_1'.
input_filename (str):
The filename for the input JSONL file containing the scraped OCW data.
Default is 'scraped_ocw.jsonl'.
data_directory (Optional[str]):
The directory where the input JSONL file is located.
If not specified, the default location is the directory of this script combined with 'raw_data'.
log_level (str):
Logging level for the scraper. Can be one of: DEBUG, INFO, WARNING, ERROR, CRITICAL.
Default is 'INFO'.
provider (str):
The provider to use for LLM.
The provider to use for LLM completions.
Default is 'openai'.
model_name (str):
The model to use for LLM.
The model to use for LLM completions.
Default is 'gpt-4-0613'.
data_directory (Optional[str]):
The directory where the input and output data are stored.
If None, defaults to the directory of this script plus '/raw_data'.
output_rel_dir (str):
The relative directory within the data directory where the generated YAML files will be saved.
Default value is 'output_step_1'.
input_jsonl_filename (str):
The name of the input JSONL file containing course data scraped from MIT OCW.
Default value is 'scraped_ocw.jsonl'.
log_level (str):
Logging level for the scraper. Can be one of: DEBUG, INFO, WARNING, ERROR, CRITICAL.
Default is 'INFO'.
"""

import json
Expand Down Expand Up @@ -202,21 +203,19 @@ class OCWScraper:

def __init__(
self,
data_dir: str = "raw_data",
output_dir: str = "yaml_step_1",
input_filename: str = "scraped_ocw.jsonl",
data_directory: Optional[str] = None,
log_level: str = "INFO",
provider: str = "openai",
model_name: str = "gpt-4-0613",
data_directory: Optional[str] = None,
output_rel_dir: str = "output_step_1",
input_jsonl_filename: str = "scraped_ocw.jsonl",
prompt: str = TOPIC_CREATION_PROMPT,
log_level: str = "INFO",
):
self.data_dir = data_dir
self.rel_output_dir = os.path.join(data_dir, output_dir)
self.input_filename = input_filename
self.data_directory = data_directory
self.provider = provider
self.model_name = model_name
self.data_directory = data_directory
self.output_rel_dir = output_rel_dir
self.input_jsonl_filename = input_jsonl_filename
self.prompt = prompt
logging.basicConfig(level=getattr(logging, log_level.upper()))

Expand All @@ -231,17 +230,18 @@ def run(self):
llm_provider = InterfaceManager.get_provider(
provider_name, self.model_name, llm_config
)
file_path = os.path.dirname(os.path.abspath(__file__))
if not self.data_directory:
self.data_directory = os.path.join(file_path, self.data_dir)
file_path = os.path.dirname(os.path.abspath(__file__))
self.data_directory = os.path.join(file_path, "raw_data")

output_dir = os.path.join(file_path, self.rel_output_dir)
output_dir = os.path.join(self.data_directory, self.output_rel_dir)
if not os.path.exists(output_dir):
os.makedirs(output_dir)
logging.info(f"Saving data to output directory = {output_dir}")

# Get path to this file
# The input file path = data directory + input file name
input_file_path = os.path.join(
self.data_directory, self.input_filename
self.data_directory, self.input_jsonl_filename
)

with open(input_file_path, "r") as file:
Expand Down Expand Up @@ -270,7 +270,7 @@ def run(self):
logging.info(f"Skipping output file {dump_name}....")
continue

logging.info(f"Creating output at {dump_name}")
logging.info(f"Saving output file named {dump_name}.")
formatted_prompt = self.prompt.format(
course_name=course_name, context=context
)
Expand Down
Loading

0 comments on commit 6d45fc0

Please sign in to comment.