Skip to content
This repository has been archived by the owner on Feb 12, 2024. It is now read-only.

Commit

Permalink
cleanup generation pipeline
Browse files Browse the repository at this point in the history
  • Loading branch information
emrgnt-cmplxty committed Sep 30, 2023
1 parent 917ee71 commit 6d45fc0
Show file tree
Hide file tree
Showing 8 changed files with 402 additions and 252 deletions.
51 changes: 24 additions & 27 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,57 +53,54 @@ poetry install -E <extra_name>

## Usage

### Dataset Generation
### Library of Phi Generation

To help with the Library of Phi generation, you can run the following commands below:
The Library of Phi is a project sponsored by SciPhi which aims to democratize access to high-quality textbooks. The Library is automatically generated by a pipeline which goes from Grounded Syllabi -> Table of Contents -> Textbooks. This multi-step process involves scraping the MIT OCW website, processing the scraped data, generating configuration YAMLs, and performing RAG over Wikipedia.

To contribute to the Library of Phi generation, run the commands below:

#### Step 1: Scraping the MIT OCW Website

This step involves scraping the MIT OCW website to obtain course details like course number, title, URL, resource level, page contents, and more.
Scrape the MIT OCW website to obtain course details like course number, title, URL, resource level, page contents, etc., for each course.

```bash
poetry run python sciphi/examples/library_of_phi/raw_data/ocw_scraper.py scrape
# Optional arguments
# --input_file_name=ocw_courses.html \
# --output_file_name=scraped_ocw.jsonl \
# --sleep_time=1 \
# --log_level=INFO
```

#### Step 2: Processing the Scraped Data to Generate YAML
#### Step 2: Process the scraped data to Generate 'Draft' syllabi YAMLs:

Converts the scraped data into 'draft' syllabi in YAML format with the help of a specified LLM (defaults to OpenAI).

```bash
poetry run python sciphi/examples/library_of_phi/gen_step_1_draft_syllabi.py run
```

#### Step 3: Process the draft YAML into the final YAML syllabi:

This step converts the scraped data into a YAML rough draft with the help of the specified LLM.
Cleans the drafted syllabi and saves them into ready-to-run YAML outputs. TODO - Add LLM support for failing drafts.

```bash
poetry run python sciphi/examples/library_of_phi/yaml_step_1.py run
# Optional arguments
# --output_dir=my_output_directory \
# --input_filename=my_input_file.jsonl \
# --log_level=DEBUG \
# --provider_name=openai \
# --model_name=gpt-4
poetry run python sciphi/examples/library_of_phi/gen_step_2_clean_syllabi.py
```

#### Step 3: Process the draft YAML to generate the final YAML
#### Step 4: Convert the syllabi into a table of contents

This step converts the drafted YAML into a final YAML output

```bash
poetry run python sciphi/examples/library_of_phi/yaml_step_2.py
# Optional arguments
# --input_rel_dir="yaml_step_1" \
# --output_rel_dir="yaml_step_2" \
# --data_directory=None \
# --log_level=DEBUG \
poetry run python sciphi/examples/library_of_phi/gen_table_of_contents.py

```

For broad application, you can use SciPhi for dataset generation by executing the relevant `runner.py` file with various command-line arguments.
### Customizable Runner Script

For customizable applications, you can use SciPhi for dataset generation by executing the relevant `runner.py` file with various command-line arguments.

```bash
poetry run python sciphi/examples/basic_data_gen/runner.py --provider_name=openai --model_name=gpt-4 --log_level=INFO --batch_size=1 --num_samples=1 --output_file_name=example_output.jsonl --example_config=textbooks_are_all_you_need_basic_split
```

### Key Command-Line Arguments
### Command-Line Arguments

- `--provider`: Which provider to use for completions (default: "openai").
- `--model_name`: The name of the model to load from the provider (default: "gpt-3.5-turbo").
Expand Down Expand Up @@ -131,7 +128,7 @@ The code snippet below shows how to use SciPhi to generate synthetic data for a
# Build an LLM and provider interface
llm_config = LLMConfigManager.get_config_for_provider(
provider_name
).create(**build_llm_config(args))
).create(**gen_llm_config(args))
llm_provider = InterfaceManager.get_provider(
provider_name,
model_name,
Expand Down
4 changes: 2 additions & 2 deletions sciphi/examples/basic_data_gen/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
get_root_dir,
)
from sciphi.examples.helpers import (
build_llm_config,
gen_llm_config,
parse_arguments,
prep_for_file_path,
)
Expand Down Expand Up @@ -96,7 +96,7 @@ def generate_random_hash() -> str:
# Build an LLM and provider interface
llm_config = LLMConfigManager.get_config_for_provider(
provider_name
).create(**build_llm_config(args))
).create(**gen_llm_config(args))
llm_provider = InterfaceManager.get_provider(
provider_name,
model_name,
Expand Down
110 changes: 109 additions & 1 deletion sciphi/examples/helpers.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
"""Helper functions to be used across example scripts"""
import argparse
import json
import logging
import os

import yaml

from sciphi.interface import ProviderName


def build_llm_config(args: argparse.Namespace) -> dict:
def gen_llm_config(args: argparse.Namespace) -> dict:
"""Constructs the LLM config based on provided arguments."""

config_args = {
Expand Down Expand Up @@ -243,3 +247,107 @@ def parse_arguments() -> argparse.Namespace:
help="Directory to write generated output to.",
)
return parser.parse_args()


def setup_logging(log_level: str = "INFO") -> logging.Logger:
    """Initialise basic logging and return this module's logger.

    The textual *log_level* (e.g. "debug", "INFO") is upper-cased before
    being handed to ``logging.basicConfig``.
    """
    level_name = log_level.upper()
    logging.basicConfig(level=level_name)
    return logging.getLogger(__name__)


def load_yaml(yml_file_path: str) -> dict:
    """Read *yml_file_path* and return its parsed YAML content.

    Parsing uses ``yaml.safe_load``, so only plain YAML tags are accepted —
    arbitrary-object tags from untrusted files are rejected.
    """
    with open(yml_file_path, "r", encoding="utf-8") as yml_file:
        raw_text = yml_file.read()
    return yaml.safe_load(raw_text)


def save_yaml(content: object, filename: str) -> None:
    """Serialize *content* to *filename* as YAML.

    ``content`` may be any YAML-serializable object (the pipeline passes
    parsed dicts as well as strings); the previous ``str`` annotation was
    misleadingly narrow for what ``yaml.dump`` actually accepts.
    ``allow_unicode=True`` keeps non-ASCII characters readable instead of
    escaping them.
    """
    with open(filename, "w", encoding="utf-8") as file:
        yaml.dump(content, file, allow_unicode=True)


# Translation table for cleaning scraped/LLM text before YAML parsing: smart
# quotes become plain quotes, em-dashes and stray control bytes are dropped,
# and a few mis-decoded letters are normalised.  NOTE(review): the
# "é" -> "é" and "œ" -> "œ" entries are kept from the original mapping but
# are no-ops — presumably intended mojibake fixes; confirm and drop if so.
_YAML_CHAR_FIXES = str.maketrans(
    {
        "\u201C": '"',
        "\u201D": '"',
        "é": "é",
        "œ": "œ",
        "—": "",
        "Ω": "I",
        "Ï": "I",
        "\x83": "",
        "\x88": "",
        "\x89": "",
        "\x90": "",
        "\x91": "",
        "\x92": "",
        "\x9b": "",
        "\x8f": "",
    }
)


def prep_yaml_line(line: str) -> str:
    """Replace special characters in one line of YAML text.

    Character fixes are applied in a single ``str.translate`` pass instead of
    the former chain of per-character ``str.replace`` calls.  Afterwards,
    backslashes are doubled (so they survive a later YAML parse) and any
    remaining double quotes are converted to single quotes, exactly as before.
    """
    line = line.translate(_YAML_CHAR_FIXES)
    # Escape backslashes before the quote swap, matching the original order.
    line = line.replace("\\", "\\\\")
    return line.replace('"', "'")


def format_yaml_line(line: str, index: int, split_lines: list[str]) -> str:
    """Format a specific line in the YAML content.

    Bullet entries ("- item") get their text wrapped in double quotes, with a
    trailing ":" preserved — or appended when the following line opens a
    "subtopics:" block.  Line index 2 is quoted as a mapping key.

    :param line: a single (already character-cleaned) line of YAML.
    :param index: position of *line* within *split_lines*.
    :param split_lines: every line of the YAML body, used for lookahead.
    :return: the formatted line.
    """
    line_cut = line.find("- ")
    if line_cut != -1:
        # Quote the bullet text; if the line already ends with ":", keep the
        # colon outside the closing quote.
        end = len(line) if line[-1] != ":" else len(line) - 1
        line = (
            line[: line_cut + 2]
            + '"'
            + line[line_cut + 2 : end]
            + '"'
            + ("" if end == len(line) else ":")
        )
        # A bullet followed by a "subtopics:" line introduces a nested
        # mapping, so it needs a trailing colon of its own.
        if line[-1] != ":" and index != len(split_lines) - 1:
            if "subtopics:" in split_lines[index + 1]:
                line += ":"
    elif index == 2:
        # NOTE(review): index 2 appears to be the course-title line, and this
        # branch assumes it ends with ":" (the final character is sliced off
        # before '":' is re-appended) — confirm against the prompt format.
        first_non_blank_char = next(
            (i for i, char in enumerate(line) if char != " "), 0
        )
        if first_non_blank_char != 0:
            line = (
                line[:first_non_blank_char]
                + '"'
                + line[first_non_blank_char:-1]
                + '":'
            )
    return line


def ensure_directory_exists(directory_path: str) -> None:
    """Create *directory_path* (and any missing parents) if absent.

    Uses ``os.makedirs(..., exist_ok=True)`` instead of the original
    check-then-create pair, which was racy: another process could create the
    directory between ``os.path.exists`` and ``os.makedirs``, raising
    ``FileExistsError``.
    """
    os.makedirs(directory_path, exist_ok=True)


def prase_yaml_completion(yml_content: dict) -> str:
    """Extract and clean the fenced YAML payload from an LLM completion.

    NOTE(review): the name keeps the original "prase" typo so existing
    callers keep working; prefer the ``parse_yaml_completion`` alias below.

    :param yml_content: mapping with a "completion" key whose value should
        contain a fenced ```yaml ... ``` block.
    :return: the cleaned YAML text, one newline-terminated line per input
        line (empty string for an empty fenced block).
    :raises ValueError: if no ```yaml fence is present in the completion.
    """
    completion = yml_content.get("completion", "")
    if "```yaml" not in completion:
        raise ValueError("YML not found in completion")

    # Take the text between the opening ```yaml fence and the next ``` fence.
    yml_str = completion.split("```yaml")[1].split("```")[0]

    # Clean and re-quote each line.  "".join avoids the original quadratic
    # string += accumulation, and the per-call nested helper is gone.
    split_lines = yml_str.splitlines()
    return "".join(
        format_yaml_line(prep_yaml_line(raw_line), it, split_lines) + "\n"
        for it, raw_line in enumerate(split_lines)
    )


# Correctly spelled, backward-compatible alias for new call sites.
parse_yaml_completion = prase_yaml_completion
Original file line number Diff line number Diff line change
@@ -1,41 +1,42 @@
# type: ignore
"""
MIT OCW Course Data Scraper to YAML
MIT OCW Course Data Scraper to Syllabi 'Draft' YAML Generator
Description:
This script is designed to scrape course data from the MIT OpenCourseWare (OCW) website and
generate YAML files suitable for LLM (Language Learning Model).
generate input YAML files with the help of an LLM.
Usage:
Command-line interface:
$ python sciphi/examples/library_of_phi/yaml_step_1.py run \
--output_dir=my_output_directory \
--input_filename=my_input_file.jsonl \
$ python sciphi/examples/library_of_phi/gen_step_1_draft_syllabi.py run \
--output_rel_dir=my_output_directory \
--input_jsonl_filename=my_input_file.jsonl \
--log_level=DEBUG
Parameters:
output_dir (str):
The directory where the generated YAML files will be saved.
Default value is derived based on the relative path 'raw_data/yaml_step_1'.
input_filename (str):
The filename for the input JSONL file containing the scraped OCW data.
Default is 'scraped_ocw.jsonl'.
data_directory (Optional[str]):
The directory where the input JSONL file is located.
If not specified, the default location is the directory of this script combined with 'raw_data'.
log_level (str):
Logging level for the scraper. Can be one of: DEBUG, INFO, WARNING, ERROR, CRITICAL.
Default is 'INFO'.
provider (str):
The provider to use for LLM.
The provider to use for LLM completions.
Default is 'openai'.
model_name (str):
The model to use for LLM.
The model to use for LLM completions.
Default is 'gpt-4-0613'.
data_directory (Optional[str]):
The directory where the input and output data are stored.
If None, defaults to the directory of this script plus '/raw_data'.
output_rel_dir (str):
The relative directory within the data directory where the generated YAML files will be saved.
Default value is 'output_step_1'.
input_jsonl_filename (str):
The name of the input JSONL file containing course data scraped from MIT OCW.
Default value is 'scraped_ocw.jsonl'.
log_level (str):
Logging level for the scraper. Can be one of: DEBUG, INFO, WARNING, ERROR, CRITICAL.
Default is 'INFO'.
"""

import json
Expand Down Expand Up @@ -202,21 +203,19 @@ class OCWScraper:

def __init__(
self,
data_dir: str = "raw_data",
output_dir: str = "yaml_step_1",
input_filename: str = "scraped_ocw.jsonl",
data_directory: Optional[str] = None,
log_level: str = "INFO",
provider: str = "openai",
model_name: str = "gpt-4-0613",
data_directory: Optional[str] = None,
output_rel_dir: str = "output_step_1",
input_jsonl_filename: str = "scraped_ocw.jsonl",
prompt: str = TOPIC_CREATION_PROMPT,
log_level: str = "INFO",
):
self.data_dir = data_dir
self.rel_output_dir = os.path.join(data_dir, output_dir)
self.input_filename = input_filename
self.data_directory = data_directory
self.provider = provider
self.model_name = model_name
self.data_directory = data_directory
self.output_rel_dir = output_rel_dir
self.input_jsonl_filename = input_jsonl_filename
self.prompt = prompt
logging.basicConfig(level=getattr(logging, log_level.upper()))

Expand All @@ -231,17 +230,18 @@ def run(self):
llm_provider = InterfaceManager.get_provider(
provider_name, self.model_name, llm_config
)
file_path = os.path.dirname(os.path.abspath(__file__))
if not self.data_directory:
self.data_directory = os.path.join(file_path, self.data_dir)
file_path = os.path.dirname(os.path.abspath(__file__))
self.data_directory = os.path.join(file_path, "raw_data")

output_dir = os.path.join(file_path, self.rel_output_dir)
output_dir = os.path.join(self.data_directory, self.output_rel_dir)
if not os.path.exists(output_dir):
os.makedirs(output_dir)
logging.info(f"Saving data to output directory = {output_dir}")

# Get path to this file
# The input file path = data directory + input file name
input_file_path = os.path.join(
self.data_directory, self.input_filename
self.data_directory, self.input_jsonl_filename
)

with open(input_file_path, "r") as file:
Expand Down Expand Up @@ -270,7 +270,7 @@ def run(self):
logging.info(f"Skipping output file {dump_name}....")
continue

logging.info(f"Creating output at {dump_name}")
logging.info(f"Saving output file named {dump_name}.")
formatted_prompt = self.prompt.format(
course_name=course_name, context=context
)
Expand Down
Loading

0 comments on commit 6d45fc0

Please sign in to comment.