
Commit

Add deprecated
emrgnt-cmplxty committed Oct 13, 2023
1 parent 04f8362 commit a410f30
Showing 19 changed files with 3,630 additions and 0 deletions.
Empty file.
163 changes: 163 additions & 0 deletions sciphi/deprecated/basic_data_gen/runner.py
@@ -0,0 +1,163 @@
"""Run the dataset generation."""
import argparse
import hashlib
import os
import random
import secrets
import time

from sciphi.config import DataConfig, DataGeneratorMode
from sciphi.core.utils import (
    get_configured_logger,
    get_data_config_dir,
    get_root_dir,
)
from sciphi.examples.helpers import (
    gen_llm_config,
    parse_arguments,
    prep_for_file_path,
)
from sciphi.interface import InterfaceManager, ProviderName
from sciphi.llm import LLMConfigManager
from sciphi.makers import DataMaker
from sciphi.prompt import (
    Prompt,
    PromptGenerator,
    PromptManager,
    PromptStructure,
)
from sciphi.writers import JsonlDataWriter

OUTPUT_FILE_NAME = "{RUN_NAME}__provider_eq_{PROVIDER}__model_eq_{MODEL}__version_eq_{VERSION}{EXTRA}.jsonl"


def get_output_path(args: argparse.Namespace) -> str:
    """Get the output path for the given arguments."""

    def generate_random_hash() -> str:
        """Generate a random hash."""
        # Generate a random token in byte format, then convert it to a SHA-256 hash
        random_token = secrets.token_bytes(32)  # 32 bytes = 256 bits
        return hashlib.sha256(random_token).hexdigest()

    output_dir = os.path.join(
        get_root_dir(),
        prep_for_file_path(args.output_dir),
        prep_for_file_path(args.provider_name),
        prep_for_file_path(args.model_name),
    )
    # Creation failures raise here, so no separate existence check is needed
    os.makedirs(output_dir, exist_ok=True)

    if not args.output_file_name:
        output_file_name = OUTPUT_FILE_NAME.format(
            **{
                k: prep_for_file_path(v)
                for k, v in {
                    "RUN_NAME": args.run_name,
                    "PROVIDER": str(args.provider_name),
                    "MODEL": args.model_name,
                    "VERSION": args.version,
                    "EXTRA": args.extra_output_file_text
                    or f"_{generate_random_hash()}",
                }.items()
            }
        )
    else:
        output_file_name = args.output_file_name

    return os.path.join(output_dir, output_file_name)
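
# Illustrative output path (hypothetical argument values; the exact escaping
# depends on prep_for_file_path): run_name="demo", provider_name="openai",
# model_name="gpt-3.5-turbo", version="0.1" yields something like
#   <root>/outputs/openai/gpt-3-5-turbo/demo__provider_eq_openai__model_eq_gpt-3-5-turbo__version_eq_0-1_<hash>.jsonl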


if __name__ == "__main__":
    """Run the synthesis."""
    # Setup the local environment
    args = parse_arguments()
    logger = get_configured_logger("sciphi", log_level=args.log_level)

    if args.num_samples % args.batch_size != 0:
        raise ValueError(
            f"Number of samples ({args.num_samples}) must be divisible by batch size ({args.batch_size})."
        )
    if args.num_samples < args.batch_size:
        raise ValueError(
            "Number of samples must be at least the batch size. Please increase --num_samples."
        )
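    # Illustrative check (hypothetical values): num_samples=1024 with
    # batch_size=128 passes both guards and yields 8 full batches.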

    model_name = args.model_name
    provider_name = ProviderName(args.provider_name)

    logger.info(
        f"Loading ModelName={model_name} from ProviderName={provider_name.value}."
    )

    # Build an LLM and provider interface
    llm_config = LLMConfigManager.get_config_for_provider(
        provider_name
    ).create(**gen_llm_config(args))
    llm_provider = InterfaceManager.get_provider(
        provider_name,
        model_name,
        llm_config,
    )

    # Initialize the prompt generator
    data_config = DataConfig(
        os.path.join(
            get_data_config_dir(), f"{args.example_config}", "main.yaml"
        )
    )
    prompt_generator = PromptGenerator(
        data_config.config,
        data_config.prompt_templates,
        data_config.prompt_template_input_dependencies,
        data_config.prompt_dataset_dependencies,
        data_config.prompt_inputs,
    )
    prompt = PromptManager().get_prompt(data_config.outer_prompt_format)
    if args.prompt_override != "":
        logger.debug(f"Overriding default prompt with: {args.prompt_override}")
        prompt_inputs = args.prompt_override.split(",")
        prompt = Prompt(
            raw_text=prompt_inputs[0],
            expected_inputs=set(prompt_inputs[1:]),
            structure=PromptStructure.SINGLE,
        )
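    # Illustrative override (hypothetical values): the first comma-separated
    # field is the raw prompt text and the rest name its expected inputs, e.g.
    #   --prompt_override="Write a textbook section on {topic},topic"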

    # Initialize the data maker
    data_maker = DataMaker(
        DataGeneratorMode(data_config.generator_mode),
        prompt_generator,
        prompt,
        # Optional field; currently only used when generator_mode == "from_hf_dataset"
        dataset_name=data_config.dataset_name,
    )

    # Generate & write out the results
    output_path = get_output_path(args)
    logger.debug(f"Writing results to: {output_path}.")
    writer = JsonlDataWriter(output_path)

    # Seed with microsecond time plus PID so that concurrent runs diverge
    seed = int(time.time() * 1_000_000) + os.getpid()
    logger.info(f"Using seed: {seed}")
    random.seed(seed)

    for batch in data_maker.generator(args.batch_size, args.num_samples):
        completions = llm_provider.get_batch_completion(batch)
        for formatted_prompt, completion in zip(batch, completions):
            logger.debug("-" * 100)
            logger.debug(f"Formatted Prompt:\n{formatted_prompt}")
            logger.debug(f"\nCompletion:\n{completion}")
            logger.debug("-" * 100)
            # Write each result as one JSON line via the DataWriter
            writer.write(
                [
                    {
                        "formatted_prompt": formatted_prompt,
                        "completion": completion,
                    }
                ]
            )
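
# A minimal invocation sketch (flag names assumed from parse_arguments and not
# verified against this commit):
#   python sciphi/deprecated/basic_data_gen/runner.py \
#       --provider_name=openai --model_name=gpt-3.5-turbo \
#       --num_samples=1024 --batch_size=128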
Empty file.
205 changes: 205 additions & 0 deletions sciphi/deprecated/khan_academy/gen_step_1_draft_syllabi.py
@@ -0,0 +1,205 @@
# type: ignore
"""
MIT OCW Course Data Scraper to syllyabi 'Draft' YAML Generator
Description:
This script is designed to scrape course data from the MIT OpenCourseWare (OCW) website and
generate input YAML files with the help of an LLM.
Usage:
Command-line interface:
$ python sciphi/examples/library_of_phi/gen_step_1_draft_syllabi.py run \
--output_rel_dir=my_output_directory \
--input_jsonl_filename=my_input_file.jsonl \
--log_level=DEBUG
Parameters:
provider (str):
The provider to use for LLM completions.
Default is 'openai'.
model_name (str):
The model to use for LLM completions.
Default is 'gpt-4-0613'.
data_directory (Optional[str]):
The directory the input and output data is to be stored.
If none, defaults to the directory of this script plus '/raw_data'.
output_rel_dir (str):
The relative directory within the data directory where the generated YAML files will be saved.
Default value is 'output_step_1'.
input_jsonl_filename (str):
The name of the input jsonl file containing course data scraped from MIT OCW.
Default value is 'output_step_1'.
log_level (str):
Logging level for the scraper. Can be one of: DEBUG, INFO, WARNING, ERROR, CRITICAL.
Default is 'INFO'.
"""

import json
import logging
import os
import re
from glob import glob
from typing import Optional, Set

import fire
import yaml

from sciphi.examples.helpers import (
    get_default_settings_provider,
    prase_yaml_completion,
    save_yaml,
)
from sciphi.examples.khan_academy.prompts import SYLLABI_CREATION_PROMPT


def extract_data_from_record(record: dict[str, str]) -> tuple[dict, str]:
    """Extract and organize data from a given record."""
    context = f"### Course Name:\n{record['course_title'].replace(':', '')}\n"
    topics = {}
    page_contents = record["page_contents"]["syllabus"]

    for topic in page_contents["topics"]:
        context += f"## Page:\n{topic}\n"
        topics[topic] = "\n".join(page_contents["topics"][topic])

        table = "\n".join(page_contents["topics"][topic])
        context += f"Information:\n{table}\n"

    return topics, context
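
# Illustrative record shape (hypothetical values) assumed by
# extract_data_from_record:
#   {
#       "course_title": "Algebra Basics",
#       "page_contents": {
#           "syllabus": {"topics": {"Linear equations": ["line 1", "line 2"]}}
#       },
#   }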


def get_mapped_topics(topics: dict[str, int]) -> dict[str, str]:
    """Map topics to category/field/subfield based on their depth."""
    mapped_topics = {}
    for topic in topics:
        if topics[topic] == 0:
            mapped_topics["category"] = topic
        elif topics[topic] == 1:
            mapped_topics["field"] = topic
        elif topics[topic] == 2:
            mapped_topics["subfield"] = topic
            break
    return mapped_topics
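
# Illustrative mapping (hypothetical values):
#   {"Math": 0, "Algebra": 1, "Linear equations": 2}
#   -> {"category": "Math", "field": "Algebra", "subfield": "Linear equations"}
# Iteration stops at the first depth-2 topic.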


def get_observed_files(output_dir: str) -> Set[str]:
    """Get the set of previously observed output files."""
    observed_files = set()
    for yaml_file in glob(f"{output_dir}/**/*.yaml"):
        with open(yaml_file, "r") as existing_file:
            yaml_load = yaml.safe_load(existing_file)
            if "course" in yaml_load:
                for course_name in yaml_load["course"]:
                    _, discipline, field = yaml_file.split("/")
                    field = field.replace(".yaml", "")
                    subfield = yaml_load["course"][course_name]["subfield"]
                    observed_files.add(
                        f"field_{field}_subfield_{subfield}_course_name_{course_name.replace(' ','_')}"
                    )
    for yaml_dumps in glob(f"{output_dir}/*.yaml"):
        observed_files.add(yaml_dumps.split("/")[-1].replace(".yaml", ""))
    return observed_files


def quoted_presenter(dumper, data):
    """Define a custom representer for string scalars that applies quotes."""
    return dumper.represent_scalar("tag:yaml.org,2002:str", data, style='"')
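
# Illustrative effect (hypothetical value): once registered with
# yaml.add_representer(str, quoted_presenter), yaml.dump({"a": "b"})
# emits '"a": "b"' instead of 'a: b'.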


class DraftSyllabiYAMLRunner:
    """Runs the generation process for draft syllabi YAMLs."""

    def __init__(
        self,
        provider: str = "openai",
        model_name: str = "gpt-4-0613",
        data_directory: Optional[str] = None,
        output_rel_dir: str = "output_step_1",
        input_jsonl_filename: str = "ka_scraped.jsonl",
        prompt: str = SYLLABI_CREATION_PROMPT,
        log_level: str = "INFO",
    ):
        self.provider = provider
        self.model_name = model_name
        self.data_directory = data_directory
        self.output_rel_dir = output_rel_dir
        self.input_jsonl_filename = input_jsonl_filename
        self.prompt = prompt
        logging.basicConfig(level=getattr(logging, log_level.upper()))

    def run(self) -> None:
        """Run the draft YAML generation process."""
        yaml.add_representer(str, quoted_presenter)

        llm_provider = get_default_settings_provider(
            provider=self.provider, model_name=self.model_name
        )
        if not self.data_directory:
            file_path = os.path.dirname(os.path.abspath(__file__))
            self.data_directory = os.path.join(file_path, "raw_data")

        output_dir = os.path.join(self.data_directory, self.output_rel_dir)
        os.makedirs(output_dir, exist_ok=True)
        logging.info(f"Saving data to output directory = {output_dir}")

        # The input file path = data directory + input file name
        input_file_path = os.path.join(
            self.data_directory, self.input_jsonl_filename
        )

        with open(input_file_path, "r") as file:
            for line in file:
                try:
                    record = json.loads(line)

                    # Don't use the topics in KA; all the context is in the string
                    _, context = extract_data_from_record(record)

                    course_name = record["course_title"]

                    # Replace spaces with underscores, then strip remaining
                    # non-word characters
                    dump_name = re.sub(
                        r"\W+", "", course_name.replace(" ", "_")
                    )

                    if os.path.exists(
                        os.path.join(output_dir, f"{dump_name}.yaml")
                    ):
                        logging.warning(
                            f"Skipping {dump_name} because it was already created..."
                        )
                        continue

                    formatted_prompt = self.prompt.format(
                        course_name=course_name,
                        context=context,
                    )

                    completion = llm_provider.get_completion(formatted_prompt)

                    data_to_save = {
                        "completion": completion,
                        "course_name": course_name,
                    }

                    yaml_completion = prase_yaml_completion(data_to_save)
                    yml_load = yaml.safe_load(yaml_completion)
                    save_yaml(
                        yml_load,
                        os.path.join(output_dir, f"{dump_name}.yaml"),
                    )

                except Exception as e:
                    logging.error(
                        f"Failed to create output yaml at {self.data_directory} with error: {e}"
                    )


if __name__ == "__main__":
    fire.Fire(DraftSyllabiYAMLRunner)
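
# A minimal invocation sketch, mirroring the docstring usage (values hypothetical):
#   python sciphi/deprecated/khan_academy/gen_step_1_draft_syllabi.py run \
#       --provider=openai --model_name=gpt-4-0613 --log_level=DEBUG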