This repository has been archived by the owner on Feb 12, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 52
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
04f8362
commit a410f30
Showing
19 changed files
with
3,630 additions
and
0 deletions.
There are no files selected for viewing
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,163 @@ | ||
"""Run the dataset generation.""" | ||
import argparse | ||
import hashlib | ||
import os | ||
import random | ||
import secrets | ||
import time | ||
|
||
from sciphi.config import DataConfig, DataGeneratorMode | ||
from sciphi.core.utils import ( | ||
get_configured_logger, | ||
get_data_config_dir, | ||
get_root_dir, | ||
) | ||
from sciphi.examples.helpers import ( | ||
gen_llm_config, | ||
parse_arguments, | ||
prep_for_file_path, | ||
) | ||
from sciphi.interface import InterfaceManager, ProviderName | ||
from sciphi.llm import LLMConfigManager | ||
from sciphi.makers import DataMaker | ||
from sciphi.prompt import ( | ||
Prompt, | ||
PromptGenerator, | ||
PromptManager, | ||
PromptStructure, | ||
) | ||
from sciphi.writers import JsonlDataWriter | ||
|
||
# Template for generated output file names; EXTRA is either the user-supplied
# suffix or a random hash fallback (filled in by get_output_path below).
OUTPUT_FILE_NAME = "{RUN_NAME}__provider_eq_{PROVIDER}__model_eq_{MODEL}__version_eq_{VERSION}{EXTRA}.jsonl"
|
||
|
||
def get_output_path(args: argparse.Namespace) -> str:
    """Build and return the full path of the output JSONL file.

    Creates the output directory (root dir / output_dir / provider / model)
    if needed. The file name is either ``args.output_file_name`` verbatim or
    the OUTPUT_FILE_NAME template filled from the run arguments, with a
    random hash suffix when no extra text is supplied.

    Raises:
        RuntimeError: If the output directory could not be created.
    """

    def generate_random_hash() -> str:
        """Generate a random hash."""
        # Generate a random token in byte format and then convert to SHA256 hash
        random_token = secrets.token_bytes(32)  # 32 bytes = 256 bits
        return hashlib.sha256(random_token).hexdigest()

    output_dir = os.path.join(
        get_root_dir(),
        prep_for_file_path(args.output_dir),
        prep_for_file_path(args.provider_name),
        prep_for_file_path(args.model_name),
    )
    # exist_ok avoids a TOCTOU race between an exists() check and makedirs().
    os.makedirs(output_dir, exist_ok=True)

    # Fail fast if the path still does not exist after attempted creation
    # (e.g. a file with the same name, or permission issues).
    if not os.path.isdir(output_dir):
        raise RuntimeError(f"Failed to create output directory: {output_dir}")

    if not args.output_file_name:
        output_file_name = OUTPUT_FILE_NAME.format(
            **{
                k: prep_for_file_path(v)
                for k, v in {
                    "RUN_NAME": args.run_name,
                    "PROVIDER": str(args.provider_name),
                    "MODEL": args.model_name,
                    "VERSION": args.version,
                    "EXTRA": args.extra_output_file_text
                    or f"_{generate_random_hash()}",
                }.items()
            }
        )
    else:
        output_file_name = args.output_file_name

    return os.path.join(output_dir, output_file_name)
|
||
|
||
if __name__ == "__main__":
    """Run the synthesis."""
    # Setup local environment
    args = parse_arguments()
    logger = get_configured_logger("sciphi", log_level=args.log_level)

    # Validate batching arguments before doing any expensive setup.
    if args.num_samples % args.batch_size != 0:
        raise ValueError(
            f"Number of samples ({args.num_samples}) must be divisible by batch size ({args.batch_size})."
        )
    if args.num_samples < args.batch_size:
        # FIX: message previously read "greater than batch size0."; the `<`
        # check actually permits equality, so say "greater than or equal to".
        raise ValueError(
            "Number of samples must be greater than or equal to batch size. Please set --num_samples."
        )

    model_name = args.model_name
    provider_name = ProviderName(args.provider_name)

    logger.info(
        f"Loading ModelName={model_name} from ProviderName={provider_name.value}."
    )

    # Build an LLM and provider interface
    llm_config = LLMConfigManager.get_config_for_provider(
        provider_name
    ).create(**gen_llm_config(args))
    llm_provider = InterfaceManager.get_provider(
        provider_name,
        model_name,
        llm_config,
    )

    # Initialize the prompt generator from the example's YAML config.
    data_config = DataConfig(
        os.path.join(
            get_data_config_dir(), f"{args.example_config}", "main.yaml"
        )
    )
    prompt_generator = PromptGenerator(
        data_config.config,
        data_config.prompt_templates,
        data_config.prompt_template_input_dependencies,
        data_config.prompt_dataset_dependencies,
        data_config.prompt_inputs,
    )
    prompt = PromptManager().get_prompt(data_config.outer_prompt_format)
    if args.prompt_override != "":
        # Override format: "<raw prompt text>,<input name>,<input name>,..."
        logger.debug(f"Overriding default prompt with: {args.prompt_override}")
        prompt_inputs = args.prompt_override.split(",")
        prompt = Prompt(
            raw_text=prompt_inputs[0],
            expected_inputs=set(prompt_inputs[1:]),
            structure=PromptStructure.SINGLE,
        )

    # Initialize the data maker
    data_maker = DataMaker(
        DataGeneratorMode(data_config.generator_mode),
        prompt_generator,
        prompt,
        # Optional field,
        # currently only used when generator_mode == "from_hf_dataset"
        dataset_name=data_config.dataset_name,
    )

    # Generate & write out the results
    output_path = get_output_path(args)
    logger.debug(f"Writing results to: {output_path}.")
    writer = JsonlDataWriter(output_path)

    # Seed from wall-clock microseconds + PID so concurrent runs differ.
    seed = int(time.time() * 1_000_000) + os.getpid()
    logger.info(f"Using seed: {seed}")
    random.seed(seed)

    for batch in data_maker.generator(args.batch_size, args.num_samples):
        completions = llm_provider.get_batch_completion(batch)
        for formatted_prompt, completion in zip(batch, completions):
            logger.debug("-" * 100)
            logger.debug(f"Formatted Prompt:\n{formatted_prompt}")
            logger.debug(f"\nCompletion:\n{completion}")
            logger.debug("-" * 100)
            # Write the results using DataWriter
            writer.write(
                [
                    {
                        "formatted_prompt": formatted_prompt,
                        "completion": completion,
                    }
                ]
            )
Empty file.
205 changes: 205 additions & 0 deletions
205
sciphi/deprecated/khan_academy/gen_step_1_draft_syllabi.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,205 @@ | ||
# type: ignore | ||
""" | ||
MIT OCW Course Data Scraper to Syllabi 'Draft' YAML Generator
Description: | ||
This script is designed to scrape course data from the MIT OpenCourseWare (OCW) website and | ||
generate input YAML files with the help of an LLM. | ||
Usage: | ||
Command-line interface: | ||
$ python sciphi/examples/khan_academy/gen_step_1_draft_syllabi.py run \
--output_rel_dir=my_output_directory \ | ||
--input_jsonl_filename=my_input_file.jsonl \ | ||
--log_level=DEBUG | ||
Parameters: | ||
provider (str): | ||
The provider to use for LLM completions. | ||
Default is 'openai'. | ||
model_name (str): | ||
The model to use for LLM completions. | ||
Default is 'gpt-4-0613'. | ||
data_directory (Optional[str]): | ||
The directory the input and output data is to be stored. | ||
If none, defaults to the directory of this script plus '/raw_data'. | ||
output_rel_dir (str): | ||
The relative directory within the data directory where the generated YAML files will be saved. | ||
Default value is 'output_step_1'. | ||
input_jsonl_filename (str): | ||
The name of the input jsonl file containing course data scraped from MIT OCW. | ||
Default value is 'ka_scraped.jsonl'.
log_level (str): | ||
Logging level for the scraper. Can be one of: DEBUG, INFO, WARNING, ERROR, CRITICAL. | ||
Default is 'INFO'. | ||
""" | ||
|
||
import json | ||
import logging | ||
import os | ||
import re | ||
from glob import glob | ||
from typing import Optional, Set | ||
|
||
import fire | ||
import yaml | ||
|
||
from sciphi.examples.helpers import ( | ||
get_default_settings_provider, | ||
prase_yaml_completion, | ||
save_yaml, | ||
) | ||
from sciphi.examples.khan_academy.prompts import SYLLABI_CREATION_PROMPT | ||
|
||
|
||
def extract_data_from_record(record: dict[str, str]) -> tuple[dict, str]:
    """Extract and organize data from a given record.

    Returns a (topics, context) pair: ``topics`` maps each syllabus topic
    to its newline-joined content, and ``context`` is a single formatted
    string with the course name followed by each topic and its content.
    """
    course_title = record["course_title"].replace(":", "")
    syllabus_topics = record["page_contents"]["syllabus"]["topics"]

    topics = {}
    parts = [f"### Course Name:\n{course_title}\n"]
    for topic_name, topic_lines in syllabus_topics.items():
        joined = "\n".join(topic_lines)
        topics[topic_name] = joined

        parts.append(f"## Page:\n{topic_name}\n")
        parts.append(f"Information:\n{joined}\n")

    return topics, "".join(parts)
|
||
|
||
def get_mapped_topics(topics: dict[str, int]) -> dict[str, str]:
    """Map topic names to hierarchy labels based on their depth.

    FIX: the parameter was annotated ``str`` but is iterated and indexed
    as a mapping of topic name -> depth.

    Args:
        topics: Mapping from topic name to its depth (0, 1, or 2).

    Returns:
        Dict with up to three keys -- "category" (depth 0), "field"
        (depth 1), and "subfield" (depth 2). Iteration stops at the first
        depth-2 topic; later depth-0/1 entries overwrite earlier ones.
    """
    mapped_topics = {}
    for topic in topics:
        if topics[topic] == 0:
            mapped_topics["category"] = topic
        elif topics[topic] == 1:
            mapped_topics["field"] = topic
        elif topics[topic] == 2:
            mapped_topics["subfield"] = topic
            break
    return mapped_topics
|
||
|
||
def get_observed_files(output_dir: str) -> Set[str]:
    """Get set of previously observed files.

    Scans two layouts under ``output_dir`` and returns identifier strings
    for work that already exists, so callers can skip regenerating it:
    nested ``<output_dir>/<discipline>/<field>.yaml`` files keyed by a
    "field_..._subfield_..._course_name_..." string, and flat
    ``<output_dir>/<name>.yaml`` dumps keyed by their file stem.
    """
    observed_files = set([])
    # NOTE(review): without recursive=True, "**" in glob matches exactly one
    # directory level, so this only sees files one folder deep -- confirm
    # that is the intended layout.
    for yaml_file in glob(f"{output_dir}/**/*.yaml"):
        with open(yaml_file, "r") as existing_file:
            yaml_load = yaml.safe_load(existing_file)
            if "course" in yaml_load:
                for course_name in yaml_load["course"]:
                    # NOTE(review): assumes the path has exactly three
                    # "/"-separated components (i.e. output_dir is a bare,
                    # slash-free relative dir) and POSIX separators; raises
                    # ValueError otherwise -- verify against callers.
                    _, discipline, field = yaml_file.split("/")
                    field = field.replace(".yaml", "")
                    subfield = yaml_load["course"][course_name]["subfield"]
                    observed_files.add(
                        f"field_{field}_subfield_{subfield}_course_name_{course_name.replace(' ','_')}"
                    )
    # Flat dumps directly in output_dir are tracked by file stem.
    for yaml_dumps in glob(f"{output_dir}/*.yaml"):
        observed_files.add(yaml_dumps.split("/")[-1].replace(".yaml", ""))
    return observed_files
|
||
|
||
def quoted_presenter(dumper, data):
    """Custom YAML representer that emits string scalars double-quoted."""
    string_tag = "tag:yaml.org,2002:str"
    return dumper.represent_scalar(string_tag, data, style='"')
|
||
|
||
class DraftSyllabiYAMLRunner:
    """Runs the generation process for draft syllabi YAMLs.

    Reads scraped course records from a JSONL file, prompts the configured
    LLM once per course, and saves each parsed completion as a YAML file
    under ``<data_directory>/<output_rel_dir>/``.
    """

    def __init__(
        self,
        provider: str = "openai",
        model_name: str = "gpt-4-0613",
        data_directory: Optional[str] = None,
        output_rel_dir: str = "output_step_1",
        input_jsonl_filename: str = "ka_scraped.jsonl",
        prompt: str = SYLLABI_CREATION_PROMPT,
        log_level: str = "INFO",
    ):
        """Store run settings and configure root logging.

        ``data_directory`` defaults to ``<this file's dir>/raw_data`` when
        left as None (resolved lazily in ``run``).
        """
        self.provider = provider
        self.model_name = model_name
        self.data_directory = data_directory
        self.output_rel_dir = output_rel_dir
        self.input_jsonl_filename = input_jsonl_filename
        self.prompt = prompt
        logging.basicConfig(level=getattr(logging, log_level.upper()))

    def run(self) -> None:
        """Run the draft YAML generation process."""
        # Ensure all string scalars in dumped YAML are double-quoted.
        yaml.add_representer(str, quoted_presenter)

        llm_provider = get_default_settings_provider(
            provider=self.provider, model_name=self.model_name
        )
        if not self.data_directory:
            file_path = os.path.dirname(os.path.abspath(__file__))
            self.data_directory = os.path.join(file_path, "raw_data")

        output_dir = os.path.join(self.data_directory, self.output_rel_dir)
        # FIX: exist_ok avoids a race between an exists() check and makedirs().
        os.makedirs(output_dir, exist_ok=True)
        logging.info(f"Saving data to output directory = {output_dir}")

        # The input file path = data directory + input file name
        input_file_path = os.path.join(
            self.data_directory, self.input_jsonl_filename
        )

        with open(input_file_path, "r") as file:
            for line in file:
                try:
                    record = json.loads(line)

                    # Don't use the topics in KA, and all the context is in the string
                    _, context = extract_data_from_record(record)

                    course_name = record["course_title"]

                    # Replace spaces with underscores, then strip any
                    # remaining non-word characters for a safe file name.
                    dump_name = re.sub(
                        r"\W+", "", course_name.replace(" ", "_")
                    )

                    if os.path.exists(
                        os.path.join(output_dir, f"{dump_name}.yaml")
                    ):
                        # FIX: logging.warn is deprecated; use logging.warning.
                        logging.warning(
                            f"Skipping {dump_name} because it was already created..."
                        )
                        continue

                    formatted_prompt = self.prompt.format(
                        course_name=course_name,
                        context=context,
                    )

                    completion = llm_provider.get_completion(formatted_prompt)

                    data_to_save = {
                        "completion": completion,
                        "course_name": course_name,
                    }

                    yaml_completion = prase_yaml_completion(data_to_save)
                    yml_load = yaml.safe_load(yaml_completion)
                    save_yaml(
                        yml_load,
                        os.path.join(output_dir, f"{dump_name}.yaml"),
                    )

                except Exception as e:
                    # Best-effort batch: log the failure and move on to the
                    # next record rather than aborting the whole run.
                    logging.error(
                        f"Failed to create output yaml at {self.data_directory} with error: {e}"
                    )
|
||
|
||
if __name__ == "__main__":
    # Expose the runner class as a command-line interface via python-fire.
    fire.Fire(DraftSyllabiYAMLRunner)
Oops, something went wrong.