This repository has been archived by the owner on Feb 12, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 52
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
04f8362
commit a410f30
Showing
19 changed files
with
3,630 additions
and
0 deletions.
There are no files selected for viewing
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,163 @@ | ||
"""Run the dataset generation.""" | ||
import argparse | ||
import hashlib | ||
import os | ||
import random | ||
import secrets | ||
import time | ||
|
||
from sciphi.config import DataConfig, DataGeneratorMode | ||
from sciphi.core.utils import ( | ||
get_configured_logger, | ||
get_data_config_dir, | ||
get_root_dir, | ||
) | ||
from sciphi.examples.helpers import ( | ||
gen_llm_config, | ||
parse_arguments, | ||
prep_for_file_path, | ||
) | ||
from sciphi.interface import InterfaceManager, ProviderName | ||
from sciphi.llm import LLMConfigManager | ||
from sciphi.makers import DataMaker | ||
from sciphi.prompt import ( | ||
Prompt, | ||
PromptGenerator, | ||
PromptManager, | ||
PromptStructure, | ||
) | ||
from sciphi.writers import JsonlDataWriter | ||
|
||
# Template for generated output file names; EXTRA is either the user-supplied
# suffix or a random hash fallback (filled in by get_output_path below).
OUTPUT_FILE_NAME = "{RUN_NAME}__provider_eq_{PROVIDER}__model_eq_{MODEL}__version_eq_{VERSION}{EXTRA}.jsonl"
|
||
|
||
def get_output_path(args: argparse.Namespace) -> str:
    """Build and return the full path of the output JSONL file.

    Creates the output directory (root dir / output_dir / provider / model)
    if needed. The file name is either ``args.output_file_name`` verbatim or
    the OUTPUT_FILE_NAME template filled from the run arguments, with a
    random hash suffix when no extra text is supplied.

    Raises:
        RuntimeError: If the output directory could not be created.
    """

    def generate_random_hash() -> str:
        """Generate a random hash."""
        # Generate a random token in byte format and then convert to SHA256 hash
        random_token = secrets.token_bytes(32)  # 32 bytes = 256 bits
        return hashlib.sha256(random_token).hexdigest()

    output_dir = os.path.join(
        get_root_dir(),
        prep_for_file_path(args.output_dir),
        prep_for_file_path(args.provider_name),
        prep_for_file_path(args.model_name),
    )
    # exist_ok avoids a TOCTOU race between an exists() check and makedirs().
    os.makedirs(output_dir, exist_ok=True)

    # Fail fast if the path still does not exist after attempted creation
    # (e.g. a file with the same name, or permission issues).
    if not os.path.isdir(output_dir):
        raise RuntimeError(f"Failed to create output directory: {output_dir}")

    if not args.output_file_name:
        output_file_name = OUTPUT_FILE_NAME.format(
            **{
                k: prep_for_file_path(v)
                for k, v in {
                    "RUN_NAME": args.run_name,
                    "PROVIDER": str(args.provider_name),
                    "MODEL": args.model_name,
                    "VERSION": args.version,
                    "EXTRA": args.extra_output_file_text
                    or f"_{generate_random_hash()}",
                }.items()
            }
        )
    else:
        output_file_name = args.output_file_name

    return os.path.join(output_dir, output_file_name)
|
||
|
||
if __name__ == "__main__":
    """Run the synthesis."""
    # Setup local environment
    args = parse_arguments()
    logger = get_configured_logger("sciphi", log_level=args.log_level)

    # Validate batching arguments before doing any expensive setup.
    if args.num_samples % args.batch_size != 0:
        raise ValueError(
            f"Number of samples ({args.num_samples}) must be divisible by batch size ({args.batch_size})."
        )
    if args.num_samples < args.batch_size:
        # FIX: message previously read "greater than batch size0."; the `<`
        # check actually permits equality, so say "greater than or equal to".
        raise ValueError(
            "Number of samples must be greater than or equal to batch size. Please set --num_samples."
        )

    model_name = args.model_name
    provider_name = ProviderName(args.provider_name)

    logger.info(
        f"Loading ModelName={model_name} from ProviderName={provider_name.value}."
    )

    # Build an LLM and provider interface
    llm_config = LLMConfigManager.get_config_for_provider(
        provider_name
    ).create(**gen_llm_config(args))
    llm_provider = InterfaceManager.get_provider(
        provider_name,
        model_name,
        llm_config,
    )

    # Initialize the prompt generator from the example's YAML config.
    data_config = DataConfig(
        os.path.join(
            get_data_config_dir(), f"{args.example_config}", "main.yaml"
        )
    )
    prompt_generator = PromptGenerator(
        data_config.config,
        data_config.prompt_templates,
        data_config.prompt_template_input_dependencies,
        data_config.prompt_dataset_dependencies,
        data_config.prompt_inputs,
    )
    prompt = PromptManager().get_prompt(data_config.outer_prompt_format)
    if args.prompt_override != "":
        # Override format: "<raw prompt text>,<input name>,<input name>,..."
        logger.debug(f"Overriding default prompt with: {args.prompt_override}")
        prompt_inputs = args.prompt_override.split(",")
        prompt = Prompt(
            raw_text=prompt_inputs[0],
            expected_inputs=set(prompt_inputs[1:]),
            structure=PromptStructure.SINGLE,
        )

    # Initialize the data maker
    data_maker = DataMaker(
        DataGeneratorMode(data_config.generator_mode),
        prompt_generator,
        prompt,
        # Optional field,
        # currently only used when generator_mode == "from_hf_dataset"
        dataset_name=data_config.dataset_name,
    )

    # Generate & write out the results
    output_path = get_output_path(args)
    logger.debug(f"Writing results to: {output_path}.")
    writer = JsonlDataWriter(output_path)

    # Seed from wall-clock microseconds + PID so concurrent runs differ.
    seed = int(time.time() * 1_000_000) + os.getpid()
    logger.info(f"Using seed: {seed}")
    random.seed(seed)

    for batch in data_maker.generator(args.batch_size, args.num_samples):
        completions = llm_provider.get_batch_completion(batch)
        for formatted_prompt, completion in zip(batch, completions):
            logger.debug("-" * 100)
            logger.debug(f"Formatted Prompt:\n{formatted_prompt}")
            logger.debug(f"\nCompletion:\n{completion}")
            logger.debug("-" * 100)
            # Write the results using DataWriter
            writer.write(
                [
                    {
                        "formatted_prompt": formatted_prompt,
                        "completion": completion,
                    }
                ]
            )
Empty file.
205 changes: 205 additions & 0 deletions
205
sciphi/deprecated/khan_academy/gen_step_1_draft_syllabi.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,205 @@ | ||
# type: ignore | ||
""" | ||
MIT OCW Course Data Scraper to Syllabi 'Draft' YAML Generator
Description: | ||
This script is designed to scrape course data from the MIT OpenCourseWare (OCW) website and | ||
generate input YAML files with the help of an LLM. | ||
Usage: | ||
Command-line interface: | ||
$ python sciphi/examples/khan_academy/gen_step_1_draft_syllabi.py run \
--output_rel_dir=my_output_directory \ | ||
--input_jsonl_filename=my_input_file.jsonl \ | ||
--log_level=DEBUG | ||
Parameters: | ||
provider (str): | ||
The provider to use for LLM completions. | ||
Default is 'openai'. | ||
model_name (str): | ||
The model to use for LLM completions. | ||
Default is 'gpt-4-0613'. | ||
data_directory (Optional[str]): | ||
The directory the input and output data is to be stored. | ||
If none, defaults to the directory of this script plus '/raw_data'. | ||
output_rel_dir (str): | ||
The relative directory within the data directory where the generated YAML files will be saved. | ||
Default value is 'output_step_1'. | ||
input_jsonl_filename (str): | ||
The name of the input jsonl file containing course data scraped from MIT OCW. | ||
Default value is 'ka_scraped.jsonl'.
log_level (str): | ||
Logging level for the scraper. Can be one of: DEBUG, INFO, WARNING, ERROR, CRITICAL. | ||
Default is 'INFO'. | ||
""" | ||
|
||
import json | ||
import logging | ||
import os | ||
import re | ||
from glob import glob | ||
from typing import Optional, Set | ||
|
||
import fire | ||
import yaml | ||
|
||
from sciphi.examples.helpers import ( | ||
get_default_settings_provider, | ||
prase_yaml_completion, | ||
save_yaml, | ||
) | ||
from sciphi.examples.khan_academy.prompts import SYLLABI_CREATION_PROMPT | ||
|
||
|
||
def extract_data_from_record(record: dict[str, str]) -> tuple[dict, str]:
    """Extract and organize data from a given record.

    Returns a (topics, context) pair: ``topics`` maps each syllabus topic
    to its newline-joined content, and ``context`` is a single formatted
    string with the course name followed by each topic and its content.
    """
    course_title = record["course_title"].replace(":", "")
    syllabus_topics = record["page_contents"]["syllabus"]["topics"]

    topics = {}
    parts = [f"### Course Name:\n{course_title}\n"]
    for topic_name, topic_lines in syllabus_topics.items():
        joined = "\n".join(topic_lines)
        topics[topic_name] = joined

        parts.append(f"## Page:\n{topic_name}\n")
        parts.append(f"Information:\n{joined}\n")

    return topics, "".join(parts)
|
||
|
||
def get_mapped_topics(topics: dict[str, int]) -> dict[str, str]:
    """Map topic names to hierarchy labels based on their depth.

    FIX: the parameter was annotated ``str`` but is iterated and indexed
    as a mapping of topic name -> depth.

    Args:
        topics: Mapping from topic name to its depth (0, 1, or 2).

    Returns:
        Dict with up to three keys -- "category" (depth 0), "field"
        (depth 1), and "subfield" (depth 2). Iteration stops at the first
        depth-2 topic; later depth-0/1 entries overwrite earlier ones.
    """
    mapped_topics = {}
    for topic in topics:
        if topics[topic] == 0:
            mapped_topics["category"] = topic
        elif topics[topic] == 1:
            mapped_topics["field"] = topic
        elif topics[topic] == 2:
            mapped_topics["subfield"] = topic
            break
    return mapped_topics
|
||
|
||
def get_observed_files(output_dir: str) -> Set[str]:
    """Get set of previously observed files.

    Scans two layouts under ``output_dir`` and returns identifier strings
    for work that already exists, so callers can skip regenerating it:
    nested ``<output_dir>/<discipline>/<field>.yaml`` files keyed by a
    "field_..._subfield_..._course_name_..." string, and flat
    ``<output_dir>/<name>.yaml`` dumps keyed by their file stem.
    """
    observed_files = set([])
    # NOTE(review): without recursive=True, "**" in glob matches exactly one
    # directory level, so this only sees files one folder deep -- confirm
    # that is the intended layout.
    for yaml_file in glob(f"{output_dir}/**/*.yaml"):
        with open(yaml_file, "r") as existing_file:
            yaml_load = yaml.safe_load(existing_file)
            if "course" in yaml_load:
                for course_name in yaml_load["course"]:
                    # NOTE(review): assumes the path has exactly three
                    # "/"-separated components (i.e. output_dir is a bare,
                    # slash-free relative dir) and POSIX separators; raises
                    # ValueError otherwise -- verify against callers.
                    _, discipline, field = yaml_file.split("/")
                    field = field.replace(".yaml", "")
                    subfield = yaml_load["course"][course_name]["subfield"]
                    observed_files.add(
                        f"field_{field}_subfield_{subfield}_course_name_{course_name.replace(' ','_')}"
                    )
    # Flat dumps directly in output_dir are tracked by file stem.
    for yaml_dumps in glob(f"{output_dir}/*.yaml"):
        observed_files.add(yaml_dumps.split("/")[-1].replace(".yaml", ""))
    return observed_files
|
||
|
||
def quoted_presenter(dumper, data):
    """Custom YAML representer that emits string scalars double-quoted."""
    string_tag = "tag:yaml.org,2002:str"
    return dumper.represent_scalar(string_tag, data, style='"')
|
||
|
||
class DraftSyllabiYAMLRunner:
    """Runs the generation process for draft syllabi YAMLs.

    Reads scraped course records from a JSONL file, prompts the configured
    LLM once per course, and saves each parsed completion as a YAML file
    under ``<data_directory>/<output_rel_dir>/``.
    """

    def __init__(
        self,
        provider: str = "openai",
        model_name: str = "gpt-4-0613",
        data_directory: Optional[str] = None,
        output_rel_dir: str = "output_step_1",
        input_jsonl_filename: str = "ka_scraped.jsonl",
        prompt: str = SYLLABI_CREATION_PROMPT,
        log_level: str = "INFO",
    ):
        """Store run settings and configure root logging.

        ``data_directory`` defaults to ``<this file's dir>/raw_data`` when
        left as None (resolved lazily in ``run``).
        """
        self.provider = provider
        self.model_name = model_name
        self.data_directory = data_directory
        self.output_rel_dir = output_rel_dir
        self.input_jsonl_filename = input_jsonl_filename
        self.prompt = prompt
        logging.basicConfig(level=getattr(logging, log_level.upper()))

    def run(self) -> None:
        """Run the draft YAML generation process."""
        # Ensure all string scalars in dumped YAML are double-quoted.
        yaml.add_representer(str, quoted_presenter)

        llm_provider = get_default_settings_provider(
            provider=self.provider, model_name=self.model_name
        )
        if not self.data_directory:
            file_path = os.path.dirname(os.path.abspath(__file__))
            self.data_directory = os.path.join(file_path, "raw_data")

        output_dir = os.path.join(self.data_directory, self.output_rel_dir)
        # FIX: exist_ok avoids a race between an exists() check and makedirs().
        os.makedirs(output_dir, exist_ok=True)
        logging.info(f"Saving data to output directory = {output_dir}")

        # The input file path = data directory + input file name
        input_file_path = os.path.join(
            self.data_directory, self.input_jsonl_filename
        )

        with open(input_file_path, "r") as file:
            for line in file:
                try:
                    record = json.loads(line)

                    # Don't use the topics in KA, and all the context is in the string
                    _, context = extract_data_from_record(record)

                    course_name = record["course_title"]

                    # Replace spaces with underscores, then strip any
                    # remaining non-word characters for a safe file name.
                    dump_name = re.sub(
                        r"\W+", "", course_name.replace(" ", "_")
                    )

                    if os.path.exists(
                        os.path.join(output_dir, f"{dump_name}.yaml")
                    ):
                        # FIX: logging.warn is deprecated; use logging.warning.
                        logging.warning(
                            f"Skipping {dump_name} because it was already created..."
                        )
                        continue

                    formatted_prompt = self.prompt.format(
                        course_name=course_name,
                        context=context,
                    )

                    completion = llm_provider.get_completion(formatted_prompt)

                    data_to_save = {
                        "completion": completion,
                        "course_name": course_name,
                    }

                    yaml_completion = prase_yaml_completion(data_to_save)
                    yml_load = yaml.safe_load(yaml_completion)
                    save_yaml(
                        yml_load,
                        os.path.join(output_dir, f"{dump_name}.yaml"),
                    )

                except Exception as e:
                    # Best-effort batch: log the failure and move on to the
                    # next record rather than aborting the whole run.
                    logging.error(
                        f"Failed to create output yaml at {self.data_directory} with error: {e}"
                    )
|
||
|
||
if __name__ == "__main__":
    # Expose the runner class as a command-line interface via python-fire.
    fire.Fire(DraftSyllabiYAMLRunner)
Oops, something went wrong.