Skip to content
This repository has been archived by the owner on Feb 12, 2024. It is now read-only.

Add dry-run feature #73

Merged
merged 2 commits into from
Oct 13, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 0 additions & 24 deletions .github/scripts/check_prompt_generators.py

This file was deleted.

32 changes: 0 additions & 32 deletions .github/workflows/on-pr-config-validation.yml

This file was deleted.

7 changes: 7 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,13 @@
**Introduction:**
The Library of Phi is an initiative sponsored by SciPhi. Its primary goal is to democratize access to high-quality textbooks. The project uses AI-driven techniques to generate textbooks by combining raw information (such as tables of contents) with unstructured data (such as vector databases) to produce high-quality, factually grounded textbooks.

#### **Optional - Validating Generation Pipeline:**

```bash
poetry run python sciphi/examples/library_of_phi/generate_textbook.py dry_run
```


#### **Generating the default Textbook:**

```bash
Expand Down
25 changes: 10 additions & 15 deletions sciphi/examples/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -422,21 +422,24 @@ def wiki_search_api(
def traverse_config(
config: dict,
) -> Generator[Tuple[str, str, str, str, dict], None, None]:
"""Traverse the config and yield textbook, chapter, section, subsection names"""
"""
Traverse the config and yield textbook,
chapter, section, subsection names, and full chapter config
"""

def get_key(config_dict: dict) -> str:
"""Get the key from a dictionary with a single key-value pair"""
keys = list(config_dict.keys())
if not keys:
if not config_dict:
raise KeyError("Dictionary is empty, no key found")
return keys[0]
return next(iter(config_dict))

textbook_name = get_key(config["textbook"])
chapters = config["textbook"][textbook_name]["chapters"]

for chapter in chapters:
chapter_name = get_key(chapter)
sections = chapter[chapter_name]["sections"]

for section in sections:
if isinstance(section, str):
yield textbook_name, chapter_name, section, "", chapter[
Expand All @@ -445,14 +448,9 @@ def get_key(config_dict: dict) -> str:
continue

section_name = get_key(section)
subsections = section[section_name].get("subsections")
if not subsections or len(subsections) == 0:
yield textbook_name, chapter_name, section_name, "", chapter[
chapter_name
]
continue
subsections = section[section_name].get("subsections", [])

if subsections == None:
if not subsections:
yield textbook_name, chapter_name, section_name, "", chapter[
chapter_name
]
Expand All @@ -463,11 +461,8 @@ def get_key(config_dict: dict) -> str:
yield textbook_name, chapter_name, section_name, subsection, chapter[
chapter_name
]
elif isinstance(
subsection, dict
): # Additional check if subsections have nested structure
else:
subsection_name = get_key(subsection)
# Add logic to handle deeper nested structures if needed
yield textbook_name, chapter_name, section_name, subsection_name, chapter[
chapter_name
]
Expand Down
71 changes: 71 additions & 0 deletions sciphi/examples/library_of_phi/generate_textbook.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
"""Generates textbook content from parsed course data."""
import collections
import glob
import multiprocessing
import os
Expand Down Expand Up @@ -119,6 +121,75 @@ def initialize_processing(
writer.raw_writer.write(TextbookContentGenerator.AI_DISCLAIMER)
return writer

def dry_run(self) -> None:
    """Validate the generation configuration without generating content.

    The method checks the RAG settings (only when ``config.do_rag`` is set),
    checks that an LLM provider is present, resolves the table-of-contents
    YAML file(s) to process, makes sure the output directory exists, and
    then tries to load and traverse every YAML file. A summary, including
    the number and rate of files that failed validation, is printed.

    Raises:
        ValueError: If RAG is enabled but the URL, username, or password is
            missing; if no LLM provider is configured; or if no YAML file
            path could be resolved.
    """
    # Check RAG configurations
    if self.config.do_rag and not all(
        [
            self.config.rag_url,
            self.config.rag_username,
            self.config.rag_password,
        ]
    ):
        raise ValueError(
            "RAG configuration is invalid. Make sure you provide a RAG server rag_url, rag_username, and rag_password."
        )

    # Validate LLM provider configuration
    if not self.llm_provider:
        raise ValueError("Invalid LLM provider configuration.")

    # Check for YAML file paths: a single named textbook, or every YAML
    # file found in the table-of-contents directory.
    if self.config.textbook:
        yml_file_paths = [
            os.path.join(
                self.config.data_dir,
                self.config.toc_dir,
                f"{self.config.textbook}.yaml",
            )
        ]
    else:
        yml_file_paths = glob.glob(
            os.path.join(
                self.config.data_dir, self.config.toc_dir, "*.yaml"
            )
        )

    if not yml_file_paths:
        raise ValueError("No YAML files found in the specified directory.")

    # Check the output path; exist_ok avoids a race between the existence
    # check and the directory creation.
    output_path = os.path.join(
        self.config.data_dir, self.config.output_dir, "dry_run_output"
    )
    output_dir = os.path.dirname(output_path)
    os.makedirs(output_dir, exist_ok=True)

    # Output some summary statistics
    summary = collections.OrderedDict()
    summary["RAG Server URL"] = self.config.rag_url
    summary["LLM Provider"] = self.config.llm_provider
    summary["LLM Model Name"] = self.config.llm_model_name
    summary["Total YAML Files"] = len(yml_file_paths)
    summary["Output Directory"] = output_dir

    failed_loads = 0
    self.logger.info("Validating configs now...")
    for yml in tqdm(yml_file_paths):
        try:
            yml_config = load_yaml_file(yml)
            # Exhaust the generator to check that the config can be
            # traversed end to end; the yielded values are not needed.
            for _ in traverse_config(yml_config):
                pass
        except Exception as exc:
            # Count the failure *before* moving on to the next file, and
            # keep going so that one bad file does not abort the dry run.
            # Catching Exception (not a bare except) lets Ctrl-C through.
            failed_loads += 1
            self.logger.warning("Failed to validate %s: %s", yml, exc)

    summary["YAML Files with Errors"] = failed_loads
    # yml_file_paths is guaranteed non-empty by the check above.
    summary["YAML Failure Rate"] = failed_loads / len(yml_file_paths)

    print("\nDry Run Summary:")
    for key, value in summary.items():
        print(f"{key}: {value}")

def run(self) -> None:
"""Run the draft book generation process."""
if self.config.textbook:
Expand Down