Skip to content

Commit

Permalink
Merge branch 'staging' into hotfix/pre-release
Browse files Browse the repository at this point in the history
  • Loading branch information
steffencruz authored Jan 22, 2024
2 parents d23461f + 22dbb05 commit bd4e52a
Show file tree
Hide file tree
Showing 17 changed files with 282 additions and 167 deletions.
7 changes: 4 additions & 3 deletions prompting/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,9 @@ def __init__(
begin_conversation=True,
):
if persona is None:
self.persona = create_persona()
persona = create_persona()

self.persona = persona
self.task = task
self.llm_pipeline = llm_pipeline

Expand Down Expand Up @@ -68,11 +69,11 @@ def create_challenge(self) -> str:
t0 = time.time()

cleaner = None
if hasattr(self.task, 'cleaning_pipeline'):
if hasattr(self.task, 'cleaning_pipeline'):
cleaner = CleanerPipeline(
cleaning_pipeline=self.task.cleaning_pipeline
)

self.challenge = super().query(message="Ask a question related to your goal", cleaner=cleaner)
self.challenge = self.task.format_challenge(self.challenge)
self.challenge_time = time.time() - t0
Expand Down
2 changes: 1 addition & 1 deletion prompting/rewards/reward.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ def __post_init__(self):
if self.rewards.shape != self.timings.shape:
raise ValueError(f"rewards.shape {self.rewards.shape} != timings.shape {self.timings.shape}")

self.rewards_normalized = (self.rewards-self.rewards.min())/(self.rewards.max()-self.rewards.min())
self.rewards_normalized = (self.rewards-self.rewards.min())/(self.rewards.max()-self.rewards.min()+1e-6)


class BaseRewardModel(ABC):
Expand Down
16 changes: 8 additions & 8 deletions prompting/tasks/date_qa.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,26 +11,26 @@ class DateQuestionAnsweringTask(Task):
penalty_definition = []

def __init__(self, llm_pipeline, context, create_reference=True):


self.name = "date-based question answering"
self.desc = "get help answering a specific date-based question"
self.goal = "to get the answer to the following date-based question"

self.cleaning_pipeline = [
dict(name="remove_quotes"),
dict(name="remove_roles"),
]
self.context = context

# The section is in {"Births", "Deaths", "Events"}
section = self.context["section"]
year, _, *event = self.context["event"].split()
self.context["event"] = " ".join(event)
event = " ".join(event)

options = {'Births':' was born ', 'Deaths':' died ', 'Events':' '}
self.query = self.context["event"].strip(".") + options[section] + 'on what exact date?'

self.query = event.strip(".") + options[section] + 'on what exact date?'
self.reference = self.context["date"] + ", " + year.strip()

self.topic = section
Expand Down
2 changes: 2 additions & 0 deletions prompting/tasks/debugging.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,8 @@ def __init__(self, llm_pipeline, context, create_reference=True):
self.topic=self.context["repo_name"]
self.subtopic=self.context["path"]
self.tags=[self.context["language"]]
self.static_reference = True
self.static_query = True

def generate_query(
self,
Expand Down
1 change: 1 addition & 0 deletions prompting/tasks/summarization.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,4 +72,5 @@ def __init__(self, llm_pipeline: Pipeline, context: str, create_reference=True):
self.topic = self.context["title"]
self.subtopic = self.context["categories"][0]
self.tags = self.context["categories"]
self.static_query = True

53 changes: 37 additions & 16 deletions prompting/tools/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

import time
import random
import string
from typing import Dict
import requests
import datetime
Expand Down Expand Up @@ -66,7 +67,7 @@ class CodingDataset:
}

def __init__(
self, dataset_id="codeparrot/github-code", seed=None, languages=None
self, dataset_id="codeparrot/github-code", seed=None, languages=None, buffer_size=10000
):
if seed is None:
seed = random.randint(0, 1000)
Expand All @@ -83,7 +84,7 @@ def __init__(
split="train",
streaming=True,
languages=self.languages,
).shuffle(seed=seed, buffer_size=10000)
).shuffle(seed=seed, buffer_size=buffer_size)
)

def next(self, min_lines=5, max_lines=100):
Expand Down Expand Up @@ -214,12 +215,15 @@ def get_wikipedia_article_content(self, title: str) -> str:

return text

def next(self, subset=False, chunk_sep="\n", n_chunks=None):
def next(self, subset=False, chunk_sep="\n", n_chunks=None, info=None):
bt.logging.debug("Retrieving data from prompting.dataset...")
tries = 0
t0 = time.time()
while tries < self.max_tries:
info = self.get_random_wikipedia_article()

if info is None:
info = self.get_random_wikipedia_article()

info["sections"] = self.get_wikipedia_article_content(info["title"])
text = "\n".join(info["sections"].values())
tries += 1
Expand Down Expand Up @@ -316,8 +320,10 @@ def next(self):


class DateQADataset:
def __init__(self, max_tries: int = 10):
def __init__(self, max_tries: int = 10, seed=None):
self.max_tries = max_tries
self.seed = seed
self.rng = random.Random(seed)

def get_random_event(self) -> Dict:
tries = 0
Expand All @@ -327,11 +333,11 @@ def get_random_event(self) -> Dict:

# Step 1: Generate a random date
year = 2000
month = random.randint(1, 12)
month = self.rng.randint(1, 12)

max_days = 31 if month in (1, 3, 5, 7, 8, 10, 12) else 30
max_days = max_days if month != 2 else 28 + int(year % 4 == 0)
day = random.randint(1, max_days)
day = self.rng.randint(1, max_days)
random_date = datetime.date(year, month, day)

# Step 2: Format the date for Wikipedia URL
Expand All @@ -352,7 +358,7 @@ def get_random_event(self) -> Dict:
section = soup.find("span", id=name)
if section:
available_sections.append(name)
section = random.choice(available_sections)
section = self.rng.choice(available_sections)
# Find the events section
events_list = soup.find(
"span", id=section
Expand All @@ -368,7 +374,7 @@ def get_random_event(self) -> Dict:
selected_event = random.choice(events)
links = selected_event.find_all("a")
if links:
link = random.choice(links)
link = self.rng.choice(links)

return {
"date": random_date.strftime("%B %d"),
Expand All @@ -386,9 +392,16 @@ def next(self):


class MathDataset:

topics_list = mathgenerator.getGenList()


def __init__(self, seed=None):

# NOTE: Unfortunately, mathgenerator does not provide a way to seed the random number generator and get the same problem every time

self.seed = seed
self.rng = random.Random(seed)

def random_problem(self, parse):
if parse:
parseable_list = [
Expand Down Expand Up @@ -443,17 +456,25 @@ def random_problem(self, parse):
123,
]
options = parseable_list
choice = random.choice((options))
choice = self.rng.choice((options))
#TODO: When the solution contains the symbol x we should specify the x value and substitute it in the solution
problem, solution = mathgenerator.genById(choice)
_, subtopic, _, _, topic, _ = self.topics_list[choice]

solution = parse_latex(
subs = {}
# check if solution contains letters
if 'x' in solution:
subs['x'] = 10
bt.logging.warning('Coercing a symbolic expression to a numeric expression by substituting x=10')

# BUG: parse latex assumes that all letters are variables and so solutions like $No$ are interpreted as 'N * o'
solution_numeric = parse_latex(
str(solution).replace("$", "").strip()
).evalf()
return {"problem": problem, "solution": solution, "topic": topic, "subtopic": subtopic}
).evalf(subs=subs)
return {"problem": problem, "solution": solution_numeric, "solution_raw": solution, "topic": topic, "subtopic": subtopic}
else:
options = mathgenerator.getGenList()
choice = random.choice(range(len(options)))
choice = self.rng.choice(range(len(options)))
problem, solution = mathgenerator.genById(choice)
_, subtopic, _, _, topic, _ = self.topics_list[choice]
return {"problem": problem, "solution": solution, "topic": topic, "subtopic": subtopic}
Expand Down
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,5 @@ rouge
scipy==1.10.1
sentencepiece
wandb==0.15.10
tenacity
tenacity
antlr4-python3-runtime==4.11
2 changes: 1 addition & 1 deletion run.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/bin/bash

# Initialize variables
script="neurons/validators/validator.py"
script="neurons/validator.py"
autoRunLoc=$(readlink -f "$0")
proc_name="text_prompt_validators_main_process"
args=()
Expand Down
Empty file added tests/fixtures/__init__.py
Empty file.
51 changes: 51 additions & 0 deletions tests/fixtures/dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@

from prompting.tools import MockDataset, CodingDataset, WikiDataset, StackOverflowDataset, DateQADataset, MathDataset

# Dataset classes covered by these fixtures. Commented-out entries are
# currently left out of the list.
DATASETS = [
    # MockDataset,
    CodingDataset,
    WikiDataset,
    # StackOverflowDataset,
    DateQADataset,
    MathDataset,
]

# Fixed Wikipedia article record used as the `info` argument when building
# WIKI_CONTEXT, so no random article has to be selected.
# NOTE(review): several category names look clipped (e.g. "Living peopl",
# "nvergence Movemen"); they are kept verbatim here -- confirm whether the
# truncation is intentional.
WIKI_ARTICLE = {
    "title": "Emilio Alvarez (bishop)",
    "url": "https://en.wikipedia.org/wiki/Emilio_Alvarez_(bishop)",
    "length": 8185,
    "extract": (
        "<p><b>Emilio Alvarez</b> (born January 16) is a religious leader in "
        "the United States, and founding bishop of the Union of Charismatic "
        "Orthodox Churches. He is also the founding director of the Institute "
        "for Paleo-Orthodox Christian Studies (formerly the certificate in "
        "Convergence Studies Program at New York Theological Seminary)."
    ),
    "backlinks": 7,
    "categories": [
        "21st-century American bishops",
        "21st-century Puerto Rican peopl",
        "nvergence Movemen",
        "Living peopl",
        "People of Afro–Puerto Rican descen",
        "Puerto Rican bishops",
        "Religious leaders from New York (state)",
        "Short description matches Wikid",
        "Writers from New York (state)",
        "Year of birth missing (living people)",
    ],
}

# One sample context per dataset, built once at import time. Importing this
# module therefore runs every dataset's `next()` call below.

# Passing `info` makes WikiDataset.next() use the fixed article above instead
# of selecting a random one.
WIKI_CONTEXT = WikiDataset().next(info=WIKI_ARTICLE)
# buffer_size is forwarded to the streaming dataset's shuffle buffer; a small
# value is used here.
CODING_CONTEXT = CodingDataset(buffer_size=10).next()
# The seed drives MathDataset's own RNG (which generator is chosen). Per the
# NOTE in MathDataset, mathgenerator itself cannot be seeded, so the generated
# problem may still differ between runs.
MATH_CONTEXT = MathDataset(seed=123).next()
# NOTE(review): DateQADataset seeds its month/day/section/link choices, but the
# event itself still appears to be picked with the module-level
# random.choice -- confirm whether seed=123 actually pins this context.
DATEQA_CONTEXT = DateQADataset(seed=123).next()

# Lookup from dataset class to the sample context generated above.
CONTEXTS = {
WikiDataset: WIKI_CONTEXT,
CodingDataset: CODING_CONTEXT,
MathDataset: MATH_CONTEXT,
DateQADataset: DATEQA_CONTEXT,
}


# Set of field names associated with each dataset's context dict.
# presumably the keys a context returned by `next()` must contain -- verify
# against the tests that consume this mapping.
CONTEXT_FIELDS = {
    WikiDataset: {
        "text",
        "title",
        "categories",
        "url",
        "sections",
        "fetch_time",
        "length",
        "backlinks",
        "extract",
    },
    CodingDataset: {
        "code",
        "repo_name",
        "path",
        "language",
        "size",
        "fetch_time",
        "license",
    },
    MathDataset: {
        "problem",
        "solution",
        "topic",
        "subtopic",
        "fetch_time",
        "solution_raw",
    },
    DateQADataset: {
        "section",
        "event",
        "date",
        "next_page",
        "fetch_time",
    },
}
3 changes: 3 additions & 0 deletions tests/fixtures/llm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from prompting.mock import MockPipeline

# Shared mock pipeline for tests.
# presumably the string is the canned text the mock produces -- confirm
# against prompting.mock.MockPipeline.
_MOCK_PIPELINE_TEXT = "This is just another test."
LLM_PIPELINE = MockPipeline(_MOCK_PIPELINE_TEXT)
20 changes: 20 additions & 0 deletions tests/fixtures/task.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from prompting.tasks import Task, QuestionAnsweringTask, SummarizationTask, DebuggingTask, MathTask, DateQuestionAnsweringTask
from .dataset import WIKI_CONTEXT, CODING_CONTEXT, MATH_CONTEXT, DATEQA_CONTEXT

# Task classes covered by these fixtures.
TASKS = [
    QuestionAnsweringTask,
    SummarizationTask,
    DebuggingTask,
    MathTask,
    DateQuestionAnsweringTask,
]

# Sample context handed to each task class; the two Wikipedia-based tasks
# share the same context.
# TODO: Make fully deterministic
CONTEXTS = {
    # Wikipedia-backed tasks
    QuestionAnsweringTask: WIKI_CONTEXT,
    SummarizationTask: WIKI_CONTEXT,
    # Code-backed task
    DebuggingTask: CODING_CONTEXT,
    # Math and date question tasks
    MathTask: MATH_CONTEXT,
    DateQuestionAnsweringTask: DATEQA_CONTEXT,
}

Loading

0 comments on commit bd4e52a

Please sign in to comment.