feat: caching, ensure necessary_evidence is on devq
zhudotexe committed Feb 13, 2024
1 parent 61bb8b3 commit ebd9bd3
Showing 4 changed files with 26 additions and 9 deletions.
7 changes: 5 additions & 2 deletions README.md
@@ -80,6 +80,7 @@ class DevQuestion:
question: str # the top-level question to answer
decomposition: list[DevSubquestion] # human-written decomposition of the question
answer: dict[str, Primitive] | list[Primitive] | Primitive
+necessary_evidence: list[Evidence]
categories: list[str]


@@ -115,10 +116,12 @@ are two main functions to interface with Wikipedia:

To save on time waiting for requests and computation power (both locally and on Wikipedia's end), this package
aggressively caches retrieved Wikipedia pages. By default, this cache is located in `~/.cache/fanoutqa/wikicache`.
-We provide many cached pages you can prepopulate this cache with, by using the following commands:
+We provide many cached pages (~9GB) you can prepopulate this cache with, by using the following commands:

```shell
-mkdir -p ~/.cache/fanoutqa/wikicache
+mkdir -p ~/.cache/fanoutqa
+wget -O ~/.cache/fanoutqa/wikicache.tar.gz https://datasets.mechanus.zhu.codes/fanoutqa/wikicache.tar.gz
+tar -xzf ~/.cache/fanoutqa/wikicache.tar.gz
```

## Evaluation
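For readers trying the cache out, here is a minimal usage sketch of the behavior described above, assuming the `wiki_search`/`wiki_content` functions from the `fanoutqa/wiki.py` diff below are importable as written (the exact import path is an assumption, not confirmed by this commit):

```python
# Minimal sketch, not part of this commit. Assumes fanoutqa.wiki exposes
# wiki_search and wiki_content as shown in the diff below.
from fanoutqa.wiki import wiki_content, wiki_search

# The first call downloads the page and writes it to
# ~/.cache/fanoutqa/wikicache/<pageid>-dated.md; repeat calls (or pages
# prepopulated by the wget/tar commands above) are read straight from disk.
results = wiki_search("Python (programming language)", results=1)
page_markdown = wiki_content(results[0])
print(page_markdown[:200])
```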
17 changes: 15 additions & 2 deletions fanoutqa/models.py
@@ -2,6 +2,7 @@
from typing import Optional, Union

Primitive = Union[bool, int, float, str]
+AnswerType = Union[dict[str, Primitive], list[Primitive], Primitive]


@dataclass
@@ -32,7 +33,7 @@ class DevSubquestion:
id: str
question: str
decomposition: list["DevSubquestion"]
-answer: Union[dict[str, Primitive], list[Primitive], Primitive]
+answer: AnswerType
"""the answer to this subquestion"""

depends_on: list[str]
@@ -64,7 +65,7 @@ class DevQuestion:
"""the top-level question to answer"""
decomposition: list[DevSubquestion]
"""human-written decomposition of the question"""
-answer: Union[dict[str, Primitive], list[Primitive], Primitive]
+answer: AnswerType
categories: list[str]

@classmethod
@@ -78,6 +79,18 @@ def from_dict(cls, d):
categories=d["categories"],
)

+    @property
+    def necessary_evidence(self) -> list[Evidence]:
+        """A list of all the evidence used by human annotators to answer the question."""
+
+        def walk_evidences(subqs):
+            for subq in subqs:
+                if subq.evidence:
+                    yield subq.evidence
+                yield from walk_evidences(subq.decomposition)
+
+        return list(walk_evidences(self.decomposition))


@dataclass
class TestQuestion:
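A short sketch of the new `necessary_evidence` property in use. The `load_dev` loader is an assumption taken from the package README rather than from this diff:

```python
# Sketch only; load_dev and the shape of Evidence are assumptions.
import fanoutqa

q = fanoutqa.load_dev()[0]

# necessary_evidence walks the human-written decomposition depth-first and
# yields each subquestion's evidence, so the list preserves tree order and
# may contain the same page twice if two subquestions cite it.
for ev in q.necessary_evidence:
    print(ev)
```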
2 changes: 2 additions & 0 deletions fanoutqa/utils.py
@@ -10,6 +10,8 @@

AnyPath: TypeAlias = Union[str, bytes, os.PathLike]
PKG_ROOT = Path(__file__).parent
+CACHE_DIR = Path("~/.cache/fanoutqa")
+CACHE_DIR.mkdir(exist_ok=True, parents=True)
DATASET_EPOCH = datetime.datetime(year=2023, month=11, day=20, tzinfo=datetime.timezone.utc)
"""The day before which to get revisions from Wikipedia, to ensure that the contents of pages don't change over time."""

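One caveat on the two added lines above: `pathlib.Path` does not expand `~`, so as written `CACHE_DIR.mkdir(...)` creates a directory literally named `~` under the current working directory rather than using the home cache. A sketch of the more robust spelling (not part of this commit) calls `expanduser()` first:

```python
from pathlib import Path

# expanduser() resolves "~" to the user's home directory; without it,
# mkdir() creates a directory literally named "~" in the working directory.
CACHE_DIR = Path("~/.cache/fanoutqa").expanduser()
CACHE_DIR.mkdir(exist_ok=True, parents=True)
```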
9 changes: 4 additions & 5 deletions fanoutqa/wiki.py
@@ -3,16 +3,15 @@
import functools
import logging
import urllib.parse
-from pathlib import Path

import httpx

from .models import Evidence
-from .utils import DATASET_EPOCH, markdownify
+from .utils import CACHE_DIR, DATASET_EPOCH, markdownify

USER_AGENT = "fanoutqa/1.0.0 ([email protected])"
-CACHE_DIR = Path("~/.cache/fanoutqa/wikicache")
-CACHE_DIR.mkdir(exist_ok=True, parents=True)
+WIKI_CACHE_DIR = CACHE_DIR / "wikicache"
+WIKI_CACHE_DIR.mkdir(exist_ok=True, parents=True)

log = logging.getLogger(__name__)
wikipedia = httpx.Client(base_url="https://en.wikipedia.org/w/api.php", headers={"User-Agent": USER_AGENT})
@@ -70,7 +69,7 @@ def wiki_search(query: str, results=10) -> list[Evidence]:
def wiki_content(doc: Evidence) -> str:
"""Get the page content in markdown, including tables and infoboxes, appropriate for displaying to an LLM."""
# get the cached content, if available
-cache_filename = WIKI_CACHE_DIR / f"{doc.pageid}-dated.md"
+cache_filename = WIKI_CACHE_DIR / f"{doc.pageid}-dated.md"
if cache_filename.exists():
return cache_filename.read_text()

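The rest of `wiki_content` is cut off in this view, but the visible lines follow a standard read-through cache. A self-contained sketch of that pattern under the commit's layout (the `fetch` callable here stands in for the module's actual Wikipedia request code, which this diff does not show):

```python
from pathlib import Path
from typing import Callable

WIKI_CACHE_DIR = Path("~/.cache/fanoutqa").expanduser() / "wikicache"
WIKI_CACHE_DIR.mkdir(exist_ok=True, parents=True)

def cached_page_content(pageid: int, fetch: Callable[[int], str]) -> str:
    """Read-through cache: return the cached markdown for a page if present,
    otherwise fetch it and persist the result for the next call."""
    cache_filename = WIKI_CACHE_DIR / f"{pageid}-dated.md"
    if cache_filename.exists():
        return cache_filename.read_text()
    content = fetch(pageid)  # placeholder for the real Wikipedia request
    cache_filename.write_text(content)
    return content
```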
