Skip to content

Commit

Permalink
add sanity check and switch back to gpt 4
Browse files Browse the repository at this point in the history
  • Loading branch information
sh-rp committed Sep 17, 2024
1 parent f0607c4 commit 468da12
Showing 1 changed file with 15 additions and 3 deletions.
18 changes: 15 additions & 3 deletions docs/tools/fix_grammar_gpt.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

# constants
BASE_DIR = "../website/docs"
GPT_MODEL = "gpt-4o-2024-05-13"
GPT_MODEL = "gpt-4-turbo"
MAX_CHUNK_SIZE = 4000 # keep this below the model's context window size so that files are not cut off

SYSTEM_PROMPT = """\
Expand Down Expand Up @@ -73,6 +73,9 @@
with open(file_path, "r", encoding="utf-8") as f:
doc = f.readlines()

with open(file_path, "r", encoding="utf-8") as f:
doc_length = len(f.read())

def get_chunk_length(chunk: List[str]) -> int:
count = 0
for line in chunk:
Expand Down Expand Up @@ -111,14 +114,16 @@ def get_chunk_length(chunk: List[str]) -> int:
# sanity test, make sure we still have the full doc
assert doc == functools.reduce(lambda a, b: a + b, chunks)

fmt.note(f"Created {len(chunks)} chunks")

# report the number of chunks together with the doc's character count
fmt.note(f"Created {len(chunks)} chunks for {doc_length} chars")

fixed_chunks: List[str] = []
for chunk in chunks:
client = OpenAI()
input = "".join(chunk)
response = client.chat.completions.create(
seed=1239812398,
seed=123981298,
model=GPT_MODEL,
messages=[
{"role": "system", "content": SYSTEM_PROMPT},
Expand All @@ -128,6 +133,13 @@ def get_chunk_length(chunk: List[str]) -> int:
)
fixed_chunks.append(response.choices[0].message.content) # type: ignore


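# sanity check: make sure GPT did not swallow part of the doc; the fixed text must keep at least 90% of the original character count
# NOTE(review): an empty file gives doc_length == 0 and would raise ZeroDivisionError below unless it is filtered out earlier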
fixed_doc_length = functools.reduce(lambda count, chunk: count + len(chunk), fixed_chunks, 0)
if fixed_doc_length / doc_length < 0.9:
fmt.error("Doc length reduced too much during processing, skipping saving, please check manually")
continue

with open(file_path, "w", encoding="utf-8") as f:
for c in fixed_chunks:
f.write(c)
Expand Down

0 comments on commit 468da12

Please sign in to comment.