From 65af38132e7ab1f0bdea397def0ce4a77d4b1ec6 Mon Sep 17 00:00:00 2001
From: Greg Hogue
Date: Tue, 10 Dec 2024 13:18:54 -0500
Subject: [PATCH] script to export chat records from db

---
Reviewer note: re-flowed from a copy whose newlines had been collapsed. The
indentation of the context lines in the verify_imports.py hunk is a best
guess, and the blob id on the bin/export_records.py "index" line predates the
revisions made to that file below, so apply with
`git apply --ignore-whitespace`.

 .github/actions/verify_imports.py |   1 +
 .gitignore                        |   1 +
 bin/export_records.py             | 108 ++++++++++++++++++++++++++++++
 3 files changed, 110 insertions(+)
 create mode 100644 bin/export_records.py

diff --git a/.github/actions/verify_imports.py b/.github/actions/verify_imports.py
index 8f0e508..6dea206 100644
--- a/.github/actions/verify_imports.py
+++ b/.github/actions/verify_imports.py
@@ -11,6 +11,7 @@
             "chat-chainlit.py",
             "chat-fastapi.py",
             "embeddings_manager",
+            "export_records.py",
         ],
     )
 )
diff --git a/.gitignore b/.gitignore
index 3821deb..8b8e329 100644
--- a/.gitignore
+++ b/.gitignore
@@ -163,3 +163,4 @@ cython_debug/
 !.chainlit/translations/en-US.json
 csv_files/
 embeddings/
+records/
diff --git a/bin/export_records.py b/bin/export_records.py
new file mode 100644
index 0000000..5a55baa
--- /dev/null
+++ b/bin/export_records.py
@@ -0,0 +1,108 @@
+"""Export Chainlit chat messages from Postgres to timestamped CSV files.
+
+Each run writes ``records_<latest createdAt>.csv`` into the records directory,
+containing only the messages created after the newest existing export.
+"""
+
+import csv
+import os
+from argparse import ArgumentParser
+from pathlib import Path
+
+import psycopg
+from dotenv import load_dotenv
+
+load_dotenv()
+
+# NOTE(review): the credentials are interpolated verbatim, so characters that
+# are special in a URI (e.g. "@", "/", ":") would need percent-encoding.
+CHAINLIT_DB_URI = f"postgresql://{os.getenv('POSTGRES_USER')}:{os.getenv('POSTGRES_PASSWORD')}@postgres:5432/{os.getenv('POSTGRES_CHAINLIT_DB')}?sslmode=disable"
+
+CSV_HEADER = ["threadId", "createdAt", "name", "type", "output"]
+RECORD_PREFIX = "records_"
+
+
+def build_query(since_timestamp: str | None) -> str:
+    """Return the SQL that selects chat messages newer than a cutoff.
+
+    The cutoff is never spliced into the SQL text.  When ``since_timestamp``
+    is given, the statement contains a single ``%s`` placeholder that must be
+    bound to it at execute time; when it is ``None`` the statement compares
+    against the empty string and takes no parameters.
+
+    Rows are grouped by thread (threads ordered by their earliest step) and
+    sorted by ``createdAt`` within each thread.
+    """
+    cutoff = "''" if since_timestamp is None else "%s"
+    return f"""
+        SELECT "threadId", "createdAt", name, type, output
+        FROM steps
+        WHERE
+            type IN ('user_message', 'assistant_message') AND
+            "createdAt" > {cutoff}
+        ORDER BY (
+            SELECT MIN("createdAt")
+            FROM steps s
+            WHERE s."threadId" = steps."threadId"
+        ), "createdAt";
+    """
+
+
+def last_record_timestamp(records_dir: Path) -> str | None:
+    """Return the timestamp embedded in the newest export's file name.
+
+    Export names are compared as strings, so this relies on the timestamps
+    sorting lexicographically.  Returns ``None`` when no export exists yet.
+    """
+    stems = [path.stem for path in records_dir.glob(f"{RECORD_PREFIX}*.csv")]
+    if not stems:
+        return None
+    return max(stems).removeprefix(RECORD_PREFIX)
+
+
+def main(records_dir: Path) -> None:
+    """Export messages newer than the last export into ``records_dir``.
+
+    Writes nothing when there are no new messages, so the script can be
+    re-run safely.
+    """
+    records_dir.mkdir(parents=True, exist_ok=True)
+
+    since_timestamp = last_record_timestamp(records_dir)
+    query = build_query(since_timestamp)
+    # Bind the cutoff as a query parameter instead of formatting it into SQL.
+    params = None if since_timestamp is None else (since_timestamp,)
+
+    with psycopg.connect(CHAINLIT_DB_URI) as conn, conn.cursor() as cur:
+        cur.execute(query, params)
+        records = cur.fetchall()
+
+    if not records:
+        # max() below would raise ValueError on an empty result set.
+        print("No new records to export.")
+        return
+
+    latest_timestamp = max(row[1] for row in records)
+    record_file = records_dir / f"{RECORD_PREFIX}{latest_timestamp}.csv"
+
+    # Explicit UTF-8: chat text is arbitrary Unicode and the locale default
+    # encoding may not be able to represent it.
+    with open(record_file, mode="w", newline="", encoding="utf-8") as file:
+        writer = csv.writer(file)
+        writer.writerow(CSV_HEADER)
+        writer.writerows(records)
+
+
+if __name__ == "__main__":
+    parser = ArgumentParser(description="Export new chat records to a CSV file.")
+    # nargs="?" makes the positional optional; without it argparse never
+    # applies the default.
+    parser.add_argument(
+        "records_dir",
+        nargs="?",
+        type=Path,
+        default=Path("records"),
+        help="directory that holds the exported CSV files (default: records)",
+    )
+    args = parser.parse_args()
+    main(**vars(args))