Clean Upvote and Downvote data #3611

Merged · 10 commits · Nov 13, 2024 · Changes from 4 commits
fastchat/serve/monitor/clean_chat_data.py (208 changes: 122 additions & 86 deletions)
@@ -5,13 +5,14 @@
 python3 clean_chat_data.py
 """
 import argparse
-import datetime
 import json
 import os
+import hashlib
 from pytz import timezone
+from functools import partial
+from datetime import datetime, timedelta
 import time

 from tqdm import tqdm
+import multiprocessing as mp

 from fastchat.serve.monitor.basic_stats import NUM_SERVERS
 from fastchat.serve.monitor.clean_battle_data import (
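A note on the import change: dropping `import datetime` in favor of `from datetime import datetime, timedelta` means later calls are spelled `datetime.fromtimestamp(...)` rather than `datetime.datetime.fromtimestamp(...)`. Both resolve to the same classmethod; a minimal sketch:

```python
import datetime as datetime_module
from datetime import datetime

# Same classmethod, two spellings; the PR standardizes on the second.
assert datetime.fromtimestamp(0.0) == datetime_module.datetime.fromtimestamp(0.0)
```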
@@ -26,12 +27,20 @@
 )


-def get_log_files(max_num_files=None):
-    dates = []
-    for month in range(4, 12):
-        for day in range(1, 33):
-            dates.append(f"2023-{month:02d}-{day:02d}")
+def date_range(start="2023-04-01"):
+    start_date = datetime.strptime(start, "%Y-%m-%d").date()
+    end_date = datetime.now().date()
+    delta = end_date - start_date
+    dates = [
+        (start_date + timedelta(days=d)).strftime("%Y-%m-%d")
+        for d in range(delta.days + 2)
+    ]
+
+    return dates
+
+
+def get_log_files(max_num_files=None):
+    dates = date_range()
     filenames = []
     for d in dates:
         for i in range(NUM_SERVERS):
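The `range(delta.days + 2)` runs one day past today, presumably so log files dated ahead of the local clock (for example, servers in other timezones) are still picked up. A sketch of the behavior with "now" pinned for determinism (the `today` parameter is added here for illustration only; the real function uses `datetime.now()`):

```python
from datetime import datetime, timedelta

def demo_date_range(start="2023-04-01", today="2023-04-03"):
    # Mirrors date_range() above, with "now" pinned so the output is stable.
    start_date = datetime.strptime(start, "%Y-%m-%d").date()
    end_date = datetime.strptime(today, "%Y-%m-%d").date()
    delta = end_date - start_date
    return [
        (start_date + timedelta(days=d)).strftime("%Y-%m-%d")
        for d in range(delta.days + 2)
    ]

print(demo_date_range())
# ['2023-04-01', '2023-04-02', '2023-04-03', '2023-04-04']
```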
@@ -44,90 +53,117 @@ def get_log_files(max_num_files=None):
     return filenames


-def clean_chat_data(log_files, action_type):
-    raw_data = []
-    for filename in tqdm(log_files, desc="read files"):
-        for retry in range(5):
-            try:
-                lines = open(filename).readlines()
-                break
-            except FileNotFoundError:
-                time.sleep(2)
-
-        for l in lines:
-            row = json.loads(l)
-            if row["type"] == action_type:
-                raw_data.append(row)
-
-    all_models = set()
-    all_ips = dict()
-    chats = []
-    ct_invalid_conv_id = 0
-    ct_invalid = 0
-    ct_network_error = 0
-    for row in raw_data:
-        try:
-            if action_type in ["chat", "upvote", "downvote"]:
-                state = row["state"]
-                model = row["model"]
-            elif action_type == "leftvote":
-                state = row["states"][0]
-                model = row["states"][0]["model_name"]
-            elif action_type == "rightvote":
-                state = row["states"][1]
-                model = row["states"][1]["model_name"]
-            conversation_id = state["conv_id"]
-        except KeyError:
-            ct_invalid_conv_id += 1
-            continue
-
-        if conversation_id is None:
-            ct_invalid_conv_id += 1
-            continue
-
-        conversation = to_openai_format(state["messages"][state["offset"] :])
-        if not isinstance(model, str):
-            ct_invalid += 1
-            continue
-        model = replace_model_name(model, row["tstamp"])

+def get_action_type_data(filename, action_type):
+    for _ in range(5):
         try:
-            lang_code = detect_language(state["messages"][state["offset"]][1])
-        except IndexError:
-            ct_invalid += 1
-            continue
+            lines = open(filename).readlines()
+            break
+        except FileNotFoundError:
+            time.sleep(2)

+    rows = []
+    for l in lines:
+        row = json.loads(l)
+        if row["type"] == action_type:
+            rows.append(row)
+    return rows
+
+
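For reference, each log line is a JSON object with a `"type"` field, and `get_action_type_data` keeps only the matching rows. A self-contained check with a throwaway file (the row fields here are illustrative). Note one carried-over behavior: if a file never appears, the retry loop exits with `lines` unbound and the function raises `NameError`, exactly as in the original code.

```python
import json
import os
import tempfile

# Two hypothetical log rows; only the first matches action_type="upvote".
rows = [
    {"type": "upvote", "model": "model-a", "ip": "127.0.0.1", "tstamp": 1.0},
    {"type": "chat", "model": "model-a", "ip": "127.0.0.1", "tstamp": 2.0},
]
with tempfile.NamedTemporaryFile("w", suffix="-conv.json", delete=False) as f:
    f.write("\n".join(json.dumps(r) for r in rows) + "\n")

print(len(get_action_type_data(f.name, "upvote")))  # 1
os.remove(f.name)
```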
+def process_data(row, action_type):
+    try:
+        if action_type in ["chat", "upvote", "downvote"]:
+            state = row["state"]
+            model = row["model"]
+        elif action_type == "leftvote":
+            state = row["states"][0]
+            model = row["states"][0]["model_name"]
+        elif action_type == "rightvote":
+            state = row["states"][1]
+            model = row["states"][1]["model_name"]
+        conversation_id = state["conv_id"]
+    except KeyError:
+        return {
+            "ct_invalid_conv_id": 1,
+        }
+
+    if conversation_id is None:
+        return {
+            "ct_invalid_conv_id": 1,
+        }
+
+    conversation = to_openai_format(state["messages"][state["offset"] :])
+    if not isinstance(model, str):
+        return {
+            "ct_invalid": 1,
+        }
+    model = replace_model_name(model, row["tstamp"])
+
+    try:
+        lang_code = detect_language(state["messages"][state["offset"]][1])
+    except IndexError:
+        return {
+            "ct_invalid": 1,
+        }
+
+    if not all(isinstance(x["content"], str) for x in conversation):
+        return {
+            "ct_invalid": 1,
+        }
+
+    messages = "".join([x["content"] for x in conversation]).lower()
+    if NETWORK_ERROR_MSG in messages:
+        return {
+            "ct_network_error": 1,
+        }
+    user_id = hashlib.md5(row["ip"].encode()).hexdigest()
+
+    # Prepare the result data
+    result = dict(
+        conversation_id=conversation_id,
+        model=model,
+        conversation=conversation,
+        turn=len(conversation) // 2,
+        language=lang_code,
+        user_id=user_id,
+        tstamp=row["tstamp"],
+    )

-        if not all(isinstance(x["content"], str) for x in conversation):
-            ct_invalid += 1
-            continue
+    return {
+        "result": result,
+        "model": model,
+    }

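The user ID scheme changes here: the old code numbered IPs sequentially through a shared `all_ips` dict, which cannot be maintained across pool workers, while hashing the IP gives every worker the same answer with no shared state. A sketch of that property:

```python
import hashlib

def user_id_for(ip):
    # Deterministic: any worker process maps the same IP to the same ID.
    return hashlib.md5(ip.encode()).hexdigest()

# 203.0.113.7 is a documentation-range IP, used here purely as an example.
assert user_id_for("203.0.113.7") == user_id_for("203.0.113.7")
print(user_id_for("203.0.113.7")[:12])
```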
-        messages = "".join([x["content"] for x in conversation]).lower()
-        if NETWORK_ERROR_MSG in messages:
-            ct_network_error += 1
-            continue

-        ip = row["ip"]
-        if ip not in all_ips:
-            all_ips[ip] = len(all_ips)
-        user_id = all_ips[ip]

-        chats.append(
-            dict(
-                conversation_id=conversation_id,
-                model=model,
-                conversation=conversation,
-                turn=len(conversation) // 2,
-                language=lang_code,
-                user_id=user_id,
-                tstamp=row["tstamp"],
-            )
-        )

-        all_models.update([model])
+def clean_chat_data(log_files, action_type):
+    with mp.Pool() as pool:
+        # Use partial to pass action_type to get_action_type_data
+        func = partial(get_action_type_data, action_type=action_type)
+        file_data = pool.map(func, log_files, chunksize=1)
+
+    # Filter out None entries; some files may contain no data for this action_type.
+    raw_data = []
+    for data in file_data:
+        raw_data.extend(data)
+    raw_data = [r for r in raw_data if r is not None]
+
+    # Use the multiprocessing Pool
+    with mp.Pool() as pool:
+        func = partial(process_data, action_type=action_type)
+        results = pool.map(func, raw_data, chunksize=1)
+
+    # Aggregate results from child processes
+    ct_invalid_conv_id = sum(
+        [data["ct_invalid_conv_id"] for data in results if "ct_invalid_conv_id" in data]
+    )
+    ct_invalid = sum([data["ct_invalid"] for data in results if "ct_invalid" in data])
+    ct_network_error = sum(
+        [data["ct_network_error"] for data in results if "ct_network_error" in data]
+    )
+    all_models = set([data["model"] for data in results if "model" in data])
+    chats = [data["result"] for data in results if "result" in data]
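The `partial` plus `pool.map` pattern above is how a two-argument worker gets used with `map`, which passes exactly one argument per item. A minimal standalone sketch of the same pattern:

```python
from functools import partial
from multiprocessing import Pool

def scale(x, factor):
    return x * factor

if __name__ == "__main__":
    with Pool() as pool:
        # partial pins factor=10, so pool.map can feed one item at a time.
        print(pool.map(partial(scale, factor=10), [1, 2, 3]))  # [10, 20, 30]
```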
Reviewer comment (Member): Can we merge into one for loop?


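A sketch of what the suggested merge might look like, one pass over `results` replacing the five comprehensions (hypothetical refactor, not what this revision of the PR does):

```python
# Single-pass aggregation over the worker results.
ct_invalid_conv_id = ct_invalid = ct_network_error = 0
all_models = set()
chats = []
for data in results:
    ct_invalid_conv_id += data.get("ct_invalid_conv_id", 0)
    ct_invalid += data.get("ct_invalid", 0)
    ct_network_error += data.get("ct_network_error", 0)
    if "model" in data:
        all_models.add(data["model"])
    if "result" in data:
        chats.append(data["result"])
```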
     chats.sort(key=lambda x: x["tstamp"])
     last_updated_tstamp = chats[-1]["tstamp"]
-    last_updated_datetime = datetime.datetime.fromtimestamp(
+    last_updated_datetime = datetime.fromtimestamp(
         last_updated_tstamp, tz=timezone("US/Pacific")
     ).strftime("%Y-%m-%d %H:%M:%S %Z")
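Passing a `pytz` zone through the `tz=` argument of `fromtimestamp` yields an aware datetime directly (no `localize` step is needed in this direction). Illustrative output for a fixed timestamp:

```python
from datetime import datetime
from pytz import timezone

ts = 1699900000  # an arbitrary example timestamp
stamp = datetime.fromtimestamp(ts, tz=timezone("US/Pacific"))
print(stamp.strftime("%Y-%m-%d %H:%M:%S %Z"))  # 2023-11-13 10:26:40 PST
```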

@@ -161,7 +197,7 @@ def clean_chat_data(log_files, action_type):
     log_files = get_log_files(args.max_num_files)
     chats = clean_chat_data(log_files, args.action_type)
     last_updated_tstamp = chats[-1]["tstamp"]
-    cutoff_date = datetime.datetime.fromtimestamp(
+    cutoff_date = datetime.fromtimestamp(
         last_updated_tstamp, tz=timezone("US/Pacific")
     ).strftime("%Y%m%d")
