From 70b7e7c73afd514f51271df3add368d170ca316f Mon Sep 17 00:00:00 2001 From: Debanjum Date: Tue, 26 Nov 2024 15:35:23 -0800 Subject: [PATCH 1/2] Improve load of complex json objects. Use it to pick tool, run code Gemini doesn't work well when trying to output json objects. Using it to output raw json strings with complex, multi-line structures requires more intense clean-up of raw json string for parsing --- src/khoj/processor/conversation/utils.py | 41 ++++++++++++++++++++++++ src/khoj/processor/tools/run_code.py | 6 ++-- src/khoj/routers/research.py | 6 ++-- tests/test_conversation_utils.py | 12 +++++++ 4 files changed, 57 insertions(+), 8 deletions(-) diff --git a/src/khoj/processor/conversation/utils.py b/src/khoj/processor/conversation/utils.py index 21a95a290..efd3c51df 100644 --- a/src/khoj/processor/conversation/utils.py +++ b/src/khoj/processor/conversation/utils.py @@ -5,6 +5,7 @@ import mimetypes import os import queue +import re import uuid from dataclasses import dataclass from datetime import datetime @@ -538,6 +539,46 @@ def clean_code_python(code: str): return code.strip().removeprefix("```python").removesuffix("```") +def load_complex_json(json_str): + """ + Preprocess a raw JSON string to escape unescaped double quotes within value strings, + while preserving the JSON structure and already escaped quotes. + """ + + def replace_unescaped_quotes(match): + # Get the content between colons and commas/end braces + content = match.group(1) + # Replace unescaped double, single quotes that aren't already escaped + # Uses negative lookbehind to avoid replacing already escaped quotes + # Replace " with \" + processed_dq = re.sub(r'(? 
Date: Tue, 26 Nov 2024 16:16:00 -0800 Subject: [PATCH 2/2] Fall back to json5 loader if json.loads cannot parse complex json str The JSON5 spec is more flexible. Try to load using a fast json5 parser if the stricter json.loads from the standard library can't load the raw complex json string into a Python dictionary/list --- pyproject.toml | 1 + src/khoj/processor/conversation/utils.py | 12 +++++++----- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index f02b55599..59adf9527 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -88,6 +88,7 @@ dependencies = [ "anthropic == 0.26.1", "docx2txt == 0.8", "google-generativeai == 0.8.3", + "pyjson5 == 1.6.7", ] dynamic = ["version"] diff --git a/src/khoj/processor/conversation/utils.py b/src/khoj/processor/conversation/utils.py index efd3c51df..079f3fea3 100644 --- a/src/khoj/processor/conversation/utils.py +++ b/src/khoj/processor/conversation/utils.py @@ -15,6 +15,7 @@ from typing import Any, Callable, Dict, List, Optional import PIL.Image +import pyjson5 import requests import tiktoken import yaml @@ -565,17 +566,18 @@ def replace_unescaped_quotes(match): processed = re.sub(pattern, replace_unescaped_quotes, cleaned) # See which json loader can load the processed JSON as valid - errors = "" - json_loaders_to_try = [json.loads] + errors = [] + json_loaders_to_try = [json.loads, pyjson5.loads] for loads in json_loaders_to_try: try: return loads(processed) - except json.JSONDecodeError as e: - errors += f"\n\n{e}" + except (json.JSONDecodeError, pyjson5.Json5Exception) as e: + errors.append(f"{type(e).__name__}: {str(e)}") # If all loaders fail, raise the aggregated error raise ValueError( - f"Failed to load JSON with error: {errors}\n\nWhile attempting to load this cleaned JSON:\n{processed}" + f"Failed to load JSON with errors: {'; '.join(errors)}\n\n" + f"While attempting to load this cleaned JSON:\n{processed}" )