
feat: introduce UI Improvements and experimental compare feature #24

Merged · 24 commits · Nov 10, 2024
Binary file modified: app/__pycache__/__init__.cpython-311.pyc (binary diff not shown)
59 changes: 59 additions & 0 deletions app/api/routes.py
@@ -26,6 +26,7 @@
from app.query.ask import AskService
from app.ingestion.pdf_processing import get_pdf_page_count
from app.query.voice_chat_service import intro_message, voice_chat_with_data
from app.compare.compare import compare_indexes

def are_operations_restricted():
return os.getenv('RESTRICT_OPERATIONS', 'false').lower() == 'true'
@@ -51,6 +52,7 @@
self._add_ask_route()
self._add_voice_chat_route()
self._add_intro_route()
self._add_compare_route()

return self.app

@@ -77,6 +79,63 @@
self.app.route('/chat', methods=['POST'])(self._chat)
self.app.route('/refine', methods=['POST'])(self._refine)

def _add_compare_route(self):
"""Add the comparison endpoint."""
self.app.route('/compare', methods=['POST'])(self._compare)

def _compare(self):
"""Handle comparison requests with phased execution."""
try:
user_id = get_user_id(request)
data = request.json

phase = data.get('phase')
if phase not in ['generate', 'refine', 'execute']:
return jsonify({"error": "Invalid phase. Must be 'generate', 'refine', or 'execute'"}), 400

if not isinstance(data.get('indexes', []), list) or len(data.get('indexes', [])) != 2:
return jsonify({"error": "Exactly 2 indexes must be provided"}), 400

is_restricted = data.get('is_restricted', True)
for index_name in data['indexes']:
index_manager = self._get_index_manager(user_id, index_name, is_restricted)
if isinstance(index_manager, tuple):
return index_manager

if phase == 'refine':
if not data.get('requirements') or not data.get('feedback'):
return jsonify({
"error": "Refinement phase requires 'requirements' and 'feedback'"
}), 400

elif phase == 'execute':
if not data.get('requirements'):
return jsonify({
"error": "Execute phase requires 'requirements'"
}), 400

loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)

try:
result = loop.run_until_complete(compare_indexes(data, user_id))

if isinstance(result, Response):
return result

return jsonify(result)

except Exception as e:
current_app.logger.error(f"Comparison error: {str(e)}")
return jsonify({"error": str(e)}), 500

Check warning — Code scanning / CodeQL: Information exposure through an exception (Medium). Stack trace information flows to this location and may be exposed to an external user.
Copilot Autofix AI 3 months ago

To fix the problem, ensure that detailed exception messages are not returned to the user: log the detailed error on the server and return a generic error message in the response instead.

Specifically, we will:

  1. Modify the exception handling code in the _compare method to log the detailed error message.
  2. Return a generic error message to the user instead of the detailed exception message.
Suggested changeset 1 — app/api/routes.py (Autofix patch)

Run the following command in your local git repository to apply this patch:
cat << 'EOF' | git apply
diff --git a/app/api/routes.py b/app/api/routes.py
--- a/app/api/routes.py
+++ b/app/api/routes.py
@@ -129,3 +129,3 @@
                 current_app.logger.error(f"Comparison error: {str(e)}")
-                return jsonify({"error": str(e)}), 500
+                return jsonify({"error": "An internal error has occurred."}), 500
             finally:
@@ -135,3 +135,3 @@
             current_app.logger.error(f"Comparison request error: {str(e)}")
-            return jsonify({"error": str(e)}), 500
+            return jsonify({"error": "An internal error has occurred."}), 500
 
EOF
finally:
loop.close()

except Exception as e:
current_app.logger.error(f"Comparison request error: {str(e)}")
return jsonify({"error": str(e)}), 500

Check warning — Code scanning / CodeQL: Information exposure through an exception (Medium). Stack trace information flows to this location and may be exposed to an external user.
Copilot Autofix AI 3 months ago

To fix the problem, ensure that detailed exception messages are not exposed to the end user: log the detailed error on the server with current_app.logger.error and return a generic error message in the JSON response.

Suggested changeset 1 — app/api/routes.py (Autofix patch)

Run the following command in your local git repository to apply this patch:
cat << 'EOF' | git apply
diff --git a/app/api/routes.py b/app/api/routes.py
--- a/app/api/routes.py
+++ b/app/api/routes.py
@@ -135,3 +135,3 @@
             current_app.logger.error(f"Comparison request error: {str(e)}")
-            return jsonify({"error": str(e)}), 500
+            return jsonify({"error": "An internal error has occurred."}), 500
 
EOF


def _add_voice_chat_route(self):
@self.app.route('/voice_chat', methods=['POST'])
def voice_chat():
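The validation rules the _compare handler enforces can be mirrored client-side before a request is ever sent. A minimal, dependency-free sketch — the payload shape and phase names are taken from the diff above, but the helper function itself is ours:

```python
def build_compare_payload(phase, indexes, requirements=None, feedback=None):
    """Build a /compare request body, mirroring the server-side checks
    in the _compare route (hypothetical client-side helper)."""
    if phase not in ('generate', 'refine', 'execute'):
        raise ValueError("phase must be 'generate', 'refine', or 'execute'")
    if not isinstance(indexes, list) or len(indexes) != 2:
        raise ValueError("exactly 2 indexes must be provided")
    if phase == 'refine' and not (requirements and feedback):
        raise ValueError("refine phase requires 'requirements' and 'feedback'")
    if phase == 'execute' and not requirements:
        raise ValueError("execute phase requires 'requirements'")

    payload = {'phase': phase, 'indexes': indexes, 'is_restricted': True}
    if requirements is not None:
        payload['requirements'] = requirements
    if feedback is not None:
        payload['feedback'] = feedback
    return payload
```

Failing fast on the client avoids a round trip that the server would reject with a 400 anyway.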
48 changes: 48 additions & 0 deletions app/compare/compare.py
@@ -0,0 +1,48 @@
import json
import logging
from typing import Dict, Any, AsyncGenerator
from flask import Response, jsonify
from .comparison_service import ComparisonService
from .utils import convert_async_to_sync
from .comparison_models import ComparisonRequest

logger = logging.getLogger(__name__)

async def handle_comparison_request(data: Dict[str, Any], user_id: str) -> AsyncGenerator[str, None]:
"""Handle different phases of comparison process."""
try:
service = ComparisonService()
request = ComparisonRequest(**data)

if request.phase == "generate":
async for event in service.generate_requirements(data, user_id):
yield event
elif request.phase == "execute":
async for event in service.execute_comparison(data, user_id):
yield event
else:
yield json.dumps({
"type": "error",
"content": f"Invalid phase specified: {request.phase}"
}) + "\n"

except Exception as e:
logger.error(f"Error in handle_comparison_request: {str(e)}")
yield json.dumps({
"type": "error",
"content": f"Request processing error: {str(e)}"
}) + "\n"

async def compare_indexes(data: Dict[str, Any], user_id: str) -> Response:
"""Entry point for comparison functionality."""
try:
return Response(
convert_async_to_sync(handle_comparison_request(data, user_id)),
content_type='application/x-ndjson'
)
except Exception as e:
logger.error(f"Error in compare_indexes: {str(e)}")
return jsonify({
"error": "Comparison failed",
"details": "An internal error has occurred. Please try again later."
}), 500
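compare_indexes wraps the async generator with convert_async_to_sync from .utils, which is not part of this diff. One plausible implementation of such an adapter — a sketch only, assuming the helper simply drives the async generator on a private event loop so Flask can stream it synchronously:

```python
import asyncio
from typing import AsyncGenerator, Iterator

def convert_async_to_sync(agen: AsyncGenerator[str, None]) -> Iterator[str]:
    """Drive an async generator to completion on a dedicated event loop,
    yielding each item synchronously (a sketch; the real .utils helper
    may differ)."""
    loop = asyncio.new_event_loop()
    try:
        while True:
            try:
                # Pull the next chunk out of the async generator.
                yield loop.run_until_complete(agen.__anext__())
            except StopAsyncIteration:
                break
    finally:
        loop.close()
```

Handing Flask a plain iterator like this lets Response stream each NDJSON line as it is produced.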
98 changes: 98 additions & 0 deletions app/compare/comparison_executor.py
@@ -0,0 +1,98 @@
import json
import logging
from typing import Dict, Any, List, AsyncGenerator
from openai import AzureOpenAI

from app.query.graphrag_query import GraphRagQuery
from app.integration.graphrag_config import GraphRagConfig
from .comparison_models import ComparisonRequest, Requirement, ComparisonResult, SourceResult, CitationInfo
from .response_processor import ResponseProcessor
from .comparison_index_validator import validate_index_access

logger = logging.getLogger(__name__)

class ComparisonExecutor:
def __init__(self, config: Dict[str, Any], client: AzureOpenAI, response_processor: ResponseProcessor):
self.config = config
self.client = client
self.response_processor = response_processor

async def execute(self, data: Dict[str, Any], user_id: str) -> AsyncGenerator[str, None]:
try:
request = ComparisonRequest(**data)
if not request.requirements:
raise ValueError("No requirements provided for execution")

for requirement in request.requirements:
yield await self._process_requirement(requirement, request, user_id)

except Exception as e:
logger.error(f"Error executing comparison: {str(e)}")
yield json.dumps({"type": "error", "content": str(e)}) + "\n"

async def _process_requirement(self, requirement: Dict[str, Any], request: ComparisonRequest, user_id: str) -> str:
req_obj = Requirement(**requirement)
result = ComparisonResult(
requirement=req_obj,
sources={}
)

for index_name in request.indexes:
try:
source_result = await self._process_index(index_name, req_obj, user_id)
result.sources[index_name] = source_result

except Exception as e:
logger.error(f"Error querying {index_name} for requirement '{req_obj.description}': {str(e)}")
result.sources[index_name] = SourceResult(
response=f"Error: {str(e)}",
simplified_value=None,
citations=[]
)

return json.dumps({
"type": "comparison_result",
"content": result.dict()
}) + "\n"

async def _process_index(self, index_name: str, requirement: Requirement, user_id: str) -> SourceResult:
container_name, data_source = await validate_index_access(user_id, index_name, self.config)

config = GraphRagConfig(index_name, user_id, False)
graph_rag = GraphRagQuery(config)

query = (
f"Regarding this requirement: {requirement.description}\n"
f"What is the current status or value? Provide a clear, specific answer."
)

response, context = await graph_rag.global_query(query)

reviewed_response, citations = await self.response_processor.process_citations(
response,
context,
index_name,
data_source
)

simplified_value = await self.response_processor.simplify_response(
requirement.metric_type,
query,
response
)

citation_infos = [
CitationInfo(
text=citation.get('text', ''),
document_id=citation['file'],
content=citation.get('content', ''),
index_name=index_name
)
for citation in citations
]

return SourceResult(
response=response,
simplified_value=simplified_value,
citations=citation_infos
)
30 changes: 30 additions & 0 deletions app/compare/comparison_index_validator.py
@@ -0,0 +1,30 @@
import logging
from typing import Dict, Any, Tuple

from app.integration.index_manager import create_index_manager, ContainerNameTooLongError
from app.integration.azure_aisearch import create_data_source

logger = logging.getLogger(__name__)

async def validate_index_access(user_id: str, index_name: str, config: Dict[str, Any]) -> Tuple[str, Dict[str, Any]]:
"""Validate index access and return container name and data source."""
try:
index_manager = create_index_manager(user_id, index_name, False)

if not index_manager.user_has_access():
raise ValueError("Unauthorized access")

container_name = index_manager.get_ingestion_container()

data_source = create_data_source(
config['SEARCH_SERVICE_ENDPOINT'],
config['SEARCH_SERVICE_API_KEY'],
container_name
)

return container_name, data_source

except ContainerNameTooLongError as e:
raise ValueError(f"Container name too long: {str(e)}")
except Exception as e:
raise ValueError(f"Error accessing index: {str(e)}")
39 changes: 39 additions & 0 deletions app/compare/comparison_models.py
@@ -0,0 +1,39 @@
from typing import Dict, Any, List, Optional
from pydantic import BaseModel, Field

class Requirement(BaseModel):
description: str = Field(..., description="The detailed description of what needs to be compared")
metric_type: str = Field(..., description="Type of metric: 'yes_no' or 'numeric'")
metric_unit: Optional[str] = Field(None, description="Unit for numeric metrics (e.g., 'hours', '%', 'CHF')")

class RequirementList(BaseModel):
requirements: List[Requirement] = Field(..., description="List of requirements to compare")

class CitationInfo(BaseModel):
text: str = Field(..., description="The cited text")
document_id: str = Field(..., description="Source document identifier")
content: str = Field(..., description="Full context of the citation")
index_name: str = Field(..., description="Name of the index this citation is from")

class ComparisonRequest(BaseModel):
phase: str = Field(..., description="Phase of comparison: 'generate', 'refine', or 'execute'")
num_requirements: int = Field(default=10, description="Number of requirements to generate")
role: str = Field(default="auditor", description="Role performing the comparison")
comparison_subject: str = Field(default="employment conditions", description="Subject being compared")
comparison_target: str = Field(default="Hospital", description="Target entity type being compared")
indexes: List[str] = Field(..., min_items=2, max_items=2, description="Exactly 2 indexes to compare")
is_restricted: bool = Field(default=True, description="Whether the indexes are restricted")
requirements: Optional[List[Dict[str, Any]]] = Field(None, description="Requirements for refine/execute phase")
feedback: Optional[str] = Field(None, description="Feedback for refinement phase")

class SimplifiedResponse(BaseModel):
value: Optional[str] = Field(None, description="Simplified value: 'Yes', 'No', or numeric value with unit")

class SourceResult(BaseModel):
response: str = Field(..., description="Detailed response from the data source")
simplified_value: Optional[str] = Field(None, description="Simplified value extracted from the detailed response")
citations: List[CitationInfo] = Field(default_factory=list, description="List of citations related to the response")

class ComparisonResult(BaseModel):
requirement: Requirement = Field(..., description="The requirement being compared")
sources: Dict[str, SourceResult] = Field(..., description="Responses from each data source")
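A client consuming the application/x-ndjson stream can split it line by line and dispatch on the event type. A minimal sketch — the event shapes are taken from comparison_executor.py, but the function name is ours:

```python
import json

def parse_comparison_stream(lines):
    """Parse NDJSON events from the compare endpoint into (results, errors).
    Event shapes follow comparison_executor.py:
    {"type": "comparison_result", "content": {...}} or
    {"type": "error", "content": "..."}."""
    results, errors = [], []
    for line in lines:
        line = line.strip()
        if not line:
            continue  # tolerate blank lines between events
        event = json.loads(line)
        if event.get("type") == "comparison_result":
            results.append(event["content"])
        elif event.get("type") == "error":
            errors.append(event["content"])
    return results, errors
```

Because each event is a self-contained JSON line, results can be rendered incrementally as they arrive rather than after the whole comparison finishes.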