Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(fiches-sp): optimisation de la récupération de contenu et de l'ingestion #42

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion srdt_analysis/chunker.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,9 @@ def __init__(self):
]
)
self._character_recursive_splitter = RecursiveCharacterTextSplitter(
chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP
chunk_size=CHUNK_SIZE,
chunk_overlap=CHUNK_OVERLAP,
separators=["\n\n", "\n", ". ", " "],
)

def split_markdown(self, markdown: str) -> list[SplitDocument]:
Expand Down
5 changes: 4 additions & 1 deletion srdt_analysis/collections.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,10 @@ def upload(
)
}

request_data = {"request": '{"collection": "%s"}' % id_collection}
request_data = {
"request": '{"collection": "%s", "chunker": {"name": "NoChunker"}}'
% id_collection
}
response = httpx.post(
f"{ALBERT_ENDPOINT}/v1/files",
headers=self.headers,
Expand Down
4 changes: 2 additions & 2 deletions srdt_analysis/constants.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
ALBERT_ENDPOINT = "https://albert.api.etalab.gouv.fr"
MODEL_VECTORISATION = "BAAI/bge-m3"
LLM_MODEL = "meta-llama/Meta-Llama-3.1-70B-Instruct"
CHUNK_SIZE = 5000
CHUNK_OVERLAP = 500
CHUNK_SIZE = 4096
CHUNK_OVERLAP = 0
COLLECTIONS_UPLOAD_BATCH_SIZE = 50
COLLECTIONS_UPLOAD_DELAY_IN_SECONDS = 10
BASE_URL_CDTN = "https://code.travail.gouv.fr"
Expand Down
32 changes: 1 addition & 31 deletions srdt_analysis/data_exploiter.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,9 @@
import json

from srdt_analysis.albert import AlbertBase
from srdt_analysis.chunker import Chunker
from srdt_analysis.collections import Collections
from srdt_analysis.constants import BASE_URL_CDTN, MODEL_VECTORISATION
from srdt_analysis.logger import Logger
from srdt_analysis.models import (
XML_AS_JSON,
ChunkerContentType,
CollectionName,
Document,
Expand Down Expand Up @@ -90,34 +87,7 @@ def get_content(self, doc: Document) -> FormattedTextContent:

class FichesSPExploiter(BaseDataExploiter):
def get_content(self, doc: Document) -> FormattedTextContent:
    """Return the fiche's formatted text, rebuilt from its raw XML-as-JSON payload.

    Reads the "raw" field of doc.document; if present, decodes it from a JSON
    string when necessary, serializes the JSON tree back to XML text via
    self._parse_xml, and returns it stripped of surrounding whitespace.
    Returns "" when no raw payload is available.
    """
    raw_data = doc.document.get("raw", "")
    if raw_data:
        # The raw payload may arrive either already-parsed or as a JSON string;
        # normalize to a dict/list before walking it.
        if isinstance(raw_data, str):
            raw_data = json.loads(raw_data)
        xml_content = self._parse_xml(raw_data)
        return xml_content.strip()
    return ""

def _parse_xml(self, xml_as_json: XML_AS_JSON) -> FormattedTextContent:
    """Serialize a JSON-encoded XML tree back into an XML string.

    Recursively walks the structure: an "element" node becomes an open tag
    (with its attributes), its serialized children, and a close tag; a
    "text" node contributes its text; a list contributes the concatenation
    of its items. Anything else yields the empty string.
    """
    if isinstance(xml_as_json, list):
        # A list is a sequence of sibling nodes — serialize and concatenate.
        return "".join(self._parse_xml(item) for item in xml_as_json)
    if not isinstance(xml_as_json, dict):
        return ""
    node_type = xml_as_json.get("type")
    if node_type == "text":
        return xml_as_json.get("text", "")
    if node_type != "element":
        # Unknown node kinds are silently dropped, matching the tolerant
        # handling of malformed payloads.
        return ""
    tag = xml_as_json.get("name")
    parts = [f"<{tag}"]
    for attr_name, attr_value in xml_as_json.get("attributes", {}).items():
        parts.append(f' {attr_name}="{attr_value}"')
    parts.append(">")
    for child in xml_as_json.get("children", []):
        parts.append(self._parse_xml(child))
    parts.append(f"</{tag}>")
    return "".join(parts)
return doc.text


class PageInfosExploiter(BaseDataExploiter):
Expand Down
3 changes: 2 additions & 1 deletion srdt_analysis/database_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,8 @@ async def get_connection(self):
async def fetch_documents_by_source(self, source: str) -> DocumentsList:
async with self.get_connection() as conn:
result = await conn.fetch(
"SELECT * from public.documents WHERE source = $1", source
"SELECT * from public.documents WHERE source = $1 AND is_published = true AND is_available = true",
source,
)
return [Document.from_record(r) for r in result]

Expand Down
1 change: 0 additions & 1 deletion srdt_analysis/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
FormattedTextContent = str
COLLECTION_ID = str
COLLECTIONS_ID = list[str]
XML_AS_JSON = dict


@dataclass
Expand Down