Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(fiches-sp): optimisation de la récupération de contenu et de l'ingestion #42

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion srdt_analysis/chunker.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,9 @@ def __init__(self):
]
)
self._character_recursive_splitter = RecursiveCharacterTextSplitter(
chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP
chunk_size=CHUNK_SIZE,
chunk_overlap=CHUNK_OVERLAP,
separators=["\n\n", "\n", ". ", " "],
)

def split_markdown(self, markdown: str) -> list[SplitDocument]:
Expand Down
5 changes: 4 additions & 1 deletion srdt_analysis/collections.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,10 @@ def upload(
)
}

request_data = {"request": '{"collection": "%s"}' % id_collection}
request_data = {
"request": '{"collection": "%s", "chunker": {"name": "NoChunker"}}'
% id_collection
}
response = httpx.post(
f"{ALBERT_ENDPOINT}/v1/files",
headers=self.headers,
Expand Down
4 changes: 2 additions & 2 deletions srdt_analysis/constants.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
ALBERT_ENDPOINT = "https://albert.api.etalab.gouv.fr"
MODEL_VECTORISATION = "BAAI/bge-m3"
LLM_MODEL = "meta-llama/Meta-Llama-3.1-70B-Instruct"
CHUNK_SIZE = 5000
CHUNK_OVERLAP = 500
CHUNK_SIZE = 4096
CHUNK_OVERLAP = 0
COLLECTIONS_UPLOAD_BATCH_SIZE = 50
COLLECTIONS_UPLOAD_DELAY_IN_SECONDS = 10
BASE_URL_CDTN = "https://code.travail.gouv.fr"
Expand Down
32 changes: 1 addition & 31 deletions srdt_analysis/data_exploiter.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,9 @@
import json

from srdt_analysis.albert import AlbertBase
from srdt_analysis.chunker import Chunker
from srdt_analysis.collections import Collections
from srdt_analysis.constants import BASE_URL_CDTN, MODEL_VECTORISATION
from srdt_analysis.logger import Logger
from srdt_analysis.models import (
XML_AS_JSON,
ChunkerContentType,
CollectionName,
Document,
Expand Down Expand Up @@ -90,34 +87,7 @@ def get_content(self, doc: Document) -> FormattedTextContent:

class FichesSPExploiter(BaseDataExploiter):
def get_content(self, doc: Document) -> FormattedTextContent:
    """Return the fiche's formatted text, rebuilt from its raw XML-as-JSON payload.

    Reads the "raw" field of doc.document; if present, decodes it from a JSON
    string when necessary, serializes the JSON tree back to XML text via
    self._parse_xml, and returns it stripped of surrounding whitespace.
    Returns "" when no raw payload is available.
    """
    raw_data = doc.document.get("raw", "")
    if raw_data:
        # The raw payload may arrive either already-parsed or as a JSON string;
        # normalize to a dict/list before walking it.
        if isinstance(raw_data, str):
            raw_data = json.loads(raw_data)
        xml_content = self._parse_xml(raw_data)
        return xml_content.strip()
    return ""

def _parse_xml(self, xml_as_json: XML_AS_JSON) -> FormattedTextContent:
    """Serialize a JSON-encoded XML tree back into an XML string.

    Recursively walks the structure: an "element" node becomes an open tag
    (with its attributes), its serialized children, and a close tag; a
    "text" node contributes its text; a list contributes the concatenation
    of its items. Anything else yields the empty string.
    """
    if isinstance(xml_as_json, list):
        # A list is a sequence of sibling nodes — serialize and concatenate.
        return "".join(self._parse_xml(item) for item in xml_as_json)
    if not isinstance(xml_as_json, dict):
        return ""
    node_type = xml_as_json.get("type")
    if node_type == "text":
        return xml_as_json.get("text", "")
    if node_type != "element":
        # Unknown node kinds are silently dropped, matching the tolerant
        # handling of malformed payloads.
        return ""
    tag = xml_as_json.get("name")
    parts = [f"<{tag}"]
    for attr_name, attr_value in xml_as_json.get("attributes", {}).items():
        parts.append(f' {attr_name}="{attr_value}"')
    parts.append(">")
    for child in xml_as_json.get("children", []):
        parts.append(self._parse_xml(child))
    parts.append(f"</{tag}>")
    return "".join(parts)
return doc.text


class PageInfosExploiter(BaseDataExploiter):
Expand Down
3 changes: 2 additions & 1 deletion srdt_analysis/database_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,8 @@ async def get_connection(self):
async def fetch_documents_by_source(self, source: str) -> DocumentsList:
async with self.get_connection() as conn:
result = await conn.fetch(
"SELECT * from public.documents WHERE source = $1", source
"SELECT * from public.documents WHERE source = $1 AND is_published = true AND is_available = true",
source,
)
return [Document.from_record(r) for r in result]

Expand Down
1 change: 0 additions & 1 deletion srdt_analysis/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
FormattedTextContent = str
COLLECTION_ID = str
COLLECTIONS_ID = list[str]
XML_AS_JSON = dict


@dataclass
Expand Down