feat(llm): launch the end-to-end process with the information pages (#21)

* fix: code

* fix: code

* fix: llm

* fix: review feedback

* fix: review feedback

* fix: wait time

* fix: remove unused code

* fix: pyright

* Update srdt_analysis/llm_processor.py

Co-authored-by: Victor Degliame <[email protected]>

* fix: PR review feedback

---------

Co-authored-by: Victor Degliame <[email protected]>
maxgfr and RealVidy authored Dec 11, 2024
1 parent 6af7bb0 commit 1090f2f
Showing 11 changed files with 302 additions and 185 deletions.
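
For orientation, this is the end-to-end flow the commit wires together, condensed from the new srdt_analysis/__main__.py shown below (the step comments are interpretive and not part of the diff):

    # Condensed from the new __main__.py: load the "information" pages, chunk and
    # vectorise them into a collection, retrieve the chunks matching a question,
    # then stream the LLM answer token by token.
    from srdt_analysis.collections import Collections
    from srdt_analysis.data_exploiter import PageInfosExploiter
    from srdt_analysis.database_manager import get_data
    from srdt_analysis.llm_processor import LLMProcessor
    from srdt_analysis.mapper import Mapper

    QUESTION = "Combien de jours de congé payé par mois de travail effectif ?"

    data = get_data(["information"])
    result = PageInfosExploiter().process_documents(data["information"], "cdtn_page_infos")
    rag_response = Collections().search(QUESTION, [result["id"]])
    docs_for_llm = Mapper(data).get_original_docs(rag_response)
    for token in LLMProcessor().get_answer_stream(QUESTION, docs_for_llm):
        print(token, end="", flush=True)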
23 changes: 16 additions & 7 deletions srdt_analysis/__main__.py
@@ -3,23 +3,32 @@
 from srdt_analysis.collections import Collections
 from srdt_analysis.data_exploiter import PageInfosExploiter
 from srdt_analysis.database_manager import get_data
+from srdt_analysis.llm_processor import LLMProcessor
+from srdt_analysis.mapper import Mapper

 load_dotenv()

+QUESTION = "Combien de jours de congé payé par mois de travail effectif ?"
+COLLECTION_ID = "40981583-0885-4e88-8e51-2d47f2d397a8"
+


 def main():
     data = get_data(["information"])
     exploiter = PageInfosExploiter()
-    result = exploiter.process_documents(
-        [data["information"][0]], "page_infos.csv", "cdtn_page_infos"
-    )
+    result = exploiter.process_documents(data["information"], "cdtn_page_infos")
     collections = Collections()
-    res = collections.search(
-        "combien de jour de congé payé par mois de travail effectif",
+    rag_response = collections.search(
+        QUESTION,
         [result["id"]],
     )
-
-    print(res)
+    mapper = Mapper(data)
+    data_to_send_to_llm = mapper.get_original_docs(rag_response)
+    llm_processor = LLMProcessor()
+    for token in llm_processor.get_answer_stream(
+        QUESTION,
+        data_to_send_to_llm,
+    ):
+        print(token, end="", flush=True)


 if __name__ == "__main__":
4 changes: 3 additions & 1 deletion srdt_analysis/albert.py
@@ -13,7 +13,9 @@ def __init__(self):
             raise ValueError(
                 "API key must be provided either in constructor or as environment variable"
             )
-        self.headers = {"Authorization": f"Bearer {self.api_key}"}
+        self.headers = {
+            "Authorization": f"Bearer {self.api_key}",
+        }

     def get_models(self) -> Dict[str, Any]:
         response = httpx.get(f"{ALBERT_ENDPOINT}/v1/models", headers=self.headers)
8 changes: 4 additions & 4 deletions srdt_analysis/collections.py
@@ -6,7 +6,7 @@

 from srdt_analysis.albert import AlbertBase
 from srdt_analysis.constants import ALBERT_ENDPOINT
-from srdt_analysis.models import ChunkDataList, DocumentData
+from srdt_analysis.models import DocumentData, RAGChunkSearchResult


 class Collections(AlbertBase):
@@ -47,7 +47,7 @@ def search(
         id_collections: List[str],
         k: int = 5,
         score_threshold: float = 0,
-    ) -> ChunkDataList:
+    ) -> RAGChunkSearchResult:
         response = httpx.post(
             f"{ALBERT_ENDPOINT}/v1/search",
             headers=self.headers,
@@ -75,9 +75,9 @@ def upload(
"text": chunk.page_content,
"title": dt["title"],
"metadata": {
"cdtn_id": dt["cdtn_id"],
"structure_du_chunk": chunk.metadata,
"id": dt["cdtn_id"],
"url": dt["url"],
"source": dt["source"],
},
}
)
22 changes: 22 additions & 0 deletions srdt_analysis/constants.py
@@ -4,3 +4,25 @@
 CHUNK_SIZE = 5000
 CHUNK_OVERLAP = 500
 BASE_URL_CDTN = "https://code.travail.gouv.fr"
+LLM_ANSWER_PROMPT = """
+Instructions
+Rôle et objectif
+L'assistant juridique est conçu pour répondre aux questions des usagers (salariés et employeurs du secteur privé) en France concernant le droit du travail, conformément aux normes et règlements du droit français. L'objectif est de fournir des informations juridiques précises et contextualisées, en utilisant des extraits de documents pertinents pour soutenir chaque réponse.
+Lignes directrices
+A chaque fois que l'utilisateur sollicite l'assistant juridique, le chatbot va procéder ainsi :
+Reformuler la demande de l’utilisateur en deux parties : le contexte, et les points juridiques à traiter. Puis y répondre.
+Pour chaque point, citer la source juridique utilisée dans la base de connaissance externe, ou bien citer le passage correspondant
+Commencer par citer le principe général de droit qui répond au point, puis aller dans le détail en distinguant les cas particuliers, ou en posant des questions à l'utilisateur pour avoir plus de précisions quand cela est nécessaire
+Conclure en synthétisant la réponse et si nécessaire, en indiquant les prochaines étapes à suivre, ainsi qu’en posant des questions qui vont permettre de compléter la réponse
+Limites et contraintes
+Il faut faire attention à ce que toutes les réponses aient une question. Mais si une question n'a pas de réponse, il ne faut pas inventer et plutôt simplement indiquer que la réponse n'a pas été trouvée. Si tu as besoin d’informations supplémentaires pour répondre à une question, tu demandes simplement ces informations à l'usager qui te les donnera.
+Style et ton.
+Répondre dans un langage clair et accessible.
+"""
73 changes: 32 additions & 41 deletions srdt_analysis/data_exploiter.py
@@ -1,71 +1,73 @@
-from datetime import datetime
-
 from srdt_analysis.albert import AlbertBase
 from srdt_analysis.chunker import Chunker
 from srdt_analysis.collections import Collections
 from srdt_analysis.constants import BASE_URL_CDTN, MODEL_VECTORISATION
-from srdt_analysis.document_processor import DocumentProcessor
-from srdt_analysis.llm_processor import LLMProcessor
-from srdt_analysis.models import DocumentData, DocumentsList, ResultProcessDocumentType
+from srdt_analysis.logger import Logger
+from srdt_analysis.models import (
+    CollectionName,
+    DocumentData,
+    DocumentsList,
+    ResultProcessDocumentType,
+)


 class BaseDataExploiter:
     def __init__(self):
-        self.llm_processor = LLMProcessor()
-        self.doc_processor = DocumentProcessor()
         self.chunker = Chunker()
         self.collections = Collections()
         self.albert = AlbertBase()
+        self.logger = Logger("BaseDataExploiter")

     def get_content(self, doc):
         raise NotImplementedError("Subclasses should implement this method")

     def process_documents(
-        self, data: DocumentsList, output_file: str, collection_name: str
+        self, data: DocumentsList, collection_name: str
     ) -> ResultProcessDocumentType:
         results: list[DocumentData] = []
-        print(f"Number of articles to be processed: {len(data)}")
+        self.logger.info(f"Number of articles to be processed: {len(data)}")

         for doc in data:
-            print(
-                f"[{datetime.now().strftime('%H:%M:%S')}] Processing article: {doc.title}"
-            )
+            self.logger.info(f"Processing article: {doc.title}")
             content = self.get_content(doc)

             chunks = self.chunker.split(content)

-            summary = self.llm_processor.get_summary(content)
-            keywords = self.llm_processor.get_keywords(content)
-            questions = self.llm_processor.get_questions(content)
-
-            doc_data = self.create_document_data(
-                doc, content, chunks, keywords, summary, questions
-            )
+            doc_data = self.create_document_data(doc, content, chunks)
             results.append(doc_data)

-            print(
-                f"[{datetime.now().strftime('%H:%M:%S')}] Article number {data.index(doc) + 1} out of {len(data)} processed"
+            self.logger.info(
+                f"Article number {data.index(doc) + 1} out of {len(data)} processed"
             )

-        self.doc_processor.save_to_csv(results, output_file)
         id = self.collections.create(collection_name, MODEL_VECTORISATION)
+        self.logger.info(f"Uploading {len(results)} documents to collection {id}")
         self.collections.upload(results, id)
         return {"documents": results, "id": id}

-    def create_document_data(
-        self, doc, content, chunks, keywords, summary, questions
-    ) -> DocumentData:
+    def create_document_data(self, doc, content, content_chunked) -> DocumentData:
         return {
             "cdtn_id": doc.cdtn_id,
             "initial_id": doc.initial_id,
             "title": doc.title,
             "content": content,
-            "keywords": keywords,
-            "summary": summary,
-            "questions": questions,
-            "content_chunked": chunks,
-            "url": BASE_URL_CDTN + "/" + doc.source + "/" + doc.slug,
+            "content_chunked": content_chunked,
+            "url": BASE_URL_CDTN
+            + "/"
+            + self._get_path_from_collection_name(doc.source)
+            + "/"
+            + doc.slug,
             "source": doc.source,
         }

+    def _get_path_from_collection_name(self, collection_name: CollectionName) -> str:
+        mapping = {
+            "code_du_travail": "code-du-travail",
+            "fiches_service_public": "fiche-service-public",
+            "page_fiche_ministere_travail": "fiche-ministere-travail",
+            "information": "information",
+        }
+        return mapping[collection_name]
+

 class ArticlesCodeDuTravailExploiter(BaseDataExploiter):
@@ -93,14 +93,3 @@ def get_content(self, doc):
if block.get("type") == "markdown":
markdown += block.get("markdown", "")
return markdown


class PageContribsExploiter(BaseDataExploiter):
def get_content(self, doc):
return doc.document.get("content", "")

def create_document_data(self, doc, content, chunks, keywords, summary, questions):
data = super().create_document_data(
doc, content, chunks, keywords, summary, questions
)
return data
12 changes: 2 additions & 10 deletions srdt_analysis/database_manager.py
@@ -1,19 +1,11 @@
 import asyncio
 import os
 from contextlib import asynccontextmanager
-from typing import Literal, Optional, Sequence
+from typing import Optional, Sequence

 import asyncpg

-from srdt_analysis.models import Document, DocumentsList
-
-CollectionName = Literal[
-    "code_du_travail",
-    "fiches_service_public",
-    "page_fiche_ministere_travail",
-    "contributions",
-    "information",
-]
+from srdt_analysis.models import CollectionName, Document, DocumentsList


 class DatabaseManager:
14 changes: 0 additions & 14 deletions srdt_analysis/document_processor.py

This file was deleted.

(Diffs for the remaining changed files are not shown here.)
