From 3552d66e362235d9d07845c569556604cae4a460 Mon Sep 17 00:00:00 2001 From: Anthony LC Date: Thu, 19 Sep 2024 13:11:29 +0200 Subject: [PATCH] =?UTF-8?q?=E2=9A=97=EF=B8=8F(backend)=20function=20to=20e?= =?UTF-8?q?xtract=20text=20from=20base64=20yjs=20document?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Function to extract text from base64 yjs document. Can be useful if we need to index the content of the documents. --- CHANGELOG.md | 4 ++++ src/backend/core/tests/test_utils.py | 23 ++++++++++++++++++++++- src/backend/core/utils.py | 18 ++++++++++++++++++ src/backend/pyproject.toml | 2 ++ 4 files changed, 46 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3289e0be4..1cbfb68f0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,10 @@ and this project adheres to ## [Unreleased] +## Added + +- ⚗️(backend) Extract text from base64 yjs document #270 + ## [1.4.0] - 2024-09-17 diff --git a/src/backend/core/tests/test_utils.py b/src/backend/core/tests/test_utils.py index 288d31971..96449ce65 100644 --- a/src/backend/core/tests/test_utils.py +++ b/src/backend/core/tests/test_utils.py @@ -10,7 +10,7 @@ import pytest -from core.utils import email_invitation +from core.utils import email_invitation, extract_text_from_saved_yjs_document pytestmark = pytest.mark.django_db @@ -85,3 +85,24 @@ def test_utils__email_invitation_failed(mock_logger, _mock_send_mail): assert email == "guest@example.com" assert isinstance(exception, smtplib.SMTPException) + + +def test_extract_text_from_saved_yjs_document(): + """ + Test extract_text_from_saved_yjs_document + This base64 string is an example of what is saved in the database. 
+ This base64 is generated from the blocknote editor, it contains + the text "Hello world" + """ + base64_string = ( + "ARCymr/3DgAHAQ5kb2N1bWVudC1zdG9yZQMKYmxvY2tHcm91cAcAspq/9w4AAw5ibG9j" + "a0NvbnRhaW5lcgcAspq/9w4BAwlwYXJhZ3JhcGgHALKav/cOAgYEALKav/cOAwFIKACy" + "mr/3DgINdGV4dEFsaWdubWVudAF3BGxlZnQoALKav/cOAQJpZAF3DmluaXRpYWxCbG9j" + "a0lkKACymr/3DgEJdGV4dENvbG9yAXcHZGVmYXVsdCgAspq/9w4BD2JhY2tncm91bmRD" + "b2xvcgF3B2RlZmF1bHSHspq/9w4BAw5ibG9ja0NvbnRhaW5lcgcAspq/9w4JAwlwYXJh" + "Z3JhcGgoALKav/cOCg10ZXh0QWxpZ25tZW50AXcEbGVmdCgAspq/9w4JAmlkAXckMTFj" + "YTgzYmEtZGM3OS00N2Q3LTllNzYtNmM4OTQwNzc1ZjE3KACymr/3DgkJdGV4dENvbG9y" + "AXcHZGVmYXVsdCgAspq/9w4JD2JhY2tncm91bmRDb2xvcgF3B2RlZmF1bHSEspq/9w4E" + "C2VsbG8gd29ybGQgAA==" + ) + assert extract_text_from_saved_yjs_document(base64_string) == "Hello world" diff --git a/src/backend/core/utils.py b/src/backend/core/utils.py index b3767eafd..a7b12c306 100644 --- a/src/backend/core/utils.py +++ b/src/backend/core/utils.py @@ -2,6 +2,7 @@ Utilities for the core app. 
""" +import base64 import smtplib from logging import getLogger @@ -12,6 +13,9 @@ from django.utils.translation import gettext_lazy as _ from django.utils.translation import override +import y_py as Y +from bs4 import BeautifulSoup + logger = getLogger(__name__) @@ -38,3 +42,17 @@ def email_invitation(language, email, document_id): except smtplib.SMTPException as exception: logger.error("invitation to %s was not sent: %s", email, exception) + + +def extract_text_from_saved_yjs_document(base64_string): + """Extract text from saved yjs document""" + + decoded_bytes = base64.b64decode(base64_string) + uint8_array = bytearray(decoded_bytes) + + doc = Y.YDoc() # pylint: disable=E1101 + Y.apply_update(doc, uint8_array) # pylint: disable=E1101 + blocknote_structure = str(doc.get_xml_element("document-store")) + + soup = BeautifulSoup(blocknote_structure, "html.parser") + return soup.get_text(separator=" ").strip() diff --git a/src/backend/pyproject.toml b/src/backend/pyproject.toml index ccf044f33..0c8bc504e 100644 --- a/src/backend/pyproject.toml +++ b/src/backend/pyproject.toml @@ -25,6 +25,7 @@ license = { file = "LICENSE" } readme = "README.md" requires-python = ">=3.10" dependencies = [ + "beautifulsoup4==4.12.3", "boto3==1.35.10", "Brotli==1.1.0", "celery[redis]==5.4.0", @@ -57,6 +58,7 @@ dependencies = [ "WeasyPrint>=60.2", "whitenoise==6.7.0", "mozilla-django-oidc==4.0.1", + "y-py==0.5.5", ] [project.urls]