Skip to content

Commit 1ee8e5f

Browse files
committed
⚗️(backend) function to extract text from base64 yjs document
Function to extract text from a base64 yjs document. Can be useful if we need to index the content of the documents.
1 parent ac86a4e commit 1ee8e5f

File tree

4 files changed

+51
-1
lines changed

4 files changed

+51
-1
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,10 @@ and this project adheres to
99

1010
## [Unreleased]
1111

12+
## Added
13+
14+
- ⚗️(backend) Extract text from base64 yjs document #270
15+
1216

1317
## [1.4.0] - 2024-09-17
1418

src/backend/core/tests/test_utils.py

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010

1111
import pytest
1212

13-
from core.utils import email_invitation
13+
from core.utils import email_invitation, yjs_base64_to_text
1414

1515
pytestmark = pytest.mark.django_db
1616

@@ -85,3 +85,29 @@ def test_utils__email_invitation_failed(mock_logger, _mock_send_mail):
8585

8686
assert email == "guest@example.com"
8787
assert isinstance(exception, smtplib.SMTPException)
88+
89+
90+
def test_yjs_base64_to_text():
    """
    Check that yjs_base64_to_text extracts the plain text of a stored document.

    The payload below is an example of what is saved in the database. It was
    generated from the blocknote editor and holds a heading with an italic
    "Hello" followed by a bullet list item "world" whose "or" is bold.
    """
    encoded_document = (
        "AR717vLVDgAHAQ5kb2N1bWVudC1zdG9yZQMKYmxvY2tHcm91cAcA9e7y1Q4AAw5ibG9ja0NvbnRh"
        "aW5lcgcA9e7y1Q4BAwdoZWFkaW5nBwD17vLVDgIGBgD17vLVDgMGaXRhbGljAnt9hPXu8tUOBAVI"
        "ZWxsb4b17vLVDgkGaXRhbGljBG51bGwoAPXu8tUOAg10ZXh0QWxpZ25tZW50AXcEbGVmdCgA9e7y"
        "1Q4CBWxldmVsAX0BKAD17vLVDgECaWQBdyQwNGQ2MjM0MS04MzI2LTQyMzYtYTA4My00ODdlMjZm"
        "YWQyMzAoAPXu8tUOAQl0ZXh0Q29sb3IBdwdkZWZhdWx0KAD17vLVDgEPYmFja2dyb3VuZENvbG9y"
        "AXcHZGVmYXVsdIf17vLVDgEDDmJsb2NrQ29udGFpbmVyBwD17vLVDhADDmJ1bGxldExpc3RJdGVt"
        "BwD17vLVDhEGBAD17vLVDhIBd4b17vLVDhMEYm9sZAJ7fYT17vLVDhQCb3KG9e7y1Q4WBGJvbGQE"
        "bnVsbIT17vLVDhcCbGQoAPXu8tUOEQ10ZXh0QWxpZ25tZW50AXcEbGVmdCgA9e7y1Q4QAmlkAXck"
        "ZDM1MWUwNjgtM2U1NS00MjI2LThlYTUtYWJiMjYzMTk4ZTJhKAD17vLVDhAJdGV4dENvbG9yAXcH"
        "ZGVmYXVsdCgA9e7y1Q4QD2JhY2tncm91bmRDb2xvcgF3B2RlZmF1bHSH9e7y1Q4QAw5ibG9ja0Nv"
        "bnRhaW5lcgcA9e7y1Q4eAwlwYXJhZ3JhcGgoAPXu8tUOHw10ZXh0QWxpZ25tZW50AXcEbGVmdCgA"
        "9e7y1Q4eAmlkAXckODk3MDBjMDctZTBlMS00ZmUwLWFjYTItODQ5MzIwOWE3ZTQyKAD17vLVDh4J"
        "dGV4dENvbG9yAXcHZGVmYXVsdCgA9e7y1Q4eD2JhY2tncm91bmRDb2xvcgF3B2RlZmF1bHQA"
    )

    # Formatting marks (heading, italic, bold, bullet) must not leak into the text.
    extracted_text = yjs_base64_to_text(encoded_document)

    assert extracted_text == "Hello world"

src/backend/core/utils.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
Utilities for the core app.
33
"""
44

5+
import base64
56
import smtplib
67
from logging import getLogger
78

@@ -12,6 +13,9 @@
1213
from django.utils.translation import gettext_lazy as _
1314
from django.utils.translation import override
1415

16+
import y_py as Y
17+
from bs4 import BeautifulSoup
18+
1519
logger = getLogger(__name__)
1620

1721

@@ -38,3 +42,17 @@ def email_invitation(language, email, document_id):
3842

3943
except smtplib.SMTPException as exception:
4044
logger.error("invitation to %s was not sent: %s", email, exception)
45+
46+
47+
def yjs_base64_to_text(base64_string):
    """
    Return the plain text held in a base64-encoded yjs document.

    The decoded update is applied to a fresh YDoc, the "document-store" XML
    element is rendered to a string, and its markup is stripped with
    BeautifulSoup. Text fragments are joined with a single space and the
    result is trimmed of leading and trailing whitespace.
    """
    # Decode the stored payload into the mutable byte buffer handed to y_py.
    update = bytearray(base64.b64decode(base64_string))

    ydoc = Y.YDoc()  # pylint: disable=E1101
    Y.apply_update(ydoc, update)  # pylint: disable=E1101

    # The blocknote content is stored under the "document-store" XML element.
    markup = str(ydoc.get_xml_element("document-store"))

    text = BeautifulSoup(markup, "html.parser").get_text(separator=" ")
    return text.strip()

src/backend/pyproject.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ license = { file = "LICENSE" }
2525
readme = "README.md"
2626
requires-python = ">=3.10"
2727
dependencies = [
28+
"beautifulsoup4==4.12.3",
2829
"boto3==1.35.10",
2930
"Brotli==1.1.0",
3031
"celery[redis]==5.4.0",
@@ -57,6 +58,7 @@ dependencies = [
5758
"WeasyPrint>=60.2",
5859
"whitenoise==6.7.0",
5960
"mozilla-django-oidc==4.0.1",
61+
"y-py==0.5.5",
6062
]
6163

6264
[project.urls]

0 commit comments

Comments
 (0)