Skip to content

Commit d66f6cc

Browse files
authored
Merge pull request #49 from DataFog/feature/add-char-length-param
_chunk_text + tests
2 parents 3b7adbc + 78402db commit d66f6cc

File tree

4 files changed

+219
-129
lines changed

4 files changed

+219
-129
lines changed

datafog/services/text_service.py

Lines changed: 44 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,28 +1,61 @@
11
import asyncio
2+
from typing import Dict, List
23

34
from datafog.processing.text_processing.spacy_pii_annotator import SpacyPIIAnnotator
45

56

67
class TextService:
7-
def __init__(self):
8+
def __init__(self, text_chunk_length: int = 1000) -> None:
    """Create the service and its annotator.

    Args:
        text_chunk_length: Number of characters per chunk used by
            ``_chunk_text`` when splitting text before annotation.
    """
    self.text_chunk_length = text_chunk_length
    self.annotator = SpacyPIIAnnotator.create()
911

10-
def annotate_text_sync(self, text):
11-
"""Synchronously Annotate a text string."""
12-
res = self.annotator.annotate(text)
13-
return res
12+
def _chunk_text(self, text: str) -> List[str]:
13+
"""Split the text into chunks of specified length."""
14+
return [
15+
text[i : i + self.text_chunk_length]
16+
for i in range(0, len(text), self.text_chunk_length)
17+
]
1418

15-
def batch_annotate_text_sync(self, texts: list):
19+
def _combine_annotations(self, annotations: List[Dict]) -> Dict:
20+
"""Combine annotations from multiple chunks."""
21+
combined = {}
22+
for annotation in annotations:
23+
for key, value in annotation.items():
24+
if key not in combined:
25+
combined[key] = []
26+
combined[key].extend(value)
27+
return combined
28+
29+
def annotate_text_sync(self, text: str) -> Dict:
30+
"""Synchronously annotate a text string."""
31+
if not text:
32+
return {}
33+
print(f"Starting on {text.split()[0]}")
34+
chunks = self._chunk_text(text)
35+
annotations = []
36+
for chunk in chunks:
37+
res = self.annotator.annotate(chunk)
38+
annotations.append(res)
39+
combined = self._combine_annotations(annotations)
40+
print(f"Done processing {text.split()[0]}")
41+
return combined
42+
43+
def batch_annotate_text_sync(self, texts: List[str]) -> Dict[str, Dict]:
1644
"""Synchronously annotate a list of text input."""
1745
results = [self.annotate_text_sync(text) for text in texts]
1846
return dict(zip(texts, results, strict=True))
1947

20-
async def annotate_text_async(self, text):
48+
async def annotate_text_async(self, text: str) -> Dict:
2149
"""Asynchronously annotate a text string."""
22-
return await asyncio.to_thread(self.annotator.annotate, text)
50+
if not text:
51+
return {}
52+
chunks = self._chunk_text(text)
53+
tasks = [asyncio.to_thread(self.annotator.annotate, chunk) for chunk in chunks]
54+
annotations = await asyncio.gather(*tasks)
55+
return self._combine_annotations(annotations)
2356

24-
async def batch_annotate_text_async(self, text: list):
57+
async def batch_annotate_text_async(self, texts: List[str]) -> Dict[str, Dict]:
2558
"""Asynchronously annotate a list of text input."""
26-
tasks = [self.annotate_text_async(txt) for txt in text]
59+
tasks = [self.annotate_text_async(txt) for txt in texts]
2760
results = await asyncio.gather(*tasks)
28-
return dict(zip(text, results, strict=True))
61+
return dict(zip(texts, results, strict=True))

error_log.txt

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
============================= test session starts ==============================
2+
platform darwin -- Python 3.10.1, pytest-8.2.2, pluggy-1.5.0
3+
rootdir: /Users/sidmohan/Desktop/datafog_local/datafog-python/datafog-python
4+
plugins: cov-5.0.0, asyncio-0.23.7, anyio-4.4.0
5+
asyncio: mode=strict
6+
collected 12 items
7+
8+
tests/test_text_service.py .F.......... [100%]
9+
10+
=================================== FAILURES ===================================
11+
_______________________________ test_chunk_text ________________________________
12+
13+
text_service = <datafog.services.text_service.TextService object at 0x16fa68a60>
14+
15+
def test_chunk_text(text_service):
16+
text = "This is a test sentence for chunking."
17+
chunks = text_service._chunk_text(text)
18+
assert len(chunks) == 4
19+
> assert chunks == ["This is a", "test", "sentence f", "or chunki"]
20+
E AssertionError: assert ['This is a '...h', 'unking.'] == ['This is a',..., 'or chunki']
21+
E
22+
E At index 0 diff: 'This is a ' != 'This is a'
23+
E Use -v to get more diff
24+
25+
tests/test_text_service.py:23: AssertionError
26+
=============================== warnings summary ===============================
27+
.venv/lib/python3.10/site-packages/spacy/cli/info.py:3
28+
/Users/sidmohan/Desktop/datafog_local/datafog-python/datafog-python/.venv/lib/python3.10/site-packages/spacy/cli/info.py:3: DeprecationWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html
29+
import pkg_resources
30+
31+
-- Docs: https://docs.pytest.org/en/stable/how-to/capture-warnings.html
32+
33+
---------- coverage: platform darwin, python 3.10.1-final-0 ----------
34+
Name Stmts Miss Cover
35+
----------------------------------------------------------------------------------
36+
datafog/__about__.py 1 0 100%
37+
datafog/__init__.py 11 0 100%
38+
datafog/config.py 6 0 100%
39+
datafog/main.py 74 55 26%
40+
datafog/processing/__init__.py 0 0 100%
41+
datafog/processing/image_processing/__init__.py 0 0 100%
42+
datafog/processing/image_processing/donut_processor.py 44 29 34%
43+
datafog/processing/image_processing/image_downloader.py 17 7 59%
44+
datafog/processing/image_processing/pytesseract_processor.py 8 1 88%
45+
datafog/processing/spark_processing/__init__.py 3 3 0%
46+
datafog/processing/spark_processing/pyspark_udfs.py 39 39 0%
47+
datafog/processing/text_processing/__init__.py 1 0 100%
48+
datafog/processing/text_processing/spacy_pii_annotator.py 36 20 44%
49+
datafog/services/__init__.py 3 0 100%
50+
datafog/services/image_service.py 26 11 58%
51+
datafog/services/spark_service.py 26 16 38%
52+
datafog/services/text_service.py 43 1 98%
53+
----------------------------------------------------------------------------------
54+
TOTAL 338 182 46%
55+
56+
=========================== short test summary info ============================
57+
FAILED tests/test_text_service.py::test_chunk_text - AssertionError: assert [...
58+
=================== 1 failed, 11 passed, 1 warning in 2.06s ====================

tests/beta_pypi_test.ipynb

Lines changed: 0 additions & 118 deletions
This file was deleted.

tests/test_text_service.py

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
from unittest.mock import Mock, patch
2+
3+
import pytest
4+
5+
from datafog.services.text_service import TextService
6+
7+
8+
@pytest.fixture
def mock_annotator():
    """Provide a stand-in annotator whose ``annotate`` always returns the same entities."""
    canned = {"PER": ["John Doe"], "ORG": ["Acme Inc"]}
    annotator = Mock()
    annotator.annotate.return_value = canned
    return annotator
13+
14+
15+
@pytest.fixture
def text_service(mock_annotator):
    """Build a ``TextService`` with a 10-character chunk length and the mocked annotator."""
    factory_path = "datafog.services.text_service.SpacyPIIAnnotator.create"
    # The patch only needs to be active while the constructor runs.
    with patch(factory_path, return_value=mock_annotator):
        service = TextService(text_chunk_length=10)
    return service
22+
23+
24+
def test_init(text_service):
    """The constructor stores the configured chunk length."""
    configured = text_service.text_chunk_length
    assert configured == 10
26+
27+
28+
def test_chunk_text(text_service):
    """Text is cut into fixed 10-character pieces, with a shorter final piece."""
    expected = ["This is a ", "test sente", "nce for ch", "unking."]
    produced = text_service._chunk_text("This is a test sentence for chunking.")
    assert len(produced) == 4
    assert produced == expected
33+
34+
35+
def test_combine_annotations(text_service):
    """Lists under the same key are concatenated; unique keys are kept."""
    first_chunk = {"PER": ["John"], "ORG": ["Acme"]}
    second_chunk = {"PER": ["Doe"], "LOC": ["New York"]}
    merged = text_service._combine_annotations([first_chunk, second_chunk])
    assert merged == {"PER": ["John", "Doe"], "ORG": ["Acme"], "LOC": ["New York"]}
42+
43+
44+
def test_annotate_text_sync(text_service):
    """A 26-character text is split into three chunks, so the mocked result appears three times."""
    expected = {
        "PER": ["John Doe", "John Doe", "John Doe"],
        "ORG": ["Acme Inc", "Acme Inc", "Acme Inc"],
    }
    assert text_service.annotate_text_sync("John Doe works at Acme Inc") == expected
50+
51+
52+
def test_batch_annotate_text_sync(text_service):
    """Each input text maps to its own annotation result."""
    per_text = {"PER": ["John Doe"], "ORG": ["Acme Inc"]}
    outcome = text_service.batch_annotate_text_sync(["John Doe", "Acme Inc"])
    assert outcome == {"John Doe": per_text, "Acme Inc": per_text}
59+
60+
61+
@pytest.mark.asyncio
async def test_annotate_text_async(text_service):
    """The async path gives the same three-chunk result as the sync path."""
    expected = {
        "PER": ["John Doe", "John Doe", "John Doe"],
        "ORG": ["Acme Inc", "Acme Inc", "Acme Inc"],
    }
    outcome = await text_service.annotate_text_async("John Doe works at Acme Inc")
    assert outcome == expected
68+
69+
70+
@pytest.mark.asyncio
async def test_batch_annotate_text_async(text_service):
    """The async batch call maps each input text to its annotation result."""
    per_text = {"PER": ["John Doe"], "ORG": ["Acme Inc"]}
    outcome = await text_service.batch_annotate_text_async(["John Doe", "Acme Inc"])
    assert outcome == {"John Doe": per_text, "Acme Inc": per_text}
78+
79+
80+
def test_long_text_chunking(text_service):
    """The mocked entities appear once per chunk of a multi-chunk text."""
    sample = "John Doe works at Acme Inc. Jane Smith is from New York City."
    outcome = text_service.annotate_text_sync(sample)
    chunk_total = len(text_service._chunk_text(sample))
    expected = {
        "PER": ["John Doe"] * chunk_total,
        "ORG": ["Acme Inc"] * chunk_total,
    }
    assert outcome == expected
88+
89+
90+
@pytest.mark.asyncio
async def test_long_text_chunking_async(text_service):
    """The async path also returns the mocked entities once per chunk."""
    sample = "John Doe works at Acme Inc. Jane Smith is from New York City."
    outcome = await text_service.annotate_text_async(sample)
    chunk_total = len(text_service._chunk_text(sample))
    expected = {
        "PER": ["John Doe"] * chunk_total,
        "ORG": ["Acme Inc"] * chunk_total,
    }
    assert outcome == expected
99+
100+
101+
def test_empty_string(text_service):
    """Empty input gives an empty annotation dict."""
    assert text_service.annotate_text_sync("") == {}
104+
105+
106+
def test_short_string(text_service):
    """Text shorter than one chunk is annotated exactly once."""
    expected = {"PER": ["John Doe"], "ORG": ["Acme Inc"]}
    assert text_service.annotate_text_sync("Short") == expected
109+
110+
111+
def test_special_characters(text_service):
    """Punctuation-heavy text is chunked and annotated like any other text."""
    sample = "John@Doe.com works at Acme-Inc!!!"
    outcome = text_service.annotate_text_sync(sample)
    chunk_total = len(text_service._chunk_text(sample))
    expected = {
        "PER": ["John Doe"] * chunk_total,
        "ORG": ["Acme Inc"] * chunk_total,
    }
    assert outcome == expected

0 commit comments

Comments
 (0)