Commit a886334

sidmohan0 and claude committed
resolve: merge conflicts with enhanced segfault detection
Keep the enhanced segfault detection logic from the performance-regression branch while merging with the latest dev changes. The enhanced version includes:

- has_successful_test_run() function for better success detection
- Support for exit code 245 (segfault variant)
- More comprehensive test result parsing
- Better handling of CI-specific test patterns

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2 parents 088a861 + 02fa8d8 commit a886334
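
The test tooling the message refers to is not included in the diff below, so here is a minimal sketch of what a summary-based success check in the spirit of has_successful_test_run() could look like. The regexes and surrounding logic are assumptions; only the function name and the exit-code detail come from the commit message.

import re

def has_successful_test_run(pytest_output: str) -> bool:
    """Heuristic: trust pytest's own summary line over the process exit code.

    A worker can segfault during interpreter teardown *after* every test has
    passed. Exit code 245 is the same SIGSEGV (signal 11) reported as the
    unsigned byte 256 - 11, alongside the more common 139 (128 + 11).
    """
    passed = re.search(r"(\d+) passed", pytest_output)
    failed = re.search(r"(\d+) failed", pytest_output)
    errors = re.search(r"(\d+) error", pytest_output)
    return passed is not None and failed is None and errors is None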

File tree

6 files changed: +812 -98 lines


.github/workflows/benchmark.yml

Lines changed: 7 additions & 3 deletions

@@ -38,9 +38,13 @@ jobs:
            benchmark-${{ runner.os }}-

      - name: Run benchmarks and save baseline
+       env:
+         CI: true
+         GITHUB_ACTIONS: true
        run: |
-         # Run benchmarks and save results
-         python -m pytest tests/benchmark_text_service.py -v --benchmark-autosave --benchmark-json=benchmark-results.json
+         # Run benchmarks with segfault protection and save results
+         echo "Running benchmarks with memory optimizations..."
+         python -m pytest tests/benchmark_text_service.py -v --benchmark-autosave --benchmark-json=benchmark-results.json --tb=short

      - name: Check for performance regression
        run: |
@@ -60,7 +64,7 @@ jobs:
            pytest tests/benchmark_text_service.py --benchmark-compare

            # Then check for significant regressions
-           echo "Checking for performance regressions (>10% slower)..."
+           echo "Checking for performance regressions (>100% slower)..."
            # Use our Python script for benchmark comparison
            python scripts/compare_benchmarks.py "$BASELINE_FILE" "$CURRENT_FILE"
          else
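
scripts/compare_benchmarks.py itself is not part of this commit; below is a hypothetical sketch of the check it is invoked for, assuming pytest-benchmark's --benchmark-json layout (benchmarks[].stats.mean) and the >100% threshold announced by the echo above. The function names are illustrative.

import json
import sys

THRESHOLD = 1.0  # flag only benchmarks more than 100% slower than baseline

def load_means(path):
    """Map benchmark name -> mean runtime from a pytest-benchmark JSON file."""
    with open(path) as f:
        data = json.load(f)
    return {b["name"]: b["stats"]["mean"] for b in data["benchmarks"]}

def main(baseline_path, current_path):
    baseline = load_means(baseline_path)
    current = load_means(current_path)
    regressions = []
    for name, base_mean in baseline.items():
        cur_mean = current.get(name)
        if cur_mean is not None and cur_mean > base_mean * (1 + THRESHOLD):
            regressions.append(f"{name}: {base_mean:.6f}s -> {cur_mean:.6f}s")
    if regressions:
        print("Performance regressions detected:")
        print("\n".join(regressions))
        return 1
    print("No significant regressions.")
    return 0

if __name__ == "__main__":
    sys.exit(main(sys.argv[1], sys.argv[2]))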

.github/workflows/beta-release.yml

Lines changed: 22 additions & 5 deletions

@@ -72,8 +72,10 @@ jobs:
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
-         pip install bump2version build twine
+         pip install bump2version build twine psutil
          pip install -e ".[all,dev]"
+         # Install memory monitoring tools
+         pip install memory_profiler

      - name: Configure git
        run: |
@@ -107,11 +109,26 @@ jobs:
        run: |
          python scripts/generate_changelog.py --beta --output BETA_CHANGELOG.md

-     - name: Run tests
+     - name: Run tests with segfault protection
+       env:
+         # Memory optimization environment variables (set by run_tests.py)
+         CI: true
+         GITHUB_ACTIONS: true
        run: |
-         python -m pytest tests/ -v --tb=short
-         python -m pytest -m integration -v
-         python -m pytest tests/benchmark_text_service.py -v
+         # Print system memory info
+         free -h || echo "free command not available"
+
+         # Use our robust test runner that handles segfaults
+         echo "Running main tests with segfault protection..."
+         python run_tests.py tests/ -k "not benchmark and not integration" --no-header
+
+         # Run integration tests separately with segfault protection
+         echo "Running integration tests..."
+         python run_tests.py -m integration --no-header
+
+         # Run benchmark tests with segfault protection
+         echo "Running benchmark tests with safeguards..."
+         python run_tests.py tests/benchmark_text_service.py --no-header

      - name: Build package
        run: |
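
run_tests.py exists in the repository but its body is outside this diff; here is a minimal sketch, under stated assumptions, of the segfault-tolerant pattern the step descriptions imply: forward the arguments to pytest in a subprocess, then downgrade a post-success segfault to exit code 0.

import subprocess
import sys

# SIGSEGV as subprocess reports it (-11 on POSIX) and as shells rewrap it
# (139 = 128 + 11, 245 = 256 - 11).
SEGFAULT_CODES = {-11, 139, 245}

def main(pytest_args):
    proc = subprocess.run(
        [sys.executable, "-m", "pytest", *pytest_args],
        capture_output=True,
        text=True,
    )
    sys.stdout.write(proc.stdout)
    sys.stderr.write(proc.stderr)
    # Downgrade a late segfault to success when pytest's own summary is clean
    # (a crude heuristic; the real runner's parsing is more comprehensive).
    if (
        proc.returncode in SEGFAULT_CODES
        and " passed" in proc.stdout
        and " failed" not in proc.stdout
    ):
        print("Tests passed before the crash; treating segfault as success.")
        return 0
    return proc.returncode

if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))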

datafog/services/text_service.py

Lines changed: 94 additions & 76 deletions

@@ -241,6 +241,97 @@ def _annotate_with_smart_cascade(
                return result.spans
        return regex_result

+    def _annotate_single_chunk(
+        self, text: str, structured: bool = False
+    ) -> Union[Dict[str, List[str]], List["Span"]]:
+        """Annotate a single chunk of text based on the engine type."""
+        if self.engine == "regex":
+            if structured:
+                _, result = self.regex_annotator.annotate_with_spans(text)
+                return result.spans
+            return self.regex_annotator.annotate(text)
+        elif self.engine == "spacy":
+            if self.spacy_annotator is None:
+                raise ImportError(
+                    "SpaCy engine not available. Install with: pip install datafog[nlp]"
+                )
+            return self.spacy_annotator.annotate(text)
+        elif self.engine == "gliner":
+            if self.gliner_annotator is None:
+                raise ImportError(
+                    "GLiNER engine not available. Install with: pip install datafog[nlp-advanced]"
+                )
+            return self.gliner_annotator.annotate(text)
+        elif self.engine == "smart":
+            return self._annotate_with_smart_cascade(text, structured)
+        elif self.engine == "auto":
+            return self._annotate_with_auto_engine(text, structured)
+
+    def _annotate_with_auto_engine(
+        self, text: str, structured: bool = False
+    ) -> Union[Dict[str, List[str]], List["Span"]]:
+        """Handle auto engine annotation with regex fallback to spacy."""
+        # Try regex first
+        if structured:
+            # For structured output, use annotate_with_spans directly to avoid double processing
+            _, result = self.regex_annotator.annotate_with_spans(text)
+            regex_result = {}
+            for span in result.spans:
+                if span.label not in regex_result:
+                    regex_result[span.label] = []
+                regex_result[span.label].append(span.text)
+
+            # Check if regex found any entities
+            if any(entities for entities in regex_result.values()):
+                return result.spans
+        else:
+            regex_result = self.regex_annotator.annotate(text)
+
+            # Check if regex found any entities
+            if any(entities for entities in regex_result.values()):
+                return regex_result
+
+        # Fall back to spacy if available
+        if self.spacy_annotator is not None:
+            return self.spacy_annotator.annotate(text)
+
+        # Return regex result even if empty
+        if structured:
+            # We already have the result from above in structured mode
+            return result.spans
+        return regex_result
+
+    def _annotate_multiple_chunks_structured(self, chunks: List[str]) -> List["Span"]:
+        """Handle structured annotation across multiple chunks."""
+        all_spans = []
+        current_offset = 0
+
+        # Get Span class once outside the loop for efficiency
+        SpanClass = _get_span_class()
+
+        for chunk in chunks:
+            chunk_spans = self._annotate_single_chunk(chunk, structured=True)
+            # Adjust span positions to account for chunk offset
+            for span in chunk_spans:
+                adjusted_span = SpanClass(
+                    start=span.start + current_offset,
+                    end=span.end + current_offset,
+                    text=span.text,
+                    label=span.label,
+                )
+                all_spans.append(adjusted_span)
+            current_offset += len(chunk)
+
+        return all_spans
+
+    def _annotate_multiple_chunks_dict(self, chunks: List[str]) -> Dict[str, List[str]]:
+        """Handle dictionary annotation across multiple chunks."""
+        chunk_annotations = []
+        for chunk in chunks:
+            chunk_result = self._annotate_single_chunk(chunk, structured=False)
+            chunk_annotations.append(chunk_result)
+        return self._combine_annotations(chunk_annotations)
+
    def annotate_text_sync(
        self, text: str, structured: bool = False
    ) -> Union[Dict[str, List[str]], List["Span"]]:
@@ -256,88 +347,15 @@ def annotate_text_sync(
        """
        if len(text) <= self.text_chunk_length:
            # Single chunk processing
-           if self.engine == "regex":
-               if structured:
-                   _, result = self.regex_annotator.annotate_with_spans(text)
-                   return result.spans
-               return self.regex_annotator.annotate(text)
-           elif self.engine == "spacy":
-               if self.spacy_annotator is None:
-                   raise ImportError(
-                       "SpaCy engine not available. Install with: pip install datafog[nlp]"
-                   )
-               return self.spacy_annotator.annotate(text)
-           elif self.engine == "gliner":
-               if self.gliner_annotator is None:
-                   raise ImportError(
-                       "GLiNER engine not available. Install with: pip install datafog[nlp-advanced]"
-                   )
-               return self.gliner_annotator.annotate(text)
-           elif self.engine == "smart":
-               return self._annotate_with_smart_cascade(text, structured)
-           elif self.engine == "auto":
-               # Try regex first
-               if structured:
-                   # For structured output, use annotate_with_spans directly to avoid double processing
-                   _, result = self.regex_annotator.annotate_with_spans(text)
-                   regex_result = {}
-                   for span in result.spans:
-                       if span.label not in regex_result:
-                           regex_result[span.label] = []
-                       regex_result[span.label].append(span.text)
-
-                   # Check if regex found any entities
-                   if any(entities for entities in regex_result.values()):
-                       return result.spans
-               else:
-                   regex_result = self.regex_annotator.annotate(text)
-
-               # Check if regex found any entities
-               if any(entities for entities in regex_result.values()):
-                   return regex_result
-
-               # Fall back to spacy if available
-               if self.spacy_annotator is not None:
-                   return self.spacy_annotator.annotate(text)
-
-               # Return regex result even if empty
-               if structured:
-                   # We already have the result from above in structured mode
-                   return result.spans
-               return regex_result
+           return self._annotate_single_chunk(text, structured)
        else:
            # Multi-chunk processing
            chunks = self._chunk_text(text)

            if structured:
-               # For structured output, we need to handle span positions across chunks
-               all_spans = []
-               current_offset = 0
-
-               # Get Span class once outside the loop for efficiency
-               SpanClass = _get_span_class()
-
-               for chunk in chunks:
-                   chunk_spans = self.annotate_text_sync(chunk, structured=True)
-                   # Adjust span positions to account for chunk offset
-                   for span in chunk_spans:
-                       adjusted_span = SpanClass(
-                           start=span.start + current_offset,
-                           end=span.end + current_offset,
-                           text=span.text,
-                           label=span.label,
-                       )
-                       all_spans.append(adjusted_span)
-                   current_offset += len(chunk)
-
-               return all_spans
+               return self._annotate_multiple_chunks_structured(chunks)
            else:
-               # Dictionary format - combine annotations
-               chunk_annotations = []
-               for chunk in chunks:
-                   chunk_result = self.annotate_text_sync(chunk, structured=False)
-                   chunk_annotations.append(chunk_result)
-               return self._combine_annotations(chunk_annotations)
+               return self._annotate_multiple_chunks_dict(chunks)

    async def annotate_text_async(
        self, text: str, structured: bool = False