Skip to content

Commit b662969

Browse files
committed
Minor improvements and fixes.
- Fix PosixPath validation error in /v1/index TaskResponse
- Fix string vs int comparison error in compressed file samples CLI
- Fix compressed file range handling to return a single chunk, not individual lines
- Add range support for compressed files in /v1/samples web API
- Fix SeekableIndex attribute access error in analyse.py (use .frames, not .get())
- Add tests for zstd range samples in CLI, web API, and analyse
1 parent 1b1ef74 commit b662969

File tree

7 files changed

+1116
-180
lines changed

7 files changed

+1116
-180
lines changed

src/rx/analyse.py

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -245,8 +245,9 @@ def _add_index_info(self, filepath: str, result: FileAnalysisState):
245245

246246
if result.index_valid:
247247
try:
248-
index_data = seekable_index.load_seekable_index(filepath)
249-
result.index_checkpoint_count = len(index_data.get('frames', []))
248+
index_data = seekable_index.load_index(index_path)
249+
if index_data:
250+
result.index_checkpoint_count = len(index_data.frames)
250251
except Exception as e:
251252
logger.warning(f'Failed to load seekable index: {e}')
252253
else:

src/rx/cli/samples.py

Lines changed: 141 additions & 62 deletions
Original file line number | Diff line number | Diff line change
@@ -107,30 +107,60 @@ def get_lines_for_byte_range(path: str, start_offset: int, end_offset: int, inde
107107
return result
108108

109109

110-
def get_line_range(path: str, start_line: int, end_line: int) -> list[str]:
110+
def get_line_range(path: str, start_line: int, end_line: int, file_index: 'FileIndex | None' = None) -> list[str]:
111111
"""Get lines from start_line to end_line (inclusive, 1-based).
112112
113-
Efficiently reads only the required lines without loading the entire file.
113+
Uses the index to seek directly to the start line's byte offset for efficiency.
114+
Falls back to linear scan if no index is available.
114115
115116
Args:
116117
path: File path
117118
start_line: Starting line number (1-based)
118119
end_line: Ending line number (1-based, inclusive)
120+
file_index: Optional pre-loaded FileIndex for efficient seeking
119121
120122
Returns:
121123
List of lines in the range
122124
"""
125+
123126
# Validate line numbers
124127
if start_line < 1 or end_line < 1:
125128
return []
126129

127-
result = []
128-
with open(path, encoding='utf-8', errors='replace') as f:
129-
for current_line, line in enumerate(f, 1):
130-
if current_line > end_line:
130+
# Try to use index for efficient seeking
131+
start_offset = None
132+
if file_index and file_index.line_index:
133+
# Find the closest checkpoint at or before start_line
134+
# line_index format: [[line_number, byte_offset], ...]
135+
for entry in reversed(file_index.line_index):
136+
checkpoint_line = entry[0]
137+
checkpoint_offset = entry[1]
138+
if checkpoint_line <= start_line:
139+
start_offset = checkpoint_offset
140+
checkpoint_start_line = checkpoint_line
131141
break
132-
if current_line >= start_line:
133-
result.append(line.rstrip('\n\r'))
142+
143+
result = []
144+
145+
if start_offset is not None:
146+
# Efficient path: seek to checkpoint and scan from there
147+
with open(path, 'rb') as f:
148+
f.seek(start_offset)
149+
current_line = checkpoint_start_line
150+
for raw_line in f:
151+
if current_line > end_line:
152+
break
153+
if current_line >= start_line:
154+
result.append(raw_line.decode('utf-8', errors='replace').rstrip('\n\r'))
155+
current_line += 1
156+
else:
157+
# Fallback: linear scan from beginning (for small files without index)
158+
with open(path, encoding='utf-8', errors='replace') as f:
159+
for current_line, line in enumerate(f, 1):
160+
if current_line > end_line:
161+
break
162+
if current_line >= start_line:
163+
result.append(line.rstrip('\n\r'))
134164

135165
return result
136166

@@ -360,8 +390,6 @@ def samples_command(
360390
try:
361391
# Handle compressed files separately
362392
if file_is_compressed:
363-
line_list = list(line_offset)
364-
365393
# Check if this is a seekable zstd file
366394
from rx.seekable_zstd import is_seekable_zstd
367395

@@ -374,70 +402,121 @@ def samples_command(
374402
index = get_or_build_index(path)
375403
frames = read_seek_table(path)
376404

377-
# Get samples for each line using seekable zstd
405+
# Get samples for each line/range using seekable zstd
378406
context_data = {}
379407
line_to_offset = {}
380-
for line_num in line_list:
381-
# Find which frame contains this line
382-
frame_idx = None
383-
frame_info = None
384-
for frame in index.frames:
385-
if frame.first_line <= line_num <= frame.last_line:
386-
frame_idx = frame.index
387-
frame_info = frame
388-
first_line = frame.first_line
389-
break
390-
391-
if frame_idx is None:
392-
context_data[line_num] = []
393-
line_to_offset[str(line_num)] = -1
394-
continue
395-
396-
# Calculate byte offset for this line
397-
# Start with the frame's starting offset, then add bytes for each line before the target
398-
frame_offset = frames[frame_idx].decompressed_offset
399-
400-
# Decompress the frame to calculate exact offset
401-
frame_data = decompress_frame(path, frame_idx, frames)
402-
frame_lines = frame_data.decode('utf-8', errors='replace').split('\n')
403-
404-
# Calculate line index within frame (0-based)
405-
line_in_frame = line_num - first_line
406-
407-
# Calculate byte offset by summing lengths of lines before target
408-
byte_offset = frame_offset
409-
for i in range(line_in_frame):
410-
byte_offset += len(frame_lines[i].encode('utf-8')) + 1 # +1 for newline
411-
412-
line_to_offset[str(line_num)] = byte_offset
413-
414-
# Get context lines
415-
start_idx = max(0, line_in_frame - before_context)
416-
end_idx = min(len(frame_lines), line_in_frame + after_context + 1)
417-
418-
context_data[line_num] = frame_lines[start_idx:end_idx]
408+
409+
for start, end in parsed_offsets:
410+
if end is None:
411+
# Single line - use context
412+
line_num = start
413+
414+
# Find which frame contains this line
415+
frame_idx = None
416+
for frame in index.frames:
417+
if frame.first_line <= line_num <= frame.last_line:
418+
frame_idx = frame.index
419+
first_line = frame.first_line
420+
break
421+
422+
if frame_idx is None:
423+
context_data[line_num] = []
424+
line_to_offset[str(line_num)] = -1
425+
continue
426+
427+
# Calculate byte offset for this line
428+
frame_offset = frames[frame_idx].decompressed_offset
429+
430+
# Decompress the frame to calculate exact offset
431+
frame_data = decompress_frame(path, frame_idx, frames)
432+
frame_lines = frame_data.decode('utf-8', errors='replace').split('\n')
433+
434+
# Calculate line index within frame (0-based)
435+
line_in_frame = line_num - first_line
436+
437+
# Calculate byte offset by summing lengths of lines before target
438+
byte_offset = frame_offset
439+
for i in range(line_in_frame):
440+
byte_offset += len(frame_lines[i].encode('utf-8')) + 1 # +1 for newline
441+
442+
line_to_offset[str(line_num)] = byte_offset
443+
444+
# Get context lines
445+
start_idx = max(0, line_in_frame - before_context)
446+
end_idx = min(len(frame_lines), line_in_frame + after_context + 1)
447+
448+
context_data[line_num] = frame_lines[start_idx:end_idx]
449+
else:
450+
# Range - get exact lines, ignore context
451+
range_key = f'{start}-{end}'
452+
range_lines = []
453+
454+
# Collect all lines in the range, potentially across multiple frames
455+
for line_num in range(start, end + 1):
456+
# Find which frame contains this line
457+
frame_idx = None
458+
for frame in index.frames:
459+
if frame.first_line <= line_num <= frame.last_line:
460+
frame_idx = frame.index
461+
first_line = frame.first_line
462+
break
463+
464+
if frame_idx is None:
465+
continue
466+
467+
# Decompress the frame
468+
frame_data = decompress_frame(path, frame_idx, frames)
469+
frame_lines = frame_data.decode('utf-8', errors='replace').split('\n')
470+
471+
# Calculate line index within frame (0-based)
472+
line_in_frame = line_num - first_line
473+
if 0 <= line_in_frame < len(frame_lines):
474+
range_lines.append(frame_lines[line_in_frame])
475+
476+
context_data[range_key] = range_lines
477+
line_to_offset[range_key] = -1
419478
else:
420479
# Use generic compressed index for other formats
421480
click.echo(f'Processing compressed file ({compression_format.value})...', err=True)
422481
index_data = get_or_build_compressed_index(path)
423482

424-
# Get samples for each line
483+
# Get samples for each line/range
425484
context_data = {}
426-
for line_num in line_list:
427-
lines = get_decompressed_content_at_line(
428-
path,
429-
line_num,
430-
context_before=before_context,
431-
context_after=after_context,
432-
index_data=index_data,
433-
)
434-
context_data[line_num] = lines
485+
line_to_offset = {}
486+
for start, end in parsed_offsets:
487+
if end is None:
488+
# Single line - use context
489+
lines = get_decompressed_content_at_line(
490+
path,
491+
start,
492+
context_before=before_context,
493+
context_after=after_context,
494+
index_data=index_data,
495+
)
496+
context_data[start] = lines
497+
line_to_offset[str(start)] = -1
498+
else:
499+
# Range - get exact lines, ignore context
500+
range_key = f'{start}-{end}'
501+
range_lines = []
502+
for line_num in range(start, end + 1):
503+
lines = get_decompressed_content_at_line(
504+
path,
505+
line_num,
506+
context_before=0,
507+
context_after=0,
508+
index_data=index_data,
509+
)
510+
if lines:
511+
range_lines.extend(lines)
512+
context_data[range_key] = range_lines
513+
line_to_offset[range_key] = -1
435514

436515
# Use calculated offsets for seekable zstd, -1 for other formats
437516
if is_seekable_zstd(path):
438517
lines_dict = line_to_offset
439518
else:
440-
lines_dict = {str(ln): -1 for ln in line_list}
519+
lines_dict = line_to_offset
441520

442521
response = SamplesResponse(
443522
path=path,
@@ -528,7 +607,7 @@ def samples_command(
528607
else:
529608
# Range - get exact lines, ignore context
530609
range_key = f'{start}-{end}'
531-
lines = get_line_range(path, start, end)
610+
lines = get_line_range(path, start, end, index_data)
532611
context_data[range_key] = lines
533612
# For ranges, byte offset is not meaningful - use -1 to skip expensive calculation
534613
line_to_offset[range_key] = -1

0 commit comments

Comments
 (0)