Skip to content

Commit cbe978b

Browse files
committed
Lines to bytes mapping.
1 parent e9b384e commit cbe978b

File tree

10 files changed

+815
-73
lines changed

10 files changed

+815
-73
lines changed

.github/workflows/release.yml

Lines changed: 28 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -38,55 +38,70 @@ jobs:
3838
artifact_name: rx-macos-x86_64
3939
binary_path: dist/rx
4040

41-
# Skip builds based on manual workflow dispatch input
42-
if: |
43-
github.event_name == 'release' ||
44-
github.event.inputs.platform == 'all' ||
45-
github.event.inputs.platform == matrix.platform
46-
4741
steps:
42+
- name: Check if build should run
43+
id: should_build
44+
run: |
45+
if [ "${{ github.event_name }}" = "release" ]; then
46+
echo "should_run=true" >> $GITHUB_OUTPUT
47+
elif [ "${{ github.event.inputs.platform }}" = "all" ]; then
48+
echo "should_run=true" >> $GITHUB_OUTPUT
49+
elif [ "${{ github.event.inputs.platform }}" = "${{ matrix.platform }}" ]; then
50+
echo "should_run=true" >> $GITHUB_OUTPUT
51+
else
52+
echo "should_run=false" >> $GITHUB_OUTPUT
53+
fi
54+
shell: bash
55+
4856
- name: Checkout code
57+
if: steps.should_build.outputs.should_run == 'true'
4958
uses: actions/checkout@v4
5059

5160
- name: Set up Python
61+
if: steps.should_build.outputs.should_run == 'true'
5262
uses: actions/setup-python@v5
5363
with:
5464
python-version: "3.13"
5565

5666
- name: Install uv
67+
if: steps.should_build.outputs.should_run == 'true'
5768
uses: astral-sh/setup-uv@v4
5869
with:
5970
enable-cache: true
6071

6172
- name: Install dependencies
73+
if: steps.should_build.outputs.should_run == 'true'
6274
run: |
6375
uv sync --group dev --group build
6476
6577
- name: Install ripgrep (Ubuntu)
66-
if: matrix.os == 'ubuntu-latest'
78+
if: steps.should_build.outputs.should_run == 'true' && matrix.os == 'ubuntu-latest'
6779
run: |
6880
sudo apt-get update
6981
sudo apt-get install -y ripgrep
7082
7183
- name: Install ripgrep (macOS)
72-
if: matrix.os == 'macos-latest'
84+
if: steps.should_build.outputs.should_run == 'true' && matrix.os == 'macos-latest'
7385
run: |
7486
brew install ripgrep
7587
7688
- name: Install ripgrep (Windows)
77-
if: matrix.os == 'windows-latest'
89+
if: steps.should_build.outputs.should_run == 'true' && matrix.os == 'windows-latest'
7890
run: |
7991
choco install ripgrep -y
8092
8193
- name: Run tests
94+
if: steps.should_build.outputs.should_run == 'true'
8295
run: |
8396
uv run pytest
8497
8598
- name: Build with PyInstaller
99+
if: steps.should_build.outputs.should_run == 'true'
86100
run: |
87101
uv run pyinstaller rx.spec
88102
89103
- name: Verify binary exists
104+
if: steps.should_build.outputs.should_run == 'true'
90105
shell: bash
91106
run: |
92107
ls -lh dist/
@@ -96,25 +111,26 @@ jobs:
96111
fi
97112
98113
- name: Test binary (Unix)
99-
if: matrix.os != 'windows-latest'
114+
if: steps.should_build.outputs.should_run == 'true' && matrix.os != 'windows-latest'
100115
run: |
101116
chmod +x ${{ matrix.binary_path }}
102117
${{ matrix.binary_path }} --version
103118
104119
- name: Test binary (Windows)
105-
if: matrix.os == 'windows-latest'
120+
if: steps.should_build.outputs.should_run == 'true' && matrix.os == 'windows-latest'
106121
run: |
107122
${{ matrix.binary_path }} --version
108123
109124
- name: Upload artifact
125+
if: steps.should_build.outputs.should_run == 'true'
110126
uses: actions/upload-artifact@v4
111127
with:
112128
name: ${{ matrix.artifact_name }}
113129
path: ${{ matrix.binary_path }}
114130
if-no-files-found: error
115131

116132
- name: Upload to release (on release event)
117-
if: github.event_name == 'release'
133+
if: steps.should_build.outputs.should_run == 'true' && github.event_name == 'release'
118134
uses: actions/upload-release-asset@v1
119135
env:
120136
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

.github/workflows/tests.yml

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,9 @@ name: Tests
22

33
on:
44
push:
5-
branches: [ main ]
5+
branches: [main]
66
pull_request:
7-
branches: [ main ]
7+
branches: [main]
88
workflow_dispatch:
99

1010
jobs:
@@ -29,10 +29,11 @@ jobs:
2929
- name: Run tests with coverage
3030
run: uv run pytest tests/ --cov=rx --cov-report=term-missing --cov-report=xml
3131

32-
- name: Upload coverage to Codecov
33-
uses: codecov/codecov-action@v4
34-
with:
35-
file: ./coverage.xml
36-
fail_ci_if_error: false
37-
env:
38-
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
32+
# disabled for now — no codecov token yet.
33+
# - name: Upload coverage to Codecov
34+
# uses: codecov/codecov-action@v4
35+
# with:
36+
# file: ./coverage.xml
37+
# fail_ci_if_error: false
38+
# env:
39+
# CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}

src/rx/cli/samples.py

Lines changed: 29 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
import click
77

8+
from rx.index import calculate_exact_line_for_offset, calculate_exact_offset_for_line, get_index_path, load_index
89
from rx.models import SamplesResponse
910
from rx.parse import get_context, get_context_by_lines, is_text_file
1011

@@ -124,10 +125,22 @@ def samples_command(
124125
offset_list = list(byte_offset)
125126
context_data = get_context(path, offset_list, before_context, after_context)
126127

128+
# Calculate line numbers for each byte offset (only if JSON output)
129+
offset_to_line = {}
130+
if json_output:
131+
index_data = load_index(get_index_path(path))
132+
for offset in offset_list:
133+
line_num = calculate_exact_line_for_offset(path, offset, index_data)
134+
offset_to_line[str(offset)] = line_num
135+
else:
136+
# For CLI mode, populate with offsets as keys (no line mapping needed)
137+
for offset in offset_list:
138+
offset_to_line[str(offset)] = -1
139+
127140
response = SamplesResponse(
128141
path=path,
129-
offsets=offset_list,
130-
lines=[],
142+
offsets=offset_to_line,
143+
lines={},
131144
before_context=before_context,
132145
after_context=after_context,
133146
samples={str(k): v for k, v in context_data.items()},
@@ -137,10 +150,22 @@ def samples_command(
137150
line_list = list(line_offset)
138151
context_data = get_context_by_lines(path, line_list, before_context, after_context)
139152

153+
# Calculate byte offsets for each line number (only if JSON output)
154+
line_to_offset = {}
155+
if json_output:
156+
index_data = load_index(get_index_path(path))
157+
for line_num in line_list:
158+
byte_offset = calculate_exact_offset_for_line(path, line_num, index_data)
159+
line_to_offset[str(line_num)] = byte_offset
160+
else:
161+
# For CLI mode, populate with line numbers as keys (no offset mapping needed)
162+
for line_num in line_list:
163+
line_to_offset[str(line_num)] = -1
164+
140165
response = SamplesResponse(
141166
path=path,
142-
offsets=[],
143-
lines=line_list,
167+
offsets={},
168+
lines=line_to_offset,
144169
before_context=before_context,
145170
after_context=after_context,
146171
samples={str(k): v for k, v in context_data.items()},

src/rx/index.py

Lines changed: 165 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ def get_large_file_threshold_bytes() -> int:
5050
Controlled by RX_LARGE_TEXT_FILE_MB environment variable.
5151
Default: 100MB
5252
"""
53-
threshold_mb = get_int_env("RX_LARGE_TEXT_FILE_MB")
53+
threshold_mb = get_int_env("DEFAULT_LARGE_FILE_MB")
5454
if threshold_mb <= 0:
5555
threshold_mb = DEFAULT_LARGE_FILE_MB
5656
return threshold_mb * 1024 * 1024
@@ -418,6 +418,170 @@ def find_line_offset(line_index: list[list[int]], target_line: int) -> tuple[int
418418
return (line_index[idx][0], line_index[idx][1])
419419

420420

421+
def calculate_exact_offset_for_line(filename: str, target_line: int, index_data: dict | None = None) -> int:
422+
"""Calculate the exact byte offset for a given line number.
423+
424+
Args:
425+
filename: Path to the file
426+
target_line: Line number (1-based) to find offset for
427+
index_data: Optional index data. If None, will try to load or calculate
428+
429+
Returns:
430+
Byte offset of the line, or -1 if cannot determine (large file without index)
431+
"""
432+
# If no index provided, try to load it
433+
if index_data is None:
434+
index_path = get_index_path(filename)
435+
index_data = load_index(index_path)
436+
437+
# If we have an index, use it
438+
if index_data:
439+
line_index = index_data.get("line_index", [])
440+
if not line_index:
441+
return -1
442+
443+
# Find closest indexed line
444+
indexed_line, indexed_offset = find_line_offset(line_index, target_line)
445+
446+
# If exact match, return it
447+
if indexed_line == target_line:
448+
return indexed_offset
449+
450+
# Read from indexed position and count to target
451+
# Sequential reading is fast due to OS buffering and disk read-ahead
452+
try:
453+
with open(filename, "rb") as f:
454+
f.seek(indexed_offset)
455+
current_line = indexed_line
456+
current_offset = indexed_offset
457+
458+
for line_bytes in f:
459+
if current_line == target_line:
460+
return current_offset
461+
current_offset += len(line_bytes)
462+
current_line += 1
463+
464+
# Reached EOF before finding target line
465+
return -1
466+
except (IOError, OSError) as e:
467+
logger.error(f"Failed to read file {filename}: {e}")
468+
return -1
469+
470+
# No index - check if file is small enough to read
471+
try:
472+
file_size = os.path.getsize(filename)
473+
threshold = get_large_file_threshold_bytes()
474+
475+
if file_size > threshold:
476+
# Large file without index - cannot determine
477+
return -1
478+
479+
# Small file - read from beginning
480+
with open(filename, "rb") as f:
481+
current_line = 0
482+
current_offset = 0
483+
484+
for line_bytes in f:
485+
current_line += 1
486+
if current_line == target_line:
487+
return current_offset
488+
current_offset += len(line_bytes)
489+
490+
# Target line beyond EOF
491+
return -1
492+
except (IOError, OSError) as e:
493+
logger.error(f"Failed to process file {filename}: {e}")
494+
return -1
495+
496+
497+
def calculate_exact_line_for_offset(filename: str, target_offset: int, index_data: dict | None = None) -> int:
498+
"""Calculate the exact line number for a given byte offset.
499+
500+
Args:
501+
filename: Path to the file
502+
target_offset: Byte offset to find line number for
503+
index_data: Optional index data. If None, will try to load or calculate
504+
505+
Returns:
506+
Line number (1-based) at the offset, or -1 if cannot determine
507+
"""
508+
# If no index provided, try to load it
509+
if index_data is None:
510+
index_path = get_index_path(filename)
511+
index_data = load_index(index_path)
512+
513+
# If we have an index, use it
514+
if index_data:
515+
line_index = index_data.get("line_index", [])
516+
if not line_index:
517+
return -1
518+
519+
# Find closest indexed line before target offset
520+
# Binary search by offset
521+
offsets = [entry[1] for entry in line_index]
522+
idx = bisect.bisect_right(offsets, target_offset) - 1
523+
if idx < 0:
524+
idx = 0
525+
526+
indexed_line, indexed_offset = line_index[idx]
527+
528+
# If exact match, return it
529+
if indexed_offset == target_offset:
530+
return indexed_line
531+
532+
# Read from indexed position and count lines to target offset
533+
# Sequential reading is fast due to OS buffering and disk read-ahead
534+
try:
535+
with open(filename, "rb") as f:
536+
f.seek(indexed_offset)
537+
current_line = indexed_line
538+
current_offset = indexed_offset
539+
540+
for line_bytes in f:
541+
if current_offset == target_offset:
542+
return current_line
543+
if current_offset + len(line_bytes) > target_offset:
544+
# Target offset is within this line
545+
return current_line
546+
current_offset += len(line_bytes)
547+
current_line += 1
548+
549+
# Reached EOF
550+
return -1
551+
except (IOError, OSError) as e:
552+
logger.error(f"Failed to read file {filename}: {e}")
553+
return -1
554+
555+
# No index - check if file is small enough to read
556+
try:
557+
file_size = os.path.getsize(filename)
558+
threshold = get_large_file_threshold_bytes()
559+
560+
if file_size > threshold:
561+
# Large file without index - cannot determine
562+
return -1
563+
564+
# Small file - read from beginning
565+
with open(filename, "rb") as f:
566+
current_line = 0
567+
current_offset = 0
568+
569+
for line_bytes in f:
570+
current_line += 1
571+
if current_offset == target_offset:
572+
return current_line
573+
if current_offset + len(line_bytes) > target_offset:
574+
# Target offset is within this line
575+
return current_line
576+
current_offset += len(line_bytes)
577+
578+
# EOF
579+
return -1
580+
except (IOError, OSError) as e:
581+
logger.error(f"Failed to process file {filename}: {e}")
582+
return -1
583+
584+
421585
def get_index_info(source_path: str) -> dict | None:
422586
"""Get information about an existing index.
423587

src/rx/models.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -582,8 +582,12 @@ class SamplesResponse(BaseModel):
582582
"""Response from samples endpoint"""
583583

584584
path: str = Field(..., example="/path/to/file.txt")
585-
offsets: list[int] = Field(default_factory=list, example=[123, 456])
586-
lines: list[int] = Field(default_factory=list, example=[100, 200])
585+
offsets: dict[str, int] = Field(
586+
default_factory=dict, example={"123": 1, "456": 2}, description="Mapping of byte offset to line number"
587+
)
588+
lines: dict[str, int] = Field(
589+
default_factory=dict, example={"1": 0, "2": 123}, description="Mapping of line number to byte offset"
590+
)
587591
before_context: int = Field(..., example=3)
588592
after_context: int = Field(..., example=3)
589593
samples: dict[str, list[str]] = Field(

0 commit comments

Comments
 (0)