Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: incorrect negative offset in multi-byte string slicing #15140

Merged
merged 2 commits into from
Mar 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 32 additions & 38 deletions crates/polars-ops/src/chunked_array/strings/substring.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,48 +6,42 @@ fn substring_ternary(
opt_offset: Option<i64>,
opt_length: Option<u64>,
) -> Option<&str> {
match (opt_str_val, opt_offset) {
(Some(str_val), Some(offset)) => {
// If `offset` is negative, it counts from the end of the string.
let offset = if offset >= 0 {
offset as usize
} else {
let offset = (0i64 - offset) as usize;
str_val
.char_indices()
.rev()
.nth(offset)
.map(|(idx, _)| idx + 1)
.unwrap_or(0)
};
let str_val = opt_str_val?;
let offset = opt_offset?;

let mut iter_chars = str_val.char_indices();
if let Some((offset_idx, _)) = iter_chars.nth(offset) {
let len_end = str_val.len() - offset_idx;

// Slice to end of str if no length given.
let length = if let Some(length) = opt_length {
length as usize
} else {
len_end
};
// Fast-path: always empty string.
if opt_length == Some(0) || offset >= str_val.len() as i64 {
return Some("");
}

if length == 0 {
return Some("");
}
let mut indices = str_val.char_indices().map(|(o, _)| o);
let mut length_reduction = 0;
let start_byte_offset = if offset >= 0 {
indices.nth(offset as usize).unwrap_or(str_val.len())
} else {
// If `offset` is negative, it counts from the end of the string.
let mut chars_skipped = 0;
let found = indices
.inspect(|_| chars_skipped += 1)
.nth_back((-offset - 1) as usize);

let end_idx = iter_chars
.nth(length.saturating_sub(1))
.map(|(idx, _)| idx)
.unwrap_or(str_val.len());
// If we didn't find our char that means our offset was so negative it
// is before the start of our string. This means our length must be
// reduced, assuming it is finite.
if let Some(off) = found {
off
} else {
length_reduction = (-offset) as usize - chars_skipped;
0
}
};

Some(&str_val[offset_idx..end_idx])
} else {
Some("")
}
},
_ => None,
}
let str_val = &str_val[start_byte_offset..];
let mut indices = str_val.char_indices().map(|(o, _)| o);
let stop_byte_offset = opt_length
.and_then(|l| indices.nth((l as usize).saturating_sub(length_reduction)))
.unwrap_or(str_val.len());
Some(&str_val[..stop_byte_offset])
}

pub(super) fn substring(
Expand Down
16 changes: 16 additions & 0 deletions py-polars/tests/unit/namespaces/string/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,22 @@ def test_str_slice_expr() -> None:
df.select(pl.col("a").str.slice(0, -1))


def test_str_slice_multibyte() -> None:
ref = "你好世界"
s = pl.Series([ref])

# Pad the string to simplify (negative) offsets starting before/after the string.
npad = 20
padref = "_" * npad + ref + "_" * npad
for start in range(-5, 6):
for length in range(6):
offset = npad + start if start >= 0 else npad + start + len(ref)
correct = padref[offset : offset + length].strip("_")
result = s.str.slice(start, length)
expected = pl.Series([correct])
assert_series_equal(result, expected)


def test_str_len_bytes() -> None:
s = pl.Series(["Café", None, "345", "東京"])
result = s.str.len_bytes()
Expand Down
Loading