From 911abc32fa1865d3e36f834fed585af9f9f0080d Mon Sep 17 00:00:00 2001 From: Orson Peters Date: Fri, 22 Mar 2024 22:10:29 +0100 Subject: [PATCH] fix: incorrect negative offset in multi-byte string slicing (#15140) --- .../src/chunked_array/strings/substring.rs | 70 +++++++++---------- .../unit/namespaces/string/test_string.py | 16 +++++ 2 files changed, 48 insertions(+), 38 deletions(-) diff --git a/crates/polars-ops/src/chunked_array/strings/substring.rs b/crates/polars-ops/src/chunked_array/strings/substring.rs index 690567396fb8..b6d5d9828c3d 100644 --- a/crates/polars-ops/src/chunked_array/strings/substring.rs +++ b/crates/polars-ops/src/chunked_array/strings/substring.rs @@ -6,48 +6,42 @@ fn substring_ternary( opt_offset: Option, opt_length: Option, ) -> Option<&str> { - match (opt_str_val, opt_offset) { - (Some(str_val), Some(offset)) => { - // If `offset` is negative, it counts from the end of the string. - let offset = if offset >= 0 { - offset as usize - } else { - let offset = (0i64 - offset) as usize; - str_val - .char_indices() - .rev() - .nth(offset) - .map(|(idx, _)| idx + 1) - .unwrap_or(0) - }; + let str_val = opt_str_val?; + let offset = opt_offset?; - let mut iter_chars = str_val.char_indices(); - if let Some((offset_idx, _)) = iter_chars.nth(offset) { - let len_end = str_val.len() - offset_idx; - - // Slice to end of str if no length given. - let length = if let Some(length) = opt_length { - length as usize - } else { - len_end - }; + // Fast-path: always empty string. + if opt_length == Some(0) || offset >= str_val.len() as i64 { + return Some(""); + } - if length == 0 { - return Some(""); - } + let mut indices = str_val.char_indices().map(|(o, _)| o); + let mut length_reduction = 0; + let start_byte_offset = if offset >= 0 { + indices.nth(offset as usize).unwrap_or(str_val.len()) + } else { + // If `offset` is negative, it counts from the end of the string. + let mut chars_skipped = 0; + let found = indices + .inspect(|_| chars_skipped += 1) + .nth_back((-offset - 1) as usize); - let end_idx = iter_chars - .nth(length.saturating_sub(1)) - .map(|(idx, _)| idx) - .unwrap_or(str_val.len()); + // If we didn't find our char that means our offset was so negative it + // is before the start of our string. This means our length must be + // reduced, assuming it is finite. + if let Some(off) = found { + off + } else { + length_reduction = (-offset) as usize - chars_skipped; + 0 + } + }; - Some(&str_val[offset_idx..end_idx]) - } else { - Some("") - } - }, - _ => None, - } + let str_val = &str_val[start_byte_offset..]; + let mut indices = str_val.char_indices().map(|(o, _)| o); + let stop_byte_offset = opt_length + .and_then(|l| indices.nth((l as usize).saturating_sub(length_reduction))) + .unwrap_or(str_val.len()); + Some(&str_val[..stop_byte_offset]) } pub(super) fn substring( diff --git a/py-polars/tests/unit/namespaces/string/test_string.py b/py-polars/tests/unit/namespaces/string/test_string.py index 1d34025f539a..ba7cae882c93 100644 --- a/py-polars/tests/unit/namespaces/string/test_string.py +++ b/py-polars/tests/unit/namespaces/string/test_string.py @@ -46,6 +46,22 @@ def test_str_slice_expr() -> None: df.select(pl.col("a").str.slice(0, -1)) +def test_str_slice_multibyte() -> None: + ref = "你好世界" + s = pl.Series([ref]) + + # Pad the string to simplify (negative) offsets starting before/after the string. + npad = 20 + padref = "_" * npad + ref + "_" * npad + for start in range(-5, 6): + for length in range(6): + offset = npad + start if start >= 0 else npad + start + len(ref) + correct = padref[offset : offset + length].strip("_") + result = s.str.slice(start, length) + expected = pl.Series([correct]) + assert_series_equal(result, expected) + + def test_str_len_bytes() -> None: s = pl.Series(["Café", None, "345", "東京"]) result = s.str.len_bytes()