Skip to content

Commit 19aea1d

Browse files
committed
std.mem: Split tokenize into 3 versions by delimiter type: full, any, and scalar
This allows users to choose the version that fits their use case, since the previous default (now the 'any' version) (1) was not always the desired delimiter-matching behavior and (2) performed worse than the scalar version when the delimiter was a single item.
1 parent 29c48ef commit 19aea1d

File tree

1 file changed

+169
-54
lines changed

1 file changed

+169
-54
lines changed

lib/std/mem.zig

Lines changed: 169 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -1862,72 +1862,117 @@ test "byteSwapAllFields" {
18621862
}, s);
18631863
}
18641864

/// Deprecated: use `tokenizeAny`, `tokenizeFull`, or `tokenizeScalar`
pub const tokenize = tokenizeAny;

/// Returns an iterator that iterates over the slices of `buffer` that are not
/// any of the items in `delimiters`.
///
/// `tokenizeAny(u8, " abc|def || ghi ", " |")` will return slices
/// for "abc", "def", "ghi", null, in that order.
///
/// If `buffer` is empty, the iterator will return null.
/// If none of `delimiters` exist in buffer,
/// the iterator will return `buffer`, null, in that order.
///
/// See also: `tokenizeFull`, `tokenizeScalar`,
/// `splitFull`, `splitAny`, `splitScalar`,
/// `splitBackwardsFull`, `splitBackwardsAny`, and `splitBackwardsScalar`
pub fn tokenizeAny(comptime T: type, buffer: []const T, delimiters: []const T) TokenIterator(T, .any) {
    return .{
        .index = 0,
        .buffer = buffer,
        .delimiter = delimiters,
    };
}
/// Returns an iterator that iterates over the slices of `buffer` that are not
/// the sequence in `delimiter`.
///
/// `tokenizeFull(u8, "<>abc><def<><>ghi", "<>")` will return slices
/// for "abc><def", "ghi", null, in that order.
///
/// If `buffer` is empty, the iterator will return null.
/// If `delimiter` does not exist in buffer,
/// the iterator will return `buffer`, null, in that order.
/// The delimiter length must not be zero.
///
/// See also: `tokenizeAny`, `tokenizeScalar`,
/// `splitFull`, `splitAny`, `splitScalar`,
/// `splitBackwardsFull`, `splitBackwardsAny`, and `splitBackwardsScalar`
pub fn tokenizeFull(comptime T: type, buffer: []const T, delimiter: []const T) TokenIterator(T, .full) {
    // An empty delimiter would make the iterator's skip-forward step a no-op
    // and loop forever, so it is rejected up front.
    assert(delimiter.len != 0);
    return .{
        .index = 0,
        .buffer = buffer,
        .delimiter = delimiter,
    };
}
18831911

/// Returns an iterator that iterates over the slices of `buffer` that are not
/// `delimiter`.
///
/// `tokenizeScalar(u8, " abc def ghi ", ' ')` will return slices
/// for "abc", "def", "ghi", null, in that order.
///
/// If `buffer` is empty, the iterator will return null.
/// If `delimiter` does not exist in buffer,
/// the iterator will return `buffer`, null, in that order.
///
/// See also: `tokenizeAny`, `tokenizeFull`,
/// `splitFull`, `splitAny`, `splitScalar`,
/// `splitBackwardsFull`, `splitBackwardsAny`, and `splitBackwardsScalar`
pub fn tokenizeScalar(comptime T: type, buffer: []const T, delimiter: T) TokenIterator(T, .scalar) {
    return .{
        .index = 0,
        .buffer = buffer,
        .delimiter = delimiter,
    };
}
test "tokenizeScalar" {
    // Leading, trailing, and repeated delimiters produce no empty tokens.
    var it = tokenizeScalar(u8, " abc def ghi ", ' ');
    try testing.expect(eql(u8, it.next().?, "abc"));
    try testing.expect(eql(u8, it.peek().?, "def"));
    try testing.expect(eql(u8, it.next().?, "def"));
    try testing.expect(eql(u8, it.next().?, "ghi"));
    try testing.expect(it.next() == null);

    // `index` tracks how far the iterator has consumed the buffer.
    it = tokenizeScalar(u8, "..\\bob", '\\');
    try testing.expect(eql(u8, it.next().?, ".."));
    try testing.expect(eql(u8, "..", "..\\bob"[0..it.index]));
    try testing.expect(eql(u8, it.next().?, "bob"));
    try testing.expect(it.next() == null);

    it = tokenizeScalar(u8, "//a/b", '/');
    try testing.expect(eql(u8, it.next().?, "a"));
    try testing.expect(eql(u8, it.next().?, "b"));
    try testing.expect(eql(u8, "//a/b", "//a/b"[0..it.index]));
    try testing.expect(it.next() == null);

    // A buffer consisting only of delimiters yields no tokens.
    it = tokenizeScalar(u8, "|", '|');
    try testing.expect(it.next() == null);
    try testing.expect(it.peek() == null);

    // An empty buffer yields no tokens.
    it = tokenizeScalar(u8, "", '|');
    try testing.expect(it.next() == null);
    try testing.expect(it.peek() == null);

    // A buffer with no delimiter occurrences yields the whole buffer.
    it = tokenizeScalar(u8, "hello", ' ');
    try testing.expect(eql(u8, it.next().?, "hello"));
    try testing.expect(it.next() == null);

    // The scalar delimiter works for any element type, not just u8.
    var it16 = tokenizeScalar(
        u16,
        std.unicode.utf8ToUtf16LeStringLiteral("hello"),
        ' ',
    );
    try testing.expect(eql(u16, it16.next().?, std.unicode.utf8ToUtf16LeStringLiteral("hello")));
    try testing.expect(it16.next() == null);
}
19281973

1929-
test "tokenize (multibyte)" {
1930-
var it = tokenize(u8, "a|b,c/d e", " /,|");
1974+
test "tokenizeAny (multibyte)" {
1975+
var it = tokenizeAny(u8, "a|b,c/d e", " /,|");
19311976
try testing.expect(eql(u8, it.next().?, "a"));
19321977
try testing.expect(eql(u8, it.peek().?, "b"));
19331978
try testing.expect(eql(u8, it.next().?, "b"));
@@ -1937,7 +1982,11 @@ test "tokenize (multibyte)" {
19371982
try testing.expect(it.next() == null);
19381983
try testing.expect(it.peek() == null);
19391984

1940-
var it16 = tokenize(
1985+
it = tokenizeAny(u8, "hello", "");
1986+
try testing.expect(eql(u8, it.next().?, "hello"));
1987+
try testing.expect(it.next() == null);
1988+
1989+
var it16 = tokenizeAny(
19411990
u16,
19421991
std.unicode.utf8ToUtf16LeStringLiteral("a|b,c/d e"),
19431992
std.unicode.utf8ToUtf16LeStringLiteral(" /,|"),
@@ -1950,18 +1999,68 @@ test "tokenize (multibyte)" {
19501999
try testing.expect(it16.next() == null);
19512000
}
19522001

test "tokenizeFull" {
    // Partial matches ("><", "d><") are kept inside tokens; only the exact
    // "<>" sequence splits.
    var it = tokenizeFull(u8, "a<>b<><>c><>d><", "<>");
    try testing.expectEqualStrings("a", it.next().?);
    try testing.expectEqualStrings("b", it.peek().?);
    try testing.expectEqualStrings("b", it.next().?);
    try testing.expectEqualStrings("c>", it.next().?);
    try testing.expectEqualStrings("d><", it.next().?);
    try testing.expect(it.next() == null);
    try testing.expect(it.peek() == null);

    // Sequence delimiters also work for non-u8 element types.
    var it16 = tokenizeFull(
        u16,
        std.unicode.utf8ToUtf16LeStringLiteral("a<>b<><>c><>d><"),
        std.unicode.utf8ToUtf16LeStringLiteral("<>"),
    );
    try testing.expect(eql(u16, it16.next().?, std.unicode.utf8ToUtf16LeStringLiteral("a")));
    try testing.expect(eql(u16, it16.next().?, std.unicode.utf8ToUtf16LeStringLiteral("b")));
    try testing.expect(eql(u16, it16.next().?, std.unicode.utf8ToUtf16LeStringLiteral("c>")));
    try testing.expect(eql(u16, it16.next().?, std.unicode.utf8ToUtf16LeStringLiteral("d><")));
    try testing.expect(it16.next() == null);
}
test "tokenize (reset)" {
    // After `reset`, each iterator variant replays the same token sequence
    // from the beginning of the buffer.
    {
        var it = tokenizeAny(u8, " abc def ghi ", " ");
        try testing.expect(eql(u8, it.next().?, "abc"));
        try testing.expect(eql(u8, it.next().?, "def"));
        try testing.expect(eql(u8, it.next().?, "ghi"));

        it.reset();

        try testing.expect(eql(u8, it.next().?, "abc"));
        try testing.expect(eql(u8, it.next().?, "def"));
        try testing.expect(eql(u8, it.next().?, "ghi"));
        try testing.expect(it.next() == null);
    }
    {
        var it = tokenizeFull(u8, "<><>abc<>def<><>ghi<>", "<>");
        try testing.expect(eql(u8, it.next().?, "abc"));
        try testing.expect(eql(u8, it.next().?, "def"));
        try testing.expect(eql(u8, it.next().?, "ghi"));

        it.reset();

        try testing.expect(eql(u8, it.next().?, "abc"));
        try testing.expect(eql(u8, it.next().?, "def"));
        try testing.expect(eql(u8, it.next().?, "ghi"));
        try testing.expect(it.next() == null);
    }
    {
        var it = tokenizeScalar(u8, " abc def ghi ", ' ');
        try testing.expect(eql(u8, it.next().?, "abc"));
        try testing.expect(eql(u8, it.next().?, "def"));
        try testing.expect(eql(u8, it.next().?, "ghi"));

        it.reset();

        try testing.expect(eql(u8, it.next().?, "abc"));
        try testing.expect(eql(u8, it.next().?, "def"));
        try testing.expect(eql(u8, it.next().?, "ghi"));
        try testing.expect(it.next() == null);
    }
}
19662065

19672066
/// Deprecated: use `splitFull`, `splitAny`, or `splitScalar`
@@ -1978,8 +2077,8 @@ pub const split = splitFull;
19782077
/// The delimiter length must not be zero.
19792078
///
19802079
/// See also: `splitAny`, `splitScalar`, `splitBackwardsFull`,
1981-
/// `splitBackwardsAny`,`splitBackwardsScalar`, and
1982-
/// `tokenize`.
2080+
/// `splitBackwardsAny`,`splitBackwardsScalar`,
2081+
/// `tokenizeAny`, `tokenizeFull`, and `tokenizeScalar`.
19832082
pub fn splitFull(comptime T: type, buffer: []const T, delimiter: []const T) SplitIterator(T, .full) {
19842083
assert(delimiter.len != 0);
19852084
return .{
@@ -1999,8 +2098,8 @@ pub fn splitFull(comptime T: type, buffer: []const T, delimiter: []const T) Spli
19992098
/// the iterator will return `buffer`, null, in that order.
20002099
///
20012100
/// See also: `splitFull`, `splitScalar`, `splitBackwardsFull`,
2002-
/// `splitBackwardsAny`,`splitBackwardsScalar`, and
2003-
/// `tokenize`.
2101+
/// `splitBackwardsAny`,`splitBackwardsScalar`,
2102+
/// `tokenizeAny`, `tokenizeFull`, and `tokenizeScalar`.
20042103
pub fn splitAny(comptime T: type, buffer: []const T, delimiters: []const T) SplitIterator(T, .any) {
20052104
return .{
20062105
.index = 0,
@@ -2019,8 +2118,8 @@ pub fn splitAny(comptime T: type, buffer: []const T, delimiters: []const T) Spli
20192118
/// the iterator will return `buffer`, null, in that order.
20202119
///
20212120
/// See also: `splitFull`, `splitAny`, `splitBackwardsFull`,
2022-
/// `splitBackwardsAny`,`splitBackwardsScalar`, and
2023-
/// `tokenize`.
2121+
/// `splitBackwardsAny`,`splitBackwardsScalar`,
2122+
/// `tokenizeAny`, `tokenizeFull`, and `tokenizeScalar`.
20242123
pub fn splitScalar(comptime T: type, buffer: []const T, delimiter: T) SplitIterator(T, .scalar) {
20252124
return .{
20262125
.index = 0,
@@ -2176,8 +2275,8 @@ pub const splitBackwards = splitBackwardsFull;
21762275
/// The delimiter length must not be zero.
21772276
///
21782277
/// See also: `splitBackwardsAny`, `splitBackwardsScalar`,
2179-
/// `splitFull`, `splitAny`,`splitScalar`, and
2180-
/// `tokenize`.
2278+
/// `splitFull`, `splitAny`,`splitScalar`,
2279+
/// `tokenizeAny`, `tokenizeFull`, and `tokenizeScalar`.
21812280
pub fn splitBackwardsFull(comptime T: type, buffer: []const T, delimiter: []const T) SplitBackwardsIterator(T, .full) {
21822281
assert(delimiter.len != 0);
21832282
return .{
@@ -2197,8 +2296,8 @@ pub fn splitBackwardsFull(comptime T: type, buffer: []const T, delimiter: []cons
21972296
/// the iterator will return `buffer`, null, in that order.
21982297
///
21992298
/// See also: `splitBackwardsFull`, `splitBackwardsScalar`,
2200-
/// `splitFull`, `splitAny`,`splitScalar`, and
2201-
/// `tokenize`.
2299+
/// `splitFull`, `splitAny`,`splitScalar`,
2300+
/// `tokenizeAny`, `tokenizeFull`, and `tokenizeScalar`.
22022301
pub fn splitBackwardsAny(comptime T: type, buffer: []const T, delimiters: []const T) SplitBackwardsIterator(T, .any) {
22032302
return .{
22042303
.index = buffer.len,
@@ -2217,8 +2316,8 @@ pub fn splitBackwardsAny(comptime T: type, buffer: []const T, delimiters: []cons
22172316
/// the iterator will return `buffer`, null, in that order.
22182317
///
22192318
/// See also: `splitBackwardsFull`, `splitBackwardsAny`,
2220-
/// `splitFull`, `splitAny`,`splitScalar`, and
2221-
/// `tokenize`.
2319+
/// `splitFull`, `splitAny`,`splitScalar`,
2320+
/// `tokenizeAny`, `tokenizeFull`, and `tokenizeScalar`.
22222321
pub fn splitBackwardsScalar(comptime T: type, buffer: []const T, delimiter: T) SplitBackwardsIterator(T, .scalar) {
22232322
return .{
22242323
.index = buffer.len,
@@ -2548,10 +2647,13 @@ test "endsWith" {
25482647

25492648
pub const DelimiterType = enum { full, any, scalar };
25502649

2551-
pub fn TokenIterator(comptime T: type) type {
2650+
pub fn TokenIterator(comptime T: type, comptime delimiter_type: DelimiterType) type {
25522651
return struct {
25532652
buffer: []const T,
2554-
delimiter_bytes: []const T,
2653+
delimiter: switch (delimiter_type) {
2654+
.full, .any => []const T,
2655+
.scalar => T,
2656+
},
25552657
index: usize,
25562658

25572659
const Self = @This();
@@ -2568,15 +2670,18 @@ pub fn TokenIterator(comptime T: type) type {
25682670
/// complete. Does not advance to the next token.
25692671
pub fn peek(self: *Self) ?[]const T {
25702672
// move to beginning of token
2571-
while (self.index < self.buffer.len and self.isSplitByte(self.buffer[self.index])) : (self.index += 1) {}
2673+
while (self.index < self.buffer.len and self.isDelimiter(self.index)) : (self.index += switch (delimiter_type) {
2674+
.full => self.delimiter.len,
2675+
.any, .scalar => 1,
2676+
}) {}
25722677
const start = self.index;
25732678
if (start == self.buffer.len) {
25742679
return null;
25752680
}
25762681

25772682
// move to end of token
25782683
var end = start;
2579-
while (end < self.buffer.len and !self.isSplitByte(self.buffer[end])) : (end += 1) {}
2684+
while (end < self.buffer.len and !self.isDelimiter(end)) : (end += 1) {}
25802685

25812686
return self.buffer[start..end];
25822687
}
@@ -2585,7 +2690,10 @@ pub fn TokenIterator(comptime T: type) type {
25852690
pub fn rest(self: Self) []const T {
25862691
// move to beginning of token
25872692
var index: usize = self.index;
2588-
while (index < self.buffer.len and self.isSplitByte(self.buffer[index])) : (index += 1) {}
2693+
while (index < self.buffer.len and self.isDelimiter(index)) : (index += switch (delimiter_type) {
2694+
.full => self.delimiter.len,
2695+
.any, .scalar => 1,
2696+
}) {}
25892697
return self.buffer[index..];
25902698
}
25912699

@@ -2594,13 +2702,20 @@ pub fn TokenIterator(comptime T: type) type {
25942702
self.index = 0;
25952703
}
25962704

2597-
fn isSplitByte(self: Self, byte: T) bool {
2598-
for (self.delimiter_bytes) |delimiter_byte| {
2599-
if (byte == delimiter_byte) {
2600-
return true;
2601-
}
2705+
fn isDelimiter(self: Self, index: usize) bool {
2706+
switch (delimiter_type) {
2707+
.full => return startsWith(T, self.buffer[index..], self.delimiter),
2708+
.any => {
2709+
const item = self.buffer[index];
2710+
for (self.delimiter) |delimiter_item| {
2711+
if (item == delimiter_item) {
2712+
return true;
2713+
}
2714+
}
2715+
return false;
2716+
},
2717+
.scalar => return self.buffer[index] == self.delimiter,
26022718
}
2603-
return false;
26042719
}
26052720
};
26062721
}

0 commit comments

Comments
 (0)