Skip to content

Commit b2088ce

Browse files
committed
src: replace naive search in Buffer::IndexOf
Adds the string search implementation from V8, which uses a naive search if the pattern length is < 8 or until a specific badness threshold is reached, and then switches to Boyer-Moore-Horspool. The added benchmark shows the expected improvements. Also adds an option to use the ucs2 encoding with Buffer::IndexOf.
1 parent abb2a4b commit b2088ce

File tree

8 files changed

+4931
-60
lines changed

8 files changed

+4931
-60
lines changed

benchmark/buffers/buffer-indexof.js

+38
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
var common = require('../common.js');
2+
var fs = require('fs');
3+
4+
// Search patterns of varying length that the benchmark looks for in the
// fixture text.
var searchPatterns = [
  '@', 'SQ', '10x', '--l', 'Alice', 'Gryphon', 'Panther',
  'Ou est ma chatte?', 'found it very', 'among mad people',
  'neighbouring pool', 'Soo--oop', 'aaaaaaaaaaaaaaaaa',
  'venture to go near the house till she had brought herself down to',
  '</i> to the Caterpillar'
];

// Benchmark matrix: every search pattern is run for each encoding and for
// both needle types. The encoding 'undefined' is spelled as a string here
// and turned into a real `undefined` inside main().
var bench = common.createBenchmark(main, {
  search: searchPatterns,
  encoding: ['undefined', 'utf8', 'ucs2', 'binary'],
  type: ['buffer', 'string'],
  iter: [1]
});
14+
15+
// Benchmark body: times `conf.iter * 100000` calls to Buffer#indexOf() on the
// alice.html fixture for one search / encoding / type combination.
function main(conf) {
  var iterations = conf.iter * 100000;
  var needle = conf.search;
  // The option list carries the literal string 'undefined'; map it to a real
  // `undefined` so indexOf() is exercised without an encoding argument.
  var enc = conf.encoding === 'undefined' ? undefined : conf.encoding;
  var haystack = fs.readFileSync(__dirname + '/../fixtures/alice.html');

  // For ucs2 the haystack itself has to be re-encoded, otherwise the needle
  // and the haystack would use different byte layouts.
  if (enc === 'ucs2')
    haystack = new Buffer(haystack.toString(), enc);

  // 'buffer' runs pass the needle as a Buffer encoded with the same encoding.
  if (conf.type === 'buffer')
    needle = new Buffer(new Buffer(needle).toString(), enc);

  bench.start();
  var done = 0;
  while (done < iterations) {
    haystack.indexOf(needle, 0, enc);
    done++;
  }
  bench.end(iterations);
}

benchmark/fixtures/alice.html

+3,867
Large diffs are not rendered by default.

lib/buffer.js

+39-6
Original file line numberDiff line numberDiff line change
@@ -395,20 +395,53 @@ Buffer.prototype.compare = function compare(b) {
395395
return binding.compare(this, b);
396396
};
397397

398+
// Encoding-aware path of Buffer#indexOf() for string needles.
//
// utf8 / ucs2 / utf16le / binary needles are handed to the native string
// search as-is; base64 / ascii / hex needles are first decoded into a Buffer
// and handed to the native buffer search. An encoding that matches neither
// group is retried once in its lowercased string form; if that still does
// not match, a TypeError is thrown.
function slowIndexOf(buffer, val, byteOffset, encoding) {
  // Pass 0 uses the encoding exactly as supplied, pass 1 its lowercased form.
  for (var pass = 0; pass < 2; pass++) {
    if (pass === 1)
      encoding = ('' + encoding).toLowerCase();

    switch (encoding) {
      case 'utf8':
      case 'utf-8':
      case 'ucs2':
      case 'ucs-2':
      case 'utf16le':
      case 'utf-16le':
      case 'binary':
        return binding.indexOfString(buffer, val, byteOffset, encoding);

      case 'base64':
      case 'ascii':
      case 'hex':
        return binding.indexOfBuffer(
            buffer, Buffer(val, encoding), byteOffset, encoding);
    }
  }

  throw new TypeError('Unknown encoding: ' + encoding);
}
427+
428+
// Returns the index of the first occurrence of `val` in this buffer at or
// after `byteOffset`, as reported by the native binding.
//
//   val         string, Buffer or number to look for
//   byteOffset  start position; clamped to the int32 range, then coerced to
//               an integer
//   encoding    optional encoding of `val`; forwarded for string and Buffer
//               needles, not used for numbers
//
// Throws a TypeError when `val` is none of the supported types.
Buffer.prototype.indexOf = function indexOf(val, byteOffset, encoding) {
  var INT32_MAX = 0x7fffffff;
  var INT32_MIN = -0x80000000;

  // Clamp before `>>= 0` so out-of-range offsets saturate instead of wrapping.
  byteOffset = byteOffset > INT32_MAX ? INT32_MAX :
               byteOffset < INT32_MIN ? INT32_MIN : byteOffset;
  byteOffset >>= 0;

  if (typeof val === 'string') {
    // Without an explicit encoding go straight to the native string search;
    // otherwise let slowIndexOf() validate and dispatch on the encoding.
    return encoding === undefined ?
        binding.indexOfString(this, val, byteOffset, encoding) :
        slowIndexOf(this, val, byteOffset, encoding);
  }

  if (val instanceof Buffer)
    return binding.indexOfBuffer(this, val, byteOffset, encoding);

  if (typeof val === 'number')
    return binding.indexOfNumber(this, val, byteOffset);

  throw new TypeError('val must be string, number or Buffer');
};

node.gyp

+1
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,7 @@
166166
'src/util.h',
167167
'src/util-inl.h',
168168
'src/util.cc',
169+
'src/string_search.cc',
169170
'deps/http_parser/http_parser.h',
170171
'deps/v8/include/v8.h',
171172
'deps/v8/include/v8-debug.h',

src/node_buffer.cc

+124-54
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
#include "env.h"
55
#include "env-inl.h"
66
#include "string_bytes.h"
7+
#include "string_search.h"
78
#include "util.h"
89
#include "util-inl.h"
910
#include "v8-profiler.h"
@@ -854,87 +855,156 @@ void Compare(const FunctionCallbackInfo<Value> &args) {
854855
}
855856

856857

857-
int32_t IndexOf(const char* haystack,
858-
size_t h_length,
859-
const char* needle,
860-
size_t n_length) {
861-
CHECK_GE(h_length, n_length);
862-
// TODO(trevnorris): Implement Boyer-Moore string search algorithm.
863-
for (size_t i = 0; i < h_length - n_length + 1; i++) {
864-
if (haystack[i] == needle[0]) {
865-
if (memcmp(haystack + i, needle, n_length) == 0)
866-
return i;
867-
}
868-
}
869-
return -1;
870-
}
871-
872-
873858
void IndexOfString(const FunctionCallbackInfo<Value>& args) {
874859
ASSERT(args[1]->IsString());
875860
ASSERT(args[2]->IsNumber());
876861

862+
enum encoding enc = ParseEncoding(args.GetIsolate(),
863+
args[3],
864+
UTF8);
865+
877866
THROW_AND_RETURN_UNLESS_BUFFER(Environment::GetCurrent(args), args[0]);
878867
SPREAD_ARG(args[0], ts_obj);
879868

880-
node::Utf8Value str(args.GetIsolate(), args[1]);
881-
int32_t offset_i32 = args[2]->Int32Value();
882-
uint32_t offset;
869+
Local<String> needle = args[1].As<String>();
870+
const char* haystack = ts_obj_data;
871+
const size_t haystack_length = ts_obj_length;
872+
const size_t needle_length = needle->Utf8Length();
873+
874+
875+
if (needle_length == 0 || haystack_length == 0) {
876+
return args.GetReturnValue().Set(-1);
877+
}
878+
879+
int64_t offset_i64 = args[2]->IntegerValue();
880+
size_t offset = 0;
883881

884-
if (offset_i32 < 0) {
885-
if (offset_i32 + static_cast<int32_t>(ts_obj_length) < 0)
882+
if (offset_i64 < 0) {
883+
if (offset_i64 + static_cast<int64_t>(haystack_length) < 0) {
886884
offset = 0;
887-
else
888-
offset = static_cast<uint32_t>(ts_obj_length + offset_i32);
885+
} else {
886+
offset = static_cast<size_t>(haystack_length + offset_i64);
887+
}
889888
} else {
890-
offset = static_cast<uint32_t>(offset_i32);
889+
offset = static_cast<size_t>(offset_i64);
891890
}
892891

893-
if (str.length() == 0 ||
894-
ts_obj_length == 0 ||
895-
(offset != 0 && str.length() + offset <= str.length()) ||
896-
str.length() + offset > ts_obj_length)
892+
if (haystack_length < offset || needle_length + offset > haystack_length) {
897893
return args.GetReturnValue().Set(-1);
894+
}
898895

899-
int32_t r =
900-
IndexOf(ts_obj_data + offset, ts_obj_length - offset, *str, str.length());
901-
args.GetReturnValue().Set(r == -1 ? -1 : static_cast<int32_t>(r + offset));
902-
}
896+
size_t result = haystack_length;
897+
898+
if (enc == UCS2) {
899+
String::Value needle_value(needle);
900+
if (*needle_value == nullptr)
901+
return args.GetReturnValue().Set(-1);
902+
903+
if (haystack_length < 2 || needle_value.length() < 1) {
904+
return args.GetReturnValue().Set(-1);
905+
}
906+
907+
result = SearchString(reinterpret_cast<const uint16_t*>(haystack),
908+
haystack_length / 2,
909+
reinterpret_cast<const uint16_t*>(*needle_value),
910+
needle_value.length(),
911+
offset / 2);
912+
result *= 2;
913+
} else if (enc == UTF8) {
914+
String::Utf8Value needle_value(needle);
915+
if (*needle_value == nullptr)
916+
return args.GetReturnValue().Set(-1);
917+
918+
result = SearchString(reinterpret_cast<const uint8_t*>(haystack),
919+
haystack_length,
920+
reinterpret_cast<const uint8_t*>(*needle_value),
921+
needle_length,
922+
offset);
923+
} else if (enc == BINARY) {
924+
uint8_t* needle_data = static_cast<uint8_t*>(malloc(needle_length));
925+
if (needle_data == nullptr) {
926+
return args.GetReturnValue().Set(-1);
927+
}
928+
needle->WriteOneByte(
929+
needle_data, 0, needle_length, String::NO_NULL_TERMINATION);
930+
931+
result = SearchString(reinterpret_cast<const uint8_t*>(haystack),
932+
haystack_length,
933+
needle_data,
934+
needle_length,
935+
offset);
936+
free(needle_data);
937+
}
903938

939+
args.GetReturnValue().Set(
940+
result == haystack_length ? -1 : static_cast<int>(result));
941+
}
904942

905943
void IndexOfBuffer(const FunctionCallbackInfo<Value>& args) {
906944
ASSERT(args[1]->IsObject());
907945
ASSERT(args[2]->IsNumber());
908946

947+
enum encoding enc = ParseEncoding(args.GetIsolate(),
948+
args[3],
949+
UTF8);
950+
909951
THROW_AND_RETURN_UNLESS_BUFFER(Environment::GetCurrent(args), args[0]);
910952
SPREAD_ARG(args[0], ts_obj);
911953
SPREAD_ARG(args[1], buf);
912-
const int32_t offset_i32 = args[2]->Int32Value();
913-
uint32_t offset;
914954

915955
if (buf_length > 0)
916956
CHECK_NE(buf_data, nullptr);
917957

918-
if (offset_i32 < 0) {
919-
if (offset_i32 + static_cast<int32_t>(ts_obj_length) < 0)
958+
const char* haystack = ts_obj_data;
959+
const size_t haystack_length = ts_obj_length;
960+
const char* needle = buf_data;
961+
const size_t needle_length = buf_length;
962+
963+
if (needle_length == 0 || haystack_length == 0) {
964+
return args.GetReturnValue().Set(-1);
965+
}
966+
967+
int64_t offset_i64 = args[2]->IntegerValue();
968+
size_t offset = 0;
969+
970+
if (offset_i64 < 0) {
971+
if (offset_i64 + static_cast<int64_t>(haystack_length) < 0)
920972
offset = 0;
921973
else
922-
offset = static_cast<uint32_t>(ts_obj_length + offset_i32);
974+
offset = static_cast<size_t>(haystack_length + offset_i64);
923975
} else {
924-
offset = static_cast<uint32_t>(offset_i32);
976+
offset = static_cast<size_t>(offset_i64);
925977
}
926978

927-
if (buf_length == 0 ||
928-
ts_obj_length == 0 ||
929-
(offset != 0 && buf_length + offset <= buf_length) ||
930-
buf_length + offset > ts_obj_length)
979+
if (haystack_length < offset || needle_length + offset > haystack_length) {
931980
return args.GetReturnValue().Set(-1);
981+
}
932982

933-
int32_t r =
934-
IndexOf(ts_obj_data + offset, ts_obj_length - offset, buf_data, buf_length);
935-
args.GetReturnValue().Set(r == -1 ? -1 : static_cast<int32_t>(r + offset));
936-
}
983+
size_t result = haystack_length;
937984

985+
if (enc == UCS2) {
986+
if (haystack_length < 2 || needle_length < 2) {
987+
return args.GetReturnValue().Set(-1);
988+
}
989+
result = SearchString(
990+
reinterpret_cast<const uint16_t*>(haystack),
991+
haystack_length / 2,
992+
reinterpret_cast<const uint16_t*>(needle),
993+
needle_length / 2,
994+
offset / 2);
995+
result *= 2;
996+
} else {
997+
result = SearchString(
998+
reinterpret_cast<const uint8_t*>(haystack),
999+
haystack_length,
1000+
reinterpret_cast<const uint8_t*>(needle),
1001+
needle_length,
1002+
offset);
1003+
}
1004+
1005+
args.GetReturnValue().Set(
1006+
result == haystack_length ? -1 : static_cast<int>(result));
1007+
}
9381008

9391009
void IndexOfNumber(const FunctionCallbackInfo<Value>& args) {
9401010
ASSERT(args[1]->IsNumber());
@@ -944,25 +1014,25 @@ void IndexOfNumber(const FunctionCallbackInfo<Value>& args) {
9441014
SPREAD_ARG(args[0], ts_obj);
9451015

9461016
uint32_t needle = args[1]->Uint32Value();
947-
int32_t offset_i32 = args[2]->Int32Value();
948-
uint32_t offset;
1017+
int64_t offset_i64 = args[2]->IntegerValue();
1018+
size_t offset;
9491019

950-
if (offset_i32 < 0) {
951-
if (offset_i32 + static_cast<int32_t>(ts_obj_length) < 0)
1020+
if (offset_i64 < 0) {
1021+
if (offset_i64 + static_cast<int64_t>(ts_obj_length) < 0)
9521022
offset = 0;
9531023
else
954-
offset = static_cast<uint32_t>(ts_obj_length + offset_i32);
1024+
offset = static_cast<size_t>(ts_obj_length + offset_i64);
9551025
} else {
956-
offset = static_cast<uint32_t>(offset_i32);
1026+
offset = static_cast<size_t>(offset_i64);
9571027
}
9581028

9591029
if (ts_obj_length == 0 || offset + 1 > ts_obj_length)
9601030
return args.GetReturnValue().Set(-1);
9611031

9621032
void* ptr = memchr(ts_obj_data + offset, needle, ts_obj_length - offset);
9631033
char* ptr_char = static_cast<char*>(ptr);
964-
args.GetReturnValue().Set(
965-
ptr ? static_cast<int32_t>(ptr_char - ts_obj_data) : -1);
1034+
args.GetReturnValue().Set(ptr ? static_cast<int>(ptr_char - ts_obj_data)
1035+
: -1);
9661036
}
9671037

9681038

src/string_search.cc

+10
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
#include "string_search.h"
2+
3+
namespace node {
4+
namespace stringsearch {
5+
6+
// Out-of-line definitions of the static scratch arrays declared on
// StringSearchBase. They have static storage duration, so they start out
// zero-initialised and a single copy is shared by the whole process.
// NOTE(review): presumably the shift/suffix tables of the Boyer-Moore family
// searchers in string_search.h - confirm there, and confirm that sharing one
// copy is safe if searches can ever run on more than one thread.
int StringSearchBase::kBadCharShiftTable[kUC16AlphabetSize];
7+
int StringSearchBase::kGoodSuffixShiftTable[kBMMaxShift + 1];
8+
int StringSearchBase::kSuffixTable[kBMMaxShift + 1];
9+
}  // namespace stringsearch
10+
}  // namespace node

0 commit comments

Comments
 (0)