Skip to content

Commit 8279206

Browse files
authored
YQL-18027: Add block implementation for String::Contains (#2772)
1 parent a423b94 commit 8279206

File tree

7 files changed

+74
-10
lines changed

7 files changed

+74
-10
lines changed

ydb/library/yql/public/udf/arrow/block_item.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,11 @@ class TBlockItem {
2626
template <typename T, typename = std::enable_if_t<TPrimitiveDataType<T>::Result>>
2727
inline explicit TBlockItem(T value);
2828

29+
// Builds a block item from a boolean: the flag is normalized to 1 (true) or
// 0 (false) before being written to the bool_ payload, and the item is tagged
// with the Present marker.
inline explicit TBlockItem(bool value) {
30+
Raw.Simple.bool_ = value ? 1 : 0;
31+
Raw.Simple.Meta = static_cast<ui8>(EMarkers::Present);
32+
}
33+
2934
inline explicit TBlockItem(TStringRef value) {
3035
Raw.String.Value = value.Data();
3136
Raw.String.Size = value.Size();
@@ -109,6 +114,10 @@ class TBlockItem {
109114
#define FIELD(type) type type##_;
110115
PRIMITIVE_VALUE_TYPES(FIELD);
111116
#undef FIELD
117+
// According to the YQL <-> arrow type mapping convention,
118+
// boolean values are processed as 8-bit unsigned integers
119+
// with either 0 or 1 as a condition payload.
120+
ui8 bool_;
112121
ui64 Count;
113122
};
114123
union {
@@ -167,6 +176,9 @@ UDF_ASSERT_TYPE_SIZE(TBlockItem, 16);
167176
PRIMITIVE_VALUE_TYPES(VALUE_AS)
168177
PRIMITIVE_VALUE_TYPES(VALUE_GET)
169178
PRIMITIVE_VALUE_TYPES(VALUE_CONSTR)
179+
// XXX: TBlockItem constructor with <bool> parameter is implemented above.
180+
VALUE_AS(bool)
181+
VALUE_GET(bool)
170182

171183
#undef VALUE_AS
172184
#undef VALUE_GET

ydb/library/yql/tests/sql/dq_file/part14/canondata/result.json

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -379,9 +379,9 @@
379379
],
380380
"test.test[ansi_idents-escaped_udf_name-default.txt-Debug]": [
381381
{
382-
"checksum": "21a4fe7fb93bc82cd8f67fed0ffad56e",
383-
"size": 1780,
384-
"uri": "https://{canondata_backend}/1936997/ad7538cf8edf8e81865f7eee42c2de851daf1211/resource.tar.gz#test.test_ansi_idents-escaped_udf_name-default.txt-Debug_/opt.yql_patched"
382+
"checksum": "131c2b624f1fb925a389a58ee3d1ee12",
383+
"size": 1793,
384+
"uri": "https://{canondata_backend}/1936842/557f7ab03608bf231a6bd2276c94b8a7ee4523b0/resource.tar.gz#test.test_ansi_idents-escaped_udf_name-default.txt-Debug_/opt.yql_patched"
385385
}
386386
],
387387
"test.test[ansi_idents-escaped_udf_name-default.txt-Plan]": [

ydb/library/yql/tests/sql/hybrid_file/part7/canondata/result.json

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -449,9 +449,9 @@
449449
],
450450
"test.test[ansi_idents-escaped_udf_name-default.txt-Debug]": [
451451
{
452-
"checksum": "4c8e1060cd6d4c7c0166346a5eb44739",
453-
"size": 3018,
454-
"uri": "https://{canondata_backend}/1936842/15d1b251a19a947bc78bcd914d26903ce91d665f/resource.tar.gz#test.test_ansi_idents-escaped_udf_name-default.txt-Debug_/opt.yql_patched"
452+
"checksum": "b217896dea1c1d1811b86d0ec696f868",
453+
"size": 3031,
454+
"uri": "https://{canondata_backend}/1880306/c1aa0466ddf0d02543125e7a8f308326758dc098/resource.tar.gz#test.test_ansi_idents-escaped_udf_name-default.txt-Debug_/opt.yql_patched"
455455
}
456456
],
457457
"test.test[ansi_idents-escaped_udf_name-default.txt-Plan]": [

ydb/library/yql/tests/sql/yt_native_file/part14/canondata/result.json

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -340,9 +340,9 @@
340340
],
341341
"test.test[ansi_idents-escaped_udf_name-default.txt-Debug]": [
342342
{
343-
"checksum": "4988877c69725bebc3eb77a48625f5cd",
344-
"size": 2011,
345-
"uri": "https://{canondata_backend}/1923547/5154c8bd8ef9ead4f609771f831f20c15e795571/resource.tar.gz#test.test_ansi_idents-escaped_udf_name-default.txt-Debug_/opt.yql"
343+
"checksum": "bba9b3784ee486de1c6c60c024b7ac96",
344+
"size": 2024,
345+
"uri": "https://{canondata_backend}/1942525/2596beba419499583afc5dd08cdd9d8cff9ec93b/resource.tar.gz#test.test_ansi_idents-escaped_udf_name-default.txt-Debug_/opt.yql"
346346
}
347347
],
348348
"test.test[ansi_idents-escaped_udf_name-default.txt-Plan]": [

ydb/library/yql/udfs/common/string/string_udf.cpp

Lines changed: 40 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -174,19 +174,27 @@ namespace {
174174
XX(AsciiToUpper, to_upper) \
175175
XX(AsciiToTitle, to_title)
176176

177+
// NOTE: The functions below are marked as deprecated, so block implementation
178+
// is not required for them. Hence, STROKA_FIND_UDF provides only the scalar
179+
// one at the moment.
177180
#define STROKA_FIND_UDF_MAP(XX) \
178-
XX(Contains, Contains) \
179181
XX(StartsWith, StartsWith) \
180182
XX(EndsWith, EndsWith) \
181183
XX(HasPrefix, StartsWith) \
182184
XX(HasSuffix, EndsWith)
183185

186+
// NOTE: The functions below are marked as deprecated, so block implementation
187+
// is not required for them. Hence, STRING_TWO_ARGS_UDF provides only the
188+
// scalar one at the moment.
184189
#define STRING_TWO_ARGS_UDF_MAP(XX) \
185190
XX(StartsWithIgnoreCase, AsciiHasPrefixIgnoreCase) \
186191
XX(EndsWithIgnoreCase, AsciiHasSuffixIgnoreCase) \
187192
XX(HasPrefixIgnoreCase, AsciiHasPrefixIgnoreCase) \
188193
XX(HasSuffixIgnoreCase, AsciiHasSuffixIgnoreCase)
189194

195+
// NOTE: The functions below are marked as deprecated, so block implementation
196+
// is not required for them. Hence, STROKA_UDF provides only the scalar one at
197+
// the moment.
190198
#define STROKA_UDF_MAP(XX) \
191199
XX(Reverse, ReverseInPlace)
192200

@@ -207,6 +215,30 @@ namespace {
207215
return valueBuilder->NewString(input);
208216
}
209217

218+
// Scalar body of the TContains UDF: returns whether the second argument
// (needle) occurs inside the first one (haystack).
// The haystack is declared optional; an empty optional yields `false`
// instead of an empty result.
BEGIN_SIMPLE_STRICT_ARROW_UDF(TContains, bool(TOptional<char*>, char*)) {
219+
Y_UNUSED(valueBuilder);
220+
if (!args[0])
221+
return TUnboxedValuePod(false);
222+
223+
// NOTE(review): constructing TString here presumably copies both buffers on
// every call; a non-owning view might be sufficient -- confirm.
const TString haystack(args[0].AsStringRef());
224+
const TString needle(args[1].AsStringRef());
225+
return TUnboxedValuePod(haystack.Contains(needle));
226+
}
227+
228+
// Block counterpart of the scalar body above: applies the same check to one
// pair of block items and hands the boolean result to `sink`.
// NOTE(review): TBinaryKernelExec presumably invokes Process once per row
// and supplies `Do` -- confirm against its definition.
struct TContainsKernelExec : public TBinaryKernelExec<TContainsKernelExec> {
229+
template <typename TSink>
230+
static void Process(TBlockItem arg1, TBlockItem arg2, const TSink& sink) {
231+
// Mirrors the scalar path: a missing haystack produces `false`.
if (!arg1)
232+
return sink(TBlockItem(false));
233+
234+
const TString haystack(arg1.AsStringRef());
235+
const TString needle(arg2.AsStringRef());
236+
sink(TBlockItem(haystack.Contains(needle)));
237+
}
238+
};
239+
240+
// Closes the UDF definition opened above, passing TContainsKernelExec::Do as
// the kernel entry point.
END_SIMPLE_ARROW_UDF(TContains, TContainsKernelExec::Do);
241+
210242
SIMPLE_STRICT_UDF(TReplaceAll, char*(TAutoMap<char*>, char*, char*)) {
211243
if (TString result(args[0].AsStringRef()); SubstGlobal(result, args[1].AsStringRef(), args[2].AsStringRef()))
212244
return valueBuilder->NewString(result);
@@ -277,6 +309,8 @@ namespace {
277309
return args[0];
278310
}
279311

312+
// NOTE: String::Find is marked as deprecated, so block implementation is
313+
// not required for it. Hence, only the scalar one is provided.
280314
SIMPLE_STRICT_UDF_WITH_OPTIONAL_ARGS(TFind, i64(TAutoMap<char*>, char*, TOptional<ui64>), 1) {
281315
Y_UNUSED(valueBuilder);
282316
const TString haystack(args[0].AsStringRef());
@@ -285,6 +319,9 @@ namespace {
285319
return TUnboxedValuePod(haystack.find(needle, pos));
286320
}
287321

322+
// NOTE: String::ReverseFind is marked as deprecated, so block
323+
// implementation is not required for it. Hence, only the scalar one is
324+
// provided.
288325
SIMPLE_STRICT_UDF_WITH_OPTIONAL_ARGS(TReverseFind, i64(TAutoMap<char*>, char*, TOptional<ui64>), 1) {
289326
Y_UNUSED(valueBuilder);
290327
const TString haystack(args[0].AsStringRef());
@@ -293,6 +330,8 @@ namespace {
293330
return TUnboxedValuePod(haystack.rfind(needle, pos));
294331
}
295332

333+
// NOTE: String::Substring is marked as deprecated, so block implementation
334+
// is not required for it. Hence, only the scalar one is provided.
296335
SIMPLE_STRICT_UDF_WITH_OPTIONAL_ARGS(TSubstring, char*(TAutoMap<char*>, TOptional<ui64>, TOptional<ui64>), 1) {
297336
const TString input(args[0].AsStringRef());
298337
const ui64 from = args[1].GetOrDefault<ui64>(0);

ydb/library/yql/udfs/common/string/test/canondata/test.test_BlockFind_/results.txt

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,13 @@
1414
"String"
1515
]
1616
];
17+
[
18+
"contains";
19+
[
20+
"DataType";
21+
"Bool"
22+
]
23+
];
1724
[
1825
"levenstein";
1926
[
@@ -27,22 +34,27 @@
2734
"Data" = [
2835
[
2936
"fdsa";
37+
%false;
3038
"3"
3139
];
3240
[
3341
"aswedfg";
42+
%true;
3443
"5"
3544
];
3645
[
3746
"asdadsaasd";
47+
%true;
3848
"8"
3949
];
4050
[
4151
"gdsfsassas";
52+
%true;
4253
"8"
4354
];
4455
[
4556
"";
57+
%false;
4658
"2"
4759
]
4860
]

ydb/library/yql/udfs/common/string/test/cases/BlockFind.sql

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,5 +2,6 @@
22
pragma UseBlocks;
33
SELECT
44
value,
5+
String::Contains(value, "as") AS contains,
56
String::LevensteinDistance(value, "as") AS levenstein
67
FROM Input;

0 commit comments

Comments
 (0)