From 4c153b237fcfe86110bc75e5ba1b49e5f9460861 Mon Sep 17 00:00:00 2001 From: mswapnilG <95067368+mswapnilG@users.noreply.github.com> Date: Mon, 10 Jul 2023 20:12:57 +0530 Subject: [PATCH] Fix edge cases in JavaScript regexp_extract and regexp_extract_all UDFs (#369) * Fix edge cases in JavaScript regexp_extract and regexp_extract_all UDFs * Fix edge cases in JavaScript regexp_extract and regexp_extract_all UDFs --------- Co-authored-by: Daniel De Leo --- udfs/community/README.md | 16 ++++++- udfs/community/cw_regexp_extract.sqlx | 13 ++---- udfs/community/cw_regexp_extract_all.sqlx | 9 ++-- udfs/community/test_cases.js | 56 +++++++++++++++++++++++ 4 files changed, 81 insertions(+), 13 deletions(-) diff --git a/udfs/community/README.md b/udfs/community/README.md index 9c88d633e..c081934d6 100644 --- a/udfs/community/README.md +++ b/udfs/community/README.md @@ -706,19 +706,31 @@ sg ``` ### [cw_regexp_extract(str STRING, regexp STRING)](cw_regexp_extract.sqlx) -Returns the first substring matched by the regular expression regexp in str. +Extracts the first substring matched by the regular expression regexp in str, returns null if the regex doesn't have a match or either str or regexp is null. ```sql SELECT bqutil.fn.cw_regexp_extract('TestStr123456#?%&', 'Str'); +SELECT bqutil.fn.cw_regexp_extract('TestStr123456#?%&', 'StrX'); +SELECT bqutil.fn.cw_regexp_extract(NULL, 'StrX'); +SELECT bqutil.fn.cw_regexp_extract('TestStr123456#?%&', NULL); Str +NULL +NULL +NULL ``` ### [cw_regexp_extract_all(str STRING, regexp STRING)](cw_regexp_extract_all.sqlx) -Returns the substring(s) matched by the regular expression regexp in str. +Returns the substring(s) matched by the regular expression regexp in str, returns null if the regex doesn't have a match or either str or regexp is null. 
```sql SELECT bqutil.fn.cw_regexp_extract_all('TestStr123456', 'Str.*'); +SELECT bqutil.fn.cw_regexp_extract_all('TestStr123456', 'StrX.*'); +SELECT bqutil.fn.cw_regexp_extract_all(NULL, 'Str.*'); +SELECT bqutil.fn.cw_regexp_extract_all('TestStr123456', NULL); [Str123456] +NULL +NULL +NULL ``` ### [cw_regexp_extract_all_n(str STRING, regexp STRING, groupn INT64)](cw_regexp_extract_all_n.sqlx) diff --git a/udfs/community/cw_regexp_extract.sqlx b/udfs/community/cw_regexp_extract.sqlx index f26b3f05e..88455f491 100644 --- a/udfs/community/cw_regexp_extract.sqlx +++ b/udfs/community/cw_regexp_extract.sqlx @@ -15,14 +15,11 @@ config { hasOutput: true } * limitations under the License. */ -/* Returns the first substring matched by the regular expression `regexp` in `str`. */ +/* Extracts the first substring matched by the regular expression `regexp` in `str`, returns null if the regex doesn't have a match or either str or regexp is null. */ CREATE OR REPLACE FUNCTION ${self()}(str STRING, regexp STRING) RETURNS STRING -LANGUAGE js OPTIONS ( - description="Returns the first substring matched by the regular expression `regexp` in `str`." + description="""Extracts the first substring matched by the regular expression `regexp` in `str`, returns null if the regex doesn't have a match or either str or regexp is null.""" ) -AS """ - var r = new RegExp(regexp); - var a = str.match(r); - return a[0]; -"""; +AS ( + ${ref("cw_regexp_extract_all")}(str, regexp)[SAFE_OFFSET(0)] +); diff --git a/udfs/community/cw_regexp_extract_all.sqlx b/udfs/community/cw_regexp_extract_all.sqlx index 68f7c153e..979f68885 100644 --- a/udfs/community/cw_regexp_extract_all.sqlx +++ b/udfs/community/cw_regexp_extract_all.sqlx @@ -15,13 +15,16 @@ config { hasOutput: true } * limitations under the License. */ -/* Returns the substring(s) matched by the regular expression `regexp` in `str`. 
*/ +/* Returns the substring(s) matched by the regular expression `regexp` in `str`, returns null if the regex doesn't have a match or either str or regexp is null. */ CREATE OR REPLACE FUNCTION ${self()}(str STRING, regexp STRING) RETURNS ARRAY<STRING> -LANGUAGE js +LANGUAGE js OPTIONS ( - description="Returns the substring(s) matched by the regular expression `regexp` in `str`." + description="Returns the substring(s) matched by the regular expression `regexp` in `str`, returns null if the regex doesn't have a match or either str or regexp is null." ) AS """ + if (str == null || regexp == null) { + return null; + } var r = new RegExp(regexp, "g"); return str.match(r); """; diff --git a/udfs/community/test_cases.js b/udfs/community/test_cases.js index afc44be40..22e7bb411 100644 --- a/udfs/community/test_cases.js +++ b/udfs/community/test_cases.js @@ -1922,6 +1922,34 @@ generate_udf_test("cw_regexp_extract", [ ], expected_output: `"Str"` }, + { + inputs: [ + `"TestStr123456#?%&"`, + `"StrX"` + ], + expected_output: `NULL` + }, + { + inputs: [ + `"TestStr123456#?%&"`, + `NULL` + ], + expected_output: `NULL` + }, + { + inputs: [ + `NULL`, + `"StrX"` + ], + expected_output: `NULL` + }, + { + inputs: [ + `NULL`, + `NULL` + ], + expected_output: `NULL` + }, ]); generate_udf_test("cw_regexp_extract_n", [ { @@ -1941,6 +1969,34 @@ generate_udf_test("cw_regexp_extract_all", [ ], expected_output: `CAST(["Str123456"] AS ARRAY<STRING>)` }, + { + inputs: [ + `"TestStr123456"`, + `"StrX"` + ], + expected_output: `NULL` + }, + { + inputs: [ + `"TestStr123456"`, + `NULL` + ], + expected_output: `NULL` + }, + { + inputs: [ + `NULL`, + `"StrX"` + ], + expected_output: `NULL` + }, + { + inputs: [ + `NULL`, + `NULL` + ], + expected_output: `NULL` + }, ]); generate_udf_test("cw_regexp_extract_all_n", [ {