Skip to content

Commit 20efb79

Browse files
committed
[SPARK-16324][SQL] regexp_extract should doc that it returns empty string when match fails
## What changes were proposed in this pull request?

Document that `regexp_extract` returns an empty string when the regex or the specified group does not match.

## How was this patch tested?

Jenkins tests, with a few new test cases.

Author: Sean Owen <sowen@cloudera.com>

Closes #14525 from srowen/SPARK-16324.

(cherry picked from commit 0578ff9)
Signed-off-by: Sean Owen <sowen@cloudera.com>
1 parent 2285de7 commit 20efb79

File tree

3 files changed

+11
-2
lines changed

3 files changed

+11
-2
lines changed

python/pyspark/sql/functions.py

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1440,11 +1440,15 @@ def split(str, pattern):
14401440
@ignore_unicode_prefix
14411441
@since(1.5)
14421442
def regexp_extract(str, pattern, idx):
1443-
"""Extract a specific(idx) group identified by a java regex, from the specified string column.
1443+
"""Extract a specific group matched by a Java regex, from the specified string column.
1444+
If the regex did not match, or the specified group did not match, an empty string is returned.
14441445
14451446
>>> df = spark.createDataFrame([('100-200',)], ['str'])
14461447
>>> df.select(regexp_extract('str', '(\d+)-(\d+)', 1).alias('d')).collect()
14471448
[Row(d=u'100')]
1449+
>>> df = spark.createDataFrame([('foo',)], ['str'])
1450+
>>> df.select(regexp_extract('str', '(\d+)', 1).alias('d')).collect()
1451+
[Row(d=u'')]
14481452
>>> df = spark.createDataFrame([('aaaac',)], ['str'])
14491453
>>> df.select(regexp_extract('str', '(a+)(b)?(c)', 2).alias('d')).collect()
14501454
[Row(d=u'')]

sql/core/src/main/scala/org/apache/spark/sql/functions.scala

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2175,7 +2175,8 @@ object functions {
21752175
def ltrim(e: Column): Column = withExpr {StringTrimLeft(e.expr) }
21762176

21772177
/**
2178-
* Extract a specific(idx) group identified by a java regex, from the specified string column.
2178+
* Extract a specific group matched by a Java regex, from the specified string column.
2179+
* If the regex did not match, or the specified group did not match, an empty string is returned.
21792180
*
21802181
* @group string_funcs
21812182
* @since 1.5.0

sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -94,6 +94,10 @@ class StringFunctionsSuite extends QueryTest with SharedSQLContext {
9494

9595
test("non-matching optional group") {
9696
val df = Seq(Tuple1("aaaac")).toDF("s")
97+
checkAnswer(
98+
df.select(regexp_extract($"s", "(foo)", 1)),
99+
Row("")
100+
)
97101
checkAnswer(
98102
df.select(regexp_extract($"s", "(a+)(b)?(c)", 2)),
99103
Row("")

0 commit comments

Comments
 (0)