Skip to content

Commit 3b56f2a

Browse files
committed
Merge remote-tracking branch 'origin/master' into 8223
# Conflicts:
#   sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
2 parents 5189690 + b285ac5 commit 3b56f2a

File tree

7 files changed

+151
-54
lines changed

7 files changed

+151
-54
lines changed

mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala

Lines changed: 63 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -214,12 +214,13 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
214214
/*
215215
Using the following R code to load the data and train the model using glmnet package.
216216
217-
> library("glmnet")
218-
> data <- read.csv("path", header=FALSE)
219-
> label = factor(data$V1)
220-
> features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
221-
> weights = coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 0))
222-
> weights
217+
library("glmnet")
218+
data <- read.csv("path", header=FALSE)
219+
label = factor(data$V1)
220+
features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
221+
weights = coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 0))
222+
weights
223+
223224
5 x 1 sparse Matrix of class "dgCMatrix"
224225
s0
225226
(Intercept) 2.8366423
@@ -245,13 +246,14 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
245246
/*
246247
Using the following R code to load the data and train the model using glmnet package.
247248
248-
> library("glmnet")
249-
> data <- read.csv("path", header=FALSE)
250-
> label = factor(data$V1)
251-
> features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
252-
> weights =
249+
library("glmnet")
250+
data <- read.csv("path", header=FALSE)
251+
label = factor(data$V1)
252+
features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
253+
weights =
253254
coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 0, intercept=FALSE))
254-
> weights
255+
weights
256+
255257
5 x 1 sparse Matrix of class "dgCMatrix"
256258
s0
257259
(Intercept) .
@@ -278,12 +280,13 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
278280
/*
279281
Using the following R code to load the data and train the model using glmnet package.
280282
281-
> library("glmnet")
282-
> data <- read.csv("path", header=FALSE)
283-
> label = factor(data$V1)
284-
> features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
285-
> weights = coef(glmnet(features,label, family="binomial", alpha = 1, lambda = 0.12))
286-
> weights
283+
library("glmnet")
284+
data <- read.csv("path", header=FALSE)
285+
label = factor(data$V1)
286+
features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
287+
weights = coef(glmnet(features,label, family="binomial", alpha = 1, lambda = 0.12))
288+
weights
289+
287290
5 x 1 sparse Matrix of class "dgCMatrix"
288291
s0
289292
(Intercept) -0.05627428
@@ -310,13 +313,14 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
310313
/*
311314
Using the following R code to load the data and train the model using glmnet package.
312315
313-
> library("glmnet")
314-
> data <- read.csv("path", header=FALSE)
315-
> label = factor(data$V1)
316-
> features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
317-
> weights = coef(glmnet(features,label, family="binomial", alpha = 1, lambda = 0.12,
316+
library("glmnet")
317+
data <- read.csv("path", header=FALSE)
318+
label = factor(data$V1)
319+
features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
320+
weights = coef(glmnet(features,label, family="binomial", alpha = 1, lambda = 0.12,
318321
intercept=FALSE))
319-
> weights
322+
weights
323+
320324
5 x 1 sparse Matrix of class "dgCMatrix"
321325
s0
322326
(Intercept) .
@@ -343,12 +347,13 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
343347
/*
344348
Using the following R code to load the data and train the model using glmnet package.
345349
346-
> library("glmnet")
347-
> data <- read.csv("path", header=FALSE)
348-
> label = factor(data$V1)
349-
> features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
350-
> weights = coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 1.37))
351-
> weights
350+
library("glmnet")
351+
data <- read.csv("path", header=FALSE)
352+
label = factor(data$V1)
353+
features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
354+
weights = coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 1.37))
355+
weights
356+
352357
5 x 1 sparse Matrix of class "dgCMatrix"
353358
s0
354359
(Intercept) 0.15021751
@@ -375,13 +380,14 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
375380
/*
376381
Using the following R code to load the data and train the model using glmnet package.
377382
378-
> library("glmnet")
379-
> data <- read.csv("path", header=FALSE)
380-
> label = factor(data$V1)
381-
> features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
382-
> weights = coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 1.37,
383+
library("glmnet")
384+
data <- read.csv("path", header=FALSE)
385+
label = factor(data$V1)
386+
features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
387+
weights = coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 1.37,
383388
intercept=FALSE))
384-
> weights
389+
weights
390+
385391
5 x 1 sparse Matrix of class "dgCMatrix"
386392
s0
387393
(Intercept) .
@@ -408,12 +414,13 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
408414
/*
409415
Using the following R code to load the data and train the model using glmnet package.
410416
411-
> library("glmnet")
412-
> data <- read.csv("path", header=FALSE)
413-
> label = factor(data$V1)
414-
> features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
415-
> weights = coef(glmnet(features,label, family="binomial", alpha = 0.38, lambda = 0.21))
416-
> weights
417+
library("glmnet")
418+
data <- read.csv("path", header=FALSE)
419+
label = factor(data$V1)
420+
features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
421+
weights = coef(glmnet(features,label, family="binomial", alpha = 0.38, lambda = 0.21))
422+
weights
423+
417424
5 x 1 sparse Matrix of class "dgCMatrix"
418425
s0
419426
(Intercept) 0.57734851
@@ -440,13 +447,14 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
440447
/*
441448
Using the following R code to load the data and train the model using glmnet package.
442449
443-
> library("glmnet")
444-
> data <- read.csv("path", header=FALSE)
445-
> label = factor(data$V1)
446-
> features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
447-
> weights = coef(glmnet(features,label, family="binomial", alpha = 0.38, lambda = 0.21,
450+
library("glmnet")
451+
data <- read.csv("path", header=FALSE)
452+
label = factor(data$V1)
453+
features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
454+
weights = coef(glmnet(features,label, family="binomial", alpha = 0.38, lambda = 0.21,
448455
intercept=FALSE))
449-
> weights
456+
weights
457+
450458
5 x 1 sparse Matrix of class "dgCMatrix"
451459
s0
452460
(Intercept) .
@@ -503,12 +511,13 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
503511
/*
504512
Using the following R code to load the data and train the model using glmnet package.
505513
506-
> library("glmnet")
507-
> data <- read.csv("path", header=FALSE)
508-
> label = factor(data$V1)
509-
> features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
510-
> weights = coef(glmnet(features,label, family="binomial", alpha = 1.0, lambda = 6.0))
511-
> weights
514+
library("glmnet")
515+
data <- read.csv("path", header=FALSE)
516+
label = factor(data$V1)
517+
features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
518+
weights = coef(glmnet(features,label, family="binomial", alpha = 1.0, lambda = 6.0))
519+
weights
520+
512521
5 x 1 sparse Matrix of class "dgCMatrix"
513522
s0
514523
(Intercept) -0.2480643

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,7 @@ object FunctionRegistry {
159159
expression[Substring]("substr"),
160160
expression[Substring]("substring"),
161161
expression[Upper]("ucase"),
162+
expression[UnHex]("unhex"),
162163
expression[Upper]("upper")
163164
)
164165

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -453,6 +453,58 @@ case class ShiftRight(left: Expression, right: Expression) extends BinaryExpress
453453
override def toString: String = s"ShiftRight($left, $right)"
454454
}
455455

456+
/**
457+
* Performs the inverse operation of HEX.
458+
* Resulting characters are returned as a byte array.
459+
*/
460+
case class UnHex(child: Expression) extends UnaryExpression with Serializable {
461+
462+
override def dataType: DataType = BinaryType
463+
464+
override def checkInputDataTypes(): TypeCheckResult = {
465+
if (child.dataType.isInstanceOf[StringType] || child.dataType == NullType) {
466+
TypeCheckResult.TypeCheckSuccess
467+
} else {
468+
TypeCheckResult.TypeCheckFailure(s"unHex accepts String type, not ${child.dataType}")
469+
}
470+
}
471+
472+
override def eval(input: InternalRow): Any = {
473+
val num = child.eval(input)
474+
if (num == null) {
475+
null
476+
} else {
477+
unhex(num.asInstanceOf[UTF8String].getBytes)
478+
}
479+
}
480+
481+
private val unhexDigits = {
482+
val array = Array.fill[Byte](128)(-1)
483+
(0 to 9).foreach(i => array('0' + i) = i.toByte)
484+
(0 to 5).foreach(i => array('A' + i) = (i + 10).toByte)
485+
(0 to 5).foreach(i => array('a' + i) = (i + 10).toByte)
486+
array
487+
}
488+
489+
private def unhex(inputBytes: Array[Byte]): Array[Byte] = {
490+
var bytes = inputBytes
491+
if ((bytes.length & 0x01) != 0) {
492+
bytes = '0'.toByte +: bytes
493+
}
494+
val out = new Array[Byte](bytes.length >> 1)
495+
// two characters form the hex value.
496+
var i = 0
497+
while (i < bytes.length) {
498+
val first = unhexDigits(bytes(i))
499+
val second = unhexDigits(bytes(i + 1))
500+
if (first == -1 || second == -1) { return null}
501+
out(i / 2) = (((first << 4) | second) & 0xFF).toByte
502+
i += 2
503+
}
504+
out
505+
}
506+
}
507+
456508
case class Hypot(left: Expression, right: Expression)
457509
extends BinaryMathExpression(math.hypot, "HYPOT")
458510

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -264,6 +264,12 @@ class MathFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
264264
// scalastyle:on
265265
}
266266

267+
test("unhex") {
268+
checkEvaluation(UnHex(Literal("737472696E67")), "string".getBytes)
269+
checkEvaluation(UnHex(Literal("")), new Array[Byte](0))
270+
checkEvaluation(UnHex(Literal("0")), Array[Byte](0))
271+
}
272+
267273
test("hypot") {
268274
testBinary(Hypot, math.hypot)
269275
}

sql/core/src/main/scala/org/apache/spark/sql/functions.scala

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1053,6 +1053,24 @@ object functions {
10531053
*/
10541054
def hex(colName: String): Column = hex(Column(colName))
10551055

1056+
/**
1057+
* Inverse of hex. Interprets each pair of characters as a hexadecimal number
1058+
* and converts to the byte representation of number.
1059+
*
1060+
* @group math_funcs
1061+
* @since 1.5.0
1062+
*/
1063+
def unhex(column: Column): Column = UnHex(column.expr)
1064+
1065+
/**
1066+
* Inverse of hex. Interprets each pair of characters as a hexadecimal number
1067+
* and converts to the byte representation of number.
1068+
*
1069+
* @group math_funcs
1070+
* @since 1.5.0
1071+
*/
1072+
def unhex(colName: String): Column = unhex(Column(colName))
1073+
10561074
/**
10571075
* Computes `sqrt(a^2^ + b^2^)` without intermediate overflow or underflow.
10581076
*

sql/core/src/test/scala/org/apache/spark/sql/MathExpressionsSuite.scala

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -225,6 +225,16 @@ class MathExpressionsSuite extends QueryTest {
225225
checkAnswer(data.selectExpr("hex(cast(d as binary))"), Seq(Row("68656C6C6F")))
226226
}
227227

228+
test("unhex") {
229+
val data = Seq(("1C", "737472696E67")).toDF("a", "b")
230+
checkAnswer(data.select(unhex('a)), Row(Array[Byte](28.toByte)))
231+
checkAnswer(data.select(unhex('b)), Row("string".getBytes))
232+
checkAnswer(data.selectExpr("unhex(a)"), Row(Array[Byte](28.toByte)))
233+
checkAnswer(data.selectExpr("unhex(b)"), Row("string".getBytes))
234+
checkAnswer(data.selectExpr("""unhex("##")"""), Row(null))
235+
checkAnswer(data.selectExpr("""unhex("G123")"""), Row(null))
236+
}
237+
228238
test("hypot") {
229239
testTwoToOneMathFunction(hypot, hypot, math.hypot)
230240
}

sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -949,6 +949,7 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
949949
"udf_trim",
950950
"udf_ucase",
951951
"udf_unix_timestamp",
952+
"udf_unhex",
952953
"udf_upper",
953954
"udf_var_pop",
954955
"udf_var_samp",

0 commit comments

Comments (0)