Skip to content

Commit 3b56f2a

Browse files
committed
Merge remote-tracking branch 'origin/master' into 8223
# Conflicts:
#   sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
2 parents 5189690 + b285ac5 commit 3b56f2a

File tree

7 files changed

+151
-54
lines changed

7 files changed

+151
-54
lines changed

mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala

Lines changed: 63 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -214,12 +214,13 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
214214
/*
215215
Using the following R code to load the data and train the model using glmnet package.
216216
217-
> library("glmnet")
218-
> data <- read.csv("path", header=FALSE)
219-
> label = factor(data$V1)
220-
> features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
221-
> weights = coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 0))
222-
> weights
217+
library("glmnet")
218+
data <- read.csv("path", header=FALSE)
219+
label = factor(data$V1)
220+
features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
221+
weights = coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 0))
222+
weights
223+
223224
5 x 1 sparse Matrix of class "dgCMatrix"
224225
s0
225226
(Intercept) 2.8366423
@@ -245,13 +246,14 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
245246
/*
246247
Using the following R code to load the data and train the model using glmnet package.
247248
248-
> library("glmnet")
249-
> data <- read.csv("path", header=FALSE)
250-
> label = factor(data$V1)
251-
> features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
252-
> weights =
249+
library("glmnet")
250+
data <- read.csv("path", header=FALSE)
251+
label = factor(data$V1)
252+
features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
253+
weights =
253254
coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 0, intercept=FALSE))
254-
> weights
255+
weights
256+
255257
5 x 1 sparse Matrix of class "dgCMatrix"
256258
s0
257259
(Intercept) .
@@ -278,12 +280,13 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
278280
/*
279281
Using the following R code to load the data and train the model using glmnet package.
280282
281-
> library("glmnet")
282-
> data <- read.csv("path", header=FALSE)
283-
> label = factor(data$V1)
284-
> features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
285-
> weights = coef(glmnet(features,label, family="binomial", alpha = 1, lambda = 0.12))
286-
> weights
283+
library("glmnet")
284+
data <- read.csv("path", header=FALSE)
285+
label = factor(data$V1)
286+
features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
287+
weights = coef(glmnet(features,label, family="binomial", alpha = 1, lambda = 0.12))
288+
weights
289+
287290
5 x 1 sparse Matrix of class "dgCMatrix"
288291
s0
289292
(Intercept) -0.05627428
@@ -310,13 +313,14 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
310313
/*
311314
Using the following R code to load the data and train the model using glmnet package.
312315
313-
> library("glmnet")
314-
> data <- read.csv("path", header=FALSE)
315-
> label = factor(data$V1)
316-
> features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
317-
> weights = coef(glmnet(features,label, family="binomial", alpha = 1, lambda = 0.12,
316+
library("glmnet")
317+
data <- read.csv("path", header=FALSE)
318+
label = factor(data$V1)
319+
features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
320+
weights = coef(glmnet(features,label, family="binomial", alpha = 1, lambda = 0.12,
318321
intercept=FALSE))
319-
> weights
322+
weights
323+
320324
5 x 1 sparse Matrix of class "dgCMatrix"
321325
s0
322326
(Intercept) .
@@ -343,12 +347,13 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
343347
/*
344348
Using the following R code to load the data and train the model using glmnet package.
345349
346-
> library("glmnet")
347-
> data <- read.csv("path", header=FALSE)
348-
> label = factor(data$V1)
349-
> features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
350-
> weights = coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 1.37))
351-
> weights
350+
library("glmnet")
351+
data <- read.csv("path", header=FALSE)
352+
label = factor(data$V1)
353+
features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
354+
weights = coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 1.37))
355+
weights
356+
352357
5 x 1 sparse Matrix of class "dgCMatrix"
353358
s0
354359
(Intercept) 0.15021751
@@ -375,13 +380,14 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
375380
/*
376381
Using the following R code to load the data and train the model using glmnet package.
377382
378-
> library("glmnet")
379-
> data <- read.csv("path", header=FALSE)
380-
> label = factor(data$V1)
381-
> features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
382-
> weights = coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 1.37,
383+
library("glmnet")
384+
data <- read.csv("path", header=FALSE)
385+
label = factor(data$V1)
386+
features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
387+
weights = coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 1.37,
383388
intercept=FALSE))
384-
> weights
389+
weights
390+
385391
5 x 1 sparse Matrix of class "dgCMatrix"
386392
s0
387393
(Intercept) .
@@ -408,12 +414,13 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
408414
/*
409415
Using the following R code to load the data and train the model using glmnet package.
410416
411-
> library("glmnet")
412-
> data <- read.csv("path", header=FALSE)
413-
> label = factor(data$V1)
414-
> features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
415-
> weights = coef(glmnet(features,label, family="binomial", alpha = 0.38, lambda = 0.21))
416-
> weights
417+
library("glmnet")
418+
data <- read.csv("path", header=FALSE)
419+
label = factor(data$V1)
420+
features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
421+
weights = coef(glmnet(features,label, family="binomial", alpha = 0.38, lambda = 0.21))
422+
weights
423+
417424
5 x 1 sparse Matrix of class "dgCMatrix"
418425
s0
419426
(Intercept) 0.57734851
@@ -440,13 +447,14 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
440447
/*
441448
Using the following R code to load the data and train the model using glmnet package.
442449
443-
> library("glmnet")
444-
> data <- read.csv("path", header=FALSE)
445-
> label = factor(data$V1)
446-
> features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
447-
> weights = coef(glmnet(features,label, family="binomial", alpha = 0.38, lambda = 0.21,
450+
library("glmnet")
451+
data <- read.csv("path", header=FALSE)
452+
label = factor(data$V1)
453+
features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
454+
weights = coef(glmnet(features,label, family="binomial", alpha = 0.38, lambda = 0.21,
448455
intercept=FALSE))
449-
> weights
456+
weights
457+
450458
5 x 1 sparse Matrix of class "dgCMatrix"
451459
s0
452460
(Intercept) .
@@ -503,12 +511,13 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
503511
/*
504512
Using the following R code to load the data and train the model using glmnet package.
505513
506-
> library("glmnet")
507-
> data <- read.csv("path", header=FALSE)
508-
> label = factor(data$V1)
509-
> features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
510-
> weights = coef(glmnet(features,label, family="binomial", alpha = 1.0, lambda = 6.0))
511-
> weights
514+
library("glmnet")
515+
data <- read.csv("path", header=FALSE)
516+
label = factor(data$V1)
517+
features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
518+
weights = coef(glmnet(features,label, family="binomial", alpha = 1.0, lambda = 6.0))
519+
weights
520+
512521
5 x 1 sparse Matrix of class "dgCMatrix"
513522
s0
514523
(Intercept) -0.2480643

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,7 @@ object FunctionRegistry {
159159
expression[Substring]("substr"),
160160
expression[Substring]("substring"),
161161
expression[Upper]("ucase"),
162+
expression[UnHex]("unhex"),
162163
expression[Upper]("upper")
163164
)
164165

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -453,6 +453,58 @@ case class ShiftRight(left: Expression, right: Expression) extends BinaryExpress
453453
override def toString: String = s"ShiftRight($left, $right)"
454454
}
455455

456+
/**
457+
* Performs the inverse operation of HEX.
458+
* Resulting characters are returned as a byte array.
459+
*/
460+
case class UnHex(child: Expression) extends UnaryExpression with Serializable {
461+
462+
override def dataType: DataType = BinaryType
463+
464+
override def checkInputDataTypes(): TypeCheckResult = {
465+
if (child.dataType.isInstanceOf[StringType] || child.dataType == NullType) {
466+
TypeCheckResult.TypeCheckSuccess
467+
} else {
468+
TypeCheckResult.TypeCheckFailure(s"unHex accepts String type, not ${child.dataType}")
469+
}
470+
}
471+
472+
override def eval(input: InternalRow): Any = {
473+
val num = child.eval(input)
474+
if (num == null) {
475+
null
476+
} else {
477+
unhex(num.asInstanceOf[UTF8String].getBytes)
478+
}
479+
}
480+
481+
private val unhexDigits = {
482+
val array = Array.fill[Byte](128)(-1)
483+
(0 to 9).foreach(i => array('0' + i) = i.toByte)
484+
(0 to 5).foreach(i => array('A' + i) = (i + 10).toByte)
485+
(0 to 5).foreach(i => array('a' + i) = (i + 10).toByte)
486+
array
487+
}
488+
489+
private def unhex(inputBytes: Array[Byte]): Array[Byte] = {
490+
var bytes = inputBytes
491+
if ((bytes.length & 0x01) != 0) {
492+
bytes = '0'.toByte +: bytes
493+
}
494+
val out = new Array[Byte](bytes.length >> 1)
495+
// two characters form the hex value.
496+
var i = 0
497+
while (i < bytes.length) {
498+
val first = unhexDigits(bytes(i))
499+
val second = unhexDigits(bytes(i + 1))
500+
if (first == -1 || second == -1) { return null}
501+
out(i / 2) = (((first << 4) | second) & 0xFF).toByte
502+
i += 2
503+
}
504+
out
505+
}
506+
}
507+
456508
case class Hypot(left: Expression, right: Expression)
457509
extends BinaryMathExpression(math.hypot, "HYPOT")
458510

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -264,6 +264,12 @@ class MathFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
264264
// scalastyle:on
265265
}
266266

267+
test("unhex") {
268+
checkEvaluation(UnHex(Literal("737472696E67")), "string".getBytes)
269+
checkEvaluation(UnHex(Literal("")), new Array[Byte](0))
270+
checkEvaluation(UnHex(Literal("0")), Array[Byte](0))
271+
}
272+
267273
test("hypot") {
268274
testBinary(Hypot, math.hypot)
269275
}

sql/core/src/main/scala/org/apache/spark/sql/functions.scala

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1053,6 +1053,24 @@ object functions {
10531053
*/
10541054
def hex(colName: String): Column = hex(Column(colName))
10551055

1056+
/**
1057+
* Inverse of hex. Interprets each pair of characters as a hexadecimal number
1058+
* and converts to the byte representation of number.
1059+
*
1060+
* @group math_funcs
1061+
* @since 1.5.0
1062+
*/
1063+
def unhex(column: Column): Column = UnHex(column.expr)
1064+
1065+
/**
1066+
* Inverse of hex. Interprets each pair of characters as a hexadecimal number
1067+
* and converts to the byte representation of number.
1068+
*
1069+
* @group math_funcs
1070+
* @since 1.5.0
1071+
*/
1072+
def unhex(colName: String): Column = unhex(Column(colName))
1073+
10561074
/**
10571075
* Computes `sqrt(a^2^ + b^2^)` without intermediate overflow or underflow.
10581076
*

sql/core/src/test/scala/org/apache/spark/sql/MathExpressionsSuite.scala

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -225,6 +225,16 @@ class MathExpressionsSuite extends QueryTest {
225225
checkAnswer(data.selectExpr("hex(cast(d as binary))"), Seq(Row("68656C6C6F")))
226226
}
227227

228+
test("unhex") {
229+
val data = Seq(("1C", "737472696E67")).toDF("a", "b")
230+
checkAnswer(data.select(unhex('a)), Row(Array[Byte](28.toByte)))
231+
checkAnswer(data.select(unhex('b)), Row("string".getBytes))
232+
checkAnswer(data.selectExpr("unhex(a)"), Row(Array[Byte](28.toByte)))
233+
checkAnswer(data.selectExpr("unhex(b)"), Row("string".getBytes))
234+
checkAnswer(data.selectExpr("""unhex("##")"""), Row(null))
235+
checkAnswer(data.selectExpr("""unhex("G123")"""), Row(null))
236+
}
237+
228238
test("hypot") {
229239
testTwoToOneMathFunction(hypot, hypot, math.hypot)
230240
}

sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -949,6 +949,7 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
949949
"udf_trim",
950950
"udf_ucase",
951951
"udf_unix_timestamp",
952+
"udf_unhex",
952953
"udf_upper",
953954
"udf_var_pop",
954955
"udf_var_samp",

0 commit comments

Comments (0)