Fix unsigned bigint in group by columns #780

Merged: 5 commits merged on May 31, 2019
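For context, a minimal reproduction sketch of the scenario this PR targets, modeled on the table_group_by_bigint test added at the bottom of this diff (the SparkSession/TiSpark configuration and the object name are assumptions, not part of the PR):

import org.apache.spark.sql.SparkSession

object GroupByUnsignedRepro {
  def main(args: Array[String]): Unit = {
    // Assumes TiSpark is configured via spark-defaults (extensions + PD addresses)
    // and that table_group_by_bigint exists as created in the new test below.
    val spark = SparkSession.builder().appName("unsigned-bigint-group-by").getOrCreate()
    // b is bigint(20) UNSIGNED; its largest values exceed Long.MaxValue, which is
    // why TiConverter now widens unsigned BIGINT to Decimal when building rows.
    spark.sql("select b from table_group_by_bigint group by b").show()
    spark.stop()
  }
}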
16 changes: 14 additions & 2 deletions core/src/main/scala/com/pingcap/tispark/utils/TiConverter.scala
@@ -2,14 +2,15 @@ package com.pingcap.tispark.utils

import java.util.logging.Logger

import com.google.common.primitives.UnsignedLong
import com.pingcap.tikv.exception.TiBatchWriteException
import com.pingcap.tikv.meta.TiColumnInfo
import com.pingcap.tikv.operation.transformer.RowTransformer
import com.pingcap.tikv.types._
import com.pingcap.tispark.TiBatchWrite.TiRow
import org.apache.spark.sql
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.DataTypes
import org.apache.spark.sql.types.{DataTypes, Decimal}

object TiConverter {
type TiDataType = com.pingcap.tikv.types.DataType
@@ -42,7 +43,18 @@ object TiConverter {
val rowArray = new Array[Any](finalTypes.size)

for (i <- 0 until transRow.fieldCount) {
rowArray(i) = transRow.get(i, finalTypes(i))
val colTp = finalTypes(i)
val isBigInt = colTp.getType.equals(MySQLType.TypeLonglong)
val isUnsigned = colTp.isUnsigned
val tmp = transRow.get(i, finalTypes(i))
rowArray(i) = if (isBigInt && isUnsigned) {
tmp match {
case l: java.lang.Long => Decimal.apply(UnsignedLong.fromLongBits(l).bigIntegerValue())
case _ => tmp
}
} else {
tmp
}
}

Row.fromSeq(rowArray)
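A standalone sketch of the conversion introduced above, assuming only Guava's UnsignedLong and Spark's Decimal (the helper and object names are illustrative, not part of the PR):

import com.google.common.primitives.UnsignedLong
import org.apache.spark.sql.types.Decimal

object UnsignedBigIntSketch {
  // An unsigned BIGINT from TiKV arrives as a java.lang.Long carrying the raw
  // 64-bit pattern; reinterpret those bits as unsigned and widen to Decimal so
  // values above Long.MaxValue keep their magnitude in the Spark row.
  def toUnsignedDecimal(bits: Long): Decimal =
    Decimal(UnsignedLong.fromLongBits(bits).bigIntegerValue())

  def main(args: Array[String]): Unit = {
    println(toUnsignedDecimal(-1L))           // 18446744073709551615, i.e. 2^64 - 1
    println(toUnsignedDecimal(Long.MaxValue)) // 9223372036854775807, unchanged
  }
}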
15 changes: 0 additions & 15 deletions core/src/main/scala/com/pingcap/tispark/utils/TiUtil.scala
@@ -21,13 +21,10 @@ import com.pingcap.tikv.TiConfiguration
import com.pingcap.tikv.expression.ExpressionBlacklist
import com.pingcap.tikv.expression.visitor.{MetaResolver, SupportedExpressionValidator}
import com.pingcap.tikv.meta.{TiColumnInfo, TiDAGRequest, TiTableInfo}
import com.pingcap.tikv.operation.transformer.RowTransformer
import com.pingcap.tikv.region.RegionStoreClient.RequestTypes
import com.pingcap.tikv.types._
import com.pingcap.tispark.TiBatchWrite.TiRow
import com.pingcap.tispark.{BasicExpression, TiConfigConst, TiDBRelation}
import org.apache.spark.SparkConf
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.aggregate._
import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Expression, Literal, NamedExpression}
import org.apache.spark.sql.execution.SparkPlan
@@ -110,18 +107,6 @@ object TiUtil {
blacklist: ExpressionBlacklist): Boolean =
isSupportedBasicExpression(expr, source, blacklist) && isPushDownSupported(expr, source)

def toSparkRow(row: TiRow, rowTransformer: RowTransformer): Row = {
val finalTypes = rowTransformer.getTypes.toList
val transRow = rowTransformer.transform(row)
val rowArray = new Array[Any](finalTypes.size)

for (i <- 0 until transRow.fieldCount) {
rowArray(i) = transRow.get(i, finalTypes(i))
}

Row.fromSeq(rowArray)
}

def getSchemaFromTable(table: TiTableInfo): StructType = {
val fields = new Array[StructField](table.getColumns.size())
for (i <- 0 until table.getColumns.size()) {
@@ -28,7 +28,7 @@ import com.pingcap.tikv.{TiConfiguration, TiSession}
import com.pingcap.tispark.TiSessionCache
import com.pingcap.tispark.listener.CacheInvalidateListener
import com.pingcap.tispark.utils.ReflectionUtil.ReflectionMapPartitionWithIndexInternal
import com.pingcap.tispark.utils.TiUtil
import com.pingcap.tispark.utils.{TiConverter, TiUtil}
import gnu.trove.list.array
import gnu.trove.list.array.TLongArrayList
import org.apache.log4j.Logger
@@ -412,17 +412,6 @@ case class RegionTaskExec(child: SparkPlan,
val outputTypes = output.map(_.dataType)
val converters = outputTypes.map(CatalystTypeConverters.createToCatalystConverter)

def toSparkRow(row: TiRow): Row = {
val transRow = rowTransformer.transform(row)
val rowArray = new Array[Any](finalTypes.size)

for (i <- 0 until transRow.fieldCount) {
rowArray(i) = transRow.get(i, finalTypes(i))
}

Row.fromSeq(rowArray)
}

// The result iterator serves as a wrapper around the final result fetched from region tasks
val resultIter = new util.Iterator[UnsafeRow] {
override def hasNext: Boolean = {
@@ -457,7 +446,7 @@ case class RegionTaskExec(child: SparkPlan,
numOutputRows += 1
// Unsafe row projection
project.initialize(index)
val sparkRow = toSparkRow(rowIterator.next())
val sparkRow = TiConverter.toSparkRow(rowIterator.next(), rowTransformer)
// Need to convert spark row to internal row for Catalyst
project(rowToInternalRow(sparkRow, outputTypes, converters))
}
6 changes: 3 additions & 3 deletions core/src/main/scala/org/apache/spark/sql/tispark/TiRDD.scala
@@ -23,8 +23,8 @@ import com.pingcap.tikv.types.DataType
import com.pingcap.tikv.util.RangeSplitter
import com.pingcap.tikv.util.RangeSplitter.RegionTask
import com.pingcap.tispark.listener.CacheInvalidateListener
import com.pingcap.tispark.utils.TiUtil
import com.pingcap.tispark.{TiConfigConst, TiPartition, TiSessionCache, TiTableReference}
import com.pingcap.tispark.utils.TiConverter
import com.pingcap.tispark.{TiPartition, TiSessionCache, TiTableReference}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.{Partition, TaskContext, TaskKilledException}
@@ -78,7 +78,7 @@ class TiRDD(val dagRequest: TiDAGRequest,
iterator.hasNext
}

override def next(): Row = TiUtil.toSparkRow(iterator.next, rowTransformer)
override def next(): Row = TiConverter.toSparkRow(iterator.next, rowTransformer)
}

override protected def getPreferredLocations(split: Partition): Seq[String] =
58 changes: 46 additions & 12 deletions core/src/test/scala/org/apache/spark/sql/IssueTestSuite.scala
@@ -63,7 +63,6 @@ class IssueTestSuite extends BaseTiSparkTest {
tidbStmt.execute(
"insert into t values(1),(2),(3),(4),(null)"
)
refreshConnections()

assert(spark.sql("select * from t limit 10").count() == 5)
assert(spark.sql("select a from t limit 10").count() == 5)
@@ -85,7 +84,6 @@
tidbStmt.execute(
"insert into t values(1,771.64),(2,378.49),(3,920.92),(4,113.97)"
)
refreshConnections()

assert(try {
judge("select a, max(b) from t group by a limit 2")
@@ -133,7 +131,6 @@
tidbStmt.execute(
"insert into single_read values(1, 1, 1, 2, null), (1, 2, 1, 1, null), (2, 1, 3, 2, null), (2, 2, 2, 1, 0)"
)
refreshConnections()

judge("select count(1) from single_read")
judge("select count(c1) from single_read")
@@ -163,7 +160,6 @@
)
tidbStmt.execute("insert into t1 values(1, 201707, 'aa'), (2, 201707, 'aa')")
tidbStmt.execute("insert into t2 values(2, 201707, 'aa')")
refreshConnections()

// Note: Left outer join for DataSet is different from that in mysql.
// The result of DataSet[a, b, c] left outer join DataSet[d, b, c]
@@ -191,7 +187,6 @@
tidbStmt.execute(
"INSERT INTO `tmp_debug` VALUES ('0000-00-00 00:00:00','0000-00-00','0000-00-00 00:00:00')"
)
refreshConnections()
spark.sql("select * from tmp_debug").collect()
}

@@ -231,16 +226,16 @@
tidbStmt.execute("insert into t values(1)")
tidbStmt.execute("insert into t values(2)")
tidbStmt.execute("insert into t values(4)")
refreshConnections() // refresh since we need to load data again
judge("select count(c1) from t")
judge("select count(c1 + 1) from t")
judge("select count(1 + c1) from t")
ti.meta.reloadAllMeta()
runTest("select count(c1) from t")
runTest("select count(c1 + 1) from t")
runTest("select count(1 + c1) from t")
tidbStmt.execute("drop table if exists t")
tidbStmt.execute("create table t(c1 int not null, c2 int not null)")
tidbStmt.execute("insert into t values(1, 4)")
tidbStmt.execute("insert into t values(2, 2)")
refreshConnections()
judge("select count(c1 + c2) from t")
ti.meta.reloadAllMeta()
runTest("select count(c1 + c2) from t")
}

// https://github.com/pingcap/tispark/issues/496
@@ -249,7 +244,6 @@
tidbStmt.execute(
"CREATE TABLE `tmp_empty_tbl` (`c1` varchar(20))"
)
refreshConnections()
judge("select count(1) from `tmp_empty_tbl`")
judge("select cast(count(1) as char(20)) from `tmp_empty_tbl`")
}
@@ -274,6 +268,45 @@
)
}

test("unsigned bigint as group by column") {
def explainTestAndCollect(sql: String): Unit = {
val df = spark.sql(sql)
df.explain
df.show
df.collect.foreach(println)
}
tidbStmt.execute("drop table if exists table_group_by_bigint")
tidbStmt.execute("""
|CREATE TABLE `table_group_by_bigint` (
| `a` int(11) NOT NULL,
| `b` bigint(20) UNSIGNED DEFAULT NULL,
| `c` bigint(20) UNSIGNED DEFAULT NULL,
| KEY idx(b)
|) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin
""".stripMargin)
tidbStmt.execute(
"insert into table_group_by_bigint values(1, 2, 18446744073709551615), (2, 18446744073709551615, 18446744073709551614), (3, 18446744073709551615, 5), (4, 18446744073709551614, 18446744073709551614)"
)
explainTestAndCollect(
"select sum(a) from table_group_by_bigint group by b"
)
explainTestAndCollect(
"select sum(a) from table_group_by_bigint where c > 0 group by b"
)
explainTestAndCollect(
"select sum(b) from table_group_by_bigint group by c"
)
explainTestAndCollect(
"select sum(a) from table_group_by_bigint group by b"
)
explainTestAndCollect(
"select b from table_group_by_bigint group by b"
)
explainTestAndCollect(
"select b from table_group_by_bigint where c=18446744073709551614 group by b"
)
}

override def afterAll(): Unit =
try {
tidbStmt.execute("drop table if exists t")
Expand All @@ -283,6 +316,7 @@ class IssueTestSuite extends BaseTiSparkTest {
tidbStmt.execute("drop table if exists single_read")
tidbStmt.execute("drop table if exists set_t")
tidbStmt.execute("drop table if exists enum_t")
tidbStmt.execute("drop table if exists table_group_by_bigint")
} finally {
super.afterAll()
}
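The test literals above sit at the very top of the unsigned 64-bit range; a small sketch, assuming Guava on the classpath (the object name is illustrative), of how they relate to the signed bit patterns a java.lang.Long actually carries:

import com.google.common.primitives.UnsignedLong

object UnsignedTestValuesSketch {
  def main(args: Array[String]): Unit = {
    // 18446744073709551615 == 2^64 - 1: its raw 64-bit pattern is -1L as a signed
    // Long, and 18446744073709551614 is -2L. The TiConverter change recovers the
    // intended unsigned magnitude from exactly these bit patterns.
    val maxBits  = java.lang.Long.parseUnsignedLong("18446744073709551615") // -1L
    val nextBits = java.lang.Long.parseUnsignedLong("18446744073709551614") // -2L
    println(UnsignedLong.fromLongBits(maxBits))  // 18446744073709551615
    println(UnsignedLong.fromLongBits(nextBits)) // 18446744073709551614
  }
}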