
Commit ddab984

Merge pull request #19 from yhuai/pr3431yin
Parse data types in DDLParser.
2 parents cf982d2 + 91ad91b commit ddab984

File tree

4 files changed: +74, -88 lines changed


sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala

Lines changed: 64 additions & 79 deletions
@@ -19,10 +19,9 @@ package org.apache.spark.sql.sources
 
 import scala.language.implicitConversions
 import scala.util.parsing.combinator.syntactical.StandardTokenParsers
-import scala.util.parsing.combinator.{RegexParsers, PackratParsers}
+import scala.util.parsing.combinator.PackratParsers
 
 import org.apache.spark.Logging
-import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.sql.SQLContext
 import org.apache.spark.sql.catalyst.types._
 import org.apache.spark.sql.execution.RunnableCommand
@@ -44,18 +43,43 @@ private[sql] class DDLParser extends StandardTokenParsers with PackratParsers wi
     }
   }
 
+  def parseType(input: String): DataType = {
+    phrase(dataType)(new lexical.Scanner(input)) match {
+      case Success(r, x) => r
+      case x =>
+        sys.error(s"Unsupported dataType: $x")
+    }
+  }
+
   protected case class Keyword(str: String)
 
   protected implicit def asParser(k: Keyword): Parser[String] =
     lexical.allCaseVersions(k.str).map(x => x : Parser[String]).reduce(_ | _)
 
   protected val CREATE = Keyword("CREATE")
-  protected val DECIMAL = Keyword("DECIMAL")
   protected val TEMPORARY = Keyword("TEMPORARY")
   protected val TABLE = Keyword("TABLE")
   protected val USING = Keyword("USING")
   protected val OPTIONS = Keyword("OPTIONS")
 
+  // Data types.
+  protected val STRING = Keyword("STRING")
+  protected val FLOAT = Keyword("FLOAT")
+  protected val INT = Keyword("INT")
+  protected val TINYINT = Keyword("TINYINT")
+  protected val SMALLINT = Keyword("SMALLINT")
+  protected val DOUBLE = Keyword("DOUBLE")
+  protected val BIGINT = Keyword("BIGINT")
+  protected val BINARY = Keyword("BINARY")
+  protected val BOOLEAN = Keyword("BOOLEAN")
+  protected val DECIMAL = Keyword("DECIMAL")
+  protected val DATE = Keyword("DATE")
+  protected val TIMESTAMP = Keyword("TIMESTAMP")
+  protected val VARCHAR = Keyword("VARCHAR")
+  protected val ARRAY = Keyword("ARRAY")
+  protected val MAP = Keyword("MAP")
+  protected val STRUCT = Keyword("STRUCT")
+
   // Use reflection to find the reserved words defined in this class.
   protected val reservedWords =
     this.getClass
@@ -77,20 +101,15 @@ private[sql] class DDLParser extends StandardTokenParsers with PackratParsers wi
    * OPTIONS (path "../hive/src/test/resources/data/files/episodes.avro")`
    */
   protected lazy val createTable: Parser[LogicalPlan] =
-    ( CREATE ~ TEMPORARY ~ TABLE ~> ident ~ (USING ~> className) ~ (OPTIONS ~> options) ^^ {
-      case tableName ~ provider ~ opts =>
-        CreateTableUsing(tableName, Seq.empty, provider, opts)
-    }
-    |
+    (
       CREATE ~ TEMPORARY ~ TABLE ~> ident
-        ~ tableCols ~ (USING ~> className) ~ (OPTIONS ~> options) ^^ {
-      case tableName ~ tableColumns ~ provider ~ opts =>
-        CreateTableUsing(tableName, tableColumns, provider, opts)
+        ~ (tableCols).? ~ (USING ~> className) ~ (OPTIONS ~> options) ^^ {
+      case tableName ~ columns ~ provider ~ opts =>
+        val tblColumns = if(columns.isEmpty) Seq.empty else columns.get
+        CreateTableUsing(tableName, tblColumns, provider, opts)
     }
     )
 
-  protected lazy val metastoreTypes = new MetastoreTypes
-
   protected lazy val tableCols: Parser[Seq[StructField]] = "(" ~> repsep(column, ",") <~ ")"
 
   protected lazy val options: Parser[Map[String, String]] =
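Because tableCols is now optional in the rule above, one createTable production covers both DDL shapes. The following sketch is not part of the diff: the provider class org.example.MySource, the table names, and the OPTIONS keys are hypothetical, and it assumes an active SQLContext whose sql() routes CREATE TEMPORARY TABLE statements through this DDLParser.

import org.apache.spark.sql.SQLContext

// Hypothetical helper, for illustration only.
def registerTables(sqlContext: SQLContext): Unit = {
  // Explicit schema: the parenthesized column list is parsed by tableCols/column.
  sqlContext.sql(
    """CREATE TEMPORARY TABLE withSchema(a INT, b STRING, c DECIMAL(10,2))
      |USING org.example.MySource
      |OPTIONS (path 'file.dat')""".stripMargin)

  // No column list: (tableCols).? yields None, so CreateTableUsing receives
  // Seq.empty and the relation supplies its own schema.
  sqlContext.sql(
    """CREATE TEMPORARY TABLE noSchema
      |USING org.example.MySource
      |OPTIONS (path 'file.dat')""".stripMargin)
}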
@@ -101,96 +120,62 @@ private[sql] class DDLParser extends StandardTokenParsers with PackratParsers wi
   protected lazy val pair: Parser[(String, String)] = ident ~ stringLit ^^ { case k ~ v => (k,v) }
 
   protected lazy val column: Parser[StructField] =
-    ( ident ~ ident ^^ { case name ~ typ =>
-      StructField(name, metastoreTypes.toDataType(typ))
+    ident ~ dataType ^^ { case columnName ~ typ =>
+      StructField(cleanIdentifier(columnName), typ)
     }
-    |
-    ident ~ (DECIMAL ~ "(" ~> numericLit) ~ ("," ~> numericLit <~ ")") ^^ {
-      case name ~ precision ~ scale =>
-        StructField(name, DecimalType(precision.toInt, scale.toInt))
-    }
-    )
-}
 
-/**
- * :: DeveloperApi ::
- * Provides a parser for data types.
- */
-@DeveloperApi
-private[sql] class MetastoreTypes extends RegexParsers {
   protected lazy val primitiveType: Parser[DataType] =
-    "string" ^^^ StringType |
-    "float" ^^^ FloatType |
-    "int" ^^^ IntegerType |
-    "tinyint" ^^^ ByteType |
-    "smallint" ^^^ ShortType |
-    "double" ^^^ DoubleType |
-    "bigint" ^^^ LongType |
-    "binary" ^^^ BinaryType |
-    "boolean" ^^^ BooleanType |
-    fixedDecimalType | // decimal with precision/scale
-    "decimal" ^^^ DecimalType.Unlimited | // decimal with no precision/scale
-    "date" ^^^ DateType |
-    "timestamp" ^^^ TimestampType |
-    "varchar\\((\\d+)\\)".r ^^^ StringType
+    STRING ^^^ StringType |
+    BINARY ^^^ BinaryType |
+    BOOLEAN ^^^ BooleanType |
+    TINYINT ^^^ ByteType |
+    SMALLINT ^^^ ShortType |
+    INT ^^^ IntegerType |
+    BIGINT ^^^ LongType |
+    FLOAT ^^^ FloatType |
+    DOUBLE ^^^ DoubleType |
+    fixedDecimalType | // decimal with precision/scale
+    DECIMAL ^^^ DecimalType.Unlimited | // decimal with no precision/scale
+    DATE ^^^ DateType |
+    TIMESTAMP ^^^ TimestampType |
+    VARCHAR ~ "(" ~ numericLit ~ ")" ^^^ StringType
 
   protected lazy val fixedDecimalType: Parser[DataType] =
-    ("decimal" ~> "(" ~> "\\d+".r) ~ ("," ~> "\\d+".r <~ ")") ^^ {
-      case precision ~ scale =>
-        DecimalType(precision.toInt, scale.toInt)
+    (DECIMAL ~ "(" ~> numericLit) ~ ("," ~> numericLit <~ ")") ^^ {
+      case precision ~ scale => DecimalType(precision.toInt, scale.toInt)
     }
 
   protected lazy val arrayType: Parser[DataType] =
-    "array" ~> "<" ~> dataType <~ ">" ^^ {
+    ARRAY ~> "<" ~> dataType <~ ">" ^^ {
       case tpe => ArrayType(tpe)
     }
 
   protected lazy val mapType: Parser[DataType] =
-    "map" ~> "<" ~> dataType ~ "," ~ dataType <~ ">" ^^ {
+    MAP ~> "<" ~> dataType ~ "," ~ dataType <~ ">" ^^ {
       case t1 ~ _ ~ t2 => MapType(t1, t2)
     }
 
   protected lazy val structField: Parser[StructField] =
-    "[a-zA-Z0-9_]*".r ~ ":" ~ dataType ^^ {
-      case name ~ _ ~ tpe => StructField(name, tpe, nullable = true)
+    ident ~ ":" ~ dataType ^^ {
+      case fieldName ~ _ ~ tpe => StructField(cleanIdentifier(fieldName), tpe, nullable = true)
     }
 
   protected lazy val structType: Parser[DataType] =
-    "struct" ~> "<" ~> repsep(structField,",") <~ ">" ^^ {
+    STRUCT ~> "<" ~> repsep(structField, ",") <~ ">" ^^ {
       case fields => new StructType(fields)
     }
 
   private[sql] lazy val dataType: Parser[DataType] =
     arrayType |
-      mapType |
-      structType |
-      primitiveType
-
-  def toDataType(metastoreType: String): DataType = parseAll(dataType, metastoreType) match {
-    case Success(result, _) => result
-    case failure: NoSuccess => sys.error(s"Unsupported dataType: $metastoreType")
-  }
-
-  def toMetastoreType(dt: DataType): String = dt match {
-    case ArrayType(elementType, _) => s"array<${toMetastoreType(elementType)}>"
-    case StructType(fields) =>
-      s"struct<${fields.map(f => s"${f.name}:${toMetastoreType(f.dataType)}").mkString(",")}>"
-    case MapType(keyType, valueType, _) =>
-      s"map<${toMetastoreType(keyType)},${toMetastoreType(valueType)}>"
-    case StringType => "string"
-    case FloatType => "float"
-    case IntegerType => "int"
-    case ByteType => "tinyint"
-    case ShortType => "smallint"
-    case DoubleType => "double"
-    case LongType => "bigint"
-    case BinaryType => "binary"
-    case BooleanType => "boolean"
-    case DateType => "date"
-    case d: DecimalType => "decimal"
-    case TimestampType => "timestamp"
-    case NullType => "void"
-    case udt: UserDefinedType[_] => toMetastoreType(udt.sqlType)
+    mapType |
+    structType |
+    primitiveType
+
+  protected val escapedIdentifier = "`([^`]+)`".r
+  /** Strips backticks from ident if present */
+  protected def cleanIdentifier(ident: String): String = ident match {
+    case escapedIdentifier(i) => i
+    case plainIdent => plainIdent
   }
 }
 
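Taken together, the hunks above give DDLParser a standalone entry point for type strings. The following is a minimal sketch, not part of this commit: the demo object is hypothetical, and it assumes compilation inside the org.apache.spark.sql.sources package since DDLParser is private[sql]; the commented results reflect the grammar shown in the diff.

package org.apache.spark.sql.sources

import org.apache.spark.sql.catalyst.types._

// Hypothetical demo object, for illustration only.
object ParseTypeDemo {
  def main(args: Array[String]): Unit = {
    val parser = new DDLParser

    // Keywords go through lexical.allCaseVersions, so type names are case-insensitive.
    println(parser.parseType("iNt"))           // IntegerType
    println(parser.parseType("TIMESTAMP"))     // TimestampType

    // fixedDecimalType handles a precision/scale pair; bare DECIMAL is unlimited.
    println(parser.parseType("decimal(10,2)")) // DecimalType(10,2)
    println(parser.parseType("decimal"))       // DecimalType.Unlimited

    // Complex types recurse through dataType.
    println(parser.parseType("array<struct<name:string,age:int>>"))

    // Anything unrecognized falls into the error branch of parseType:
    // parser.parseType("foo") would call sys.error("Unsupported dataType: ...").
  }
}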

sql/core/src/test/scala/org/apache/spark/sql/sources/NewTableScanSuite.scala

Lines changed: 3 additions & 3 deletions
@@ -93,9 +93,9 @@ class NewTableScanSuite extends DataSourceTest {
   before {
     sql(
       """
-        |CREATE TEMPORARY TABLE oneToTen(stringField string, intField int, longField bigint,
-        |floatField float, doubleField double, shortField smallint, byteField tinyint,
-        |booleanField boolean, decimalField decimal(10,2), dateField date, timestampField timestamp)
+        |CREATE TEMPORARY TABLE oneToTen(stringField stRIng, intField iNt, longField Bigint,
+        |floatField flOat, doubleField doubLE, shortField smaLlint, byteField tinyint,
+        |booleanField boolean, decimalField decimal(10,2), dateField dAte, timestampField tiMestamp)
        |USING org.apache.spark.sql.sources.AllDataTypesScanSource
        |OPTIONS (
        |  From '1',
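The test now deliberately mixes the case of every type name. That works because each Keyword in DDLParser is turned into a parser via lexical.allCaseVersions, which accepts every capitalization of the word. A rough standalone approximation of that idea (not Spark's actual code; paste into a Scala REPL):

// Approximation of the allCaseVersions idea: enumerate every capitalization
// of a keyword so that "stRIng", "Bigint", "tiMestamp", etc. all match.
def allCaseVersions(s: String, prefix: String = ""): Stream[String] =
  if (s.isEmpty) Stream(prefix)
  else allCaseVersions(s.tail, prefix + s.head.toLower) ++
       allCaseVersions(s.tail, prefix + s.head.toUpper)

// allCaseVersions("int") yields: int, inT, iNt, iNT, Int, InT, INt, INT
assert(allCaseVersions("int").contains("iNt"))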

sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala

Lines changed: 3 additions & 5 deletions
@@ -38,7 +38,6 @@ import org.apache.spark.sql.catalyst.plans.logical
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.rules._
 import org.apache.spark.sql.catalyst.types._
-import org.apache.spark.sql.sources.MetastoreTypes
 import org.apache.spark.util.Utils
 
 /* Implicit conversions */
@@ -438,7 +437,7 @@ private[hive] case class MetastoreRelation
   implicit class SchemaAttribute(f: FieldSchema) {
     def toAttribute = AttributeReference(
       f.getName,
-      HiveMetastoreTypes.toDataType(f.getType),
+      sqlContext.ddlParser.parseType(f.getType),
       // Since data can be dumped in randomly with no validation, everything is nullable.
       nullable = true
     )(qualifiers = Seq(alias.getOrElse(tableName)))
@@ -459,9 +458,8 @@ private[hive] case class MetastoreRelation
   val columnOrdinals = AttributeMap(attributes.zipWithIndex)
 }
 
-
-object HiveMetastoreTypes extends MetastoreTypes {
-  override def toMetastoreType(dt: DataType): String = dt match {
+object HiveMetastoreTypes {
+  def toMetastoreType(dt: DataType): String = dt match {
     case ArrayType(elementType, _) => s"array<${toMetastoreType(elementType)}>"
     case StructType(fields) =>
       s"struct<${fields.map(f => s"${f.name}:${toMetastoreType(f.dataType)}").mkString(",")}>"

sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala

Lines changed: 4 additions & 1 deletion
@@ -20,14 +20,17 @@ package org.apache.spark.sql.hive
 import org.scalatest.FunSuite
 
 import org.apache.spark.sql.catalyst.types.StructType
+import org.apache.spark.sql.sources.DDLParser
 import org.apache.spark.sql.test.ExamplePointUDT
 
 class HiveMetastoreCatalogSuite extends FunSuite {
 
   test("struct field should accept underscore in sub-column name") {
     val metastr = "struct<a: int, b_1: string, c: string>"
 
-    val datatype = HiveMetastoreTypes.toDataType(metastr)
+    val ddlParser = new DDLParser
+
+    val datatype = ddlParser.parseType(metastr)
     assert(datatype.isInstanceOf[StructType])
   }
 
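For reference, a REPL-style illustration (not an assertion from this commit) of the value the test's struct string is expected to parse into, given the structField rule shown in the ddl.scala diff:

// parseType("struct<a: int, b_1: string, c: string>") should yield this value;
// struct fields are nullable by default in the new structField rule.
import org.apache.spark.sql.catalyst.types._

val expected = StructType(Seq(
  StructField("a", IntegerType, nullable = true),
  StructField("b_1", StringType, nullable = true),
  StructField("c", StringType, nullable = true)))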
