Skip to content

Commit 378b458

Browse files
committed
Add support for BigQuery struct and array datatype
This builds on top of apache#817 and adds: (1) `STRUCT` literal support via the `STRUCT` keyword (https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#constructing_a_struct); (2) `STRUCT` and `ARRAY` type declarations (https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#declaring_an_array_type, https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#declaring_a_struct_type). It works around the issue of not being able to parse nested types like `ARRAY<ARRAY<INT>>` due to the right-angle-bracket ambiguity, where the tokenizer chooses the right-shift operator (this affects other dialects, such as Hive, that have similar syntax). When parsing such types, we accept a closing `>` or `>>` and track which variant is in use in order to preserve correctness. Fixes apache#901. Closes apache#817.
1 parent 83cb734 commit 378b458

File tree

8 files changed

+894
-65
lines changed

8 files changed

+894
-65
lines changed

src/ast/data_type.rs

Lines changed: 55 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ use serde::{Deserialize, Serialize};
2020
#[cfg(feature = "visitor")]
2121
use sqlparser_derive::{Visit, VisitMut};
2222

23-
use crate::ast::ObjectName;
23+
use crate::ast::{display_comma_separated, ObjectName, StructField};
2424

2525
use super::value::escape_single_quote_string;
2626

@@ -71,6 +71,10 @@ pub enum DataType {
7171
/// [standard]: https://jakewheat.github.io/sql-overview/sql-2016-foundation-grammar.html#binary-large-object-string-type
7272
/// [Oracle]: https://docs.oracle.com/javadb/10.8.3.0/ref/rrefblob.html
7373
Blob(Option<u64>),
74+
/// Variable-length binary data with optional length.
75+
///
76+
/// [bigquery]: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#bytes_type
77+
Bytes(Option<u64>),
7478
/// Numeric type with optional precision and scale e.g. NUMERIC(10,2), [standard][1]
7579
///
7680
/// [1]: https://jakewheat.github.io/sql-overview/sql-2016-foundation-grammar.html#exact-numeric-type
@@ -125,6 +129,10 @@ pub enum DataType {
125129
///
126130
/// [postgresql]: https://www.postgresql.org/docs/15/datatype.html
127131
Int4(Option<u64>),
132+
/// Integer type in [bigquery]
133+
///
134+
/// [bigquery]: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#integer_types
135+
Int64,
128136
/// Integer with optional display width e.g. INTEGER or INTEGER(11)
129137
Integer(Option<u64>),
130138
/// Unsigned int with optional display width e.g. INT UNSIGNED or INT(11) UNSIGNED
@@ -149,6 +157,10 @@ pub enum DataType {
149157
///
150158
/// [postgresql]: https://www.postgresql.org/docs/15/datatype.html
151159
Float4,
160+
/// Floating point in [bigquery]
161+
///
162+
/// [bigquery]: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#floating_point_types
163+
Float64,
152164
/// Floating point e.g. REAL
153165
Real,
154166
/// Float8 as alias for Double in [postgresql]
@@ -190,18 +202,23 @@ pub enum DataType {
190202
Regclass,
191203
/// Text
192204
Text,
193-
/// String
194-
String,
205+
/// String with optional length.
206+
String(Option<u64>),
195207
/// Bytea
196208
Bytea,
197209
/// Custom type such as enums
198210
Custom(ObjectName, Vec<String>),
199211
/// Arrays
200-
Array(Option<Box<DataType>>),
212+
Array(ArrayElemTypeDef),
201213
/// Enums
202214
Enum(Vec<String>),
203215
/// Set
204216
Set(Vec<String>),
217+
/// Struct type, as supported by [hive] and [bigquery]
218+
///
219+
/// [hive]: https://docs.cloudera.com/cdw-runtime/cloud/impala-sql-reference/topics/impala-struct.html
220+
/// [bigquery]: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#struct_type
221+
Struct(Vec<StructField>),
205222
}
206223

207224
impl fmt::Display for DataType {
@@ -231,6 +248,7 @@ impl fmt::Display for DataType {
231248
format_type_with_optional_length(f, "VARBINARY", size, false)
232249
}
233250
DataType::Blob(size) => format_type_with_optional_length(f, "BLOB", size, false),
251+
DataType::Bytes(size) => format_type_with_optional_length(f, "BYTES", size, false),
234252
DataType::Numeric(info) => {
235253
write!(f, "NUMERIC{info}")
236254
}
@@ -274,6 +292,9 @@ impl fmt::Display for DataType {
274292
DataType::Int4(zerofill) => {
275293
format_type_with_optional_length(f, "INT4", zerofill, false)
276294
}
295+
DataType::Int64 => {
296+
write!(f, "INT64")
297+
}
277298
DataType::UnsignedInt4(zerofill) => {
278299
format_type_with_optional_length(f, "INT4", zerofill, true)
279300
}
@@ -297,6 +318,7 @@ impl fmt::Display for DataType {
297318
}
298319
DataType::Real => write!(f, "REAL"),
299320
DataType::Float4 => write!(f, "FLOAT4"),
321+
DataType::Float64 => write!(f, "FLOAT64"),
300322
DataType::Double => write!(f, "DOUBLE"),
301323
DataType::Float8 => write!(f, "FLOAT8"),
302324
DataType::DoublePrecision => write!(f, "DOUBLE PRECISION"),
@@ -316,15 +338,13 @@ impl fmt::Display for DataType {
316338
DataType::JSON => write!(f, "JSON"),
317339
DataType::Regclass => write!(f, "REGCLASS"),
318340
DataType::Text => write!(f, "TEXT"),
319-
DataType::String => write!(f, "STRING"),
341+
DataType::String(size) => format_type_with_optional_length(f, "STRING", size, false),
320342
DataType::Bytea => write!(f, "BYTEA"),
321-
DataType::Array(ty) => {
322-
if let Some(t) = &ty {
323-
write!(f, "{t}[]")
324-
} else {
325-
write!(f, "ARRAY")
326-
}
327-
}
343+
DataType::Array(ty) => match ty {
344+
ArrayElemTypeDef::None => write!(f, "ARRAY"),
345+
ArrayElemTypeDef::SquareBracket(t) => write!(f, "{t}[]"),
346+
ArrayElemTypeDef::AngleBracket(t) => write!(f, "ARRAY<{t}>"),
347+
},
328348
DataType::Custom(ty, modifiers) => {
329349
if modifiers.is_empty() {
330350
write!(f, "{ty}")
@@ -352,6 +372,13 @@ impl fmt::Display for DataType {
352372
}
353373
write!(f, ")")
354374
}
375+
DataType::Struct(fields) => {
376+
if !fields.is_empty() {
377+
write!(f, "STRUCT<{}>", display_comma_separated(fields))
378+
} else {
379+
write!(f, "STRUCT")
380+
}
381+
}
355382
}
356383
}
357384
}
@@ -533,3 +560,19 @@ impl fmt::Display for CharLengthUnits {
533560
}
534561
}
535562
}
563+
564+
/// Represents the data type of the elements in an array (if any) as well as
565+
/// the syntax used to declare the array.
566+
///
567+
/// For example: BigQuery/Hive use `ARRAY<INT>` whereas Snowflake uses `ARRAY`.
568+
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
569+
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
570+
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
571+
pub enum ArrayElemTypeDef {
572+
/// ARRAY
573+
None,
574+
/// `ARRAY<INT>`
575+
AngleBracket(Box<DataType>),
576+
/// `INT[]`
577+
SquareBracket(Box<DataType>),
578+
}

src/ast/mod.rs

Lines changed: 57 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ use serde::{Deserialize, Serialize};
2626
use sqlparser_derive::{Visit, VisitMut};
2727

2828
pub use self::data_type::{
29-
CharLengthUnits, CharacterLength, DataType, ExactNumberInfo, TimezoneInfo,
29+
ArrayElemTypeDef, CharLengthUnits, CharacterLength, DataType, ExactNumberInfo, TimezoneInfo,
3030
};
3131
pub use self::dcl::{AlterRoleOperation, ResetConfig, RoleOption, SetConfigValue};
3232
pub use self::ddl::{
@@ -322,6 +322,27 @@ impl fmt::Display for JsonOperator {
322322
}
323323
}
324324

325+
/// A field definition within a struct.
326+
///
327+
/// [bigquery]: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#struct_type
328+
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
329+
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
330+
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
331+
pub struct StructField {
332+
pub field_name: Option<Ident>,
333+
pub field_type: DataType,
334+
}
335+
336+
impl fmt::Display for StructField {
337+
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
338+
if let Some(name) = &self.field_name {
339+
write!(f, "{name} {}", self.field_type)
340+
} else {
341+
write!(f, "{}", self.field_type)
342+
}
343+
}
344+
}
345+
325346
/// An SQL expression of any type.
326347
///
327348
/// The parser does not distinguish between expressions of different types
@@ -569,6 +590,26 @@ pub enum Expr {
569590
Rollup(Vec<Vec<Expr>>),
570591
/// ROW / TUPLE a single value, such as `SELECT (1, 2)`
571592
Tuple(Vec<Expr>),
593+
/// `BigQuery` specific `Struct` literal expression
594+
/// Syntax:
595+
/// ```sql
596+
/// STRUCT<[field_name] field_type, ...>( expr1 [, ... ])
597+
/// ```
598+
/// <https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#struct_type>
599+
Struct {
600+
/// Struct values.
601+
values: Vec<Expr>,
602+
/// Struct field definitions.
603+
fields: Vec<StructField>,
604+
},
605+
/// `BigQuery` specific: A named expression in a typeless struct
606+
///
607+
/// Syntax
608+
/// ```sql
609+
/// 1 AS a
610+
/// ```
611+
/// <https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#struct_type>
612+
Named { expr: Box<Expr>, name: Ident },
572613
/// An array index expression e.g. `(ARRAY[1, 2])[1]` or `(current_schemas(FALSE))[1]`
573614
ArrayIndex { obj: Box<Expr>, indexes: Vec<Expr> },
574615
/// An array expression e.g. `ARRAY[1, 2]`
@@ -917,6 +958,21 @@ impl fmt::Display for Expr {
917958
Expr::Tuple(exprs) => {
918959
write!(f, "({})", display_comma_separated(exprs))
919960
}
961+
Expr::Struct { values, fields } => {
962+
if !fields.is_empty() {
963+
write!(
964+
f,
965+
"STRUCT<{}>({})",
966+
display_comma_separated(fields),
967+
display_comma_separated(values)
968+
)
969+
} else {
970+
write!(f, "STRUCT({})", display_comma_separated(values))
971+
}
972+
}
973+
Expr::Named { expr, name } => {
974+
write!(f, "{} AS {}", expr, name)
975+
}
920976
Expr::ArrayIndex { obj, indexes } => {
921977
write!(f, "{obj}")?;
922978
for i in indexes {

src/keywords.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,7 @@ define_keywords!(
120120
BY,
121121
BYPASSRLS,
122122
BYTEA,
123+
BYTES,
123124
CACHE,
124125
CALL,
125126
CALLED,
@@ -270,6 +271,7 @@ define_keywords!(
270271
FIRST_VALUE,
271272
FLOAT,
272273
FLOAT4,
274+
FLOAT64,
273275
FLOAT8,
274276
FLOOR,
275277
FOLLOWING,
@@ -293,6 +295,7 @@ define_keywords!(
293295
FUSION,
294296
GENERATE,
295297
GENERATED,
298+
GEOGRAPHY,
296299
GET,
297300
GLOBAL,
298301
GRANT,
@@ -328,6 +331,7 @@ define_keywords!(
328331
INT,
329332
INT2,
330333
INT4,
334+
INT64,
331335
INT8,
332336
INTEGER,
333337
INTERSECT,
@@ -580,6 +584,7 @@ define_keywords!(
580584
STORED,
581585
STRICT,
582586
STRING,
587+
STRUCT,
583588
SUBMULTISET,
584589
SUBSTRING,
585590
SUBSTRING_REGEX,

0 commit comments

Comments
 (0)