Skip to content

Commit

Permalink
[GLUTEN-6840][CH] Enable cache files for hdfs (#6841)
Browse files Browse the repository at this point in the history
  • Loading branch information
loneylee authored Aug 21, 2024
1 parent 371be6f commit 371d448
Show file tree
Hide file tree
Showing 32 changed files with 1,616 additions and 222 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,224 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

// ---------------------------------------------------------------------------
// Combined ANTLR grammar for Gluten's "CACHE FILES" SQL command extension.
// NOTE(review): keywords and the LETTER fragment below are upper-case only,
// so this grammar presumably relies on the caller feeding it an upper-casing
// character stream (as Spark does for its own SqlBase grammar) -- confirm at
// the lexer construction site.
// ---------------------------------------------------------------------------
grammar GlutenCacheFileSqlBase;

// Embedded Java helper used by the DECIMAL/DOUBLE/BIGDECIMAL lexer predicates.
@members {
/**
* Verify whether current token is a valid decimal token (which contains dot).
* Returns true if the character that follows the token is not a digit or letter or underscore.
*
* For example:
* For char stream "2.3", "2." is not a valid decimal token, because it is followed by digit '3'.
* For char stream "2.3_", "2.3" is not a valid decimal token, because it is followed by '_'.
* For char stream "2.3W", "2.3" is not a valid decimal token, because it is followed by 'W'.
* For char stream "12.0D 34.E2+0.12 " 12.0D is a valid decimal token because it is followed
* by a space. 34.E2 is a valid decimal token because it is followed by symbol '+'
* which is not a digit or letter or underscore.
*/
public boolean isValidDecimal() {
int nextChar = _input.LA(1);
if (nextChar >= 'A' && nextChar <= 'Z' || nextChar >= '0' && nextChar <= '9' ||
nextChar == '_') {
return false;
} else {
return true;
}
}
}

// Virtual token: injected by the statement-splitting lexer wrapper, never
// produced by a lexer rule in this grammar.
tokens {
DELIMITER
}

// Entry point: a single statement, optionally followed by semicolons.
singleStatement
: statement ';'* EOF
;

// Either the CACHE FILES command this grammar exists for (#cacheFiles), or
// anything else (#passThrough), which the caller hands back to the delegate
// Spark parser unchanged.
statement
: CACHE FILES ASYNC? SELECT selectedColumns=selectedColumnNames
FROM (path=STRING)
(CACHEPROPERTIES cacheProps=propertyList)? #cacheFiles
| .*? #passThrough
;

// Column projection: '*' or a comma-separated identifier list.
selectedColumnNames
: ASTERISK
| identifier (COMMA identifier)*
;

// Parenthesized, comma-separated key[=value] pairs for CACHEPROPERTIES.
propertyList
: LEFT_PAREN property (COMMA property)* RIGHT_PAREN
;

// One property; both the '=' and the value are optional.
property
: key=propertyKey (EQ? value=propertyValue)?
;

// Property key: a dotted identifier path or a quoted string.
propertyKey
: identifier (DOT identifier)*
| stringLit
;

// Property value: number, boolean, a function-like form identifier(str, str),
// or a plain string.
propertyValue
: INTEGER_VALUE
| DECIMAL_VALUE
| booleanValue
| identifier LEFT_PAREN stringLit COMMA stringLit RIGHT_PAREN
| value=stringLit
;

// NOTE(review): the STRING lexer rule below also matches double-quoted
// strings and is defined before DOUBLEQUOTED_STRING, so the second
// alternative here appears unreachable -- verify against the lexer.
stringLit
: STRING
| DOUBLEQUOTED_STRING
;

booleanValue
: TRUE | FALSE
;

// Identifiers may be plain, backquoted, or any keyword listed in nonReserved.
identifier
: IDENTIFIER #unquotedIdentifier
| quotedIdentifier #quotedIdentifierAlternative
| nonReserved #unquotedIdentifier
;

quotedIdentifier
: BACKQUOTED_IDENTIFIER
;
// Add keywords here so that people's queries don't break if they have a column name as one of
// these tokens.
// Fix: removed the dangling '|' before ';', which created an empty
// alternative -- that let nonReserved (and therefore identifier) match the
// empty string, making rules like "identifier (COMMA identifier)*" ambiguous.
nonReserved
: CACHE | FILES | ASYNC
| SELECT | FOR | AFTER | CACHEPROPERTIES
| TIMESTAMP | AS | OF | DATE_PARTITION
;

// Define how the keywords above should appear in a user's SQL statement.
CACHE: 'CACHE';
// NOTE(review): META is defined but referenced by no parser rule in this
// grammar and is absent from nonReserved -- possibly leftover.
META: 'META';
ASYNC: 'ASYNC';
SELECT: 'SELECT';
COMMA: ',';
FOR: 'FOR';
FROM: 'FROM';
AFTER: 'AFTER';
CACHEPROPERTIES: 'CACHEPROPERTIES';
DOT: '.';
ASTERISK: '*';
TIMESTAMP: 'TIMESTAMP';
AS: 'AS';
OF: 'OF';
DATE_PARTITION: 'DATE_PARTITION';
LEFT_PAREN: '(';
RIGHT_PAREN: ')';
TRUE: 'TRUE';
FALSE: 'FALSE';
FILES: 'FILES';

// Operator tokens. Only EQ is used by the parser rules above; the rest are
// presumably carried over from Spark's base grammar -- confirm before removal.
EQ : '=' | '==';
NSEQ: '<=>';
NEQ : '<>';
NEQJ: '!=';
LTE : '<=' | '!>';
GTE : '>=' | '!<';
CONCAT_PIPE: '||';

// Single- or double-quoted string with backslash escapes.
// NOTE(review): because this rule also matches double-quoted text and is
// defined first, DOUBLEQUOTED_STRING below can never be emitted by the lexer.
STRING
: '\'' ( ~('\''|'\\') | ('\\' .) )* '\''
| '"' ( ~('"'|'\\') | ('\\' .) )* '"'
;

DOUBLEQUOTED_STRING
:'"' ( ~('"'|'\\') | ('\\' .) )* '"'
;

// NOTE(review): the BIGINT/SMALLINT/TINYINT/DOUBLE/BIGDECIMAL literal tokens
// below are referenced by no parser rule in this grammar (propertyValue uses
// only INTEGER_VALUE and DECIMAL_VALUE) -- presumably copied from Spark's
// SqlBase grammar for lexing compatibility.
BIGINT_LITERAL
: DIGIT+ 'L'
;

SMALLINT_LITERAL
: DIGIT+ 'S'
;

TINYINT_LITERAL
: DIGIT+ 'Y'
;

INTEGER_VALUE
: DIGIT+
;

// The isValidDecimal() predicate rejects a decimal immediately followed by a
// letter, digit, or underscore (see @members above).
DECIMAL_VALUE
: DIGIT+ EXPONENT
| DECIMAL_DIGITS EXPONENT? {isValidDecimal()}?
;

DOUBLE_LITERAL
: DIGIT+ EXPONENT? 'D'
| DECIMAL_DIGITS EXPONENT? 'D' {isValidDecimal()}?
;

BIGDECIMAL_LITERAL
: DIGIT+ EXPONENT? 'BD'
| DECIMAL_DIGITS EXPONENT? 'BD' {isValidDecimal()}?
;

// NOTE(review): LETTER is upper-case only, so unquoted identifiers lex only
// when the input stream is upper-cased -- TODO confirm the char stream.
IDENTIFIER
: (LETTER | DIGIT | '_')+
;

// Backquoted identifier; a doubled backquote is the escape for a literal one.
BACKQUOTED_IDENTIFIER
: '`' ( ~'`' | '``' )* '`'
;

fragment DECIMAL_DIGITS
: DIGIT+ '.' DIGIT*
| '.' DIGIT+
;

fragment EXPONENT
: 'E' [+-]? DIGIT+
;

fragment DIGIT
: [0-9]
;

fragment LETTER
: [A-Z]
;

// Comments and whitespace are hidden from the parser, not discarded.
SIMPLE_COMMENT
: '--' ~[\r\n]* '\r'? '\n'? -> channel(HIDDEN)
;

BRACKETED_COMMENT
: '/*' .*? '*/' -> channel(HIDDEN)
;

WS : [ \r\n\t]+ -> channel(HIDDEN)
;

// Catch-all for anything we can't recognize.
// We use this to be able to ignore and recover all the text
// when splitting statements with DelimiterLexer
UNRECOGNIZED
: .
;
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.gluten.parser

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier}
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.catalyst.parser.ParserInterface
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.types.{DataType, StructType}

/**
 * Parser extension that recognizes Gluten's `CACHE FILES` command.
 *
 * Only `parsePlan` is intercepted: the SQL text is first run through the
 * cache-files grammar and, when the statement is not a `CACHE FILES` command
 * (the AST builder yields no [[LogicalPlan]]), parsing falls back to the
 * wrapped [[ParserInterface]]. Every other entry point delegates directly.
 */
class GlutenCacheFilesSqlParser(spark: SparkSession, delegate: ParserInterface)
  extends GlutenCacheFileSqlParserBase {

  override def parsePlan(sqlText: String): LogicalPlan =
    parse(sqlText) { parser =>
      astBuilder.visit(parser.singleStatement()) match {
        case plan: LogicalPlan => plan
        // Not a CACHE FILES statement -- let Spark's own parser handle it.
        case _ => delegate.parsePlan(sqlText)
      }
    }

  // Straight pass-throughs to the delegate Spark parser.
  override def parseExpression(sqlText: String): Expression =
    delegate.parseExpression(sqlText)

  override def parseTableIdentifier(sqlText: String): TableIdentifier =
    delegate.parseTableIdentifier(sqlText)

  override def parseFunctionIdentifier(sqlText: String): FunctionIdentifier =
    delegate.parseFunctionIdentifier(sqlText)

  override def parseMultipartIdentifier(sqlText: String): Seq[String] =
    delegate.parseMultipartIdentifier(sqlText)

  override def parseTableSchema(sqlText: String): StructType =
    delegate.parseTableSchema(sqlText)

  override def parseDataType(sqlText: String): DataType =
    delegate.parseDataType(sqlText)
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.gluten.parser

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier}
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.catalyst.parser.ParserInterface
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.types.{DataType, StructType}

/**
 * Injects Gluten's `CACHE FILES` statement into Spark's SQL parsing chain.
 *
 * `parsePlan` attempts the cache-files grammar first; statements the grammar
 * classifies as pass-through are re-parsed by the wrapped delegate. All other
 * [[ParserInterface]] methods are unconditional delegations.
 */
class GlutenCacheFilesSqlParser(spark: SparkSession, delegate: ParserInterface)
  extends GlutenCacheFileSqlParserBase {

  override def parsePlan(sqlText: String): LogicalPlan = parse(sqlText) { parser =>
    val visited = astBuilder.visit(parser.singleStatement())
    visited match {
      case plan: LogicalPlan => plan
      // Anything else means the grammar matched #passThrough; defer to Spark.
      case _ => delegate.parsePlan(sqlText)
    }
  }

  // The remaining entry points simply forward to the delegate parser.
  override def parseExpression(sqlText: String): Expression =
    delegate.parseExpression(sqlText)

  override def parseTableIdentifier(sqlText: String): TableIdentifier =
    delegate.parseTableIdentifier(sqlText)

  override def parseFunctionIdentifier(sqlText: String): FunctionIdentifier =
    delegate.parseFunctionIdentifier(sqlText)

  override def parseMultipartIdentifier(sqlText: String): Seq[String] =
    delegate.parseMultipartIdentifier(sqlText)

  override def parseTableSchema(sqlText: String): StructType =
    delegate.parseTableSchema(sqlText)

  override def parseDataType(sqlText: String): DataType =
    delegate.parseDataType(sqlText)

  override def parseQuery(sqlText: String): LogicalPlan =
    delegate.parseQuery(sqlText)
}
Loading

0 comments on commit 371d448

Please sign in to comment.