From dbf833ffab5d8f40453d35e27baa2e1e9d5a9f60 Mon Sep 17 00:00:00 2001
From: Piotr Grabowski
Date: Fri, 28 Feb 2025 09:17:46 +0100
Subject: [PATCH] Add tests of lexer dialects (sqlparse, sqlfluff)

This PR adds tests of our ports of the lexer dialects from sqlparse and
sqlfluff. The test files are SQL queries extracted from the test suites
of sqlparse and sqlfluff. The expected outputs (tokens) were collected
by running those queries through the original sqlparse and sqlfluff
lexers.

This gives us very good test coverage (>10k SQL queries, including many
edge-case/"strange"/"tricky" queries) and near-certainty that our
implementation behaves identically to the original sqlparse/sqlfluff
implementations.

The test files are stored in a separate repository (added to Quesma as
a Git submodule) because they are large (~10MB).
---
 .gitmodules                                   |  3 +
 .../sql/lexer/dialect_sqlparse/rule_test.go   | 40 +++++++++++-
 .../lexer/dialects_sqlfluff/ansi/rule_test.go | 42 +++++++++++-
 .../sql/lexer/testutils/testdata_loader.go    | 65 +++++++++++++++++++
 platform/parsers/sql/testdata                 |  1 +
 5 files changed, 149 insertions(+), 2 deletions(-)
 create mode 100644 .gitmodules
 create mode 100644 platform/parsers/sql/lexer/testutils/testdata_loader.go
 create mode 160000 platform/parsers/sql/testdata

diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 000000000..47ba6be58
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "platform/parsers/sql/testdata"]
+	path = platform/parsers/sql/testdata
+	url = https://github.com/avelanarius/quesma-testdata-wip.git
diff --git a/platform/parsers/sql/lexer/dialect_sqlparse/rule_test.go b/platform/parsers/sql/lexer/dialect_sqlparse/rule_test.go
index 7805665f2..2547dac3b 100644
--- a/platform/parsers/sql/lexer/dialect_sqlparse/rule_test.go
+++ b/platform/parsers/sql/lexer/dialect_sqlparse/rule_test.go
@@ -4,9 +4,11 @@
 package dialect_sqlparse
 
 import (
-	"github.com/QuesmaOrg/quesma/platform/parsers/sql/lexer/core"
 	"testing"
 
+	"github.com/QuesmaOrg/quesma/platform/parsers/sql/lexer/core"
+	"github.com/QuesmaOrg/quesma/platform/parsers/sql/lexer/testutils"
+
 	"github.com/stretchr/testify/assert"
 )
 
@@ -38,6 +40,42 @@ func TestSimpleSelect(t *testing.T) {
 	assert.Equal(t, "tabela", tokens[6].RawValue)
 }
 
+// TestSqlparseTestcases replays recorded queries through our port of the
+// sqlparse lexer and compares the result, token by token, with the output
+// of the original sqlparse lexer.
+func TestSqlparseTestcases(t *testing.T) {
+	testfiles := []string{
+		"../../testdata/testdata/dialect_sqlparse/parsed-sqlparse-testcases.txt",
+		"../../testdata/testdata/dialect_sqlparse/parsed-sqlfluff-all-testcases.txt",
+	}
+
+	for _, testfile := range testfiles {
+		t.Run(testfile, func(t *testing.T) {
+			testcases := testutils.LoadParsedTestcases(testfile)
+			for _, testcase := range testcases {
+				t.Run(testcase.Query, func(t *testing.T) {
+					tokens := core.Lex(testcase.Query, SqlparseRules)
+					assert.Equal(t, len(testcase.ExpectedTokens), len(tokens))
+
+					commonLength := min(len(testcase.ExpectedTokens), len(tokens))
+
+					for i := 0; i < commonLength; i++ {
+						assert.Equalf(t, testcase.ExpectedTokens[i].TokenType, tokens[i].Type.Name, "Token type at position %d", i)
+						assert.Equalf(t, testcase.ExpectedTokens[i].TokenValue, tokens[i].RawValue, "Token value at position %d", i)
+					}
+
+					// On failure, log the whole token stream so the mismatch
+					// can be located in context.
+					if t.Failed() {
+						for i := 0; i < commonLength; i++ {
+							if testcase.ExpectedTokens[i].TokenType != tokens[i].Type.Name || testcase.ExpectedTokens[i].TokenValue != tokens[i].RawValue {
+								t.Logf("Mismatched token at position %d: expected %s(%s), got %s(%s)", i, testcase.ExpectedTokens[i].TokenType, testcase.ExpectedTokens[i].TokenValue, tokens[i].Type.Name, tokens[i].RawValue)
+							} else {
+								t.Logf("Matching token at position %d: %s(%s)", i, tokens[i].Type.Name, tokens[i].RawValue)
+							}
+						}
+					}
+				})
+			}
+		})
+	}
+}
+
 func FuzzLex(f *testing.F) {
 	f.Add("SELECT * FROM tabela")
 	f.Add("SELECT id, name, email FROM customers WHERE age > 21")
diff --git a/platform/parsers/sql/lexer/dialects_sqlfluff/ansi/rule_test.go b/platform/parsers/sql/lexer/dialects_sqlfluff/ansi/rule_test.go
index 28d99486c..0cae3dff6 100644
--- a/platform/parsers/sql/lexer/dialects_sqlfluff/ansi/rule_test.go
+++ b/platform/parsers/sql/lexer/dialects_sqlfluff/ansi/rule_test.go
@@ -4,11 +4,51 @@
 package ansi
 
 import (
+	"testing"
+
 	"github.com/QuesmaOrg/quesma/platform/parsers/sql/lexer/core"
+	"github.com/QuesmaOrg/quesma/platform/parsers/sql/lexer/testutils"
+
 	"github.com/stretchr/testify/assert"
-	"testing"
 )
 
+// TestSqlfluffAnsiTestcases replays recorded queries through our port of the
+// sqlfluff ANSI lexer and compares the result, token by token, with the
+// output of the original sqlfluff lexer.
+func TestSqlfluffAnsiTestcases(t *testing.T) {
+	testfiles := []string{
+		"../../../testdata/testdata/dialects_sqlfluff/parsed-sqlfluff-ansi-testcases.txt",
+		"../../../testdata/testdata/dialects_sqlfluff/parsed-sqlparse-testcases.txt",
+	}
+
+	for _, testfile := range testfiles {
+		t.Run(testfile, func(t *testing.T) {
+			testcases := testutils.LoadParsedTestcases(testfile)
+			for _, testcase := range testcases {
+				t.Run(testcase.Query, func(t *testing.T) {
+					// Drop the last expected token: it is the EOF marker that
+					// sqlfluff emits but our lexer does not.
+					expectedTokens := testcase.ExpectedTokens[:len(testcase.ExpectedTokens)-1]
+
+					tokens := core.Lex(testcase.Query, SqlfluffAnsiRules)
+					assert.Equal(t, len(expectedTokens), len(tokens))
+
+					commonLength := min(len(expectedTokens), len(tokens))
+
+					for i := 0; i < commonLength; i++ {
+						assert.Equalf(t, expectedTokens[i].TokenType, tokens[i].Type.Name, "Token type at position %d", i)
+						assert.Equalf(t, expectedTokens[i].TokenValue, tokens[i].RawValue, "Token value at position %d", i)
+					}
+
+					// On failure, log the whole token stream so the mismatch
+					// can be located in context.
+					if t.Failed() {
+						for i := 0; i < commonLength; i++ {
+							if expectedTokens[i].TokenType != tokens[i].Type.Name || expectedTokens[i].TokenValue != tokens[i].RawValue {
+								t.Logf("Mismatched token at position %d: expected %s(%s), got %s(%s)", i, expectedTokens[i].TokenType, expectedTokens[i].TokenValue, tokens[i].Type.Name, tokens[i].RawValue)
+							} else {
+								t.Logf("Matching token at position %d: %s(%s)", i, tokens[i].Type.Name, tokens[i].RawValue)
+							}
+						}
+					}
+				})
+			}
+		})
+	}
+}
+
 func FuzzLex(f *testing.F) {
 	f.Add("SELECT * FROM tabela")
 	f.Add("SELECT id, name, email FROM customers WHERE age > 21")
diff --git a/platform/parsers/sql/lexer/testutils/testdata_loader.go b/platform/parsers/sql/lexer/testutils/testdata_loader.go
new file mode 100644
index 000000000..4873bfd5c
--- /dev/null
+++ b/platform/parsers/sql/lexer/testutils/testdata_loader.go
@@ -0,0 +1,65 @@
+// Copyright Quesma, licensed under the Elastic License 2.0.
+// SPDX-License-Identifier: Elastic-2.0
+
+package testutils
+
+import (
+	"bytes"
+	"os"
+)
+
+type ParsedTestcase struct {
+	Query          string
+	ExpectedTokens []ExpectedToken
+}
+
+type ExpectedToken struct {
+	TokenType  string
+	TokenValue string
+}
+
+// LoadParsedTestcases loads a list of test queries and their expected tokens
+// (recorded from the original sqlparse/sqlfluff lexers). A single blank line
+// separates a query from its tokens and the tokens from each other; two
+// blank lines separate consecutive testcases. The structure of the file is
+// as follows:
+//
+// [QUERY1]
+//
+// [TOKEN_TYPE_1]
+// [TOKEN_VALUE_1]
+//
+// [TOKEN_TYPE_2]
+// [TOKEN_VALUE_2]
+//
+// ...
+//
+//
+// [QUERY2]
+// ...
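+//
+// For example, a file containing a single one-token testcase might look
+// like this (the token type name is purely illustrative):
+//
+// foo
+//
+// Token.Name
+// foo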
+func LoadParsedTestcases(filename string) []ParsedTestcase {
+	contents, err := os.ReadFile(filename)
+	if err != nil {
+		panic(err)
+	}
+
+	// Testcases are separated by two blank lines; the file ends with a
+	// trailing separator, so drop the empty last element.
+	testcases := bytes.Split(contents, []byte("\n\n\n"))
+	testcases = testcases[:len(testcases)-1]
+
+	var parsedTestcases []ParsedTestcase
+	for _, testcase := range testcases {
+		// A single blank line separates the query from its token list.
+		endOfQuerySplit := bytes.SplitN(testcase, []byte("\n\n"), 2)
+
+		query := string(endOfQuerySplit[0])
+
+		// Tokens are separated from each other by single blank lines as well
+		// (this assumes token values never contain blank-line runs).
+		tokens := bytes.Split(endOfQuerySplit[1], []byte("\n\n"))
+
+		var expectedTokens []ExpectedToken
+		for _, tokenDescription := range tokens {
+			// Each token entry is the token type on its first line, followed
+			// by the raw token value (which may itself span several lines).
+			tokenDescriptionSplit := bytes.SplitN(tokenDescription, []byte("\n"), 2)
+			tokenType := string(tokenDescriptionSplit[0])
+			tokenValue := string(tokenDescriptionSplit[1])
+			expectedTokens = append(expectedTokens, ExpectedToken{tokenType, tokenValue})
+		}
+
+		parsedTestcases = append(parsedTestcases, ParsedTestcase{query, expectedTokens})
+	}
+	return parsedTestcases
+}
diff --git a/platform/parsers/sql/testdata b/platform/parsers/sql/testdata
new file mode 160000
index 000000000..e7995a72b
--- /dev/null
+++ b/platform/parsers/sql/testdata
@@ -0,0 +1 @@
+Subproject commit e7995a72b4eda7d836bbd825996dc8642bb9fe15
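
For reviewers who want to inspect the testdata format locally, here is a
minimal sketch of using the loader outside the test suite. The file path is
hypothetical and assumes the testdata submodule has been checked out; it only
uses the LoadParsedTestcases API added in this patch:

	package main

	import (
		"fmt"

		"github.com/QuesmaOrg/quesma/platform/parsers/sql/lexer/testutils"
	)

	func main() {
		// Hypothetical path; any of the parsed-*-testcases.txt files from
		// the testdata submodule can be passed here.
		testcases := testutils.LoadParsedTestcases("platform/parsers/sql/testdata/testdata/dialect_sqlparse/parsed-sqlparse-testcases.txt")
		fmt.Printf("loaded %d testcases\n", len(testcases))

		// Dump the first query and its expected token stream.
		tc := testcases[0]
		fmt.Printf("query: %q\n", tc.Query)
		for i, tok := range tc.ExpectedTokens {
			fmt.Printf("%3d: %s(%q)\n", i, tok.TokenType, tok.TokenValue)
		}
	}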