Add tests of lexer dialects (sqlparse, sqlfluff)
This PR adds tests of our ports of dialects from sqlparse and sqlfluff.

The test files are the SQL queries extracted from the test suites of
sqlparse and sqlfluff. The expected outputs (tokens) are collected by
running those queries through the original sqlparse and sqlfluff lexers.

This gives us very good test coverage (>10k SQL queries, including many
edge-case/"strange"/"tricky" queries) and lets us be almost certain that
our implementation behaves identically to the original sqlparse/sqlfluff
implementations.

The test files are stored in a separate repo (added to Quesma as a Git
submodule), because they are large (~10MB).
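
For illustration, each collected test case pairs one query with the reference
lexer's token stream, stored as alternating token-type/token-value lines. A
minimal sketch of one entry (the token name follows sqlparse's naming
conventions; illustrative only, not copied from the test files):

SELECT
<end_of_query/>
Token.Keyword.DML
SELECT
<end_of_token/>
<end_of_tokens/>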
avelanarius committed Feb 28, 2025
1 parent 199bcea commit dbf833f
Showing 5 changed files with 149 additions and 2 deletions.
3 changes: 3 additions & 0 deletions .gitmodules
@@ -0,0 +1,3 @@
[submodule "platform/parsers/sql/testdata"]
path = platform/parsers/sql/testdata
url = https://github.com/avelanarius/quesma-testdata-wip.git
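
Because the test data lives in a submodule, a fresh checkout needs it
initialized before the new tests can find their input files. For example,
with standard git/go commands (paths as in this PR):

git submodule update --init platform/parsers/sql/testdata
go test ./platform/parsers/sql/lexer/...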
40 changes: 39 additions & 1 deletion platform/parsers/sql/lexer/dialect_sqlparse/rule_test.go
@@ -4,9 +4,11 @@
package dialect_sqlparse

import (
-	"github.com/QuesmaOrg/quesma/platform/parsers/sql/lexer/core"
	"testing"

+	"github.com/QuesmaOrg/quesma/platform/parsers/sql/lexer/core"
+	"github.com/QuesmaOrg/quesma/platform/parsers/sql/lexer/testutils"
+
	"github.com/stretchr/testify/assert"
)

@@ -38,6 +40,42 @@ func TestSimpleSelect(t *testing.T) {
	assert.Equal(t, "tabela", tokens[6].RawValue)
}

func TestSqlparsedTestcases(t *testing.T) {
	testfiles := []string{
		"../../testdata/testdata/dialect_sqlparse/parsed-sqlparse-testcases.txt",
		"../../testdata/testdata/dialect_sqlparse/parsed-sqlfluff-all-testcases.txt",
	}

	for _, testfile := range testfiles {
		t.Run(testfile, func(t *testing.T) {
			testcases := testutils.LoadParsedTestcases(testfile)
			for _, testcase := range testcases {
				t.Run(testcase.Query, func(t *testing.T) {
					tokens := core.Lex(testcase.Query, SqlparseRules)
					assert.Equal(t, len(testcase.ExpectedTokens), len(tokens))

					// Compare pairwise up to the shorter of the two streams,
					// so a length mismatch still yields positional diagnostics.
					commonLength := min(len(testcase.ExpectedTokens), len(tokens))

					for i := 0; i < commonLength; i++ {
						assert.Equalf(t, testcase.ExpectedTokens[i].TokenType, tokens[i].Type.Name, "Token type at position %d", i)
						assert.Equalf(t, testcase.ExpectedTokens[i].TokenValue, tokens[i].RawValue, "Token value at position %d", i)
					}

					// On failure, log the full token-by-token comparison to make
					// the first mismatch easy to locate.
					if t.Failed() {
						for i := 0; i < commonLength; i++ {
							if testcase.ExpectedTokens[i].TokenType != tokens[i].Type.Name || testcase.ExpectedTokens[i].TokenValue != tokens[i].RawValue {
								t.Logf("Mismatch token at position %d: %s(%s). Got: %s(%s)", i, testcase.ExpectedTokens[i].TokenType, testcase.ExpectedTokens[i].TokenValue, tokens[i].Type.Name, tokens[i].RawValue)
							} else {
								t.Logf("Expected token at position %d: %s(%s). Got: %s(%s)", i, testcase.ExpectedTokens[i].TokenType, testcase.ExpectedTokens[i].TokenValue, tokens[i].Type.Name, tokens[i].RawValue)
							}
						}
					}
				})
			}
		})
	}
}

func FuzzLex(f *testing.F) {
	f.Add("SELECT * FROM tabela")
	f.Add("SELECT id, name, email FROM customers WHERE age > 21")
42 changes: 41 additions & 1 deletion platform/parsers/sql/lexer/dialects_sqlfluff/ansi/rule_test.go
@@ -4,11 +4,51 @@
package ansi

import (
+	"testing"
+
	"github.com/QuesmaOrg/quesma/platform/parsers/sql/lexer/core"
+	"github.com/QuesmaOrg/quesma/platform/parsers/sql/lexer/testutils"
	"github.com/stretchr/testify/assert"
-	"testing"
)

func TestSqlfluffAnsiTestcases(t *testing.T) {
	testfiles := []string{
		"../../../testdata/testdata/dialects_sqlfluff/parsed-sqlfluff-ansi-testcases.txt",
		"../../../testdata/testdata/dialects_sqlfluff/parsed-sqlparse-testcases.txt",
	}

	for _, testfile := range testfiles {
		t.Run(testfile, func(t *testing.T) {
			testcases := testutils.LoadParsedTestcases(testfile)
			for _, testcase := range testcases {
				t.Run(testcase.Query, func(t *testing.T) {
					// Remove the last expected token, which is an EOF token.
					expectedTokens := testcase.ExpectedTokens[:len(testcase.ExpectedTokens)-1]

					tokens := core.Lex(testcase.Query, SqlfluffAnsiRules)
					assert.Equal(t, len(expectedTokens), len(tokens))

					// Compare pairwise up to the shorter of the two streams.
					commonLength := min(len(expectedTokens), len(tokens))

					for i := 0; i < commonLength; i++ {
						assert.Equalf(t, expectedTokens[i].TokenType, tokens[i].Type.Name, "Token type at position %d", i)
						assert.Equalf(t, expectedTokens[i].TokenValue, tokens[i].RawValue, "Token value at position %d", i)
					}

					// On failure, log the full token-by-token comparison.
					if t.Failed() {
						for i := 0; i < commonLength; i++ {
							if expectedTokens[i].TokenType != tokens[i].Type.Name || expectedTokens[i].TokenValue != tokens[i].RawValue {
								t.Logf("Mismatch token at position %d: %s(%s). Got: %s(%s)", i, expectedTokens[i].TokenType, expectedTokens[i].TokenValue, tokens[i].Type.Name, tokens[i].RawValue)
							} else {
								t.Logf("Expected token at position %d: %s(%s). Got: %s(%s)", i, expectedTokens[i].TokenType, expectedTokens[i].TokenValue, tokens[i].Type.Name, tokens[i].RawValue)
							}
						}
					}
				})
			}
		})
	}
}

func FuzzLex(f *testing.F) {
	f.Add("SELECT * FROM tabela")
	f.Add("SELECT id, name, email FROM customers WHERE age > 21")
65 changes: 65 additions & 0 deletions platform/parsers/sql/lexer/testutils/testdata_loader.go
@@ -0,0 +1,65 @@
// Copyright Quesma, licensed under the Elastic License 2.0.
// SPDX-License-Identifier: Elastic-2.0

package testutils

import (
	"bytes"
	"os"
)

type ParsedTestcase struct {
	Query          string
	ExpectedTokens []ExpectedToken
}

type ExpectedToken struct {
	TokenType  string
	TokenValue string
}

// LoadParsedTestcases loads a list of test queries and their expected tokens
// (extracted from existing parsers).
// The structure of the file is as follows:
//
// [QUERY1]
// <end_of_query/>
// [TOKEN_TYPE_1]
// [TOKEN_VALUE_1]
// <end_of_token/>
// [TOKEN_TYPE_2]
// [TOKEN_VALUE_2]
// <end_of_token/>
// ...
// <end_of_tokens/>
// [QUERY2]
// ...
func LoadParsedTestcases(filename string) []ParsedTestcase {
	contents, err := os.ReadFile(filename)
	if err != nil {
		panic(err) // test helper: fail loudly if the submodule data is missing
	}

	// Each testcase ends with an <end_of_tokens/> sentinel line, so the
	// final split element is an empty trailing chunk; drop it.
	testcases := bytes.Split(contents, []byte("\n<end_of_tokens/>\n"))
	testcases = testcases[:len(testcases)-1]

	var parsedTestcases []ParsedTestcase
	for _, testcase := range testcases {
		endOfQuerySplit := bytes.Split(testcase, []byte("\n<end_of_query/>\n"))

		query := string(endOfQuerySplit[0])

		tokens := bytes.Split(endOfQuerySplit[1], []byte("\n<end_of_token/>\n"))
		tokens = tokens[:len(tokens)-1]

		var expectedTokens []ExpectedToken
		for _, tokenDescription := range tokens {
			// The first line is the token type; everything after the first
			// newline is the raw token value (which may itself span lines).
			tokenDescriptionSplit := bytes.SplitN(tokenDescription, []byte("\n"), 2)
			tokenType := string(tokenDescriptionSplit[0])
			tokenValue := string(tokenDescriptionSplit[1])
			expectedTokens = append(expectedTokens, ExpectedToken{tokenType, tokenValue})
		}

		parsedTestcases = append(parsedTestcases, ParsedTestcase{query, expectedTokens})
	}
	return parsedTestcases
}
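
A minimal usage sketch of the loader (illustrative, not part of the diff; the
path assumes the submodule is initialized and the process runs from the
repository root):

package main

import (
	"fmt"

	"github.com/QuesmaOrg/quesma/platform/parsers/sql/lexer/testutils"
)

func main() {
	// Load the sqlparse-derived testcases and print the first one.
	cases := testutils.LoadParsedTestcases("platform/parsers/sql/testdata/testdata/dialect_sqlparse/parsed-sqlparse-testcases.txt")
	fmt.Printf("loaded %d testcases\n", len(cases))

	tc := cases[0]
	fmt.Printf("query: %q\n", tc.Query)
	for _, tok := range tc.ExpectedTokens {
		fmt.Printf("  %s(%q)\n", tok.TokenType, tok.TokenValue)
	}
}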
1 change: 1 addition & 0 deletions platform/parsers/sql/testdata
Submodule testdata added at e7995a
