Replace numberRegex option with underscoresInNumbers option

nene · nene · commit 640c982fbd15 · 2025-08-15T10:25:54.000+03:00
This keeps both number regexes together in the Tokenizer, instead of repeating the underscore-aware regex in each dialect that needs it.
diff --git a/src/languages/duckdb/duckdb.formatter.ts b/src/languages/duckdb/duckdb.formatter.ts
@@ -155,9 +155,7 @@ export const duckdb: DialectOptions = {
     reservedFunctionNames: functions,
     nestedBlockComments: true,
     extraParens: ['[]', '{}'],
-    // Support underscore separators in numeric literals (e.g., 1_000_000)
-    numberRegex:
-      /(?:0x[0-9a-fA-F_]+|0b[01_]+|(?:-\s*)?(?:[0-9_]*\.[0-9_]+|[0-9_]+(?:\.[0-9_]*)?)(?:[eE][-+]?[0-9_]+(?:\.[0-9_]+)?)?)(?![\w\p{Alphabetic}])/uy,
+    underscoresInNumbers: true,
     stringTypes: [
       '$$',
       "''-qq",
diff --git a/src/languages/postgresql/postgresql.formatter.ts b/src/languages/postgresql/postgresql.formatter.ts
@@ -277,9 +277,7 @@ export const postgresql: DialectOptions = {
     reservedFunctionNames: functions,
     nestedBlockComments: true,
     extraParens: ['[]'],
-    // Support underscore separators in numeric literals (e.g., 1_000_000)
-    numberRegex:
-      /(?:0x[0-9a-fA-F_]+|0b[01_]+|(?:-\s*)?(?:[0-9_]*\.[0-9_]+|[0-9_]+(?:\.[0-9_]*)?)(?:[eE][-+]?[0-9_]+(?:\.[0-9_]+)?)?)(?![\w\p{Alphabetic}])/uy,
+    underscoresInNumbers: true,
     stringTypes: [
       '$$',
       { quote: "''-qq", prefixes: ['U&'] },
diff --git a/src/lexer/Tokenizer.ts b/src/lexer/Tokenizer.ts
@@ -50,9 +50,9 @@ export default class Tokenizer {
       },
       {
         type: TokenType.NUMBER,
-        regex:
-          cfg.numberRegex ??
-          /(?:0x[0-9a-fA-F]+|0b[01]+|(?:-\s*)?(?:[0-9]*\.[0-9]+|[0-9]+(?:\.[0-9]*)?)(?:[eE][-+]?[0-9]+(?:\.[0-9]+)?)?)(?![\w\p{Alphabetic}])/uy,
+        regex: cfg.underscoresInNumbers
+          ? /(?:0x[0-9a-fA-F_]+|0b[01_]+|(?:-\s*)?(?:[0-9_]*\.[0-9_]+|[0-9_]+(?:\.[0-9_]*)?)(?:[eE][-+]?[0-9_]+(?:\.[0-9_]+)?)?)(?![\w\p{Alphabetic}])/uy
+          : /(?:0x[0-9a-fA-F]+|0b[01]+|(?:-\s*)?(?:[0-9]*\.[0-9]+|[0-9]+(?:\.[0-9]*)?)(?:[eE][-+]?[0-9]+(?:\.[0-9]+)?)?)(?![\w\p{Alphabetic}])/uy,
       },
       // RESERVED_PHRASE is matched before all other keyword tokens
       // to e.g. prioritize matching "TIMESTAMP WITH TIME ZONE" phrase over "WITH" clause.
diff --git a/src/lexer/TokenizerOptions.ts b/src/lexer/TokenizerOptions.ts
@@ -100,8 +100,8 @@ export interface TokenizerOptions {
   propertyAccessOperators?: string[];
   // Enables PostgreSQL-specific OPERATOR(...) syntax
   operatorKeyword?: boolean;
-  // Custom regex pattern for number tokens (defaults to standard SQL number pattern)
-  numberRegex?: RegExp;
+  // True to support underscores in number literals (e.g., 1_000_000)
+  underscoresInNumbers?: boolean;
   // Allows custom modifications on the token array.
   // Called after the whole input string has been split into tokens.
   // The result of this will be the output of the tokenizer.