diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml new file mode 100644 index 00000000..1cde398b --- /dev/null +++ b/.github/workflows/codeql-analysis.yml @@ -0,0 +1,71 @@ +# For most projects, this workflow file will not need changing; you simply need +# to commit it to your repository. +# +# You may wish to alter this file to override the set of languages analyzed, +# or to provide custom queries or build logic. +# +# ******** NOTE ******** +# We have attempted to detect the languages in your repository. Please check +# the `language` matrix defined below to confirm you have the correct set of +# supported CodeQL languages. +# +name: "CodeQL" + +on: + push: + branches: [ master ] + pull_request: + # The branches below must be a subset of the branches above + branches: [ master ] + schedule: + - cron: '25 5 * * 1' + +jobs: + analyze: + name: Analyze + runs-on: ubuntu-latest + permissions: + actions: read + contents: read + security-events: write + + strategy: + fail-fast: false + matrix: + language: [ 'python' ] + # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python' ] + # Learn more: + # https://docs.github.com/en/free-pro-team@latest/github/finding-security-vulnerabilities-and-errors-in-your-code/configuring-code-scanning#changing-the-languages-that-are-analyzed + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + + # Initializes the CodeQL tools for scanning. + - name: Initialize CodeQL + uses: github/codeql-action/init@v2 + with: + languages: ${{ matrix.language }} + # If you wish to specify custom queries, you can do so here or in a config file. + # By default, queries listed here will override any specified in a config file. + # Prefix the list here with "+" to use these queries and those in the config file. + # queries: ./path/to/local/query, your-org/your-repo/queries@main + + # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). + # If this step fails, then you should remove it and run the build manually (see below) + - name: Autobuild + uses: github/codeql-action/autobuild@v2 + + # ℹ️ Command-line programs to run using the OS shell. 
+ # 📚 https://git.io/JvXDl + + # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines + # and modify them (or add more) to build your code if your project + # uses a compiled language + + #- run: | + # make bootstrap + # make release + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v2 diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml new file mode 100644 index 00000000..3033af97 --- /dev/null +++ b/.github/workflows/python-app.yml @@ -0,0 +1,38 @@ +# This workflow will install Python dependencies, run tests and lint with a single version of Python +# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions + +name: Python application + +on: + push: + branches: + - master + pull_request: + branches: [ master ] + schedule: + - cron: '0 12 * * *' + +jobs: + build: + + runs-on: ubuntu-20.04 # keep it on 20.04 to have Python 3.5 and 3.6 available + strategy: + matrix: + python-version: ["3.5", "3.6", "3.7", "3.8", "3.9", "3.10", "3.11", "3.12-dev"] + + steps: + - uses: actions/checkout@v3 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip flit + flit install --deps=develop + - name: Lint with flake8 + run: flake8 sqlparse --count --max-complexity=31 --show-source --statistics + - name: Test with pytest + run: pytest --cov=sqlparse + - name: Publish to codecov + uses: codecov/codecov-action@v3 diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index b9774a23..00000000 --- a/.travis.yml +++ /dev/null @@ -1,29 +0,0 @@ -language: python -python: - - "3.5" - - "3.6" - - "3.7" - - "3.8" - - "3.9-dev" - - "nightly" - - "pypy3" - -matrix: - include: - - stage: codecheck - python: 3.8 - install: - - pip install flake8 - script: - - flake8 sqlparse - after_success: skip - -install: - - pip install -Uq pytest pytest-cov codecov - - pytest --version - -script: - - pytest --cov=sqlparse - -after_success: - - codecov diff --git a/AUTHORS b/AUTHORS index 2e31ae0d..1717adff 100644 --- a/AUTHORS +++ b/AUTHORS @@ -8,6 +8,7 @@ project: https://bitbucket.org/gutworth/six. 
Alphabetical list of contributors:
 
 * Adam Greenhall
+* Aki Ariga
 * Alexander Beedie
 * Alexey Malyshev
 * ali-tny
@@ -16,20 +17,24 @@ Alphabetical list of contributors:
 * atronah
 * casey
 * Cauê Beloni
+* Christian Clauss
 * circld
 * Corey Zumar
 * Cristian Orellana
 * Dag Wieers
+* Daniel Harding
 * Darik Gamble
 * Demetrio92
 * Dennis Taylor
 * Dvořák Václav
+* Erik Cederstrand
 * Florian Bauer
 * Fredy Wijaya
 * Gavin Wahl
 * hurcy
 * Ian Robertson
 * JacekPliszka
+* JavierPan
 * Jean-Martin Archer
 * Jesús Leganés Combarro "Piranna"
 * Johannes Hoff
@@ -39,11 +44,13 @@
 * Kevin Jing Qiu
 * koljonen
 * Likai Liu
+* Long Le Xich
 * mathilde.oustlant
 * Michael Schuller
 * Mike Amy
 * mulos
 * Oleg Broytman
+* osmnv <80402144+osmnv@users.noreply.github.com>
 * Patrick Schemitz
 * Pi Delport
 * Prudhvi Vatala
@@ -55,6 +62,7 @@
 * Ryan Wooden
 * saaj
 * Shen Longxing
+* Simon Heisterkamp
 * Sjoerd Job Postmus
 * Soloman Weng
 * spigwitmer
diff --git a/CHANGELOG b/CHANGELOG
index 65e03fce..a42577e1 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,53 @@
+Release 0.4.4 (Apr 18, 2023)
+----------------------------
+
+Notable Changes
+
+* IMPORTANT: This release fixes a security vulnerability in the
+  parser where a regular expression vulnerable to ReDoS (Regular
+  Expression Denial of Service) was used. See the security advisory
+  for details: https://github.com/andialbrecht/sqlparse/security/advisories/GHSA-rrm6-wvj7-cwh2
+  The vulnerability was discovered by @erik-krogh from GitHub
+  Security Lab (GHSL). Thanks for reporting!
+
+Bug Fixes
+
+* Revert a change from 0.4.0 that changed IN to be a comparison (issue694).
+  The primary expectation is that IN is treated as a keyword and not as a
+  comparison operator. That also follows the definition of reserved keywords
+  for the major SQL syntax definitions.
+* Fix regular expressions for string parsing.
+
+Other
+
+* sqlparse now uses pyproject.toml instead of setup.cfg (issue685).
+
+
+Release 0.4.3 (Sep 23, 2022)
+----------------------------
+
+Enhancements
+
+* Add support for DIV operator (pr664, by chezou).
+* Add support for additional SPARK keywords (pr643, by mrmasterplan).
+* Avoid tokens copy (pr622, by living180).
+* Add REGEXP as a comparison (pr647, by PeterSandwich).
+* Add DISTINCTROW keyword for MS Access (issue677).
+* Improve parsing of CREATE TABLE AS SELECT (pr662, by chezou).
+
+Bug Fixes
+
+* Fix spelling of INDICATOR keyword (pr653, by ptld).
+* Fix formatting error in EXTRACT function (issue562, issue670, pr676, by ecederstrand).
+* Fix bad parsing of create table statements that use lower case (issue217, pr642, by mrmasterplan).
+* Handle backtick as valid quote char (issue628, pr629, by codenamelxl).
+* Allow any unicode character as valid identifier name (issue641).
+
+Other
+
+* Update GitHub Actions to test on Python 3.10 as well (pr661, by cclaus).
+
+
 Release 0.4.2 (Sep 10, 2021)
 ----------------------------
@@ -78,7 +128,7 @@ Bug Fixes
 * Remove support for parsing double slash comments introduced in 0.3.0
   (issue456) as it had some side-effects with other dialects and
   doesn't seem to be widely used (issue476).
-* Restrict detection of alias names to objects that acutally could
+* Restrict detection of alias names to objects that actually could
   have an alias (issue455, adopted some parts of pr509 by john-bodley).
 * Fix parsing of date/time literals (issue438, by vashek).
 * Fix initialization of TokenList (issue499, pr505 by john-bodley).
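A quick illustration of the reverted ``IN`` behavior described in the 0.4.4 notes
above. This is not part of the diff; it is a minimal sketch using only the public
``sqlparse`` API, with an arbitrary example statement:

.. code-block:: python

    import sqlparse
    from sqlparse import sql, tokens as T

    # Since 0.4.4, IN is tokenized as a plain keyword again instead of
    # being grouped into a sql.Comparison node.
    stmt = sqlparse.parse('a in (1, 2)')[0]

    # No Comparison grouping appears at the top level of the statement...
    assert not any(isinstance(tok, sql.Comparison) for tok in stmt.tokens)

    # ...and the IN token itself carries the plain Keyword token type.
    in_token = next(tok for tok in stmt.flatten() if tok.normalized == 'IN')
    assert in_token.ttype is T.Keyword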
diff --git a/MANIFEST.in b/MANIFEST.in
deleted file mode 100644
index 8043b359..00000000
--- a/MANIFEST.in
+++ /dev/null
@@ -1,11 +0,0 @@
-recursive-include docs source/*
-include docs/sqlformat.1
-include docs/Makefile
-recursive-include tests *.py *.sql
-include LICENSE
-include TODO
-include AUTHORS
-include CHANGELOG
-include Makefile
-include setup.cfg
-include tox.ini
diff --git a/Makefile b/Makefile
index ee35e546..1657822e 100644
--- a/Makefile
+++ b/Makefile
@@ -22,5 +22,5 @@ clean:
 
 release:
 	@rm -rf dist/
-	python setup.py sdist bdist_wheel
+	python -m build
 	twine upload --sign --identity E0B84F81 dist/*
diff --git a/README.rst b/README.rst
index 92e15c18..df4e7e36 100644
--- a/README.rst
+++ b/README.rst
@@ -4,6 +4,7 @@ python-sqlparse - Parse SQL statements
 
 |buildstatus|_
 |coverage|_
 |docs|_
+|packageversion|_
 
 .. docincludebegin
 
@@ -59,7 +60,7 @@ Documentation
   https://sqlparse.readthedocs.io/
 
 Online Demo
-  https://sqlformat.org/ 
+  https://sqlformat.org/
 
 sqlparse is licensed under the BSD license.
 
@@ -67,9 +68,11 @@ sqlparse is licensed under the BSD license.
 Parts of the code are based on pygments written by Georg Brandl and
 others. pygments-Homepage: http://pygments.org/
-.. |buildstatus| image:: https://secure.travis-ci.org/andialbrecht/sqlparse.png?branch=master
-.. _buildstatus: https://travis-ci.org/#!/andialbrecht/sqlparse
+.. |buildstatus| image:: https://github.com/andialbrecht/sqlparse/actions/workflows/python-app.yml/badge.svg
+.. _buildstatus: https://github.com/andialbrecht/sqlparse/actions/workflows/python-app.yml
 .. |coverage| image:: https://codecov.io/gh/andialbrecht/sqlparse/branch/master/graph/badge.svg
 .. _coverage: https://codecov.io/gh/andialbrecht/sqlparse
 .. |docs| image:: https://readthedocs.org/projects/sqlparse/badge/?version=latest
 .. _docs: https://sqlparse.readthedocs.io/en/latest/?badge=latest
+.. |packageversion| image:: https://img.shields.io/pypi/v/sqlparse?color=%2334D058&label=pypi%20package
+.. _packageversion: https://pypi.org/project/sqlparse
diff --git a/docs/source/extending.rst b/docs/source/extending.rst
new file mode 100644
index 00000000..0c10924b
--- /dev/null
+++ b/docs/source/extending.rst
@@ -0,0 +1,76 @@
+Extending :mod:`sqlparse`
+=========================
+
+.. module:: sqlparse
+    :synopsis: Extending parsing capability of sqlparse.
+
+The :mod:`sqlparse` module uses a SQL grammar that has been tuned through usage and
+numerous PRs to fit a broad range of SQL syntaxes, but it cannot cater to every given
+case since some SQL dialects have adopted conflicting meanings of certain keywords.
+Sqlparse therefore exposes a mechanism to configure the fundamental keywords and
+regular expressions that parse the language, as described below.
+
+If you find an adaptation that works for your specific use case, please consider
+contributing it back to the community by opening a PR on
+`GitHub <https://github.com/andialbrecht/sqlparse>`_.
+
+Configuring the Lexer
+---------------------
+
+The lexer is a singleton class that breaks down the stream of characters into language
+tokens. It does this by using a sequence of regular expressions and keywords that are
+listed in the file ``sqlparse.keywords``. Instead of applying these fixed grammar
+definitions directly, the lexer loads them in its ``default_initialization()``
+method. As an API user, you can adapt the Lexer configuration by
+applying your own configuration logic.
To do so, start out by clearing the previous
+configuration with ``.clear()``, then load the regular expression list with
+``.set_SQL_REGEX(SQL_REGEX)``, and load keyword dictionaries with ``.add_keywords(KEYWORDS)``.
+
+You can do this by re-using the expressions in ``sqlparse.keywords`` (see example below),
+leaving parts out, or by making up your own master list.
+
+See the expected types of the arguments by inspecting their structure in
+``sqlparse.keywords``.
+(For compatibility with Python 3.5, this library does not use type hints.)
+
+The following example adds support for the expression ``ZORDER BY``, and adds ``BAR`` as
+a keyword to the lexer:
+
+.. code-block:: python
+
+    import re
+
+    import sqlparse
+    from sqlparse import keywords
+    from sqlparse.lexer import Lexer
+
+    # get the lexer singleton object to configure it
+    lex = Lexer.get_default_instance()
+
+    # Clear the default configurations.
+    # After this call, regexps and keyword dictionaries need to be loaded
+    # to make the lexer functional again.
+    lex.clear()
+
+    my_regex = (r"ZORDER\s+BY\b", sqlparse.tokens.Keyword)
+
+    # slice the default SQL_REGEX to inject the custom object
+    lex.set_SQL_REGEX(
+        keywords.SQL_REGEX[:38]
+        + [my_regex]
+        + keywords.SQL_REGEX[38:]
+    )
+
+    # add the default keyword dictionaries
+    lex.add_keywords(keywords.KEYWORDS_COMMON)
+    lex.add_keywords(keywords.KEYWORDS_ORACLE)
+    lex.add_keywords(keywords.KEYWORDS_PLPGSQL)
+    lex.add_keywords(keywords.KEYWORDS_HQL)
+    lex.add_keywords(keywords.KEYWORDS_MSACCESS)
+    lex.add_keywords(keywords.KEYWORDS)
+
+    # add a custom keyword dictionary
+    lex.add_keywords({'BAR': sqlparse.tokens.Keyword})
+
+    # No configuration is passed here; the lexer is used as a singleton.
+    sqlparse.parse("select * from foo zorder by bar;")
diff --git a/docs/source/index.rst b/docs/source/index.rst
index cba33141..e18d2b3c 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -20,6 +20,7 @@ Contents
    api
    analyzing
    ui
+   extending
    changes
    license
    indices
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 00000000..338a53ce
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,70 @@
+[build-system]
+requires = ["flit_core >=3.2,<4"]
+build-backend = "flit_core.buildapi"
+
+[project]
+name = "sqlparse"
+description = "A non-validating SQL parser."
+authors = [{name = "Andi Albrecht", email = "albrecht.andi@gmail.com"}] +readme = "README.rst" +dynamic = ["version"] +classifiers = [ + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Developers", + "License :: OSI Approved :: BSD License", + "Operating System :: OS Independent", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: 3.5", + "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: Implementation :: PyPy", + "Topic :: Database", + "Topic :: Software Development", +] +requires-python = ">=3.5" + +[project.urls] +Home = "https://github.com/andialbrecht/sqlparse" +Documentation = "https://sqlparse.readthedocs.io/" +"Release Notes" = "https://sqlparse.readthedocs.io/en/latest/changes/" +Source = "https://github.com/andialbrecht/sqlparse" +Tracker = "https://github.com/andialbrecht/sqlparse/issues" + +[project.scripts] +sqlformat = "sqlparse.__main__:main" + +[project.optional-dependencies] +dev = [ + "flake8", + "build", +] +test = [ + "pytest", + "pytest-cov", +] +doc = [ + "sphinx", +] + +[tool.flit.sdist] +include = [ + "docs/source/", + "docs/sqlformat.1", + "docs/Makefile", + "tests/*.py", "tests/files/*.sql", + "LICENSE", + "TODO", + "AUTHORS", + "CHANGELOG", + "Makefile", + "tox.ini", +] + +[tool.coverage.run] +omit = ["sqlparse/__main__.py"] diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index a9127c01..00000000 --- a/setup.cfg +++ /dev/null @@ -1,54 +0,0 @@ -[metadata] -name = sqlparse -version = attr: sqlparse.__version__ -url = https://github.com/andialbrecht/sqlparse -author = Andi Albrecht -author_email = albrecht.andi@gmail.com -description = A non-validating SQL parser. 
-long_description = file: README.rst -license = BSD-3-Clause -classifiers = - Development Status :: 5 - Production/Stable - Intended Audience :: Developers - License :: OSI Approved :: BSD License - Operating System :: OS Independent - Programming Language :: Python - Programming Language :: Python :: 3 - Programming Language :: Python :: 3 :: Only - Programming Language :: Python :: 3.5 - Programming Language :: Python :: 3.6 - Programming Language :: Python :: 3.7 - Programming Language :: Python :: 3.8 - Programming Language :: Python :: 3.9 - Programming Language :: Python :: Implementation :: CPython - Programming Language :: Python :: Implementation :: PyPy - Topic :: Database - Topic :: Software Development -project_urls = - Documentation = https://sqlparse.readthedocs.io/ - Release Notes = https://sqlparse.readthedocs.io/en/latest/changes/ - Source = https://github.com/andialbrecht/sqlparse - Tracker = https://github.com/andialbrecht/sqlparse/issues - -[options] -python_requires = >=3.5 -packages = find: - -[options.packages.find] -exclude = tests - -[options.entry_points] -console_scripts = - sqlformat = sqlparse.__main__:main - -[tool:pytest] -xfail_strict = True - -[flake8] -extend-ignore = - E731 - -[coverage:run] -branch = False -omit = - sqlparse/__main__.py diff --git a/setup.py b/setup.py deleted file mode 100644 index ede0aff8..00000000 --- a/setup.py +++ /dev/null @@ -1,12 +0,0 @@ -#!/usr/bin/env python -# -# Copyright (C) 2009-2020 the sqlparse authors and contributors -# -# -# This setup script is part of python-sqlparse and is released under -# the BSD License: https://opensource.org/licenses/BSD-3-Clause - -from setuptools import setup - - -setup() diff --git a/sqlparse/__init__.py b/sqlparse/__init__.py index 9cab9d2c..122595b3 100644 --- a/sqlparse/__init__.py +++ b/sqlparse/__init__.py @@ -16,7 +16,7 @@ from sqlparse import formatter -__version__ = '0.4.2' +__version__ = '0.4.4' __all__ = ['engine', 'filters', 'formatter', 'sql', 'tokens', 'cli'] diff --git a/sqlparse/engine/grouping.py b/sqlparse/engine/grouping.py index 175ae8e5..86d8fc64 100644 --- a/sqlparse/engine/grouping.py +++ b/sqlparse/engine/grouping.py @@ -91,13 +91,20 @@ def group_tzcasts(tlist): def match(token): return token.ttype == T.Keyword.TZCast - def valid(token): + def valid_prev(token): return token is not None + def valid_next(token): + return token is not None and ( + token.is_whitespace + or token.match(T.Keyword, 'AS') + or token.match(*sql.TypedLiteral.M_CLOSE) + ) + def post(tlist, pidx, tidx, nidx): return pidx, nidx - _group(tlist, sql.Identifier, match, valid, valid, post) + _group(tlist, sql.Identifier, match, valid_prev, valid_next, post) def group_typed_literal(tlist): @@ -334,12 +341,15 @@ def group_aliased(tlist): def group_functions(tlist): has_create = False has_table = False + has_as = False for tmp_token in tlist.tokens: - if tmp_token.value == 'CREATE': + if tmp_token.value.upper() == 'CREATE': has_create = True - if tmp_token.value == 'TABLE': + if tmp_token.value.upper() == 'TABLE': has_table = True - if has_create and has_table: + if tmp_token.value == 'AS': + has_as = True + if has_create and has_table and not has_as: return tidx, token = tlist.token_next_by(t=T.Name) diff --git a/sqlparse/keywords.py b/sqlparse/keywords.py index 68506282..b45f3e0f 100644 --- a/sqlparse/keywords.py +++ b/sqlparse/keywords.py @@ -5,99 +5,92 @@ # This module is part of python-sqlparse and is released under # the BSD License: https://opensource.org/licenses/BSD-3-Clause -import re - from 
sqlparse import tokens - -def is_keyword(value): - val = value.upper() - return (KEYWORDS_COMMON.get(val) - or KEYWORDS_ORACLE.get(val) - or KEYWORDS_PLPGSQL.get(val) - or KEYWORDS_HQL.get(val) - or KEYWORDS.get(val, tokens.Name)), value - - -SQL_REGEX = { - 'root': [ - (r'(--|# )\+.*?(\r\n|\r|\n|$)', tokens.Comment.Single.Hint), - (r'/\*\+[\s\S]*?\*/', tokens.Comment.Multiline.Hint), - - (r'(--|# ).*?(\r\n|\r|\n|$)', tokens.Comment.Single), - (r'/\*[\s\S]*?\*/', tokens.Comment.Multiline), - - (r'(\r\n|\r|\n)', tokens.Newline), - (r'\s+?', tokens.Whitespace), - - (r':=', tokens.Assignment), - (r'::', tokens.Punctuation), - - (r'\*', tokens.Wildcard), - - (r"`(``|[^`])*`", tokens.Name), - (r"´(´´|[^´])*´", tokens.Name), - (r'((?=~!]+', tokens.Operator.Comparison), - (r'[+/@#%^&|^-]+', tokens.Operator), - ]} - -FLAGS = re.IGNORECASE | re.UNICODE -SQL_REGEX = [(re.compile(rx, FLAGS).match, tt) for rx, tt in SQL_REGEX['root']] +# object() only supports "is" and is useful as a marker +# use this marker to specify that the given regex in SQL_REGEX +# shall be processed further through a lookup in the KEYWORDS dictionaries +PROCESS_AS_KEYWORD = object() + + +SQL_REGEX = [ + (r'(--|# )\+.*?(\r\n|\r|\n|$)', tokens.Comment.Single.Hint), + (r'/\*\+[\s\S]*?\*/', tokens.Comment.Multiline.Hint), + + (r'(--|# ).*?(\r\n|\r|\n|$)', tokens.Comment.Single), + (r'/\*[\s\S]*?\*/', tokens.Comment.Multiline), + + (r'(\r\n|\r|\n)', tokens.Newline), + (r'\s+?', tokens.Whitespace), + + (r':=', tokens.Assignment), + (r'::', tokens.Punctuation), + + (r'\*', tokens.Wildcard), + + (r"`(``|[^`])*`", tokens.Name), + (r"´(´´|[^´])*´", tokens.Name), + (r'((?=~!]+', tokens.Operator.Comparison), + (r'[+/@#%^&|^-]+', tokens.Operator), +] KEYWORDS = { 'ABORT': tokens.Keyword, @@ -241,6 +234,7 @@ def is_keyword(value): 'DISABLE': tokens.Keyword, 'DISCONNECT': tokens.Keyword, 'DISPATCH': tokens.Keyword, + 'DIV': tokens.Operator, 'DO': tokens.Keyword, 'DOMAIN': tokens.Keyword, 'DYNAMIC': tokens.Keyword, @@ -314,7 +308,7 @@ def is_keyword(value): 'INCREMENT': tokens.Keyword, 'INDEX': tokens.Keyword, - 'INDITCATOR': tokens.Keyword, + 'INDICATOR': tokens.Keyword, 'INFIX': tokens.Keyword, 'INHERITS': tokens.Keyword, 'INITIAL': tokens.Keyword, @@ -907,6 +901,7 @@ def is_keyword(value): 'INLINE': tokens.Keyword, 'INSTR': tokens.Keyword, 'LEN': tokens.Keyword, + 'MAP': tokens.Name.Builtin, 'MAXELEMENT': tokens.Keyword, 'MAXINDEX': tokens.Keyword, 'MAX_PART_DATE': tokens.Keyword, @@ -938,9 +933,12 @@ def is_keyword(value): 'SQRT': tokens.Keyword, 'STACK': tokens.Keyword, 'STR': tokens.Keyword, + 'STRING': tokens.Name.Builtin, + 'STRUCT': tokens.Name.Builtin, 'SUBSTR': tokens.Keyword, 'SUMMARY': tokens.Keyword, 'TBLPROPERTIES': tokens.Keyword, + 'TIMESTAMP': tokens.Name.Builtin, 'TIMESTAMP_ISO': tokens.Keyword, 'TO_CHAR': tokens.Keyword, 'TO_DATE': tokens.Keyword, @@ -956,3 +954,8 @@ def is_keyword(value): 'BREAK': tokens.Keyword, 'LEAVE': tokens.Keyword, } + + +KEYWORDS_MSACCESS = { + 'DISTINCTROW': tokens.Keyword, +} diff --git a/sqlparse/lexer.py b/sqlparse/lexer.py index 4397f185..9d25c9e6 100644 --- a/sqlparse/lexer.py +++ b/sqlparse/lexer.py @@ -6,6 +6,7 @@ # the BSD License: https://opensource.org/licenses/BSD-3-Clause """SQL Lexer""" +import re # This code is based on the SqlLexer in pygments. 
# http://pygments.org/
@@ -14,18 +15,90 @@
 
 from io import TextIOBase
 
-from sqlparse import tokens
-from sqlparse.keywords import SQL_REGEX
+from sqlparse import tokens, keywords
 from sqlparse.utils import consume
 
 
 class Lexer:
-    """Lexer
-    Empty class. Leaving for backwards-compatibility
-    """
+    """The Lexer supports configurable syntax.
+    To add support for additional keywords, use the `add_keywords` method."""
+
+    _default_instance = None
+
+    # Development notes:
+    # - This class is prepared to be able to support additional SQL dialects
+    #   in the future by adding additional functions that take the place of
+    #   the function default_initialization()
+    # - The lexer class uses an explicit singleton behavior with the
+    #   instance-getter method get_default_instance(). This mechanism has
+    #   the advantage that the call signatures of the entry points to the
+    #   sqlparse library are not affected. Also, usage of sqlparse in third
+    #   party code does not need to be adapted. On the other hand, singleton
+    #   behavior is not thread-safe, and the current implementation does not
+    #   easily allow for multiple SQL dialects to be parsed in the same
+    #   process. Such behavior can be supported in the future by passing a
+    #   suitably initialized lexer object as an additional parameter to the
+    #   entry-point functions (such as `parse`). Code will need to be written
+    #   to pass down and utilize such an object. The current implementation
+    #   is prepared to support this thread-safe approach without the
+    #   default_instance part needing to change its interface.
+
+    @classmethod
+    def get_default_instance(cls):
+        """Returns the lexer instance used internally
+        by the sqlparse core functions."""
+        if cls._default_instance is None:
+            cls._default_instance = cls()
+            cls._default_instance.default_initialization()
+        return cls._default_instance
+
+    def default_initialization(self):
+        """Initialize the lexer with default dictionaries.
+        Useful if you need to revert custom syntax settings."""
+        self.clear()
+        self.set_SQL_REGEX(keywords.SQL_REGEX)
+        self.add_keywords(keywords.KEYWORDS_COMMON)
+        self.add_keywords(keywords.KEYWORDS_ORACLE)
+        self.add_keywords(keywords.KEYWORDS_PLPGSQL)
+        self.add_keywords(keywords.KEYWORDS_HQL)
+        self.add_keywords(keywords.KEYWORDS_MSACCESS)
+        self.add_keywords(keywords.KEYWORDS)
+
+    def clear(self):
+        """Clear all syntax configurations.
+        Useful if you want to load a reduced set of syntax configurations.
+        After this call, regexps and keyword dictionaries need to be loaded
+        to make the lexer functional again."""
+        self._SQL_REGEX = []
+        self._keywords = []
+
+    def set_SQL_REGEX(self, SQL_REGEX):
+        """Set the list of regexes that will parse the SQL."""
+        FLAGS = re.IGNORECASE | re.UNICODE
+        self._SQL_REGEX = [
+            (re.compile(rx, FLAGS).match, tt)
+            for rx, tt in SQL_REGEX
+        ]
+
+    def add_keywords(self, keywords):
+        """Add keyword dictionaries. Keywords are looked up in the same order
+        that dictionaries were added."""
+        self._keywords.append(keywords)
+
+    def is_keyword(self, value):
+        """Checks for a keyword.
+
+        If the given value is in one of the KEYWORDS_* dictionaries
+        it's considered a keyword. Otherwise, tokens.Name is returned.
+        """
+        val = value.upper()
+        for kwdict in self._keywords:
+            if val in kwdict:
+                return kwdict[val], value
+        else:
+            return tokens.Name, value
 
-    @staticmethod
-    def get_tokens(text, encoding=None):
+    def get_tokens(self, text, encoding=None):
         """
         Return an iterable of (tokentype, value) pairs generated from
         `text`.
If `unfiltered` is set to `True`, the filtering mechanism @@ -57,15 +130,15 @@ def get_tokens(text, encoding=None): iterable = enumerate(text) for pos, char in iterable: - for rexmatch, action in SQL_REGEX: + for rexmatch, action in self._SQL_REGEX: m = rexmatch(text, pos) if not m: continue elif isinstance(action, tokens._TokenType): yield action, m.group() - elif callable(action): - yield action(m.group()) + elif action is keywords.PROCESS_AS_KEYWORD: + yield self.is_keyword(m.group()) consume(iterable, m.end() - pos - 1) break @@ -79,4 +152,4 @@ def tokenize(sql, encoding=None): Tokenize *sql* using the :class:`Lexer` and return a 2-tuple stream of ``(token type, value)`` items. """ - return Lexer().get_tokens(sql, encoding) + return Lexer.get_default_instance().get_tokens(sql, encoding) diff --git a/sqlparse/sql.py b/sqlparse/sql.py index 6a32c26a..1ccfbdbe 100644 --- a/sqlparse/sql.py +++ b/sqlparse/sql.py @@ -234,16 +234,16 @@ def _token_matching(self, funcs, start=0, end=None, reverse=False): if reverse: assert end is None - for idx in range(start - 2, -1, -1): - token = self.tokens[idx] - for func in funcs: - if func(token): - return idx, token + indexes = range(start - 2, -1, -1) else: - for idx, token in enumerate(self.tokens[start:end], start=start): - for func in funcs: - if func(token): - return idx, token + if end is None: + end = len(self.tokens) + indexes = range(start, end) + for idx in indexes: + token = self.tokens[idx] + for func in funcs: + if func(token): + return idx, token return None, None def token_first(self, skip_ws=True, skip_cm=False): @@ -413,27 +413,28 @@ def get_type(self): Whitespaces and comments at the beginning of the statement are ignored. """ - first_token = self.token_first(skip_cm=True) - if first_token is None: + token = self.token_first(skip_cm=True) + if token is None: # An "empty" statement that either has not tokens at all # or only whitespace tokens. return 'UNKNOWN' - elif first_token.ttype in (T.Keyword.DML, T.Keyword.DDL): - return first_token.normalized + elif token.ttype in (T.Keyword.DML, T.Keyword.DDL): + return token.normalized - elif first_token.ttype == T.Keyword.CTE: + elif token.ttype == T.Keyword.CTE: # The WITH keyword should be followed by either an Identifier or # an IdentifierList containing the CTE definitions; the actual # DML keyword (e.g. SELECT, INSERT) will follow next. - fidx = self.token_index(first_token) - tidx, token = self.token_next(fidx, skip_ws=True) - if isinstance(token, (Identifier, IdentifierList)): - _, dml_keyword = self.token_next(tidx, skip_ws=True) - - if dml_keyword is not None \ - and dml_keyword.ttype == T.Keyword.DML: - return dml_keyword.normalized + tidx = self.token_index(token) + while tidx is not None: + tidx, token = self.token_next(tidx, skip_ws=True) + if isinstance(token, (Identifier, IdentifierList)): + tidx, token = self.token_next(tidx, skip_ws=True) + + if token is not None \ + and token.ttype == T.Keyword.DML: + return token.normalized # Hmm, probably invalid syntax, so return unknown. 
return 'UNKNOWN' diff --git a/sqlparse/utils.py b/sqlparse/utils.py index 299a84cc..512f0385 100644 --- a/sqlparse/utils.py +++ b/sqlparse/utils.py @@ -55,7 +55,7 @@ def remove_quotes(val): """Helper that removes surrounding quotes from strings.""" if val is None: return - if val[0] in ('"', "'") and val[0] == val[-1]: + if val[0] in ('"', "'", '`') and val[0] == val[-1]: val = val[1:-1] return val diff --git a/tests/test_grouping.py b/tests/test_grouping.py index cf629e9c..03d16c5d 100644 --- a/tests/test_grouping.py +++ b/tests/test_grouping.py @@ -324,6 +324,11 @@ def test_grouping_alias_case(): assert p.tokens[0].get_alias() == 'foo' +def test_grouping_alias_ctas(): + p = sqlparse.parse('CREATE TABLE tbl1 AS SELECT coalesce(t1.col1, 0) AS col1 FROM t1')[0] + assert p.tokens[10].get_alias() == 'col1' + assert isinstance(p.tokens[10].tokens[0], sql.Function) + def test_grouping_subquery_no_parens(): # Not totally sure if this is the right approach... # When a THEN clause contains a subquery w/o parenthesis around it *and* @@ -371,20 +376,10 @@ def test_grouping_function_not_in(): # issue183 p = sqlparse.parse('in(1, 2)')[0] assert len(p.tokens) == 2 - assert p.tokens[0].ttype == T.Comparison + assert p.tokens[0].ttype == T.Keyword assert isinstance(p.tokens[1], sql.Parenthesis) -def test_in_comparison(): - # issue566 - p = sqlparse.parse('a in (1, 2)')[0] - assert len(p.tokens) == 1 - assert isinstance(p.tokens[0], sql.Comparison) - assert len(p.tokens[0].tokens) == 5 - assert p.tokens[0].left.value == 'a' - assert p.tokens[0].right.value == '(1, 2)' - - def test_grouping_varchar(): p = sqlparse.parse('"text" Varchar(50) NOT NULL')[0] assert isinstance(p.tokens[2], sql.Function) @@ -655,3 +650,7 @@ def test_grouping_as_cte(): assert p[0].get_alias() is None assert p[2].value == 'AS' assert p[4].value == 'WITH' + +def test_grouping_create_table(): + p = sqlparse.parse("create table db.tbl (a string)")[0].tokens + assert p[4].value == "db.tbl" diff --git a/tests/test_keywords.py b/tests/test_keywords.py index d4ded4b6..b26e9b45 100644 --- a/tests/test_keywords.py +++ b/tests/test_keywords.py @@ -1,7 +1,7 @@ import pytest from sqlparse import tokens -from sqlparse.keywords import SQL_REGEX +from sqlparse.lexer import Lexer class TestSQLREGEX: @@ -9,5 +9,5 @@ class TestSQLREGEX: '1.', '-1.', '.1', '-.1']) def test_float_numbers(self, number): - ttype = next(tt for action, tt in SQL_REGEX if action(number)) + ttype = next(tt for action, tt in Lexer.get_default_instance()._SQL_REGEX if action(number)) assert tokens.Number.Float == ttype diff --git a/tests/test_parse.py b/tests/test_parse.py index 513b4be9..5feef5a7 100644 --- a/tests/test_parse.py +++ b/tests/test_parse.py @@ -4,7 +4,8 @@ import pytest import sqlparse -from sqlparse import sql, tokens as T +from sqlparse import sql, tokens as T, keywords +from sqlparse.lexer import Lexer def test_parse_tokenize(): @@ -132,6 +133,12 @@ def test_parse_nested_function(): assert type(t[0]) is sql.Function +def test_parse_div_operator(): + p = sqlparse.parse('col1 DIV 5 AS div_col1')[0].tokens + assert p[0].tokens[0].tokens[2].ttype is T.Operator + assert p[0].get_alias() == 'div_col1' + + def test_quoted_identifier(): t = sqlparse.parse('select x.y as "z" from foo')[0].tokens assert isinstance(t[2], sql.Identifier) @@ -142,6 +149,7 @@ def test_quoted_identifier(): @pytest.mark.parametrize('name', [ 'foo', '_foo', # issue175 '1_data', # valid MySQL table name, see issue337 + '業者名稱', # valid at least for SQLite3, see issue641 ]) def 
test_valid_identifier_names(name): t = sqlparse.parse(name)[0].tokens @@ -482,3 +490,79 @@ def test_parenthesis(): T.Newline, T.Newline, T.Punctuation] + + +def test_configurable_keywords(): + sql = """select * from foo BACON SPAM EGGS;""" + tokens = sqlparse.parse(sql)[0] + + assert list( + (t.ttype, t.value) + for t in tokens + if t.ttype not in sqlparse.tokens.Whitespace + ) == [ + (sqlparse.tokens.Keyword.DML, "select"), + (sqlparse.tokens.Wildcard, "*"), + (sqlparse.tokens.Keyword, "from"), + (None, "foo BACON"), + (None, "SPAM EGGS"), + (sqlparse.tokens.Punctuation, ";"), + ] + + Lexer.get_default_instance().add_keywords( + { + "BACON": sqlparse.tokens.Name.Builtin, + "SPAM": sqlparse.tokens.Keyword, + "EGGS": sqlparse.tokens.Keyword, + } + ) + + tokens = sqlparse.parse(sql)[0] + + # reset the syntax for later tests. + Lexer.get_default_instance().default_initialization() + + assert list( + (t.ttype, t.value) + for t in tokens + if t.ttype not in sqlparse.tokens.Whitespace + ) == [ + (sqlparse.tokens.Keyword.DML, "select"), + (sqlparse.tokens.Wildcard, "*"), + (sqlparse.tokens.Keyword, "from"), + (None, "foo"), + (sqlparse.tokens.Name.Builtin, "BACON"), + (sqlparse.tokens.Keyword, "SPAM"), + (sqlparse.tokens.Keyword, "EGGS"), + (sqlparse.tokens.Punctuation, ";"), + ] + + +def test_configurable_regex(): + lex = Lexer.get_default_instance() + lex.clear() + + my_regex = (r"ZORDER\s+BY\b", sqlparse.tokens.Keyword) + + lex.set_SQL_REGEX( + keywords.SQL_REGEX[:38] + + [my_regex] + + keywords.SQL_REGEX[38:] + ) + lex.add_keywords(keywords.KEYWORDS_COMMON) + lex.add_keywords(keywords.KEYWORDS_ORACLE) + lex.add_keywords(keywords.KEYWORDS_PLPGSQL) + lex.add_keywords(keywords.KEYWORDS_HQL) + lex.add_keywords(keywords.KEYWORDS_MSACCESS) + lex.add_keywords(keywords.KEYWORDS) + + tokens = sqlparse.parse("select * from foo zorder by bar;")[0] + + # reset the syntax for later tests. 
+ Lexer.get_default_instance().default_initialization() + + assert list( + (t.ttype, t.value) + for t in tokens + if t.ttype not in sqlparse.tokens.Whitespace + )[4] == (sqlparse.tokens.Keyword, "zorder by") diff --git a/tests/test_regressions.py b/tests/test_regressions.py index 38d18404..bc8b7dd3 100644 --- a/tests/test_regressions.py +++ b/tests/test_regressions.py @@ -401,6 +401,15 @@ def test_issue489_tzcasts(): assert p.tokens[-1].get_alias() == 'foo' +def test_issue562_tzcasts(): + # Test that whitespace between 'from' and 'bar' is retained + formatted = sqlparse.format( + 'SELECT f(HOUR from bar AT TIME ZONE \'UTC\') from foo', reindent=True + ) + assert formatted == \ + 'SELECT f(HOUR\n from bar AT TIME ZONE \'UTC\')\nfrom foo' + + def test_as_in_parentheses_indents(): # did raise NoneType has no attribute is_group in _process_parentheses formatted = sqlparse.format('(as foo)', reindent=True) @@ -418,3 +427,12 @@ def test_splitting_at_and_backticks_issue588(): 'grant foo to user1@`myhost`; grant bar to user1@`myhost`;') assert len(splitted) == 2 assert splitted[-1] == 'grant bar to user1@`myhost`;' + + +def test_comment_between_cte_clauses_issue632(): + p, = sqlparse.parse(""" + WITH foo AS (), + -- A comment before baz subquery + baz AS () + SELECT * FROM baz;""") + assert p.get_type() == "SELECT" diff --git a/tests/test_split.py b/tests/test_split.py index a9d75765..e79750e8 100644 --- a/tests/test_split.py +++ b/tests/test_split.py @@ -18,8 +18,8 @@ def test_split_semicolon(): def test_split_backslash(): - stmts = sqlparse.parse(r"select '\\'; select '\''; select '\\\'';") - assert len(stmts) == 3 + stmts = sqlparse.parse("select '\'; select '\'';") + assert len(stmts) == 2 @pytest.mark.parametrize('fn', ['function.sql', diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 00000000..d020f3fa --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,12 @@ +import pytest + +from sqlparse import utils + + +@pytest.mark.parametrize('value, expected', ( + [None, None], + ['\'foo\'', 'foo'], + ['"foo"', 'foo'], + ['`foo`', 'foo'])) +def test_remove_quotes(value, expected): + assert utils.remove_quotes(value) == expected
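The tests above restore the singleton lexer by calling
``default_initialization()`` once they are done with a custom syntax. A minimal
sketch of how that reset pattern could be wrapped in a context manager;
``custom_keywords`` is a hypothetical helper and not part of the sqlparse API:

.. code-block:: python

    import contextlib

    import sqlparse
    from sqlparse import tokens
    from sqlparse.lexer import Lexer

    @contextlib.contextmanager
    def custom_keywords(kwdict):
        # Hypothetical helper: temporarily register an extra keyword
        # dictionary on the singleton lexer, then restore the default
        # syntax on exit, mirroring the reset used in the tests above.
        lex = Lexer.get_default_instance()
        lex.add_keywords(kwdict)
        try:
            yield lex
        finally:
            lex.default_initialization()

    with custom_keywords({'BACON': tokens.Keyword}):
        tok = sqlparse.parse('select BACON')[0].tokens[-1]
        assert tok.ttype is tokens.Keyword

    # Outside the block the default syntax is back: BACON is an
    # ordinary name, grouped into an Identifier whose ttype is None.
    tok = sqlparse.parse('select BACON')[0].tokens[-1]
    assert tok.ttype is not tokens.Keyword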