From 4a33c252847b34dc2940b24df9fd96dbf0e05a9c Mon Sep 17 00:00:00 2001 From: Bill Sun Date: Fri, 7 Jan 2022 11:02:37 -0500 Subject: [PATCH] Initial commit (mo_sql_parsing) --- .gitignore | 15 + LICENSE | 363 +++++++++++++++++++++ README.md | 230 +++++++++++++ mo_sql_parsing/__init__.py | 93 ++++++ mo_sql_parsing/formatting.py | 602 ++++++++++++++++++++++++++++++++++ mo_sql_parsing/keywords.py | 392 ++++++++++++++++++++++ mo_sql_parsing/sql_parser.py | 605 ++++++++++++++++++++++++++++++++++ mo_sql_parsing/types.py | 223 +++++++++++++ mo_sql_parsing/utils.py | 617 +++++++++++++++++++++++++++++++++++ mo_sql_parsing/windows.py | 107 ++++++ requirements.txt | 4 + run.py | 6 + 12 files changed, 3257 insertions(+) create mode 100644 .gitignore create mode 100644 LICENSE create mode 100644 README.md create mode 100644 mo_sql_parsing/__init__.py create mode 100644 mo_sql_parsing/formatting.py create mode 100644 mo_sql_parsing/keywords.py create mode 100644 mo_sql_parsing/sql_parser.py create mode 100644 mo_sql_parsing/types.py create mode 100644 mo_sql_parsing/utils.py create mode 100644 mo_sql_parsing/windows.py create mode 100644 requirements.txt create mode 100644 run.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..9cc2d1d --- /dev/null +++ b/.gitignore @@ -0,0 +1,15 @@ +*.log +*.pyc +*.tab +out +.idea +.svn +*.iml +/mo_sql_parsing.egg-info +/build +/dist +/mo-sql-parsing +vendor/ +._* +.DS_Store +.eggs diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..df5936e --- /dev/null +++ b/LICENSE @@ -0,0 +1,363 @@ +Mozilla Public License, version 2.0 + +1. Definitions + +1.1. "Contributor" + + means each individual or legal entity that creates, contributes to the + creation of, or owns Covered Software. + +1.2. "Contributor Version" + + means the combination of the Contributions of others (if any) used by a + Contributor and that particular Contributor's Contribution. + +1.3. 
"Contribution" + + means Covered Software of a particular Contributor. + +1.4. "Covered Software" + + means Source Code Form to which the initial Contributor has attached the + notice in Exhibit A, the Executable Form of such Source Code Form, and + Modifications of such Source Code Form, in each case including portions + thereof. + +1.5. "Incompatible With Secondary Licenses" + means + + a. that the initial Contributor has attached the notice described in + Exhibit B to the Covered Software; or + + b. that the Covered Software was made available under the terms of + version 1.1 or earlier of the License, but not also under the terms of + a Secondary License. + +1.6. "Executable Form" + + means any form of the work other than Source Code Form. + +1.7. "Larger Work" + + means a work that combines Covered Software with other material, in a + separate file or files, that is not Covered Software. + +1.8. "License" + + means this document. + +1.9. "Licensable" + + means having the right to grant, to the maximum extent possible, whether + at the time of the initial grant or subsequently, any and all of the + rights conveyed by this License. + +1.10. "Modifications" + + means any of the following: + + a. any file in Source Code Form that results from an addition to, + deletion from, or modification of the contents of Covered Software; or + + b. any new file in Source Code Form that contains any Covered Software. + +1.11. "Patent Claims" of a Contributor + + means any patent claim(s), including without limitation, method, + process, and apparatus claims, in any patent Licensable by such + Contributor that would be infringed, but for the grant of the License, + by the making, using, selling, offering for sale, having made, import, + or transfer of either its Contributions or its Contributor Version. + +1.12. 
"Secondary License" + + means either the GNU General Public License, Version 2.0, the GNU Lesser + General Public License, Version 2.1, the GNU Affero General Public + License, Version 3.0, or any later versions of those licenses. + +1.13. "Source Code Form" + + means the form of the work preferred for making modifications. + +1.14. "You" (or "Your") + + means an individual or a legal entity exercising rights under this + License. For legal entities, "You" includes any entity that controls, is + controlled by, or is under common control with You. For purposes of this + definition, "control" means (a) the power, direct or indirect, to cause + the direction or management of such entity, whether by contract or + otherwise, or (b) ownership of more than fifty percent (50%) of the + outstanding shares or beneficial ownership of such entity. + + +2. License Grants and Conditions + +2.1. Grants + + Each Contributor hereby grants You a world-wide, royalty-free, + non-exclusive license: + + a. under intellectual property rights (other than patent or trademark) + Licensable by such Contributor to use, reproduce, make available, + modify, display, perform, distribute, and otherwise exploit its + Contributions, either on an unmodified basis, with Modifications, or + as part of a Larger Work; and + + b. under Patent Claims of such Contributor to make, use, sell, offer for + sale, have made, import, and otherwise transfer either its + Contributions or its Contributor Version. + +2.2. Effective Date + + The licenses granted in Section 2.1 with respect to any Contribution + become effective for each Contribution on the date the Contributor first + distributes such Contribution. + +2.3. Limitations on Grant Scope + + The licenses granted in this Section 2 are the only rights granted under + this License. No additional rights or licenses will be implied from the + distribution or licensing of Covered Software under this License. 
+ Notwithstanding Section 2.1(b) above, no patent license is granted by a + Contributor: + + a. for any code that a Contributor has removed from Covered Software; or + + b. for infringements caused by: (i) Your and any other third party's + modifications of Covered Software, or (ii) the combination of its + Contributions with other software (except as part of its Contributor + Version); or + + c. under Patent Claims infringed by Covered Software in the absence of + its Contributions. + + This License does not grant any rights in the trademarks, service marks, + or logos of any Contributor (except as may be necessary to comply with + the notice requirements in Section 3.4). + +2.4. Subsequent Licenses + + No Contributor makes additional grants as a result of Your choice to + distribute the Covered Software under a subsequent version of this + License (see Section 10.2) or under the terms of a Secondary License (if + permitted under the terms of Section 3.3). + +2.5. Representation + + Each Contributor represents that the Contributor believes its + Contributions are its original creation(s) or it has sufficient rights to + grant the rights to its Contributions conveyed by this License. + +2.6. Fair Use + + This License is not intended to limit any rights You have under + applicable copyright doctrines of fair use, fair dealing, or other + equivalents. + +2.7. Conditions + + Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted in + Section 2.1. + + +3. Responsibilities + +3.1. Distribution of Source Form + + All distribution of Covered Software in Source Code Form, including any + Modifications that You create or to which You contribute, must be under + the terms of this License. You must inform recipients that the Source + Code Form of the Covered Software is governed by the terms of this + License, and how they can obtain a copy of this License. You may not + attempt to alter or restrict the recipients' rights in the Source Code + Form. + +3.2. 
Distribution of Executable Form + + If You distribute Covered Software in Executable Form then: + + a. such Covered Software must also be made available in Source Code Form, + as described in Section 3.1, and You must inform recipients of the + Executable Form how they can obtain a copy of such Source Code Form by + reasonable means in a timely manner, at a charge no more than the cost + of distribution to the recipient; and + + b. You may distribute such Executable Form under the terms of this + License, or sublicense it under different terms, provided that the + license for the Executable Form does not attempt to limit or alter the + recipients' rights in the Source Code Form under this License. + +3.3. Distribution of a Larger Work + + You may create and distribute a Larger Work under terms of Your choice, + provided that You also comply with the requirements of this License for + the Covered Software. If the Larger Work is a combination of Covered + Software with a work governed by one or more Secondary Licenses, and the + Covered Software is not Incompatible With Secondary Licenses, this + License permits You to additionally distribute such Covered Software + under the terms of such Secondary License(s), so that the recipient of + the Larger Work may, at their option, further distribute the Covered + Software under the terms of either this License or such Secondary + License(s). + +3.4. Notices + + You may not remove or alter the substance of any license notices + (including copyright notices, patent notices, disclaimers of warranty, or + limitations of liability) contained within the Source Code Form of the + Covered Software, except that You may alter any license notices to the + extent required to remedy known factual inaccuracies. + +3.5. Application of Additional Terms + + You may choose to offer, and to charge a fee for, warranty, support, + indemnity or liability obligations to one or more recipients of Covered + Software. 
However, You may do so only on Your own behalf, and not on + behalf of any Contributor. You must make it absolutely clear that any + such warranty, support, indemnity, or liability obligation is offered by + You alone, and You hereby agree to indemnify every Contributor for any + liability incurred by such Contributor as a result of warranty, support, + indemnity or liability terms You offer. You may include additional + disclaimers of warranty and limitations of liability specific to any + jurisdiction. + +4. Inability to Comply Due to Statute or Regulation + + If it is impossible for You to comply with any of the terms of this License + with respect to some or all of the Covered Software due to statute, + judicial order, or regulation then You must: (a) comply with the terms of + this License to the maximum extent possible; and (b) describe the + limitations and the code they affect. Such description must be placed in a + text file included with all distributions of the Covered Software under + this License. Except to the extent prohibited by statute or regulation, + such description must be sufficiently detailed for a recipient of ordinary + skill to be able to understand it. + +5. Termination + +5.1. The rights granted under this License will terminate automatically if You + fail to comply with any of its terms. However, if You become compliant, + then the rights granted under this License from a particular Contributor + are reinstated (a) provisionally, unless and until such Contributor + explicitly and finally terminates Your grants, and (b) on an ongoing + basis, if such Contributor fails to notify You of the non-compliance by + some reasonable means prior to 60 days after You have come back into + compliance. 
Moreover, Your grants from a particular Contributor are + reinstated on an ongoing basis if such Contributor notifies You of the + non-compliance by some reasonable means, this is the first time You have + received notice of non-compliance with this License from such + Contributor, and You become compliant prior to 30 days after Your receipt + of the notice. + +5.2. If You initiate litigation against any entity by asserting a patent + infringement claim (excluding declaratory judgment actions, + counter-claims, and cross-claims) alleging that a Contributor Version + directly or indirectly infringes any patent, then the rights granted to + You by any and all Contributors for the Covered Software under Section + 2.1 of this License shall terminate. + +5.3. In the event of termination under Sections 5.1 or 5.2 above, all end user + license agreements (excluding distributors and resellers) which have been + validly granted by You or Your distributors under this License prior to + termination shall survive termination. + +6. Disclaimer of Warranty + + Covered Software is provided under this License on an "as is" basis, + without warranty of any kind, either expressed, implied, or statutory, + including, without limitation, warranties that the Covered Software is free + of defects, merchantable, fit for a particular purpose or non-infringing. + The entire risk as to the quality and performance of the Covered Software + is with You. Should any Covered Software prove defective in any respect, + You (not any Contributor) assume the cost of any necessary servicing, + repair, or correction. This disclaimer of warranty constitutes an essential + part of this License. No use of any Covered Software is authorized under + this License except under this disclaimer. + +7. 
Limitation of Liability + + Under no circumstances and under no legal theory, whether tort (including + negligence), contract, or otherwise, shall any Contributor, or anyone who + distributes Covered Software as permitted above, be liable to You for any + direct, indirect, special, incidental, or consequential damages of any + character including, without limitation, damages for lost profits, loss of + goodwill, work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses, even if such party shall have been + informed of the possibility of such damages. This limitation of liability + shall not apply to liability for death or personal injury resulting from + such party's negligence to the extent applicable law prohibits such + limitation. Some jurisdictions do not allow the exclusion or limitation of + incidental or consequential damages, so this exclusion and limitation may + not apply to You. + +8. Litigation + + Any litigation relating to this License may be brought only in the courts + of a jurisdiction where the defendant maintains its principal place of + business and such litigation shall be governed by laws of that + jurisdiction, without reference to its conflict-of-law provisions. Nothing + in this Section shall prevent a party's ability to bring cross-claims or + counter-claims. + +9. Miscellaneous + + This License represents the complete agreement concerning the subject + matter hereof. If any provision of this License is held to be + unenforceable, such provision shall be reformed only to the extent + necessary to make it enforceable. Any law or regulation which provides that + the language of a contract shall be construed against the drafter shall not + be used to construe this License against a Contributor. + + +10. Versions of the License + +10.1. New Versions + + Mozilla Foundation is the license steward. 
Except as provided in Section + 10.3, no one other than the license steward has the right to modify or + publish new versions of this License. Each version will be given a + distinguishing version number. + +10.2. Effect of New Versions + + You may distribute the Covered Software under the terms of the version + of the License under which You originally received the Covered Software, + or under the terms of any subsequent version published by the license + steward. + +10.3. Modified Versions + + If you create software not governed by this License, and you want to + create a new license for such software, you may create and use a + modified version of this License if you rename the license and remove + any references to the name of the license steward (except to note that + such modified license differs from this License). + +10.4. Distributing Source Code Form that is Incompatible With Secondary + Licenses If You choose to distribute Source Code Form that is + Incompatible With Secondary Licenses under the terms of this version of + the License, the notice described in Exhibit B of this License must be + attached. + +Exhibit A - Source Code Form License Notice + + This Source Code Form is subject to the + terms of the Mozilla Public License, v. + 2.0. If a copy of the MPL was not + distributed with this file, You can + obtain one at + http://mozilla.org/MPL/2.0/. + +If it is not possible or desirable to put the notice in a particular file, +then You may include the notice in a location (such as a LICENSE file in a +relevant directory) where a recipient would be likely to look for such a +notice. + +You may add additional accurate notices of copyright ownership. + +Exhibit B - "Incompatible With Secondary Licenses" Notice + + This Source Code Form is "Incompatible + With Secondary Licenses", as defined by + the Mozilla Public License, v. 2.0. 
+ diff --git a/README.md b/README.md new file mode 100644 index 0000000..0809051 --- /dev/null +++ b/README.md @@ -0,0 +1,230 @@ +# More SQL Parsing! + +[![PyPI Latest Release](https://img.shields.io/pypi/v/mo-sql-parsing.svg)](https://pypi.org/project/mo-sql-parsing/) +[![Build Status](https://app.travis-ci.com/klahnakoski/mo-sql-parsing.svg?branch=master)](https://travis-ci.com/github/klahnakoski/mo-sql-parsing) + + +Parse SQL into JSON so we can translate it for other datastores! + +[See changes](https://github.com/klahnakoski/mo-sql-parsing#version-changes) + + +## Problem Statement + +SQL is a familiar language used to access databases. Although, each database vendor has its quirky implementation, there is enough standardization that the average developer does not need to know of those quirks. This familiar core SQL (lowest common denominator, if you will) is useful enough to explore data in primitive ways. It is hoped that, once programmers have reviewed a datastore with basic SQL queries, and they see the value of that data, and they will be motivated to use the datastore's native query format. + +## Objectives + +The objective is to convert SQL queries to JSON-izable parse trees. This originally targeted MySQL, but has grown to include other database engines. *Please [paste some SQL into a new issue](https://github.com/klahnakoski/mo-sql-parsing/issues) if it does not work for you* + + +## Project Status + +November 2021 - There are [over 800 tests](https://app.travis-ci.com/github/klahnakoski/mo-sql-parsing). 
This parser is good enough for basic usage, including: + * inner queries, + * with clauses, + * window functions + * create/drop tables and views + * insert/update/delete statements + * lambda (`->`) functions + +## Install + + pip install mo-sql-parsing + +## Parsing SQL + + >>> from mo_sql_parsing import parse + >>> parse("select count(1) from jobs") + {'select': {'value': {'count': 1}}, 'from': 'jobs'} + +Each SQL query is parsed to an object: Each clause is assigned to an object property of the same name. + + >>> parse("select a as hello, b as world from jobs") + {'select': [{'value': 'a', 'name': 'hello'}, {'value': 'b', 'name': 'world'}], 'from': 'jobs'} + +The `SELECT` clause is an array of objects containing `name` and `value` properties. + + +### SQL Flavours + +There are a few parsing modes you may be interested in: + + +#### SQLServer Identifiers (`[]`) + +SQLServer uses square brackets to delimit identifiers. For example + + SELECT [Timestamp] FROM [table] + +which conflicts with BigQuery array constructor (eg `[1, 2, 3, 4]`). You may use the SqlServer flavour with + + from mo_sql_parsing import parse_sqlserver as parse + + +#### NULL is None + +The default output for this parser is to emit a null function `{"null":{}}` wherever `NULL` is encountered in the SQL. If you would like something different, you can replace nulls with `None` (or anything else for that matter): + + result = parse(sql, null=None) + +this has been implemented with a post-parse rewriting of the parse tree. + + +#### Normalized function call form + +The default behaviour of the parser is to output function calls in `simple_op` format: The operator being a key in the object; `{op: params}`. This form can be difficult to work with because the object must be scanned for known operators, or possible optional arguments, or at least distinguished from a query object. 
+ +You can have the parser emit function calls in `normal_op` format + + >>> from mo_sql_parsing import parse, normal_op + >>> parse("select trim(' ' from b+c)", calls=normal_op) + +which produces calls in a normalized format + + {"op": op, "args": args, "kwargs": kwargs} + +here is the pretty-printed JSON from the example above: + +``` +{'select': {'value': { + 'op': 'trim', + 'args': [{'op': 'add', 'args': ['b', 'c']}], + 'kwargs': {'characters': {'literal': ' '}} +}}} +``` + +#### MySQL literal strings + +MySQL uses both double quotes and single quotes to declare literal strings. This is not ansi behaviour, but it is more forgiving for programmers coming from other languages. A specific parse function is provided: + + result = parse_mysql(sql) + + +## Generating SQL + +You may also generate SQL from the a given JSON document. This is done by the formatter, which is in Alpha state (Oct2021). + + >>> from mo_sql_parsing import format + >>> format({"from":"test", "select":["a.b", "c"]}) + 'SELECT a.b, c FROM test' + +## Contributing + +In the event that the parser is not working for you, you can help make this better but simply pasting your sql (or JSON) into a new issue. Extra points if you describe the problem. Even more points if you submit a PR with a test. If you also submit a fix, then you also have my gratitude. + + +### Run Tests + +See [the tests directory](https://github.com/klahnakoski/mo-sql-parsing/tree/dev/tests) for instructions running tests, or writing new ones. + +## More about implementation + +SQL queries are translated to JSON objects: Each clause is assigned to an object property of the same name. + + + # SELECT * FROM dual WHERE a>b ORDER BY a+b + { + "select": "*", + "from": "dual", + "where": {"gt": ["a", "b"]}, + "orderby": {"value": {"add": ["a", "b"]}} + } + +Expressions are also objects, but with only one property: The name of the operation, and the value holding (an array of) parameters for that operation. 
+ + {op: parameters} + +and you can see this pattern in the previous example: + + {"gt": ["a","b"]} + +## Array Programming + +The `mo-sql-parsing.scrub()` method is used liberally throughout the code, and it "simplifies" the JSON. You may find this form a bit tedious to work with because the JSON property values can be values, lists of values, or missing. Please consider converting everything to arrays: + + +``` +def listwrap(value): + if value is None: + return [] + elif isinstance(value, list) + return value + else: + return [value] +``` + +then you may avoid all the is-it-a-list checks : + +``` +for select in listwrap(parsed_result.get('select')): + do_something(select) +``` + +## Version Changes + + + +### Version 8 + +*November 2021* + +* Prefer BigQuery `[]` (create array) over SQLServer `[]` (identity) +* Added basic DML (`INSERT`/`UPDATE`/`DELETE`) +* flatter `CREATE TABLE` structures. The `option` list in column definition has been flattened:
+ **Old column format** + + {"create table": { + "columns": { + "name": "name", + "type": {"decimal": [2, 3]}, + "option": [ + "not null", + "check": {"lt": [{"length": "name"}, 10]} + ] + } + }} + + **New column format** + + {"create table": { + "columns": { + "name": "name", + "type": {"decimal": [2, 3]} + "nullable": False, + "check": {"lt": [{"length": "name"}, 10]} + } + }} + +### Version 7 + +*October 2021* + +* changed error reporting; still terrible +* upgraded mo-parsing library which forced version change + +### Version 6 + +*October 2021* + +* fixed `SELECT DISTINCT` parsing +* added `DISTINCT ON` parsing + +### Version 5 + +*August 2021* + +* remove inline module `mo-parsing` +* support `CREATE TABLE`, add SQL "flavours" emit `{null:{}}` for None + +### Version 4 + +*November 2021* + +* changed parse result of `SELECT DISTINCT` +* simpler `ORDER BY` clause in window functions + + + + + diff --git a/mo_sql_parsing/__init__.py b/mo_sql_parsing/__init__.py new file mode 100644 index 0000000..eb8cd34 --- /dev/null +++ b/mo_sql_parsing/__init__.py @@ -0,0 +1,93 @@ +# encoding: utf-8 +# +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this file, +# You can obtain one at http://mozilla.org/MPL/2.0/. 
+# +# Contact: Kyle Lahnakoski (kyle@lahnakoski.com) +# + +from __future__ import absolute_import, division, unicode_literals + +import json +from threading import Lock + +from mo_sql_parsing.sql_parser import scrub +from mo_sql_parsing.utils import ansi_string, simple_op, normal_op + +parse_locker = Lock() # ENSURE ONLY ONE PARSING AT A TIME +common_parser = None +mysql_parser = None +sqlserver_parser = None + +SQL_NULL = {"null": {}} + + +def parse(sql, null=SQL_NULL, calls=simple_op): + """ + :param sql: String of SQL + :param null: What value to use as NULL (default is the null function `{"null":{}}`) + :return: parse tree + """ + global common_parser + + with parse_locker: + if not common_parser: + common_parser = sql_parser.common_parser() + result = _parse(common_parser, sql, null, calls) + return result + + +def parse_mysql(sql, null=SQL_NULL, calls=simple_op): + """ + PARSE MySQL ASSUME DOUBLE QUOTED STRINGS ARE LITERALS + :param sql: String of SQL + :param null: What value to use as NULL (default is the null function `{"null":{}}`) + :return: parse tree + """ + global mysql_parser + + with parse_locker: + if not mysql_parser: + mysql_parser = sql_parser.mysql_parser() + return _parse(mysql_parser, sql, null, calls) + + +def parse_sqlserver(sql, null=SQL_NULL, calls=simple_op): + """ + PARSE MySQL ASSUME DOUBLE QUOTED STRINGS ARE LITERALS + :param sql: String of SQL + :param null: What value to use as NULL (default is the null function `{"null":{}}`) + :return: parse tree + """ + global sqlserver_parser + + with parse_locker: + if not sqlserver_parser: + sqlserver_parser = sql_parser.sqlserver_parser() + return _parse(sqlserver_parser, sql, null, calls) + + +parse_bigquery = parse_mysql + + +def _parse(parser, sql, null, calls): + utils.null_locations = [] + utils.scrub_op = calls + sql = sql.rstrip().rstrip(";") + parse_result = parser.parse_string(sql, parse_all=True) + output = scrub(parse_result) + for o, n in utils.null_locations: + o[n] = null + 
return output + + +def format(json, **kwargs): + from mo_sql_parsing.formatting import Formatter + + return Formatter(**kwargs).dispatch(json) + + +_ = json.dumps + +__all__ = ["parse", "format", "parse_mysql", "parse_bigquery", "normal_op", "simple_op"] diff --git a/mo_sql_parsing/formatting.py b/mo_sql_parsing/formatting.py new file mode 100644 index 0000000..ed0f208 --- /dev/null +++ b/mo_sql_parsing/formatting.py @@ -0,0 +1,602 @@ +# encoding: utf-8 +# +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this file, +# You can obtain one at http://mozilla.org/MPL/2.0/. +# +# Author: Beto Dealmeida (beto@dealmeida.net) +# + +from __future__ import absolute_import, division, unicode_literals + +import re + +from mo_dots import split_field +from mo_future import first, is_text, string_types, text +from mo_parsing import listwrap + +from mo_sql_parsing.keywords import RESERVED, join_keywords, precedence +from mo_sql_parsing.utils import binary_ops, is_set_op + +MAX_PRECEDENCE = 100 +VALID = re.compile(r"^[a-zA-Z_]\w*$") + + +def is_keyword(identifier): + try: + RESERVED.parse_string(identifier) + return True + except Exception: + return False + + +def should_quote(identifier): + """ + Return true if a given identifier should be quoted. + + This is usually true when the identifier: + + - is a reserved word + - contain spaces + - does not match the regex `[a-zA-Z_]\\w*` + + """ + return identifier != "*" and (not VALID.match(identifier) or is_keyword(identifier)) + + +def escape(ident, ansi_quotes, should_quote): + """ + Escape identifiers. + + ANSI uses double quotes, but many databases use back quotes. 
+ + """ + + def esc(identifier): + if not should_quote(identifier): + return identifier + + quote = '"' if ansi_quotes else "`" + identifier = identifier.replace(quote, 2 * quote) + return "{0}{1}{2}".format(quote, identifier, quote) + + return ".".join(esc(f) for f in split_field(ident)) + + +def Operator(_op): + op_prec = precedence[binary_ops[_op]] + op = " {0} ".format(_op).replace("_", " ").upper() + + def func(self, json, prec): + acc = [] + + if isinstance(json, dict): + # {VARIABLE: VALUE} FORM + k, v = first(json.items()) + json = [k, {"literal": v}] + + for i, v in enumerate(listwrap(json)): + if i == 0: + acc.append(self.dispatch(v, op_prec + 0.25)) + else: + acc.append(self.dispatch(v, op_prec)) + if prec >= op_prec: + return op.join(acc) + else: + return f"({op.join(acc)})" + + return func + + +def isolate(expr, sql, prec): + """ + RETURN sql IN PARENTHESIS IF PREEDENCE > prec + :param expr: expression to isolate + :param sql: sql to return + :param prec: current precedence + """ + if is_text(expr): + return sql + ps = [p for k in expr.keys() for p in [precedence.get(k)] if p is not None] + if not ps: + return sql + elif min(ps) >= prec: + return f"({sql})" + else: + return sql + + +unordered_clauses = [ + "with", + "distinct_on", + "select_distinct", + "select", + "from", + "where", + "groupby", + "having", +] + +ordered_clauses = [ + "orderby", + "limit", + "offset", + "fetch", +] + + +class Formatter: + # infix operators + _concat = Operator("||") + _mul = Operator("*") + _div = Operator("/") + _mod = Operator("%") + _add = Operator("+") + _sub = Operator("-") + _neq = Operator("<>") + _gt = Operator(">") + _lt = Operator("<") + _gte = Operator(">=") + _lte = Operator("<=") + _eq = Operator("=") + _or = Operator("or") + _and = Operator("and") + _binary_and = Operator("&") + _binary_or = Operator("|") + _like = Operator("like") + _not_like = Operator("not like") + _rlike = Operator("rlike") + _not_rlike = Operator("not rlike") + _union = 
Operator("union") + _union_all = Operator("union all") + _intersect = Operator("intersect") + _minus = Operator("minus") + _except = Operator("except") + + def __init__(self, ansi_quotes=True, should_quote=should_quote): + self.ansi_quotes = ansi_quotes + self.should_quote = should_quote + + def format(self, json): + return self.dispatch(json, 50) + + def dispatch(self, json, prec=100): + if isinstance(json, list): + return self.sql_list(json, prec=precedence["list"]) + if isinstance(json, dict): + if len(json) == 0: + return "" + elif "value" in json: + return self.value(json, prec) + elif "join" in json: + return self._join_on(json) + elif "insert" in json: + return self.insert(json) + elif json.keys() & set(ordered_clauses): + return self.ordered_query(json, prec) + elif json.keys() & set(unordered_clauses): + return self.unordered_query(json, prec) + elif "null" in json: + return "NULL" + elif "trim" in json: + return self._trim(json, prec) + elif "extract" in json: + return self._extract(json, prec) + else: + return self.op(json, prec) + if isinstance(json, string_types): + return escape(json, self.ansi_quotes, self.should_quote) + if json == None: + return "NULL" + + return text(json) + + def sql_list(self, json, prec=precedence["from"] - 1): + sql = ", ".join(self.dispatch(element, prec=MAX_PRECEDENCE) for element in json) + if prec >= precedence["from"]: + return sql + else: + return f"({sql})" + + def value(self, json, prec=precedence["from"]): + parts = [self.dispatch(json["value"], prec)] + if "over" in json: + over = json["over"] + parts.append("OVER") + window = [] + if "partitionby" in over: + window.append("PARTITION BY") + window.append(self.dispatch(over["partitionby"])) + if "orderby" in over: + window.append(self.orderby(over, precedence["window"])) + if "range" in over: + + def wordy(v): + if v < 0: + return [text(abs(v)), "PRECEDING"] + elif v > 0: + return [text(v), "FOLLOWING"] + + window.append("ROWS") + range = over["range"] + min = 
range.get("min") + max = range.get("max") + + if min is None: + if max is None: + window.pop() # not expected, but deal + elif max == 0: + window.append("UNBOUNDED PRECEDING") + else: + window.append("BETWEEN") + window.append("UNBOUNDED PRECEDING") + window.append("AND") + window.extend(wordy(max)) + elif min == 0: + if max is None: + window.append("UNBOUNDED FOLLOWING") + elif max == 0: + window.append("CURRENT ROW") + else: + window.extend(wordy(max)) + else: + if max is None: + window.append("BETWEEN") + window.extend(wordy(min)) + window.append("AND") + window.append("UNBOUNDED FOLLOWING") + elif max == 0: + window.extend(wordy(min)) + else: + window.append("BETWEEN") + window.extend(wordy(min)) + window.append("AND") + window.extend(wordy(max)) + + window = " ".join(window) + parts.append(f"({window})") + if "name" in json: + parts.extend(["AS", self.dispatch(json["name"])]) + + return " ".join(parts) + + def op(self, json, prec): + if len(json) > 1: + raise Exception("Operators should have only one key!") + key, value = list(json.items())[0] + + # check if the attribute exists, and call the corresponding method; + # note that we disallow keys that start with `_` to avoid giving access + # to magic methods + attr = f"_{key}" + if hasattr(self, attr) and not key.startswith("_"): + method = getattr(self, attr) + op_prec = precedence.get(key, MAX_PRECEDENCE) + if prec >= op_prec: + return method(value, op_prec) + else: + return f"({method(value, op_prec)})" + + # treat as regular function call + if isinstance(value, dict) and len(value) == 0: + return ( + key.upper() + "()" + ) # NOT SURE IF AN EMPTY dict SHOULD BE DELT WITH HERE, OR IN self.format() + else: + params = ", ".join(self.dispatch(p) for p in listwrap(value)) + return f"{key.upper()}({params})" + + def _binary_not(self, value, prec): + return "~{0}".format(self.dispatch(value)) + + def _exists(self, value, prec): + return "{0} IS NOT NULL".format(self.dispatch(value, precedence["is"])) + + def 
_missing(self, value, prec): + return "{0} IS NULL".format(self.dispatch(value, precedence["is"])) + + def _collate(self, pair, prec): + return "{0} COLLATE {1}".format( + self.dispatch(pair[0], precedence["collate"]), pair[1] + ) + + def _in(self, json, prec): + member, set = json + if "literal" in set: + set = {"literal": listwrap(set["literal"])} + sql = ( + self.dispatch(member, precedence["in"]) + + " IN " + + self.dispatch(set, precedence["in"]) + ) + if prec < precedence["in"]: + sql = f"({sql})" + return sql + + def _nin(self, json, prec): + member, set = json + if "literal" in set: + set = {"literal": listwrap(set["literal"])} + sql = ( + self.dispatch(member, precedence["in"]) + + " NOT IN " + + self.dispatch(set, precedence["in"]) + ) + if prec < precedence["in"]: + sql = f"({sql})" + return sql + + def _case(self, checks, prec): + parts = ["CASE"] + for check in checks if isinstance(checks, list) else [checks]: + if isinstance(check, dict): + if "when" in check and "then" in check: + parts.extend(["WHEN", self.dispatch(check["when"])]) + parts.extend(["THEN", self.dispatch(check["then"])]) + else: + parts.extend(["ELSE", self.dispatch(check)]) + else: + parts.extend(["ELSE", self.dispatch(check)]) + parts.append("END") + return " ".join(parts) + + def _cast(self, json, prec): + expr, type = json + + type_name, params = first(type.items()) + if not params: + type = type_name.upper() + else: + type = {type_name.upper(): params} + + return f"CAST({self.dispatch(expr)} AS {self.dispatch(type)})" + + def _extract(self, json, prec): + interval, value = json["extract"] + i = self.dispatch(interval).upper() + v = self.dispatch(value) + return f"EXTRACT({i} FROM {v})" + + def _interval(self, json, prec): + amount = self.dispatch(json[0], precedence["and"]) + type = self.dispatch(json[1], precedence["and"]) + return f"INTERVAL {amount} {type.upper()}" + + def _literal(self, json, prec=0): + if isinstance(json, list): + return "({0})".format(", ".join( + 
self._literal(v, precedence["literal"]) for v in json + )) + elif isinstance(json, string_types): + return "'{0}'".format(json.replace("'", "''")) + else: + return str(json) + + def _get(self, json, prec): + v, i = json + v_sql = self.dispatch(v, prec=precedence["literal"]) + i_sql = self.dispatch(i) + return f"{v_sql}[{i_sql}]" + + def _between(self, json, prec): + return "{0} BETWEEN {1} AND {2}".format( + self.dispatch(json[0], precedence["between"]), + self.dispatch(json[1], precedence["between"]), + self.dispatch(json[2], precedence["between"]), + ) + + def _trim(self, json, prec): + c = json.get("characters") + d = json.get("direction") + v = json["trim"] + acc = ["TRIM("] + if d: + acc.append(d.upper()) + acc.append(" ") + if c: + acc.append(self.dispatch(c)) + acc.append(" ") + if c or d: + acc.append("FROM ") + acc.append(self.dispatch(v)) + acc.append(")") + return "".join(acc) + + def _not_between(self, json, prec): + return "{0} NOT BETWEEN {1} AND {2}".format( + self.dispatch(json[0], precedence["between"]), + self.dispatch(json[1], precedence["between"]), + self.dispatch(json[2], precedence["between"]), + ) + + def _distinct(self, json, prec): + return "DISTINCT " + ", ".join( + self.dispatch(v, precedence["select"]) for v in listwrap(json) + ) + + def _select_distinct(self, json, prec): + return "SELECT DISTINCT " + ", ".join(self.dispatch(v) for v in listwrap(json)) + + def _distinct_on(self, json, prec): + return ( + "DISTINCT ON (" + ", ".join(self.dispatch(v) for v in listwrap(json)) + ")" + ) + + def _join_on(self, json, prec): + detected_join = join_keywords & set(json.keys()) + if len(detected_join) == 0: + raise Exception( + 'Fail to detect join type! 
Detected: "{}" Except one of: "{}"'.format( + [on_keyword for on_keyword in json if on_keyword != "on"][0], + '", "'.join(join_keywords), + ) + ) + + join_keyword = detected_join.pop() + + acc = [] + acc.append(join_keyword.upper()) + acc.append(self.dispatch(json[join_keyword], precedence["join"])) + + if json.get("on"): + acc.append("ON") + acc.append(self.dispatch(json["on"])) + if json.get("using"): + acc.append("USING") + acc.append(self.dispatch(json["using"])) + return " ".join(acc) + + def ordered_query(self, json, prec): + if json.keys() & set(unordered_clauses) - {"from"}: + # regular query + acc = [self.unordered_query(json, precedence["order"])] + else: + # set-op expression + acc = [self.dispatch(json["from"], precedence["order"])] + + acc.extend( + part + for clause in ordered_clauses + if clause in json + for part in [getattr(self, clause)(json, precedence["order"])] + if part + ) + sql = " ".join(acc) + if prec >= precedence["order"]: + return sql + else: + return f"({sql})" + + def unordered_query(self, json, prec): + sql = " ".join( + part + for clause in unordered_clauses + if clause in json + for part in [getattr(self, clause)(json, precedence["from"])] + if part + ) + if prec >= precedence["from"]: + return sql + else: + return f"({sql})" + + def with_(self, json, prec): + if "with" in json: + with_ = json["with"] + if not isinstance(with_, list): + with_ = [with_] + parts = ", ".join( + "{0} AS ({1})".format(part["name"], self.dispatch(part["value"])) + for part in with_ + ) + return "WITH {0}".format(parts) + + def select(self, json, prec): + param = ", ".join(self.dispatch(s) for s in listwrap(json["select"])) + if "top" in json: + top = self.dispatch(json["top"]) + return f"SELECT TOP ({top}) {param}" + if "distinct_on" in json: + return param + else: + return f"SELECT {param}" + + def distinct_on(self, json, prec): + param = ", ".join(self.dispatch(s) for s in listwrap(json["distinct_on"])) + return f"SELECT DISTINCT ON ({param})" + + def 
select_distinct(self, json, prec): + param = ", ".join(self.dispatch(s) for s in listwrap(json["select_distinct"])) + return f"SELECT DISTINCT {param}" + + def from_(self, json, prec): + is_join = False + from_ = json["from"] + if isinstance(from_, dict) and is_set_op & from_.keys(): + source = self.op(from_, precedence["from"]) + return f"FROM {source}" + + from_ = listwrap(from_) + parts = [] + for v in from_: + if join_keywords & set(v): + is_join = True + parts.append(self._join_on(v, precedence["from"] - 1)) + else: + parts.append(self.dispatch(v, precedence["from"] - 1)) + joiner = " " if is_join else ", " + rest = joiner.join(parts) + return f"FROM {rest}" + + def where(self, json, prec): + expr = self.dispatch(json["where"]) + return f"WHERE {expr}" + + def groupby(self, json, prec): + param = ", ".join(self.dispatch(s) for s in listwrap(json["groupby"])) + return f"GROUP BY {param}" + + def having(self, json, prec): + return "HAVING {0}".format(self.dispatch(json["having"])) + + def orderby(self, json, prec): + param = ", ".join( + ( + self.dispatch(s["value"], precedence["order"]) + + " " + + s.get("sort", "").upper() + ).strip() + for s in listwrap(json["orderby"]) + ) + return f"ORDER BY {param}" + + def limit(self, json, prec): + num = self.dispatch(json["limit"], precedence["order"]) + return f"LIMIT {num}" + + def offset(self, json, prec): + num = self.dispatch(json["offset"], precedence["order"]) + return f"OFFSET {num}" + + def fetch(self, json, prec): + num = self.dispatch(json["offset"], precedence["order"]) + return f"FETCH {num} ROWS ONLY" + + def insert(self, json, prec=precedence["from"]): + acc = ["INSERT"] + if "overwrite" in json: + acc.append("OVERWRITE") + else: + acc.append("INTO") + acc.append(json["insert"]) + + if "columns" in json: + acc.append(self.sql_list(json)) + if "values" in json: + values = json["values"] + if all(isinstance(row, dict) for row in values): + columns = list(sorted(set(k for row in values for k in row.keys()))) 
+ acc.append(self.sql_list(columns)) + if "if exists" in json: + acc.append("IF EXISTS") + acc.append("VALUES") + acc.append(",\n".join( + "(" + ", ".join(self._literal(row[c]) for c in columns) + ")" + for row in values + )) + else: + if "if exists" in json: + acc.append("IF EXISTS") + acc.append("VALUES") + for row in values: + acc.append("(" + ", ".join(self._literal(row)) + ")") + + else: + if json["if exists"]: + acc.append("IF EXISTS") + acc.append(self.dispatch(json["query"])) + return " ".join(acc) + + +setattr(Formatter, "with", Formatter.with_) +setattr(Formatter, "from", Formatter.from_) diff --git a/mo_sql_parsing/keywords.py b/mo_sql_parsing/keywords.py new file mode 100644 index 0000000..3626698 --- /dev/null +++ b/mo_sql_parsing/keywords.py @@ -0,0 +1,392 @@ +# encoding: utf-8 +# +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this file, +# You can obtain one at http://mozilla.org/MPL/2.0/. 
+# +# Contact: Kyle Lahnakoski (kyle@lahnakoski.com) +# + +# SQL CONSTANTS +from mo_parsing import * + +from mo_sql_parsing.utils import SQL_NULL, keyword + +NULL = keyword("null") / (lambda: SQL_NULL) +TRUE = keyword("true") / (lambda: True) +FALSE = keyword("false") / (lambda: False) +NOCASE = keyword("nocase") +ASC = keyword("asc") +DESC = keyword("desc") + +# SIMPLE KEYWORDS +AS = keyword("as").suppress() +ALL = keyword("all") +BY = keyword("by").suppress() +CAST = keyword("cast") +CONSTRAINT = keyword("constraint").suppress() +CREATE = keyword("create").suppress() +CROSS = keyword("cross") +DISTINCT = keyword("distinct") +EXCEPT = keyword("except") +FETCH = keyword("fetch").suppress() +FROM = keyword("from").suppress() +FULL = keyword("full") +GROUP = keyword("group").suppress() +HAVING = keyword("having").suppress() +INNER = keyword("inner") +INTERVAL = keyword("interval") +JOIN = keyword("join") +LEFT = keyword("left") +LIKE = keyword("like") +LIMIT = keyword("limit").suppress() +MINUS = keyword("minus") +OFFSET = keyword("offset").suppress() +ON = keyword("on").suppress() +ORDER = keyword("order").suppress() +OUTER = keyword("outer") +OVER = keyword("over").suppress() +PARTITION = keyword("partition").suppress() +# PERCENT = keyword("percent").suppress() +RIGHT = keyword("right") +RLIKE = keyword("rlike") +SELECT = keyword("select").suppress() +TABLE = keyword("table").suppress() +THEN = keyword("then").suppress() +TOP = keyword("top").suppress() +UNION = keyword("union") +INTERSECT = keyword("intersect") +USING = keyword("using").suppress() +WHEN = keyword("when").suppress() +WHERE = keyword("where").suppress() +WITH = keyword("with").suppress() +WITHIN = keyword("within").suppress() +PRIMARY = keyword("primary").suppress() +FOREIGN = keyword("foreign").suppress() +KEY = keyword("key").suppress() +UNIQUE = keyword("unique").suppress() +INDEX = keyword("index").suppress() +REFERENCES = keyword("references").suppress() +RECURSIVE = 
keyword("recursive").suppress() +VALUES = keyword("values").suppress() +WINDOW = keyword("window") + +PRIMARY_KEY = Group(PRIMARY + KEY).set_parser_name("primary_key") +FOREIGN_KEY = Group(FOREIGN + KEY).set_parser_name("foreign_key") + +# SIMPLE OPERATORS +CONCAT = Literal("||").set_parser_name("concat") +MUL = Literal("*").set_parser_name("mul") +DIV = Literal("/").set_parser_name("div") +MOD = Literal("%").set_parser_name("mod") +NEG = Literal("-").set_parser_name("neg") +ADD = Literal("+").set_parser_name("add") +SUB = Literal("-").set_parser_name("sub") +BINARY_NOT = Literal("~").set_parser_name("binary_not") +BINARY_AND = Literal("&").set_parser_name("binary_and") +BINARY_OR = Literal("|").set_parser_name("binary_or") +GTE = Literal(">=").set_parser_name("gte") +LTE = Literal("<=").set_parser_name("lte") +LT = Literal("<").set_parser_name("lt") +GT = Literal(">").set_parser_name("gt") +EEQ = ( + # conservative equality https://github.com/klahnakoski/jx-sqlite/blob/dev/docs/Logical%20Equality.md#definitions + Literal("==") | Literal("=") +).set_parser_name("eq") +DEQ = ( + # decisive equality + # https://sparkbyexamples.com/apache-hive/hive-relational-arithmetic-logical-operators/ + Literal("<=>").set_parser_name("eq!") +) +IDF = ( + # decisive equality + # https://prestodb.io/docs/current/functions/comparison.html#is-distinct-from-and-is-not-distinct-from + keyword("is distinct from").set_parser_name("eq!") +) +INDF = ( + # decisive equality + # https://prestodb.io/docs/current/functions/comparison.html#is-distinct-from-and-is-not-distinct-from + keyword("is not distinct from").set_parser_name("ne!") +) +NEQ = (Literal("!=") | Literal("<>")).set_parser_name("neq") +LAMBDA = Literal("->").set_parser_name("lambda") + +AND = keyword("and") +BETWEEN = keyword("between") +CASE = keyword("case").suppress() +COLLATE = keyword("collate") +END = keyword("end") +ELSE = keyword("else").suppress() +IN = keyword("in") +IS = keyword("is") +NOT = keyword("not") +OR = 
keyword("or") +LATERAL = keyword("lateral") +VIEW = keyword("view") + +# COMPOUND KEYWORDS + + +joins = ( + ( + Optional(CROSS | OUTER | INNER | ((FULL | LEFT | RIGHT) + Optional(INNER | OUTER))) + + JOIN + + Optional(LATERAL) + ) + | LATERAL + VIEW + Optional(OUTER) +) / (lambda tokens: " ".join(tokens).lower()) + +UNION_ALL = (UNION + ALL).set_parser_name("union_all") +WITHIN_GROUP = Group(WITHIN + GROUP).set_parser_name("within_group") +SELECT_DISTINCT = Group(SELECT + DISTINCT).set_parser_name("select distinct") +PARTITION_BY = Group(PARTITION + BY).set_parser_name("partition by") +GROUP_BY = Group(GROUP + BY).set_parser_name("group by") +ORDER_BY = Group(ORDER + BY).set_parser_name("order by") + +# COMPOUND OPERATORS +AT_TIME_ZONE = Group(keyword("at") + keyword("time") + keyword("zone")) +NOT_BETWEEN = Group(NOT + BETWEEN).set_parser_name("not_between") +NOT_LIKE = Group(NOT + LIKE).set_parser_name("not_like") +NOT_RLIKE = Group(NOT + RLIKE).set_parser_name("not_rlike") +NOT_IN = Group(NOT + IN).set_parser_name("nin") +IS_NOT = Group(IS + NOT).set_parser_name("is_not") + +_SIMILAR = keyword("similar") +_TO = keyword("to") +SIMILAR_TO = Group(_SIMILAR + _TO).set_parser_name("similar_to") +NOT_SIMILAR_TO = Group(NOT + _SIMILAR + _TO).set_parser_name("not_similar_to") + +RESERVED = MatchFirst([ + # ONY INCLUDE SINGLE WORDS + ALL, + AND, + AS, + ASC, + BETWEEN, + BY, + CASE, + COLLATE, + CONSTRAINT, + CREATE, + CROSS, + DESC, + DISTINCT, + EXCEPT, + ELSE, + END, + FALSE, + FETCH, + FOREIGN, + FROM, + FULL, + GROUP_BY, + GROUP, + HAVING, + IN, + INDEX, + INNER, + INTERSECT, + INTERVAL, + IS_NOT, + IS, + JOIN, + KEY, + LATERAL, + LEFT, + LIKE, + LIMIT, + MINUS, + NOCASE, + NOT, + NULL, + OFFSET, + ON, + OR, + ORDER, + OUTER, + OVER, + PARTITION, + PRIMARY, + REFERENCES, + RIGHT, + RLIKE, + SELECT, + THEN, + TRUE, + UNION, + UNIQUE, + USING, + WHEN, + WHERE, + WINDOW, + WITH, + WITHIN, +]) + +LB = Literal("(").suppress() +RB = Literal(")").suppress() +EQ = 
Char("=").suppress() + +join_keywords = { + "join", + "full join", + "cross join", + "inner join", + "left join", + "right join", + "full outer join", + "right outer join", + "left outer join", +} + +precedence = { + # https://www.sqlite.org/lang_expr.html + "literal": -1, + "interval": 0, + "cast": 0, + "collate": 0, + "concat": 1, + "mul": 2, + "div": 1.5, + "mod": 2, + "neg": 3, + "add": 3, + "sub": 2.5, + "binary_not": 4, + "binary_and": 4, + "binary_or": 4, + "gte": 5, + "lte": 5, + "lt": 5, + "gt": 6, + "eq": 7, + "neq": 7, + "missing": 7, + "exists": 7, + "at_time_zone": 8, + "between": 8, + "not_between": 8, + "in": 8, + "nin": 8, + "is": 8, + "like": 8, + "not_like": 8, + "rlike": 8, + "not_rlike": 8, + "similar_to": 8, + "not_similar_to": 8, + "and": 10, + "or": 11, + "lambda": 12, + "join": 18, + "list": 18, + "select": 30, + "from": 30, + "window": 35, + "union": 40, + "union_all": 40, + "except": 40, + "minus": 40, + "intersect": 40, + "order": 50, +} + +KNOWN_OPS = [ + COLLATE, + CONCAT, + MUL | DIV | MOD, + NEG, + ADD | SUB, + BINARY_NOT, + BINARY_AND, + BINARY_OR, + GTE | LTE | LT | GT, + EEQ | NEQ | DEQ | IDF | INDF, + AT_TIME_ZONE, + (BETWEEN, AND), + (NOT_BETWEEN, AND), + IN, + NOT_IN, + IS_NOT, + IS, + LIKE, + NOT_LIKE, + RLIKE, + NOT_RLIKE, + SIMILAR_TO, + NOT_SIMILAR_TO, + NOT, + AND, + OR, + LAMBDA, +] + +times = ["now", "today", "tomorrow", "eod"] + +durations = { + "microseconds": "microsecond", + "microsecond": "microsecond", + "microsecs": "microsecond", + "microsec": "microsecond", + "useconds": "microsecond", + "usecond": "microsecond", + "usecs": "microsecond", + "usec": "microsecond", + "us": "microsecond", + "milliseconds": "millisecond", + "millisecond": "millisecond", + "millisecon": "millisecond", + "mseconds": "millisecond", + "msecond": "millisecond", + "millisecs": "millisecond", + "millisec": "millisecond", + "msecs": "millisecond", + "msec": "millisecond", + "ms": "millisecond", + "seconds": "second", + "second": "second", + 
"secs": "second", + "sec": "second", + "s": "second", + "minutes": "minute", + "minute": "minute", + "mins": "minute", + "min": "minute", + "m": "minute", + "hours": "hour", + "hour": "hour", + "hrs": "hour", + "hr": "hour", + "h": "hour", + "days": "day", + "day": "day", + "d": "day", + "dayofweek": "dow", + "dow": "dow", + "weekday": "dow", + "weeks": "week", + "week": "week", + "w": "week", + "months": "month", + "month": "month", + "mons": "month", + "mon": "month", + "quarters": "quarter", + "quarter": "quarter", + "years": "year", + "year": "year", + "decades": "decade", + "decade": "decade", + "decs": "decade", + "dec": "decade", + "centuries": "century", + "century": "century", + "cents": "century", + "cent": "century", + "c": "century", + "millennia": "millennium", + "millennium": "millennium", + "mils": "millennium", + "mil": "millennium", + "epoch": "epoch", +} diff --git a/mo_sql_parsing/sql_parser.py b/mo_sql_parsing/sql_parser.py new file mode 100644 index 0000000..499e56b --- /dev/null +++ b/mo_sql_parsing/sql_parser.py @@ -0,0 +1,605 @@ +# encoding: utf-8 +# +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this file, +# You can obtain one at http://mozilla.org/MPL/2.0/. 
+# +# Contact: Kyle Lahnakoski (kyle@lahnakoski.com) +# + + +from mo_parsing.helpers import restOfLine +from mo_parsing.infix import delimited_list +from mo_parsing.whitespaces import NO_WHITESPACE, Whitespace + +from mo_sql_parsing.keywords import * +from mo_sql_parsing.types import get_column_type, time_functions +from mo_sql_parsing.utils import * +from mo_sql_parsing.windows import window + + +def no_dashes(tokens, start, string): + if "-" in tokens[0]: + index = tokens[0].find("-") + raise ParseException( + tokens.type, + start + index + 1, # +1 TO ENSURE THIS MESSAGE HAS PRIORITY + string, + """Ambiguity: Use backticks (``) around identifiers with dashes, or add space around subtraction operator.""", + ) + + +digit = Char("0123456789") +simple_ident = ( + Char(FIRST_IDENT_CHAR) + + (Regex("(?<=[^ 0-9])\\-(?=[^ 0-9])") | Char(IDENT_CHAR))[...] +) +simple_ident = Regex(simple_ident.__regex__()[1]) / no_dashes + + +def common_parser(): + combined_ident = Combine(delimited_list( + ansi_ident | mysql_backtick_ident | simple_ident, separator=".", combine=True, + )).set_parser_name("identifier") + + return parser(ansi_string, combined_ident) + + +def mysql_parser(): + mysql_string = ansi_string | mysql_doublequote_string + mysql_ident = Combine(delimited_list( + mysql_backtick_ident | sqlserver_ident | simple_ident, + separator=".", + combine=True, + )).set_parser_name("mysql identifier") + + return parser(mysql_string, mysql_ident) + + +def sqlserver_parser(): + combined_ident = Combine(delimited_list( + ansi_ident + | mysql_backtick_ident + | sqlserver_ident + | Word(FIRST_IDENT_CHAR, IDENT_CHAR), + separator=".", + combine=True, + )).set_parser_name("identifier") + + return parser(ansi_string, combined_ident, sqlserver=True) + + +def parser(literal_string, ident, sqlserver=False): + with Whitespace() as engine: + engine.add_ignore(Literal("--") + restOfLine) + engine.add_ignore(Literal("#") + restOfLine) + engine.add_ignore(Literal("/*") + SkipTo("*/", 
include=True)) + + var_name = ~RESERVED + ident + + # EXPRESSIONS + expr = Forward() + column_type, column_definition, column_def_references = get_column_type( + expr, var_name, literal_string + ) + + # CASE + case = ( + CASE + + Group(ZeroOrMore( + (WHEN + expr("when") + THEN + expr("then")) / to_when_call + ))("case") + + Optional(ELSE + expr("else")) + + END + ) / to_case_call + + switch = ( + CASE + + expr("value") + + Group(ZeroOrMore( + (WHEN + expr("when") + THEN + expr("then")) / to_when_call + ))("case") + + Optional(ELSE + expr("else")) + + END + ) / to_switch_call + + cast = ( + Group(CAST("op") + LB + expr("params") + AS + column_type("params") + RB) + / to_json_call + ) + + trim = ( + Group( + keyword("trim").suppress() + + LB + + Optional( + (keyword("both") | keyword("trailing") | keyword("leading")) + / (lambda t: t[0].lower()) + )("direction") + + ( + assign("from", expr) + | expr("chars") + Optional(assign("from", expr)) + ) + + RB + ).set_parser_name("trim") + / to_trim_call + ) + + _standard_time_intervals = MatchFirst([ + keyword(d) / (lambda t: durations[t[0].lower()]) for d in durations.keys() + ]).set_parser_name("duration")("params") + + duration = ( + real_num | int_num | literal_string + )("params") + _standard_time_intervals + + interval = ( + INTERVAL + ("'" + delimited_list(duration) + "'" | duration) + ) / to_interval_call + + timestamp = ( + time_functions("op") + + ( + literal_string("params") + | MatchFirst([ + keyword(t) / (lambda t: t.lower()) for t in times + ])("params") + ) + ) / to_json_call + + extract = ( + keyword("extract")("op") + + LB + + (_standard_time_intervals | expr("params")) + + FROM + + expr("params") + + RB + ) / to_json_call + + alias = Optional(( + ( + AS + + (var_name("name") + Optional(LB + delimited_list(ident("col")) + RB)) + | ( + var_name("name") + + Optional( + (LB + delimited_list(ident("col")) + RB) + | (AS + delimited_list(var_name("col"))) + ) + ) + ) + / to_alias + )("name")) + + named_column = 
Group(Group(expr)("value") + alias) + + stack = ( + keyword("stack")("op") + + LB + + int_num("width") + + "," + + delimited_list(expr)("args") + + RB + ) / to_stack + + # ARRAY[foo], + # ARRAY < STRING > [foo, bar], INVALID + # ARRAY < STRING > [foo, bar], + create_array = ( + keyword("array")("op") + + Optional(LT.suppress() + column_type("type") + GT.suppress()) + + ( + LB + delimited_list(Group(expr))("args") + RB + | (Literal("[") + delimited_list(Group(expr))("args") + Literal("]")) + ) + ) + + if not sqlserver: + # SQL SERVER DOES NOT SUPPORT [] FOR ARRAY CONSTRUCTION (USED FOR IDENTIFIERS) + create_array = ( + Literal("[") + delimited_list(Group(expr))("args") + Literal("]") + | create_array + ) + + create_array = create_array / to_array + + create_map = ( + keyword("map") + + Literal("[") + + expr("keys") + + "," + + expr("values") + + Literal("]") + ) / to_map + + create_struct = ( + keyword("struct")("op") + + Optional( + LT.suppress() + delimited_list(column_type)("types") + GT.suppress() + ) + + LB + + delimited_list(Group((expr("value") + alias) / to_select_call))("args") + + RB + ).set_parser_name("create struct") / to_struct + + distinct = ( + DISTINCT("op") + delimited_list(named_column)("params") + ) / to_json_call + + query = Forward().set_parser_name("query") + + call_function = ( + ident("op") + + LB + + Optional(Group(query) | delimited_list(Group(expr)))("params") + + Optional( + (keyword("respect") | keyword("ignore"))("nulls") + + keyword("nulls").suppress() + ) + + RB + ).set_parser_name("call function") / to_json_call + + with NO_WHITESPACE: + + def scale(tokens): + return {"mul": [tokens[0], tokens[1]]} + + scale_function = ((real_num | int_num) + call_function) / scale + scale_ident = ((real_num | int_num) + ident) / scale + + compound = ( + NULL + | TRUE + | FALSE + | NOCASE + | interval + | timestamp + | extract + | case + | switch + | cast + | distinct + | trim + | stack + | create_array + | create_map + | create_struct + | (LB + 
Group(query) + RB) + | (LB + Group(delimited_list(expr)) / to_tuple_call + RB) + | literal_string.set_parser_name("string") + | hex_num.set_parser_name("hex") + | scale_function + | scale_ident + | real_num.set_parser_name("float") + | int_num.set_parser_name("int") + | call_function + | Combine(var_name + Optional(".*")) + ) + + sort_column = ( + expr("value").set_parser_name("sort1") + + Optional(DESC("sort") | ASC("sort")) + + Optional(assign("nulls", keyword("first") | keyword("last"))) + ) + + window_clause, over_clause = window(expr, var_name, sort_column) + + expr << ( + ( + Literal("*") + | infix_notation( + compound, + [ + ( + Literal("[").suppress() + expr + Literal("]").suppress(), + 1, + LEFT_ASSOC, + to_offset, + ), + ( + Literal(".").suppress() + simple_ident, + 1, + LEFT_ASSOC, + to_offset, + ), + (window_clause, 1, LEFT_ASSOC, to_window_mod), + ( + assign("filter", LB + WHERE + expr + RB), + 1, + LEFT_ASSOC, + to_window_mod, + ), + ] + + [ + ( + o, + 1 if o in unary_ops else (3 if isinstance(o, tuple) else 2), + unary_ops.get(o, LEFT_ASSOC), + to_lambda if o is LAMBDA else to_json_operator, + ) + for o in KNOWN_OPS + ], + ) + )("value").set_parser_name("expression") + ) + + select_column = ( + Group( + expr("value") + alias | Literal("*")("value") + ).set_parser_name("column") + / to_select_call + ) + + table_source = Forward() + + join = ( + Group(joins)("op") + + table_source("join") + + Optional((ON + expr("on")) | (USING + expr("using"))) + | ( + Group(WINDOW)("op") + + Group(var_name("name") + AS + over_clause("value"))("join") + ) + ) / to_join_call + + selection = ( + (SELECT + DISTINCT + ON + LB) + + delimited_list(select_column)("distinct_on") + + RB + + delimited_list(select_column)("select") + | SELECT + DISTINCT + delimited_list(select_column)("select_distinct") + | ( + SELECT + + Optional( + TOP + + expr("value") + + Optional(keyword("percent"))("percent") + + Optional(WITH + keyword("ties"))("ties") + )("top") + / to_top_clause + + 
delimited_list(select_column)("select") + ) + ) + + row = (LB + delimited_list(Group(expr)) + RB) / to_row + values = VALUES + delimited_list(row) / to_values + + unordered_sql = Group( + values + | selection + + Optional( + (FROM + delimited_list(table_source) + ZeroOrMore(join))("from") + + Optional(WHERE + expr("where")) + + Optional(GROUP_BY + delimited_list(Group(named_column))("groupby")) + + Optional(HAVING + expr("having")) + ) + ).set_parser_name("unordered sql") + + with NO_WHITESPACE: + + def mult(tokens): + amount = tokens["bytes"] + scale = tokens["scale"].lower() + return { + "bytes": amount + * {"b": 1, "k": 1_000, "m": 1_000_000, "g": 1_000_000_000}[scale] + } + + ts_bytes = ( + (real_num | int_num)("bytes") + Char("bBkKmMgG")("scale") + ) / mult + + tablesample = assign( + "tablesample", + LB + + ( + ( + keyword("bucket")("op") + + int_num("params") + + keyword("out of") + + int_num("params") + + Optional(ON + expr("on")) + ) + / to_json_call + | (real_num | int_num)("percent") + keyword("percent") + | int_num("rows") + keyword("rows") + | ts_bytes + ) + + RB, + ) + + table_source << Group( + ((LB + query + RB) | stack | call_function | var_name)("value") + + Optional(flag("with ordinality")) + + Optional(tablesample) + + alias + ).set_parser_name("table_source") / to_table + + rows = Optional(keyword("row") | keyword("rows")) + limit = ( + Optional(assign("offset", expr) + rows) + & Optional( + FETCH + + Optional(keyword("first") | keyword("next")) + + expr("fetch") + + rows + + Optional(keyword("only")) + ) + & Optional(assign("limit", expr)) + ) + + ordered_sql = ( + ( + (unordered_sql | (LB + query + RB)) + + ZeroOrMore( + Group( + (UNION | INTERSECT | EXCEPT | MINUS) + Optional(ALL | DISTINCT) + )("op") + + (unordered_sql | (LB + query + RB)) + ) + )("union") + + Optional(ORDER_BY + delimited_list(Group(sort_column))("orderby")) + + limit + ).set_parser_name("ordered sql") / to_union_call + + with_expr = delimited_list(Group( + ( + 
(var_name("name") + Optional(LB + delimited_list(ident("col")) + RB)) + / to_alias + )("name") + + (AS + LB + (query | expr)("value") + RB) + )) + + query << ( + Optional(assign("with recursive", with_expr) | assign("with", with_expr)) + + Group(ordered_sql)("query") + ) / to_query + + ##################################################################### + # DML STATEMENTS + ##################################################################### + + # MySQL's index_type := Using + ( "BTREE" | "HASH" ) + index_type = Optional(assign("using", ident("index_type"))) + + index_column_names = LB + delimited_list(var_name("columns")) + RB + + column_def_delete = assign( + "on delete", + (keyword("cascade") | keyword("set null") | keyword("set default")), + ) + + table_def_foreign_key = FOREIGN_KEY + Optional( + Optional(var_name("index_name")) + + index_column_names + + column_def_references + + Optional(column_def_delete) + ) + + index_options = ZeroOrMore(var_name)("table_constraint_options") + + table_constraint_definition = Optional(CONSTRAINT + var_name("name")) + ( + assign("primary key", index_type + index_column_names + index_options) + | ( + Optional(flag("unique")) + + Optional(INDEX | KEY) + + Optional(var_name("name")) + + index_type + + index_column_names + + index_options + )("index") + | assign("check", LB + expr + RB) + | table_def_foreign_key("foreign_key") + ) + + table_element = ( + column_definition("columns") | table_constraint_definition("constraint") + ) + + create_table = ( + keyword("create") + + Optional(keyword("or") + flag("replace")) + + Optional(flag("temporary")) + + TABLE + + Optional((keyword("if not exists") / (lambda: False))("replace")) + + var_name("name") + + Optional(LB + delimited_list(table_element) + RB) + + ZeroOrMore( + assign("engine", EQ + var_name) + | assign("collate", EQ + var_name) + | assign("auto_increment", EQ + int_num) + | assign("comment", EQ + literal_string) + | assign("default character set", EQ + var_name) + | 
assign("default charset", EQ + var_name) + ) + + Optional(AS.suppress() + infix_notation(query, [])("query")) + )("create table") + + create_view = ( + keyword("create") + + Optional(keyword("or") + flag("replace")) + + Optional(flag("temporary")) + + VIEW.suppress() + + Optional((keyword("if not exists") / (lambda: False))("replace")) + + var_name("name") + + AS + + query("query") + )("create view") + + # CREATE INDEX a ON u USING btree (e); + create_index = ( + keyword("create index") + + Optional(keyword("or") + flag("replace"))(INDEX | KEY) + + Optional((keyword("if not exists") / (lambda: False))("replace")) + + var_name("name") + + ON + + var_name("table") + + index_type + + index_column_names + + index_options + )("create index") + + cache_options = Optional(( + keyword("options").suppress() + + LB + + Dict(delimited_list(Group( + literal_string / (lambda tokens: tokens[0]["literal"]) + + Optional(EQ) + + var_name + ))) + + RB + )("options")) + + create_cache = ( + keyword("cache").suppress() + + Optional(flag("lazy")) + + TABLE + + var_name("name") + + cache_options + + Optional(AS + query("query")) + )("cache") + + drop_table = ( + keyword("drop table") + Optional(flag("if exists")) + var_name("table") + )("drop") + + drop_view = ( + keyword("drop view") + Optional(flag("if exists")) + var_name("view") + )("drop") + + drop_index = ( + keyword("drop index") + Optional(flag("if exists")) + var_name("index") + )("drop") + + insert = ( + keyword("insert").suppress() + + ( + flag("overwrite") + keyword("table").suppress() + | keyword("into").suppress() + Optional(keyword("table").suppress()) + ) + + var_name("table") + + Optional(LB + delimited_list(var_name)("columns") + RB) + + Optional(flag("if exists")) + + (values | query)("query") + ) / to_insert_call + + update = ( + keyword("update")("op") + + var_name("params") + + assign("set", Dict(delimited_list(Group(var_name + EQ + expr)))) + + Optional(assign("where", expr)) + ) / to_json_call + + delete = ( + 
keyword("delete")("op") + + keyword("from").suppress() + + var_name("params") + + Optional(assign("where", expr)) + ) / to_json_call + + return ( + query + | (insert | update | delete) + | (create_table | create_view | create_cache | create_index) + | (drop_table | drop_view | drop_index) + ).finalize() diff --git a/mo_sql_parsing/types.py b/mo_sql_parsing/types.py new file mode 100644 index 0000000..06c0bb7 --- /dev/null +++ b/mo_sql_parsing/types.py @@ -0,0 +1,223 @@ +# encoding: utf-8 +# +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this file, +# You can obtain one at http://mozilla.org/MPL/2.0/. +# +# Contact: Kyle Lahnakoski (kyle@lahnakoski.com) +# + + +# KNOWN TYPES +from mo_parsing import Forward, Group, Optional, MatchFirst, Literal, ZeroOrMore, export +from mo_parsing.infix import delimited_list, RIGHT_ASSOC, LEFT_ASSOC + +from mo_sql_parsing.keywords import ( + RB, + LB, + NEG, + NOT, + BINARY_NOT, + NULL, + EQ, + KNOWN_OPS, + LT, + GT, +) +from mo_sql_parsing.utils import ( + keyword, + to_json_call, + int_num, + ansi_string, + ansi_ident, + assign, + flag, +) + +_size = Optional(LB + int_num("params") + RB) +_sizes = Optional(LB + delimited_list(int_num("params")) + RB) + +simple_types = Forward() + +BIGINT = Group(keyword("bigint")("op") + Optional(_size)+Optional(flag("unsigned"))) / to_json_call +BOOL = Group(keyword("bool")("op")) / to_json_call +BOOLEAN = Group(keyword("boolean")("op")) / to_json_call +DOUBLE = Group(keyword("double")("op")) / to_json_call +FLOAT64 = Group(keyword("float64")("op")) / to_json_call +FLOAT = Group(keyword("float")("op")) / to_json_call +GEOMETRY = Group(keyword("geometry")("op")) / to_json_call +INTEGER = Group(keyword("integer")("op")) / to_json_call +INT = (keyword("int")("op") + _size) / to_json_call +INT32 = Group(keyword("int32")("op")) / to_json_call +INT64 = Group(keyword("int64")("op")) / to_json_call +REAL = 
Group(keyword("real")("op")) / to_json_call +TEXT = Group(keyword("text")("op")) / to_json_call +SMALLINT = Group(keyword("smallint")("op")) / to_json_call +STRING = Group(keyword("string")("op")) / to_json_call + +BLOB = (keyword("blob")("op") + _size) / to_json_call +BYTES = (keyword("bytes")("op") + _size) / to_json_call +CHAR = (keyword("char")("op") + _size) / to_json_call +NCHAR = (keyword("nchar")("op") + _size) / to_json_call +VARCHAR = (keyword("varchar")("op") + _size) / to_json_call +VARCHAR2 = (keyword("varchar2")("op") + _size) / to_json_call +VARBINARY = (keyword("varbinary")("op") + _size) / to_json_call +TINYINT = (keyword("tinyint")("op") + _size) / to_json_call +UUID = Group(keyword("uuid")("op")) / to_json_call + +DECIMAL = (keyword("decimal")("op") + _sizes) / to_json_call +DOUBLE_PRECISION = ( + Group((keyword("double precision") / (lambda: "double_precision"))("op")) + / to_json_call +) +NUMERIC = (keyword("numeric")("op") + _sizes) / to_json_call +NUMBER = (keyword("number")("op") + _sizes) / to_json_call + +MAP_TYPE = ( + keyword("map")("op") + LB + delimited_list(simple_types("params")) + RB +) / to_json_call +ARRAY_TYPE = (keyword("array")("op") + LB + simple_types("params") + RB) / to_json_call + +DATE = keyword("date") +DATETIME = keyword("datetime") +DATETIME_W_TIMEZONE = keyword("datetime with time zone") +TIME = keyword("time") +TIMESTAMP = keyword("timestamp") +TIMESTAMP_W_TIMEZONE = keyword("timestamp with time zone") +TIMESTAMPTZ = keyword("timestamptz") +TIMETZ = keyword("timetz") + +time_functions = DATE | DATETIME | TIME | TIMESTAMP | TIMESTAMPTZ | TIMETZ + +# KNOWN TIME TYPES +_format = Optional((ansi_string | ansi_ident)("params")) + +DATE_TYPE = (DATE("op") + _format) / to_json_call +DATETIME_TYPE = (DATETIME("op") + _format) / to_json_call +DATETIME_W_TIMEZONE_TYPE = (DATETIME_W_TIMEZONE("op") + _format) / to_json_call +TIME_TYPE = (TIME("op") + _format) / to_json_call +TIMESTAMP_TYPE = (TIMESTAMP("op") + _format) / 
to_json_call +TIMESTAMP_W_TIMEZONE_TYPE = (TIMESTAMP_W_TIMEZONE("op") + _format) / to_json_call +TIMESTAMPTZ_TYPE = (TIMESTAMPTZ("op") + _format) / to_json_call +TIMETZ_TYPE = (TIMETZ("op") + _format) / to_json_call + +simple_types << MatchFirst([ + ARRAY_TYPE, + BIGINT, + BOOL, + BOOLEAN, + BLOB, + BYTES, + CHAR, + DATE_TYPE, + DATETIME_W_TIMEZONE_TYPE, + DATETIME_TYPE, + DECIMAL, + DOUBLE_PRECISION, + DOUBLE, + FLOAT64, + FLOAT, + GEOMETRY, + MAP_TYPE, + INTEGER, + INT, + INT32, + INT64, + NCHAR, + NUMBER, + NUMERIC, + REAL, + TEXT, + SMALLINT, + STRING, + TIME_TYPE, + TIMESTAMP_W_TIMEZONE_TYPE, + TIMESTAMP_TYPE, + TIMESTAMPTZ_TYPE, + TIMETZ_TYPE, + TINYINT, + UUID, + VARCHAR, + VARCHAR2, + VARBINARY, +]) + +CASTING = (Literal("::").suppress() + simple_types("params")).set_parser_name("cast") +KNOWN_OPS.insert(0, CASTING) + +unary_ops = { + NEG: RIGHT_ASSOC, + NOT: RIGHT_ASSOC, + BINARY_NOT: RIGHT_ASSOC, + CASTING: LEFT_ASSOC, +} + + +def get_column_type(expr, var_name, literal_string): + column_definition = Forward() + column_type = Forward().set_parser_name("column type") + + struct_type = ( + keyword("struct")("op") + + LT.suppress() + + Group(delimited_list(column_definition))("params") + + GT.suppress() + ) / to_json_call + + row_type = ( + keyword("row")("op") + + LB + + Group(delimited_list(column_definition))("params") + + RB + ) / to_json_call + + array_type = ( + keyword("array")("op") + + ( + ( + LT.suppress() + + Group(delimited_list(column_type))("params") + + GT.suppress() + ) + | (LB + Group(delimited_list(column_type))("params") + RB) + ) + ) / to_json_call + + column_type << (struct_type | row_type | array_type | simple_types) + + column_def_identity = ( + assign( + "generated", + (keyword("always") | keyword("by default") / (lambda: "by_default")), + ) + + keyword("as identity").suppress() + + Optional(assign("start with", int_num)) + + Optional(assign("increment by", int_num)) + ) + + column_def_references = assign( + "references", 
var_name("table") + LB + delimited_list(var_name)("columns") + RB, + ) + + column_options = ZeroOrMore( + ((NOT + NULL) / (lambda: False))("nullable") + | (NULL / (lambda t: True))("nullable") + | flag("unique") + | flag("auto_increment") + | assign("comment", literal_string) + | assign("collate", Optional(EQ) + var_name) + | flag("primary key") + | column_def_identity("identity") + | column_def_references + | assign("check", LB + expr + RB) + | assign("default", expr) + ).set_parser_name("column_options") + + column_definition << Group( + var_name("name") + (column_type | var_name)("type") + column_options + ).set_parser_name("column_definition") + + return column_type, column_definition, column_def_references + + +export("mo_sql_parsing.utils", unary_ops) diff --git a/mo_sql_parsing/utils.py b/mo_sql_parsing/utils.py new file mode 100644 index 0000000..6578c3a --- /dev/null +++ b/mo_sql_parsing/utils.py @@ -0,0 +1,617 @@ +# encoding: utf-8 +# +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this file, +# You can obtain one at http://mozilla.org/MPL/2.0/. 
+# +# Contact: Kyle Lahnakoski (kyle@lahnakoski.com) +# + +import ast + +from mo_dots import is_data, is_null, Data, from_data +from mo_future import text, number_types, binary_type, flatten +from mo_imports import expect +from mo_parsing import * +from mo_parsing.utils import is_number, listwrap + +unary_ops = expect("unary_ops") + + +class Call(object): + __slots__ = ["op", "args", "kwargs"] + + def __init__(self, op, args, kwargs): + self.op = op + self.args = args + self.kwargs = kwargs + + +IDENT_CHAR = Regex("[@_$0-9A-Za-zÀ-ÖØ-öø-ƿ]").expr.parser_config.include +FIRST_IDENT_CHAR = "".join(set(IDENT_CHAR) - set("0123456789")) +SQL_NULL = Call("null", [], {}) + +null_locations = [] + + +def keyword(keywords): + return And([ + Keyword(k, caseless=True) for k in keywords.split(" ") + ]).set_parser_name(keywords) / (lambda: keywords.replace(" ", "_")) + + +def flag(keywords): + """ + RETURN {keywords: True} + """ + return (keyword(keywords) / (lambda: True))(keywords.replace(" ", "_")) + + +def assign(key: str, value: ParserElement): + return keyword(key).suppress() + value(key.replace(" ", "_")) + + +def simple_op(op, args, kwargs): + if args is None: + kwargs[op] = {} + else: + kwargs[op] = args + return kwargs + + +def normal_op(op, args, kwargs): + output = Data(op=op) + args = listwrap(args) + if args and (not isinstance(args[0], dict) or args[0]): + output.args = args + if kwargs: + output.kwargs = kwargs + return from_data(output) + + +scrub_op = simple_op + + +def scrub(result): + if result is SQL_NULL: + return SQL_NULL + elif result == None: + return None + elif isinstance(result, text): + return result + elif isinstance(result, binary_type): + return result.decode("utf8") + elif isinstance(result, number_types): + return result + elif isinstance(result, Call): + kwargs = scrub(result.kwargs) + args = scrub(result.args) + if args is SQL_NULL: + null_locations.append((kwargs, result.op)) + return scrub_op(result.op, args, kwargs) + elif isinstance(result, 
dict) and not result: + return result + elif isinstance(result, list): + output = [rr for r in result for rr in [scrub(r)]] + + if not output: + return None + elif len(output) == 1: + return output[0] + else: + for i, v in enumerate(output): + if v is SQL_NULL: + null_locations.append((output, i)) + return output + else: + # ATTEMPT A DICT INTERPRETATION + try: + kv_pairs = list(result.items()) + except Exception as c: + print(c) + output = {k: vv for k, v in kv_pairs for vv in [scrub(v)] if not is_null(vv)} + if isinstance(result, dict) or output: + for k, v in output.items(): + if v is SQL_NULL: + null_locations.append((output, k)) + return output + return scrub(list(result)) + + +def _chunk(values, size): + acc = [] + for v in values: + acc.append(v) + if len(acc) == size: + yield acc + acc = [] + if acc: + yield acc + + +def to_lambda(tokens): + params, op, expr = list(tokens) + return Call("lambda", [expr], {"params": list(params)}) + + +def to_json_operator(tokens): + # ARRANGE INTO {op: params} FORMAT + length = len(tokens.tokens) + if length == 2: + if tokens.tokens[1].type.parser_name == "cast": + return Call("cast", list(tokens), {}) + # UNARY OPERATOR + op = tokens.tokens[0].type.parser_name + if op == "neg" and is_number(tokens[1]): + return -tokens[1] + return Call(op, [tokens[1]], {}) + elif length == 5: + # TRINARY OPERATOR + return Call( + tokens.tokens[1].type.parser_name, [tokens[0], tokens[2], tokens[4]], {} + ) + + op = tokens[1] + if not isinstance(op, text): + op = op.type.parser_name + op = binary_ops.get(op, op) + if op == "eq": + if tokens[2] is SQL_NULL: + return Call("missing", tokens[0], {}) + elif tokens[0] is SQL_NULL: + return Call("missing", tokens[2], {}) + elif op == "neq": + if tokens[2] is SQL_NULL: + return Call("exists", tokens[0], {}) + elif tokens[0] is SQL_NULL: + return Call("exists", tokens[2], {}) + elif op == "eq!": + if tokens[2] is SQL_NULL: + return Call("missing", tokens[0], {}) + elif tokens[0] is SQL_NULL: + return 
Call("missing", tokens[2], {}) + elif op == "ne!": + if tokens[2] is SQL_NULL: + return Call("exists", tokens[0], {}) + elif tokens[0] is SQL_NULL: + return Call("exists", tokens[2], {}) + elif op == "is": + if tokens[2] is SQL_NULL: + return Call("missing", tokens[0], {}) + else: + return Call("exists", tokens[0], {}) + elif op == "is_not": + if tokens[2] is SQL_NULL: + return Call("exists", tokens[0], {}) + else: + return Call("missing", tokens[0], {}) + + operands = [tokens[0], tokens[2]] + binary_op = Call(op, operands, {}) + + if op in {"add", "mul", "and", "or"}: + # ASSOCIATIVE OPERATORS + acc = [] + for operand in operands: + while isinstance(operand, ParseResults) and isinstance(operand.type, Group): + # PARENTHESES CAUSE EXTRA GROUP LAYERS + operand = operand[0] + if isinstance(operand, ParseResults) and isinstance( + operand.type, Forward + ): + operand = operand[0] + + if isinstance(operand, Call) and operand.op == op: + acc.extend(operand.args) + elif isinstance(operand, list): + acc.append(operand) + elif isinstance(operand, dict) and operand.get(op): + acc.extend(operand.get(op)) + else: + acc.append(operand) + binary_op = Call(op, acc, {}) + return binary_op + + +def to_offset(tokens): + expr, offset = tokens.tokens + return Call("get", [expr, offset], {}) + + +def to_window_mod(tokens): + expr, window = tokens.tokens + return Call("value", [expr], {**window}) + + +def to_tuple_call(tokens): + # IS THIS ONE VALUE IN (), OR MANY? 
+ tokens = list(tokens) + if len(tokens) == 1: + return [tokens[0]] + if all(isinstance(r, number_types) for r in tokens): + return [tokens] + if all( + isinstance(r, number_types) or (is_data(r) and "literal" in r.keys()) + for r in tokens + ): + candidate = {"literal": [r["literal"] if is_data(r) else r for r in tokens]} + return candidate + + return [tokens] + + +binary_ops = { + "::": "cast", + "COLLATE": "collate", + "||": "concat", + "*": "mul", + "/": "div", + "%": "mod", + "+": "add", + "-": "sub", + "&": "binary_and", + "|": "binary_or", + "<": "lt", + "<=": "lte", + ">": "gt", + ">=": "gte", + "=": "eq", + "==": "eq", + "is distinct from": "eq!", # https://sparkbyexamples.com/apache-hive/hive-relational-arithmetic-logical-operators/ + "is_distinct_from": "eq!", + "is not distinct from": "ne!", + "is_not_distinct_from": "ne!", + "<=>": "eq!", # https://sparkbyexamples.com/apache-hive/hive-relational-arithmetic-logical-operators/ + "!=": "neq", + "<>": "neq", + "not in": "nin", + "in": "in", + "is_not": "neq", + "is": "eq", + "similar_to": "similar_to", + "like": "like", + "rlike": "rlike", + "not like": "not_like", + "not_like": "not_like", + "not rlike": "not_rlike", + "not_rlike": "not_rlike", + "not_simlilar_to": "not_similar_to", + "or": "or", + "and": "and", + "->": "lambda", + "union": "union", + "union_all": "union_all", + "union all": "union_all", + "except": "except", + "minus": "minus", + "intersect": "intersect", +} + +is_set_op = ("union", "union_all", "except", "minus", "intersect") + + +def to_trim_call(tokens): + frum = tokens["from"] + if not frum: + return Call("trim", [tokens["chars"]], {"direction": tokens["direction"]}) + return Call( + "trim", + [frum], + {"characters": tokens["chars"], "direction": tokens["direction"]}, + ) + + +def to_json_call(tokens): + # ARRANGE INTO {op: params} FORMAT + op = tokens["op"].lower() + op = binary_ops.get(op, op) + params = tokens["params"] + if isinstance(params, (dict, str, int, Call)): + args = 
[params] + else: + args = list(params) + + kwargs = {k: v for k, v in tokens.items() if k not in ("op", "params")} + + return ParseResults( + tokens.type, + tokens.start, + tokens.end, + [Call(op, args, kwargs)], + tokens.failures, + ) + + +def to_interval_call(tokens): + # ARRANGE INTO {interval: [amount, type]} FORMAT + params = tokens["params"] + if not params: + params = {} + if params.length() == 2: + return Call("interval", params, {}) + + return Call("add", [Call("interval", p, {}) for p in _chunk(params, size=2)], {}) + + +def to_case_call(tokens): + cases = list(tokens["case"]) + elze = tokens["else"] + if elze != None: + cases.append(elze) + return Call("case", cases, {}) + + +def to_switch_call(tokens): + # CONVERT TO CLASSIC CASE STATEMENT + value = tokens["value"] + acc = [] + for c in list(tokens["case"]): + acc.append(Call("when", [Call("eq", [value] + c.args, {})], c.kwargs)) + elze = tokens["else"] + if elze != None: + acc.append(elze) + return Call("case", acc, {}) + + +def to_when_call(tokens): + tok = tokens + return Call("when", [tok["when"]], {"then": tok["then"]}) + + +def to_join_call(tokens): + op = " ".join(tokens["op"]) + if tokens["join"]["name"]: + output = {op: { + "name": tokens["join"]["name"], + "value": tokens["join"]["value"], + }} + else: + output = {op: tokens["join"]} + + output["on"] = tokens["on"] + output["using"] = tokens["using"] + return output + + +def to_expression_call(tokens): + if set(tokens.keys()) & {"over", "within", "filter"}: + return + + return ParseResults( + tokens.type, + tokens.start, + tokens.end, + listwrap(tokens["value"]), + tokens.failures, + ) + + +def to_over(tokens): + if not tokens: + return {} + + +def to_alias(tokens): + cols = tokens["col"] + name = tokens["name"] + if cols: + return {name: cols} + return name + + +def to_top_clause(tokens): + value = tokens["value"] + if not value: + return None + + value = value.value() + if tokens["ties"]: + output = {} + output["ties"] = True + if 
tokens["percent"]: + output["percent"] = value + else: + output["value"] = value + return output + elif tokens["percent"]: + return {"percent": value} + else: + return [value] + + +def to_row(tokens): + columns = list(tokens) + if len(columns) > 1: + return {"select": [{"value": v[0]} for v in columns]} + else: + return {"select": {"value": columns[0]}} + + +def get_literal(value): + if isinstance(value, (int, float)): + return value + elif isinstance(value, Call): + return + elif value is SQL_NULL: + return value + elif "literal" in value: + return value["literal"] + + +def to_values(tokens): + rows = list(tokens) + if len(rows) > 1: + values = [ + [get_literal(s["value"]) for s in listwrap(row["select"])] for row in rows + ] + if all(flatten(values)): + return {"from": {"literal": values}} + return {"union_all": list(tokens)} + else: + return rows + + +def to_stack(tokens): + width = tokens["width"] + args = listwrap(tokens["args"]) + return Call("stack", args, {"width": width}) + + +def to_array(tokens): + types = list(tokens["type"]) + args = list(tokens["args"]) + output = Call("create_array", args, {}) + if types: + output = Call("cast", [output, Call("array", types, {})], {}) + return output + + +def to_map(tokens): + keys = tokens["keys"] + values = tokens["values"] + return Call("create_map", [keys, values], {}) + + +def to_struct(tokens): + types = list(tokens["types"]) + args = list(d for a in tokens["args"] for d in [a if a["name"] else a["value"]]) + + output = Call("create_struct", args, {}) + if types: + output = Call("cast", [output, Call("struct", types, {})], {}) + return output + + +def to_select_call(tokens): + expr = tokens["value"] + if expr == "*": + return ["*"] + try: + call = expr[0][0] + if call.op == "value": + return {"name": tokens["name"], "value": call.args, **call.kwargs} + except: + pass + + +def to_union_call(tokens): + unions = tokens["union"] + if isinstance(unions, dict): + return unions + elif unions.type.parser_name == 
"unordered sql": + output = {k: v for k, v in unions.items()} # REMOVE THE Group() + else: + unions = list(unions) + sources = [unions[i] for i in range(0, len(unions), 2)] + operators = ["_".join(unions[i]) for i in range(1, len(unions), 2)] + acc = sources[0] + last_union = None + for op, so in list(zip(operators, sources[1:])): + if op == last_union and "union" in op: + acc[op] = acc[op] + [so] + else: + acc = {op: [acc, so]} + last_union = op + + if not tokens["orderby"] and not tokens["offset"] and not tokens["limit"]: + return acc + else: + output = {"from": acc} + + output["orderby"] = tokens["orderby"] + output["limit"] = tokens["limit"] + output["offset"] = tokens["offset"] + output["fetch"] = tokens["fetch"] + return output + + +def to_insert_call(tokens): + options = { + k: v for k, v in tokens.items() if k not in ["columns", "table", "query"] + } + query = tokens["query"] + columns = tokens["columns"] + try: + values = query["from"]["literal"] + if values: + if columns: + data = [dict(zip(columns, row)) for row in values] + return Call("insert", [tokens["table"]], {"values": data, **options}) + else: + return Call("insert", [tokens["table"]], {"values": values, **options}) + except Exception: + pass + + return Call( + "insert", [tokens["table"]], {"columns": columns, "query": query, **options} + ) + + +def to_query(tokens): + output = tokens["query"][0] + try: + output["with"] = tokens["with"] + output["with_recursive"] = tokens["with_recursive"] + + return output + except Exception as cause: + return + + +def to_table(tokens): + output = dict(tokens) + if len(list(output.keys())) > 1: + return output + else: + return output["value"] + + +def unquote(tokens): + val = tokens[0] + if val.startswith("'") and val.endswith("'"): + val = "'" + val[1:-1].replace("''", "\\'") + "'" + elif val.startswith('"') and val.endswith('"'): + val = '"' + val[1:-1].replace('""', '\\"') + '"' + elif val.startswith("`") and val.endswith("`"): + val = '"' + 
val[1:-1].replace("``", "`").replace('"', '\\"') + '"' + elif val.startswith("[") and val.endswith("]"): + val = '"' + val[1:-1].replace("]]", "]").replace('"', '\\"') + '"' + elif val.startswith("+"): + val = val[1:] + un = ast.literal_eval(val).replace(".", "\\.") + return un + + +def to_string(tokens): + val = tokens[0] + val = "'" + val[1:-1].replace("''", "\\'") + "'" + return {"literal": ast.literal_eval(val)} + + +# NUMBERS +real_num = ( + Regex(r"[+-]?(\d+\.\d*|\.\d+)([eE][+-]?\d+)?").set_parser_name("float") + / (lambda t: float(t[0])) +) + + +def parse_int(tokens): + if "e" in tokens[0].lower(): + return int(float(tokens[0])) + else: + return int(tokens[0]) + + +int_num = Regex(r"[+-]?\d+([eE]\+?\d+)?").set_parser_name("int") / parse_int +hex_num = ( + Regex(r"0x[0-9a-fA-F]+").set_parser_name("hex") / (lambda t: {"hex": t[0][2:]}) +) + +# STRINGS +ansi_string = Regex(r"\'(\'\'|[^'])*\'") / to_string +mysql_doublequote_string = Regex(r'\"(\"\"|[^"])*\"') / to_string + +# BASIC IDENTIFIERS +ansi_ident = Regex(r'\"(\"\"|[^"])*\"') / unquote +mysql_backtick_ident = Regex(r"\`(\`\`|[^`])*\`") / unquote +sqlserver_ident = Regex(r"\[(\]\]|[^\]])*\]") / unquote diff --git a/mo_sql_parsing/windows.py b/mo_sql_parsing/windows.py new file mode 100644 index 0000000..9bf818a --- /dev/null +++ b/mo_sql_parsing/windows.py @@ -0,0 +1,107 @@ +# encoding: utf-8 +# +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this file, +# You can obtain one at http://mozilla.org/MPL/2.0/. 
+# +# Contact: Kyle Lahnakoski (kyle@lahnakoski.com) +# + +from __future__ import absolute_import, division, unicode_literals + +from mo_parsing.infix import delimited_list + +from mo_sql_parsing.keywords import * +from mo_sql_parsing.utils import * + + +# https://docs.microsoft.com/en-us/sql/t-sql/queries/select-over-clause-transact-sql?view=sql-server-ver15 + + +def _to_bound_call(tokens): + zero = tokens["zero"] + if zero: + return {"min": 0, "max": 0} + + direction = scrub(tokens["direction"]) + limit = scrub(tokens["limit"]) + if direction == "preceding": + if limit == "unbounded": + return {"max": 0} + elif is_data(limit): + return {"min": {"neg": limit}, "max": 0} + else: + return {"min": -limit, "max": 0} + else: # following + if limit == "unbounded": + return {"min": 0} + elif is_data(limit): + return {"min": {"neg": limit}, "max": 0} + else: + return {"min": 0, "max": limit} + + +def _to_between_call(tokens): + minn = scrub(tokens["min"]) + maxx = scrub(tokens["max"]) + + if maxx.get("max") == 0: + # following + return { + "min": minn.get("min"), + "max": maxx.get("min"), + } + elif minn.get("min") == 0: + # preceding + return {"min": minn.get("max"), "max": maxx.get("max")} + else: + return { + "min": minn.get("min"), + "max": maxx.get("max"), + } + + +UNBOUNDED = keyword("unbounded") +PRECEDING = keyword("preceding") +FOLLOWING = keyword("following") +CURRENT_ROW = keyword("current row") +ROWS = keyword("rows") +RANGE = keyword("range") + + +def window(expr, var_name, sort_column): + bound_row = ( + CURRENT_ROW("zero") + | (UNBOUNDED | int_num)("limit") + (PRECEDING | FOLLOWING)("direction") + ) / _to_bound_call + bound_expr = ( + CURRENT_ROW("zero") + | (UNBOUNDED | expr)("limit") + (PRECEDING | FOLLOWING)("direction") + ) / _to_bound_call + between_row = ( + BETWEEN + bound_row("min") + AND + bound_row("max") + ) / _to_between_call + between_expr = ( + BETWEEN + bound_expr("min") + AND + bound_expr("max") + ) / _to_between_call + + row_clause = 
(ROWS.suppress() + (between_row | bound_row)) | ( + RANGE.suppress() + (between_expr | bound_expr) + ) + + over_clause = ( + LB + + Optional(PARTITION_BY + delimited_list(Group(expr))("partitionby")) + + Optional(ORDER_BY + delimited_list(Group(sort_column))("orderby")) + + Optional(row_clause("range")) + + RB + ) + + window_clause = Optional(( + WITHIN_GROUP + + LB + + Optional(ORDER_BY + delimited_list(Group(sort_column))("orderby")) + + RB + )("within")) + ((OVER + (over_clause | var_name) / to_over)("over")) + + return window_clause, over_clause diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..92b3841 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,4 @@ +mo-future +mo-dots==8.20.21357 +mo-parsing +mo-imports diff --git a/run.py b/run.py new file mode 100644 index 0000000..3da498c --- /dev/null +++ b/run.py @@ -0,0 +1,6 @@ +import mo_sql_parsing as parser +q = 'SELECT p.Name, v.Name FROM Production.Product p JOIN Purchasing.ProductVendor pv ON p.ProductID = pv.ProductID JOIN Purchasing.Vendor v ON pv.BusinessEntityID = v.BusinessEntityID WHERE ProductSubcategoryID = 15 ORDER BY v.Name;' + +res = parser.parse(q) + +print(res) \ No newline at end of file