You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
618 lines
17 KiB
618 lines
17 KiB
# encoding: utf-8
|
|
#
|
|
# This Source Code Form is subject to the terms of the Mozilla Public
|
|
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
|
|
# You can obtain one at http://mozilla.org/MPL/2.0/.
|
|
#
|
|
# Contact: Kyle Lahnakoski (kyle@lahnakoski.com)
|
|
# Bill Sun 2022 - 2023
|
|
|
|
import ast
|
|
|
|
from mo_dots import is_data, is_null, Data, from_data
|
|
from mo_future import text, number_types, binary_type, flatten
|
|
from mo_imports import expect
|
|
from mo_parsing import *
|
|
from mo_parsing.utils import is_number, listwrap
|
|
|
|
unary_ops = expect("unary_ops")
|
|
|
|
|
|
class Call(object):
|
|
__slots__ = ["op", "args", "kwargs"]
|
|
|
|
def __init__(self, op, args, kwargs):
|
|
self.op = op
|
|
self.args = args
|
|
self.kwargs = kwargs
|
|
|
|
|
|
IDENT_CHAR = Regex("[@_$0-9A-Za-zÀ-ÖØ-öø-ƿ]").expr.parser_config.include
|
|
FIRST_IDENT_CHAR = "".join(set(IDENT_CHAR) - set("0123456789"))
|
|
SQL_NULL = Call("null", [], {})
|
|
|
|
null_locations = []
|
|
|
|
|
|
def keyword(keywords):
|
|
return And([
|
|
Keyword(k, caseless=True) for k in keywords.split(" ")
|
|
]).set_parser_name(keywords) / (lambda: keywords.replace(" ", "_"))
|
|
|
|
|
|
def flag(keywords):
|
|
"""
|
|
RETURN {keywords: True}
|
|
"""
|
|
return (keyword(keywords) / (lambda: True))(keywords.replace(" ", "_"))
|
|
|
|
|
|
def assign(key: str, value: ParserElement):
|
|
return keyword(key).suppress() + value(key.replace(" ", "_"))
|
|
|
|
|
|
def simple_op(op, args, kwargs):
|
|
if args is None:
|
|
kwargs[op] = {}
|
|
else:
|
|
kwargs[op] = args
|
|
return kwargs
|
|
|
|
|
|
def normal_op(op, args, kwargs):
|
|
output = Data(op=op)
|
|
args = listwrap(args)
|
|
if args and (not isinstance(args[0], dict) or args[0]):
|
|
output.args = args
|
|
if kwargs:
|
|
output.kwargs = kwargs
|
|
return from_data(output)
|
|
|
|
|
|
scrub_op = simple_op
|
|
|
|
|
|
def scrub(result):
|
|
if result is SQL_NULL:
|
|
return SQL_NULL
|
|
elif result == None:
|
|
return None
|
|
elif isinstance(result, text):
|
|
return result
|
|
elif isinstance(result, binary_type):
|
|
return result.decode("utf8")
|
|
elif isinstance(result, number_types):
|
|
return result
|
|
elif isinstance(result, Call):
|
|
kwargs = scrub(result.kwargs)
|
|
args = scrub(result.args)
|
|
if args is SQL_NULL:
|
|
null_locations.append((kwargs, result.op))
|
|
return scrub_op(result.op, args, kwargs)
|
|
elif isinstance(result, dict) and not result:
|
|
return result
|
|
elif isinstance(result, list):
|
|
output = [rr for r in result for rr in [scrub(r)]]
|
|
|
|
if not output:
|
|
return None
|
|
elif len(output) == 1:
|
|
return output[0]
|
|
else:
|
|
for i, v in enumerate(output):
|
|
if v is SQL_NULL:
|
|
null_locations.append((output, i))
|
|
return output
|
|
else:
|
|
# ATTEMPT A DICT INTERPRETATION
|
|
try:
|
|
kv_pairs = list(result.items())
|
|
except Exception as c:
|
|
print(c)
|
|
output = {k: vv for k, v in kv_pairs for vv in [scrub(v)] if not is_null(vv)}
|
|
if isinstance(result, dict) or output:
|
|
for k, v in output.items():
|
|
if v is SQL_NULL:
|
|
null_locations.append((output, k))
|
|
return output
|
|
return scrub(list(result))
|
|
|
|
|
|
def _chunk(values, size):
|
|
acc = []
|
|
for v in values:
|
|
acc.append(v)
|
|
if len(acc) == size:
|
|
yield acc
|
|
acc = []
|
|
if acc:
|
|
yield acc
|
|
|
|
|
|
def to_lambda(tokens):
|
|
params, op, expr = list(tokens)
|
|
return Call("lambda", [expr], {"params": list(params)})
|
|
|
|
|
|
def to_json_operator(tokens):
|
|
# ARRANGE INTO {op: params} FORMAT
|
|
length = len(tokens.tokens)
|
|
if length == 2:
|
|
if tokens.tokens[1].type.parser_name == "cast":
|
|
return Call("cast", list(tokens), {})
|
|
# UNARY OPERATOR
|
|
op = tokens.tokens[0].type.parser_name
|
|
if op == "neg" and is_number(tokens[1]):
|
|
return -tokens[1]
|
|
return Call(op, [tokens[1]], {})
|
|
elif length == 5:
|
|
# TRINARY OPERATOR
|
|
return Call(
|
|
tokens.tokens[1].type.parser_name, [tokens[0], tokens[2], tokens[4]], {}
|
|
)
|
|
|
|
op = tokens[1]
|
|
if not isinstance(op, text):
|
|
op = op.type.parser_name
|
|
op = binary_ops.get(op, op)
|
|
if op == "eq":
|
|
if tokens[2] is SQL_NULL:
|
|
return Call("missing", tokens[0], {})
|
|
elif tokens[0] is SQL_NULL:
|
|
return Call("missing", tokens[2], {})
|
|
elif op == "neq":
|
|
if tokens[2] is SQL_NULL:
|
|
return Call("exists", tokens[0], {})
|
|
elif tokens[0] is SQL_NULL:
|
|
return Call("exists", tokens[2], {})
|
|
elif op == "eq!":
|
|
if tokens[2] is SQL_NULL:
|
|
return Call("missing", tokens[0], {})
|
|
elif tokens[0] is SQL_NULL:
|
|
return Call("missing", tokens[2], {})
|
|
elif op == "ne!":
|
|
if tokens[2] is SQL_NULL:
|
|
return Call("exists", tokens[0], {})
|
|
elif tokens[0] is SQL_NULL:
|
|
return Call("exists", tokens[2], {})
|
|
elif op == "is":
|
|
if tokens[2] is SQL_NULL:
|
|
return Call("missing", tokens[0], {})
|
|
else:
|
|
return Call("exists", tokens[0], {})
|
|
elif op == "is_not":
|
|
if tokens[2] is SQL_NULL:
|
|
return Call("exists", tokens[0], {})
|
|
else:
|
|
return Call("missing", tokens[0], {})
|
|
|
|
operands = [tokens[0], tokens[2]]
|
|
binary_op = Call(op, operands, {})
|
|
|
|
if op in {"add", "mul", "and", "or"}:
|
|
# ASSOCIATIVE OPERATORS
|
|
acc = []
|
|
for operand in operands:
|
|
while isinstance(operand, ParseResults) and isinstance(operand.type, Group):
|
|
# PARENTHESES CAUSE EXTRA GROUP LAYERS
|
|
operand = operand[0]
|
|
if isinstance(operand, ParseResults) and isinstance(
|
|
operand.type, Forward
|
|
):
|
|
operand = operand[0]
|
|
|
|
if isinstance(operand, Call) and operand.op == op:
|
|
acc.extend(operand.args)
|
|
elif isinstance(operand, list):
|
|
acc.append(operand)
|
|
elif isinstance(operand, dict) and operand.get(op):
|
|
acc.extend(operand.get(op))
|
|
else:
|
|
acc.append(operand)
|
|
binary_op = Call(op, acc, {})
|
|
return binary_op
|
|
|
|
|
|
def to_offset(tokens):
|
|
expr, offset = tokens.tokens
|
|
return Call("get", [expr, offset], {})
|
|
|
|
|
|
def to_window_mod(tokens):
|
|
expr, window = tokens.tokens
|
|
return Call("value", [expr], {**window})
|
|
|
|
|
|
def to_tuple_call(tokens):
|
|
# IS THIS ONE VALUE IN (), OR MANY?
|
|
tokens = list(tokens)
|
|
if len(tokens) == 1:
|
|
return [tokens[0]]
|
|
if all(isinstance(r, number_types) for r in tokens):
|
|
return [tokens]
|
|
if all(
|
|
isinstance(r, number_types) or (is_data(r) and "literal" in r.keys())
|
|
for r in tokens
|
|
):
|
|
candidate = {"literal": [r["literal"] if is_data(r) else r for r in tokens]}
|
|
return candidate
|
|
|
|
return [tokens]
|
|
|
|
|
|
binary_ops = {
|
|
"::": "cast",
|
|
"COLLATE": "collate",
|
|
"||": "concat",
|
|
"*": "mul",
|
|
"/": "div",
|
|
"%": "mod",
|
|
"+": "add",
|
|
"-": "sub",
|
|
"&": "binary_and",
|
|
"|": "binary_or",
|
|
"<": "lt",
|
|
"<=": "lte",
|
|
">": "gt",
|
|
">=": "gte",
|
|
"=": "eq",
|
|
"==": "eq",
|
|
"is distinct from": "eq!", # https://sparkbyexamples.com/apache-hive/hive-relational-arithmetic-logical-operators/
|
|
"is_distinct_from": "eq!",
|
|
"is not distinct from": "ne!",
|
|
"is_not_distinct_from": "ne!",
|
|
"<=>": "eq!", # https://sparkbyexamples.com/apache-hive/hive-relational-arithmetic-logical-operators/
|
|
"!=": "neq",
|
|
"<>": "neq",
|
|
"not in": "nin",
|
|
"in": "in",
|
|
"is_not": "neq",
|
|
"is": "eq",
|
|
"similar_to": "similar_to",
|
|
"like": "like",
|
|
"rlike": "rlike",
|
|
"not like": "not_like",
|
|
"not_like": "not_like",
|
|
"not rlike": "not_rlike",
|
|
"not_rlike": "not_rlike",
|
|
"not_simlilar_to": "not_similar_to",
|
|
"or": "or",
|
|
"and": "and",
|
|
"->": "lambda",
|
|
"union": "union",
|
|
"union_all": "union_all",
|
|
"union all": "union_all",
|
|
"except": "except",
|
|
"minus": "minus",
|
|
"intersect": "intersect",
|
|
}
|
|
|
|
is_set_op = ("union", "union_all", "except", "minus", "intersect")
|
|
|
|
|
|
def to_trim_call(tokens):
|
|
frum = tokens["from"]
|
|
if not frum:
|
|
return Call("trim", [tokens["chars"]], {"direction": tokens["direction"]})
|
|
return Call(
|
|
"trim",
|
|
[frum],
|
|
{"characters": tokens["chars"], "direction": tokens["direction"]},
|
|
)
|
|
|
|
|
|
def to_json_call(tokens):
|
|
# ARRANGE INTO {op: params} FORMAT
|
|
op = tokens["op"].lower()
|
|
op = binary_ops.get(op, op)
|
|
params = tokens["params"]
|
|
if isinstance(params, (dict, str, int, Call)):
|
|
args = [params]
|
|
else:
|
|
args = list(params)
|
|
|
|
kwargs = {k: v for k, v in tokens.items() if k not in ("op", "params")}
|
|
|
|
return ParseResults(
|
|
tokens.type,
|
|
tokens.start,
|
|
tokens.end,
|
|
[Call(op, args, kwargs)],
|
|
tokens.failures,
|
|
)
|
|
|
|
|
|
def to_interval_call(tokens):
|
|
# ARRANGE INTO {interval: [amount, type]} FORMAT
|
|
params = tokens["params"]
|
|
if not params:
|
|
params = {}
|
|
if params.length() == 2:
|
|
return Call("interval", params, {})
|
|
|
|
return Call("add", [Call("interval", p, {}) for p in _chunk(params, size=2)], {})
|
|
|
|
|
|
def to_case_call(tokens):
|
|
cases = list(tokens["case"])
|
|
elze = tokens["else"]
|
|
if elze != None:
|
|
cases.append(elze)
|
|
return Call("case", cases, {})
|
|
|
|
|
|
def to_switch_call(tokens):
|
|
# CONVERT TO CLASSIC CASE STATEMENT
|
|
value = tokens["value"]
|
|
acc = []
|
|
for c in list(tokens["case"]):
|
|
acc.append(Call("when", [Call("eq", [value] + c.args, {})], c.kwargs))
|
|
elze = tokens["else"]
|
|
if elze != None:
|
|
acc.append(elze)
|
|
return Call("case", acc, {})
|
|
|
|
|
|
def to_when_call(tokens):
|
|
tok = tokens
|
|
return Call("when", [tok["when"]], {"then": tok["then"]})
|
|
|
|
|
|
def to_join_call(tokens):
|
|
op = " ".join(tokens["op"])
|
|
if tokens["join"]["name"]:
|
|
output = {op: {
|
|
"name": tokens["join"]["name"],
|
|
"value": tokens["join"]["value"],
|
|
}}
|
|
else:
|
|
output = {op: tokens["join"]}
|
|
|
|
output["on"] = tokens["on"]
|
|
output["using"] = tokens["using"]
|
|
return output
|
|
|
|
|
|
def to_expression_call(tokens):
|
|
if set(tokens.keys()) & {"over", "within", "filter"}:
|
|
return
|
|
|
|
return ParseResults(
|
|
tokens.type,
|
|
tokens.start,
|
|
tokens.end,
|
|
listwrap(tokens["value"]),
|
|
tokens.failures,
|
|
)
|
|
|
|
|
|
def to_over(tokens):
|
|
if not tokens:
|
|
return {}
|
|
|
|
|
|
def to_alias(tokens):
|
|
cols = tokens["col"]
|
|
name = tokens["name"]
|
|
if cols:
|
|
return {name: cols}
|
|
return name
|
|
|
|
|
|
def to_top_clause(tokens):
|
|
value = tokens["value"]
|
|
if not value:
|
|
return None
|
|
|
|
value = value.value()
|
|
if tokens["ties"]:
|
|
output = {}
|
|
output["ties"] = True
|
|
if tokens["percent"]:
|
|
output["percent"] = value
|
|
else:
|
|
output["value"] = value
|
|
return output
|
|
elif tokens["percent"]:
|
|
return {"percent": value}
|
|
else:
|
|
return [value]
|
|
|
|
|
|
def to_row(tokens):
|
|
columns = list(tokens)
|
|
if len(columns) > 1:
|
|
return {"select": [{"value": v[0]} for v in columns]}
|
|
else:
|
|
return {"select": {"value": columns[0]}}
|
|
|
|
|
|
def get_literal(value):
|
|
if isinstance(value, (int, float)):
|
|
return value
|
|
elif isinstance(value, Call):
|
|
return
|
|
elif value is SQL_NULL:
|
|
return value
|
|
elif "literal" in value:
|
|
return value["literal"]
|
|
|
|
|
|
def to_values(tokens):
|
|
rows = list(tokens)
|
|
if len(rows) > 1:
|
|
values = [
|
|
[get_literal(s["value"]) for s in listwrap(row["select"])] for row in rows
|
|
]
|
|
if all(flatten(values)):
|
|
return {"from": {"literal": values}}
|
|
return {"union_all": list(tokens)}
|
|
else:
|
|
return rows
|
|
|
|
|
|
def to_stack(tokens):
|
|
width = tokens["width"]
|
|
args = listwrap(tokens["args"])
|
|
return Call("stack", args, {"width": width})
|
|
|
|
|
|
def to_array(tokens):
|
|
types = list(tokens["type"])
|
|
args = list(tokens["args"])
|
|
output = Call("create_array", args, {})
|
|
if types:
|
|
output = Call("cast", [output, Call("array", types, {})], {})
|
|
return output
|
|
|
|
|
|
def to_map(tokens):
|
|
keys = tokens["keys"]
|
|
values = tokens["values"]
|
|
return Call("create_map", [keys, values], {})
|
|
|
|
|
|
def to_struct(tokens):
|
|
types = list(tokens["types"])
|
|
args = list(d for a in tokens["args"] for d in [a if a["name"] else a["value"]])
|
|
|
|
output = Call("create_struct", args, {})
|
|
if types:
|
|
output = Call("cast", [output, Call("struct", types, {})], {})
|
|
return output
|
|
|
|
|
|
def to_select_call(tokens):
|
|
expr = tokens["value"]
|
|
if expr == "*":
|
|
return ["*"]
|
|
try:
|
|
call = expr[0][0]
|
|
if call.op == "value":
|
|
return {"name": tokens["name"], "value": call.args, **call.kwargs}
|
|
except:
|
|
pass
|
|
|
|
|
|
def to_union_call(tokens):
|
|
unions = tokens["union"]
|
|
if isinstance(unions, dict):
|
|
return unions
|
|
elif unions.type.parser_name == "unordered sql":
|
|
output = {k: v for k, v in unions.items()} # REMOVE THE Group()
|
|
else:
|
|
unions = list(unions)
|
|
sources = [unions[i] for i in range(0, len(unions), 2)]
|
|
operators = ["_".join(unions[i]) for i in range(1, len(unions), 2)]
|
|
acc = sources[0]
|
|
last_union = None
|
|
for op, so in list(zip(operators, sources[1:])):
|
|
if op == last_union and "union" in op:
|
|
acc[op] = acc[op] + [so]
|
|
else:
|
|
acc = {op: [acc, so]}
|
|
last_union = op
|
|
|
|
if not tokens["orderby"] and not tokens["offset"] and not tokens["limit"]:
|
|
return acc
|
|
else:
|
|
output = {"from": acc}
|
|
|
|
output["orderby"] = tokens["orderby"]
|
|
output["limit"] = tokens["limit"]
|
|
output["offset"] = tokens["offset"]
|
|
output["fetch"] = tokens["fetch"]
|
|
output["outfile"] = tokens["outfile"]
|
|
return output
|
|
|
|
|
|
def to_insert_call(tokens):
|
|
options = {
|
|
k: v for k, v in tokens.items() if k not in ["columns", "table", "query"]
|
|
}
|
|
query = tokens["query"]
|
|
columns = tokens["columns"]
|
|
try:
|
|
values = query["from"]["literal"]
|
|
if values:
|
|
if columns:
|
|
data = [dict(zip(columns, row)) for row in values]
|
|
return Call("insert", [tokens["table"]], {"values": data, **options})
|
|
else:
|
|
return Call("insert", [tokens["table"]], {"values": values, **options})
|
|
except Exception:
|
|
pass
|
|
|
|
return Call(
|
|
"insert", [tokens["table"]], {"columns": columns, "query": query, **options}
|
|
)
|
|
|
|
|
|
def to_query(tokens):
|
|
output = tokens["query"][0]
|
|
try:
|
|
output["with"] = tokens["with"]
|
|
output["with_recursive"] = tokens["with_recursive"]
|
|
|
|
return output
|
|
except Exception as cause:
|
|
return
|
|
|
|
|
|
def to_table(tokens):
|
|
output = dict(tokens)
|
|
if len(list(output.keys())) > 1:
|
|
return output
|
|
else:
|
|
return output["value"]
|
|
|
|
|
|
def unquote(tokens):
|
|
val = tokens[0]
|
|
if val.startswith("'") and val.endswith("'"):
|
|
val = "'" + val[1:-1].replace("''", "\\'") + "'"
|
|
elif val.startswith('"') and val.endswith('"'):
|
|
val = '"' + val[1:-1].replace('""', '\\"') + '"'
|
|
elif val.startswith("`") and val.endswith("`"):
|
|
val = '"' + val[1:-1].replace("``", "`").replace('"', '\\"') + '"'
|
|
elif val.startswith("[") and val.endswith("]"):
|
|
val = '"' + val[1:-1].replace("]]", "]").replace('"', '\\"') + '"'
|
|
elif val.startswith("+"):
|
|
val = val[1:]
|
|
un = ast.literal_eval(val).replace(".", "\\.")
|
|
return un
|
|
|
|
|
|
def to_string(tokens):
|
|
val = tokens[0]
|
|
val = "'" + val[1:-1].replace("''", "\\'") + "'"
|
|
return {"literal": ast.literal_eval(val)}
|
|
|
|
|
|
# NUMBERS
|
|
real_num = (
|
|
Regex(r"[+-]?(\d+\.\d*|\.\d+)([eE][+-]?\d+)?").set_parser_name("float")
|
|
/ (lambda t: float(t[0]))
|
|
)
|
|
|
|
|
|
def parse_int(tokens):
|
|
if "e" in tokens[0].lower():
|
|
return int(float(tokens[0]))
|
|
else:
|
|
return int(tokens[0])
|
|
|
|
|
|
int_num = Regex(r"[+-]?\d+([eE]\+?\d+)?").set_parser_name("int") / parse_int
|
|
hex_num = (
|
|
Regex(r"0x[0-9a-fA-F]+").set_parser_name("hex") / (lambda t: {"hex": t[0][2:]})
|
|
)
|
|
|
|
# STRINGS
|
|
ansi_string = Regex(r"\'(\'\'|[^'])*\'") / to_string
|
|
aquery_doublequote_string = Regex(r'\"(\"\"|[^"])*\"') / to_string
|
|
|
|
# BASIC IDENTIFIERS
|
|
ansi_ident = Regex(r'\"(\"\"|[^"])*\"') / unquote
|
|
aquery_backtick_ident = Regex(r"\`(\`\`|[^`])*\`") / unquote
|