# encoding: utf-8
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
#
# Contact: Kyle Lahnakoski (kyle@lahnakoski.com)
# Bill Sun 2022 - 2023

import ast

from mo_dots import is_data, is_null, Data, from_data
from mo_future import text, number_types, binary_type, flatten
from mo_imports import expect
from mo_parsing import *
from mo_parsing.utils import is_number, listwrap

unary_ops = expect("unary_ops")


class Call(object):
    """
    A PARSED OPERATION: THE OPERATOR NAME, ITS POSITIONAL ARGUMENTS AND ITS
    KEYWORD ARGUMENTS.  scrub() CONVERTS THESE TO THE OUTPUT FORMAT
    """

    __slots__ = ["op", "args", "kwargs"]

    def __init__(self, op, args, kwargs):
        self.op = op
        self.args = args
        self.kwargs = kwargs


IDENT_CHAR = Regex("[@_$0-9A-Za-zÀ-ÖØ-öø-ƿ]").expr.parser_config.include
FIRST_IDENT_CHAR = "".join(set(IDENT_CHAR) - set("0123456789"))
# SINGLETON THAT REPRESENTS THE SQL NULL; ALWAYS COMPARED BY IDENTITY (is)
SQL_NULL = Call("null", [], {})

# (container, key) PAIRS RECORDED BY scrub() FOR EVERY PLACE SQL_NULL WAS LEFT
null_locations = []


def keyword(keywords):
    """
    RETURN A PARSER FOR THE SPACE-SEPARATED, CASE-INSENSITIVE KEYWORD SEQUENCE
    THE PARSE ACTION RETURNS THE KEYWORDS JOINED WITH UNDERSCORES
    """
    return And([
        Keyword(k, caseless=True) for k in keywords.split(" ")
    ]).set_parser_name(keywords) / (lambda: keywords.replace(" ", "_"))


def flag(keywords):
    """
    RETURN {keywords: True}
    """
    return (keyword(keywords) / (lambda: True))(keywords.replace(" ", "_"))


def assign(key: str, value: ParserElement):
    """
    RETURN A PARSER THAT MATCHES (AND SUPPRESSES) THE key KEYWORDS, FOLLOWED BY
    value, WHICH IS NAMED AFTER key (SPACES REPLACED WITH UNDERSCORES)
    """
    return keyword(key).suppress() + value(key.replace(" ", "_"))


def simple_op(op, args, kwargs):
    """
    ADD {op: args} TO kwargs (IN PLACE) AND RETURN kwargs
    WHEN args IS None, AN EMPTY DICT IS STORED INSTEAD
    """
    if args is None:
        kwargs[op] = {}
    else:
        kwargs[op] = args
    return kwargs


def normal_op(op, args, kwargs):
    """
    RETURN {"op": op, "args": [...], "kwargs": {...}}
    args IS OMITTED WHEN EMPTY (OR WHEN ITS FIRST ELEMENT IS AN EMPTY dict)
    kwargs IS OMITTED WHEN EMPTY
    """
    output = Data(op=op)
    args = listwrap(args)
    if args and (not isinstance(args[0], dict) or args[0]):
        output.args = args
    if kwargs:
        output.kwargs = kwargs
    return from_data(output)


# THE FUNCTION scrub() USES TO CONVERT A Call INTO OUTPUT
scrub_op = simple_op


def scrub(result):
    """
    RECURSIVELY CONVERT PARSE OUTPUT (Call, LISTS, DICT-LIKE RESULTS AND
    PRIMITIVES) INTO PLAIN PYTHON VALUES

    * SQL_NULL IS RETURNED AS-IS; EVERY CONTAINER LEFT HOLDING IT IS RECORDED
      IN null_locations AS (container, key)
    * EMPTY LISTS BECOME None, SINGLE-ELEMENT LISTS BECOME THAT ELEMENT
    * NULL VALUES ARE DROPPED FROM DICT-LIKE RESULTS
    """
    if result is SQL_NULL:
        return SQL_NULL
    elif result == None:
        # == (NOT is) IS KEPT ON PURPOSE; presumably mo_dots Null MUST ALSO MATCH
        return None
    elif isinstance(result, text):
        return result
    elif isinstance(result, binary_type):
        return result.decode("utf8")
    elif isinstance(result, number_types):
        return result
    elif isinstance(result, Call):
        kwargs = scrub(result.kwargs)
        args = scrub(result.args)
        if args is SQL_NULL:
            null_locations.append((kwargs, result.op))
        return scrub_op(result.op, args, kwargs)
    elif isinstance(result, dict) and not result:
        return result
    elif isinstance(result, list):
        output = [scrub(r) for r in result]

        if not output:
            return None
        elif len(output) == 1:
            return output[0]
        else:
            for i, v in enumerate(output):
                if v is SQL_NULL:
                    null_locations.append((output, i))
            return output
    else:
        # ATTEMPT A DICT INTERPRETATION
        try:
            kv_pairs = list(result.items())
        except Exception:
            # NO USABLE items(); kv_pairs WOULD BE UNDEFINED BELOW,
            # SO GO STRAIGHT TO THE SEQUENCE INTERPRETATION
            return scrub(list(result))

        output = {k: vv for k, v in kv_pairs for vv in [scrub(v)] if not is_null(vv)}
        if isinstance(result, dict) or output:
            for k, v in output.items():
                if v is SQL_NULL:
                    null_locations.append((output, k))
            return output
        return scrub(list(result))


def _chunk(values, size):
    """
    YIELD CONSECUTIVE LISTS OF size ELEMENTS FROM values
    THE LAST LIST MAY BE SHORTER
    """
    acc = []
    for v in values:
        acc.append(v)
        if len(acc) == size:
            yield acc
            acc = []
    if acc:
        yield acc


def to_lambda(tokens):
    """
    CONVERT (params, op, expr) TOKENS INTO A "lambda" Call
    """
    params, op, expr = list(tokens)
    return Call("lambda", [expr], {"params": list(params)})


def to_json_operator(tokens):
    """
    CONVERT OPERATOR TOKENS INTO A Call
    TWO TOKENS ARE A CAST OR UNARY OPERATOR, FIVE ARE A TRINARY OPERATOR,
    ANYTHING ELSE IS TREATED AS <left> <op> <right>
    """
    # ARRANGE INTO {op: params} FORMAT
    length = len(tokens.tokens)
    if length == 2:
        if tokens.tokens[1].type.parser_name == "cast":
            return Call("cast", list(tokens), {})
        # UNARY OPERATOR
        op = tokens.tokens[0].type.parser_name
        if op == "neg" and is_number(tokens[1]):
            # FOLD NEGATION OF A NUMERIC LITERAL INTO THE NUMBER
            return -tokens[1]
        return Call(op, [tokens[1]], {})
    elif length == 5:
        # TRINARY OPERATOR
        return Call(
            tokens.tokens[1].type.parser_name, [tokens[0], tokens[2], tokens[4]], {}
        )

    op = tokens[1]
    if not isinstance(op, text):
        op = op.type.parser_name
    op = binary_ops.get(op, op)
    if op in ("eq", "eq!"):
        # EQUALITY WITH NULL IS A "missing" TEST
        if tokens[2] is SQL_NULL:
            return Call("missing", tokens[0], {})
        elif tokens[0] is SQL_NULL:
            return Call("missing", tokens[2], {})
    elif op in ("neq", "ne!"):
        # INEQUALITY WITH NULL IS AN "exists" TEST
        if tokens[2] is SQL_NULL:
            return Call("exists", tokens[0], {})
        elif tokens[0] is SQL_NULL:
            return Call("exists", tokens[2], {})
    # NOTE(review): binary_ops MAPS "is" -> "eq" AND "is_not" -> "neq" ABOVE,
    # SO THE NEXT TWO BRANCHES LOOK UNREACHABLE - CONFIRM BEFORE REMOVING
    elif op == "is":
        if tokens[2] is SQL_NULL:
            return Call("missing", tokens[0], {})
        else:
            return Call("exists", tokens[0], {})
    elif op == "is_not":
        if tokens[2] is SQL_NULL:
            return Call("exists", tokens[0], {})
        else:
            return Call("missing", tokens[0], {})

    operands = [tokens[0], tokens[2]]
    binary_op = Call(op, operands, {})

    if op in {"add", "mul", "and", "or"}:
        # ASSOCIATIVE OPERATORS: MERGE NESTED CALLS OF THE SAME OPERATOR
        acc = []
        for operand in operands:
            while isinstance(operand, ParseResults) and isinstance(operand.type, Group):
                # PARENTHESES CAUSE EXTRA GROUP LAYERS
                operand = operand[0]
                if isinstance(operand, ParseResults) and isinstance(
                    operand.type, Forward
                ):
                    operand = operand[0]

            if isinstance(operand, Call) and operand.op == op:
                acc.extend(operand.args)
            elif isinstance(operand, list):
                acc.append(operand)
            elif isinstance(operand, dict) and operand.get(op):
                acc.extend(operand.get(op))
            else:
                acc.append(operand)
        binary_op = Call(op, acc, {})
    return binary_op


def to_offset(tokens):
    """
    CONVERT (expr, offset) TOKENS INTO A "get" Call
    """
    expr, offset = tokens.tokens
    return Call("get", [expr, offset], {})


def to_window_mod(tokens):
    """
    CONVERT (expr, window) TOKENS INTO A "value" Call; THE WINDOW ITEMS
    BECOME THE KEYWORD ARGUMENTS
    """
    expr, window = tokens.tokens
    return Call("value", [expr], {**window})


def to_tuple_call(tokens):
    """
    CONVERT THE CONTENT OF PARENTHESES
    * ONE TOKEN - RETURNED ALONE (WRAPPED IN A LIST)
    * ALL NUMBERS - RETURNED AS ONE LIST (WRAPPED IN A LIST)
    * NUMBERS AND {"literal": x} - MERGED INTO A SINGLE {"literal": [...]}
    * OTHERWISE - RETURNED AS ONE LIST (WRAPPED IN A LIST)
    """
    # IS THIS ONE VALUE IN (), OR MANY?
    tokens = list(tokens)
    if len(tokens) == 1:
        return [tokens[0]]
    if all(isinstance(r, number_types) for r in tokens):
        return [tokens]
    if all(
        isinstance(r, number_types) or (is_data(r) and "literal" in r.keys())
        for r in tokens
    ):
        candidate = {"literal": [r["literal"] if is_data(r) else r for r in tokens]}
        return candidate

    return [tokens]


# MAP FROM OPERATOR TEXT (OR PARSER NAME) TO THE OPERATOR NAME USED IN OUTPUT
# NOTE(review): "is distinct from" -> "eq!" AND "is not distinct from" -> "ne!"
# LOOK INVERTED RELATIVE TO "<=>" -> "eq!"; LEFT AS-IS - CONFIRM WITH THE TESTS
binary_ops = {
    "::": "cast",
    "COLLATE": "collate",
    "||": "concat",
    "*": "mul",
    "/": "div",
    "%": "mod",
    "+": "add",
    "-": "sub",
    "&": "binary_and",
    "|": "binary_or",
    "<": "lt",
    "<=": "lte",
    ">": "gt",
    ">=": "gte",
    "=": "eq",
    "==": "eq",
    "is distinct from": "eq!",  # https://sparkbyexamples.com/apache-hive/hive-relational-arithmetic-logical-operators/
    "is_distinct_from": "eq!",
    "is not distinct from": "ne!",
    "is_not_distinct_from": "ne!",
    "<=>": "eq!",  # https://sparkbyexamples.com/apache-hive/hive-relational-arithmetic-logical-operators/
    "!=": "neq",
    "<>": "neq",
    "not in": "nin",
    "in": "in",
    "is_not": "neq",
    "is": "eq",
    "similar_to": "similar_to",
    "like": "like",
    "rlike": "rlike",
    "not like": "not_like",
    "not_like": "not_like",
    "not rlike": "not_rlike",
    "not_rlike": "not_rlike",
    "not_similar_to": "not_similar_to",
    # MISSPELLED KEY KEPT IN CASE ANYTHING LOOKS IT UP
    "not_simlilar_to": "not_similar_to",
    "or": "or",
    "and": "and",
    "->": "lambda",
    "union": "union",
    "union_all": "union_all",
    "union all": "union_all",
    "except": "except",
    "minus": "minus",
    "intersect": "intersect",
}

is_set_op = ("union", "union_all", "except", "minus", "intersect")


def to_trim_call(tokens):
    """
    CONVERT TRIM TOKENS INTO A "trim" Call
    WITHOUT A "from" TOKEN, "chars" IS THE VALUE BEING TRIMMED
    """
    frum = tokens["from"]
    if not frum:
        return Call("trim", [tokens["chars"]], {"direction": tokens["direction"]})
    return Call(
        "trim",
        [frum],
        {"characters": tokens["chars"], "direction": tokens["direction"]},
    )


def to_json_call(tokens):
    """
    CONVERT FUNCTION-CALL TOKENS ("op", "params", PLUS ANY OTHER NAMED TOKENS)
    INTO A ParseResults HOLDING A SINGLE Call; THE OTHER NAMED TOKENS BECOME
    THE KEYWORD ARGUMENTS
    """
    # ARRANGE INTO {op: params} FORMAT
    op = tokens["op"].lower()
    op = binary_ops.get(op, op)
    params = tokens["params"]
    if isinstance(params, (dict, str, int, Call)):
        args = [params]
    else:
        args = list(params)

    kwargs = {k: v for k, v in tokens.items() if k not in ("op", "params")}

    return ParseResults(
        tokens.type,
        tokens.start,
        tokens.end,
        [Call(op, args, kwargs)],
        tokens.failures,
    )


def to_interval_call(tokens):
    """
    CONVERT INTERVAL TOKENS INTO AN "interval" Call; MORE THAN ONE
    (amount, type) PAIR BECOMES AN "add" OF "interval" CALLS
    """
    # ARRANGE INTO {interval: [amount, type]} FORMAT
    params = tokens["params"]
    if not params:
        # NOTE(review): A PLAIN dict HAS NO length() METHOD, SO THE NEXT LINE
        # RAISES AttributeError ON THIS PATH - CONFIRM THE INTENDED RESULT
        params = {}
    if params.length() == 2:
        return Call("interval", params, {})

    return Call("add", [Call("interval", p, {}) for p in _chunk(params, size=2)], {})


def to_case_call(tokens):
    """
    CONVERT CASE TOKENS INTO A "case" Call; THE ELSE VALUE (IF ANY) IS LAST
    """
    cases = list(tokens["case"])
    elze = tokens["else"]
    if elze != None:
        cases.append(elze)
    return Call("case", cases, {})


def to_switch_call(tokens):
    """
    CONVERT `CASE value WHEN x THEN y` INTO THE `CASE WHEN value = x THEN y` FORM
    """
    # CONVERT TO CLASSIC CASE STATEMENT
    value = tokens["value"]
    acc = []
    for c in list(tokens["case"]):
        acc.append(Call("when", [Call("eq", [value] + c.args, {})], c.kwargs))
    elze = tokens["else"]
    if elze != None:
        acc.append(elze)
    return Call("case", acc, {})


def to_when_call(tokens):
    """
    CONVERT WHEN/THEN TOKENS INTO A "when" Call WITH A "then" KEYWORD ARGUMENT
    """
    tok = tokens
    return Call("when", [tok["when"]], {"then": tok["then"]})


def to_join_call(tokens):
    """
    RETURN {<join op>: <table>, "on": ..., "using": ...}
    THE TABLE IS REDUCED TO ITS "name" AND "value" WHEN IT HAS A NAME
    """
    op = " ".join(tokens["op"])
    if tokens["join"]["name"]:
        output = {op: {
            "name": tokens["join"]["name"],
            "value": tokens["join"]["value"],
        }}
    else:
        output = {op: tokens["join"]}

    output["on"] = tokens["on"]
    output["using"] = tokens["using"]
    return output


def to_expression_call(tokens):
    """
    UNWRAP THE "value" TOKEN INTO A NEW ParseResults, UNLESS ANY OF
    "over", "within" OR "filter" IS PRESENT (THEN RETURN None)
    """
    if set(tokens.keys()) & {"over", "within", "filter"}:
        return

    return ParseResults(
        tokens.type,
        tokens.start,
        tokens.end,
        listwrap(tokens["value"]),
        tokens.failures,
    )


def to_over(tokens):
    """
    RETURN {} WHEN tokens IS EMPTY, OTHERWISE None
    """
    if not tokens:
        return {}


def to_alias(tokens):
    """
    RETURN {name: cols} WHEN COLUMNS ARE GIVEN, OTHERWISE JUST name
    """
    cols = tokens["col"]
    name = tokens["name"]
    if cols:
        return {name: cols}
    return name


def to_top_clause(tokens):
    """
    CONVERT TOP-CLAUSE TOKENS
    * NO VALUE - None
    * WITH "ties" - {"ties": True, "percent"|"value": value}
    * WITH "percent" ONLY - {"percent": value}
    * OTHERWISE - [value]
    """
    value = tokens["value"]
    if not value:
        return None

    value = value.value()
    if tokens["ties"]:
        output = {}
        output["ties"] = True
        if tokens["percent"]:
            output["percent"] = value
        else:
            output["value"] = value
        return output
    elif tokens["percent"]:
        return {"percent": value}
    else:
        return [value]


def to_row(tokens):
    """
    CONVERT ONE ROW OF VALUES INTO {"select": ...}
    MANY COLUMNS GIVE A LIST OF {"value": v[0]}; ONE COLUMN GIVES A SINGLE
    {"value": column}
    """
    columns = list(tokens)
    if len(columns) > 1:
        return {"select": [{"value": v[0]} for v in columns]}
    else:
        return {"select": {"value": columns[0]}}


def get_literal(value):
    """
    RETURN THE LITERAL HELD BY value: int/float AS-IS, OR value["literal"]
    RETURN None FOR ANY Call, AND WHEN THERE IS NO "literal"
    """
    if isinstance(value, (int, float)):
        return value
    elif isinstance(value, Call):
        return
    # NOTE(review): SQL_NULL IS A Call, SO IT IS CAUGHT BY THE BRANCH ABOVE AND
    # THIS BRANCH IS NEVER REACHED - CONFIRM WHETHER NULL SHOULD BE A LITERAL
    elif value is SQL_NULL:
        return value
    elif "literal" in value:
        return value["literal"]


def to_values(tokens):
    """
    CONVERT ROWS (AS MADE BY to_row) INTO {"from": {"literal": rows}} WHEN
    THERE ARE MANY ROWS AND EVERY VALUE IS A TRUTHY LITERAL, OTHERWISE INTO
    {"union_all": rows};  A SINGLE ROW IS RETURNED UNCHANGED (IN A LIST)
    """
    rows = list(tokens)
    if len(rows) > 1:
        values = [
            [get_literal(s["value"]) for s in listwrap(row["select"])] for row in rows
        ]
        if all(flatten(values)):
            return {"from": {"literal": values}}
        return {"union_all": list(tokens)}
    else:
        return rows


def to_stack(tokens):
    """
    CONVERT STACK TOKENS INTO A "stack" Call WITH A "width" KEYWORD ARGUMENT
    """
    width = tokens["width"]
    args = listwrap(tokens["args"])
    return Call("stack", args, {"width": width})


def to_array(tokens):
    """
    CONVERT ARRAY TOKENS INTO A "create_array" Call, WRAPPED IN A "cast" TO
    THE ARRAY TYPE WHEN TYPES ARE GIVEN
    """
    types = list(tokens["type"])
    args = list(tokens["args"])
    output = Call("create_array", args, {})
    if types:
        output = Call("cast", [output, Call("array", types, {})], {})
    return output


def to_map(tokens):
    """
    CONVERT MAP TOKENS INTO A "create_map" Call OF [keys, values]
    """
    keys = tokens["keys"]
    values = tokens["values"]
    return Call("create_map", [keys, values], {})


def to_struct(tokens):
    """
    CONVERT STRUCT TOKENS INTO A "create_struct" Call, WRAPPED IN A "cast" TO
    THE STRUCT TYPE WHEN TYPES ARE GIVEN;  UNNAMED ARGUMENTS ARE REDUCED TO
    THEIR "value"
    """
    types = list(tokens["types"])
    args = [a if a["name"] else a["value"] for a in tokens["args"]]

    output = Call("create_struct", args, {})
    if types:
        output = Call("cast", [output, Call("struct", types, {})], {})
    return output


def to_select_call(tokens):
    """
    RETURN ["*"] FOR A STAR SELECT.  WHEN THE EXPRESSION IS A "value" Call,
    RETURN IT FLATTENED TO {"name", "value", **kwargs}.  OTHERWISE RETURN None
    """
    expr = tokens["value"]
    if expr == "*":
        return ["*"]

    try:
        call = expr[0][0]
        if call.op == "value":
            return {"name": tokens["name"], "value": call.args, **call.kwargs}
    except Exception:
        # BEST EFFORT: THE EXPRESSION DOES NOT HAVE THE EXPECTED SHAPE
        pass


def to_union_call(tokens):
    """
    CONVERT A CHAIN OF QUERIES JOINED BY SET OPERATORS INTO NESTED
    {op: [left, right]};  CONSECUTIVE IDENTICAL UNION OPERATORS SHARE ONE LIST.
    WHEN ORDERBY/OFFSET/LIMIT IS PRESENT THE CHAIN IS WRAPPED IN {"from": ...}
    AND THE TRAILING CLAUSES ARE ATTACHED
    """
    unions = tokens["union"]
    if isinstance(unions, dict):
        return unions
    elif unions.type.parser_name == "unordered sql":
        output = {k: v for k, v in unions.items()}  # REMOVE THE Group()
    else:
        unions = list(unions)
        # SOURCES ARE AT EVEN POSITIONS, OPERATORS AT ODD POSITIONS
        sources = [unions[i] for i in range(0, len(unions), 2)]
        operators = ["_".join(unions[i]) for i in range(1, len(unions), 2)]
        acc = sources[0]
        last_union = None
        for op, so in list(zip(operators, sources[1:])):
            if op == last_union and "union" in op:
                acc[op] = acc[op] + [so]
            else:
                acc = {op: [acc, so]}
            last_union = op

        if not tokens["orderby"] and not tokens["offset"] and not tokens["limit"]:
            return acc
        else:
            output = {"from": acc}

    output["orderby"] = tokens["orderby"]
    output["limit"] = tokens["limit"]
    output["offset"] = tokens["offset"]
    output["fetch"] = tokens["fetch"]
    output["outfile"] = tokens["outfile"]
    return output


def to_insert_call(tokens):
    """
    CONVERT INSERT TOKENS INTO AN "insert" Call
    LITERAL ROWS BECOME "values" (DICTS KEYED BY COLUMN WHEN COLUMNS ARE
    GIVEN); ANYTHING ELSE KEEPS "columns" AND "query"
    """
    options = {
        k: v for k, v in tokens.items() if k not in ["columns", "table", "query"]
    }
    query = tokens["query"]
    columns = tokens["columns"]
    try:
        values = query["from"]["literal"]
        if values:
            if columns:
                data = [dict(zip(columns, row)) for row in values]
                return Call("insert", [tokens["table"]], {"values": data, **options})
            else:
                return Call("insert", [tokens["table"]], {"values": values, **options})
    except Exception:
        # BEST EFFORT: THE QUERY IS NOT A LITERAL TABLE
        pass

    return Call(
        "insert", [tokens["table"]], {"columns": columns, "query": query, **options}
    )


def to_query(tokens):
    """
    ATTACH "with" AND "with_recursive" TO THE FIRST QUERY AND RETURN IT
    RETURN None IF THEY CAN NOT BE ASSIGNED
    """
    output = tokens["query"][0]
    try:
        output["with"] = tokens["with"]
        output["with_recursive"] = tokens["with_recursive"]
        return output
    except Exception:
        return


def to_table(tokens):
    """
    RETURN THE TOKENS AS A dict, OR JUST THE "value" WHEN THERE IS AT MOST
    ONE KEY
    """
    output = dict(tokens)
    if len(list(output.keys())) > 1:
        return output
    else:
        return output["value"]


def unquote(tokens):
    """
    CONVERT A QUOTED IDENTIFIER ('..', "..", `..` OR [..]) TO ITS NAME
    DOUBLED DELIMITERS ARE COLLAPSED, A LEADING + IS DROPPED, AND EVERY "."
    IN THE RESULT IS PREFIXED WITH A BACKSLASH
    """
    val = tokens[0]
    # REWRITE AS A PYTHON STRING LITERAL, THEN LET literal_eval DECODE IT
    if val.startswith("'") and val.endswith("'"):
        val = "'" + val[1:-1].replace("''", "\\'") + "'"
    elif val.startswith('"') and val.endswith('"'):
        val = '"' + val[1:-1].replace('""', '\\"') + '"'
    elif val.startswith("`") and val.endswith("`"):
        val = '"' + val[1:-1].replace("``", "`").replace('"', '\\"') + '"'
    elif val.startswith("[") and val.endswith("]"):
        val = '"' + val[1:-1].replace("]]", "]").replace('"', '\\"') + '"'
    elif val.startswith("+"):
        val = val[1:]
    un = ast.literal_eval(val).replace(".", "\\.")
    return un


def to_string(tokens):
    """
    CONVERT A QUOTED SQL STRING INTO {"literal": <python string>}
    BOTH SINGLE-QUOTED AND DOUBLE-QUOTED STRINGS ARE ACCEPTED; THE DOUBLED
    DELIMITER IS THE ESCAPE FOR ONE DELIMITER CHARACTER
    """
    val = tokens[0]
    # REWRITE AS A PYTHON STRING LITERAL USING THE SAME DELIMITER, SO THE OTHER
    # KIND OF QUOTE CAN APPEAR IN THE BODY WITHOUT BREAKING literal_eval
    if val.startswith('"'):
        val = '"' + val[1:-1].replace('""', '\\"') + '"'
    else:
        val = "'" + val[1:-1].replace("''", "\\'") + "'"
    return {"literal": ast.literal_eval(val)}


# NUMBERS
real_num = (
    Regex(r"[+-]?(\d+\.\d*|\.\d+)([eE][+-]?\d+)?").set_parser_name("float")
    / (lambda t: float(t[0]))
)


def parse_int(tokens):
    """
    CONVERT THE FIRST TOKEN TO int; EXPONENT NOTATION GOES THROUGH float FIRST
    """
    if "e" in tokens[0].lower():
        return int(float(tokens[0]))
    else:
        return int(tokens[0])


int_num = Regex(r"[+-]?\d+([eE]\+?\d+)?").set_parser_name("int") / parse_int
hex_num = (
    Regex(r"0x[0-9a-fA-F]+").set_parser_name("hex") / (lambda t: {"hex": t[0][2:]})
)

# STRINGS
ansi_string = Regex(r"\'(\'\'|[^'])*\'") / to_string
aquery_doublequote_string = Regex(r'\"(\"\"|[^"])*\"') / to_string

# BASIC IDENTIFIERS
ansi_ident = Regex(r'\"(\"\"|[^"])*\"') / unquote
aquery_backtick_ident = Regex(r"\`(\`\`|[^`])*\`") / unquote