From 8b182cf0f2cd1051da13256ac04d01c6eebd9480 Mon Sep 17 00:00:00 2001 From: BillSun Date: Fri, 18 Feb 2022 09:01:36 -0500 Subject: [PATCH] Order by, on-demand assumptions, bugfixes Nested-loop join (concept) --- aquery_parser/sql_parser.py | 2 +- engine/agg.py | 0 engine/ast.py | 104 +++++++++++++++++++++++++++++------- engine/expr.py | 10 +++- engine/groupby.py | 9 ++-- engine/orderby.py | 77 ++++++++++++++++---------- engine/projection.py | 43 +++++++++------ moving_avg.csv | 4 +- prompt.py | 4 +- q1.sql | 3 +- 10 files changed, 183 insertions(+), 73 deletions(-) delete mode 100644 engine/agg.py diff --git a/aquery_parser/sql_parser.py b/aquery_parser/sql_parser.py index 073731c..17790aa 100644 --- a/aquery_parser/sql_parser.py +++ b/aquery_parser/sql_parser.py @@ -412,7 +412,7 @@ def parser(literal_string, ident, sqlserver=False): + RB, ) - assumption = Group((ASC|DESC) ("ord") + var_name("attrib")) + assumption = Group((ASC|DESC) ("sort") + var_name("value")) assumptions = (ASSUMING + Group(delimited_list(assumption))("assumptions")) table_source << Group( diff --git a/engine/agg.py b/engine/agg.py deleted file mode 100644 index e69de29..0000000 diff --git a/engine/ast.py b/engine/ast.py index fc147af..a348208 100644 --- a/engine/ast.py +++ b/engine/ast.py @@ -6,7 +6,7 @@ from engine.utils import base62uuid # replace column info with this later. class ColRef: - def __init__(self, k9name, _ty, cobj, cnt, table, name, id, order = None, compound = False): + def __init__(self, k9name, _ty, cobj, cnt, table, name, id, compound = False): self.k9name = k9name self.type = _ty self.cobj = cobj @@ -14,13 +14,16 @@ class ColRef: self.table = table self.name = name self.id = id - self.order = order # True -> asc, False -> dsc; None -> unordered + self.order_pending = None # order_pending self.compound = compound # compound field (list as a field) self.views = [] self.__arr__ = (k9name, _ty, cobj, cnt, table, name, id) def __getitem__(self, key): - return self.__arr__[key] + if type(key) is str: + return getattr(self, key) + else: + return self.__arr__[key] def __setitem__(self, key, value): self.__arr__[key] = value @@ -40,21 +43,24 @@ class TableInfo: self.views = set() self.rec = None self.groupinfo = None - for c in cols: - self.add_col(c) - + self.add_cols(cols) # runtime self.n_rows = 0 # number of cols self.order = [] # assumptions cxt.tables_byname[self.table_name] = self # construct reverse map - - def add_col(self, c): - if type(c) is ColRef: - c = c.cobj - k9name = 'c' + base62uuid(7) - col_object = ColRef(k9name, (list(c['type'].keys()))[0], c, 1, self,c['name'], len(self.columns)) - + def add_cols(self, cols, new = True): + for c in cols: + self.add_col(c, new) + def add_col(self, c, new = True): + _ty = c['type'] + if new: + k9name = 'c' + base62uuid(7) + _ty = _ty if type(c) is ColRef else list(_ty.keys())[0] + col_object = ColRef(k9name, _ty, c, 1, self,c['name'], len(self.columns)) + else: + col_object = c + k9name = c.k9name self.cxt.k9cols_byname[k9name] = col_object self.columns_byname[c['name']] = col_object self.columns.append(col_object) @@ -66,21 +72,53 @@ class TableInfo: def n_cols(self): return len(self.columns) - def get_col(self, col_name): + def materialize_orderbys(self): + view_stack = '' + stack_name = '' + for o in self.order: + o.materialize() + if len(view_stack) == 0: + view_stack = o.view.name + stack_name = view_stack + else: + view_stack = view_stack+'['+ o.view.name +']' + # TODO: Optimize by doing everything in a stmt + if len(view_stack) > 0: + if len(self.order) > 1: + self.cxt.emit(f'{stack_name}:{view_stack}') + for c in self.columns: + c.order_pending = stack_name + self.order[0].node.view = stack_name + self.order.clear() + + def get_col_d(self, col_name): col = self.columns_byname[col_name] if type(self.rec) is list: self.rec.append(col) return col + + def get_k9colname_d(self, col_name): + return self.get_col_d(col_name).k9name + + def get_col(self, col_name): + self.materialize_orderbys() + col = self.get_col_d(col_name) + if type(col.order_pending) is str: + self.cxt.emit_no_flush(f'{col.k9name}:{col.k9name}[{col.order_pending}]') + col.order_pending = None + return col def get_k9colname(self, col_name): return self.get_col(col_name).k9name - + def add_alias(self, alias): # TODO: Exception when alias already defined. # TODO: Scoping of alias should be constrainted in the query. self.cxt.tables_byname[alias] = self self.alias.add(alias) - def parse_tablenames(self, colExpr): + def parse_tablenames(self, colExpr, materialize = True): + self.get_col = self.get_col if materialize else self.get_col_d + parsedColExpr = colExpr.split('.') ret = None if len(parsedColExpr) <= 1: @@ -117,12 +155,13 @@ class Context: self.udf_map = dict() # read header self.k9code = '' + self.k9codelet = '' with open('header.k', 'r') as outfile: self.k9code = outfile.read() # datasource will be availible after `from' clause is parsed # and will be deactivated when the `from' is out of scope self.datasource = None - + self.ds_stack = [] def add_table(self, table_name, cols): tbl = TableInfo(table_name, cols, self) @@ -134,17 +173,46 @@ class Context: return f't{base62uuid(7)}' def emit(self, codelet): + self.k9code += self.k9codelet + codelet + '\n' + self.k9codelet = '' + def emit_no_flush(self, codelet): self.k9code += codelet + '\n' + def emit_flush(self): + self.k9code += self.k9codelet + '\n' + self.k9codelet = '' def emit_nonewline(self, codelet): - self.k9code += codelet + self.k9codelet += codelet + + def datsource_top(self): + if len(self.ds_stack) > 0: + return self.ds_stack[-1] + else: + return None + def datasource_pop(self): + if len(self.ds_stack) > 0: + self.ds_stack.pop() + return self.ds_stack[-1] + else: + return None + def datasource_push(self, ds): + if type(ds) is TableInfo: + self.ds_stack.append(ds) + return ds + else: + return None + def __str__(self): return self.k9code + def __repr__(self) -> str: + return self.__str__() + class ast_node: types = dict() def __init__(self, parent:"ast_node", node, context:Context = None): self.context = parent.context if context is None else context self.parent = parent + self.datasource = None self.init(node) self.produce(node) self.spawn(node) diff --git a/engine/expr.py b/engine/expr.py index 5432bc0..753ab47 100644 --- a/engine/expr.py +++ b/engine/expr.py @@ -14,6 +14,7 @@ class expr(ast_node): 'avgs': ['avgs', 'avgsw'], 'sums': ['sums', 'sumsw'], } + binary_ops = { 'sub':'-', 'add':'+', @@ -24,18 +25,23 @@ class expr(ast_node): 'gt':'>', 'lt':'<', } + compound_ops = { 'ge' : [2, lambda x: f'~({x[0]}<{x[1]})'], 'le' : [2, lambda x: f'~({x[0]}>{x[1]})'], + 'count' : [1, lambda x: f'#({x[0]})'] } + unary_ops = { 'neg' : '-', 'not' : '~' } + coumpound_generating_ops = ['mod', 'mins', 'maxs', 'sums'] + \ list( binary_ops.keys()) + list(compound_ops.keys()) + list(unary_ops.keys() ) - def __init__(self, parent, node): + def __init__(self, parent, node, materialize_cols = True): + self.materialize_cols = materialize_cols ast_node.__init__(self, parent, node, None) def init(self, _): @@ -95,7 +101,7 @@ class expr(ast_node): while type(p) is expr and not p.isvector: p.isvector = True p = p.parent - self.k9expr = self.datasource.parse_tablenames(node) + self.k9expr = self.datasource.parse_tablenames(node, self.materialize_cols) elif type(node) is bool: self.k9expr = '1' if node else '0' else: diff --git a/engine/groupby.py b/engine/groupby.py index 3e73866..69732ce 100644 --- a/engine/groupby.py +++ b/engine/groupby.py @@ -46,10 +46,11 @@ class groupby(ast_node): self.parent.inv = False else: k9fn = "{[ids;grps;ll;dim;x] " + \ - "start:$[x=ll;ll;grps[x+1][dim-1]];" + \ - "end: grps[x][dim-1];" + \ - "range:(end-start)#(((start-ll))#ids);" + \ + "start:grps[x][dim];" + \ + "end:$[x=0;ll;grps[x-1][dim]];" + \ + "range:(end-start)#((start-ll)#ids);" + \ + "start:ids[start];" + \ ret + '}' self.emit(f'{self.groupby_function}:{k9fn}') self.emit(f'{out}:+({self.groupby_function}' + \ - f'[{grp}[1];{grp}[0];(#{grp}[0])-1;#({grp}[0][0])]\'!((#({grp}[0]))-1))') \ No newline at end of file + f'[{grp}[1];{grp}[0];(#{grp}[0])+1;(#({grp}[0][0]))-1]\'!(#({grp}[0])))') \ No newline at end of file diff --git a/engine/orderby.py b/engine/orderby.py index 36fc649..13311b7 100644 --- a/engine/orderby.py +++ b/engine/orderby.py @@ -1,38 +1,59 @@ -from engine.ast import ColRef, TableInfo, ast_node -from engine.utils import base62uuid +from engine.ast import ColRef, TableInfo, View, ast_node, Context +from engine.utils import base62uuid, seps from engine.expr import expr +import k + +class order_item: + def __init__(self, name, node, order = True): + self.name = name + self.order = order + self.node = node + self.materialized = False + + def materialize(self): + if not self.materialized: + self.name = expr(self.node, self.name, False).k9expr + self.materialized = True + return ('' if self.order else '-') + f'({self.name})' + + def __str__(self): + return self.materialize() + def __repr__(self): + return self.__str__() + +class orders: + def __init__(self, node, datasource): + self.order_items = [] + self.materialized = False + self.view = None + self.node = node + self.datasource = datasource + self.n_attrs = -1 + + def materialize(self): + if not self.materialized: + self.view = View(self.node.context, self.datasource, False) + keys = ';'.join([f'{o}' for o in self.order_items]) + self.n_attrs = len(self.order_items) + self.node.emit(f"{self.view.name}: > +`j (({',' if self.n_attrs == 1 else ''}{keys}))") + self.materialized = True + + def append(self, o): + self.order_items.append(o) class orderby(ast_node): name = '_orderby' + def init(self, _): - self.group = 'g' + base62uuid(7) self.datasource = self.parent.datasource - self.datasource.rec = [] + self.order = orders(self, self.datasource) + self.view = '' def produce(self, node): if type(node) is not list: node = [node] - g_contents = '(' - first_col = '' - for i, g in enumerate(node): - v = g['value'] - e = expr(self, v).k9expr - # if v is compound expr, create tmp cols - if type(v) is not str: - tmpcol = 't' + base62uuid(7) - self.emit(f'{tmpcol}:{e}') - e = tmpcol - if i == 0: - first_col = e - g_contents += e + (';'if i < len(node)-1 else '') - - self.emit(f'{self.group}:'+g_contents+')') - self.n_grps = len(node) - if self.n_grps <= 1: - self.emit(f'{self.group}:={self.group}') - else: - self.emit(f'{self.group}:groupby[+({self.group},(,!(#({first_col}))))]') - + for n in node: + order = not ('sort' in n and n['sort'] == 'desc') + self.order.append(order_item(n['value'], self, order)) + def consume(self, _): - self.referenced = self.datasource.rec - self.datasource.rec = None - return super().consume(_) \ No newline at end of file + self.datasource.order.append(self.order) \ No newline at end of file diff --git a/engine/projection.py b/engine/projection.py index a9c3fde..426a015 100644 --- a/engine/projection.py +++ b/engine/projection.py @@ -1,7 +1,9 @@ +from attr import has from engine.ast import ColRef, TableInfo, ast_node, Context, include from engine.groupby import groupby from engine.join import join from engine.expr import expr +from engine.orderby import orderby from engine.scan import filter from engine.utils import base62uuid, enlist, base62alp from engine.ddl import outfile @@ -44,10 +46,7 @@ class projection(ast_node): self.datasource = self.context.tables_byname[value] if 'assumptions' in from_clause: for assumption in enlist(from_clause['assumptions']): - ord = assumption['ord'] == 'asc' - attrib = assumption['attrib'] - ord = '^' if ord else '|^' - # TODO: generate view of table by order + orderby(self, assumption) elif type(from_clause) is str: self.datasource = self.context.tables_byname[from_clause] @@ -92,8 +91,8 @@ class projection(ast_node): if 'value' in proj: e = proj['value'] if type(e) is str: - cname = self.datasource.parse_tablenames(proj['value']) - k9expr += (f"{cname}") + cname = e # TODO: deal w/ alias + k9expr += (f"{self.datasource.parse_tablenames(proj['value'])}") elif type(e) is dict: p_expr = expr(self, e) cname = p_expr.k9expr @@ -104,27 +103,41 @@ class projection(ast_node): compound = compound and has_groupby and self.datasource.rec not in self.group_node.referenced - cols.append(ColRef(f'(+{disp_varname})[{i}]', 'generic', self.out_table, 0, None, cname, i, compound=compound)) + cols.append(ColRef(f'{disp_varname}[{i}]', 'generic', self.out_table, 0, None, cname, i, compound=compound)) + self.out_table.add_cols(cols, False) + k9expr += ')' if has_groupby: self.group_node.finalize(k9expr, disp_varname) else: self.emit(f'{disp_varname}:{k9expr}') - self.datasource.group_node = None - if flatten: - self.emit_no_ln(f'{disp_varname}:' if flatten else '') + + has_orderby = 'orderby' in node + + if has_orderby: + self.datasource = self.out_table + self.context.datasource = self.out_table # discard current ds + orderby_node = orderby(self, node['orderby']) + self.context.datasource.materialize_orderbys() + self.emit_no_ln(f"{f'{disp_varname}:+' if flatten else ''}(") - if flatten or self.disp: + if self.disp or has_orderby: if len(self.projections) > 1: - self.emit(f"{'+' if self.inv else ''}{disp_varname}") + self.emit_no_ln(f"{'+' if self.inv else ''}{disp_varname}") else: - self.emit(f'$[(#{disp_varname})>1;+,({disp_varname});+,(,{disp_varname})]') + self.emit_no_ln(f'$[(#{disp_varname})>1;+,({disp_varname});+,(,{disp_varname})]') if flatten: - self.emit(f'{disp_varname}') + self.emit_no_ln(f'{disp_varname}') + if has_orderby: + self.emit(f')[{orderby_node.view}]') + else: + self.context.emit_flush() if flatten: - self.out_table.columns = cols + if len(self.projections) > 1 and not self.inv: + self.emit(f"{disp_varname}:+{disp_varname}") outfile(self, node['outfile']) + if self.datasource_changed: self.context.datasource = self.prev_datasource diff --git a/moving_avg.csv b/moving_avg.csv index 8016053..6d3b55d 100644 --- a/moving_avg.csv +++ b/moving_avg.csv @@ -1,6 +1,6 @@ Month,sales 1,100 -2,120 -3,140 4,140 5,130 +3,140 +2,120 diff --git a/prompt.py b/prompt.py index 04d8401..06ec96d 100644 --- a/prompt.py +++ b/prompt.py @@ -6,9 +6,9 @@ import subprocess import sys if sys.platform != 'win32': import readline - basecmd = ['bash', '-c', 'k'] + basecmd = ['bash', '-c', 'rlwrap k'] else: - basecmd = ['bash.exe', '-c', './k'] + basecmd = ['bash.exe', '-c', 'rlwrap ./k'] test_parser = True diff --git a/q1.sql b/q1.sql index 72b1d2a..2acaf11 100644 --- a/q1.sql +++ b/q1.sql @@ -6,4 +6,5 @@ FIELDS TERMINATED BY "," SELECT sum(c), b, d FROM test -group by a,b,d \ No newline at end of file +group by a,b,d +order by d DESC, b ASC