From 3b2dfb295e03ebd06f8b3ccedaab4db7e6f77447 Mon Sep 17 00:00:00 2001 From: Bill Sun Date: Fri, 28 Jan 2022 10:02:43 -0500 Subject: [PATCH] added filter, basic aggregations. Fixed bugs with var length, etc. --- .gitignore | 5 +- engine/ast.py | 100 ++++++++++++++++++++++++++++++---------- engine/expr.py | 26 ++++++++--- engine/groupby.py | 0 engine/projection.py | 31 +++++++++---- engine/scan.py | 107 +++++++++++++++++++++++++++++++++++++++++++ engine/utils.py | 2 +- header.k | 2 + run.py => prompt.py | 8 +++- stock.a | 6 ++- 10 files changed, 241 insertions(+), 46 deletions(-) create mode 100644 engine/groupby.py create mode 100644 engine/scan.py create mode 100644 header.k rename run.py => prompt.py (82%) diff --git a/.gitignore b/.gitignore index 48d6ec7..d544b90 100644 --- a/.gitignore +++ b/.gitignore @@ -14,4 +14,7 @@ vendor/ .DS_Store .eggs .vscode -out.k \ No newline at end of file +out.k +k +*.so +*.pdf diff --git a/engine/ast.py b/engine/ast.py index aa23aef..fb48b03 100644 --- a/engine/ast.py +++ b/engine/ast.py @@ -1,60 +1,111 @@ from typing import List +from engine.utils import base62uuid + # replace column info with this later. class ColRef: - def __init__(self, k9name, type, cobj, cnt): + def __init__(self, k9name, type, cobj, cnt, table): self.k9name = k9name self.type = type self.cobj = cobj self.cnt = cnt + self.table = table + self.__arr__ = (k9name, type, cobj, cnt, table) + + def __getitem__(self, key): + return self.__arr__[key] + + def __setitem__(self, key, value): + self.__arr__[key] = value class TableInfo: + def __init__(self, table_name, cols, cxt:'Context'): # statics self.table_name = table_name + self.alias = set([table_name]) self.columns_byname = dict() # column_name, type self.columns = [] - + self.cxt = cxt + self.views = set() for c in cols: - k9name = self.table_name + c['name'] - if k9name in cxt.k9cols_byname: # duplicate names? - root = cxt.k9cols_byname[k9name] - k9name = k9name + root[3] - root[3] += 1 - - # column: (k9name, type, original col_object, dup_count) - col_object = (k9name, (list(c['type'].keys()))[0], c, 1) - - cxt.k9cols_byname[k9name] = col_object - self.columns_byname[c['name']] = col_object - self.columns.append(col_object) + self.add_col(c) # runtime self.n_rows = 0 # number of cols self.order = [] # assumptions cxt.tables_byname[self.table_name] = self # construct reverse map - + + def add_col(self, c): + if type(c) is ColRef: + c = c.cobj + k9name = 'c' + base62uuid(7) + # k9name = self.table_name + c['name'] + # if k9name in self.cxt.k9cols_byname: # duplicate names? + # root = self.cxt.k9cols_byname[k9name] + # k9name = k9name + root.cnt + # root.cnt += 1 + + # column: (k9name, type, original col_object, dup_count) + col_object = ColRef(k9name, (list(c['type'].keys()))[0], c, 1, self) + + self.cxt.k9cols_byname[k9name] = col_object + self.columns_byname[c['name']] = col_object + self.columns.append(col_object) + + def construct(self): + for c in self.columns: + self.cxt.emit(f'{c.k9name}:()') @property def n_cols(self): return len(self.columns) def get_k9colname(self, col_name): - return self.columns_byname[col_name][0] - - def parse_tablenames(self, str): - # TODO: deal with alias - return self.get_k9colname(str) - + return self.columns_byname[col_name].k9name + def add_alias(self, alias): + # TODO: Exception when alias already defined. + # TODO: Scoping of alias should be constrainted in the query. + self.cxt.tables_byname[alias] = self + self.alias.add(alias) + + def parse_tablenames(self, colExpr): + parsedColExpr = colExpr.split('.') + if len(parsedColExpr) <= 1: + return self.get_k9colname(colExpr) + else: + datasource = self.cxt.tables_byname[parsedColExpr[0]] + if datasource is None: + raise ValueError(f'Table name/alias not defined{parsedColExpr[0]}') + else: + return datasource.get_k9colname(parsedColExpr[1]) + +class View: + def __init__(self, context, table = None, tmp = True): + self.table: TableInfo = table + self.name = 'v'+base62uuid(7) + if type(table) is TableInfo: + table.views.add(self) + self.context = context + + def construct(self): + self.context.emit(f'{self.name}:()') + class Context: def __init__(self): self.tables:List[TableInfo] = [] self.tables_byname = dict() self.k9cols_byname = dict() - + self.udf_map = dict() - + # read header self.k9code = '' + with open('header.k', 'r') as outfile: + self.k9code = outfile.read() + # datasource will be availible after `from' clause is parsed + # and will be deactivated when the `from' is out of scope + self.datasource = None + def add_table(self, table_name, cols): tbl = TableInfo(table_name, cols, self) @@ -63,7 +114,7 @@ class Context: def gen_tmptable(self): from engine.utils import base62uuid - return f'tmp{base62uuid()}' + return f't{base62uuid(7)}' def emit(self, codelet): self.k9code += codelet + '\n' @@ -76,6 +127,7 @@ class ast_node: types = dict() def __init__(self, parent:"ast_node", node, context:Context = None): self.context = parent.context if context is None else context + self.parent = parent self.init(node) self.produce(node) self.spawn(node) diff --git a/engine/expr.py b/engine/expr.py index adf0819..9aa61c6 100644 --- a/engine/expr.py +++ b/engine/expr.py @@ -8,29 +8,38 @@ class expr(ast_node): 'min': 'min', 'avg':'avg', 'sum':'sum', - + 'mins': 'mins', + 'maxs': 'maxs' } binary_ops = { 'sub':'-', 'add':'+', 'mul':'*', 'div':'%', - + 'mod':'mod', + 'and':'&', + 'or':'|', + 'gt':'>', + 'lt':'<', } unary_ops = { 'neg' : '-', - + 'not' : '~' } def __init__(self, parent, node): + ast_node.__init__(self, parent, node, None) + + def init(self, _): from engine.projection import projection + parent = self.parent + self.isvector = parent.isvector if type(parent) is expr else False if type(parent) in [projection, expr]: self.datasource = parent.datasource else: - self.datasource = None + self.datasource = self.context.datasource self.udf_map = parent.context.udf_map self.k9expr = '' self.func_maps = {**self.udf_map, **self.builtin_func_maps} - ast_node.__init__(self, parent, node, None) def produce(self, node): if type(node) is dict: @@ -39,7 +48,6 @@ class expr(ast_node): self.k9expr += f"{self.func_maps[key]}(" # if type(val) in [dict, str]: self.k9expr += expr(self, val).k9expr - self.k9expr += ')' elif key in self.binary_ops: l = expr(self, val[0]).k9expr @@ -51,7 +59,13 @@ class expr(ast_node): else: print(f'Undefined expr: {key}{val}') elif type(node) is str: + p = self.parent + while type(p) is expr and not p.isvector: + p.isvector = True + p = p.parent self.k9expr = self.datasource.parse_tablenames(node) + elif type(node) is bool: + self.k9expr = '1' if node else '0' else: self.k9expr = f'{node}' def __str__(self): diff --git a/engine/groupby.py b/engine/groupby.py new file mode 100644 index 0000000..e69de29 diff --git a/engine/projection.py b/engine/projection.py index 40d02b8..8bc7593 100644 --- a/engine/projection.py +++ b/engine/projection.py @@ -1,6 +1,7 @@ from engine.ast import TableInfo, ast_node, Context, include from engine.join import join from engine.expr import expr +from engine.scan import filter from engine.utils import base62uuid class projection(ast_node): @@ -15,7 +16,7 @@ class projection(ast_node): def produce(self, node): p = node['select'] - self.projections = p if type(projection) == list else [p] + self.projections = p if type(p) is list else [p] print(node) def spawn(self, node): @@ -47,27 +48,37 @@ class projection(ast_node): if self.datasource is None: raise ValueError('spawn error: from clause') + + if self.datasource is not None: + self.datasource_changed = True + self.prev_datasource = self.context.datasource + self.context.datasource = self.datasource if 'where' in node: - # apply filter - pass + self.datasource = filter(self, node['where'], True).output + self.context.datasource = self.datasource - def consume(self, node): - disp_varname = 'disptmp' + base62uuid() + def consume(self, _): + disp_varname = 'd'+base62uuid(7) self.emit_no_ln(f'{disp_varname}:(') - for proj in self.projections: + for i, proj in enumerate(self.projections): if type(proj) is dict: if 'value' in proj: e = proj['value'] - if type(e) is str: - self.emit_no_ln(f"{self.datasource.parse_tablenames(proj['value'])};") + self.emit_no_ln(f"{self.datasource.parse_tablenames(proj['value'])}") elif type(e) is dict: - self.emit_no_ln(f"{expr(self, e).k9expr};") + self.emit_no_ln(f"{expr(self, e).k9expr}") + self.emit_no_ln(';'if i < len(self.projections)-1 else '') self.emit(')') if self.disp: - self.emit(disp_varname) + if len(self.projections) > 1: + self.emit(f'+{disp_varname}') + else: + self.emit(f'+,(,{disp_varname})') + if self.datasource_changed: + self.context.datasource = self.prev_datasource import sys diff --git a/engine/scan.py b/engine/scan.py new file mode 100644 index 0000000..c1e8edd --- /dev/null +++ b/engine/scan.py @@ -0,0 +1,107 @@ +from xmlrpc.client import Boolean +from engine.ast import ColRef, TableInfo, View, ast_node +from engine.utils import base62uuid +from engine.expr import expr + +class scan(ast_node): + name = 'scan' + +class filter(ast_node): + name = 'filter' + def __init__(self, parent: "ast_node", node, materialize = False, context = None): + self.materialize = materialize + super().__init__(parent, node, context) + def init(self, _): + self.datasource = self.context.datasource + self.view = View(self.context, self.datasource) + self.value = None + + def spawn(self, node): + # TODO: deal with subqueries + return super().spawn(node) + def __materialize__(self): + if self.materialize: + cols = [] if self.datasource is None else self.datasource.columns + self.output = TableInfo('tn'+base62uuid(6), cols, self.context) + self.output.construct() + if type(self.value) is View: # cond filtered on tables. + self.emit(f'{self.value.name}:&{self.value.name}') + for o, c in zip(self.output.columns,self.value.table.columns): + self.emit(f'{o.k9name}:{c.k9name}[{self.value.name}]') + elif self.value is not None: # cond is scalar + tmpVar = 't'+base62uuid(7) + self.emit(f'{tmpVar}:{self.value}') + for o, c in zip(self.output.columns, self.datasource.columns): + self.emit(f'{o.k9name}:$[{tmpVar};{c.k9name};()]') + + def consume(self, node): + # TODO: optimizations after converting expr to cnf + if type(node) is bool and node and self.materialize: + self.output = self.context.datasource if node else None + self.value = '1' if node else '0' + else: + if type(node) is dict: + def short_circuit(op, idx, inv = True): + v = filter(self, node[op][idx]).value + inv_filter = lambda x: not x if inv else x + if type(v) is bool and inv_filter(v): + self.value = inv_filter(v) + self.__materialize__() + return None + return v + def binary(l, r, _ty = '&'): + if type(l) is bool: + self.value = r + elif type(r) is bool: + self.value = l + elif type(l) is View: + if type(r) is View: + self.emit(f"{l.name}: {l.name} {_ty} {r.name if type(r) is View else f'({r})'}") + self.value = l + elif type(l) is str: + if type(r) is str: + self.value = f'({l}){_ty}({r})' + else: + self.emit(f'{r.name}:{r.name} {_ty} ({l})') + self.value = r + if 'and' in node: + l = short_circuit('and', 0) + if l is not None: + r = short_circuit('and', 1) + if r is not None: + binary(l, r) + + elif 'or' in node: + l = short_circuit('or', 0, False) + if l is not None: + r = short_circuit('or', 1, False) + if r is not None: + binary(l, r, '|') + + elif 'not' in node: + v = filter(self, node['not']).value + if type(v) is bool: + self.value = not v + self.__materialize__() + elif type(v) is View: + if len(v.table.columns) > 0: + all_rows = View(self.context, v.table) + self.emit(f'{all_rows.name}:(#{v.table.columns[0].k9name})#1') + self.emit(f'{v.name}:{all_rows.name}-{v.name}') + self.value = v + else: + self.value = '~(' + v + ')' + # TODO: arithmetic ops connecting logical ops. + else: + e = expr(self, node) + if e.isvector: + v = View(self.context, self.datasource) + v.construct() + self.emit(f'{v.name}:{e.k9expr}') + self.value = v + else: + self.value = e.k9expr + self.__materialize__() + + print(node) + \ No newline at end of file diff --git a/engine/utils.py b/engine/utils.py index 60f3389..9c58764 100644 --- a/engine/utils.py +++ b/engine/utils.py @@ -4,7 +4,7 @@ def base62uuid(crop=8): alp = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ' id = uuid.uuid4().int ret = '' - + while id: ret = alp[id % 62] + ret id //= 62 diff --git a/header.k b/header.k new file mode 100644 index 0000000..0d1cc4c --- /dev/null +++ b/header.k @@ -0,0 +1,2 @@ +maxs:{[L]{max(x, y)}\L} +mins:{[L]{min(x, y)}\L} diff --git a/run.py b/prompt.py similarity index 82% rename from run.py rename to prompt.py index 09db800..1395edc 100644 --- a/run.py +++ b/prompt.py @@ -1,7 +1,7 @@ -from multiprocessing.sharedctypes import Value import re import aquery_parser as parser import engine +import subprocess test_parser = True @@ -27,7 +27,11 @@ while test_parser: engine.generate(stmts_stmts, cxt) print(cxt.k9code) with open('out.k', 'wb') as outfile: - outfile.write(cxt.k9code.encode('utf-8')) + outfile.write((cxt.k9code+'\n\\\\').encode('utf-8')) + subprocess.call(['bash.exe', '-c',"./k out.k"]) + continue + elif q == 'k': + subprocess.call(['bash.exe', '-c',"./k"]) continue elif q == 'print': print(stmts) diff --git a/stock.a b/stock.a index 992b952..a5578ba 100644 --- a/stock.a +++ b/stock.a @@ -18,8 +18,10 @@ INSERT INTO stocks VALUES(15,2) INSERT INTO stocks VALUES(16,5) SELECT max(price-min(timestamp)) FROM stocks + +SELECT price, timestamp FROM stocks where price -timestamp > 1 and not (price*timestamp<100) -/*SELECT max(price-mins(price)) +SELECT max(price-mins(price)) FROM stocks - ASSUMING ASC timestamp*/ + ASSUMING ASC timestamp