Bill Sun 3 years ago
parent 3b2dfb295e
commit 277dad6b3e

.gitignore vendored

@@ -18,3 +18,6 @@ out.k
 k
 *.so
 *.pdf
+test*.c*
+*.csv
+*.out

@@ -72,6 +72,7 @@ REFERENCES = keyword("references").suppress()
 RECURSIVE = keyword("recursive").suppress()
 VALUES = keyword("values").suppress()
 WINDOW = keyword("window")
+INTO = keyword("into").suppress()

 PRIMARY_KEY = Group(PRIMARY + KEY).set_parser_name("primary_key")
 FOREIGN_KEY = Group(FOREIGN + KEY).set_parser_name("foreign_key")
@@ -226,6 +227,7 @@ RESERVED = MatchFirst([
     WINDOW,
     WITH,
     WITHIN,
+    INTO,
 ])
 L_INLINE = Literal("<k>").suppress()
 R_INLINE = Literal("</k>").suppress()

@@ -29,7 +29,7 @@ def common_parser():
         ansi_ident | mysql_backtick_ident | simple_ident, separator=".", combine=True,
     )).set_parser_name("identifier")

-    return parser(ansi_string, combined_ident)
+    return parser(ansi_string | mysql_doublequote_string, combined_ident)

 def mysql_parser():
@@ -436,6 +436,19 @@ def parser(literal_string, ident, sqlserver=False):
         & Optional(assign("limit", expr))
     )

+    outfile = Optional(
+        (
+            INTO
+            + keyword("outfile").suppress()
+            + literal_string("loc")
+            + Optional(
+                keyword("fields")
+                + keyword("terminated")
+                + keyword("by")
+                + literal_string("term")
+            )
+        )("outfile")
+    )
     ordered_sql = (
         (
             (unordered_sql | (LB + query + RB))
@@ -448,6 +461,7 @@ def parser(literal_string, ident, sqlserver=False):
         )("union")
         + Optional(ORDER_BY + delimited_list(Group(sort_column))("orderby"))
         + limit
+        + outfile
     ).set_parser_name("ordered sql") / to_union_call

     with_expr = delimited_list(Group(
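Note: the `outfile` fragment above gives the grammar a MySQL-style `SELECT ... INTO OUTFILE` tail, attached to `ordered_sql` after `limit`. A minimal sketch of the intended round trip, assuming the result keeps the `loc`/`term` names assigned above (the exact dict shape is an assumption, not verified output):

    import aquery_parser as parser

    stmt = parser.parse('SELECT a FROM t INTO OUTFILE "out.csv" FIELDS TERMINATED BY ","')
    # expected to carry an 'outfile' entry roughly like:
    # {'outfile': {'loc': {'literal': 'out.csv'}, 'term': {'literal': ','}}}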
@@ -605,9 +619,27 @@ def parser(literal_string, ident, sqlserver=False):
         + Optional(assign("where", expr))
     ) / to_json_call

+    load = (
+        keyword("load")("op")
+        + keyword("data").suppress()
+        + keyword("infile")("loc")
+        + literal_string("file")
+        + INTO
+        + keyword("table").suppress()
+        + var_name("table")
+        + Optional(
+            keyword("fields").suppress()
+            + keyword("terminated").suppress()
+            + keyword("by").suppress()
+            + literal_string("term")
+        )
+    )("load")

     sql_stmts = delimited_list((
         query
-        | (insert | update | delete)
+        | (insert | update | delete | load)
         | (create_table | create_view | create_cache | create_index)
         | (drop_table | drop_view | drop_index)
     )("stmts"), ";")
@@ -617,6 +649,10 @@ def parser(literal_string, ident, sqlserver=False):
         | udf
     )("stmts")

-    stmts = ZeroOrMore(sql_stmts|other_stmt)
+    stmts = ZeroOrMore(
+        sql_stmts
+        | other_stmt
+        | keyword(";").suppress()  # empty stmt
+    )

     return stmts.finalize()
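With `INTO` registered as a keyword and `load` wired into `sql_stmts`, a MySQL-style bulk-load statement now parses as its own node. A hedged sketch of the expected parse (key layout inferred from the `("op")`, `("file")`, `("table")`, `("term")` result names above; not verified output):

    import aquery_parser as parser

    stmts = parser.parse('LOAD DATA INFILE "test.csv" INTO TABLE test FIELDS TERMINATED BY ","')
    # expected to resemble:
    # {'load': {'op': 'load', 'file': {'literal': 'test.csv'},
    #           'table': 'test', 'term': {'literal': ','}}}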

@@ -522,6 +522,7 @@ def to_union_call(tokens):
     output["limit"] = tokens["limit"]
     output["offset"] = tokens["offset"]
     output["fetch"] = tokens["fetch"]
+    output["outfile"] = tokens["outfile"]
     return output

@@ -4,13 +4,15 @@ from engine.utils import base62uuid

 # replace column info with this later.
 class ColRef:
-    def __init__(self, k9name, type, cobj, cnt, table):
+    def __init__(self, k9name, _ty, cobj, cnt, table, name, id):
         self.k9name = k9name
-        self.type = type
+        self.type = _ty
         self.cobj = cobj
         self.cnt = cnt
         self.table = table
-        self.__arr__ = (k9name, type, cobj, cnt, table)
+        self.name = name
+        self.id = id
+        self.__arr__ = (k9name, _ty, cobj, cnt, table, name, id)

     def __getitem__(self, key):
         return self.__arr__[key]
@@ -28,6 +30,7 @@ class TableInfo:
         self.columns = []
         self.cxt = cxt
         self.views = set()
+        self.rec = None
         for c in cols:
             self.add_col(c)
@@ -48,7 +51,7 @@ class TableInfo:
         # root.cnt += 1
         # column: (k9name, type, original col_object, dup_count)
-        col_object = ColRef(k9name, (list(c['type'].keys()))[0], c, 1, self)
+        col_object = ColRef(k9name, (list(c['type'].keys()))[0], c, 1, self, c['name'], len(self.columns))
         self.cxt.k9cols_byname[k9name] = col_object
         self.columns_byname[c['name']] = col_object
@@ -62,7 +65,11 @@ class TableInfo:
         return len(self.columns)

     def get_k9colname(self, col_name):
-        return self.columns_byname[col_name].k9name
+        col = self.columns_byname[col_name]
+        if type(self.rec) is list:
+            self.rec.append(col)
+        return col.k9name

     def add_alias(self, alias):
         # TODO: Exception when alias already defined.
         # TODO: Scoping of alias should be constrained in the query.
@@ -158,5 +165,5 @@ class ast_node:
 def include(objs):
     import inspect
     for _, cls in inspect.getmembers(objs):
-        if inspect.isclass(cls) and issubclass(cls, ast_node):
+        if inspect.isclass(cls) and issubclass(cls, ast_node) and not cls.name.startswith('_'):
             ast_node.types[cls.name] = cls
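The new `rec` field turns `TableInfo` into an opt-in reference recorder: while `rec` is a list, every column resolved through `get_k9colname` is also appended to it, which is how the group-by code below discovers which columns a projection actually touched. A hypothetical usage sketch (`ti` stands for any populated `TableInfo`):

    ti.rec = []                         # start recording
    k9name = ti.get_k9colname('price')  # side effect: ColRef appended to ti.rec
    referenced = ti.rec                 # harvest the recorded ColRefs
    ti.rec = None                       # stop recording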

@@ -1,7 +1,7 @@
 # code-gen for data decl languages
-from engine.ast import TableInfo, ast_node, include
+from engine.ast import ColRef, TableInfo, ast_node, include
+from engine.utils import base62uuid

 class create_table(ast_node):
     name = 'create_table'
     def produce(self, node):
@@ -28,5 +28,33 @@ class insert(ast_node):
         # subquery, dispatch to select astnode
         pass

+class k9(ast_node):
+    name = 'k9'
+    def produce(self, node):
+        self.emit(node[self.name])
+
+class load(ast_node):
+    name = "load"
+    def produce(self, node):
+        node = node[self.name]
+        tablename = 'l' + base62uuid(7)
+        keys = 'k' + base62uuid(7)
+        self.emit(f"{tablename}:`csv ? 1:\"{node['file']['literal']}\"")
+        self.emit(f"{keys}:!{tablename}")
+        table: TableInfo = self.context.tables_byname[node['table']]
+        for i, c in enumerate(table.columns):
+            c: ColRef
+            self.emit(f'{c.k9name}:{tablename}[({keys})[{i}]]')
+
+class outfile(ast_node):
+    name = "_outfile"
+    def produce(self, node):
+        out_table: TableInfo = self.parent.out_table
+        self.emit_no_ln(f"\"{node['loc']['literal']}\"1:`csv@[[]")
+        for i, c in enumerate(out_table.columns):
+            self.emit_no_ln(f"{c.name}:{c.k9name}{';' if i < len(out_table.columns) - 1 else ''}")
+        self.emit(']')

 import sys
 include(sys.modules[__name__])
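For illustration, a `LOAD DATA INFILE "test.csv" INTO TABLE test` against a two-column table should emit k9 along these lines (a sketch only; the `l`/`k` suffixes are random base62 ids, and `a`, `b` stand in for whatever k9names the columns were assigned):

    lXXXXXXX:`csv ? 1:"test.csv"
    kXXXXXXX:!lXXXXXXX
    a:lXXXXXXX[(kXXXXXXX)[0]]
    b:lXXXXXXX[(kXXXXXXX)[1]]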

@@ -6,10 +6,12 @@ class expr(ast_node):
     builtin_func_maps = {
         'max': 'max',
         'min': 'min',
-        'avg':'avg',
-        'sum':'sum',
-        'mins': 'mins',
-        'maxs': 'maxs'
+        'avg': 'avg',
+        'sum': 'sum',
+        'mins': ['mins', 'minsw'],
+        'maxs': ['maxs', 'maxsw'],
+        'avgs': ['avgs', 'avgsw'],
+        'sums': ['sums', 'sumsw'],
     }
     binary_ops = {
         'sub':'-',
@@ -22,6 +24,10 @@ class expr(ast_node):
         'gt':'>',
         'lt':'<',
     }
+    compound_ops = {
+        'ge': [2, lambda x: f'~({x[0]}<{x[1]})'],
+        'le': [2, lambda x: f'~({x[0]}>{x[1]})'],
+    }
     unary_ops = {
         'neg' : '-',
         'not' : '~'
@@ -45,19 +51,32 @@ class expr(ast_node):
         if type(node) is dict:
             for key, val in node.items():
                 if key in self.func_maps:
-                    self.k9expr += f"{self.func_maps[key]}("
                     # if type(val) in [dict, str]:
+                    if type(val) is list and len(val) > 1:
+                        k9func = self.func_maps[key]
+                        k9func = k9func[len(val) - 1] if type(k9func) is list else k9func
+                        self.k9expr += f"{k9func}["
+                        for i, p in enumerate(val):
+                            self.k9expr += expr(self, p).k9expr + (';' if i < len(val) - 1 else '')
+                    else:
+                        self.k9expr += f"{self.func_maps[key]}["
                         self.k9expr += expr(self, val).k9expr
-                    self.k9expr += ')'
+                    self.k9expr += ']'
                 elif key in self.binary_ops:
                     l = expr(self, val[0]).k9expr
                     r = expr(self, val[1]).k9expr
                     self.k9expr += f'({l}{self.binary_ops[key]}{r})'
+                elif key in self.compound_ops:
+                    x = []
+                    if type(val) is list:
+                        for v in val:
+                            x.append(expr(self, v).k9expr)
+                    self.k9expr = self.compound_ops[key][1](x)
                 elif key in self.unary_ops:
                     self.k9expr += f'({expr(self, val).k9expr}{self.unary_ops[key]})'
                 else:
                     print(f'Undefined expr: {key}{val}')
         elif type(node) is str:
             p = self.parent
             while type(p) is expr and not p.isvector:
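`compound_ops` rewrites comparisons the k9 target apparently lacks: `x >= y` becomes `~(x < y)` and `x <= y` becomes `~(x > y)`. The rewrite table can be exercised on its own:

    compound_ops = {
        'ge': [2, lambda x: f'~({x[0]}<{x[1]})'],
        'le': [2, lambda x: f'~({x[0]}>{x[1]})'],
    }
    print(compound_ops['ge'][1](['price', '100']))  # ~(price<100)
    print(compound_ops['le'][1](['qty', 'cap']))    # ~(qty>cap)

The `func_maps` change works the same way by arity: a one-argument `mins(x)` keeps the plain prefix-scan `mins`, while a two-argument `mins(w, x)` selects the windowed `minsw` via `k9func[len(val) - 1]`.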

@@ -0,0 +1,37 @@
+from engine.ast import ast_node
+from engine.utils import base62uuid
+from engine.expr import expr
+
+class groupby(ast_node):
+    name = '_groupby'
+    def init(self, _):
+        self.group = 'g' + base62uuid(7)
+        self.datasource = self.parent.datasource
+        self.datasource.rec = []
+    def produce(self, node):
+        if type(node) is not list:
+            node = [node]
+        g_contents = '('
+        for i, g in enumerate(node):
+            v = g['value']
+            e = expr(self, v).k9expr
+            # if v is compound expr, create tmp cols
+            if type(v) is not str:
+                tmpcol = 't' + base62uuid(7)
+                self.emit(f'{tmpcol}:{e}')
+                e = tmpcol
+            g_contents += e + (';' if i < len(node) - 1 else '')
+        self.emit(f'{self.group}:' + g_contents + ')')
+        if len(node) <= 1:
+            self.emit(f'{self.group}:={self.group}')
+        else:
+            self.emit(f'{self.group}:groupby[{self.group}[0];+{self.group}]')
+    def consume(self, _):
+        self.referenced = self.datasource.rec
+        self.datasource.rec = None
+        return super().consume(_)
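`groupby` leans on the `TableInfo.rec` recording added above: `init` starts recording, `consume` harvests the referenced columns into `self.referenced`. Semantically, the emitted k9 `groupby` builds a mapping from each group key to the row indices where it occurs; a Python reference for that assumed behavior:

    from collections import defaultdict

    def group_rows(*cols):
        # assumed semantics of the emitted k9 groupby: key tuple -> row indices
        groups = defaultdict(list)
        for i, key in enumerate(zip(*cols)):
            groups[key].append(i)
        return dict(groups)

    print(group_rows([1, 1, 2], ['x', 'y', 'x']))
    # {(1, 'x'): [0], (1, 'y'): [1], (2, 'x'): [2]}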

@@ -1,9 +1,10 @@
-from engine.ast import TableInfo, ast_node, Context, include
+from engine.ast import ColRef, TableInfo, ast_node, Context, include
+from engine.groupby import groupby
 from engine.join import join
 from engine.expr import expr
 from engine.scan import filter
-from engine.utils import base62uuid
+from engine.utils import base62uuid, enlist, base62alp
+from engine.ddl import outfile

 class projection(ast_node):
     name = 'select'
     def __init__(self, parent: ast_node, node, context: Context = None, outname = None, disp = True):
@@ -35,11 +36,13 @@ class projection(ast_node):
                 projection(self, from_clause, disp = False)
             else:
                 # TODO: from func over table
-                print(f"from func over table{node}")
+                print(f'from func over table{node}')
         elif type(value) is str:
             self.datasource = self.context.tables_byname[value]
         if 'assumptions' in from_clause:
-            ord = from_clause['assumptions']['ord'] == 'asc'
+            for assumption in enlist(from_clause['assumptions']):
+                ord = assumption['ord'] == 'asc'
+                attrib = assumption['attrib']
                 ord = '^' if ord else '|^'
                 # TODO: generate view of table by order
@@ -57,26 +60,70 @@ class projection(ast_node):
             self.datasource = filter(self, node['where'], True).output
             self.context.datasource = self.datasource

+        if 'groupby' in node:
+            self.group_node = groupby(self, node['groupby'])
+        else:
+            self.group_node = None

-    def consume(self, _):
+    def consume(self, node):
         disp_varname = 'd' + base62uuid(7)
+        pcolrefs = []
+        if type(self.group_node) is groupby:
+            grp_table = self.group_node.group
+            grp_refs = self.group_node.referenced
+            for i, proj in enumerate(self.projections):
+                self.datasource.rec = []
+                cname = ''
+                if type(proj) is dict:
+                    if 'value' in proj:
+                        e = proj['value']
+                        if type(e) is str:
+                            cname = self.datasource.parse_tablenames(proj['value'])
+                        elif type(e) is dict:
+                            cname = expr(self, e).k9expr
+                        cname = ''.join([a if a in base62alp else '' for a in cname])
+                pcolrefs.append(self.datasource.rec)
+                self.datasource.rec = None
+            keys = 'k' + base62uuid(7)
+            self.emit(f'{keys}:!{grp_table}')
+            fn = 'fn' + base62uuid(6)
+            # self.emit
         self.emit_no_ln(f'{disp_varname}:(')
+        flatten = False
+        cols = []
+        self.out_table = TableInfo('out_' + base62uuid(4), [], self.context)
+        if 'outfile' in node:
+            flatten = True
         for i, proj in enumerate(self.projections):
+            cname = ''
             if type(proj) is dict:
                 if 'value' in proj:
                     e = proj['value']
                     if type(e) is str:
-                        self.emit_no_ln(f"{self.datasource.parse_tablenames(proj['value'])}")
+                        cname = self.datasource.parse_tablenames(proj['value'])
+                        self.emit_no_ln(f"{cname}")
                     elif type(e) is dict:
-                        self.emit_no_ln(f"{expr(self, e).k9expr}")
+                        cname = expr(self, e).k9expr
+                        self.emit_no_ln(f"{cname}")
+                    cname = ''.join([a if a in base62alp else '' for a in cname])
             self.emit_no_ln(';' if i < len(self.projections) - 1 else '')
+            cols.append(ColRef(f'(+{disp_varname})[{i}]', 'generic', self.out_table, 0, None, cname, i))
         self.emit(')')
-        if self.disp:
+        if flatten:
+            self.emit_no_ln(f'{disp_varname}:' if flatten else '')
+        if flatten or self.disp:
             if len(self.projections) > 1:
-                self.emit(f'+{disp_varname}')
+                self.emit(f"+{disp_varname}")
             else:
                 self.emit(f'+,(,{disp_varname})')
+            if flatten:
+                self.emit(f'{disp_varname}')
+        if flatten:
+            self.out_table.columns = cols
+            outfile(self, node['outfile'])

         if self.datasource_changed:
             self.context.datasource = self.prev_datasource
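When a query ends in `INTO OUTFILE`, `consume` now materializes the projection into a named variable (`flatten`), builds an `out_table` whose column names are the projection text stripped to base62 characters, and hands it to the `outfile` node. The name stripping is easy to check in isolation:

    base62alp = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    cname = ''.join([a if a in base62alp else '' for a in 'sum(c)'])
    print(cname)  # sumc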

@@ -1,12 +1,19 @@
 import uuid

+base62alp = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
+
 def base62uuid(crop=8):
-    alp = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
     id = uuid.uuid4().int
     ret = ''
     while id:
-        ret = alp[id % 62] + ret
+        ret = base62alp[id % 62] + ret
         id //= 62
     return ret[:crop] if len(ret) else '0'
+
+def enlist(l):
+    return l if type(l) is list else [l]
+
+def seps(s, i, l):
+    return s if i < len(l) - 1 else ''
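`enlist` papers over the parser's one-vs-many results (a single match comes back bare, multiple matches as a list), and `seps` centralizes the no-trailing-separator pattern used throughout the emitters:

    from engine.utils import enlist, seps

    print(enlist('x'))               # ['x']
    print(enlist(['x', 'y']))        # ['x', 'y']
    print(seps(';', 0, ['a', 'b']))  # ';'  (not the last element)
    print(seps(';', 1, ['a', 'b']))  # ''   (last element: no separator)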

@@ -1,2 +1,28 @@
+import`csv
 maxs:{[L]{max(x, y)}\L}
 mins:{[L]{min(x, y)}\L}
+sums:{[L]{(x + y)}\L}
+avgsimpl:{[L;i] curr:L[i]%(i+1); $[i<(#L)-1;curr, avgsimpl[L;i+1];curr]}
+avgs:{[L] avgsimpl[sums[L];0]}
+maxswimp:{[L;w;i] curr:max(L@(((i-w)+!w)|0)); $[i<#L;curr, maxswimp[L; w; i + 1];curr]}
+maxsw:{[w;L]maxswimp[L; w; 1]}
+minswimp:{[L;w;i] curr:min(L@(((i-w)+!w)|0)); $[i<#L;curr, minswimp[L; w; i + 1];curr]}
+minsw:{[w;L]minswimp[L;w;1]}
+avgswimp:{[L;w;s;i] s:(s+L[i])-L[i-w];curr:s%((i+1)&w);$[i<(#L)-1; curr, avgswimp[L; w; s; i+1]; curr]}
+avgsw:{[w;L] avgswimp[L;w;0;0]}
+sumswimp:{[L;w;s;i] s:(s+L[i])-L[i-w];$[i<(#L)-1; s, sumswimp[L; w; s; i+1]; s]}
+sumsw:{[w;L] sumswimp[L;w;0;0]}
+groupbyi:{[L;GV;i]
+    k:(,(L[i]));gvk:GV[k][0];
+    found:$[(gvk[0]+gvk[1])>0;1;L[i] in !GV];
+    cg:(,L[i])!$[found;,gvk[0],i;,(,i)];
+    $[i<(#L)-1; groupbyi[L;(GV,cg);i+1]; (GV,cg)]}
+groupbys:{[L;ll] GV1:(,(L[0]))!,(,0);$[ll>1;groupbyi[L;GV1;1];GV1]}
+groupby:{[l;L] $[(#l)=0;,();groupbys[L;#l]]}
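The `*sw` helpers compute size-`w` sliding windows whose left edge is clamped to the start of the list (the `((i-w)+!w)|0` indexing). A Python reference for the assumed semantics of `maxsw`:

    def maxsw(w, L):
        # window of the w positions ending at i, clamped at the left edge
        return [max(L[max(0, i - w):i]) for i in range(1, len(L) + 1)]

    print(maxsw(3, [1, 5, 2, 4, 3]))  # [1, 5, 5, 5, 4]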

@@ -1,6 +1,6 @@
 Month,sales
 1,100
 2,120
-4,140
 3,140
+4,140
 5,130


@@ -3,6 +3,13 @@ import aquery_parser as parser
 import engine
 import subprocess
+import sys
+
+if sys.platform != 'win32':
+    import readline
+# else:
+#     import pyreadline3

 test_parser = True

 # code to test parser
@@ -37,8 +44,8 @@ while test_parser:
             print(stmts)
             continue
         trimed = ws.sub(' ', q.lower()).split(' ')
-        if trimed[0] == 'file':
-            fn = 'q.sql' if len(trimed) <= 1 or len(trimed[1]) == 0 \
+        if trimed[0].startswith('f'):
+            fn = 'stock.a' if len(trimed) <= 1 or len(trimed[1]) == 0 \
                 else trimed[1]
             with open(fn, 'r') as file:
@@ -47,6 +54,6 @@ while test_parser:
             continue
         stmts = parser.parse(q)
         print(stmts)
-    except ValueError as e:
+    except (ValueError) as e:
         print(type(e), e)

@@ -0,0 +1,9 @@
+CREATE TABLE test(a INT, b INT, c INT, d INT)
+
+LOAD DATA INFILE "test.csv"
+INTO TABLE test
+FIELDS TERMINATED BY ","
+
+SELECT sum(c), b, d
+FROM test
+group by a,b,d

@@ -2,3 +2,4 @@ mo-future
 mo-dots==8.20.21357
 mo-parsing
 mo-imports
+readline; sys_platform != 'win32'

@@ -17,11 +17,13 @@ INSERT INTO stocks VALUES(14,5)
 INSERT INTO stocks VALUES(15,2)
 INSERT INTO stocks VALUES(16,5)

-SELECT max(price-min(timestamp)) FROM stocks
-SELECT price, timestamp FROM stocks where price -timestamp > 1 and not (price*timestamp<100)
+<k> "q1" </k>
+SELECT max(price-min(timestamp)) FROM stocks
+<k> "q2" </k>
+SELECT price, timestamp FROM stocks where price -timestamp > 1 and not (price*timestamp<100);
+<k> "q3"</k>
 SELECT max(price-mins(price))
 FROM stocks
 ASSUMING ASC timestamp
