updated code for groupby and windowed aggregations

dev
Bill Sun 3 years ago
parent 277dad6b3e
commit 7eac7837a3

6
.gitignore vendored

@ -20,4 +20,8 @@ k
*.pdf
test*.c*
*.csv
*.out
*.out
*.asm
!mmw.so
*.k
!header.k

@ -0,0 +1,8 @@
# Build rules for mmw.so, the shared library compiled from mmw.cpp.
#
# None of these targets names a file it creates, so they are all declared
# phony; otherwise a stray file called "all" or "clean" would make the rule
# look up to date and silently skip it.
.PHONY: all avx512 debug clean

# Flags shared by every build variant.
COMMON  = --std=c++1z -shared -fPIC
# Flags shared by the two optimised variants (no debug info, stripped).
RELEASE = -Ofast -g0 -s

# Optimised build tuned for the CPU of the build machine.
all:
	g++ mmw.cpp $(COMMON) $(RELEASE) -march=native -o mmw.so

# Optimised build that enables AVX-512F explicitly instead of -march=native.
avx512:
	g++ mmw.cpp $(COMMON) $(RELEASE) -mavx512f -o mmw.so

# Unoptimised build with full debug information.
debug:
	g++ mmw.cpp $(COMMON) -O0 -march=native -g3 -o mmw.so

# Options go before the operand so this also works where rm does not
# permute its arguments.
clean:
	rm -rf mmw.so

@ -4,7 +4,7 @@ from engine.utils import base62uuid
# replace column info with this later.
class ColRef:
def __init__(self, k9name, _ty, cobj, cnt, table, name, id):
def __init__(self, k9name, _ty, cobj, cnt, table, name, id, order = None, compound = False):
self.k9name = k9name
self.type = _ty
self.cobj = cobj
@ -12,6 +12,9 @@ class ColRef:
self.table = table
self.name = name
self.id = id
self.order = order # True -> asc, False -> dsc; None -> unordered
self.compound = compound # compound field (list as a field)
self.views = []
self.__arr__ = (k9name, _ty, cobj, cnt, table, name, id)
def __getitem__(self, key):
@ -31,6 +34,7 @@ class TableInfo:
self.cxt = cxt
self.views = set()
self.rec = None
self.groupinfo = None
for c in cols:
self.add_col(c)
@ -44,13 +48,6 @@ class TableInfo:
if type(c) is ColRef:
c = c.cobj
k9name = 'c' + base62uuid(7)
# k9name = self.table_name + c['name']
# if k9name in self.cxt.k9cols_byname: # duplicate names?
# root = self.cxt.k9cols_byname[k9name]
# k9name = k9name + root.cnt
# root.cnt += 1
# column: (k9name, type, original col_object, dup_count)
col_object = ColRef(k9name, (list(c['type'].keys()))[0], c, 1, self,c['name'], len(self.columns))
self.cxt.k9cols_byname[k9name] = col_object

@ -37,15 +37,18 @@ class load(ast_node):
name="load"
def produce(self, node):
    """Emit the k9 statements that load a CSV file into a table's columns.

    ``node[self.name]`` must provide ``'table'`` (a key of
    ``self.context.tables_byname``) and ``node['file']['literal']`` (the
    path written into the emitted code).

    Two kinds of statement are emitted through ``self.emit``:
      1. one statement binding a freshly named variable to the CSV
         contents, keyed by and then indexed with one generated symbol
         per table column;
      2. one statement per column binding ``c.k9name`` to the i-th entry
         of that variable.
    """
    node = node[self.name]
    table:TableInfo = self.context.tables_byname[node['table']]
    n_keys = len(table.columns)
    keys = ''
    # n_keys is an int, so it has to be wrapped in range() to be iterated;
    # one generated `tk… symbol is appended per column.
    for _ in range(n_keys):
        keys+='`tk'+base62uuid(6)
    tablename = 'l'+base62uuid(7)
    self.emit(f"{tablename}:[{keys}!+(`csv ? 1:\"{node['file']['literal']}\")][{keys}]")
    for i, c in enumerate(table.columns):
        c:ColRef
        # Columns are addressed by position, in table.columns order.
        self.emit(f'{c.k9name}:{tablename}[{i}]')
class outfile(ast_node):
name="_outfile"

@ -8,17 +8,17 @@ class expr(ast_node):
'min': 'min',
'avg': 'avg',
'sum': 'sum',
'mod':'mod',
'mins': ['mins', 'minsw'],
'maxs': ['maxs', 'maxsw'],
'avgs': ['avgs', 'avgsw'],
'sums': ['sums', 'sumsw'],
}
binary_ops = {
'sub':'-',
'sub':'-',
'add':'+',
'mul':'*',
'div':'%',
'mod':'mod',
'and':'&',
'or':'|',
'gt':'>',

@ -12,7 +12,7 @@ class groupby(ast_node):
if type(node) is not list:
node = [node]
g_contents = '('
first_col = ''
for i, g in enumerate(node):
v = g['value']
e = expr(self, v).k9expr
@ -21,7 +21,8 @@ class groupby(ast_node):
tmpcol = 't' + base62uuid(7)
self.emit(f'{tmpcol}:{e}')
e = tmpcol
if i == 0:
first_col = e
g_contents += e + (';'if i < len(node)-1 else '')
self.emit(f'{self.group}:'+g_contents+')')
@ -29,8 +30,8 @@ class groupby(ast_node):
if len(node) <= 1:
self.emit(f'{self.group}:={self.group}')
else:
self.emit(f'{self.group}:groupby[{self.group}[0];+{self.group}]')
self.emit(f'{self.group}:groupby[+({self.group},(,!(#({first_col}))))]')
def consume(self, _):
self.referenced = self.datasource.rec
self.datasource.rec = None

@ -5,6 +5,8 @@ from engine.expr import expr
from engine.scan import filter
from engine.utils import base62uuid, enlist, base62alp
from engine.ddl import outfile
import copy
class projection(ast_node):
name='select'
def __init__(self, parent:ast_node, node, context:Context = None, outname = None, disp = True):
@ -62,6 +64,8 @@ class projection(ast_node):
if 'groupby' in node:
    self.group_node = groupby(self, node['groupby'])
    # This file binds `copy` to the module (`import copy`), so the shallow
    # copy has to go through copy.copy(); calling the module itself raises
    # TypeError. The copy keeps the groupinfo assignment below off the
    # original datasource object.
    self.datasource = copy.copy(self.datasource) # shallow copy
    self.datasource.groupinfo = self.group_node
else:
    # No GROUP BY clause: nothing to attach.
    self.group_node = None

@ -1,5 +1,7 @@
import`csv
md:{y-x*_y%x}
maxs:{[L]{max(x, y)}\L}
mins:{[L]{min(x, y)}\L}
sums:{[L]{(x + y)}\L}
@ -7,22 +9,56 @@ sums:{[L]{(x + y)}\L}
avgsimpl:{[L;i] curr:L[i]%(i+1); $[i<(#L)-1;curr, avgsimpl[L;i+1];curr]}
avgs:{[L] avgsimpl[sums[L];0]}
maxswimp:{[L;w;i] curr:max(L@(((i-w)+!w)|0)); $[i<#L;curr, maxswimp[L; w; i + 1];curr]}
maxsw:{[w;L]maxswimp[L; w; 1]}
/ maxswimp:{[L;w;i] curr:max(L@(((i-w)+!w)|0)); $[i<#L;curr, maxswimp[L; w; i + 1];curr]}
/ maxsw:{[w;L]maxswimp[L; w; 1]}
/ minswimp:{[L;w;i] curr:min(L@(((i-w)+!w)|0)); $[i<#L;curr, maxswimp[L; w; i + 1];curr]}
/ minsw:{[w;L]minswimp[L;w;1]}
/ avgswimp:{[L;w;s;i] s:(s+L[i])-L[i-w];curr:s%((i+1)&w);$[i<(#L)-1; curr, avgswimp[L; w; s; i+1]; curr]}
/ avgsw:{[w;L] avgswimp[L;w;0;0]}
/ sumswimp:{[L;w;s;i] s:(s+L[i])-L[i-w];$[i<(#L)-1; s, sumswimp[L; w; s; i+1]; s]}
/ sumsw:{[w;L] sumswimp[L;w;0;0]}
groupby0:{[L]
{[x;y]
x:$[(@x)=`i;(,(L[0]))!,(,0);x];
k:(,(L[y]));gvk:x[k][0];
found:$[(gvk[0]+gvk[1])>0;1;L[y] in !x];
cg:(,L[y])!$[found;,gvk[0],y;,(,y)];
(x,cg)}/!(#L)}
groupBy:{[x]groupBySingle:{[a;x]
findAll:{[c;xx]
f:{[i;c]$[(c[0])[i]~c[1];i+1;0]};
@[!#xx;!#xx;f;(#xx)#,(xx;c)]};
z:findAll[a;x];
b:(findAll[0;z]_(!(1+#z)))-1;(a;b)};
x:+x;y:?x;
@[y;!#y;groupBySingle;(#y)#,x]}
minswimp:{[L;w;i] curr:min(L@(((i-w)+!w)|0)); $[i<#L;curr, maxswimp[L; w; i + 1];curr]}
minsw:{[w;L]minswimp[L;w;1]}
groupby:{[L]
L:^+L;
dimy:(#(L[0]))-1;
((({[L;dim;x;y]
x:$[x~0;(,(dim#(L[0])),0);x];
curr:dim#(L[y]);
$[(dim#*x)~curr;x;((,curr,y),x)]}[L;dimy])/!(#L));(+L)[dimy]) }
avgswimp:{[L;w;s;i] s:(s+L[i])-L[i-w];curr:s%((i+1)&w);$[i<(#L)-1; curr, avgswimp[L; w; s; i+1]; curr]}
avgsw:{[w;L] avgswimp[L;w;0;0]}
lststr:{[L](+({[x;y] ($x,$y)}/L))[0]}
delist:{[L] $[(@L)in(`LL`LC`LG`L);delist[(,/L)];L]}
cntlist:{[L;i] $[(@L)in(`LL`LC`LG`L);cntlist[(,/L);i+1];i+1]}
sumswimp:{[L;w;s;i] s:(s+L[i])-L[i-w];$[i<(#L)-1; s, sumswimp[L; w; s; i+1]; s]}
sumsw:{[w;L] sumswimp[L;w;0;0]}
sumswkrl:{[L;w;x;y] ((x-L[y-w])+L[y])}
sumsw:{[L;w] $[(#L)=0;L;(sumswkrl[L;w])\@[!#L;0;L[0]]]}
avgswkrl:{[L;w;x;y] (x-(L[y-w]-L[y])%w)}
avgsw:{[L;w] $[(#L)=0;L;(avgswkrl[L;w])\@[!#L;0;L[0]]]}
groupbyi:{[L;GV;i]
k:(,(L[i]));gvk:GV[k][0];
found:$[(gvk[0]+gvk[1])>0;1;L[i] in !GV];
cg:(,L[i])!$[found;,gvk[0],i;,(,i)];
$[i<(#L)-1; groupbyi[L;(GV,cg);i+1]; (GV,cg)]}
groupbys:{[L;ll] GV1:(,(L[0]))!,(,0);$[ll>1;groupbyi[L;GV1;1];GV1]}
groupby:{[l;L] $[(#l)=0;,();groupbys[L;#l]]}
/ minsw:{[w;L] ({[L;w;x] min(L[$[x>w;(!w) + ((x-w)+1);!(x+1)]])}[L;w])'!#L}
import`mmw
/ Windowed min/max wrappers around the native mmw function.
/ Every call hands mmw the data plus a 4-item list built from: a tag derived
/ from the first character of $@ of the data, the count #, the window w, and
/ an op code (65536 in the min* wrappers, 65537 in the max* wrappers).
/ minsw/maxsw pass a local ret (assigned from L) and return ret.
/ NOTE(review): presumably ret:L keeps the caller's L untouched - confirm.
minsw:{[w;L] ret:L; mmw[ret;((`g ($@ret)[0]), (#ret), w, 65536)];ret}
maxsw:{[w;L] ret:L; mmw[ret;((`g ($@ret)[0]), (#ret), w, 65537)];ret}
/ minswip/maxswip pass L itself and end with ';' instead of naming a result.
/ NOTE(review): presumably the "ip" suffix means in-place - confirm.
minswip:{[w;L] mmw[L;((`g ($@L)[0]), (#L), w, 65536)];}
maxswip:{[w;L] mmw[L;((`g ($@L)[0]), (#L), w, 65537)];}

@ -0,0 +1,48 @@
#include <cstring>
#include <cstdlib>
#include <cstdint>
#include <deque>
using std::size_t;
using std::uint32_t;
/// Sliding-window minimum or maximum, computed in place.
///
/// After the call, element i holds the minimum (minmax == true) or the
/// maximum (minmax == false) of the original elements at indices
/// max(0, i-w+1) .. i.
///
/// @tparam T       element type stored behind `array`.
/// @tparam minmax  true selects the minimum, false the maximum.
/// @param array    buffer of `len` elements of type T; overwritten.
/// @param len      number of elements.
/// @param w        window length. With w == 0 no element is ever evicted,
///                 so the result is the running extreme from index 0.
template<class T, bool minmax>
void running(void *array, uint32_t len, uint32_t w){
    T* const arr = static_cast<T*>(array);
    // Monotonic queue of (original value, index). Values are stored rather
    // than re-read because arr is overwritten as the scan proceeds.
    std::deque<std::pair<T, uint32_t>> cache;
    // The index has the same unsigned type as len: no signed/unsigned
    // comparison and no signed overflow for very long inputs.
    for(uint32_t i = 0; i < len; ++i){
        // Evict the front once it is exactly w positions behind; the
        // i >= w guard spells out the case that used to depend on
        // unsigned wrap-around of i - w.
        if(!cache.empty() && i >= w && cache.front().second == i - w)
            cache.pop_front();
        const T cur = arr[i];
        // Drop queued candidates that can no longer be the extreme
        // because cur beats them.
        if constexpr(minmax)
            while(!cache.empty() && cache.back().first > cur) cache.pop_back();
        else
            while(!cache.empty() && cache.back().first < cur) cache.pop_back();
        cache.emplace_back(cur, i);
        arr[i] = cache.front().first;
    }
}
/// Turns the runtime flag into the compile-time `minmax` argument of
/// running<T, minmax>: `mm == true` calls running<T, true>, otherwise
/// running<T, false>.
///
/// @param array  buffer forwarded to running().
/// @param len    element count forwarded to running().
/// @param w      window length forwarded to running().
/// @param mm     selects which instantiation of running() is called.
template<class T>
inline void mm(void *array, uint32_t len, uint32_t w, bool mm){
    if(mm)
        running<T, true>(array, len, w);
    else
        running<T, false>(array, len, w);
}
// Included before the linkage block so the header is not itself processed
// inside extern "C".
#include <stdio.h>
extern "C" {
/// C entry point for the windowed min/max kernels.
///
/// @param array  buffer to transform in place.
/// @param misc   four values:
///               misc[0] type character selecting the element type,
///               misc[1] element count,
///               misc[2] window length,
///               misc[3] operation code: 0x10000 = minimum, anything else
///                       (callers send 0x10001) = maximum.
/// @return always 0; an unhandled type only prints "nyi <type>".
int mmw(void *array, unsigned long long misc[]){
    const char _ty = static_cast<char>(misc[0]);
    // Counts and windows are truncated to 32 bits, as before.
    const uint32_t len = static_cast<uint32_t>(misc[1]);
    const uint32_t w = static_cast<uint32_t>(misc[2]);
    // The minsw wrappers send 0x10000 and the maxsw wrappers 0x10001.
    const bool want_max = (misc[3] - 0x10000) != 0;
    // mm<T>() takes `true` for the minimum and `false` for the maximum, so
    // the flag is inverted here; passing it straight through made minsw
    // return maxima and maxsw return minima.
    const bool minmax = !want_max;
    // NOTE(review): 'H' and 'I' are compared as unsigned values; if those
    // type codes denote signed integers, negative inputs order incorrectly
    // - confirm against the caller's type system.
    switch(_ty){
        case 'F': mm<double>(array, len, w, minmax); break;
        case 'C': case 'G': mm<unsigned char>(array, len, w, minmax); break;
        case 'H': mm<unsigned short>(array, len, w, minmax); break;
        case 'D': case 'I': mm<unsigned int>(array, len, w, minmax); break;
        case 'T': case 'J': mm<long long>(array, len, w, minmax); break;
        case 'L':
            // An empty list needs no work; a non-empty one is unsupported
            // and is reported below.
            if(len == 0) break;
            [[fallthrough]];
        default: printf("nyi %c\n", _ty);
    }
    return 0;
}
}

BIN
mmw.so

Binary file not shown.

@ -7,9 +7,6 @@ import sys
if sys.platform != 'win32':
import readline
# else:
# import pyreadline3
test_parser = True
# code to test parser

Loading…
Cancel
Save