updated code for groupby and windowed aggregations

4 years ago · 7eac7837a3
parent 277dad6b3e
commit 7eac7837a3
11 changed files with 137 additions and 39 deletions
--- a/.gitignore
+++ b/.gitignore
@ -20,4 +20,8 @@ k
 *.pdf
 test*.c*
 *.csv
-*.out
+*.out
+*.asm
+!mmw.so
+*.k
+!header.k
--- a/8
+++ b/8
@ -0,0 +1,8 @@
+# Build rules for mmw.so, the shared library compiled from mmw.cpp.
+# None of these targets names a file it creates, so they are declared phony;
+# otherwise a stray file called "all", "debug" or "clean" would stop them running.
+.PHONY: all avx512 debug clean
+# Optimised build tuned for the CPU of the build machine.
+all:
+	g++ mmw.cpp --std=c++1z -shared -fPIC -Ofast -march=native -g0 -s -o mmw.so
+# Optimised build with AVX-512F code generation enabled.
+avx512:
+	g++ mmw.cpp --std=c++1z -shared -fPIC -Ofast -mavx512f -g0 -s -o mmw.so
+# Unoptimised build with full debug information.
+debug:
+	g++ mmw.cpp --std=c++1z -shared -fPIC -O0 -march=native -g3 -o mmw.so
+# Remove the built library. Options come before the operand so that rm
+# implementations without GNU-style argument permutation accept them.
+clean:
+	rm -rf mmw.so
--- a/engine/ast.py
+++ b/engine/ast.py
@ -4,7 +4,7 @@ from engine.utils import base62uuid

 # replace column info with this later.
 class ColRef:
-    def __init__(self, k9name, _ty, cobj, cnt, table, name, id):
+    def __init__(self, k9name, _ty, cobj, cnt, table, name, id, order = None, compound = False):
        self.k9name = k9name
        self.type = _ty
        self.cobj = cobj
@ -12,6 +12,9 @@ class ColRef:
        self.table = table
        self.name = name
        self.id = id
+        self.order = order # True -> asc, False -> dsc; None -> unordered
+        self.compound = compound # compound field (list as a field) 
+        self.views = []
        self.__arr__ = (k9name, _ty, cobj, cnt, table, name, id)
        
    def __getitem__(self, key):
@ -31,6 +34,7 @@ class TableInfo:
        self.cxt = cxt
        self.views = set()
        self.rec = None 
+        self.groupinfo = None
        for c in cols:
            self.add_col(c)

@ -44,13 +48,6 @@ class TableInfo:
        if type(c) is ColRef:
            c = c.cobj
        k9name = 'c' + base62uuid(7)
-        # k9name = self.table_name + c['name']
-        # if k9name in self.cxt.k9cols_byname: # duplicate names?
-        #     root = self.cxt.k9cols_byname[k9name] 
-        #     k9name = k9name + root.cnt
-        #     root.cnt += 1
-
-        # column: (k9name, type, original col_object, dup_count)
        col_object =  ColRef(k9name, (list(c['type'].keys()))[0], c, 1, self,c['name'], len(self.columns))

        self.cxt.k9cols_byname[k9name] = col_object
--- a/engine/ddl.py
+++ b/engine/ddl.py
@ -37,15 +37,18 @@ class load(ast_node):
    name="load"
    def produce(self, node):
        node = node[self.name]
-        tablename = 'l'+base62uuid(7)
-        keys = 'k'+base62uuid(7)
-        self.emit(f"{tablename}:`csv ? 1:\"{node['file']['literal']}\"")
-        self.emit(f"{keys}:!{tablename}")
        table:TableInfo = self.context.tables_byname[node['table']]
-        
+        n_keys = len(table.columns)
+        keys = ''
+        for _ in n_keys:
+            keys+='`tk'+base62uuid(6)
+        tablename = 'l'+base62uuid(7)        
+
+        self.emit(f"{tablename}:[{keys}!+(`csv ? 1:\"{node['file']['literal']}\")][{keys}]")
+
        for i, c in enumerate(table.columns):
            c:ColRef
-            self.emit(f'{c.k9name}:{tablename}[({keys})[{i}]]')
+            self.emit(f'{c.k9name}:{tablename}[{i}]')
            
 class outfile(ast_node):
    name="_outfile"
--- a/engine/expr.py
+++ b/engine/expr.py
@ -8,17 +8,17 @@ class expr(ast_node):
        'min': 'min', 
        'avg': 'avg',
        'sum': 'sum',
+        'mod':'mod',
        'mins': ['mins', 'minsw'],
        'maxs': ['maxs', 'maxsw'],
        'avgs': ['avgs', 'avgsw'],
        'sums': ['sums', 'sumsw'],
    }
    binary_ops = {
-        'sub':'-', 
+        'sub':'-',  
        'add':'+', 
        'mul':'*', 
        'div':'%',
-        'mod':'mod',
        'and':'&',
        'or':'|',
        'gt':'>',
--- a/engine/groupby.py
+++ b/engine/groupby.py
@ -12,7 +12,7 @@ class groupby(ast_node):
        if type(node) is not list:
            node = [node]
        g_contents = '('
-        
+        first_col = ''
        for i, g in enumerate(node):
            v = g['value']
            e = expr(self, v).k9expr
@ -21,7 +21,8 @@ class groupby(ast_node):
                tmpcol = 't' + base62uuid(7)
                self.emit(f'{tmpcol}:{e}')
                e = tmpcol
-
+            if i == 0:
+                first_col = e
            g_contents += e + (';'if i < len(node)-1 else '')
            
        self.emit(f'{self.group}:'+g_contents+')')
@ -29,8 +30,8 @@ class groupby(ast_node):
        if len(node) <= 1:
            self.emit(f'{self.group}:={self.group}')
        else:
-            self.emit(f'{self.group}:groupby[{self.group}[0];+{self.group}]')
-    
+            self.emit(f'{self.group}:groupby[+({self.group},(,!(#({first_col}))))]')
+        
    def consume(self, _):
        self.referenced = self.datasource.rec
        self.datasource.rec = None
--- a/engine/projection.py
+++ b/engine/projection.py
@ -5,6 +5,8 @@ from engine.expr import expr
 from engine.scan import filter
 from engine.utils import base62uuid, enlist, base62alp
 from engine.ddl import outfile
+import copy
+
 class projection(ast_node):
    name='select'
    def __init__(self, parent:ast_node, node, context:Context = None, outname = None, disp = True):
@ -62,6 +64,8 @@ class projection(ast_node):

        if 'groupby' in node:
            self.group_node = groupby(self, node['groupby'])
+            self.datasource = copy(self.datasource) # shallow copy
+            self.datasource.groupinfo = self.group_node
        else:
            self.group_node = None
            
--- a/header.k
+++ b/header.k
@ -1,5 +1,7 @@
 import`csv

+/ md[x;y]: evaluates y-x*_(y%x). With % as division this is the remainder of y
+/ divided by x. NOTE(review): assumes monadic _ is floor in this k dialect - confirm.
+md:{y-x*_y%x}
+
 maxs:{[L]{max(x, y)}\L}
 mins:{[L]{min(x, y)}\L}
 sums:{[L]{(x + y)}\L}
@ -7,22 +9,56 @@ sums:{[L]{(x + y)}\L}
 avgsimpl:{[L;i] curr:L[i]%(i+1); $[i<(#L)-1;curr, avgsimpl[L;i+1];curr]}
 avgs:{[L] avgsimpl[sums[L];0]}

-maxswimp:{[L;w;i] curr:max(L@(((i-w)+!w)|0)); $[i<#L;curr, maxswimp[L; w; i + 1];curr]}
-maxsw:{[w;L]maxswimp[L; w; 1]}
+/ maxswimp:{[L;w;i] curr:max(L@(((i-w)+!w)|0)); $[i<#L;curr, maxswimp[L; w; i + 1];curr]}
+/ maxsw:{[w;L]maxswimp[L; w; 1]}
+
+/ minswimp:{[L;w;i] curr:min(L@(((i-w)+!w)|0)); $[i<#L;curr, maxswimp[L; w; i + 1];curr]}
+/ minsw:{[w;L]minswimp[L;w;1]}
+
+/ avgswimp:{[L;w;s;i] s:(s+L[i])-L[i-w];curr:s%((i+1)&w);$[i<(#L)-1; curr, avgswimp[L; w; s; i+1]; curr]}
+/ avgsw:{[w;L] avgswimp[L;w;0;0]}
+
+/ sumswimp:{[L;w;s;i] s:(s+L[i])-L[i-w];$[i<(#L)-1; s, sumswimp[L; w; s; i+1]; s]}
+/ sumsw:{[w;L] sumswimp[L;w;0;0]}
+
+
+/ groupby0[L]: folds a two-argument lambda over the indices !(#L), accumulating in x.
+/ While x is still an int atom (the first step) it is replaced by a dictionary
+/ that maps L[0] to (,0). Each step builds a one-entry dictionary cg keyed by L[y]
+/ and returns x joined with cg.
+/ NOTE(review): appears to map every distinct value of L to the list of positions
+/ where it occurs - confirm.
+groupby0:{[L] 
+            {[x;y]
+                x:$[(@x)=`i;(,(L[0]))!,(,0);x];
+                k:(,(L[y]));gvk:x[k][0];
+                found:$[(gvk[0]+gvk[1])>0;1;L[y] in !x];
+                cg:(,L[y])!$[found;,gvk[0],y;,(,y)];
+                (x,cg)}/!(#L)}
+
+/ groupBy[x]: flips x, takes y:?x, and applies groupBySingle to each item of y
+/ paired with the flipped x.
+/ groupBySingle[a;x] returns the pair (a;b). Its helper findAll[c;xx] yields, for
+/ each position i of xx, i+1 when xx[i] matches c and 0 otherwise; b is derived
+/ from two findAll passes.
+/ NOTE(review): appears to return, for every distinct row, the row together with
+/ the positions at which it occurs - confirm.
+groupBy:{[x]groupBySingle:{[a;x]
+        findAll:{[c;xx]
+            f:{[i;c]$[(c[0])[i]~c[1];i+1;0]};
+            @[!#xx;!#xx;f;(#xx)#,(xx;c)]};
+        z:findAll[a;x];
+        b:(findAll[0;z]_(!(1+#z)))-1;(a;b)};
+    x:+x;y:?x;
+    @[y;!#y;groupBySingle;(#y)#,x]}

-minswimp:{[L;w;i] curr:min(L@(((i-w)+!w)|0)); $[i<#L;curr, maxswimp[L; w; i + 1];curr]}
-minsw:{[w;L]minswimp[L;w;1]}
+/ groupby[L]: L is first replaced by ^+L, and dimy is one less than the length of L[0].
+/ The fold walks the indices !(#L) keeping a list x of entries. x is seeded with one
+/ entry made of the first dimy items of L[0] followed by 0. At each step a new entry
+/ (the first dimy items of L[y], then y) is prepended when those items differ from
+/ the leading dimy items of the head entry; otherwise x is kept unchanged.
+/ Returns a pair: the folded list and (+L)[dimy].
+/ NOTE(review): presumably ^ sorts so equal keys become adjacent, and the last item
+/ of each row is the original row number - confirm against the caller.
+groupby:{[L]
+        L:^+L;
+        dimy:(#(L[0]))-1;
+        ((({[L;dim;x;y] 
+            x:$[x~0;(,(dim#(L[0])),0);x];
+            curr:dim#(L[y]);
+            $[(dim#*x)~curr;x;((,curr,y),x)]}[L;dimy])/!(#L));(+L)[dimy]) }

-avgswimp:{[L;w;s;i] s:(s+L[i])-L[i-w];curr:s%((i+1)&w);$[i<(#L)-1; curr, avgswimp[L; w; s; i+1]; curr]}
-avgsw:{[w;L] avgswimp[L;w;0;0]}
+/ lststr[L]: folds {($x,$y)} over L, flips the result and takes item 0.
+/ NOTE(review): intended output format is not evident from here - confirm.
+lststr:{[L](+({[x;y] ($x,$y)}/L))[0]}
+/ delist[L]: while the type of L is one of `LL`LC`LG`L, replaces L by ,/L and
+/ recurses; otherwise returns L unchanged.
+delist:{[L] $[(@L)in(`LL`LC`LG`L);delist[(,/L)];L]}
+/ cntlist[L;i]: same recursion as delist, but adds 1 to i for every level joined
+/ and returns i+1 once the type of L is no longer one of those listed.
+cntlist:{[L;i] $[(@L)in(`LL`LC`LG`L);cntlist[(,/L);i+1];i+1]}

-sumswimp:{[L;w;s;i] s:(s+L[i])-L[i-w];$[i<(#L)-1; s, sumswimp[L; w; s; i+1]; s]}
-sumsw:{[w;L] sumswimp[L;w;0;0]}
+/ sumswkrl[L;w;x;y]: one step of the windowed sum: previous total x, minus L[y-w], plus L[y].
+sumswkrl:{[L;w;x;y] ((x-L[y-w])+L[y])}
+/ sumsw[L;w]: an empty L is returned as is. Otherwise sumswkrl[L;w] is scanned over
+/ @[!#L;0;L[0]], i.e. the index vector !#L amended at position 0 with L[0].
+/ NOTE(review): presumably this seeds the running total with L[0] and lets y run
+/ over the remaining indices - confirm the amend form in this dialect.
+/ NOTE(review): for y<w the step reads L[y-w] with a negative index - confirm it contributes nothing.
+/ NOTE(review): parameters are [L;w], whereas minsw/maxsw in this file take [w;L] -
+/ confirm callers pass the arguments in this order.
+sumsw:{[L;w] $[(#L)=0;L;(sumswkrl[L;w])\@[!#L;0;L[0]]]}
+/ avgswkrl[L;w;x;y]: one step of the windowed average: x minus (L[y-w]-L[y])%w.
+avgswkrl:{[L;w;x;y] (x-(L[y-w]-L[y])%w)}
+/ avgsw[L;w]: same scan as sumsw, with avgswkrl as the step.
+/ NOTE(review): every step divides by w, including positions where fewer than w
+/ items have been seen - confirm this is intended.
+avgsw:{[L;w] $[(#L)=0;L;(avgswkrl[L;w])\@[!#L;0;L[0]]]}

-groupbyi:{[L;GV;i] 
-            k:(,(L[i]));gvk:GV[k][0];
-            found:$[(gvk[0]+gvk[1])>0;1;L[i] in !GV];
-            cg:(,L[i])!$[found;,gvk[0],i;,(,i)]; 
-            $[i<(#L)-1; groupbyi[L;(GV,cg);i+1]; (GV,cg)]}
-groupbys:{[L;ll] GV1:(,(L[0]))!,(,0);$[ll>1;groupbyi[L;GV1;1];GV1]}
-groupby:{[l;L] $[(#l)=0;,();groupbys[L;#l]]}
+/ Earlier pure-k definition of minsw, left commented out:
+/ minsw:{[w;L] ({[L;w;x] min(L[$[x>w;(!w) + ((x-w)+1);!(x+1)]])}[L;w])'!#L}
+/ mmw is brought in by the import below. NOTE(review): presumably the native
+/ library built from mmw.cpp (mmw.so) - confirm.
+import`mmw
+/ minsw[w;L] / maxsw[w;L]: copy L into ret, call mmw on ret and return ret.
+/ The second mmw argument is a list of four items: the first character of ret's
+/ type name cast with `g, the length of ret, the window w, and an operation code
+/ (65536 in the min variants, 65537 in the max variants).
+/ NOTE(review): confirm mmw.cpp decodes the two operation codes the same way round.
+minsw:{[w;L] ret:L; mmw[ret;((`g ($@ret)[0]), (#ret), w, 65536)];ret}
+maxsw:{[w;L] ret:L; mmw[ret;((`g ($@ret)[0]), (#ret), w, 65537)];ret}
+/ minswip / maxswip: the same mmw call made directly on L; the body ends with ;
+/ so no value is returned. NOTE(review): presumably the in-place variants - confirm.
+minswip:{[w;L] mmw[L;((`g ($@L)[0]), (#L), w, 65536)];}
+maxswip:{[w;L] mmw[L;((`g ($@L)[0]), (#L), w, 65537)];}
--- a/mmw.cpp
+++ b/mmw.cpp
@ -0,0 +1,48 @@
+
+#include <cstring>
+#include <cstdlib>
+#include <cstdint>
+#include <deque>
+
+using std::size_t;
+using std::uint32_t;
+
+/// Sliding-window minimum (minmax == true) or maximum (minmax == false),
+/// computed in place.
+///
+/// @param array  start of `len` contiguous elements of type T; overwritten.
+/// @param len    number of elements.
+/// @param w      window width. Element i receives the extreme of the original
+///               elements at positions i-w+1 .. i (clamped at 0). With w == 0
+///               nothing is ever evicted, so the result is the running extreme
+///               of the whole prefix.
+///
+/// The deque holds (value, index) candidates kept monotone from front to back,
+/// so the front is always the answer for the current window.
+template<class T, bool minmax>
+void running(void *array, uint32_t len, uint32_t w){
+	T* const arr = static_cast<T*>(array);
+	std::deque<std::pair<T, uint32_t>> cache;
+	// The index has the same unsigned type as len: no signed/unsigned
+	// comparison and no int overflow when len exceeds INT_MAX.
+	for(uint32_t i = 0; i < len; ++i){
+		// Evict the front candidate once its index has left the window. The
+		// i >= w guard replaces the original's reliance on unsigned wraparound.
+		if(i >= w && !cache.empty() && cache.front().second == i - w) cache.pop_front();
+		const T cur = arr[i];
+		// Drop candidates that the new element makes irrelevant.
+		if constexpr(minmax)
+			while(!cache.empty() && cache.back().first > cur) cache.pop_back();
+		else
+			while(!cache.empty() && cache.back().first < cur) cache.pop_back();
+		cache.emplace_back(cur, i);
+		arr[i] = cache.front().first;
+	}
+}
+/// Turns the runtime flag into the compile-time template argument:
+/// forwards to running<T, true> when the flag is set and to
+/// running<T, false> otherwise.
+template<class T>
+inline void mm(void *array, uint32_t len, uint32_t w, bool mm){
+	if(mm){
+		running<T, true>(array, len, w);
+		return;
+	}
+	running<T, false>(array, len, w);
+}
+extern "C" { 
+	#include <stdio.h> 
+
+	/// C entry point for the sliding-window min/max, applied in place.
+	///
+	/// @param array  the data buffer; its element type is selected by misc[0].
+	/// @param misc   misc[0] = type character, misc[1] = element count,
+	///               misc[2] = window width, misc[3] = operation code
+	///               (0x10000 selects the minimum, anything else the maximum).
+	/// @return always 0. Unsupported type characters print "nyi <char>".
+	int mmw(void *array, unsigned long long misc[]){
+		const char _ty = static_cast<char>(misc[0]);
+		const uint32_t len = static_cast<uint32_t>(misc[1]);
+		const uint32_t w = static_cast<uint32_t>(misc[2]);
+		// header.k passes 65536 (0x10000) from minsw/minswip and 65537 from
+		// maxsw/maxswip. mm<T>(..., true) runs running<T, true>, which is the
+		// minimum, so the flag must be true for 0x10000. The previous decoding
+		// (misc[3]-0x10000) had the two operations swapped.
+		const bool want_min = misc[3] == 0x10000;
+		// NOTE(review): 'H', 'D' and 'I' are compared as unsigned while 'T'/'J'
+		// use signed long long; if those types can hold negative values the
+		// unsigned comparison orders them incorrectly - confirm.
+		switch(_ty){
+			case 'F': mm<double>(array, len, w, want_min); break;
+			case 'C': case 'G': mm<unsigned char>(array, len, w, want_min); break;
+			case 'H': mm<unsigned short>(array, len, w, want_min); break;
+			case 'D': case 'I': mm<unsigned int>(array, len, w, want_min); break;
+			case 'T': case 'J': mm<long long>(array, len, w, want_min); break;
+			// An empty 'L' array needs no work; a non-empty one is unsupported
+			// and deliberately falls through to the report below.
+			case 'L': if(len == 0) break;
+				[[fallthrough]];
+			default: printf("nyi %c\n", _ty);
+		}
+		return 0; 
+	}
+}
--- a/mmw.so
+++ b/mmw.so
--- a/prompt.py
+++ b/prompt.py
@ -7,9 +7,6 @@ import sys
 if sys.platform != 'win32':
    import readline
    
-# else:
-#     import pyreadline3
-
 test_parser = True

 # code to test parser