Merge pull request #5 from sunyinqi0508/dev

Merge from dev branch
3 years ago · c4e92abf61
parent 33c16e062e 4f7f544983
commit c4e92abf61
51 changed files with 5172 additions and 490 deletions
--- a/.gitignore
+++ b/.gitignore
@ -51,12 +51,14 @@ k
 **/Debug
 **/Release
 test*.c*
+data/benchmark
 *.csv
 !test.csv
 !test2.csv
 !moving_avg.csv
 !nyctx100.csv
 !network.csv
+!test_complex.csv
 *.out
 *.asm
 !mmw.so
--- a/91
+++ b/91
@ -1,25 +1,34 @@
 OS_SUPPORT = 
 MonetDB_LIB = 
 MonetDB_INC = 
-Threading = 
+Defines = 
 CXXFLAGS = --std=c++1z
 ifeq ($(AQ_DEBUG), 1)
-	OPTFLAGS = -g3 
+	OPTFLAGS = -g3 -fsanitize=address -fsanitize=leak
+	LINKFLAGS = 
 else
 	OPTFLAGS = -O3 -DNDEBUG -fno-stack-protector 
+	LINKFLAGS = -flto -s
 endif
-LINKFLAGS = -flto # + $(AQ_LINK_FLAG)
 SHAREDFLAGS = -shared  
 FPIC = -fPIC
-COMPILER = $(shell $(CXX) --version | grep -q clang && echo clang|| echo gcc) 
+_COMPILER = $(shell $(CXX) --version | grep -q clang && echo clang|| echo gcc) 
+COMPILER = $(strip $(_COMPILER))
 LIBTOOL = ar rcs
 USELIB_FLAG = -Wl,--whole-archive,libaquery.a -Wl,-no-whole-archive
-LIBAQ_SRC = server/server.cpp server/monetdb_conn.cpp server/io.cpp 
-LIBAQ_OBJ = server.o monetdb_conn.o io.o 
+LIBAQ_SRC = server/monetdb_conn.cpp server/libaquery.cpp 
+LIBAQ_OBJ = monetdb_conn.o libaquery.o
 SEMANTIC_INTERPOSITION = -fno-semantic-interposition
 RANLIB = ranlib
+_LINKER_BINARY = $(shell `$(CXX) -print-prog-name=ld` -v 2>&1 | grep -q LLVM && echo lld || echo ld)
+LINKER_BINARY = $(strip $(_LINKER_BINARY))
+ifeq ($(LINKER_BINARY), ld)
+	LINKER_FLAGS = -Wl,--allow-multiple-definition
+else
+	LINKER_FLAGS =
+endif

-ifeq ($(COMPILER), clang )
+ifeq ($(COMPILER), clang)
 	CLANG_GE_10 = $(shell expr `$(CXX) -dumpversion | cut -f1 -d.` \>= 10)
 	ifneq ($(CLANG_GE_10), 1)
 		SEMANTIC_INTERPOSITION = 
@ -49,7 +58,7 @@ ifeq ($(OS),Windows_NT)
 	MonetDB_LIB += msc-plugin/monetdbe.dll 
 	MonetDB_INC +=  -Imonetdb/msvc
 	LIBTOOL = gcc-ar rcs
-	ifeq ($(COMPILER), clang )
+	ifeq ($(COMPILER), clang)
 		FPIC =
 	endif
 else
@ -61,7 +70,7 @@ else
 		USELIB_FLAG = -Wl,-force_load
 		MonetDB_LIB += -L$(shell brew --prefix monetdb)/lib 
 		MonetDB_INC += -I$(shell brew --prefix monetdb)/include/monetdb
-		ifeq ($(COMPILER), clang )
+		ifeq ($(COMPILER), clang)
 			LIBTOOL = libtool -static -o
 		endif
 		ifneq ($(UNAME_M),arm64)
@ -79,43 +88,65 @@ endif
 ifeq ($(THREADING),1)
 	LIBAQ_SRC += server/threading.cpp
 	LIBAQ_OBJ += threading.o
-	Threading +=  -DTHREADING
+	Defines +=  -DTHREADING
+endif
+
+ifeq ($(AQUERY_ITC_USE_SEMPH), 1)
+	Defines += -D__AQUERY_ITC_USE_SEMPH__
 endif

-SHAREDFLAGS += $(FPIC)
+CXXFLAGS += $(OPTFLAGS) $(Defines) $(MonetDB_INC) 
+BINARYFLAGS = $(CXXFLAGS) $(LINKFLAGS) $(MonetDB_LIB)
+SHAREDFLAGS += $(FPIC) $(BINARYFLAGS)

 info:
-	$(info $(OPTFLAGS))
-	$(info $(OS_SUPPORT))
-	$(info $(OS)) 
-	$(info $(Threading))
-	$(info "test")
-	$(info $(LIBTOOL))
-	$(info $(MonetDB_INC))
-	$(info $(COMPILER))
-	$(info $(CXX))
-	$(info $(FPIC))
+	$(info This makefile script is used in AQuery to automatically build required libraries and executables.)
+	$(info Run it manually only for debugging purposes.)
+	$(info Targets (built by `make <target>`):)
+	$(info $"	pch: generate precompiled header)
+	$(info $"	libaquery.a: build static library)
+	$(info $"	server.so: build execution engine)
+	$(info $"	snippet: build generated query snippet)
+	$(info $"	server_uselib: build execution engine using shared library and pch)
+	$(info $"	snippet_uselib: build generated query snippet using shared library and pch)
+	$(info $"	docker: build docker image with name aquery)
+	$(info $"	launcher: build launcher for aquery ./aq)
+	$(info $"	clean: remove all generated binaraies and caches)
+	$(info )
+	$(info Variables:)
+	$(info $"	OPTFLAGS: $(OPTFLAGS))
+	$(info $"	OS_SUPPORT: $(OS_SUPPORT))
+	$(info $"	OS: $(OS)) 
+	$(info $"	Defines: $(Defines))
+	$(info $"	LIBTOOL: $(LIBTOOL))
+	$(info $"	MonetDB_INC: $(MonetDB_INC))
+	$(info $"	COMPILER: $(COMPILER))
+	$(info $"	CXX: $(CXX))
+	$(info $"	LINKER_BINARY: $(LINKER_BINARY))
+	$(info $"	LINKER_FLAGS: $(LINKER_FLAGS))
+# Precompile server/pch.hpp as a C++ header using $(FPIC) and the shared $(CXXFLAGS).
 pch:
-	$(CXX) -x c++-header server/pch.hpp $(FPIC) $(MonetDB_INC) $(OPTFLAGS) $(CXXFLAGS) $(Threading)
-libaquery.a:
-	$(CXX) -c $(FPIC) $(PCHFLAGS) $(LIBAQ_SRC) $(MonetDB_INC) $(MonetDB_LIB) $(OS_SUPPORT) $(Threading) $(OPTFLAGS) $(LINKFLAGS) $(CXXFLAGS) &&\
+	$(CXX) -x c++-header server/pch.hpp $(FPIC) $(CXXFLAGS)
+# Compile $(LIBAQ_SRC) to objects, archive them into libaquery.a with $(LIBTOOL),
+# then index the archive with $(RANLIB). The target name is not the file produced,
+# so make cannot use libaquery.a's timestamp to skip this rule.
+libaquery:
+	$(CXX) -c $(FPIC) $(PCHFLAGS) $(LIBAQ_SRC) $(OS_SUPPORT) $(CXXFLAGS) &&\
 	$(LIBTOOL) libaquery.a $(LIBAQ_OBJ) &&\
 	$(RANLIB) libaquery.a

+# Link msc-plugin/dummy.cpp against libaquery.a into dll.so.
+# NOTE(review): presumably a throwaway build that warms compiler/linker caches - confirm.
+warmup:
+	$(CXX)  msc-plugin/dummy.cpp libaquery.a $(SHAREDFLAGS) -o dll.so
+# Build the standalone server.bin executable from $(LIBAQ_SRC).
+# NOTE(review): unlike `launcher` and `server.so`, this rule does not list
+# server/server.cpp, which is no longer part of LIBAQ_SRC - confirm server.bin still links.
 server.bin:
-	$(CXX) $(LIBAQ_SRC) $(LINKFLAGS) $(OS_SUPPORT) $(Threading)  $(MonetDB_INC) $(MonetDB_LIB) $(OPTFLAGS) $(CXXFLAGS) -o server.bin
+	$(CXX) $(LIBAQ_SRC) $(OS_SUPPORT) $(BINARYFLAGS) -o server.bin
+# Build the `aq` launcher: server/server.cpp plus $(LIBAQ_SRC) with __AQ_BUILD_LAUNCHER__ defined.
 launcher:
-	$(CXX) -D__AQ_BUILD_LAUNCHER__ $(LIBAQ_SRC) $(LINKFLAGS) $(OS_SUPPORT) $(Threading)  $(MonetDB_INC) $(MonetDB_LIB) $(OPTFLAGS) $(CXXFLAGS) -o aq
+	$(CXX) -D__AQ_BUILD_LAUNCHER__ server/server.cpp $(LIBAQ_SRC) $(OS_SUPPORT) $(BINARYFLAGS) -o aq
+# Build the execution engine server.so directly from $(LIBAQ_SRC) and server/server.cpp.
 server.so:
 #	$(CXX) -z muldefs server/server.cpp server/monetdb_conn.cpp -fPIC -shared $(OS_SUPPORT) monetdb/msvc/monetdbe.dll --std=c++1z -O3 -march=native -o server.so -I./monetdb/msvc 
-	$(CXX) $(SHAREDFLAGS) $(PCHFLAGS) $(LIBAQ_SRC) $(OS_SUPPORT) $(Threading) $(MonetDB_INC) $(MonetDB_LIB) $(OPTFLAGS) $(LINKFLAGS) $(CXXFLAGS) -o server.so 
+	$(CXX) $(PCHFLAGS) $(LIBAQ_SRC) server/server.cpp $(OS_SUPPORT) $(SHAREDFLAGS) -o server.so 
+# Build server.so from server/server.cpp and the prebuilt libaquery.a, adding $(LINKER_FLAGS).
 server_uselib:
-	$(CXX) $(SHAREDFLAGS) $(USELIB_FLAG),libaquery.a $(MonetDB_LIB) $(OPTFLAGS) $(LINKFLAGS) $(CXXFLAGS) -o server.so
+	$(CXX) $(LINKER_FLAGS) server/server.cpp libaquery.a $(SHAREDFLAGS) -o server.so

+# Build the generated query snippet out.cpp, together with $(LIBAQ_SRC), into dll.so.
 snippet:
-	$(CXX) $(SHAREDFLAGS) $(PCHFLAGS) out.cpp $(LIBAQ_SRC) $(MonetDB_INC) $(MonetDB_LIB) $(Threading) $(OPTFLAGS) $(LINKFLAGS) $(CXXFLAGS) -o dll.so
+	$(CXX) $(PCHFLAGS) out.cpp $(LIBAQ_SRC) $(SHAREDFLAGS) -o dll.so
+# Build out.cpp against the prebuilt libaquery.a into dll.so.
 snippet_uselib:
-	$(CXX) $(SHAREDFLAGS) $(PCHFLAGS) out.cpp libaquery.a $(MonetDB_INC) $(Threading) $(MonetDB_LIB) $(OPTFLAGS) $(LINKFLAGS) $(CXXFLAGS) -o dll.so
+	$(CXX) $(PCHFLAGS) out.cpp libaquery.a $(SHAREDFLAGS) -o dll.so

+# Build the docker image tagged `aquery` from the current directory.
 docker:
 	docker build -t aquery .
--- a/README.md
+++ b/README.md
@ -226,9 +226,38 @@ DROP TABLE my_table IF EXISTS
 - File name can also be absolute path.
 - See `data/q1.sql` for more information 

+## Combine Queries
+- `UNION ALL` is a bag union of two query results with same schema. e.g. 
+```
+SELECT * FROM table1 UNION ALL SELECT * FROM table2
+```
+- `EXCEPT` clause will return the difference of two query results, e.g. `SELECT * FROM table1 EXCEPT SELECT * FROM table2`.
+  
 ## Delete Data:
 - Use a query like `DELETE FROM <table_name> [WHERE <conditions>]` to delete rows from a table that matches the conditions.
  
+## Performance Measurement 
+- Execution time can be recorded using the `stats` command described above.
+  - `stats` command without any argument will show the execution time of all queries executed so far.
+  - `stats reset` will reset the timer for total execution time printed by `stats` command above.
+  - `stats on` will show execution time for every following query until a `stats off` command is received.
+
+## MonetDB Passthrough for Hybrid Engine 
+AQuery++ supports MonetDB passthrough for hybrid engine. Simply put standard SQL queries inside a \<sql> \</sql> block. <br>
+
+Queries inside an sql block must be separated by semicolons. They are sent to MonetDB directly, so they must be written in the MonetDB dialect rather than the AQuery dialect. Please refer to the [MonetDB documentation](https://www.monetdb.org/documentation-Sep2022/user-guide/sql-summary/) for more information.
+
+For example:
+```
+CREATE TABLE my_table (c1 INT, c2 INT, c3 STRING)
+INSERT INTO my_table VALUES(10, 20, "example"), (20, 30, "example2")
+<sql>
+INSERT INTO my_table VALUES(10, 20, "example3");
+CREATE INDEX idx1 ON my_table(c1);
+</sql>
+SELECT * FROM my_table WHERE c1 > 10
+```
+
 ## Built-in functions: 
 - `avg[s]`: average of a column. `avgs(col), avgs(w, col)` is rolling and moving average with window `w` of the column `col`.
 - `var[s]`, `stddev[s]`: [moving/rolling] **population** variance, standard deviation.
@ -250,7 +279,7 @@ DROP TABLE my_table IF EXISTS
 - AQuery++ supports different execution engines thanks to the decoupled compiler structure.
 - Hybrid Execution Engine: decouples the query into two parts. The sql-compliant part is executed by an Embedded version of Monetdb and everything else is executed by a post-process module which is generated by AQuery++ Compiler in C++ and then compiled and executed.
 - AQuery Library: A set of header based libraries that provide column arithmetic and operations inspired by array programming languages like kdb. This library is used by C++ post-processor code which can significantly reduce the complexity of generated code, reducing compile time while maintaining the best performance. The set of libraries can also be used by UDFs as well as User modules which makes it easier for users to write simple but powerful extensions. 
-  
+
 # Roadmap
 - [x] SQL Parser -> AQuery Parser (Front End)
 - [x] AQuery-C++ Compiler (Back End)
--- a/aquery_config.py
+++ b/aquery_config.py
@ -2,7 +2,7 @@

 ## GLOBAL CONFIGURATION FLAGS

-version_string = '0.4.9a'
+version_string = '0.5.3a'
 add_path_to_ldpath = True
 rebuild_backend = False
 run_backend = True
@ -11,6 +11,9 @@ cygroot = 'c:/msys64/usr/bin'
 msbuildroot = ''
 os_platform = 'unknown'
 build_driver = 'Auto'
+compilation_output = True
+
+## END GLOBAL CONFIGURATION FLAGS

 def init_config():
    global __config_initialized__, os_platform, msbuildroot, build_driver
@ -21,7 +24,8 @@ def init_config():
    import os
    from engine.utils import add_dll_dir
    # os.environ['CXX'] = 'C:/Program Files/LLVM/bin/clang.exe'
-    # os.environ['THREADING'] = '1'
+    os.environ['THREADING'] = '1'
+    os.environ['AQUERY_ITC_USE_SEMPH'] = '1'

    if  ('__config_initialized__' not in globals() or 
            not __config_initialized__):
--- a/aquery_parser/keywords.py
+++ b/aquery_parser/keywords.py
@ -243,8 +243,8 @@ RESERVED = MatchFirst([
    WITHIN,
    INTO,
 ])
-L_INLINE = Literal("<k>").suppress()
-R_INLINE = Literal("</k>").suppress()
+L_INLINE = Literal("<sql>").suppress()
+R_INLINE = Literal("</sql>").suppress()
 LBRACE = Literal("{").suppress()
 RBRACE = Literal("}").suppress()
 LSB = Literal("[").suppress()
--- a/aquery_parser/sql_parser.py
+++ b/aquery_parser/sql_parser.py
@ -8,6 +8,7 @@
 #

 from sre_parse import WHITESPACE
+
 from mo_parsing.helpers import restOfLine
 from mo_parsing.infix import delimited_list
 from mo_parsing.whitespaces import NO_WHITESPACE, Whitespace
@ -65,7 +66,7 @@ def parser(literal_string, ident, sqlserver=False):

        var_name = ~RESERVED + ident
        
-        inline_kblock = (L_INLINE + SkipTo(R_INLINE, include=True))("c")
+        inline_sqlblock = (L_INLINE + SkipTo(R_INLINE, include=True))("sql")
        # EXPRESSIONS
        expr = Forward()
        column_type, column_definition, column_def_references = get_column_type(
@ -568,8 +569,9 @@ def parser(literal_string, ident, sqlserver=False):
                | assign("comment", EQ + literal_string)
                | assign("default character set", EQ + var_name)
                | assign("default charset", EQ + var_name)
-            )
-            + Optional(AS.suppress() + infix_notation(query, [])("query"))
+            ) 
+            + Optional(AS.suppress() + query("query")) 
+            # investigate why infix_notation(query, []) eats up the rest of queries
        )("create_table")

        create_view = (
@ -655,7 +657,8 @@ def parser(literal_string, ident, sqlserver=False):
        ) / to_json_call

        load_data = (
-            keyword("data") ("file_type")
+            Optional(keyword("complex")("complex"))
+            + keyword("data") ("file_type")
            + keyword("infile")("loc")  
            + literal_string ("file")
            + INTO
@ -667,6 +670,12 @@ def parser(literal_string, ident, sqlserver=False):
                  + keyword("by").suppress() 
                  + literal_string ("term")
            )
+            + Optional(
+                  keyword("element").suppress()
+                  + keyword("terminated").suppress()
+                  + keyword("by").suppress() 
+                  + literal_string ("ele")
+            )
        )
        
        module_func_def = (
@ -716,7 +725,7 @@ def parser(literal_string, ident, sqlserver=False):
        )("stmts"), ";")

        other_stmt = (
-            inline_kblock
+            inline_sqlblock
            | udf
        ) ("stmts")
        
--- a/build.py
+++ b/build.py
@ -16,6 +16,7 @@ class checksums:
    server : Optional[Union[bytes, bool]] = None
    sources : Optional[Union[Dict[str, bytes], bool]] = None
    env : str = ''
+    
    def calc(self, compiler_name, libaquery_a = 'libaquery.a' , 
                pch_hpp_gch = 'server/pch.hpp.gch', 
                server = 'server.so'
@ -24,7 +25,8 @@ class checksums:
        self.env = (aquery_config.os_platform +
                    machine() + 
                    aquery_config.build_driver + 
-                    compiler_name
+                    compiler_name + 
+                    aquery_config.version_string
                )
        for key in self.__dict__.keys():
            try:
@ -71,14 +73,14 @@ class checksums:
 class build_manager:
    sourcefiles = [
                   'build.py', 'Makefile', 
-                   'server/server.cpp', 'server/io.cpp',  
+                   'server/server.cpp', 'server/libaquery.cpp',  
                   'server/monetdb_conn.cpp', 'server/threading.cpp', 
                   'server/winhelper.cpp' 
                   ]
    headerfiles = ['server/aggregations.h', 'server/hasher.h', 'server/io.h', 
                   'server/libaquery.h', 'server/monetdb_conn.h', 'server/pch.hpp', 
                   'server/table.h', 'server/threading.h', 'server/types.h', 'server/utils.h', 
-                   'server/winhelper.h', 'server/gc.hpp', 'server/vector_type.hpp', 
+                   'server/winhelper.h', 'server/gc.h', 'server/vector_type.hpp', 
                   'server/table_ext_monetdb.hpp' 
                   ]
   
@ -92,6 +94,9 @@ class build_manager:
            return False
        def build(self, stdout = sys.stdout, stderr = sys.stderr):
            ret = True
+            if not aquery_config.compilation_output:
+                stdout = nullstream
+                stderr = nullstream
            for c in self.build_cmd:
                if c:
                    try: # only last success matters
@ -100,6 +105,8 @@ class build_manager:
                        ret = False
                        pass
            return ret
+        def warmup(self):
+            return True
                
    class MakefileDriver(DriverBase):
        def __init__(self, mgr : 'build_manager') -> None:
@ -111,9 +118,9 @@ class build_manager:
                mgr.cxx = os.environ['CXX']
            if 'AQ_DEBUG' not in os.environ:
                os.environ['AQ_DEBUG'] = '0' if mgr.OptimizationLv else '1'
-                
+
        def libaquery_a(self):
-            self.build_cmd = [['rm', 'libaquery.a'],['make', 'libaquery.a']]
+            self.build_cmd = [['rm', 'libaquery.a'],['make', 'libaquery']]
            return self.build()
        def pch(self):
            self.build_cmd = [['rm', 'server/pch.hpp.gch'], ['make', 'pch']]
@ -166,6 +173,10 @@ class build_manager:
            self.build_cmd = [[aquery_config.msbuildroot, loc, self.opt, self.platform]]
            return self.build()

+        # Runs `make warmup` through the shared build() command loop and returns its result.
+        # NOTE(review): the method directly above builds its command from
+        # aquery_config.msbuildroot, so this override appears to belong to the MSBuild
+        # driver rather than MakefileDriver - confirm that is intended.
+        def warmup(self):
+            self.build_cmd = [['make', 'warmup']]
+            return self.build()
+            
    #class PythonDriver(DriverBase):
    #    def __init__(self, mgr : 'build_manager') -> None:
    #        super().__init__(mgr)           
@ -221,6 +232,9 @@ class build_manager:
            current.calc(self.cxx, libaquery_a)
            with open('.cached', 'wb') as cache_sig:
                cache_sig.write(pickle.dumps(current))
+            self.driver.warmup()
+            
+            
        else:
            if aquery_config.os_platform == 'mac':
                os.system('./arch-check.sh')
--- a/csv.h
+++ b/csv.h
@ -1,4 +1,4 @@
-// Copyright: (2012-2015) Ben Strasser <code@ben-strasser.net>
+// Copyright: (2012-2015) Ben Strasser <code@ben-strasser.net>, 2022 Bill Sun
 // License: BSD-3
 //
 // All rights reserved.
@ -49,6 +49,7 @@
 #include <cerrno>
 #include <istream>
 #include <limits>
+#include "server/vector_type.hpp"

 namespace io{
        ////////////////////////////////////////////////////////////////////////////
@ -974,8 +975,7 @@ namespace io{
                                                return;
                                        }
                                        x = 10*x+y;
-                                }else
-                                        throw error::no_digit();
+                                }
                                ++col;
                        }
                }
@ -1005,8 +1005,7 @@ namespace io{
                                                        return;
                                                }
                                                x = 10*x-y;
-                                        }else
-                                                throw error::no_digit();
+                                        }
                                        ++col;
                                }
                                return;
@ -1080,19 +1079,37 @@ namespace io{
                                        }
                                        x *= base;
                                }
-                        }else{
-                                if(*col != '\0')
-                                        throw error::no_digit();
                        }

                        if(is_neg)
                                x = -x;
                }

+
                template<class overflow_policy> void parse(char*col, float&x) { parse_float(col, x); }
                template<class overflow_policy> void parse(char*col, double&x) { parse_float(col, x); }
                template<class overflow_policy> void parse(char*col, long double&x) { parse_float(col, x); }
-
+                
+
+                // Parse one CSV cell holding a sep2-delimited list and append each element to x.
+                // Elements are converted with ::io::detail::parse<overflow_policy> and pushed
+                // with x.emplace_back. The buffer is modified in place while scanning and
+                // every overwritten byte is restored before moving on.
+                template<class overflow_policy, class T, char sep2 = ';'>
+                void parse_vector(char* col, vector_type<T>& x) {
+                    while (*col != '\0') {
+                        char* next_col = col;
+                        // Advance to the next separator, or to the end of the cell.
+                        while (*next_col != sep2 && *next_col != '\0')
+                            ++next_col;
+                        // Skip the separator(s) and any blanks/line breaks after it, so that
+                        // next_col lands on the first character of the following element.
+                        while (*next_col == ' ' || *next_col == '\t' || 
+                            *next_col == sep2 || *next_col == '\r' || 
+                            *next_col == '\n') 
+                            ++next_col;
+                        // Temporarily terminate the string there. The text handed to parse()
+                        // thus still ends with the separator/whitespace that was skipped.
+                        char _next_end = *next_col;
+                        *next_col = '\0';
+                        T y;
+                        ::io::detail::parse<overflow_policy>(col, y);
+                        x.emplace_back(y);
+                        col = next_col;
+                        // Put back the byte replaced by the temporary terminator.
+                        *next_col = _next_end;
+                    }
+                }
                template<class overflow_policy, class T>
                void parse(char*col, T&x){
                        // Mute unused variable compiler warning
@ -1108,6 +1125,7 @@ namespace io{
        }

        template<unsigned column_count,
+                char sep2 = ';',
                class trim_policy = trim_chars<' ', '\t'>,
                class quote_policy = no_quote_escape<','>,
                class overflow_policy = throw_on_overflow,
@ -1234,7 +1252,23 @@ namespace io{
                        parse_helper(r+1, cols...);
                }

-
+                // parse_helper overload for vector_type<T> columns: when row[r] is non-null the
+                // cell is parsed with parse_vector using this reader's sep2. Errors are
+                // annotated with the cell content and the column name before being rethrown.
+                // The remaining columns are then handled by recursing with r+1.
+                template<class T, class ...ColType>
+                void parse_helper(std::size_t r, vector_type<T>&t, ColType&...cols){
+                        if(row[r]){
+                                try{
+                                        try{
+                                                ::io::detail::parse_vector<overflow_policy, T, sep2>(row[r], t);
+                                        }catch(error::with_column_content&err){
+                                                err.set_column_content(row[r]);
+                                                throw;
+                                        }
+                                }catch(error::with_column_name&err){
+                                        err.set_column_name(column_names[r].c_str());
+                                        throw;
+                                }
+                        }
+                        parse_helper(r+1, cols...);
+                }
        public:
                template<class ...ColType>
                bool read_row(ColType& ...cols){
@ -1269,5 +1303,12 @@ namespace io{
                }
        };
 }
+
+// Convenience alias for io::CSVReader: sep1 is passed to no_quote_escape as the field
+// separator, sep2 is the in-cell element separator used for vector_type columns, and
+// characters 32 (space) and 9 (tab) are trimmed. The io::ignore_overflow and
+// io::empty_line_comment policies are selected.
+template <unsigned column_count, char sep1 = ',', char sep2 = ';'>
+using AQCSVReader = io::CSVReader<column_count, sep2, 
+        io::trim_chars<(char)32, (char)9>, io::no_quote_escape<sep1>, 
+        io::ignore_overflow, io::empty_line_comment
+        >;
+
 #endif

--- a/data/test.csv
+++ b/data/test.csv
@ -1,11 +1,21 @@
 a, b, c, d
 1,1,2,2
+2,1,2,2
+2,4,3,4
 1,2,2,2
 1,2,3,4
 4,2,1,4
-2,1,3,4
+2,1,3,3
+2,1,1,2
 1,2,3,4
+3,2,4,2
 1,2,3,3
 3,2,1,2
-2,1,2,2
+2,1,4,2
+3,3,4,4
+2,2,3,1
+2,3,4,4
+2,4,1,2
+3,4,1,2
+2,3,2,2
 1,2,3,1
--- a/data/test_complex.csv
+++ b/data/test_complex.csv
@ -0,0 +1,6 @@
+a,b,c
+5e-3, 3;4 ;5e-3;6.32,7
+1,2,3
+4,5;6;7;8;9, 0
+    3 ,2 ; 4; 5.7; -.3; 5., 6
+-3.12312,-4E+7;67456746744567;75,4
--- a/datagen.cpp
+++ b/datagen.cpp
@ -151,5 +151,5 @@ int gen_stock_data(int argc, char* argv[]){
 }

+// Program entry point: forwards argc/argv to gen_stock_data and returns its result.
 int main(int argc, char* argv[]){
-    gen_stock_data(argc, argv);
+    return gen_stock_data(argc, argv);
 }
--- a/engine/types.py
+++ b/engine/types.py
@ -1,8 +1,9 @@
 from copy import deepcopy
-from engine.utils import base62uuid, defval
-from aquery_config import have_hge
 from typing import Dict, List

+from aquery_config import have_hge
+from engine.utils import base62uuid, defval
+
 type_table: Dict[str, "Types"] = {}

 class Types:
@ -65,10 +66,10 @@ class Types:
        return self.sqlname
    
     @staticmethod
-    def decode(aquery_type : str, vector_type:str = 'ColRef') -> "Types":
-        if (aquery_type.startswith('vec')):
+    def decode(aquery_type : str, vector_type:str = 'vector_type') -> "Types":
+        # Resolve an AQuery type name to a Types object, ignoring letter case.
+        # A name starting with 'vec' yields a VectorT whose inner type is decoded from
+        # the rest of the name; any other name is looked up in type_table, so an
+        # unknown name raises KeyError.
+        if (aquery_type.lower().startswith('vec')):
             return VectorT(Types.decode(aquery_type[3:]), vector_type)
-        return type_table[aquery_type]
+        return type_table[aquery_type.lower()]
    
 class TypeCollection:
    def __init__(self, sz, deftype, fptype = None, utype = None, *, collection = None) -> None:
@ -121,7 +122,7 @@ class VectorT(Types):
        return f'{self.vector_type}<{self.inner_type.name}>'
     @property
     def sqlname(self) -> str:
-        return 'BIGINT'
+        return 'HUGEINT' # Store vector_type into 16-byte (128-bit) integers
    @property
    def cname(self) -> str:
        return f'{self.vector_type}<{self.inner_type.cname}>'
@ -142,7 +143,7 @@ fp_types : Dict[str, Types] = _ty_make_dict('t.sqlname.lower()', FloatT, DoubleT
 temporal_types : Dict[str, Types] = _ty_make_dict('t.sqlname.lower()', DateT, TimeT, TimeStampT)
 builtin_types : Dict[str, Types] = {
    'string' : StrT,
-    **_ty_make_dict('t.sqlname.lower()', AnyT, TextT, VarcharT),
+    **_ty_make_dict('t.sqlname.lower()', AnyT, TextT, VarcharT, HgeT),
    **int_types, **fp_types, **temporal_types}

 def get_int128_support():
@ -294,7 +295,7 @@ opadd = OperatorBase('add', 2, auto_extension, cname = '+', sqlname = '+', call
 # monetdb wont extend int division to fp type
 # opdiv = OperatorBase('div', 2, fp(auto_extension), cname = '/', sqlname = '/', call = binary_op_behavior)
 opdiv = OperatorBase('div', 2, auto_extension, cname = '/', sqlname = '/', call = binary_op_behavior)
-opmul = OperatorBase('mul', 2, fp(auto_extension), cname = '*', sqlname = '*', call = binary_op_behavior)
+opmul = OperatorBase('mul', 2, auto_extension, cname = '*', sqlname = '*', call = binary_op_behavior)
 opsub = OperatorBase('sub', 2, auto_extension, cname = '-', sqlname = '-', call = binary_op_behavior)
 opmod = OperatorBase('mod', 2, auto_extension_int, cname = '%', sqlname = '%', call = binary_op_behavior)
 opneg = OperatorBase('neg', 1, as_is, cname = '-', sqlname = '-', call = unary_op_behavior)
@ -323,10 +324,14 @@ fnfirst = OperatorBase('first', 1, as_is, cname = 'frist', sqlname = 'FRIST', ca
 #fnavg = OperatorBase('avg', 1, fp(ext(auto_extension)), cname = 'avg', sqlname = 'AVG', call = fn_behavior)
 fnsum = OperatorBase('sum', 1, long_return, cname = 'sum', sqlname = 'SUM', call = fn_behavior)
 fnavg = OperatorBase('avg', 1, lfp_return, cname = 'avg', sqlname = 'AVG', call = fn_behavior)
+fnvar = OperatorBase('var', 1, lfp_return, cname = 'var', sqlname = 'VAR_POP', call = fn_behavior)
+fnstd = OperatorBase('stddev', 1, lfp_return, cname = 'stddev', sqlname = 'STDDEV_POP', call = fn_behavior)
 fnmaxs = OperatorBase('maxs', [1, 2], ty_clamp(as_is, -1), cname = 'maxs', sqlname = 'MAXS', call = windowed_fn_behavor)
 fnmins = OperatorBase('mins', [1, 2], ty_clamp(as_is, -1), cname = 'mins', sqlname = 'MINS', call = windowed_fn_behavor)
 fnsums = OperatorBase('sums', [1, 2], ext(ty_clamp(auto_extension, -1)), cname = 'sums', sqlname = 'SUMS', call = windowed_fn_behavor)
 fnavgs = OperatorBase('avgs', [1, 2], fp(ext(ty_clamp(auto_extension, -1))), cname = 'avgs', sqlname = 'AVGS', call = windowed_fn_behavor)
+fnvars = OperatorBase('vars', [1, 2], fp(ext(ty_clamp(auto_extension, -1))), cname = 'vars', sqlname = 'VARS', call = windowed_fn_behavor)
+fnstds = OperatorBase('stddevs', [1, 2], fp(ext(ty_clamp(auto_extension, -1))), cname = 'stddevs', sqlname = 'STDDEVS', call = windowed_fn_behavor)
 fncnt = OperatorBase('count', 1, int_return, cname = 'count', sqlname = 'COUNT', call = count_behavior)
 fnpack = OperatorBase('pack', -1, pack_return, cname = 'pack', sqlname = 'PACK', call = pack_behavior)
 # special
@ -360,8 +365,14 @@ builtin_cstdlib = _op_make_dict(fnsqrt, fnlog, fnsin, fncos, fntan, fnpow)
 builtin_func = _op_make_dict(fnmax, fnmin, fnsum, fnavg, fnmaxs, 
                             fnmins, fndeltas, fnratios, fnlast,
                             fnfirst, fnsums, fnavgs, fncnt, 
-                             fnpack, fntrunc, fnprev, fnnext)
+                             fnpack, fntrunc, fnprev, fnnext, 
+                             fnvar, fnvars, fnstd, fnstds)
 user_module_func = {}
 builtin_operators : Dict[str, OperatorBase] = {**builtin_binary_arith, **builtin_binary_logical, 
    **builtin_unary_arith, **builtin_unary_logical, **builtin_unary_special, **builtin_func, **builtin_cstdlib, 
    **user_module_func}
+
+type_table = {**builtin_types, **type_table}
+
+# Additional Aliases for type names
+type_table['boolean'] = BoolT
--- a/engine/utils.py
+++ b/engine/utils.py
@ -1,6 +1,6 @@
-from collections import OrderedDict
-from collections.abc import MutableMapping, Mapping
 import uuid
+from collections import OrderedDict
+from collections.abc import Mapping, MutableMapping

 lower_alp = 'abcdefghijklmnopqrstuvwxyz'
 upper_alp = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
@ -107,6 +107,8 @@ def defval(val, default):

 # escape must be readonly
 from typing import Mapping, Set
+
+
 def remove_last(pattern : str, string : str, escape : Set[str] = set()) -> str:
    idx = string.rfind(pattern)
    if idx == -1:
@ -126,9 +128,11 @@ class _Counter:
        return cnt

 import re
+
 ws = re.compile(r'\s+')
 import os

+
 def add_dll_dir(dll: str):
    import sys
    if sys.version_info.major >= 3 and sys.version_info.minor >7 and os.name == 'nt':
@ -144,3 +148,13 @@ def clamp(val, minval, maxval):

 def escape_qoutes(string : str):
    return re.sub(r'^\'', r'\'',re.sub(r'([^\\])\'', r'\1\'', string))
+
+def get_innermost(sl):
+    # Descend through nested dicts/lists and return the innermost value.
+    # - non-empty dict with a str under 'literal': that string wrapped in single quotes
+    # - any other non-empty dict: recurse into its first value
+    # - non-empty list: recurse into its first element
+    # - anything else (including empty containers and None): returned unchanged
+    # Only exact dict/list types are followed (type(...) is ...), not subclasses.
+    if sl and type(sl) is dict:
+        if 'literal' in sl and type(sl['literal']) is str:
+            return f"'{get_innermost(sl['literal'])}'"
+        return get_innermost(next(iter(sl.values()), None))
+    elif sl and type(sl) is list:
+        return get_innermost(sl[0])
+    else:
+        return sl
--- a/msc-plugin/libaquery.vcxproj
+++ b/msc-plugin/libaquery.vcxproj
@ -221,7 +221,7 @@
  <ItemGroup>
    <ClInclude Include="..\csv.h" />
    <ClInclude Include="..\server\aggregations.h" />
-    <ClInclude Include="..\server\gc.hpp" />
+    <ClInclude Include="..\server\gc.h" />
    <ClInclude Include="..\server\hasher.h" />
    <ClInclude Include="..\server\io.h" />
    <ClInclude Include="..\server\libaquery.h" />
@ -238,7 +238,7 @@
  <ItemGroup>
    <ClCompile Include="..\server\server.cpp" />
    <ClCompile Include="..\server\winhelper.cpp" />
-    <ClCompile Include="..\server\io.cpp" />
+    <ClCompile Include="..\server\libaquery.cpp" />
    <ClCompile Include="..\server\monetdb_conn.cpp" />
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
--- a/prompt.py
+++ b/prompt.py
@ -1,4 +1,5 @@
 import aquery_config
+
 help_message = '''\
 ======================================================
                AQUERY COMMANDLINE HELP
@ -82,31 +83,31 @@ if __name__ == '__main__':
    

    
-import os
-from dataclasses import dataclass
+import atexit
+import ctypes
 import enum
-import time
+import mmap
+import os
 # import dbconn
 import re
+import subprocess
+import sys
+import threading
+import time
+from dataclasses import dataclass
 from typing import Callable, List, Optional
+
+import numpy as np
 from mo_parsing import ParseException
+
 import aquery_parser as parser
 import engine
-import engine.projection
 import engine.ddl
+import engine.projection
 import reconstruct as xengine
-import subprocess
-import mmap
-import sys
-from engine.utils import base62uuid
-import atexit
-import threading
-import ctypes
-import numpy as np
-from engine.utils import ws
-from engine.utils import add_dll_dir
-from engine.utils import nullstream
 from build import build_manager
+from engine.utils import add_dll_dir, base62uuid, nullstream, ws
+

 ## CLASSES BEGIN
 class RunType(enum.Enum):
@ -159,9 +160,11 @@ class QueryStats:
 class Config:
    __all_attrs__ = ['running', 'new_query', 'server_mode', 
                     'backend_type', 'has_dll', 
-                     'postproc_time', 'sql_time', 
-                     'n_buffers'
+                     'n_buffers',
                     ]
+    __i64_attrs__ = [
+                     'monetdb_time', 'postproc_time'
+                    ]
    __init_attributes__ = False
    
    @staticmethod
@ -170,26 +173,42 @@ class Config:
            from functools import partial
            for _i, attr in enumerate(Config.__all_attrs__):
                if not hasattr(Config, attr):
-                    setattr(Config, attr, property(partial(Config.getter, i = _i), partial(Config.setter, i = _i)))
+                    setattr(Config, attr, property(
+                        partial(Config.getter, i = _i), partial(Config.setter, i = _i)
+                    ))
+            for _i, attr in enumerate(Config.__i64_attrs__):
+                if not hasattr(Config, attr):
+                    setattr(Config, attr, property(
+                        partial(Config.i64_getter, i = _i), partial(Config.i64_setter, i = _i)
+                    ))
            Config.__init_attributes__ = True
            
    def __init__(self, mode, nq = 0, n_bufs = 0, bf_szs = []) -> None:
        Config.__init_self__()
-        self.int_size = 4
        self.n_attrib = len(Config.__all_attrs__)
-        self.buf = bytearray((self.n_attrib + n_bufs) * self.int_size)
-        self.np_buf = np.ndarray(shape=(self.n_attrib), buffer=self.buf, dtype=np.int32)
+        self.buf = bytearray((self.n_attrib + n_bufs) * 4 +
+                              len(self.__i64_attrs__) * 8
+                             )
+        self.np_buf = np.ndarray(shape = (self.n_attrib), buffer = self.buf, dtype = np.int32)
+        self.np_i64buf = np.ndarray(shape = len(self.__i64_attrs__), buffer = self.buf, 
+                                    dtype = np.int64, offset = 4 * len(self.__all_attrs__))
        self.new_query = nq
        self.server_mode = mode.value 
        self.running = 1
-        self.backend_type = Backend_Type.BACKEND_AQuery.value
+        self.backend_type = Backend_Type.BACKEND_MonetDB.value
        self.has_dll = 0
        self.n_buffers = n_bufs
+        self.monetdb_time = 0
+        self.postproc_time = 0
        
    def getter (self, *, i):
        return self.np_buf[i]
    def setter(self, v, *, i):
        self.np_buf[i] = v
+    def i64_getter (self, *, i):
+        return self.np_i64buf[i]
+    def i64_setter(self, v, *, i):
+        self.np_i64buf[i] = v

    def set_bufszs(self, buf_szs):
        for i in range(min(len(buf_szs), self.n_buffers)):
@ -208,6 +227,8 @@ class PromptState():
    test_parser = True
    server_mode: RunType = RunType.Threaded
    server_bin = 'server.bin' if server_mode == RunType.IPC else 'server.so'
+    wait_engine = lambda: None
+    wake_engine = lambda: None
    set_ready = lambda: None
    get_ready = lambda: None
    server_status = lambda: False
@ -298,12 +319,14 @@ def init_threaded(state : PromptState):
    if aquery_config.run_backend:    
        server_so = ctypes.CDLL('./'+state.server_bin)
        state.send = server_so['receive_args']
+        state.wait_engine = server_so['wait_engine']
+        state.wake_engine = server_so['wake_engine']
        aquery_config.have_hge = server_so['have_hge']()
        if aquery_config.have_hge != 0:
            from engine.types import get_int128_support
            get_int128_support()
        state.th = threading.Thread(target=server_so['main'], args=(-1, ctypes.POINTER(ctypes.c_char_p)(state.cfg.c)), daemon=True)
-        state.th.start()
+        state.th.start() 

 def init_prompt() -> PromptState:
    aquery_config.init_config()
@ -336,6 +359,8 @@ def init_prompt() -> PromptState:
        rm = lambda: None
        def __set_ready():
            state.cfg.new_query = 1
+            state.wake_engine()
+            
        state.set_ready = __set_ready
        state.get_ready = lambda: aquery_config.run_backend and state.cfg.new_query
        if aquery_config.run_backend:
@ -374,14 +399,23 @@ def prompt(running = lambda:True, next = lambda:input('> '), state = None):
    payload = None
    keep = True
    cxt = engine.initialize()
+    parser.parse('SELECT "**** WELCOME TO AQUERY++! ****";')
+    
    # state.currstats = QueryStats()
    # state.need_print = False
    while running():
        try:
            if state.server_status():
-                state.init()
+                state.init(state)
+            # *** busy waiting ***
+            # while state.get_ready():
+            #     time.sleep(.00001)
            while state.get_ready():
-                time.sleep(.00001)
+                state.wait_engine()
+                if state.need_print:
+                    print(f'MonetDB Time: {state.cfg.monetdb_time/10**9}, '
+                          f'PostProc Time: {state.cfg.postproc_time/10**9}')
+                    state.cfg.monetdb_time = state.cfg.postproc_time = 0
            state.currstats.print(state.stats, need_print=state.need_print)
            try:
                og_q : str = next()
@ -407,7 +441,7 @@ def prompt(running = lambda:True, next = lambda:input('> '), state = None):
                    for t in cxt.tables:
                        lst_cols = []
                        for c in t.columns:
-                            lst_cols.append(f'{c.name} : {c.type}')
+                            lst_cols.append(f'{c.name} : {c.type.name}')
                        print(f'{t.table_name} ({", ".join(lst_cols)})')
                continue
            elif q.startswith('help'):
@ -498,17 +532,17 @@ def prompt(running = lambda:True, next = lambda:input('> '), state = None):
                rm(state)
                exit()
            elif q.startswith('sh'):
-                from distutils.spawn import find_executable
+                from shutil import which
                qs = re.split(r'[ \t]', q)
                shells = ('zsh', 'bash', 'sh', 'fish', 'cmd', 'pwsh', 'powershell', 'csh', 'tcsh', 'ksh')
                shell_path = ''
                if len(qs) > 1 and qs[1] in shells:
-                    shell_path = find_executable(qs[1])
+                    shell_path = which(qs[1])
                    if shell_path:
                        os.system(shell_path)
                else:
                    for sh in shells:
-                        shell_path = find_executable(sh)
+                        shell_path = which(sh)
                        if shell_path:
                            os.system(shell_path)
                            break
@ -575,7 +609,7 @@ def prompt(running = lambda:True, next = lambda:input('> '), state = None):
                state.stats.print(clear = False)
                continue
            trimed = ws.sub(' ', og_q).split(' ') 
-            if trimed[0].lower().startswith('f'):
+            if len(trimed) > 1 and trimed[0].lower().startswith('fi') or trimed[0].lower() == 'f':
                fn = 'stock.a' if len(trimed) <= 1 or len(trimed[1]) == 0 \
                                else trimed[1]
                try:
@ -605,7 +639,8 @@ def prompt(running = lambda:True, next = lambda:input('> '), state = None):
            print("\nBye.")
            raise
        except ValueError as e:
-            import code, traceback
+            import code
+            import traceback
            __stdin = os.dup(0)
            raise_exception = True
            sh = code.InteractiveConsole({**globals(), **locals()})
--- a/reconstruct/init.py
+++ b/reconstruct/init.py
@ -1,4 +1,5 @@
 from reconstruct.ast import Context, ast_node
+
 saved_cxt = None

 def initialize(cxt = None, keep = False):
--- a/reconstruct/ast.py
+++ b/reconstruct/ast.py
@ -1,12 +1,14 @@
+from binascii import Error
 from copy import deepcopy
 from dataclasses import dataclass
 from enum import Enum, auto
-from typing import Set, Tuple, Dict, Union, List, Optional
+from typing import Dict, List, Optional, Set, Tuple, Union

 from engine.types import *
-from engine.utils import enlist, base62uuid, base62alp, get_legal_name
-from reconstruct.storage import Context, TableInfo, ColRef
-    
+from engine.utils import (base62alp, base62uuid, enlist, get_innermost,
+                          get_legal_name)
+from reconstruct.storage import ColRef, Context, TableInfo
+
 class ast_node:
    header = []
    types = dict()
@ -28,8 +30,8 @@ class ast_node:
    
    def emit(self, code):
        self.context.emit(code)
-    def add(self, code):
-        self.sql += code + ' '
+    def add(self, code, sp = ' '):
+        self.sql += code + sp
    def addc(self, code):
        self.ccode += code + '\n'

@ -51,26 +53,60 @@ class ast_node:
            self.context.sql_end()
        
 from reconstruct.expr import expr, fastscan
-
-
+class SubqType(Enum):
+    WITH = auto()
+    FROM = auto()
+    PROJECTION = auto()
+    FILTER = auto()
+    GROUPBY = auto()
+    ORDERBY = auto()
+    NONE = auto()
 class projection(ast_node):
    name = 'projection'
    first_order = 'select'
-    
+
+        
+    def __init__(self, 
+                 parent : Optional["ast_node"],
+                 node, 
+                 context : Optional[Context] = None,
+                 force_use_spgb : bool = False,
+                 subq_type: SubqType = SubqType.NONE
+                ):
+        self.force_use_spgb = force_use_spgb
+        self.subq_type = subq_type
+        super().__init__(parent, node, context)
+        
    def init(self, _):
        # skip default init
        pass
    
    def produce(self, node):
        self.add('SELECT')
-        self.has_postproc = False
+        self.has_postproc = 'into' in node
        if 'select' in node:
            p = node['select']
            self.distinct = False
        elif 'select_distinct' in node:
            p = node['select_distinct']
            self.distinct = True
-
+        if 'with' in node:
+            with_table = node['with']['name']
+            with_table_name = tuple(with_table.keys())[0]
+            with_table_cols = tuple(with_table.values())[0]
+            self.with_clause = projection(self, node['with']['value'], subq_type=SubqType.WITH)
+            self.with_clause.out_table.add_alias(with_table_name)
+            for new_name, col in zip(with_table_cols, self.with_clause.out_table.columns):
+                col.rename(new_name)
+            self.with_clause.out_table.contextname_cpp 
+            # in monetdb, in cxt 
+        else:
+            self.with_clause = None
+        
+        self.limit = None
+        if 'limit' in node:
+            self.limit = node['limit']
+            
        self.projections = p if type(p) is list else [p]
        if self.parent is None:
            self.context.sql_begin()
@ -99,8 +135,9 @@ class projection(ast_node):
        if type(self.datasource) is join:
            self.datasource.process_join_conditions()
        
-        if 'groupby' in node:
-            self.context.special_gb = groupby.check_special(self, node['groupby'])
+        self.context.special_gb = self.force_use_spgb
+        if 'groupby' in node: # if groupby clause contains special stuff
+            self.context.special_gb |= groupby.check_special(self, node['groupby'])

    def consume(self, node):
        # deal with projections
@ -158,6 +195,11 @@ class projection(ast_node):
                        this_type = [c.type for c in _datasource]
                        compound = [c.compound for c in _datasource]
                        proj_expr = [expr(self, c.name) for c in _datasource]
+                        for pe in proj_expr:
+                            if pe.is_ColExpr:
+                                pe.cols_mentioned = {pe.raw_col}
+                            else:
+                                pe.cols_mentioned = set()
                    else:
                        y = lambda x:x
                        count = lambda : 'count(*)'
@ -203,8 +245,14 @@ class projection(ast_node):
        
        self.out_table.add_cols(cols, new = False)
        
+        self.proj_map = proj_map
+        
        if 'groupby' in node:
            self.group_node = groupby(self, node['groupby'])
+            if self.group_node.terminate:
+                self.context.abandon_query()
+                projection(self.parent, node, self.context, True, subq_type=self.subq_type)
+                return
            if self.group_node.use_sp_gb:
                self.has_postproc = True
        else:
@ -223,7 +271,11 @@ class projection(ast_node):
                self.var_table[col.name] = offset
                for n in (col.table.alias):
                    self.var_table[f'{n}.'+col.name] = offset
-    
+        # monetdb doesn't support select into table
+        # if 'into' in node:
+        #     self.into_stub = f'{{INTOSTUB{base62uuid(20)}}}'
+        #     self.add(self.into_stub, '')
+            
        def finialize(astnode:ast_node):
            if(astnode is not None):
                self.add(astnode.sql)
@ -235,6 +287,9 @@ class projection(ast_node):
        if self.col_ext or self.group_node and self.group_node.use_sp_gb:
            self.has_postproc = True
        
+        if self.group_node and self.group_node.use_sp_gb :
+            self.group_node.dedicated_glist
+            ...
        o = self.assumptions
        if 'orderby' in node:
            o.extend(enlist(node['orderby']))
@ -258,7 +313,6 @@ class projection(ast_node):
        
        
        # cpp module codegen
-        self.context.has_dll = True
        # extract typed-columns from result-set
        vid2cname = [0]*len(self.var_table)
        self.pyname2cname = dict()
@ -338,28 +392,36 @@ class projection(ast_node):
                    )
                else:
                    # for funcs evaluate f_i(x, ...)
-                    self.context.emitc(f'{self.out_table.contextname_cpp}->get_col<{key}>() = {val[1]};')
+                    self.context.emitc(f'{self.out_table.contextname_cpp}->get_col<{key}>().initfrom({val[1]}, "{cols[i].name}");')
        # print out col_is
-        if 'into' not in node:
-            self.context.emitc(f'print(*{self.out_table.contextname_cpp});')
+        
+        if 'into' not in node and self.subq_type == SubqType.NONE:
+            if self.limit is None:
+                self.context.emitc(f'print(*{self.out_table.contextname_cpp});')
+            else:
+                self.context.emitc(f'{self.out_table.contextname_cpp}->printall(" ","\\n", nullptr, nullptr, {self.limit});')
        
        if self.outfile and self.has_postproc:
                self.outfile.finalize()

        if 'into' in node: 
            self.context.emitc(select_into(self, node['into']).ccode)
+            self.has_postproc = True
        if not self.distinct:
            self.finalize()
-            
+                    
    def finalize(self):      
        self.context.emitc(f'puts("done.");')

        if self.parent is None:
            self.context.sql_end()
-            if self.outfile and not self.has_postproc:
-                self.context.abandon_postproc()
-            else:
+            if self.has_postproc:
+                self.context.has_dll = True
                self.context.postproc_end(self.postproc_fname)
+            else:
+                self.context.ccode = ''
+                if self.limit != 0 and not self.outfile:
+                    self.context.direct_output()
        
 class select_distinct(projection):
    first_order = 'select_distinct'
@ -367,18 +429,18 @@ class select_distinct(projection):
        super().consume(node)
        if self.has_postproc:
            self.context.emitc(
-                f'{self.out_table.table_name}->distinct();'
+                f'{self.out_table.contextname_cpp}->distinct();'
            )
        self.finalize()
        
 class select_into(ast_node):
-    def init(self, node):
+    def init(self, _):
        if isinstance(self.parent, projection):
-            if self.context.has_dll:
-                # has postproc put back to monetdb
-                self.produce = self.produce_cpp
-            else:
-                self.produce = self.produce_sql
+            # if self.parent.has_postproc:
+            #     # has postproc put back to monetdb
+            self.produce = self.produce_cpp
+            # else:
+            #     self.produce = self.produce_sql
        else:
            raise ValueError('parent must be projection')
        
@ -390,7 +452,8 @@ class select_into(ast_node):
            self.ccode = f'{self.parent.out_table.contextname_cpp}->monetdb_append_table(cxt->alt_server, \"{node.lower()}\");'
            
    def produce_sql(self, node):
-        self.sql = f' INTO {node}'
+        self.context.sql = self.context.sql.replace(
+            self.parent.into_stub, f'INTO {node}', 1)
    

 class orderby(ast_node):
@ -409,7 +472,7 @@ class orderby(ast_node):
                o_str += ' ' + 'DESC'
            o_list.append(o_str)
        self.add(', '.join(o_list))
-            
+

 class scan(ast_node):
    class Position(Enum):
@ -586,6 +649,10 @@ class groupby(ast_node):
                return True
        return False

+    def init(self, _):
+        self.terminate = False
+        super().init(_)
+        
    def produce(self, node):
        if not isinstance(self.parent, projection):
            raise ValueError('groupby can only be used in projection')
@ -593,8 +660,10 @@ class groupby(ast_node):
        node = enlist(node)
        o_list = []
        self.refs = set()
+        self.gb_cols = set()
+        # dedicated_glist -> cols populated for special group by
        self.dedicated_glist : List[Tuple[expr, Set[ColRef]]] = []
-        self.use_sp_gb = False
+        self.use_sp_gb = self.parent.force_use_spgb
        for g in node:
            self.datasource.rec = set()
            g_expr = expr(self, g['value'])
@ -610,7 +679,24 @@ class groupby(ast_node):
            if 'sort' in g and f'{g["sort"]}'.lower() == 'desc':
                g_str = g_str + ' ' + 'DESC'
            o_list.append(g_str)
-            
+            if g_expr.is_ColExpr:
+                self.gb_cols.add(g_expr.raw_col)
+            else:
+                self.gb_cols.add(g_expr.sql)
+                
+        for projs in self.parent.proj_map.values():
+            if self.use_sp_gb:
+                break
+            if (projs[2].is_compound and 
+                not ((projs[2].is_ColExpr and projs[2].raw_col in self.gb_cols) or
+                projs[2].sql in self.gb_cols)
+                ) and (not self.parent.force_use_spgb):
+                    self.use_sp_gb = True
+                    break
+                
+        if self.use_sp_gb and not self.parent.force_use_spgb:
+            self.terminate = True
+            return
        if not self.use_sp_gb:
            self.dedicated_gb = None
            self.add(', '.join(o_list))
@ -916,38 +1002,64 @@ class insert(ast_node):
    name = 'insert'
    first_order = name
    def init(self, node):
-        values = node['query']
-        complex_query_kw = ['from', 'where', 'groupby', 'having', 'orderby', 'limit']
-        if any([kw in values for kw in complex_query_kw]):
-            values['into'] = node['insert']
-            proj_cls = (select_distinct 
-            if 'select_distinct' in values 
-            else projection)
-            proj_cls(None, values, self.context)
-            self.produce = lambda*_:None
-            self.spawn = lambda*_:None
-            self.consume = lambda*_:None
+        if 'query' in node:
+            values = node['query']
+            complex_query_kw = ['from', 'where', 'groupby', 'having', 'orderby', 'limit']
+            if any([kw in values for kw in complex_query_kw]):
+                values['into'] = node['insert']
+                proj_cls = (select_distinct 
+                if 'select_distinct' in values 
+                else projection)
+                proj_cls(None, values, self.context)
+                self.produce = lambda*_:None
+                self.spawn = lambda*_:None
+                self.consume = lambda*_:None
        else:
            super().init(node)
            
    def produce(self, node):
-        values = node['query']['select']
+        keys = []
+        if 'query' in node:
+            if 'select' in node['query']:
+                values = enlist(node['query']['select'])
+                if 'columns' in node:
+                    keys = node['columns']
+                values = [v['value'] for v in values]
+
+            elif 'union_all' in node['query']:
+                values = [[v['select']['value']] for v in node['query']['union_all']]
+                if 'columns' in node:
+                    keys = node['columns']
+        else:
+            values = enlist(node['values'])
+            _vals = []
+            for v in values:
+                if isinstance(v, dict):
+                    keys = v.keys()
+                    v = list(v.values())
+                v = [f"'{vv}'" if type(vv) is str else vv for vv in v]
+                _vals.append(v)
+            values = _vals
+            
+        keys = f'({", ".join(keys)})' if keys else ''
        tbl = node['insert']
-        self.sql = f'INSERT INTO {tbl} VALUES('
+        self.sql = f'INSERT INTO {tbl}{keys} VALUES'
        # if len(values) != table.n_cols:
        #     raise ValueError("Column Mismatch")
-
+        values = [values] if isinstance(values, list) and not isinstance(values[0], list) else values
        list_values = []
-        for i, s in enumerate(enlist(values)):
-            if 'value' in s:
-                list_values.append(f"{s['value']}")
-            else:
-                # subquery, dispatch to select astnode
-                pass
-        self.sql += ', '.join(list_values) + ')'
+        for l in values:
+            inner_list_values = []
+            for s in enlist(l):
+                if type(s) is dict and 'value' in s:
+                    s = s['value']
+                inner_list_values.append(f"{get_innermost(s)}")
+            list_values.append(f"({', '.join(inner_list_values)})")
+            
+        self.sql += ', '.join(list_values) 
        

-class delete_table(ast_node):
+class delete_from(ast_node):
    name = 'delete'
    first_order = name
    def init(self, node):
@ -959,6 +1071,31 @@ class delete_table(ast_node):
        if 'where' in node:
            self.sql += filter(self, node['where']).sql

+class union_all(ast_node):  # set operation node: joins the SQL of every branch with sql_name
+    name = 'union_all'
+    first_order = name
+    sql_name = 'UNION ALL'
+    def produce(self, node):  # builds self.sql, or raises NotImplementedError for unsupported branches
+        queries = node[self.name]
+        generated_queries : List[Optional[projection]] = [None] * len(queries)
+        is_standard = True
+        for i, q in enumerate(queries):
+            if 'select' in q:
+                generated_queries[i] = projection(self, q)
+            is_standard &= generated_queries[i] is not None and not generated_queries[i].has_postproc  # a branch that was not generated (no 'select' key) is non-standard too, instead of crashing on None.sql below
+        if is_standard:
+            self.sql = f' {self.sql_name} '.join([q.sql for q in generated_queries])
+        else:
+            raise NotImplementedError(f"{self.sql_name} only support standard sql for now")
+    def consume(self, node):  # after the default consume step, request direct output from the context
+        super().consume(node)
+        self.context.direct_output()
+
+class except_clause(union_all):
+    name = 'except'
+    first_order = name
+    sql_name = 'EXCEPT'
+    
 class load(ast_node):
    name="load"
    first_order = name
@ -967,6 +1104,9 @@ class load(ast_node):
        if node['load']['file_type'] == 'module':
            self.produce = self.produce_module
            self.module = True
+        elif 'complex' in node['load']:
+            self.produce = self.produce_cpp
+            self.consume = lambda *_: None
        elif self.context.dialect == 'MonetDB':
            self.produce = self.produce_monetdb
        else: 
@ -998,7 +1138,7 @@ class load(ast_node):
                self.context.queries.append(f'F{fname}')
                ret_type = VoidT
                if 'ret_type' in f:
-                    ret_type = Types.decode(f['ret_type'])
+                    ret_type = Types.decode(f['ret_type'], vector_type='vector_type')
                nargs = 0
                arglist = ''
                if 'vars' in f:
@ -1008,7 +1148,7 @@ class load(ast_node):
                    nargs = len(arglist)
                    arglist = ', '.join(arglist)
                # create c++ stub 
-                cpp_stub = f'{ret_type.cname} (*{fname})({arglist}) = nullptr;'
+                cpp_stub = f'{"vectortype_cstorage" if isinstance(ret_type, VectorT) else ret_type.cname} (*{fname})({arglist}) = nullptr;'
                self.context.module_stubs += cpp_stub + '\n'
                self.context.module_map[fname] = cpp_stub
                #registration for parser
@ -1035,7 +1175,56 @@ class load(ast_node):
        self.sql = f'{s1} \'{p}\' {s2} '
        if 'term' in node:
            self.sql += f' {s3} \'{node["term"]["literal"]}\''
-                    
+            
+    def produce_cpp(self, node):
+        self.context.has_dll = True
+        self.context.headers.add('"csv.h"')
+        node = node['load']
+        self.postproc_fname = 'ld_' + base62uuid(5)
+        self.context.postproc_begin(self.postproc_fname)
+        
+        table:TableInfo = self.context.tables_byname[node['table']]
+        self.sql = F"SELECT {', '.join([c.name for c in table.columns])} FROM {table.table_name};"
+        self.emit(self.sql+';\n')
+        self.context.sql_end()
+        length_name = 'len_' + base62uuid(6)
+        self.context.emitc(f'auto {length_name} = server->cnt;')
+        
+        out_typenames = [t.type.cname for t in table.columns]
+        outtable_col_nameslist = ', '.join([f'"{c.name}"' for c in table.columns])
+        
+        self.outtable_col_names = 'names_' + base62uuid(4)
+        self.context.emitc(f'const char* {self.outtable_col_names}[] = {{{outtable_col_nameslist}}};')
+        
+        self.out_table = 'tbl_' + base62uuid(4)
+        self.context.emitc(f'auto {self.out_table} = new TableInfo<{",".join(out_typenames)}>("{table.table_name}", {self.outtable_col_names});')
+        for i, c in enumerate(table.columns):
+            c.cxt_name = 'c_' + base62uuid(6) 
+            self.context.emitc(f'decltype(auto) {c.cxt_name} = {self.out_table}->get_col<{i}>();')
+            self.context.emitc(f'{c.cxt_name}.initfrom({length_name}, server->getCol({i}), "{table.columns[i].name}");')
+        csv_reader_name = 'csv_reader_' + base62uuid(6)
+        col_types = [c.type.cname for c in table.columns]
+        col_tmp_names = ['tmp_'+base62uuid(8) for _ in range(len(table.columns))]
+        #col_names = ','.join([f'"{c.name}"' for c in table.columns])
+        term_field = ',' if 'term' not in node else node['term']['literal']
+        term_ele = ';' if 'ele' not in node else node['ele']['literal']
+        self.context.emitc(f'AQCSVReader<{len(col_types)}, \'{term_field.strip()[0]}\', \'{term_ele.strip()[0]}\'> {csv_reader_name}("{node["file"]["literal"]}");')
+        # self.context.emitc(f'{csv_reader_name}.read_header(io::ignore_extra_column, {col_names});')
+        self.context.emitc(f'{csv_reader_name}.next_line();')
+
+        for t, n in zip(col_types, col_tmp_names):
+            self.context.emitc(f'{t} {n};')
+        self.context.emitc(f'while({csv_reader_name}.read_row({",".join(col_tmp_names)})) {{ \n')
+        for i, c in enumerate(table.columns):
+            # self.context.emitc(f'print({col_tmp_names[i]});')
+            self.context.emitc(f'{c.cxt_name}.emplace_back({col_tmp_names[i]});')
+            
+        self.context.emitc('}')
+        # self.context.emitc(f'print(*{self.out_table});')
+        self.context.emitc(f'{self.out_table}->monetdb_append_table(cxt->alt_server, "{table.table_name}");')
+        
+        self.context.postproc_end(self.postproc_fname)
+
 class outfile(ast_node):
    name="_outfile"
    def __init__(self, parent, node, context = None, *, sql = None):
@ -1062,6 +1251,13 @@ class outfile(ast_node):
        filename = node['loc']['literal'] if 'loc' in node else node['literal']
        import os
        p =  os.path.abspath('.').replace('\\', '/') + '/' + filename
+        if os.path.exists(p):  # warn only when a file is really about to be replaced
+            print(f'Warning: file {p} exists and will be overwritten')  # f-string so the actual path is shown
+            try:
+                os.remove(p)
+            except OSError:  # best effort: report the failure and continue
+                print(f'Error: file {p} exists and cannot be removed')
+                
        self.sql = f'COPY {self.parent.sql} INTO \'{p}\''
        d = ','
        e = '\\n'
@ -1137,7 +1333,7 @@ class udf(ast_node):
                
        
    def produce(self, node):
-        from engine.utils import get_legal_name, check_legal_name
+        from engine.utils import check_legal_name, get_legal_name
        node = node[self.name]
        # register udf
        self.agg = 'Agg' in node
@ -1232,7 +1428,7 @@ class udf(ast_node):
                    
                    
    def consume(self, node):
-        from engine.utils import get_legal_name, check_legal_name
+        from engine.utils import check_legal_name, get_legal_name
        node = node[self.name]
                    
        if 'params' in node:
@ -1339,7 +1535,25 @@ class udf(ast_node):
            return udf.ReturnPattern.elemental_return
        else:
            return udf.ReturnPattern.bulk_return
-            
+
+class passthru_sql(ast_node):
+    name = 'sql'
+    first_order = name
+    import re
+    # escapestr = r'''(?:((?:[^;"']|"[^"]*"|'[^']*')+)|(?:--[^\r\n]*[\r|\n])+)'''
+    # escape_comment = fr'''(?:{escapestr}|{escapestr}*-{escapestr}*)'''
+    seprator = re.compile(r'''((?:[^;"']|"[^"]*"|'[^']*')+)''')
+    def __init__(self, _, node, context:Context):
+        sqls = passthru_sql.seprator.split(node['sql'])
+        for sql in sqls:
+            sq = sql.strip(' \t\n\r;')
+            if sq:
+                context.queries.append('Q' + sql.strip('\r\n\t ;') + ';')
+                lq = sq.lower()
+                if lq.startswith('select'):
+                    context.queries.append('O')
+
+
 class user_module_function(OperatorBase):
    def __init__(self, name, nargs, ret_type, context : Context):
        super().__init__(name, nargs, lambda *_: ret_type, call=fn_behavior)
@ -1355,4 +1569,5 @@ def include(objs):
            
            
 import sys
+
 include(sys.modules[__name__])
--- a/reconstruct/expr.py
+++ b/reconstruct/expr.py
@ -1,7 +1,8 @@
 from typing import Optional, Set
+
+from engine.types import *
 from reconstruct.ast import ast_node
 from reconstruct.storage import ColRef, Context
-from engine.types import *

 # TODO: Decouple expr and upgrade architecture
 # C_CODE : get ccode/sql code?
@ -31,6 +32,7 @@ class expr(ast_node):
    
    def __init__(self, parent, node, *, c_code = None, supress_undefined = False):
        from reconstruct.ast import projection, udf
+
        # gen2 expr have multi-passes
        # first pass parse json into expr tree
        # generate target code in later passes upon need
@ -78,7 +80,7 @@ class expr(ast_node):
        ast_node.__init__(self, parent, node, None)

    def init(self, _):
-        from reconstruct.ast import projection, _tmp_join_union
+        from reconstruct.ast import _tmp_join_union, projection
        parent = self.parent
        self.is_compound = parent.is_compound if type(parent) is expr else False
        if type(parent) in [projection, expr, _tmp_join_union]:
@ -88,11 +90,13 @@ class expr(ast_node):
        self.udf_map = parent.context.udf_map
        self.func_maps = {**builtin_func, **self.udf_map, **user_module_func}
        self.operators = {**builtin_operators, **self.udf_map, **user_module_func}
-        self.ext_aggfuncs = ['sum', 'avg', 'count', 'min', 'max', 'last', 'first', 'prev', 'next']
+        self.ext_aggfuncs = ['sum', 'avg', 'count', 'min', 'max', 
+                             'last', 'first', 'prev', 'next', 'var', 
+                             'stddev']
        
    def produce(self, node):
        from engine.utils import enlist
-        from reconstruct.ast import udf
+        from reconstruct.ast import udf, projection
        
        if type(node) is dict:
            if 'literal' in node:
@ -166,8 +170,17 @@ class expr(ast_node):
                            
                        special_func = [*self.context.udf_map.keys(), *self.context.module_map.keys(), 
                                        "maxs", "mins", "avgs", "sums", "deltas", "last", "first", 
-                                        "ratios", "pack", "truncate"]
-                        if self.context.special_gb:
+                                        "stddevs", "vars", "ratios", "pack", "truncate"]
+                        
+                        if (
+                                self.context.special_gb 
+                                    or 
+                                (
+                                    type(self.root.parent) is projection 
+                                        and
+                                    self.root.parent.force_use_spgb
+                                )
+                           ):
                            special_func = [*special_func, *self.ext_aggfuncs]
                            
                        if key in special_func and not self.is_special:
@ -333,7 +346,8 @@ class expr(ast_node):
                    self.type = ByteT
            elif type(node) is float:
                self.type = DoubleT
-    
+                self.sql = f'{{"CAST({node} AS DOUBLE)" if not c_code else "{node}f"}}'
+                
    def finalize(self, override = False):
        from reconstruct.ast import udf
        if self.codebuf is None or override:
--- a/reconstruct/storage.py
+++ b/reconstruct/storage.py
@ -1,12 +1,14 @@
+from typing import Dict, List, Set
+
 from engine.types import *
 from engine.utils import CaseInsensitiveDict, base62uuid, enlist
-from typing import List, Dict, Set
+

 class ColRef:
    def __init__(self, _ty, cobj, table:'TableInfo', name, id, compound = False, _ty_args = None):
        self.type : Types = AnyT
        if type(_ty) is str:
-            self.type = builtin_types[_ty.lower()]
+            self.type = Types.decode(_ty)
            if _ty_args:
                self.type = self.type(enlist(_ty_args))
        elif type(_ty) is Types:
@ -17,6 +19,7 @@ class ColRef:
        self.alias = set()
        self.id = id # position in table
        self.compound = compound # compound field (list as a field) 
+        self.cxt_name = ''
        # e.g. order by, group by, filter by expressions
        
        self.__arr__ = (_ty, cobj, table, name, id)
@ -42,6 +45,14 @@ class ColRef:
            alias = table_name
        return f'{alias}.{self.get_name()}'
    
+    def rename(self, name):
+        """Give this column a new name and re-index it in its owning table.
+
+        The previous name is discarded from this column's alias set and removed
+        from ``self.table.columns_byname``; the column is then stored there
+        under ``name``. Returns ``self``.
+        """
+        previous = self.name
+        self.alias.discard(previous)
+        by_name = self.table.columns_byname
+        by_name.pop(previous, None)
+        self.name = name
+        by_name[name] = self
+        return self
+    
    def __getitem__(self, key):
        if type(key) is str:
            return getattr(self, key)
@ -94,6 +105,17 @@ class TableInfo:
            return
        self.cxt.tables_byname[alias] = self
        self.alias.add(alias)
+    
+    def rename(self, name):
+        """Re-register this table in the context under ``name``.
+
+        If ``name`` is already a key of ``self.cxt.tables_byname`` an error
+        line is printed and nothing is changed. Otherwise the current
+        ``table_name`` is removed from the context registry and from the alias
+        set, and ``name`` takes its place in both.
+        """
+        registry = self.cxt.tables_byname
+        if name not in registry.keys():
+            old_name = self.table_name
+            registry.pop(old_name, None)
+            self.alias.discard(old_name)
+            self.table_name = name
+            registry[name] = self
+            self.alias.add(name)
+        else:
+            print(f"Error: table name {name} already exists")
        
    def parse_col_names(self, colExpr) -> ColRef:
        parsedColExpr = colExpr.split('.')
@ -134,6 +156,7 @@ class Context:
        self.queries = []
        self.module_init_loc = 0
        self.special_gb = False
+        self.has_dll = False
         
    def __init__(self):
        self.tables_byname = dict()
@ -147,7 +170,6 @@ class Context:
        self.udf_agg_map = dict()
        self.use_columnstore = False
        self.print = print
-        self.has_dll = False
        self.dialect = 'MonetDB'
        self.is_msvc = False
        self.have_hge = False
@ -223,6 +245,14 @@ class Context:
        self.queries.append('P' + proc_name)    
        self.finalize_query()
        
+    def abandon_query(self):
+        """Reset both the SQL and the C code buffers to '' and call finalize_query()."""
+        self.sql = self.ccode = ''
+        self.finalize_query()
+    
+    def direct_output(self):
+        """Append the marker 'O' to the list of queued queries."""
+        self.queries += ['O']
+    
    def abandon_postproc(self):
        self.ccode = ''
        self.finalize_query()
--- a/sdk/Evaluation.cpp
+++ b/sdk/Evaluation.cpp
@ -5,14 +5,13 @@

+// Result record returned by giniSparse() below.
 struct minEval{
        double value;
-        double values;
+        int* values;

 	double eval;
        long left; // how many on its left
        double* record;
        long max;
        long** count;
-        long* sorted; // sorted d
 };

 minEval giniSparse(double** data, long* result, long* d, long size, long col, long classes, long* totalT){
--- a/sdk/Makefile
+++ b/sdk/Makefile
@ -1,5 +1,11 @@
+# Optimisation/debug flags used by the irf target.
+# `make DEBUG=1 irf` selects the sanitizer build; any other value gives the release build.
+OPT_FLAGS =
+ifneq ($(strip $(DEBUG)),1)
+  OPT_FLAGS = -Ofast -march=native -flto -DNDEBUG
+else
+  OPT_FLAGS = -g3 -D_DEBUG -fsanitize=leak -fsanitize=address
+endif
+# None of these targets is named after the file it produces, so declare them phony.
+.PHONY: example irf all
 example:
 	$(CXX) -shared -fPIC example.cpp aquery_mem.cpp -fno-semantic-interposition -Ofast -march=native -flto --std=c++1z -o ../test.so
 irf:
-	$(CXX) -shared -fPIC RF.cpp irf.cpp incrementalDecisionTree.cpp aquery_mem.cpp Evaluation.cpp -fno-semantic-interposition -Ofast -march=native -flto --std=c++1z -o ../libirf.so
+	$(CXX) -shared -fPIC RF.cpp irf.cpp incrementalDecisionTree.cpp aquery_mem.cpp Evaluation.cpp -fno-semantic-interposition $(OPT_FLAGS) --std=c++1z -o ../libirf.so
 all: example
--- a/sdk/irf.cpp
+++ b/sdk/irf.cpp
@ -4,9 +4,6 @@
 #include "../server/table.h"

 DecisionTree* dt = nullptr;
-long pt = 0;
-double** data = nullptr;
-long* result = nullptr;

 __AQEXPORT__(bool) newtree(int height, long f, ColRef<int> sparse, double forget, long maxf, long noclasses, Evaluation e, long r, long rb){
 	if(sparse.size!=f)return 0;
@ -19,14 +16,13 @@ __AQEXPORT__(bool) newtree(int height, long f, ColRef<int> sparse, double forget
 	return 1;
 }

-__AQEXPORT__(bool) additem(ColRef<double>X, long y, long size){
-	long j = 0;
-	if(size>0){
-		free(data);
-		free(result);
-		pt = 0;
-		data=(double**)malloc(size*sizeof(double*));
-		result=(long*)malloc(size*sizeof(long));
+__AQEXPORT__(bool) fit(ColRef<ColRef<double>> X, ColRef<int> y){
+	if(X.size != y.size)return 0;
+	double** data = (double**)malloc(X.size*sizeof(double*));
+	long* result = (long*)malloc(y.size*sizeof(long));
+	for(long i=0; i<X.size; i++){
+		data[i] = X.container[i].container;
+		result[i] = y.container[i];
 	}
 	data[pt] = (double*)malloc(X.size*sizeof(double));
 	for(j=0; j<X.size; j++){
@ -36,19 +32,32 @@ __AQEXPORT__(bool) additem(ColRef<double>X, long y, long size){
 	pt ++;
 	return 1;
 }
-__AQEXPORT__(bool) fit(){
-	if(pt<=0)return 0;
-	dt->fit(data, result, pt);
-	return 1;
+// Train the global tree `dt` on the rows of `v` with one label per row in `res`.
+// Returns false without training when no tree exists yet, when the number of rows
+// and labels differ, or when the row-pointer array cannot be allocated.
+__AQEXPORT__(bool) fit(vector_type<vector_type<double>> v, vector_type<long> res){
+	// `dt` starts out as nullptr at file scope; refuse to dereference it.
+	// A shorter `res` would make the tree read labels past its end.
+	if(dt == nullptr || v.size != res.size)
+		return false;
+	double** data = (double**)malloc(v.size*sizeof(double*));
+	// malloc(0) may legitimately return nullptr, so only a non-empty request can fail.
+	if(data == nullptr && v.size != 0)
+		return false;
+	// Build the array of row pointers the tree expects; the rows themselves are not copied.
+	for(decltype(v.size) i = 0; i < v.size; ++i)
+		data[i] = v.container[i].container;
+	dt->fit(data, res.container, v.size);
+	// NOTE(review): `data` is not freed here; confirm whether DecisionTree::fit keeps
+	// the pointer array before adding a free(data).
+	return true;
 }

-__AQEXPORT__(ColRef_storage) predict(){
-	int* result = (int*)malloc(pt*sizeof(int));
-	for(long i=0; i<pt; i++){
-		result[i]=dt->Test(data[i], dt->DTree);
-	}
+// Classify every row of `v` with the global tree `dt` and return the labels.
+// The returned vectortype_cstorage has size 1; its single element is a malloc'ed
+// vector_type<int> holding one prediction per row. Capacity is set to 0 on both levels.
+// NOTE(review): nothing in this function frees `result` or `container` -- presumably
+// the caller takes ownership; confirm.
+// NOTE(review): `dt` is dereferenced unchecked although it starts out as nullptr.
+__AQEXPORT__(vectortype_cstorage) predict(vector_type<vector_type<double>> v){
+	int* result = (int*)malloc(v.size*sizeof(int));
 	
-	return ColRef_storage(new ColRef_storage(result, pt, 0, "prediction", 0), 1, 0, "prediction", 0);
+	for(long i=0; i<v.size; i++){
+		result[i]=dt->Test(v.container[i].container, dt->DTree);
+	}
+	// Wrap the raw prediction buffer in a heap-allocated vector_type<int> header.
+	auto container = (vector_type<int>*)malloc(sizeof(vector_type<int>));
+	container->size = v.size;
+	container->capacity = 0;
+	container->container = result;
+	auto ret = vectortype_cstorage{.container = container, .size = 1, .capacity = 0};
+	return ret;
 }


--- a/server/Makefile
+++ b/server/Makefile
@ -1,6 +1,6 @@
 debug:
-	g++ -g3 -O0 server/server.cpp server/io.cpp  -o a.out -Wall -Wextra -Wpedantic -lpthread
+	g++ -g3 -O0 server/server.cpp server/libaquery.cpp  -o a.out -Wall -Wextra -Wpedantic -lpthread
 	
 test:
-	g++ --std=c++1z -g3 -O0 server.cpp io.cpp  -o a.out -Wall -Wextra -Wpedantic -lpthread
+	g++ --std=c++1z -g3 -O0 server.cpp libaquery.cpp  -o a.out -Wall -Wextra -Wpedantic -lpthread
 	
--- a/server/aggregations.h
+++ b/server/aggregations.h
@ -202,6 +202,102 @@ decayed_t<VT, types::GetFPType<types::GetLongType<T>>> avgw(uint32_t w, const VT
 	return ret;
 }

+// Moving-window population variance (standard deviation when `sd` is true).
+// ret[i] is computed over the last min(i + 1, w) elements ending at arr[i], the same
+// window convention as avgw(). `w` is clamped to [1, len]; a window of 0 is treated as 1
+// so that no division by zero can occur. An empty input yields an empty result.
+template<class T, template<typename ...> class VT, bool sd = false>
+decayed_t<VT, types::GetFPType<types::GetLongType<T>>> varw(uint32_t w, const VT<T>& arr) {
+	using FPType = types::GetFPType<types::GetLongType<T>>;
+	const uint32_t& len = arr.size;
+	decayed_t<VT, FPType> ret(len);
+	if (!len)
+		return ret;
+	w = w > len ? len : (w ? w : 1);
+	types::GetLongType<T> s = arr[0];	// sum of the elements currently in the window
+	FPType EnX = arr[0];	// mean of the current window
+	FPType MnX{};	// sum of squared deviations from that mean
+	uint32_t i = 1;
+	ret[0] = 0;
+	// Growing phase: the window still starts at arr[0] (Welford-style update).
+	for (; i < w; ++i){
+		s += arr[i];
+		const FPType _EnX = s / (FPType)(i + 1);
+		MnX += (arr[i] - EnX) * (arr[i] - _EnX);
+		EnX = _EnX;
+		ret[i] = MnX / (FPType)(i + 1);
+	}
+	// Sliding phase: arr[i] enters the window and arr[i - w] leaves it.
+	// Replacing x_out by x_in changes the squared-deviation sum by
+	// (x_in - x_out) * ((x_in - new_mean) + (x_out - old_mean)).
+	for (; i < len; ++i){
+		const FPType x_in = arr[i];
+		const FPType x_out = arr[i - w];
+		s += arr[i];
+		s -= arr[i - w];
+		const FPType _EnX = s / (FPType)w;
+		MnX += (x_in - x_out) * ((x_in - _EnX) + (x_out - EnX));
+		if (MnX < 0) MnX = 0;	// rounding can push a zero variance slightly negative
+		EnX = _EnX;
+		ret[i] = MnX / (FPType)w;
+	}
+	// The recurrences above need the variances, so roots are taken only at the end.
+	if constexpr(sd)
+		for (i = 0; i < len; ++i)
+			ret[i] = sqrt(ret[i]);
+	
+	return ret;
+}
+
+
+// Population variance of all elements of `arr` (divides by the element count, so the
+// result equals the last entry produced by vars()). Returns 0 for an empty input.
+template<class T, template<typename ...> class VT>
+types::GetFPType<types::GetLongType<decays<T>>> var(const VT<T>& arr) {
+	typedef types::GetFPType<types::GetLongType<decays<T>>> FPType;
+	const uint32_t& len = arr.size;
+	if (!len)
+		return 0;
+	types::GetLongType<T> s = arr[0];	// running sum of arr[0..i]
+	FPType mean = arr[0];	// mean of arr[0..i-1]
+	FPType m2{};	// running sum of squared deviations from the mean
+	// Start at 1: arr[0] is already accounted for by the initial values above.
+	for (uint32_t i = 1; i < len; ++i){
+		s += arr[i];
+		const FPType next_mean = s / (FPType)(i + 1);
+		// Welford-style update; avoids squaring raw elements in the element type.
+		m2 += (arr[i] - mean) * (arr[i] - next_mean);
+		mean = next_mean;
+	}
+	return m2 / (FPType)len;
+}
+
+// Running (cumulative) population variance: ret[i] is the variance of arr[0..i].
+// With sd = true every entry is square-rooted, giving the running standard deviation.
+// The first entry is always 0; an empty input yields an empty result.
+template<class T, template<typename ...> class VT, bool sd = false>
+decayed_t<VT, types::GetFPType<types::GetLongType<T>>> vars(const VT<T>& arr) {
+	typedef types::GetFPType<types::GetLongType<T>> FPType;
+	const uint32_t& len = arr.size;
+	decayed_t<VT, FPType> ret(len);
+	uint32_t i = 0;
+	types::GetLongType<T> s{};	// running sum of arr[0..i]
+	FPType MnX{};	// running sum of squared deviations from the mean
+	FPType EnX {};	// mean of arr[0..i-1]
+	if (len) {
+		s = arr[0];
+		MnX = 0;
+		EnX = arr[0];
+		ret[i++] = 0;
+	}
+	for (; i < len; ++i){
+		s += arr[i];
+		FPType _EnX = s / (FPType)(i + 1);	// mean including arr[i]
+		// Welford-style update: add (x - old_mean) * (x - new_mean).
+		MnX += (arr[i] - EnX) * (arr[i] - _EnX);
+		EnX = _EnX;
+		ret[i] = MnX / (FPType)(i + 1);
+		if constexpr(sd) ret[i] = sqrt(ret[i]);
+	}
+	return ret;
+}
+// Standard deviation of `arr`: the square root of var(arr).
+template<class T, template<typename ...> class VT>
+types::GetFPType<types::GetLongType<decays<T>>> stddev(const VT<T>& arr) {
+	const auto variance = var(arr);
+	return sqrt(variance);
+}
+// Running standard deviation: vars() with its `sd` flag switched on.
+template<class T, template<typename ...> class VT>
+decayed_t<VT, types::GetFPType<types::GetLongType<T>>> stddevs(const VT<T>& arr) {
+	constexpr bool take_sqrt = true;
+	return vars<T, VT, take_sqrt>(arr);
+}
+// Windowed standard deviation: varw() with its `sd` flag switched on.
+template<class T, template<typename ...> class VT>
+decayed_t<VT, types::GetFPType<types::GetLongType<T>>> stddevw(uint32_t w, const VT<T>& arr) {
+	constexpr bool take_sqrt = true;
+	return varw<T, VT, take_sqrt>(w, arr);
+}
 // use getSignedType
 template<class T, template<typename ...> class VT>
 decayed_t<VT, T> deltas(const VT<T>& arr) {
@ -251,26 +347,33 @@ T first(const VT<T>& arr) {
 }


+
 #define __DEFAULT_AGGREGATE_FUNCTION__(NAME, RET) \
-template <class T> constexpr inline T NAME(const T& v) { return RET; }
+template <class T> constexpr T NAME(const T& v) { return RET; }

 // non-aggregation count. E.g. SELECT COUNT(col) from table; 
-template <class T> constexpr inline T count(const T& v) { return 1; }
-template <class T> constexpr inline T max(const T& v) { return v; }
-template <class T> constexpr inline T min(const T& v) { return v; }
-template <class T> constexpr inline T avg(const T& v) { return v; }
-template <class T> constexpr inline T sum(const T& v) { return v; }
-template <class T> constexpr inline T maxw(uint32_t, const T& v) { return v; }
-template <class T> constexpr inline T minw(uint32_t, const T& v) { return v; }
-template <class T> constexpr inline T avgw(uint32_t, const T& v) { return v; }
-template <class T> constexpr inline T sumw(uint32_t, const T& v) { return v; }
-template <class T> constexpr inline T ratiow(uint32_t, const T& v) { return 1; }
-template <class T> constexpr inline T maxs(const T& v) { return v; }
-template <class T> constexpr inline T mins(const T& v) { return v; }
-template <class T> constexpr inline T avgs(const T& v) { return v; }
-template <class T> constexpr inline T sums(const T& v) { return v; }
-template <class T> constexpr inline T last(const T& v) { return v; }
-template <class T> constexpr inline T prev(const T& v) { return v; }
-template <class T> constexpr inline T aggnext(const T& v) { return v; }
-template <class T> constexpr inline T daltas(const T& v) { return 0; }
-template <class T> constexpr inline T ratios(const T& v) { return 1; }
+// Fallbacks used when an aggregate is applied to a single value instead of a vector.
+// Each returns a constant of type T or the value itself, as written below.
+template <class T> constexpr T count(const T&) { return 1; }
+template <class T> constexpr T var(const T&) { return 0; }
+template <class T> constexpr T vars(const T&) { return 0; }
+template <class T> constexpr T varw(uint32_t, const T&) { return 0; }
+template <class T> constexpr T stddev(const T&) { return 0; }
+template <class T> constexpr T stddevs(const T&) { return 0; }
+template <class T> constexpr T stddevw(uint32_t, const T&) { return 0; }
+template <class T> constexpr T max(const T& v) { return v; }
+template <class T> constexpr T min(const T& v) { return v; }
+template <class T> constexpr T avg(const T& v) { return v; }
+template <class T> constexpr T sum(const T& v) { return v; }
+template <class T> constexpr T maxw(uint32_t, const T& v) { return v; }
+template <class T> constexpr T minw(uint32_t, const T& v) { return v; }
+template <class T> constexpr T avgw(uint32_t, const T& v) { return v; }
+template <class T> constexpr T sumw(uint32_t, const T& v) { return v; }
+template <class T> constexpr T ratiow(uint32_t, const T&) { return 1; }
+template <class T> constexpr T maxs(const T& v) { return v; }
+template <class T> constexpr T mins(const T& v) { return v; }
+template <class T> constexpr T avgs(const T& v) { return v; }
+template <class T> constexpr T sums(const T& v) { return v; }
+template <class T> constexpr T last(const T& v) { return v; }
+template <class T> constexpr T prev(const T& v) { return v; }
+template <class T> constexpr T aggnext(const T& v) { return v; }
+// `daltas` is a misspelling of `deltas`; it is kept so existing callers still compile.
+template <class T> constexpr T daltas(const T&) { return 0; }
+// Correctly named scalar fallback for the vector deltas() defined above.
+template <class T> constexpr T deltas(const T&) { return 0; }
+template <class T> constexpr T ratios(const T&) { return 1; }
--- a/server/dragonbox/dragonbox.h
+++ b/server/dragonbox/dragonbox.h
--- a/server/dragonbox/dragonbox_to_chars.h
+++ b/server/dragonbox/dragonbox_to_chars.h
@ -0,0 +1,108 @@
+// Copyright 2020-2022 Junekey Jeon
+//
+// The contents of this file may be used under the terms of
+// the Apache License v2.0 with LLVM Exceptions.
+//
+//    (See accompanying file LICENSE-Apache or copy at
+//     https://llvm.org/foundation/relicensing/LICENSE.txt)
+//
+// Alternatively, the contents of this file may be used under the terms of
+// the Boost Software License, Version 1.0.
+//    (See accompanying file LICENSE-Boost or copy at
+//     https://www.boost.org/LICENSE_1_0.txt)
+//
+// Unless required by applicable law or agreed to in writing, this software
+// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.
+
+#ifndef JKJ_HEADER_DRAGONBOX_TO_CHARS
+#define JKJ_HEADER_DRAGONBOX_TO_CHARS
+
+#include "dragonbox.h"
+
+namespace jkj::dragonbox {
+    namespace to_chars_detail {
+        template <class Float, class FloatTraits>
+        extern char* to_chars(typename FloatTraits::carrier_uint significand, int exponent,
+                              char* buffer) noexcept;
+
+        // Writes the textual form of `br` at `buffer` and returns the position just past
+        // the last character written; no null terminator is added.
+        // Non-finite inputs become "NaN" (never signed) or "Infinity"; a finite zero
+        // becomes "0". Policies arrive as one holder type rather than as tag arguments,
+        // which avoids needless ABI overhead incurred by tag dispatch.
+        template <class PolicyHolder, class Float, class FloatTraits>
+        char* to_chars_n_impl(float_bits<Float, FloatTraits> br, char* buffer) noexcept {
+            auto const exponent_bits = br.extract_exponent_bits();
+            auto const s = br.remove_exponent_bits(exponent_bits);
+            bool const finite = br.is_finite(exponent_bits);
+
+            // NaN is the only case that is printed without looking at the sign.
+            if (!finite && !s.has_all_zero_significand_bits()) {
+                std::memcpy(buffer, "NaN", 3);
+                return buffer + 3;
+            }
+
+            // Finite values and infinities share the same sign handling.
+            if (s.is_negative()) {
+                *buffer = '-';
+                ++buffer;
+            }
+
+            if (!finite) {
+                std::memcpy(buffer, "Infinity", 8);
+                return buffer + 8;
+            }
+
+            if (!br.is_nonzero()) {
+                *buffer = '0';
+                return buffer + 1;
+            }
+
+            // Nonzero finite value: convert to decimal, then print significand and exponent.
+            auto result = to_decimal<Float, FloatTraits>(
+                s, exponent_bits, policy::sign::ignore, policy::trailing_zero::ignore,
+                typename PolicyHolder::decimal_to_binary_rounding_policy{},
+                typename PolicyHolder::binary_to_decimal_rounding_policy{},
+                typename PolicyHolder::cache_policy{});
+            return to_chars_detail::to_chars<Float, FloatTraits>(result.significand,
+                                                                 result.exponent, buffer);
+        }
+    }
+
+    // Formats x into buffer without a null terminator and returns the position just
+    // past the last character written. Policies that are not supplied fall back to
+    // nearest_to_even / to_even / full cache, as listed in the defaults below.
+    template <class Float, class FloatTraits = default_float_traits<Float>, class... Policies>
+    char* to_chars_n(Float x, char* buffer, Policies... policies) noexcept {
+        using namespace jkj::dragonbox::detail::policy_impl;
+        using bits_type = float_bits<Float, FloatTraits>;
+        using default_policies =
+            base_default_pair_list<base_default_pair<decimal_to_binary_rounding::base,
+                                                     decimal_to_binary_rounding::nearest_to_even>,
+                                   base_default_pair<binary_to_decimal_rounding::base,
+                                                     binary_to_decimal_rounding::to_even>,
+                                   base_default_pair<cache::base, cache::full>>;
+        using policy_holder = decltype(make_policy_holder(default_policies{}, policies...));
+
+        return to_chars_detail::to_chars_n_impl<policy_holder>(bits_type(x), buffer);
+    }
+
+    // Same as to_chars_n, but also writes a '\0' after the output.
+    // The returned pointer addresses that terminator.
+    template <class Float, class FloatTraits = default_float_traits<Float>, class... Policies>
+    char* to_chars(Float x, char* buffer, Policies... policies) noexcept {
+        char* const end = to_chars_n<Float, FloatTraits>(x, buffer, policies...);
+        *end = '\0';
+        return end;
+    }
+
+    // Maximum required buffer size (excluding null-terminator).
+    // ieee754_binary32 gets the 15-character bound; every other FloatFormat gets the
+    // 24-character bound itemised for binary64 below.
+    template <class FloatFormat>
+    inline constexpr std::size_t max_output_string_length =
+        std::is_same_v<FloatFormat, ieee754_binary32>
+            ?
+            // sign(1) + significand(9) + decimal_point(1) + exp_marker(1) + exp_sign(1) + exp(2)
+            (1 + 9 + 1 + 1 + 1 + 2)
+            :
+            // format == ieee754_format::binary64
+            // sign(1) + significand(17) + decimal_point(1) + exp_marker(1) + exp_sign(1) + exp(3)
+            (1 + 17 + 1 + 1 + 1 + 3);
+}
+
+#endif
--- a/server/dragonbox/dragonbox_to_chars.hpp
+++ b/server/dragonbox/dragonbox_to_chars.hpp
@ -0,0 +1,521 @@
+// Copyright 2020-2022 Junekey Jeon
+//
+// The contents of this file may be used under the terms of
+// the Apache License v2.0 with LLVM Exceptions.
+//
+//    (See accompanying file LICENSE-Apache or copy at
+//     https://llvm.org/foundation/relicensing/LICENSE.txt)
+//
+// Alternatively, the contents of this file may be used under the terms of
+// the Boost Software License, Version 1.0.
+//    (See accompanying file LICENSE-Boost or copy at
+//     https://www.boost.org/LICENSE_1_0.txt)
+//
+// Unless required by applicable law or agreed to in writing, this software
+// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.
+
+#pragma once 
+
+#include "dragonbox_to_chars.h"
+
+#if defined(__GNUC__) || defined(__clang__)
+    #define JKJ_FORCEINLINE inline __attribute__((always_inline))
+#elif defined(_MSC_VER)
+    #define JKJ_FORCEINLINE __forceinline
+#else
+    #define JKJ_FORCEINLINE inline
+#endif
+
+namespace jkj::dragonbox {
+    namespace to_chars_detail {
+        // These "//"'s are to prevent clang-format to ruin this nice alignment.
+        // Thanks to reddit user u/mcmcc:
+        // https://www.reddit.com/r/cpp/comments/so3wx9/dragonbox_110_is_released_a_fast_floattostring/hw8z26r/?context=3
+        // Two-character decimal spelling of every value 0..99: entries [2 * n] and
+        // [2 * n + 1] are the tens and ones digits of n.
+        static constexpr char radix_100_table[] = {
+            '0', '0', '0', '1', '0', '2', '0', '3', '0', '4', //
+            '0', '5', '0', '6', '0', '7', '0', '8', '0', '9', //
+            '1', '0', '1', '1', '1', '2', '1', '3', '1', '4', //
+            '1', '5', '1', '6', '1', '7', '1', '8', '1', '9', //
+            '2', '0', '2', '1', '2', '2', '2', '3', '2', '4', //
+            '2', '5', '2', '6', '2', '7', '2', '8', '2', '9', //
+            '3', '0', '3', '1', '3', '2', '3', '3', '3', '4', //
+            '3', '5', '3', '6', '3', '7', '3', '8', '3', '9', //
+            '4', '0', '4', '1', '4', '2', '4', '3', '4', '4', //
+            '4', '5', '4', '6', '4', '7', '4', '8', '4', '9', //
+            '5', '0', '5', '1', '5', '2', '5', '3', '5', '4', //
+            '5', '5', '5', '6', '5', '7', '5', '8', '5', '9', //
+            '6', '0', '6', '1', '6', '2', '6', '3', '6', '4', //
+            '6', '5', '6', '6', '6', '7', '6', '8', '6', '9', //
+            '7', '0', '7', '1', '7', '2', '7', '3', '7', '4', //
+            '7', '5', '7', '6', '7', '7', '7', '8', '7', '9', //
+            '8', '0', '8', '1', '8', '2', '8', '3', '8', '4', //
+            '8', '5', '8', '6', '8', '7', '8', '8', '8', '9', //
+            '9', '0', '9', '1', '9', '2', '9', '3', '9', '4', //
+            '9', '5', '9', '6', '9', '7', '9', '8', '9', '9'  //
+        };
+        // Leading digit followed by a decimal point for every value 0..99: entry
+        // [2 * n] is n itself for n < 10 and the tens digit of n otherwise, and entry
+        // [2 * n + 1] is always '.'.
+        static constexpr char radix_100_head_table[] = {
+            '0', '.', '1', '.', '2', '.', '3', '.', '4', '.', //
+            '5', '.', '6', '.', '7', '.', '8', '.', '9', '.', //
+            '1', '.', '1', '.', '1', '.', '1', '.', '1', '.', //
+            '1', '.', '1', '.', '1', '.', '1', '.', '1', '.', //
+            '2', '.', '2', '.', '2', '.', '2', '.', '2', '.', //
+            '2', '.', '2', '.', '2', '.', '2', '.', '2', '.', //
+            '3', '.', '3', '.', '3', '.', '3', '.', '3', '.', //
+            '3', '.', '3', '.', '3', '.', '3', '.', '3', '.', //
+            '4', '.', '4', '.', '4', '.', '4', '.', '4', '.', //
+            '4', '.', '4', '.', '4', '.', '4', '.', '4', '.', //
+            '5', '.', '5', '.', '5', '.', '5', '.', '5', '.', //
+            '5', '.', '5', '.', '5', '.', '5', '.', '5', '.', //
+            '6', '.', '6', '.', '6', '.', '6', '.', '6', '.', //
+            '6', '.', '6', '.', '6', '.', '6', '.', '6', '.', //
+            '7', '.', '7', '.', '7', '.', '7', '.', '7', '.', //
+            '7', '.', '7', '.', '7', '.', '7', '.', '7', '.', //
+            '8', '.', '8', '.', '8', '.', '8', '.', '8', '.', //
+            '8', '.', '8', '.', '8', '.', '8', '.', '8', '.', //
+            '9', '.', '9', '.', '9', '.', '9', '.', '9', '.', //
+            '9', '.', '9', '.', '9', '.', '9', '.', '9', '.'  //
+        };
+
+        // These digit generation routines are inspired by James Anhalt's itoa algorithm:
+        // https://github.com/jeaiii/itoa
+        // The main idea is for given n, find y such that floor(10^k * y / 2^32) = n holds,
+        // where k is an appropriate integer depending on the length of n.
+        // For example, if n = 1234567, we set k = 6. In this case, we have
+        // floor(y / 2^32) = 1,
+        // floor(10^2 * ((10^0 * y) mod 2^32) / 2^32) = 23,
+        // floor(10^2 * ((10^2 * y) mod 2^32) / 2^32) = 45, and
+        // floor(10^2 * ((10^4 * y) mod 2^32) / 2^32) = 67.
+        // See https://jk-jeon.github.io/posts/2022/02/jeaiii-algorithm/ for more explanation.
+
+        JKJ_FORCEINLINE static void print_9_digits(std::uint32_t s32, int& exponent,
+                                                   char*& buffer) noexcept {
+            // -- IEEE-754 binary32
+            // Since we do not cut trailing zeros in advance, s32 must be of 6~9 digits
+            // unless the original input was subnormal.
+            // In particular, when it is of 9 digits it shouldn't have any trailing zeros.
+            // -- IEEE-754 binary64
+            // In this case, s32 must be of 7~9 digits unless the input is subnormal,
+            // and it shouldn't have any trailing zeros if it is of 9 digits.
+            if (s32 >= 1'0000'0000) {
+                // 9 digits.
+                // 1441151882 = ceil(2^57 / 1'0000'0000) + 1
+                auto prod = s32 * std::uint64_t(1441151882);
+                prod >>= 25;
+                std::memcpy(buffer, radix_100_head_table + std::uint32_t(prod >> 32) * 2, 2);
+
+                prod = std::uint32_t(prod) * std::uint64_t(100);
+                std::memcpy(buffer + 2, radix_100_table + std::uint32_t(prod >> 32) * 2, 2);
+                prod = std::uint32_t(prod) * std::uint64_t(100);
+                std::memcpy(buffer + 4, radix_100_table + std::uint32_t(prod >> 32) * 2, 2);
+                prod = std::uint32_t(prod) * std::uint64_t(100);
+                std::memcpy(buffer + 6, radix_100_table + std::uint32_t(prod >> 32) * 2, 2);
+                prod = std::uint32_t(prod) * std::uint64_t(100);
+                std::memcpy(buffer + 8, radix_100_table + std::uint32_t(prod >> 32) * 2, 2);
+
+                exponent += 8;
+                buffer += 10;
+            }
+            else if (s32 >= 100'0000) {
+                // 7 or 8 digits.
+                // 281474978 = ceil(2^48 / 100'0000) + 1
+                auto prod = s32 * std::uint64_t(281474978);
+                prod >>= 16;
+                auto two_digits = std::uint32_t(prod >> 32);
+                // If s32 is of 8 digits, increase the exponent by 7.
+                // Otherwise, increase it by 6.
+                exponent += (6 + unsigned(two_digits >= 10));
+
+                // Write the first digit and the decimal point.
+                std::memcpy(buffer, radix_100_head_table + two_digits * 2, 2);
+                // This third character may be overwritten later but we don't care.
+                buffer[2] = radix_100_table[two_digits * 2 + 1];
+
+                // Remaining 6 digits are all zero?
+                if (std::uint32_t(prod) <= std::uint32_t((std::uint64_t(1) << 32) / 100'0000)) {
+                    // The number of characters actually written is:
+                    //   1, if only the first digit is nonzero, which means that either s32 is of 7
+                    //   digits or it is of 8 digits but the second digit is zero, or
+                    //   3, otherwise.
+                    // Note that buffer[2] is never zero if s32 is of 7 digits, because the input is
+                    // never zero.
+                    buffer += (1 + (unsigned(two_digits >= 10) & unsigned(buffer[2] > '0')) * 2);
+                }
+                else {
+                    // At least one of the remaining 6 digits are nonzero.
+                    // After this adjustment, now the first destination becomes buffer + 2.
+                    buffer += unsigned(two_digits >= 10);
+
+                    // Obtain the next two digits.
+                    prod = std::uint32_t(prod) * std::uint64_t(100);
+                    two_digits = std::uint32_t(prod >> 32);
+                    std::memcpy(buffer + 2, radix_100_table + two_digits * 2, 2);
+
+                    // Remaining 4 digits are all zero?
+                    if (std::uint32_t(prod) <= std::uint32_t((std::uint64_t(1) << 32) / 1'0000)) {
+                        buffer += (3 + unsigned(buffer[3] > '0'));
+                    }
+                    else {
+                        // At least one of the remaining 4 digits are nonzero.
+
+                        // Obtain the next two digits.
+                        prod = std::uint32_t(prod) * std::uint64_t(100);
+                        two_digits = std::uint32_t(prod >> 32);
+                        std::memcpy(buffer + 4, radix_100_table + two_digits * 2, 2);
+
+                        // Remaining 2 digits are all zero?
+                        if (std::uint32_t(prod) <= std::uint32_t((std::uint64_t(1) << 32) / 100)) {
+                            buffer += (5 + unsigned(buffer[5] > '0'));
+                        }
+                        else {
+                            // Obtain the last two digits.
+                            prod = std::uint32_t(prod) * std::uint64_t(100);
+                            two_digits = std::uint32_t(prod >> 32);
+                            std::memcpy(buffer + 6, radix_100_table + two_digits * 2, 2);
+
+                            buffer += (7 + unsigned(buffer[7] > '0'));
+                        }
+                    }
+                }
+            }
+            else if (s32 >= 1'0000) {
+                // 5 or 6 digits.
+                // 429497 = ceil(2^32 / 1'0000)
+                auto prod = s32 * std::uint64_t(429497);
+                auto two_digits = std::uint32_t(prod >> 32);
+
+                // If s32 is of 6 digits, increase the exponent by 5.
+                // Otherwise, increase it by 4.
+                exponent += (4 + unsigned(two_digits >= 10));
+
+                // Write the first digit and the decimal point.
+                std::memcpy(buffer, radix_100_head_table + two_digits * 2, 2);
+                // This third character may be overwritten later but we don't care.
+                buffer[2] = radix_100_table[two_digits * 2 + 1];
+
+                // Remaining 4 digits are all zero?
+                if (std::uint32_t(prod) <= std::uint32_t((std::uint64_t(1) << 32) / 1'0000)) {
+                    // The number of characters actually written is 1 or 3, similarly to the case of
+                    // 7 or 8 digits.
+                    buffer += (1 + (unsigned(two_digits >= 10) & unsigned(buffer[2] > '0')) * 2);
+                }
+                else {
+                    // At least one of the remaining 4 digits are nonzero.
+                    // After this adjustment, now the first destination becomes buffer + 2.
+                    buffer += unsigned(two_digits >= 10);
+
+                    // Obtain the next two digits.
+                    prod = std::uint32_t(prod) * std::uint64_t(100);
+                    two_digits = std::uint32_t(prod >> 32);
+                    std::memcpy(buffer + 2, radix_100_table + two_digits * 2, 2);
+
+                    // Remaining 2 digits are all zero?
+                    if (std::uint32_t(prod) <= std::uint32_t((std::uint64_t(1) << 32) / 100)) {
+                        buffer += (3 + unsigned(buffer[3] > '0'));
+                    }
+                    else {
+                        // Obtain the last two digits.
+                        prod = std::uint32_t(prod) * std::uint64_t(100);
+                        two_digits = std::uint32_t(prod >> 32);
+                        std::memcpy(buffer + 4, radix_100_table + two_digits * 2, 2);
+
+                        buffer += (5 + unsigned(buffer[5] > '0'));
+                    }
+                }
+            }
+            else if (s32 >= 100) {
+                // 3 or 4 digits.
+                // 42949673 = ceil(2^32 / 100)
+                auto prod = s32 * std::uint64_t(42949673);
+                auto two_digits = std::uint32_t(prod >> 32);
+
+                // If s32 is of 4 digits, increase the exponent by 3.
+                // Otherwise, increase it by 2.
+                exponent += (2 + int(two_digits >= 10));
+
+                // Write the first digit and the decimal point.
+                std::memcpy(buffer, radix_100_head_table + two_digits * 2, 2);
+                // This third character may be overwritten later but we don't care.
+                buffer[2] = radix_100_table[two_digits * 2 + 1];
+
+                // Remaining 2 digits are all zero?
+                if (std::uint32_t(prod) <= std::uint32_t((std::uint64_t(1) << 32) / 100)) {
+                    // The number of characters actually written is 1 or 3, similarly to the case of
+                    // 7 or 8 digits.
+                    buffer += (1 + (unsigned(two_digits >= 10) & unsigned(buffer[2] > '0')) * 2);
+                }
+                else {
+                    // At least one of the remaining 2 digits are nonzero.
+                    // After this adjustment, now the first destination becomes buffer + 2.
+                    buffer += unsigned(two_digits >= 10);
+
+                    // Obtain the last two digits.
+                    prod = std::uint32_t(prod) * std::uint64_t(100);
+                    two_digits = std::uint32_t(prod >> 32);
+                    std::memcpy(buffer + 2, radix_100_table + two_digits * 2, 2);
+
+                    buffer += (3 + unsigned(buffer[3] > '0'));
+                }
+            }
+            else {
+                // 1 or 2 digits.
+                // If s32 is of 2 digits, increase the exponent by 1.
+                exponent += int(s32 >= 10);
+
+                // Write the first digit and the decimal point.
+                std::memcpy(buffer, radix_100_head_table + s32 * 2, 2);
+                // This third character may be overwritten later but we don't care.
+                buffer[2] = radix_100_table[s32 * 2 + 1];
+
+                // The number of characters actually written is 1 or 3, similarly to the case of
+                // 7 or 8 digits.
+                buffer += (1 + (unsigned(s32 >= 10) & unsigned(buffer[2] > '0')) * 2);
+            }
+        }
+
+        // Writes the decimal significand `s32` followed by an optional exponent suffix
+        // ("E<n>" or "E-<n>") into `buffer` and returns the position one past the last
+        // character written. No terminator is appended.
+        template <>
+        char* to_chars<float, default_float_traits<float>>(std::uint32_t s32, int exponent,
+                                                           char* buffer) noexcept {
+            // Emit the significand; `exponent` and `buffer` are read again afterwards.
+            print_9_digits(s32, exponent, buffer);
+
+            // A zero exponent produces no suffix at all.
+            if (exponent == 0) {
+                return buffer;
+            }
+
+            // Exponent marker, plus a minus sign when the exponent is negative.
+            *buffer++ = 'E';
+            if (exponent < 0) {
+                *buffer++ = '-';
+                exponent = -exponent;
+            }
+
+            // Single-digit magnitude.
+            if (exponent < 10) {
+                *buffer++ = char('0' + exponent);
+                return buffer;
+            }
+
+            // Two-digit magnitude, copied straight from the "00".."99" lookup table.
+            std::memcpy(buffer, radix_100_table + exponent * 2, 2);
+            return buffer + 2;
+        }
+
+        // Writes the decimal significand `significand` in scientific form (first digit,
+        // decimal point, remaining digits) followed by an optional exponent suffix
+        // ("E<n>" or "E-<n>", up to three exponent digits) into `buffer`.
+        // Returns the position one past the last character written; no terminator is
+        // appended. `exponent` is adjusted along the way by the number of digits that
+        // end up after the decimal point.
+        template <>
+        char* to_chars<double, default_float_traits<double>>(std::uint64_t const significand,
+                                                             int exponent, char* buffer) noexcept {
+            // Print significand by decomposing it into a 9-digit block and a 8-digit block.
+            std::uint32_t first_block, second_block;
+            bool no_second_block;
+
+            if (significand >= 1'0000'0000) {
+                first_block = std::uint32_t(significand / 1'0000'0000);
+                // Low 8 decimal digits, computed modulo 2^32 (the true value is below 10^8).
+                second_block = std::uint32_t(significand) - first_block * 1'0000'0000;
+                exponent += 8;
+                no_second_block = (second_block == 0);
+            }
+            else {
+                first_block = std::uint32_t(significand);
+                no_second_block = true;
+            }
+
+            if (no_second_block) {
+                // Everything worth printing is in first_block; reuse the float path.
+                print_9_digits(first_block, exponent, buffer);
+            }
+            else {
+                // We proceed similarly to print_9_digits(), but since we do not need to remove
+                // trailing zeros, the procedure is a bit simpler.
+                if (first_block >= 1'0000'0000) {
+                    // The input is of 17 digits, thus there should be no trailing zero at all.
+                    // The first block is of 9 digits.
+                    // 1441151882 = ceil(2^57 / 1'0000'0000) + 1
+                    auto prod = first_block * std::uint64_t(1441151882);
+                    prod >>= 25;
+                    // Upper 32 bits of prod hold the next digit pair; the lower 32 bits hold
+                    // the remaining fraction, which is multiplied by 100 to expose the next pair.
+                    std::memcpy(buffer, radix_100_head_table + std::uint32_t(prod >> 32) * 2, 2);
+
+                    prod = std::uint32_t(prod) * std::uint64_t(100);
+                    std::memcpy(buffer + 2, radix_100_table + std::uint32_t(prod >> 32) * 2, 2);
+                    prod = std::uint32_t(prod) * std::uint64_t(100);
+                    std::memcpy(buffer + 4, radix_100_table + std::uint32_t(prod >> 32) * 2, 2);
+                    prod = std::uint32_t(prod) * std::uint64_t(100);
+                    std::memcpy(buffer + 6, radix_100_table + std::uint32_t(prod >> 32) * 2, 2);
+                    prod = std::uint32_t(prod) * std::uint64_t(100);
+                    std::memcpy(buffer + 8, radix_100_table + std::uint32_t(prod >> 32) * 2, 2);
+
+                    // The second block is of 8 digits.
+                    // 281474978 = ceil(2^48 / 100'0000) + 1
+                    prod = second_block * std::uint64_t(281474978);
+                    prod >>= 16;
+                    prod += 1;
+                    std::memcpy(buffer + 10, radix_100_table + std::uint32_t(prod >> 32) * 2, 2);
+                    prod = std::uint32_t(prod) * std::uint64_t(100);
+                    std::memcpy(buffer + 12, radix_100_table + std::uint32_t(prod >> 32) * 2, 2);
+                    prod = std::uint32_t(prod) * std::uint64_t(100);
+                    std::memcpy(buffer + 14, radix_100_table + std::uint32_t(prod >> 32) * 2, 2);
+                    prod = std::uint32_t(prod) * std::uint64_t(100);
+                    std::memcpy(buffer + 16, radix_100_table + std::uint32_t(prod >> 32) * 2, 2);
+
+                    exponent += 8;
+                    // 17 digits plus the decimal point.
+                    buffer += 18;
+                }
+                else {
+                    if (first_block >= 100'0000) {
+                        // 7 or 8 digits.
+                        // 281474978 = ceil(2^48 / 100'0000) + 1
+                        auto prod = first_block * std::uint64_t(281474978);
+                        prod >>= 16;
+                        auto two_digits = std::uint32_t(prod >> 32);
+
+                        std::memcpy(buffer, radix_100_head_table + two_digits * 2, 2);
+                        buffer[2] = radix_100_table[two_digits * 2 + 1];
+
+                        exponent += (6 + unsigned(two_digits >= 10));
+                        // Skip one extra position when the leading pair had two digits.
+                        buffer += unsigned(two_digits >= 10);
+
+                        // Print remaining 6 digits.
+                        prod = std::uint32_t(prod) * std::uint64_t(100);
+                        std::memcpy(buffer + 2, radix_100_table + std::uint32_t(prod >> 32) * 2, 2);
+                        prod = std::uint32_t(prod) * std::uint64_t(100);
+                        std::memcpy(buffer + 4, radix_100_table + std::uint32_t(prod >> 32) * 2, 2);
+                        prod = std::uint32_t(prod) * std::uint64_t(100);
+                        std::memcpy(buffer + 6, radix_100_table + std::uint32_t(prod >> 32) * 2, 2);
+
+                        buffer += 8;
+                    }
+                    else if (first_block >= 1'0000) {
+                        // 5 or 6 digits.
+                        // 429497 = ceil(2^32 / 1'0000)
+                        auto prod = first_block * std::uint64_t(429497);
+                        auto two_digits = std::uint32_t(prod >> 32);
+
+                        std::memcpy(buffer, radix_100_head_table + two_digits * 2, 2);
+                        buffer[2] = radix_100_table[two_digits * 2 + 1];
+
+                        exponent += (4 + unsigned(two_digits >= 10));
+                        buffer += unsigned(two_digits >= 10);
+
+                        // Print remaining 4 digits.
+                        prod = std::uint32_t(prod) * std::uint64_t(100);
+                        std::memcpy(buffer + 2, radix_100_table + std::uint32_t(prod >> 32) * 2, 2);
+                        prod = std::uint32_t(prod) * std::uint64_t(100);
+                        std::memcpy(buffer + 4, radix_100_table + std::uint32_t(prod >> 32) * 2, 2);
+
+                        buffer += 6;
+                    }
+                    else if (first_block >= 100) {
+                        // 3 or 4 digits.
+                        // 42949673 = ceil(2^32 / 100)
+                        auto prod = first_block * std::uint64_t(42949673);
+                        auto two_digits = std::uint32_t(prod >> 32);
+
+                        std::memcpy(buffer, radix_100_head_table + two_digits * 2, 2);
+                        buffer[2] = radix_100_table[two_digits * 2 + 1];
+
+                        exponent += (2 + unsigned(two_digits >= 10));
+                        buffer += unsigned(two_digits >= 10);
+
+                        // Print remaining 2 digits.
+                        prod = std::uint32_t(prod) * std::uint64_t(100);
+                        std::memcpy(buffer + 2, radix_100_table + std::uint32_t(prod >> 32) * 2, 2);
+
+                        buffer += 4;
+                    }
+                    else {
+                        // 1 or 2 digits.
+                        std::memcpy(buffer, radix_100_head_table + first_block * 2, 2);
+                        buffer[2] = radix_100_table[first_block * 2 + 1];
+
+                        exponent += unsigned(first_block >= 10);
+                        buffer += (2 + unsigned(first_block >= 10));
+                    }
+
+                    // Next, print the second block.
+                    // The second block is of 8 digits, but we may have trailing zeros.
+                    // 281474978 = ceil(2^48 / 100'0000) + 1
+                    auto prod = second_block * std::uint64_t(281474978);
+                    prod >>= 16;
+                    prod += 1;
+                    auto two_digits = std::uint32_t(prod >> 32);
+                    std::memcpy(buffer, radix_100_table + two_digits * 2, 2);
+
+                    // In each check below, the final `buffer +=` keeps the last written digit
+                    // only when it is not '0', which drops a single trailing zero.
+                    // Remaining 6 digits are all zero?
+                    if (std::uint32_t(prod) <= std::uint32_t((std::uint64_t(1) << 32) / 100'0000)) {
+                        buffer += (1 + unsigned(buffer[1] > '0'));
+                    }
+                    else {
+                        // Obtain the next two digits.
+                        prod = std::uint32_t(prod) * std::uint64_t(100);
+                        two_digits = std::uint32_t(prod >> 32);
+                        std::memcpy(buffer + 2, radix_100_table + two_digits * 2, 2);
+
+                        // Remaining 4 digits are all zero?
+                        if (std::uint32_t(prod) <=
+                            std::uint32_t((std::uint64_t(1) << 32) / 1'0000)) {
+                            buffer += (3 + unsigned(buffer[3] > '0'));
+                        }
+                        else {
+                            // Obtain the next two digits.
+                            prod = std::uint32_t(prod) * std::uint64_t(100);
+                            two_digits = std::uint32_t(prod >> 32);
+                            std::memcpy(buffer + 4, radix_100_table + two_digits * 2, 2);
+
+                            // Remaining 2 digits are all zero?
+                            if (std::uint32_t(prod) <=
+                                std::uint32_t((std::uint64_t(1) << 32) / 100)) {
+                                buffer += (5 + unsigned(buffer[5] > '0'));
+                            }
+                            else {
+                                // Obtain the last two digits.
+                                prod = std::uint32_t(prod) * std::uint64_t(100);
+                                two_digits = std::uint32_t(prod >> 32);
+                                std::memcpy(buffer + 6, radix_100_table + two_digits * 2, 2);
+                                buffer += (7 + unsigned(buffer[7] > '0'));
+                            }
+                        }
+                    }
+                }
+            }
+
+            // Print exponent and return
+            if (exponent < 0) {
+                std::memcpy(buffer, "E-", 2);
+                buffer += 2;
+                exponent = -exponent;
+            }
+            else if (exponent > 0) {
+                buffer[0] = 'E';
+                buffer += 1;
+            }
+            else {
+                // Zero exponent: no suffix is written.
+                return buffer;
+            }
+
+            if (exponent >= 100) {
+                // d1 = exponent / 10; d2 = exponent % 10;
+                // 6554 = ceil(2^16 / 10)
+                auto prod = std::uint32_t(exponent) * std::uint32_t(6554);
+                auto d1 = prod >> 16;
+                prod = std::uint16_t(prod) * std::uint32_t(5); // * 10
+                auto d2 = prod >> 15;                          // >> 16
+                // d1 is the leading two exponent digits, d2 the final one.
+                std::memcpy(buffer, &radix_100_table[d1 * 2], 2);
+                buffer[2] = char('0' + d2);
+                buffer += 3;
+            }
+            else if (exponent >= 10) {
+                std::memcpy(buffer, &radix_100_table[exponent * 2], 2);
+                buffer += 2;
+            }
+            else {
+                buffer[0] = char('0' + exponent);
+                buffer += 1;
+            }
+
+            return buffer;
+        }
+    }
+}
+
--- a/server/gc.h
+++ b/server/gc.h
@ -0,0 +1,63 @@
+#include <atomic>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#ifndef __AQ_USE_THREADEDGC__
+// Deferred-free collector: reg() queues (pointer, deallocator) pairs and a
+// background thread (daemon()) periodically calls gc() to release them.
+// Constructing a GC starts that thread and publishes the instance through
+// GC::gc_handle; destroying it stops the thread.
+class GC {
+private:
+	// Members are initialized in declaration order; the constructor's
+	// initializer list follows the same order.
+	// interval, forced_clean and forceclean_timer are in microseconds
+	// (daemon() passes interval to std::chrono::microseconds).
+	size_t max_slots, 
+		   interval, forced_clean, 
+		   forceclean_timer = 0;
+	// daemon() compares current_size against this value to decide when to sweep.
+	uint64_t max_size;
+	// Flags shared between the daemon thread and callers of reg(). They are
+	// atomic and start out false so the daemon never reads an indeterminate
+	// value and cross-thread accesses are not data races.
+	std::atomic<bool> running{false}, alive{false};
+	// reg() frees immediately when the registered size is below this value.
+	uint32_t threshould;
+	// Active queue and spare queue; gc() swaps them before sweeping.
+	void *q = nullptr, *q_back = nullptr;
+	// Type-erased pointer to the daemon's std::thread.
+	void* handle = nullptr;
+	std::atomic<uint32_t> slot_pos{0};
+	std::atomic<uint32_t> alive_cnt{0};
+	std::atomic<uint64_t> current_size{0};
+	// Set by gc() while it swaps the queues; reg() waits while it is true.
+	std::atomic<bool> lock{false};
+	// maybe use volatile std::thread::id instead
+protected:
+	void acquire_lock();
+	void release_lock();
+	void gc();
+	void daemon();
+	void start_deamon();
+	void terminate_daemon();
+
+public:
+	// Hands ownership of `v` to the collector; `f` is the deallocator that
+	// will eventually be invoked on it and `sz` its size estimate.
+	void reg(void* v, uint32_t sz = 1, 
+			void(*f)(void*) = free
+		);
+
+	GC(
+		uint64_t max_size = 0xfffffff,   // ~256 MB
+		uint32_t max_slots = 4096, 
+		uint32_t interval = 10000, 
+		uint32_t forced_clean = 1000000, // one second
+		uint32_t threshould = 64
+	) : max_slots(max_slots), 
+		interval(interval), forced_clean(forced_clean), 
+		max_size(max_size), 
+		threshould(threshould) {
+
+		start_deamon();
+		GC::gc_handle = this;
+	}
+
+	~GC(){
+		terminate_daemon();
+		// Do not leave a dangling global handle behind.
+		if (GC::gc_handle == this)
+			GC::gc_handle = nullptr;
+	}
+	static GC* gc_handle;
+    constexpr static void(*_free) (void*) = free;
+};
+
+#else
+// Stand-in collector used when the queueing implementation is compiled out:
+// reg() releases the memory immediately instead of deferring it.
+class GC {
+public:
+	// Accepts (and ignores) the same arguments as the queueing GC so either
+	// configuration can be constructed from the same call site.
+	GC(uint64_t = 0, uint32_t = 0, uint32_t = 0, uint32_t = 0, uint32_t = 0) {}
+	// Frees `v` with `f` right away; null pointers/deallocators are ignored,
+	// matching the queueing implementation.
+	void reg(
+		void* v, uint32_t = 0, 
+		void(*f)(void*) = free
+	) const { if (v != nullptr && f != nullptr) f(v); }
+	// Same name as in the queueing class, so the single out-of-class
+	// definition `GC* GC::gc_handle` is valid for both configurations.
+	static GC* gc_handle;
+    constexpr static void(*_free) (void*) = free;
+};
+#endif
--- a/server/gc.hpp
+++ b/server/gc.hpp
@ -1,53 +0,0 @@
-#pragma once
-#include <vector_type>
-#include <utility>
-#include <thread>
-#include <chrono>
-class GC {
-	template<class T>
-	using vector = vector_type<T>;
-	template<class ...T>
-	using tuple = std::tuple<T...>;
-	size_t current_size, max_size, interval, forced_clean;
-	bool running, alive;
-//  ptr, dealloc, ref, sz
-	vector<tuple<void*, void (*)(void*)>> q;
-	std::thread handle;
-	void gc()
-	{
-		
-	}
-	void reg(void* v, uint32_t ref, uint32_t sz, 
-		void(*f)(void*) = [](void* v) {free (v); }) {
-		current_size += sz;
-		if (current_size > max_size)
-			gc();
-		q.push_back({ v, f });
-	}
-	void daemon() {
-		using namespace std::chrono;
-		while (alive) {
-			if (running) {
-				gc();
-				std::this_thread::sleep_for(microseconds(interval));
-			}
-			else {
-				std::this_thread::sleep_for(10ms);
-			}
-		}
-	}
-	void start_deamon() {
-		handle = std::thread(&daemon);
-		alive = true;
-	}
-	void terminate_daemon() {
-		running = false;
-		alive = false;
-		using namespace std::chrono;
-
-		if (handle.joinable()) {
-			std::this_thread::sleep_for(microseconds(1000 + std::max(static_cast<size_t>(10000), interval)));
-			handle.join();
-		}
-	}
-};
--- a/server/jeaiii_to_text.h
+++ b/server/jeaiii_to_text.h
@ -0,0 +1,121 @@
+#pragma once 
+// Copyright (c) 2022 James Edward Anhalt III - https://github.com/jeaiii/itoa
+using u32 = decltype(0xffffffff);
+using u64 = decltype(0xffffffffffffffff);
+
+static_assert(u32(-1) > 0, "u32 must be unsigned");
+static_assert(u32(0xffffffff) + u32(1) == u32(0), "u32 must be 32 bits");
+static_assert(u64(-1) > 0, "u64 must be unsigned");
+static_assert(u64(0xffffffffffffffff) + u32(1) == u32(0), "u64 must be 64 bits");
+
+constexpr auto digits_00_99 =
+    "00010203040506070809" "10111213141516171819" "20212223242526272829" "30313233343536373839"	"40414243444546474849"
+    "50515253545556575859" "60616263646566676869" "70717273747576777879" "80818283848586878889"	"90919293949596979899";
+
+struct pair { char t, o; };
+
+#define JEAIII_W(I, U) *(pair*)&b[I] = *(pair*)&digits_00_99[(U) * 2]
+#define JEAIII_A(I, N) t = (u64(1) << (32 + N / 5 * N * 53 / 16)) / u32(1e##N) + 1 + N / 6 - N / 8, t *= u, t >>= N / 5 * N * 53 / 16, t += N / 6 * 4, JEAIII_W(I, t >> 32)
+#define JEAIII_S(I) b[I] = char(u64(10) * u32(t) >> 32) + '0'
+#define JEAIII_D(I) t = u64(100) * u32(t), JEAIII_W(I, t >> 32)
+
+#define JEAIII_C0(I) b[I] = char(u) + '0'
+#define JEAIII_C1(I) JEAIII_W(I, u)
+#define JEAIII_C2(I) JEAIII_A(I, 1), JEAIII_S(I + 2)
+#define JEAIII_C3(I) JEAIII_A(I, 2), JEAIII_D(I + 2)
+#define JEAIII_C4(I) JEAIII_A(I, 3), JEAIII_D(I + 2), JEAIII_S(I + 4)
+#define JEAIII_C5(I) JEAIII_A(I, 4), JEAIII_D(I + 2), JEAIII_D(I + 4)
+#define JEAIII_C6(I) JEAIII_A(I, 5), JEAIII_D(I + 2), JEAIII_D(I + 4), JEAIII_S(I + 6)
+#define JEAIII_C7(I) JEAIII_A(I, 6), JEAIII_D(I + 2), JEAIII_D(I + 4), JEAIII_D(I + 6)
+#define JEAIII_C8(I) JEAIII_A(I, 7), JEAIII_D(I + 2), JEAIII_D(I + 4), JEAIII_D(I + 6), JEAIII_S(I + 8)
+#define JEAIII_C9(I) JEAIII_A(I, 8), JEAIII_D(I + 2), JEAIII_D(I + 4), JEAIII_D(I + 6), JEAIII_D(I + 8)
+
+#define JEAIII_L(N, A, B) u < u32(1e##N) ? A : B
+#define JEAIII_L09(F) JEAIII_L(2, JEAIII_L(1, F(0), F(1)), JEAIII_L(6, JEAIII_L(4, JEAIII_L(3, F(2), F(3)), JEAIII_L(5, F(4), F(5))), JEAIII_L(8, JEAIII_L(7, F(6), F(7)), JEAIII_L(9, F(8), F(9)))))
+#define JEAIII_L03(F) JEAIII_L(2, JEAIII_L(1, F(0), F(1)), JEAIII_L(3, F(2), F(3)))
+
+#define JEAIII_K(N) (JEAIII_C##N(0), b + N + 1)
+#define JEAIII_KX(N) (JEAIII_C##N(0), u = x, JEAIII_C7(N + 1), b + N + 9)
+#define JEAIII_KYX(N) (JEAIII_C##N(0), u = y, JEAIII_C7(N + 1), u = x, JEAIII_C7(N + 9), b + N + 17)
+
+template<bool B, class T, class F> struct _cond { using type = F; };
+template<class T, class F> struct _cond<true, T, F> { using type = T; };
+template<bool B, class T, class F> using cond = typename _cond<B, T, F>::type;
+
+// Writes the decimal representation of `i` at `b` (with a leading '-' for negative
+// values) and returns a pointer one past the last character written.
+// No terminator is appended. The digit emission itself is generated by the
+// JEAIII_* macros defined above, which read and write the locals b, t, u, x and y.
+template<class T> inline char* to_text_from_integer(char* b, T i)
+{
+    u64 t = u64(i);
+
+    // Negative input: emit the sign and continue with the magnitude (negated modulo 2^64).
+    if (i < T(0))
+        t = u64(0) - t, b[0] = '-', ++b;
+
+    u32 u = cond<T(1) != T(2), cond<sizeof(T) != 1, cond<sizeof(T) != sizeof(short), u32, unsigned short>, unsigned char>, bool>(t);
+
+    // if our input type fits in 32bits, or its value does, treat as 32bit (the line above ensures the compiler can still know the range limits of the input type)
+    // and optimize out cases for small integer types (if only c++ had a builtin way to get the unsigned type from a signed type)
+    if (sizeof(i) <= sizeof(u) || u == t)
+        return JEAIII_L09(JEAIII_K);
+
+    // Split off the low 8 decimal digits; t becomes the remaining high part.
+    u32 x = t % 100000000u;
+    u = u32(t /= 100000000u);
+
+    // t / 10^8 (fits in 32 bit), t % 10^8 -> ~17.5 digits
+    if (u == t)
+        return JEAIII_L09(JEAIII_KX);
+
+    // t / 10^16 (1-4 digits), t / 10^8 % 10^8, t % 10^8
+    u32 y = t % 100000000u;
+    u = u32(t / 100000000u);
+    return JEAIII_L03(JEAIII_KYX);
+}
+
+// Overload set covering the built-in integer types from signed char up to
+// unsigned long long. Each overload forwards to to_text_from_integer() and
+// returns its result (a pointer one past the last character written into `text`).
+inline char* to_text(char text[], signed char i) { return to_text_from_integer(text, i); }
+inline char* to_text(char text[], unsigned char i) { return to_text_from_integer(text, i); }
+inline char* to_text(char text[], short i) { return to_text_from_integer(text, i); }
+inline char* to_text(char text[], unsigned short i) { return to_text_from_integer(text, i); }
+inline char* to_text(char text[], int i) { return to_text_from_integer(text, i); }
+inline char* to_text(char text[], unsigned int i) { return to_text_from_integer(text, i); }
+inline char* to_text(char text[], long i) { return to_text_from_integer(text, i); }
+inline char* to_text(char text[], unsigned long i) { return to_text_from_integer(text, i); }
+inline char* to_text(char text[], long long i) { return to_text_from_integer(text, i); }
+inline char* to_text(char text[], unsigned long long i) { return to_text_from_integer(text, i); }
+
+// Copyright (c) 2022 Bill Sun
+
+//#if defined(SIZEOF___INT128) || (defined(SIZEOF___INT128_T) && defined(SIZEOF___UINT128_T))
+constexpr static __uint128_t _10_19 = 10000000000000000000ull, 
+    _10_37 = _10_19*_10_19 / 10;
+
+template<class T>
+char* jeaiii_i128(char* buf, T v){
+    if constexpr (std::is_signed_v<T>) {
+        if (v < 0){
+            *(buf++) = '0';
+            v = -v;
+        }
+    }
+    if (v > _10_37){
+        uint8_t vv = uint8_t(v/_10_37);
+        // vv <<= 1;
+        // if (vv < 20)
+        //     *buf ++ = digits_00_99[vv + 1];
+        // else{
+        //     memcpy(buf, digits_00_99 + vv, 2);
+        //     buf += 2;
+        // }  
+    
+        *(buf++) = vv%10 + '0';
+        vv/=10;
+        if (vv) {
+            *buf = *(buf-1);
+            *(buf++-1) = vv + '0';
+        }
+    }
+
+    if (v > _10_19)
+        buf = to_text(buf, uint64_t((v/_10_19) % _10_19));
+    
+    buf = to_text(buf, uint64_t(v % _10_19));
+    return buf;
+}
+// #endif
--- a/server/libaquery.cpp
+++ b/server/libaquery.cpp
@ -1,20 +1,20 @@
 #include "pch_msc.hpp"

 #include "io.h"
-#include "table.h"
 #include <limits>

 #include <chrono>
 #include <ctime>

 #include "utils.h"
+#include "libaquery.h"
 #include <random>

 char* gbuf = nullptr;

 void setgbuf(char* buf) {
-	static char* b = 0;
-	if (buf == 0)
+	static char* b = nullptr;
+	if (buf == nullptr)
 		gbuf = b;
 	else {
 		gbuf = buf;
@ -63,6 +63,7 @@ T getInt(const char*& buf){
 	}
 	return ret;
 }
+
 template<class T> 
 char* intToString(T val, char* buf){

@ -275,6 +276,44 @@ inline const char* str(const bool& v) {
 	return v ? "true" : "false";
 }

+
+// Allocates the per-session deallocator map and puts the session into its
+// initial state.
+Context::Context() {
+    using memory_map_t = std::unordered_map<void*, deallocator_t>;
+    current.memory_map = new memory_map_t;
+    init_session();
+}
+
+// Destroys the deallocator map created by the constructor. The registered
+// deallocators are not invoked here.
+Context::~Context() {
+    delete static_cast<std::unordered_map<void*, deallocator_t>*>(this->current.memory_map);
+}
+
+// Starts a fresh session: forgets every registered allocation (without
+// freeing it) and, at LOG_INFO level, zeroes the session statistics.
+void Context::init_session(){
+    using memory_map_t = std::unordered_map<void*, deallocator_t>;
+
+    if (log_level == LOG_INFO){
+        memset(&(this->current.stats), 0, sizeof(Session::Statistic));
+    }
+    static_cast<memory_map_t*>(this->current.memory_map)->clear();
+}
+
+// Ends the session: runs every registered deallocator on its pointer, then
+// empties the map.
+void Context::end_session(){
+    using memory_map_t = std::unordered_map<void*, deallocator_t>;
+    auto* allocations = static_cast<memory_map_t*>(this->current.memory_map);
+
+    for (auto it = allocations->begin(); it != allocations->end(); ++it) {
+        const auto release = it->second;
+        release(it->first);
+    }
+    allocations->clear();
+}
+
+// Looks up `fname` in the module function map.
+// Returns the stored function pointer, or nullptr when the name is not
+// registered, when `fname` is null, or when no map has been installed.
+void* Context::get_module_function(const char* fname){
+    auto fmap = static_cast<std::unordered_map<std::string, void*>*>
+        (this->module_function_maps);
+    // Building a std::string key from a null pointer, or dereferencing a
+    // missing map, would be undefined behavior.
+    if (fname == nullptr || fmap == nullptr)
+        return nullptr;
+    auto ret = fmap->find(fname);
+    return ret == fmap->end() ? nullptr : ret->second;
+}
+
 // template<typename _Ty>
 // inline void vector_type<_Ty>::out(uint32_t n, const char* sep) const
 // {
@ -288,3 +327,195 @@ inline const char* str(const bool& v) {
 // 	}
 // 	std::cout << ')';
 // }
+
+#include "gc.h"
+#include <utility>
+#include <thread>
+#ifndef __AQ_USE_THREADEDGC__
+
+// One queued allocation: the pointer to release and the function to release it with.
+struct gcmemory_t{
+	void* memory;
+	void (*deallocator)(void*);
+};
+
+// The GC queues (GC::q / GC::q_back) are cast to this type before use.
+using memoryqueue_t = gcmemory_t*;
+// Currently a no-op: the thread-id based locking sketched below is disabled.
+void GC::acquire_lock() {
+	// auto this_tid = std::this_thread::get_id();
+	// while(lock != this_tid)
+	// {
+	// 	while(lock != this_tid && lock != std::thread::id()) {
+	// 		std::this_thread::sleep_for(std::chrono::milliseconds(0));
+	// 	}
+	// 	lock = this_tid;
+	// }
+}
+
+// Currently a no-op counterpart of acquire_lock(); the original body is disabled.
+void GC::release_lock(){
+	// lock = std::thread::id();
+}
+
+// Sweeps the active queue: swaps it with the spare queue under `lock`, then
+// runs every queued deallocator and clears the swept entries.
+void GC::gc()
+{
+	if (slot_pos == 0)
+		return;
+	// Clear the "work pending" flag before the swap rather than after the sweep:
+	// a reg() that arrives while we are sweeping sets it again, and resetting it
+	// at the end would leave that new entry unnoticed until the next reg().
+	running = false;
+	auto _q = static_cast<memoryqueue_t>(q);
+	auto _q_back = static_cast<memoryqueue_t>(q_back);
+	// Block new registrations and wait for in-flight ones to finish.
+	lock = true;
+	while(alive_cnt != 0)
+		std::this_thread::yield();
+	q = _q_back;
+	uint32_t _slot = slot_pos;
+	slot_pos = 0;
+	current_size = 0;
+	lock = false;
+	// The queue we are about to sweep becomes the spare for the next cycle.
+	q_back = _q;
+
+	for(uint32_t i = 0; i < _slot; ++i){
+		if (_q[i].memory != nullptr && _q[i].deallocator != nullptr)
+			_q[i].deallocator(_q[i].memory);
+	}
+	memset(_q, 0, sizeof(gcmemory_t) * _slot);
+}
+
+// Body of the background thread. While `alive` is set it either polls every
+// `interval` microseconds (when work is pending) or idles in 10 ms steps.
+// A sweep happens when the queued size exceeds max_size or when more than
+// forced_clean microseconds have accumulated since the last sweep.
+void GC::daemon() {
+	using namespace std::chrono;
+
+	while (alive) {
+		if (running) {
+			// Direct comparison: both operands are unsigned, so testing
+			// `current_size - max_size > 0` would be true whenever they differ.
+			if (current_size > max_size || 
+				forceclean_timer > forced_clean) 
+			{
+				gc();
+				forceclean_timer = 0;
+			}
+			std::this_thread::sleep_for(microseconds(interval));
+			forceclean_timer += interval;
+		}
+		else {
+			std::this_thread::sleep_for(10ms);
+			// 10 ms expressed in microseconds, the unit of forceclean_timer.
+			forceclean_timer += 10000;
+		}
+	}
+}
+
+// Allocates both queues, resets all bookkeeping and launches the daemon thread.
+void GC::start_deamon() {
+	// Both queues hold gcmemory_t entries and are swapped by gc(), so they must
+	// have the same element type and size. Value-initialize them so gc() never
+	// inspects indeterminate entries.
+	const size_t capacity = max_slots << 1;
+	q = new gcmemory_t[capacity]();
+	q_back = new gcmemory_t[capacity]();
+	lock = false;
+	slot_pos = 0;
+	current_size = 0;
+	alive_cnt = 0;
+	// No work is pending yet; the daemon reads this flag as soon as it starts.
+	running = false;
+	alive = true;
+	handle = new std::thread(&GC::daemon, this);
+}
+
+// Stops the daemon thread and releases the queues.
+// Entries still queued at this point are not passed to their deallocators.
+void GC::terminate_daemon() {
+	running = false;
+	alive = false;
+
+	// Wait for the daemon to leave its loop before touching the queues: it may
+	// still be inside gc() reading them. join() blocks for as long as needed,
+	// so no fixed sleep is required.
+	auto _handle = static_cast<std::thread*>(handle);
+	if (_handle != nullptr) {
+		if (_handle->joinable()) {
+			_handle->join();
+		}
+		delete _handle;
+		handle = nullptr;
+	}
+
+	delete[] static_cast<memoryqueue_t>(q);
+	delete[] static_cast<memoryqueue_t>(q_back);
+	q = nullptr;
+	q_back = nullptr;
+}
+
+// Registers `v` for deferred release through `f`.
+//  - null `v` or `f`: nothing happens;
+//  - `sz` below `threshould`: `f(v)` runs immediately;
+//  - otherwise the pair is queued for the daemon, or released immediately if
+//    the queue is full.
+void GC::reg(void* v, uint32_t sz, void(*f)(void*)) { //~ 40ns expected v. free ~ 75ns
+	if (v == nullptr || f == nullptr)
+		return;
+	if (sz < threshould){
+		f(v);
+		return;
+	}
+
+	// Enter the critical region. alive_cnt is raised *before* the final lock
+	// check: gc() sets lock and then waits for alive_cnt to drop to zero, so
+	// either we see the lock and back off, or gc() sees us and waits.
+	for (;;) {
+		while(lock);
+		++alive_cnt;
+		if (!lock)
+			break;
+		--alive_cnt;
+	}
+
+	// Read the active queue only now; gc() may have swapped it while we waited.
+	auto _q = static_cast<memoryqueue_t>(q);
+	// fetch_add yields the index to write to (0-based), which is the range
+	// gc() later sweeps: [0, slot_pos).
+	const uint32_t _slot = slot_pos.fetch_add(1);
+	const bool queued = static_cast<size_t>(_slot) < (max_slots << 1);
+	if (queued) {
+		_q[_slot] = {v, f};
+		current_size += sz;
+	}
+	else {
+		// Queue full: give the slot back so slot_pos never exceeds the capacity.
+		slot_pos.fetch_sub(1);
+	}
+	--alive_cnt;
+	running = true;
+
+	// Could not be queued: release right away instead of writing out of bounds.
+	if (!queued)
+		f(v);
+}
+
+#endif
+
+GC* GC::gc_handle = nullptr;
+
+#include "dragonbox/dragonbox_to_chars.hpp" 
+
+
+// Formats the float that `value` points to with dragonbox and returns the
+// pointer dragonbox reports as the end of the output.
+template<>
+char*
+aq_to_chars<float>(void* value, char* buffer) { 
+    const float number = *static_cast<float*>(value);
+    return jkj::dragonbox::to_chars_n(number, buffer);
+}
+// Formats the double that `value` points to with dragonbox and returns the
+// pointer dragonbox reports as the end of the output.
+template<>
+char*
+aq_to_chars<double>(void* value, char* buffer) { 
+    const double number = *static_cast<double*>(value);
+    return jkj::dragonbox::to_chars_n(number, buffer);
+}
+
+// Writes "true" or "false" (no terminator) for the bool that `value` points to
+// and returns the position after the last character.
+// NOTE(review): this specialization is `inline` and only visible in this
+// translation unit - confirm whether other files are meant to pick it up.
+template<>
+inline char*
+aq_to_chars<bool>(void* value, char* buffer) {
+	const bool flag = *static_cast<bool*>(value);
+	const char* text = flag ? "true" : "false";
+	const int length = flag ? 4 : 5;
+	memcpy(buffer, text, length);
+	return buffer + length;
+}
+
+// Copies the C string that `*value` points to into `buffer` (without the
+// terminator) and returns the position after the last copied character.
+// A null string pointer writes nothing and returns `buffer` unchanged.
+template<>
+char*
+aq_to_chars<char*>(void* value, char* buffer) {
+	const auto src = *static_cast<char**>(value);
+	// strlen on a null pointer is undefined; treat it as an empty string.
+	if (src == nullptr)
+		return buffer;
+	const auto len = strlen(src);
+	memcpy(buffer, src, len);
+	return buffer + len;
+}
+
+// Writes the date as year-month-day, each field printed by to_text() and
+// separated by '-'. Returns the position after the last character.
+template<>
+char*
+aq_to_chars<types::date_t>(void* value, char* buffer) {
+	const auto* date = static_cast<types::date_t*>(value);
+	char* out = to_text(buffer, date->year);
+	*out = '-';
+	out = to_text(out + 1, date->month);
+	*out = '-';
+	return to_text(out + 1, date->day);
+}
+
+// Writes the time as hours:minutes:seconds:ms, each field printed by to_text()
+// and separated by ':'. Returns the position after the last character.
+template<>
+char*
+aq_to_chars<types::time_t>(void* value, char* buffer) {
+	const auto* time = static_cast<types::time_t*>(value);
+	char* out = to_text(buffer, time->hours);
+	*out = ':';
+	out = to_text(out + 1, time->minutes);
+	*out = ':';
+	out = to_text(out + 1, time->seconds);
+	*out = ':';
+	return to_text(out + 1, time->ms);
+}
+
+// Writes the timestamp as "<date> <time>" by delegating to the date_t and
+// time_t specializations. Returns the position after the last character.
+template<>
+char*
+aq_to_chars<types::timestamp_t>(void* value, char* buffer) {
+	auto* stamp = static_cast<types::timestamp_t*>(value);
+	char* out = aq_to_chars<types::date_t>(static_cast<void*>(&stamp->date), buffer);
+	*out = ' ';
+	return aq_to_chars<types::time_t>(static_cast<void*>(&stamp->time), out + 1);
+}
+
+
--- a/server/libaquery.h
+++ b/server/libaquery.h
@ -1,8 +1,37 @@
 #ifndef _AQUERY_H
 #define _AQUERY_H

-#include "table.h"
+#ifdef __INTELLISENSE__
+	#define __AQUERY_ITC_USE_SEMPH__
+	#define THREADING
+	#define __AQ_THREADED_GC__
+#endif
+
 #include <unordered_map>
+#include <chrono>
+// Small stopwatch on top of std::chrono::high_resolution_clock.
+// Readings are raw tick counts of that clock's duration type.
+class aq_timer {
+private:
+	using clock_type = std::chrono::high_resolution_clock;
+	// Start of the interval currently being measured.
+	clock_type::time_point now;
+public:
+	aq_timer() : now(clock_type::now()) {}
+
+	// Restarts the measurement from the current instant.
+	void reset(){
+		now = clock_type::now();
+	}
+
+	// Ticks since the last start, then restarts the measurement.
+	long long elapsed(){
+		const long long ticks = lap();
+		reset();
+		return ticks;
+	}
+
+	// Ticks since the last start; the measurement keeps running.
+	long long lap() const{
+		const auto delta = clock_type::now() - now;
+		return delta.count();
+	}
+};
+
+#include "table.h"
+

 enum Log_level {
 	LOG_INFO,
@ -15,9 +44,16 @@ enum Backend_Type {
 	BACKEND_MonetDB,
 	BACKEND_MariaDB
 };
+
+// Per-query timing figures, embedded in Config as `stats`.
+// NOTE(review): units are not visible here - presumably aq_timer tick counts; confirm against the writer.
+struct QueryStats{
+	long long monet_time;     // presumably time spent in the MonetDB backend - confirm
+	long long postproc_time;  // presumably time spent in post-processing - confirm
+};
 struct Config{
-    int running, new_query, server_mode,
-	 	backend_type, has_dll, exec_time, n_buffers;
+    int running, new_query, server_mode, 
+	 	backend_type, has_dll, 
+		n_buffers;
+	QueryStats stats;
    int buffer_sizes[];
 };

@ -47,7 +83,10 @@ struct Context{
 #ifdef THREADING
 	void* thread_pool;
 #endif	
-	printf_type print = printf;
+#ifdef __AQ_THREADED_GC__
+	void* gc;
+#endif
+	printf_type print = &printf;
 	Context();
 	virtual ~Context();
 	template <class ...Types>
@ -67,6 +106,8 @@ struct Context{
    std::unordered_map<const char*, uColRef *> cols;
 };

+
+
 #ifdef _WIN32
 #define __DLLEXPORT__  __declspec(dllexport) __stdcall 
 #else 
@ -76,4 +117,40 @@ struct Context{
 #define __AQEXPORT__(_Ty) extern "C" _Ty __DLLEXPORT__ 
 typedef void (*deallocator_t) (void*);

+
+#include <type_traits>
+#include "jeaiii_to_text.h"
+
+// Integral types: reads a T through `value`, prints it with to_text() and
+// returns the pointer to_text() reports (one past the last character written).
+template<class T>
+inline std::enable_if_t<std::is_integral_v<T>, char *> 
+aq_to_chars(void* value, char* buffer) { 
+	return to_text(buffer, *static_cast<T*>(value));
+}
+
+// Fallback for non-integral types without an explicit specialization:
+// writes nothing and returns `buffer` unchanged.
+template<class T>
+inline std::enable_if_t<!std::is_integral_v<T>, char *> 
+aq_to_chars(void* value, char* buffer) {
+	return buffer;
+}
+
+#ifdef __SIZEOF_INT128__
+// Signed 128-bit integers are printed by jeaiii_i128().
+template<>
+inline char*
+aq_to_chars<__int128_t>(void* value, char* buffer) {
+    const __int128_t number = *static_cast<__int128_t*>(value);
+    return jeaiii_i128<__int128_t>(buffer, number);
+}
+
+// Unsigned 128-bit integers are printed by jeaiii_i128().
+template<>
+inline char*
+aq_to_chars<__uint128_t>(void* value, char* buffer) {
+    const __uint128_t number = *static_cast<__uint128_t*>(value);
+    return jeaiii_i128<__uint128_t>(buffer, number);
+}
+#endif
+
+// Declarations of the explicit specializations defined in libaquery.cpp.
+// They must be visible before first use so these types do not fall back to
+// the generic templates above.
+template<> char* aq_to_chars<float>(void* , char*);
+template<> char* aq_to_chars<double>(void* , char*);
+template<> char* aq_to_chars<char*>(void* , char*);
+template<> char* aq_to_chars<types::date_t>(void* , char*);
+template<> char* aq_to_chars<types::time_t>(void* , char*);
+template<> char* aq_to_chars<types::timestamp_t>(void* , char*);
 #endif
--- a/server/monetdb_conn.cpp
+++ b/server/monetdb_conn.cpp
@ -2,12 +2,14 @@

 #include "libaquery.h"
 #include <cstdio>
+#include <string>
 #include "monetdb_conn.h"
 #include "monetdbe.h"
 #include "table.h"
+
 #undef static_assert

-const char* monetdbe_type_str[] = {
+constexpr const char* monetdbe_type_str[] = {
 	"monetdbe_bool", "monetdbe_int8_t", "monetdbe_int16_t", "monetdbe_int32_t", "monetdbe_int64_t",
 #ifdef HAVE_HGE
 	"monetdbe_int128_t",
@ -20,7 +22,7 @@ const char* monetdbe_type_str[] = {
 	"monetdbe_type_unknown"
 } ;

-const unsigned char monetdbe_type_szs[] = {
+inline constexpr static unsigned char monetdbe_type_szs[] = {
    sizeof(monetdbe_column_bool::null_value), sizeof(monetdbe_column_int8_t::null_value), 
    sizeof(monetdbe_column_int16_t::null_value), sizeof(monetdbe_column_int32_t::null_value), 
    sizeof(monetdbe_column_int64_t::null_value),
@ -36,7 +38,19 @@ const unsigned char monetdbe_type_szs[] = {
    1
 };

+namespace types{
+    // Lookup table indexed by a monetdbe column's `type` value (this is how
+    // Server::print_results uses it) giving the matching Type_t constant.
+    // The entry order has to stay parallel to monetdbe_type_str and
+    // monetdbe_type_szs in this file, including the HAVE_HGE-only slot.
+    constexpr const Type_t monetdbe_type_aqtypes[] = {
+        ABOOL, AINT8, AINT16, AINT32, AINT64, 
+#ifdef HAVE_HGE
+        AINT128,
+#endif
+        AUINT64, AFLOAT, ADOUBLE, ASTR, 
+        // blob slot: mapped to AINT64 here.
+        // NOTE(review): looks like a placeholder - confirm the intended blob handling.
+        AINT64,
+        ADATE, ATIME, ATIMESTAMP, ERROR

+    };
+}

 Server::Server(Context* cxt){
    if (cxt){
@ -80,7 +94,7 @@ void Server::connect(Context *cxt){
    else{
        if(server)
            free(server);
-        this->server = 0;
+        this->server = nullptr;
        status = false;
        puts(ret == -1 ? "Allocation Error." : "Internal Database Error.");
    }
@ -103,7 +117,7 @@ void Server::exec(const char* q){

 bool Server::haserror(){
    if (last_error){
-        last_error = 0;
+        last_error = nullptr;
        return true;
    }
    else{
@ -111,12 +125,53 @@ bool Server::haserror(){
    }
 }

+
+// Prints the current query result to stdout as a text table: one header line
+// with the column names, a rule of '=' characters as wide as the header, then
+// one line per row (`cnt` rows).
+//   sep: text placed between fields, and around the '|' between header names.
+//   end: text written after the header, after the rule and after every row.
+// Nothing is printed when haserror() reports a pending error. If fetching any
+// column fails, the error message is reported and no table is printed.
+void Server::print_results(const char* sep, const char* end){
+
+    if (!haserror()){
+        auto _res = static_cast<monetdbe_result*> (res);
+        const auto& ncols = _res->ncols;
+        monetdbe_column** cols = static_cast<monetdbe_column**>(malloc(sizeof(monetdbe_column*) * ncols));
+        std::string* printf_string = new std::string[ncols];
+        const char** col_data = static_cast<const char**> (malloc(sizeof(char*) * ncols));
+        uint8_t* szs = static_cast<uint8_t*>(alloca(ncols));
+        std::string header_string = "";
+        const char* err_msg = nullptr;
+        for(uint32_t i = 0; i < ncols; ++i){
+            err_msg = monetdbe_result_fetch(_res, &cols[i], i);
+            // A non-null message means the fetch failed; cols[i] must not be
+            // dereferenced in that case, so stop collecting columns.
+            if (err_msg)
+                break;
+            // Per-column printf format, followed by `sep` for every column but the last.
+            printf_string[i] =
+                std::string(types::printf_str[types::monetdbe_type_aqtypes[cols[i]->type]])
+                + (i < ncols - 1 ? sep : "");
+            // NOTE(review): the two puts() calls below echo the format string and
+            // the type name of every column; they look like debugging output.
+            puts(printf_string[i].c_str());
+            puts(monetdbe_type_str[cols[i]->type]);
+            col_data[i] = static_cast<char *>(cols[i]->data);
+            szs [i] = monetdbe_type_szs[cols[i]->type];
+            header_string = header_string + cols[i]->name + sep + '|' + sep;
+        }
+        if (err_msg){
+            printf("Error: %s\n", err_msg);
+        }
+        else{
+            const size_t l_sep = strlen(sep) + 1;
+            // size() is unsigned, so the old `size() - l_sep >= 0` test was always
+            // true and an empty header wrapped around into a huge resize().
+            if (header_string.size() >= l_sep)
+                header_string.resize(header_string.size() - l_sep);
+            header_string += end + std::string(header_string.size(), '=') + end;
+            fputs(header_string.c_str(), stdout);
+            for(uint64_t i = 0; i < cnt; ++i){
+                for(uint32_t j = 0; j < ncols; ++j){
+                    // NOTE(review): every cell is read as a pointer-sized value
+                    // regardless of szs[j]; confirm this is safe for narrow types.
+                    printf(printf_string[j].c_str(), *((void**)col_data[j]));
+                    // Advance this column's cursor by one element.
+                    col_data[j] += szs[j];
+                }
+                fputs(end, stdout);
+            }
+        }
+        free(cols);
+        delete[] printf_string;
+        free(col_data);
+    }
+}
+
 void Server::close(){
    if(this->server){
        auto server = static_cast<monetdbe_database*>(this->server);
        monetdbe_close(*(server));
        free(server);
-        this->server = 0;
+        this->server = nullptr;
    }
 }

@ -130,7 +185,7 @@ void* Server::getCol(int col_idx){
            auto _ret_col = static_cast<monetdbe_column*>(this->ret_col);
            cnt = _ret_col->count;
             printf("Dbg: Getting col %s, type: %s\n", 
-                 _ret_col->name, monetdbe_type_str[_ret_col->type]);
+                _ret_col->name, monetdbe_type_str[_ret_col->type]);
            return _ret_col->data;
        }
        else{
@ -140,7 +195,7 @@ void* Server::getCol(int col_idx){
    else{
        puts("Error: No result.");
    }
-    return 0;
+    return nullptr;
 }

 Server::~Server(){
@ -149,10 +204,10 @@ Server::~Server(){

 bool Server::havehge() {
 #if defined(_MONETDBE_LIB_) and defined(HAVE_HGE)
-    puts("true");
+    // puts("true");
    return HAVE_HGE;
 #else
-    puts("false");
+    // puts("false");
    return false;
 #endif
 }
--- a/server/monetdb_conn.h
+++ b/server/monetdb_conn.h
@ -22,6 +22,9 @@ struct Server{
    void close();
    bool haserror();
    static bool havehge();
+    void test(const char*);
+    void print_results(const char* sep = " ", const char* end = "\n");
+    friend void print_monetdb_results(Server* srv, const char* sep, const char* end, int limit);
    ~Server();
 };

--- a/server/server.cpp
+++ b/server/server.cpp
@ -1,47 +1,137 @@
 #include "pch_msc.hpp"

-#include "../csv.h"
 #include <iostream>
 #include <string>
 #include <chrono>
+#include <thread>

 #include "libaquery.h"
 #include "monetdb_conn.h"
 #ifdef THREADING
 #include "threading.h"
 #endif
+
 #ifdef _WIN32
 #include "winhelper.h"
 #else 
 #include <dlfcn.h>
 #include <fcntl.h>
 #include <sys/mman.h>
+#include <atomic>
+
+// fast numeric to string conversion
+#include "jeaiii_to_text.h"
+#include "dragonbox/dragonbox_to_chars.h"
+
 struct SharedMemory
 {
+    std::atomic<bool> a;
    int hFileMap;
    void* pData;
-    SharedMemory(const char* fname) {
+    explicit SharedMemory(const char* fname) {
        hFileMap = open(fname, O_RDWR, 0);
        if (hFileMap != -1)
-            pData = mmap(NULL, 8, PROT_READ | PROT_WRITE, MAP_SHARED, hFileMap, 0);
+            pData = mmap(nullptr, 8, PROT_READ | PROT_WRITE, MAP_SHARED, hFileMap, 0);
        else 
-            pData = 0;
+            pData = nullptr;
    }
-    void FreeMemoryMap() {
+    void FreeMemoryMap() const {
+        // automatically unmapped in posix
+    }
+};

+#ifndef __USE_STD_SEMAPHORE__
+#ifdef __APPLE__
+#include <dispatch/dispatch.h>
+// A_Semaphore variant for __APPLE__ builds, wrapping a libdispatch semaphore.
+class A_Semaphore {
+private:
+	dispatch_semaphore_t native_handle;
+public:
+	// v is passed to dispatch_semaphore_create as the starting value:
+	// true -> 1, false -> 0.
+	A_Semaphore(bool v = false) {
+		native_handle = dispatch_semaphore_create(v);
+	}
+	// Waits on the semaphore with no timeout (DISPATCH_TIME_FOREVER).
+	void acquire() {
+        // puts("acquire");
+		dispatch_semaphore_wait(native_handle, DISPATCH_TIME_FOREVER);
+	}
+	// Signals the semaphore.
+	void release() {
+        // puts("release");
+		dispatch_semaphore_signal(native_handle);
+	}
+	// NOTE(review): the destructor is empty, so native_handle is not released
+	// here - confirm whether that is intentional.
+	~A_Semaphore() {
+	}
+};
+#else
+#include <semaphore.h>
+// A_Semaphore variant for non-Apple POSIX builds, wrapping an unnamed sem_t.
+class A_Semaphore {
+private:
+	sem_t native_handle;
+public:
+	// v: initial state. true starts the semaphore available (count 1), false
+	// starts it unavailable (count 0), like the other A_Semaphore variants in
+	// this file.
+	A_Semaphore(bool v = false) {
+		// sem_init(sem, pshared, value): the second argument selects
+		// process sharing, the third is the initial count. The previous call
+		// passed `v` as pshared and always started the count at 1.
+		sem_init(&native_handle, 0, v ? 1u : 0u);
+	}
+	// Blocks until the count is positive, then decrements it.
+	void acquire() {
+		sem_wait(&native_handle);
+	}
+	// Increments the count, waking one blocked acquire() if there is one.
+	void release() {
+		sem_post(&native_handle);
+	}
+	// Destroys the underlying sem_t.
+	~A_Semaphore() {
+		sem_destroy(&native_handle);
+	}
+};
+#endif
+#endif
+#endif
+
+#ifdef __USE_STD_SEMAPHORE__
+#define __AQUERY_ITC_USE_SEMPH__
+#include <semaphore>
+// A_Semaphore variant used when __USE_STD_SEMAPHORE__ is defined; wraps the
+// C++20 std::binary_semaphore.
+class A_Semaphore {
+private:
+    std::binary_semaphore native_handle;
+public:
+    // v: initial state. true starts available (count 1), false starts
+    // unavailable (count 0). std::binary_semaphore has no default constructor
+    // and cannot be assigned, so it is built in the member-initializer list.
+    A_Semaphore(bool v = false) : native_handle(v ? 1 : 0) {
+    }
+    // Blocks until the semaphore is available, then takes it.
+    void acquire() {
+        native_handle.acquire();
     }
+    // Makes the semaphore available again.
+    void release() {
+        native_handle.release();
+    }
+    ~A_Semaphore() { }
 };
 #endif

-#include "aggregations.h"
+// Hand-off macros between the prompt side and the engine loop.
+// With __AQUERY_ITC_USE_SEMPH__ (defined above only in the
+// __USE_STD_SEMAPHORE__ branch) they drive two global semaphores: `prompt`
+// starts available, `engine` starts unavailable.
+#ifdef __AQUERY_ITC_USE_SEMPH__
+A_Semaphore prompt{ true }, engine{ false };
+#define PROMPT_ACQUIRE() prompt.acquire()
+#define PROMPT_RELEASE() prompt.release()
+#define ENGINE_ACQUIRE() engine.acquire()
+#define ENGINE_RELEASE() engine.release()
+#else
+// Without semaphores three of the macros expand to nothing and
+// PROMPT_RELEASE() becomes a zero-length sleep.
+#define PROMPT_ACQUIRE() 
+#define PROMPT_RELEASE() std::this_thread::sleep_for(std::chrono::nanoseconds(0))
+#define ENGINE_ACQUIRE() 
+#define ENGINE_RELEASE() 
+#endif
+
 typedef int (*code_snippet)(void*);
 typedef void (*module_init_fn)(Context*);

-int test_main();

 int n_recv = 0;
 char** n_recvd = nullptr;

+// Exported with C linkage. Expands PROMPT_ACQUIRE(): acquires the `prompt`
+// semaphore when semaphores are enabled, otherwise does nothing.
+__AQEXPORT__(void) wait_engine(){
+    PROMPT_ACQUIRE();
+}
+
+// Exported with C linkage. Expands ENGINE_RELEASE(): releases the `engine`
+// semaphore when semaphores are enabled, otherwise does nothing.
+__AQEXPORT__(void) wake_engine(){
+    ENGINE_RELEASE();
+}
+
 extern "C" void __DLLEXPORT__ receive_args(int argc, char**argv){
    n_recv = argc;
    n_recvd = argv;
@ -71,42 +161,99 @@ __AQEXPORT__(bool) have_hge(){
 #endif
 }

-Context::Context() {
-    current.memory_map = new std::unordered_map<void*, deallocator_t>;
-    init_session();
-}
+using prt_fn_t = char* (*)(void*, char*);

-Context::~Context() {
-    auto memmap = (std::unordered_map<void*, deallocator_t>*) this->current.memory_map;
-    delete memmap;
-}

-void Context::init_session(){
-    if (log_level == LOG_INFO){
-        memset(&(this->current.stats), 0, sizeof(Session::Statistic));
-    }
-    auto memmap = (std::unordered_map<void*, deallocator_t>*) this->current.memory_map;
-    memmap->clear();
-}
+// Table of per-type formatters, indexed by a monetdbe column's `type` value
+// (print_monetdb_results looks entries up that way). Entries instantiated with
+// std::nullptr_t resolve to the non-integral aq_to_chars fallback unless a
+// specialization exists elsewhere.
+// NOTE(review): the 128-bit slot is guarded by __SIZEOF_INT128__ here but by
+// HAVE_HGE in monetdb_conn.cpp's tables - confirm both agree with the
+// monetdbe type enum so the indices stay aligned.
+constexpr prt_fn_t monetdbe_prtfns[] = {
+	aq_to_chars<bool>, aq_to_chars<int8_t>, aq_to_chars<int16_t>, aq_to_chars<int32_t>, 
+	aq_to_chars<int64_t>,
+#if __SIZEOF_INT128__
+	aq_to_chars<__int128_t>, 
+#endif
+	aq_to_chars<size_t>, aq_to_chars<float>, aq_to_chars<double>,
+	aq_to_chars<char*>, aq_to_chars<std::nullptr_t>,
+	aq_to_chars<types::date_t>, aq_to_chars<types::time_t>, aq_to_chars<types::timestamp_t>,
+
+	// should be last:
+	aq_to_chars<std::nullptr_t>
+};
+
+#include "monetdbe.h"
+// Per-element byte size for each monetdbe column type, indexed by the
+// column's `type` value; print_monetdb_results uses it as the stride when
+// stepping through a column's data. Entry order must stay parallel to
+// monetdbe_prtfns above.
+inline constexpr static unsigned char monetdbe_type_szs[] = {
+    sizeof(monetdbe_column_bool::null_value), sizeof(monetdbe_column_int8_t::null_value), 
+    sizeof(monetdbe_column_int16_t::null_value), sizeof(monetdbe_column_int32_t::null_value), 
+    sizeof(monetdbe_column_int64_t::null_value),
+#ifdef __SIZEOF_INT128__
+    sizeof(monetdbe_column_int128_t::null_value),
+#endif
+    sizeof(monetdbe_column_size_t::null_value), sizeof(monetdbe_column_float::null_value),
+    sizeof(monetdbe_column_double::null_value),
+    sizeof(monetdbe_column_str::null_value), sizeof(monetdbe_column_blob::null_value),
+    sizeof(monetdbe_data_date), sizeof(monetdbe_data_time), sizeof(monetdbe_data_timestamp),
+
+    // should be last:
+    1
+};
+constexpr uint32_t output_buffer_size = 65536;
+// Writes the result set held by `srv` to stdout as text: a header line of
+// column names, a rule of '=' characters, then one line per row. Rows are
+// formatted into a stack buffer and flushed with fwrite in large chunks.
+//   srv:   server whose `res` / `cnt` describe the result to print.
+//   sep:   text placed between fields (at most 512 bytes).
+//   end:   text written after the header, the rule and each row (at most 512 bytes).
+//   limit: maximum number of rows to print; the default prints every row.
+// Nothing is printed when srv reports an error, has no rows, or limit is 0.
+void print_monetdb_results(Server* srv, const char* sep = " ", const char* end = "\n", 
+    uint32_t limit = std::numeric_limits<uint32_t>::max()) {
+    if (!srv->haserror() && srv->cnt && limit){
+        char buffer[output_buffer_size];
+        auto _res = static_cast<monetdbe_result*> (srv->res);
+        const auto& ncols = _res->ncols;
+        monetdbe_column** cols = static_cast<monetdbe_column**>(malloc(sizeof(monetdbe_column*) * ncols));
+        prt_fn_t *prtfns = (prt_fn_t*) alloca(sizeof(prt_fn_t) * ncols);
+        char** col_data = static_cast<char**> (alloca(sizeof(char*) * ncols));
+        uint8_t* szs = static_cast<uint8_t*>(alloca(ncols));
+        std::string header_string = "";
+        const char* err_msg = nullptr;
+        const size_t l_sep = strlen(sep);
+        const size_t l_end = strlen(end);
+        char* _buffer = buffer;
+        // Row count actually printed: `limit` used to be accepted but ignored.
+        // Declared before the first goto so no jump crosses an initialization.
+        uint64_t n_rows = srv->cnt;
+        if (n_rows > limit)
+            n_rows = limit;
+
+        for(uint32_t i = 0; i < ncols; ++i){
+            err_msg = monetdbe_result_fetch(_res, &cols[i], i);
+            if(err_msg) { goto cleanup; }
+            col_data[i] = static_cast<char *>(cols[i]->data);
+            prtfns[i] = monetdbe_prtfns[cols[i]->type];
+            szs [i] = monetdbe_type_szs[cols[i]->type];
+            header_string = header_string + cols[i]->name + sep + '|' + sep;
+        }

-void Context::end_session(){
-    auto memmap = (std::unordered_map<void*, deallocator_t>*) this->current.memory_map;
-    for (auto& mem : *memmap) {
-        mem.second(mem.first);
+        if(l_sep > 512 || l_end > 512) {
+            puts("Error: separator or end string too long");
+            goto cleanup;
+        }
+        // size() is unsigned, so the old `size() - l_sep - 1 >= 0` test was
+        // always true and a short header wrapped around into a huge resize().
+        if (header_string.size() >= l_sep + 1)
+            header_string.resize(header_string.size() - l_sep - 1);
+        header_string += end + std::string(header_string.size(), '=') + end;
+        fputs(header_string.c_str(), stdout);
+        for(uint64_t i = 0; i < n_rows; ++i){
+            for(uint32_t j = 0; j < ncols; ++j){
+                //copy the field to buf
+                _buffer = prtfns[j](col_data[j], _buffer);
+                if (j != ncols - 1){
+                    memcpy(_buffer, sep, l_sep);
+                    _buffer += l_sep;
+                }
+                // Advance this column's cursor by one element.
+                col_data[j] += szs[j];
+            }
+            memcpy(_buffer, end, l_end);
+            _buffer += l_end;
+            // Flush once fewer than 1024 bytes remain.
+            // NOTE(review): the check runs only between rows, so a single row
+            // longer than the remaining space would overrun `buffer`.
+            if(output_buffer_size - (_buffer - buffer) <= 1024){
+                fwrite(buffer, 1, _buffer - buffer, stdout);
+                _buffer = buffer;
+            }
+        }
+        // One extra `end` after the last row, then flush whatever is pending.
+        memcpy(_buffer, end, l_end);
+        _buffer += l_end;
+        if (_buffer != buffer)
+            fwrite(buffer, 1, _buffer - buffer, stdout);
+cleanup:        
+        free(cols);
     }
-    memmap->clear();
 }

-void* Context::get_module_function(const char* fname){
-    auto fmap = static_cast<std::unordered_map<std::string, void*>*>
-        (this->module_function_maps);
-    // printf("%p\n", fmap->find("mydiv")->second);
-    //  for (const auto& [key, value] : *fmap){
-    //      printf("%s %p\n", key.c_str(), value);
-    //  }
-    auto ret = fmap->find(fname);
-    return ret == fmap->end() ? nullptr : ret->second;
-}

 void initialize_module(const char* module_name, void* module_handle, Context* cxt){
    auto _init_module = reinterpret_cast<module_init_fn>(dlsym(module_handle, "init_session"));
@ -119,15 +266,16 @@ void initialize_module(const char* module_name, void* module_handle, Context* cx
 }

 int dll_main(int argc, char** argv, Context* cxt){
+    aq_timer timer;
    Config *cfg = reinterpret_cast<Config *>(argv[0]);
    std::unordered_map<std::string, void*> user_module_map;
-    if (cxt->module_function_maps == 0)
+    if (cxt->module_function_maps == nullptr)
        cxt->module_function_maps = new std::unordered_map<std::string, void*>();
    auto module_fn_map = 
        static_cast<std::unordered_map<std::string, void*>*>(cxt->module_function_maps);
    
    auto buf_szs = cfg->buffer_sizes;
-    void** buffers = (void**)malloc(sizeof(void*) * cfg->n_buffers);
+    void** buffers = (void**) malloc (sizeof(void*) * cfg->n_buffers);
    for (int i = 0; i < cfg->n_buffers; i++) 
        buffers[i] = static_cast<void *>(argv[i + 1]);

@ -135,19 +283,28 @@ int dll_main(int argc, char** argv, Context* cxt){
    cxt->cfg = cfg;
    cxt->n_buffers = cfg->n_buffers;
    cxt->sz_bufs = buf_szs;
-    cxt->alt_server = NULL;
-
+    if (cfg->backend_type == BACKEND_MonetDB && cxt->alt_server == nullptr)
+    {
+        auto alt_server = new Server(cxt);
+        alt_server->exec("SELECT '**** WELCOME TO AQUERY++! ****';");
+        puts(*(const char**)(alt_server->getCol(0)));
+        cxt->alt_server = alt_server;
+    }
    while(cfg->running){
+        ENGINE_ACQUIRE();
        if (cfg->new_query) {
-            void *handle = 0;
-            void *user_module_handle = 0;
+            cfg->stats.postproc_time = 0;
+            cfg->stats.monet_time = 0;
+
+            void *handle = nullptr;
+            void *user_module_handle = nullptr;
            if (cfg->backend_type == BACKEND_MonetDB){
-                if (cxt->alt_server == 0)
+                if (cxt->alt_server == nullptr)
                    cxt->alt_server = new Server(cxt);
                Server* server = reinterpret_cast<Server*>(cxt->alt_server);
                if(n_recv > 0){
                    if (cfg->backend_type == BACKEND_AQuery || cfg->has_dll) {
-                        handle = dlopen("./dll.so", RTLD_LAZY);
+                        handle = dlopen("./dll.so", RTLD_NOW);
                    }
                    for (const auto& module : user_module_map){
                        initialize_module(module.first.c_str(), module.second, cxt);
@ -159,14 +316,18 @@ int dll_main(int argc, char** argv, Context* cxt){
                        switch(n_recvd[i][0]){
                        case 'Q': // SQL query for monetdbe
                            {
+                                timer.reset();
                                server->exec(n_recvd[i] + 1);
-                                printf("Exec Q%d: %s", i, n_recvd[i]);
+                                cfg->stats.monet_time += timer.elapsed();
+                                // printf("Exec Q%d: %s", i, n_recvd[i]);
                            }
                            break;
                        case 'P': // Postprocessing procedure 
                            if(handle && !server->haserror()) {
                                code_snippet c = reinterpret_cast<code_snippet>(dlsym(handle, n_recvd[i]+1));
+                                timer.reset();
                                c(cxt);
+                                cfg->stats.postproc_time += timer.elapsed();
                            }
                            break;
                        case 'M': // Load Module
@ -193,12 +354,21 @@ int dll_main(int argc, char** argv, Context* cxt){
                                //printf("F::: %p\n", module_fn_map->find("mydiv") != module_fn_map->end() ? module_fn_map->find("mydiv")->second : nullptr);
                            }
                            break;
+                        case 'O':
+                            {
+                                if(!server->haserror()){
+                                    timer.reset();
+                                    print_monetdb_results(server);        
+                                    cfg->stats.postproc_time += timer.elapsed();
+                                }
+                            }
+                            break;
                        case 'U': // Unload Module
                            {
                                auto mname = n_recvd[i] + 1;
                                auto it = user_module_map.find(mname);
                                if (user_module_handle == it->second)
-                                    user_module_handle = 0;
+                                    user_module_handle = nullptr;
                                dlclose(it->second);
                                user_module_map.erase(it);
                            }
@ -207,8 +377,9 @@ int dll_main(int argc, char** argv, Context* cxt){
                    }
                    if(handle) {
                        dlclose(handle);
-                        handle = 0;
+                        handle = nullptr;
                    }
+                    printf("%lld, %lld", cfg->stats.monet_time, cfg->stats.postproc_time);
                    cxt->end_session();
                    n_recv = 0;
                }
@ -217,7 +388,7 @@ int dll_main(int argc, char** argv, Context* cxt){
                }   
                else{
                    server->last_error = nullptr;
-                    continue;
+                    //goto finalize;
                } 
            }
            
@ -230,9 +401,11 @@ int dll_main(int argc, char** argv, Context* cxt){
            if (handle) dlclose(handle);
            cfg->new_query = 0;
        }
-        std::this_thread::sleep_for(std::chrono::milliseconds(100));
+        //puts(cfg->running? "true": "false");
+//finalize:
+        PROMPT_RELEASE();
    }
-
+    
    return 0;
 }

@ -263,20 +436,21 @@ extern "C" int __DLLEXPORT__ main(int argc, char** argv) {
 #ifdef __AQ_BUILD_LAUNCHER__
   return launcher(argc, argv);
 #endif
-   puts("running");
+   // puts("running");
   Context* cxt = new Context();
-   cxt->log("%d %s\n", argc, argv[1]);
+   // cxt->log("%d %s\n", argc, argv[1]);

 #ifdef THREADING
    auto tp = new ThreadPool();
    cxt->thread_pool = tp;
 #endif
    
+#ifdef __AQ_THREADED_GC__
+    cxt->gc_thread = new std::thread(gc_thread, cxt);
+#endif    
   const char* shmname;
   if (argc < 0)
        return dll_main(argc, argv, cxt);
-   else if (argc <= 1)
-        return test_main();
   else
       shmname = argv[1];
   SharedMemory shm = SharedMemory(shmname);
@ -310,56 +484,3 @@ extern "C" int __DLLEXPORT__ main(int argc, char** argv) {
   return 0;
 }

-#include "utils.h"
-#include "table_ext_monetdb.hpp"
-int test_main()
-{
-    Context* cxt = new Context();
-    if (cxt->alt_server == 0)
-        cxt->alt_server = new Server(cxt);
-    Server* server = reinterpret_cast<Server*>(cxt->alt_server);
-
-    const char* qs[]= {
-        "QCREATE TABLE trade(stocksymbol INT, time INT, quantity INT, price INT);",
-        "QCOPY OFFSET 2 INTO trade FROM  'w:/gg/AQuery++/data/trade_numerical.csv'  ON SERVER    USING DELIMITERS  ',';",
-        "QSELECT stocksymbol, (SUM((quantity * price)) / SUM(quantity)) AS weighted_average  FROM trade GROUP BY stocksymbol  ;",
-        "Pdll_5lYrMY",
-        "QSELECT stocksymbol, price  FROM trade ORDER BY time  ;",
-        "Pdll_4Sg6Ri",
-        "QSELECT stocksymbol, quantity, price  FROM trade ORDER BY time  ;",
-        "Pdll_5h4kL2",
-        "QSELECT stocksymbol, price  FROM trade ORDER BY time  ;",
-        "Pdll_7tEWCO",
-        "QSELECT query_c.weighted_moving_averages, query_c.stocksymbol  FROM query_c;",
-        "Pdll_7FCPnF"
-    };
-    n_recv = sizeof(qs)/(sizeof (char*));
-	n_recvd = const_cast<char**>(qs);
-            void* handle = 0;
-                    handle = dlopen("./dll.so", RTLD_LAZY);
-                    cxt->init_session();
-                    for (int i = 0; i < n_recv; ++i)
-                    {
-                        //printf("%s, %d\n", n_recvd[i], n_recvd[i][0] == 'Q');
-                        switch (n_recvd[i][0]) {
-                        case 'Q': // SQL query for monetdbe
-                        {
-                            server->exec(n_recvd[i] + 1);
-                            printf("Exec Q%d: %s\n", i, n_recvd[i]);
-                        }
-                        break;
-                        case 'P': // Postprocessing procedure 
-                            if (handle && !server->haserror()) {
-                                code_snippet c = reinterpret_cast<code_snippet>(dlsym(handle, n_recvd[i] + 1));
-                                c(cxt);
-                            }
-                            break;
-                        }
-                    }
-                    n_recv = 0;
-
-    //static_assert(std::is_same_v<decltype(fill_integer_array<5, 1>()), std::integer_sequence<bool, 1,1,1,1,1>>, "");
-    
-    return 0;
-}
-
--- a/server/table.h
+++ b/server/table.h
@ -9,6 +9,7 @@
 #include <string>
 #include <algorithm>
 #include <cstdarg>
+#include <vector>
 #include "io.h"
 #include "hasher.h"

@ -74,7 +75,16 @@ public:
 		this->container = (_Ty*)container;
 		this->name = name;
 	}
-	template<template <typename ...> class VT, typename T>
+	// Initializes this column from an rvalue vector-like `v`: copies v's
+	// size, capacity and container pointer (no element copy), sets the column
+	// name, then zeroes v.capacity.
+	// NOTE(review): zeroing capacity presumably stops v from freeing the shared
+	// buffer - confirm against the VT destructor.
+	template<template <typename> class VT, typename T>
+	void initfrom(VT<T>&& v, const char* name = "") {
+		ty = types::Types<_Ty>::getType();
+		this->size = v.size;
+		this->capacity = v.capacity;
+		this->container = (_Ty*)(v.container);
+		this->name = name;
+		v.capacity = 0;
+	}
+	template<template <typename> class VT, typename T>
 	void initfrom(const VT<T>& v, const char* name = "") {
 		ty = types::Types<_Ty>::getType();
 		this->size = v.size;
@ -82,6 +92,21 @@ public:
 		this->container = (_Ty*)(v.container);
 		this->name = name;
 	}
+	// Initializes this column from a vectortype_cstorage passed by value:
+	// adopts its size, capacity and container pointer (cast to _Ty*) without
+	// copying elements, and sets the column name.
+	void initfrom(vectortype_cstorage v, const char* name = "") {
+		ty = types::Types<_Ty>::getType();
+		this->size = v.size;
+		this->capacity = v.capacity;
+		this->container = (_Ty*)v.container;
+		this->name = name;
+	}
+	// Initializes this column from a single value: resets size and capacity to
+	// zero, appends `v` with emplace_back, and sets the column name.
+	// NOTE(review): `container` is not reset here - presumably emplace_back
+	// allocates when capacity is 0; confirm.
+	template<typename T>
+	void initfrom(const T& v, const char* name = "") {
+		ty = types::Types<_Ty>::getType();
+		this->size = 0;
+		this->capacity = 0;
+		this->emplace_back(v);
+		this->name = name;
+	}
 	template <class T>
 	ColRef<_Ty>& operator =(ColRef<T>&& vt) {
 		this->container = (_Ty*)vt.container;
@ -115,8 +140,16 @@ public:
 	ColView<_Ty> operator [](const vector_type<uint32_t>& idxs) const {
 		return ColView<_Ty>(*this, idxs);
 	}
-
-	void out(uint32_t n = 4, const char* sep = " ") const {
+	// Boolean-mask selection: walks `idxs` and, for every true flag at
+	// position i, appends element i of this column to the returned vector.
+	// NOTE(review): `ret` is constructed with this->size and then filled with
+	// emplace_back - if that constructor sets the size rather than only
+	// reserving capacity, the selected values land after this->size unset
+	// slots; confirm against vector_type.
+	// NOTE(review): idxs.size() is not checked against this->size.
+	vector_type<_Ty> operator [](const std::vector<bool>& idxs) const {
+		vector_type<_Ty> ret (this->size);
+		uint32_t i = 0;
+		for(const auto& f : idxs){
+			if(f) ret.emplace_back(this->operator[](i));
+			++i;
+		}
+		return ret;
+	}
+	void out(uint32_t n = 1000, const char* sep = " ") const {
 		const char* more = "";
 		if (n < this->size)
 			more = " ... ";
@ -180,7 +213,7 @@ template<>
 class ColRef<void> : public ColRef<int> {};

 template<typename _Ty>
-class ColView {
+class ColView : public vector_base<_Ty> {
 public:
 	typedef ColRef<_Ty> Decayed_t;
 	const uint32_t size;
@ -219,7 +252,7 @@ public:
 	Iterator_t end() const {
 		return Iterator_t(idxs.end(), orig);
 	}
-	void out(uint32_t n = 4, const char* sep = " ") const {
+	void out(uint32_t n = 1000, const char* sep = " ") const {
 		n = n > size ? size : n;
 		std::cout << '(';
 		for (uint32_t i = 0; i < n; ++i)
@ -414,19 +447,27 @@ struct TableInfo {
 	}
 	template <int ...cols>
 	void print2(const char* __restrict sep = ",", const char* __restrict end = "\n",
-		const vector_type<uint32_t>* __restrict view = nullptr, FILE* __restrict fp = nullptr) const {
+		const vector_type<uint32_t>* __restrict view = nullptr, 
+		FILE* __restrict fp = nullptr, uint32_t limit = std::numeric_limits<uint32_t>::max()
+		) const {

 		std::string printf_string =
 			generate_printf_string<typename std::tuple_element<cols, tuple_type>::type ...>(sep, end);
+		// puts(printf_string.c_str());
 		std::string header_string = std::string();
 		constexpr static int a_cols[] = { cols... };
-		for (int i = 0; i < sizeof...(cols); ++i)
-			header_string += std::string(this->colrefs[a_cols[i]].name) + sep;
-		const size_t l_sep = strlen(sep);
-		if (header_string.size() - l_sep >= 0)
-			header_string.resize(header_string.size() - l_sep);
-
-		const auto& prt_loop = [&fp, &view, &printf_string, *this](const auto& f) {
+		if (fp == nullptr){
+			header_string = get_header_string(sep, end);
+			header_string.resize(header_string.size() - strlen(end));
+		}
+		else {
+			for (int i = 0; i < sizeof...(cols); ++i)
+				header_string += std::string(this->colrefs[a_cols[i]].name) + sep;
+			const size_t l_sep = strlen(sep);
+			if (header_string.size() - l_sep >= 0)
+				header_string.resize(header_string.size() - l_sep);
+		}
+		const auto& prt_loop = [&fp, &view, &printf_string, *this, &limit](const auto& f) {
 #ifdef __AQ__HAS__INT128__			
 			constexpr auto num_hge = count_type<__int128_t, __uint128_t>((tuple_type*)(0));
 #else
@ -442,16 +483,21 @@ struct TableInfo {
 				+ 1 // padding for msvc not allowing empty arrays
 			];
 			setgbuf(cbuf);
-			if (view)
-				for (uint32_t i = 0; i < view->size; ++i) {
+			
+			if (view){
+				uint32_t outsz = limit > view->size ? view->size : limit;
+				for (uint32_t i = 0; i < outsz; ++i) {
 					print2_impl<cols...>(f, (*view)[i], printf_string.c_str());
 					setgbuf();
 				}
-			else
-				for (uint32_t i = 0; i < colrefs[0].size; ++i) {
+			}
+			else{
+				uint32_t outsz = limit > colrefs[0].size ? colrefs[0].size : limit;
+				for (uint32_t i = 0; i < outsz; ++i) {
 					print2_impl<cols...>(f, i, printf_string.c_str());
 					setgbuf();
 				}
+			}
 		};

 		if (fp)
@ -466,15 +512,17 @@ struct TableInfo {
 	}
 	template <int ...vals> struct applier {
 		inline constexpr static void apply(const TableInfo<Types...>& t, const char* __restrict sep = ",", const char* __restrict end = "\n",
-			const vector_type<uint32_t>* __restrict view = nullptr, FILE* __restrict fp = nullptr)
+			const vector_type<uint32_t>* __restrict view = nullptr, FILE* __restrict fp = nullptr, uint32_t limit = std::numeric_limits<uint32_t>::max()
+			) 
 		{
-			t.template print2<vals ...>(sep, end, view, fp);
+			t.template print2<vals ...>(sep, end, view, fp, limit);
 		}
 	};

 	inline void printall(const char* __restrict sep = ",", const char* __restrict end = "\n",
-		const vector_type<uint32_t>* __restrict view = nullptr, FILE* __restrict fp = nullptr) {
-		applyIntegerSequence<sizeof...(Types), applier>::apply(*this, sep, end, view, fp);
+		const vector_type<uint32_t>* __restrict view = nullptr, FILE* __restrict fp = nullptr, 
+		uint32_t limit = std::numeric_limits<uint32_t>::max() ) const {
+		applyIntegerSequence<sizeof...(Types), applier>::apply(*this, sep, end, view, fp, limit);
 	}

 	TableInfo<Types...>* rename(const char* name) {
@ -643,7 +691,9 @@ template <class ...Types>
 template <size_t j>
 inline typename std::enable_if<j == sizeof...(Types) - 1, void>::type
 TableInfo<Types ...>::print_impl(const uint32_t& i, const char* __restrict sep) const {
-	std::cout << (get<j>(*this))[i];
+	decltype(auto) t = (get<j>(*this))[i];
+//	print(t);
+	std::cout << t;
 }

 template<class ...Types>
@ -658,6 +708,7 @@ inline typename std::enable_if < j < sizeof...(Types) - 1, void>::type
 template<class ...Types>
 inline void TableInfo<Types...>::print(const char* __restrict sep, const char* __restrict end) const {

+	//printall(sep, end);
 	std::string header_string = get_header_string(sep, end);
 	std::cout << header_string.c_str();

@ -669,51 +720,56 @@ inline void TableInfo<Types...>::print(const char* __restrict sep, const char* _
 		std::cout << end;
 	}
 }
+
+// use std::is_base_of here and all vt classes should derive from vector_base
 template <class T1,
 			template<typename> class VT,
 			class TRet>
-using test_vt_support = typename std::enable_if_t<std::is_same_v<VT<T1>, ColRef<T1>> || 
-				std::is_same_v<VT<T1>, ColView<T1>> || 
-				std::is_same_v<VT<T1>, vector_type<T1>>, TRet>;
+using test_vt_support = typename std::enable_if_t<
+					std::is_base_of_v<vector_base<T1>, VT<T1>>, 
+					TRet>;

-template <class T1, class T2,
-			template<typename> class VT>
-using get_autoext_type = test_vt_support<T1, VT, 
-		decayed_t<VT, typename types::Coercion<T1, T2>::type>>;

-template <class T1, class T2,
-			template<typename> class VT>
-using get_long_type = test_vt_support<T1, VT, 
-		decayed_t<VT, types::GetLongType<typename types::Coercion<T1, T2>::type>>>;
+template <class T1, class T2, template<typename> class VT, 
+			test_vt_support<T1, VT, void>* = nullptr>
+using get_autoext_type = 
+		decayed_t<VT, typename types::Coercion<T1, T2>::type>;

-template <class T1, class T2,
-			template<typename> class VT>
-using get_fp_type = test_vt_support<T1, VT, 
-		decayed_t<VT, types::GetFPType<typename types::Coercion<T1, T2>::type>>>;
+template <class T1, class T2, template<typename> class VT, 
+		test_vt_support<T1, VT, void>* = nullptr>
+using get_long_type = 
+		decayed_t<VT, types::GetLongType<typename types::Coercion<T1, T2>::type>>;
+
+template <class T1, class T2, template<typename> class VT,
+		test_vt_support<T1, VT, void>* = nullptr>
+using get_fp_type = 
+		decayed_t<VT, types::GetFPType<typename types::Coercion<T1, T2>::type>>;

 template <class T1, 
 			template<typename> class VT, template<typename> class VT2,
 			class TRet>
-using test_vt_support2 = typename std::enable_if_t<(std::is_same_v<VT<T1>, ColRef<T1>> || 
-				std::is_same_v<VT<T1>, ColView<T1>> || 
-				std::is_same_v<VT<T1>, vector_type<T1>>) &&
-				(std::is_same_v<VT2<T1>, ColRef<T1>> || 
-				std::is_same_v<VT2<T1>, ColView<T1>> || 
-				std::is_same_v<VT2<T1>, vector_type<T1>>), TRet >;
+using test_vt_support2 = typename std::enable_if_t<
+				std::is_base_of_v<vector_base<T1>, VT<T1>> &&
+				std::is_base_of_v<vector_base<T1>, VT2<T1>>, 
+				TRet >;
+
 template <class T1, class T2,
-			template<typename> class VT, template<typename> class VT2>
-using get_autoext_type2 = test_vt_support2<T1, VT, VT2,
-		decayed_t<VT, typename types::Coercion<T1, T2>::type>>;
+			template<typename> class VT, template<typename> class VT2, 
+			test_vt_support2<T1, VT, VT2, void>* = nullptr >
+using get_autoext_type2 = 
+		decayed_t<VT, typename types::Coercion<T1, T2>::type>;

 template <class T1, class T2,
-			template<typename> class VT, template<typename> class VT2>
-using get_long_type2 = test_vt_support2<T1, VT, VT2,
-		decayed_t<VT, types::GetLongType<typename types::Coercion<T1, T2>::type>>>;
+			template<typename> class VT, template<typename> class VT2, 
+			test_vt_support2<T1, VT, VT2, void>* = nullptr >
+using get_long_type2 = 
+		decayed_t<VT, types::GetLongType<typename types::Coercion<T1, T2>::type>>;

 template <class T1, class T2,
-			template<typename> class VT, template<typename> class VT2>
-using get_fp_type2 = test_vt_support2<T1, VT, VT2,
-		decayed_t<VT, types::GetFPType<typename types::Coercion<T1, T2>::type>>>;
+			template<typename> class VT, template<typename> class VT2, 
+			test_vt_support2<T1, VT, VT2, void>* = nullptr >
+using get_fp_type2 = 
+		decayed_t<VT, types::GetFPType<typename types::Coercion<T1, T2>::type>>;

 template <class T1, class T2, template<typename> class VT, template<typename> class VT2>
 get_autoext_type2<T1, T2, VT, VT2>
@ -835,7 +891,6 @@ VT<bool> operator >(const T2& lhs, const VT<T1>& rhs) {
 }


-
 template <class ...Types>
 void print(const TableInfo<Types...>& v, const char* delimiter = " ", const char* endline = "\n") {
 	v.print(delimiter, endline);
--- a/server/table_ext_monetdb.hpp
+++ b/server/table_ext_monetdb.hpp
@ -45,16 +45,16 @@ void TableInfo<Ts ...>::monetdb_append_table(void* srv, const char* alt_name) {
 	puts("getcols...");
 	uint32_t cnt = 0;
 	const auto get_col = [&monetdbe_cols, &i, *this, &gc_vecs, &cnt](auto v) {
-		printf("%d %d\n", i, (ColRef<void>*)v - colrefs);
+		// printf("%d %d\n", i, (ColRef<void>*)v - colrefs);
 		monetdbe_cols[i++] = (monetdbe_column*)v->monetdb_get_col(gc_vecs, cnt);
 	};
 	(get_col((ColRef<Ts>*)(colrefs + i)), ...);
 	puts("getcols done");
-	for(int i = 0; i < sizeof...(Ts); ++i)
-	{
-		printf("no:%d name: %s count:%d data: %p type:%d \n", 
-		i, monetdbe_cols[i]->name, monetdbe_cols[i]->count, monetdbe_cols[i]->data, monetdbe_cols[i]->type);
-	}
+	// for(int i = 0; i < sizeof...(Ts); ++i)
+	// {
+	// 	printf("no:%d name: %s count:%d data: %p type:%d \n", 
+	// 	i, monetdbe_cols[i]->name, monetdbe_cols[i]->count, monetdbe_cols[i]->data, monetdbe_cols[i]->type);
+	// }
 	std::string create_table_str = "CREATE TABLE IF NOT EXISTS ";
 	create_table_str += alt_name;
 	create_table_str += " (";
--- a/server/types.h
+++ b/server/types.h
@ -29,27 +29,37 @@ inline constexpr size_t aq_szof<void> = 0;
 template <class T1, class T2>
 struct aqis_same_impl {
 	constexpr static bool value = 
+		
 		std::conditional_t<
-			std::is_signed_v<T1> == std::is_signed_v<T2>,
+			std::is_same_v<T1, bool> || std::is_same_v<T2, bool>, // first decision: is either type bool?
+			Cond(
+				(std::is_same_v<T1, bool> && std::is_same_v<T2, bool>), // bool is only "same" as bool
+				std::true_type, 
+				std::false_type
+			),
 			Cond(
-				std::is_floating_point_v<T1> == std::is_floating_point_v<T2>,
+				std::is_signed_v<T1> == std::is_signed_v<T2>, // non-bool case, step 1: signedness must agree
 				Cond(
-					aq_szof<T1> == aq_szof<T2>, // deal with sizeof(void)
-					std::true_type,
+					std::is_floating_point_v<T1> == std::is_floating_point_v<T2>, // step 2: both floating point or both not
+					Cond(
+						aq_szof<T1> == aq_szof<T2>, // step 3: equal size; aq_szof rather than sizeof so that void (aq_szof<void> = 0) can be compared
+						std::true_type,
+						std::false_type
+					),
 					std::false_type
 				),
 				std::false_type
-			),
-			std::false_type
+			)
 		>::value;
 };
-
+// make sure size_t/ptr_t and the corresponding integer types are the same
 template <class T1, class T2, class ...Ts>
 constexpr bool aqis_same = aqis_same_impl<T1, T2>::value &&
 aqis_same<T2, Ts...>;

 template <class T1, class T2>
 constexpr bool aqis_same<T1, T2> = aqis_same_impl<T1, T2>::value;
+
 namespace types {
 	enum Type_t {
 		AINT32, AFLOAT, ASTR, ADOUBLE, ALDOUBLE, AINT64, AINT128, AINT16, ADATE, ATIME, AINT8,
--- a/server/utils.h
+++ b/server/utils.h
@ -1,14 +1,18 @@
 #pragma once
+
 #include <ctime>
+#include <type_traits>
+#include <string>
+
 #if ((defined(_MSVC_LANG) && _MSVC_LANG >= 201703L) || __cplusplus >= 201703L)
 constexpr static bool cpp_17 = true;
 #else
 constexpr static bool cpp_17 = false;
 #endif
+
 template <class T>
 inline const char* str(const T& v) {
 	return "";
 }

-#include<string>
 extern std::string base62uuid(int l = 6);
--- a/server/vector_type.hpp
+++ b/server/vector_type.hpp
@ -17,13 +17,16 @@
 #include "types.h"

 #pragma pack(push, 1)
+template<class T>
+struct vector_base {}; // empty base type; vector_type<_Ty> derives from vector_base<_Ty>
+
 struct vectortype_cstorage{
 	void* container;
 	unsigned int size, capacity;
 };

 template <typename _Ty>
-class vector_type {
+class vector_type : public vector_base<_Ty>{
 public:
 	typedef vector_type<_Ty> Decayed_t;
 	void inline _copy(const vector_type<_Ty>& vt) {
@ -71,9 +74,15 @@ public:
 	constexpr explicit vector_type(const vector_type<_Ty>& vt) noexcept : capacity(0) {
 		_copy(vt);
 	}
+	constexpr vector_type(vector_type<_Ty>& vt) noexcept : capacity(0) { // non-const lvalue overload; same body as the rvalue-reference constructor
+		_move(std::move(vt)); // NOTE(review): forwards the caller's lvalue to _move as an rvalue, so vt is presumably left moved-from — confirm callers expect that
+	}
 	constexpr vector_type(vector_type<_Ty>&& vt) noexcept : capacity(0) {
 		_move(std::move(vt));
 	}
+	vector_type(vectortype_cstorage vt) noexcept : capacity(vt.capacity), size(vt.size), container((_Ty*)vt.container) { // takes capacity, size and the container pointer (cast to _Ty*) directly from vt; no element copy is made here
+		out(10); // NOTE(review): calls out() with n = 10 on every construction; looks like leftover debugging output — confirm
+	};
 	// size >= capacity ==> readonly vector
 	constexpr vector_type(const uint32_t size, void* data) : 
 		size(size), capacity(0), container(static_cast<_Ty*>(data)) {}
@ -159,6 +168,10 @@ public:
 		grow();
 		container[size++] = _val;
 	}
+	void emplace_back(_Ty& _val) { // lvalue overload; same body as the _Ty&& overload
+		grow();
+		container[size++] = std::move(_val); // NOTE(review): move-assigns from the caller's lvalue, so _val may be left moved-from — confirm this is intended
+	}
 	void emplace_back(_Ty&& _val) {
 		grow();
 		container[size++] = std::move(_val);
@ -255,7 +268,7 @@ public:
 		}
 		size = this->size + dist;
 	}
-	inline void out(uint32_t n = 4, const char* sep = " ") const
+	inline void out(uint32_t n = 4000, const char* sep = " ") const
 	{
 		const char* more = "";
 		if (n < this->size)
--- a/server/winhelper.cpp
+++ b/server/winhelper.cpp
@ -41,4 +41,20 @@ void SharedMemory::FreeMemoryMap()
        if (this->hFileMap)
            CloseHandle(this->hFileMap);
 }
+
+#ifndef __USE_STD_SEMAPHORE__
+// Win32-backed semaphore with a maximum count of 1, compiled only when
+// __USE_STD_SEMAPHORE__ is not defined.
+//
+// The class declares a parameterless constructor, so the definition must be
+// parameterless as well; a definition taking `bool v = false` matches no
+// declared member and does not compile. The semaphore starts with an initial
+// count of 0, which is what the former default (v = false) produced.
+A_Semaphore::A_Semaphore() {
+    // Arguments: default security attributes, initial count 0, maximum count 1, unnamed.
+    native_handle = CreateSemaphore(NULL, 0, 1, NULL);
+}
+// Waits on the semaphore handle with no timeout (INFINITE).
+void A_Semaphore::acquire() {
+    WaitForSingleObject(native_handle, INFINITE);
+}
+// Increases the semaphore count by one; the previous count is not requested.
+void A_Semaphore::release() {
+    ReleaseSemaphore(native_handle, 1, NULL);
+}
+// Closes the semaphore handle created by the constructor.
+A_Semaphore::~A_Semaphore() {
+    CloseHandle(native_handle);
+}
+#endif
+
 #endif
--- a/server/winhelper.h
+++ b/server/winhelper.h
@ -14,5 +14,17 @@ struct SharedMemory
    SharedMemory(const char*);
    void FreeMemoryMap();
 };
+
+#ifndef __USE_STD_SEMAPHORE__
+class A_Semaphore { // semaphore wrapper, declared only when __USE_STD_SEMAPHORE__ is not defined
+private:
+	void* native_handle; // opaque handle to the underlying OS semaphore object
+public:
+	A_Semaphore();
+	void acquire(); // waits on native_handle without a timeout
+	void release(); // releases the semaphore by a count of one
+	~A_Semaphore(); // closes native_handle
+};
 #endif
+
 #endif
--- a/tests/complex_data.a
+++ b/tests/complex_data.a
@ -0,0 +1,3 @@
+create table f (a float, b vecfloat, c int)
+load complex data infile 'data/test_complex.csv' into table f fields terminated by ',' element terminated by ';'
+select * from f
--- a/tests/datagen_jose/Time.cpp
+++ b/tests/datagen_jose/Time.cpp
@ -18,7 +18,7 @@
 //
 ///////////////////////////////////////////////////////////////////////////////
 #include <stdio.h>
-#include "Time.H"
+#include "Time.hpp"

 Time::Time(char *startTime_)
 {
--- a/tests/dt.a
+++ b/tests/dt.a
@ -1,21 +1,21 @@
 LOAD MODULE FROM "./libirf.so"
-FUNCTIONS (
-    newtree(height:int, f:int64, sparse:vecint, forget:double, maxf:int64, noclasses:int64, e:int, r:int64, rb:int64) -> bool,
-    additem(X:vecdouble, y:int64, size:int64) -> bool,
-    fit() -> bool,
-    predict() -> vecint
-);
-create table tb(x int);
-create table tb2(x double, y double, z double);
-insert into tb values (0);
-insert into tb values (0);
-insert into tb values (0);
-select newtree(5, 3, tb.x, 0, 3, 2, 0, 100, 1) from tb;
-insert into tb2 values (1, 0, 1);
-insert into tb2 values (0, 1, 1);
-insert into tb2 values (1, 1, 1);
-select additem(tb2.x, 1, 3) from tb2;
-select additem(tb2.y, 0, -1) from tb2;
-select additem(tb2.z, 1, -1) from tb2;
-select fit();
-select predict();
+ FUNCTIONS (
+     newtree(height:int, f:int64, sparse:vecint, forget:double, maxf:int64, noclasses:int64, e:int, r:int64, rb:int64) -> bool,
+     additem(X:vecdouble, y:int64, size:int64) -> bool,
+     fit() -> bool,
+     predict() -> vecint
+ );
+ create table tb(x int);
+ create table tb2(x double, y double, z double);
+ insert into tb values (0);
+ insert into tb values (0);
+ insert into tb values (0);
+ select newtree(5, 3, tb.x, 0, 3, 2, 0, 100, 1) from tb;
+ insert into tb2 values (1, 0, 1);
+ insert into tb2 values (0, 1, 1);
+ insert into tb2 values (1, 1, 1);
+ select additem(tb2.x, 1, 3) from tb2;
+ select additem(tb2.y, 0, -1) from tb2;
+ select additem(tb2.z, 1, -1) from tb2;
+ select fit();
+ select predict();
--- a/tests/dt2.a
+++ b/tests/dt2.a
@ -0,0 +1,22 @@
+LOAD MODULE FROM "./libirf.so"
+ FUNCTIONS (
+     newtree(height:int, f:int64, sparse:vecint, forget:double, maxf:int64, noclasses:int64, e:int, r:int64, rb:int64) -> bool,
+     fit(X:vecvecdouble, y:vecint64) -> bool,
+     predict(X:vecvecdouble) -> vecint
+ );
+
+ create table source(x1 double, x2 double, x3 double, x4 double, x5 int64);
+ load data infile "data/benchmark" into table source fields terminated by ",";
+
+ create table sparse(x int);
+ insert into sparse values (1);
+ insert into sparse values (1);
+ insert into sparse values (1);
+ insert into sparse values (1);
+
+ select newtree(6, 4, sparse.x, 0, 4, 2, 0, 400, 2147483647) from sparse
+
+ select fit(pack(x1, x2, x3, x4), x5) from source
+
+-- select pack(x1, x2, x3, x4) from source
+  select predict(pack(x1, x2, x3, x4)) from source
--- a/tests/funcs.a
+++ b/tests/funcs.a
@ -19,7 +19,7 @@ LOAD DATA INFILE "data/test.csv"
 INTO TABLE test1
 FIELDS TERMINATED BY ","

-SELECT pairCorr(c, b) * d, sum(a), b
+SELECT pairCorr(c, b) * d, a, sum(b)
 FROM test1
-group by c,b,d
+group by a
 order by b ASC
--- a/tests/jose_gh.a
+++ b/tests/jose_gh.a
@ -0,0 +1,31 @@
+CREATE TABLE t(indiv INT, grp STRING, val INT)
+INSERT INTO t VALUES(1, 'A', 1)
+INSERT INTO t VALUES(1, 'A', 2)
+INSERT INTO t VALUES(1, 'A', 3)
+INSERT INTO t VALUES(1, 'A', 4)
+INSERT INTO t VALUES(2, 'A', 2)
+INSERT INTO t VALUES(2, 'A', 2)
+INSERT INTO t VALUES(2, 'A', 4)
+INSERT INTO t VALUES(2, 'A', 8)
+INSERT INTO t VALUES(3, 'B', 10)
+INSERT INTO t VALUES(3, 'B', 20)
+INSERT INTO t VALUES(3, 'B', 30)
+INSERT INTO t VALUES(3, 'B', 40)
+INSERT INTO t VALUES(4, 'B', 20)
+INSERT INTO t VALUES(4, 'B', 20)
+INSERT INTO t VALUES(4, 'B', 40)
+INSERT INTO t VALUES(4, 'B', 80)
+
+
+SELECT * FROM t 
+
+FUNCTION myCov(x, y) {
+  center_x := x - avg(x);
+  center_y := y - avg(y);
+  num := sum(center_x * center_y);
+  denom := sqrt(sum(center_x * center_x)) * sqrt(sum(center_y * center_y));
+  num / denom
+  }
+
+
+select myCov(1,2);
--- a/tests/q1.sql
+++ b/tests/q1.sql
@ -7,4 +7,4 @@ FIELDS TERMINATED BY ","
 SELECT sum(c), b, d
 FROM testq1
 group by a,b,d
-order by d DESC, b ASC
+order by d DESC, b ASC;
--- a/tests/q4.a
+++ b/tests/q4.a
@ -17,4 +17,23 @@ LOAD DATA INFILE "data/ticks.csv" INTO TABLE TICKS FIELDS TERMINATED BY ","
 SELECT max(endofdayprice/prev(endofdayprice)) as Max_Ratio
 FROM ticks
 ASSUMING ASC date
-WHERE ID = "3001"
+WHERE ID = "3001"
+
+CREATE TABLE ticks2(ID VARCHAR(20), max REAL, min REAL)
+INSERT INTO ticks2 SELECT ID AS ID, max(ratios(endofdayprice)) AS max, min(ratios(endofdayprice)) AS min from ticks  group by ID;
+
+SELECT ID, max, min
+FROM ticks2;
+
+CREATE TABLE my_table (c1 INT, c2 INT, c3 STRING)
+INSERT INTO my_table VALUES(10, 20, "example")
+select * from my_table;
+INSERT INTO my_table SELECT * FROM my_table
+select * from my_table;
+SELECT c1, c2  as twice_c2 FROM my_table;
+
+CREATE TABLE my_table_derived
+AS
+  SELECT c1, c2  as twice_c2 FROM my_table;
+SELECT * FROM my_table_derived;
+
--- a/tests/sqlblock.a
+++ b/tests/sqlblock.a
@ -0,0 +1,9 @@
+CREATE TABLE my_table (c1 INT, c2 INT, c3 STRING)
+INSERT INTO my_table VALUES(10, 20, "example"), (20, 30, "example2")
+<sql>
+INSERT INTO my_table VALUES(14, 24, 'example3');
+CREATE INDEX idx1 ON my_table(c1);
+SELECT * FROM my_table WHERE c1 < 15;
+
+</sql>
+SELECT * FROM my_table WHERE c1 > 15