WIP: rework exec-engine structure, GC, fast print

3 years ago · fa1f9822bc
parent be7fb9f523
commit fa1f9822bc
18 changed files with 3730 additions and 125 deletions
--- a/14
+++ b/14
@ -8,15 +8,15 @@ ifeq ($(AQ_DEBUG), 1)
 	LINKFLAGS = 
 else
 	OPTFLAGS = -O3 -DNDEBUG -fno-stack-protector 
-	LINKFLAGS = -flto
+	LINKFLAGS = -flto -s
 endif
 SHAREDFLAGS = -shared  
 FPIC = -fPIC
 COMPILER = $(shell $(CXX) --version | grep -q clang && echo clang|| echo gcc) 
 LIBTOOL = ar rcs
 USELIB_FLAG = -Wl,--whole-archive,libaquery.a -Wl,-no-whole-archive
-LIBAQ_SRC = server/server.cpp server/monetdb_conn.cpp server/io.cpp 
-LIBAQ_OBJ = server.o monetdb_conn.o io.o 
+LIBAQ_SRC = server/monetdb_conn.cpp server/libaquery.cpp 
+LIBAQ_OBJ = monetdb_conn.o libaquery.o 
 SEMANTIC_INTERPOSITION = -fno-semantic-interposition
 RANLIB = ranlib

@ -118,19 +118,21 @@ info:
 pch:
 	$(CXX) -x c++-header server/pch.hpp $(FPIC) $(CXXFLAGS)
 libaquery.a:
-	$(CXX) -c $(FPIC) $(PCHFLAGS) $(LIBAQ_SRC) $(MonetDB_LIB) $(OS_SUPPORT) $(CXXFLAGS) &&\
+	$(CXX) -c $(FPIC) $(PCHFLAGS) $(LIBAQ_SRC) $(OS_SUPPORT) $(CXXFLAGS) &&\
 	$(LIBTOOL) libaquery.a $(LIBAQ_OBJ) &&\
 	$(RANLIB) libaquery.a

+warmup:
+	$(CXX) $(SHAREDFLAGS) msc-plugin/dummy.cpp libaquery.a -o dll.so
 server.bin:
 	$(CXX) $(LIBAQ_SRC) $(BINARYFLAGS) $(OS_SUPPORT) -o server.bin
 launcher:
 	$(CXX) -D__AQ_BUILD_LAUNCHER__ $(LIBAQ_SRC) $(OS_SUPPORT) $(BINARYFLAGS) -o aq
 server.so:
 #	$(CXX) -z muldefs server/server.cpp server/monetdb_conn.cpp -fPIC -shared $(OS_SUPPORT) monetdb/msvc/monetdbe.dll --std=c++1z -O3 -march=native -o server.so -I./monetdb/msvc 
-	$(CXX) $(SHAREDFLAGS) $(PCHFLAGS) $(LIBAQ_SRC) $(OS_SUPPORT) -o server.so 
+	$(CXX) $(SHAREDFLAGS) $(PCHFLAGS) $(LIBAQ_SRC) server/server.cpp server/dragonbox/dragonbox_to_chars.cpp $(OS_SUPPORT) -o server.so 
 server_uselib:
-	$(CXX) $(SHAREDFLAGS) $(USELIB_FLAG),libaquery.a -o server.so
+	$(CXX) $(SHAREDFLAGS) server/server.cpp libaquery.a server/dragonbox/dragonbox_to_chars.cpp -o server.so

 snippet:
 	$(CXX) $(SHAREDFLAGS) $(PCHFLAGS) out.cpp $(LIBAQ_SRC) -o dll.so
--- a/README.md
+++ b/README.md
@ -231,7 +231,7 @@ DROP TABLE my_table IF EXISTS
 - `next(col), prev(col)`: moving column back and forth by 1, e.g. `next(col)[i] = col[i+1]`.
 - `first(col), last(col)`: first and last value of a column, i.e. `first(col)= col[0]`, `last(col) = col[n-1]`.
 - `sqrt(x), trunc(x), and other builtin math functions`: value-wise math operations. `sqrt(x)[i] = sqrt(x[i])`
- `pack(cols, ...)`: pack multiple columns into a single column. 
+- `pack(cols, ...)`: pack multiple columns with the exact same type into a single column. 

 # Architecture 
 ![Architecture](./docs/arch-hybrid.svg)
@ -287,3 +287,24 @@ DROP TABLE my_table IF EXISTS
 - [ ] Bug: Order By after Group By
 - [ ] Functionality: Having clause, With clause
 - [ ] Decouple expr.py
+
+# Credit:
+- [mo-sql-parsing](https://github.com/klahnakoski/mo-sql-parsing) <br>
+  Author: Kyle Lahnakoski <br>
+  License (Mozilla Public License 2.0): https://github.com/klahnakoski/mo-sql-parsing/blob/dev/LICENSE 
+
+- [Fast C++ CSV Parser](https://github.com/ben-strasser/fast-cpp-csv-parser) <br>
+  Author: Ben Strasser <br>
+  License (BSD 3-Clause License): https://github.com/ben-strasser/fast-cpp-csv-parser/blob/master/LICENSE
+
+- [Dragonbox](https://github.com/jk-jeon/dragonbox)<br>
+  Author: Junekey Jeon <br>
+  License (Boost, Apache2-LLVM): <br>https://github.com/jk-jeon/dragonbox/blob/master/LICENSE-Boost <br>
+  https://github.com/jk-jeon/dragonbox/blob/master/LICENSE-Apache2-LLVM
+
+- [itoa](https://github.com/jeaiii/itoa) <br>
+  Author: James Edward Anhalt III <br>
+  License (MIT): https://github.com/jeaiii/itoa/blob/main/LICENSE
+
+- [MonetDB](https://www.monetdb.org) <br>
+  License (Mozilla Public License): https://github.com/MonetDB/MonetDB/blob/master/license.txt
--- a/aquery_config.py
+++ b/aquery_config.py
@ -11,6 +11,7 @@ cygroot = 'c:/msys64/usr/bin'
 msbuildroot = ''
 os_platform = 'unknown'
 build_driver = 'Auto'
+compilation_output = True

 def init_config():
    global __config_initialized__, os_platform, msbuildroot, build_driver
--- a/build.py
+++ b/build.py
@ -73,7 +73,7 @@ class checksums:
 class build_manager:
    sourcefiles = [
                   'build.py', 'Makefile', 
-                   'server/server.cpp', 'server/io.cpp',  
+                   'server/server.cpp', 'server/libaquery.cpp',  
                   'server/monetdb_conn.cpp', 'server/threading.cpp', 
                   'server/winhelper.cpp' 
                   ]
@ -94,6 +94,9 @@ class build_manager:
            return False
        def build(self, stdout = sys.stdout, stderr = sys.stderr):
            ret = True
+            if not aquery_config.compilation_output:
+                stdout = nullstream
+                stderr = nullstream
            for c in self.build_cmd:
                if c:
                    try: # only last success matters
@ -102,6 +105,8 @@ class build_manager:
                        ret = False
                        pass
            return ret
+        def warmup(self):
+            return True
                
    class MakefileDriver(DriverBase):
        def __init__(self, mgr : 'build_manager') -> None:
@ -113,7 +118,7 @@ class build_manager:
                mgr.cxx = os.environ['CXX']
            if 'AQ_DEBUG' not in os.environ:
                os.environ['AQ_DEBUG'] = '0' if mgr.OptimizationLv else '1'
-                
+
        def libaquery_a(self):
            self.build_cmd = [['rm', 'libaquery.a'],['make', 'libaquery.a']]
            return self.build()
@ -168,6 +173,10 @@ class build_manager:
            self.build_cmd = [[aquery_config.msbuildroot, loc, self.opt, self.platform]]
            return self.build()

+        def warmup(self):
+            self.build_cmd = [['make', 'warmup']]
+            return self.build()
+            
    #class PythonDriver(DriverBase):
    #    def __init__(self, mgr : 'build_manager') -> None:
    #        super().__init__(mgr)           
@ -223,6 +232,9 @@ class build_manager:
            current.calc(self.cxx, libaquery_a)
            with open('.cached', 'wb') as cache_sig:
                cache_sig.write(pickle.dumps(current))
+            self.driver.warmup()
+            
+            
        else:
            if aquery_config.os_platform == 'mac':
                os.system('./arch-check.sh')
--- a/msc-plugin/libaquery.vcxproj
+++ b/msc-plugin/libaquery.vcxproj
@ -238,7 +238,7 @@
  <ItemGroup>
    <ClCompile Include="..\server\server.cpp" />
    <ClCompile Include="..\server\winhelper.cpp" />
-    <ClCompile Include="..\server\io.cpp" />
+    <ClCompile Include="..\server\libaquery.cpp" />
    <ClCompile Include="..\server\monetdb_conn.cpp" />
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
--- a/reconstruct/storage.py
+++ b/reconstruct/storage.py
@ -156,6 +156,7 @@ class Context:
        self.queries = []
        self.module_init_loc = 0
        self.special_gb = False
+        self.has_dll = False
         
    def __init__(self):
        self.tables_byname = dict()
@ -169,7 +170,6 @@ class Context:
        self.udf_agg_map = dict()
        self.use_columnstore = False
        self.print = print
-        self.has_dll = False
        self.dialect = 'MonetDB'
        self.is_msvc = False
        self.have_hge = False
--- a/server/Makefile
+++ b/server/Makefile
@ -1,6 +1,6 @@
 debug:
-	g++ -g3 -O0 server/server.cpp server/io.cpp  -o a.out -Wall -Wextra -Wpedantic -lpthread
+	g++ -g3 -O0 server/server.cpp server/libaquery.cpp  -o a.out -Wall -Wextra -Wpedantic -lpthread
 	
 test:
-	g++ --std=c++1z -g3 -O0 server.cpp io.cpp  -o a.out -Wall -Wextra -Wpedantic -lpthread
+	g++ --std=c++1z -g3 -O0 server.cpp libaquery.cpp  -o a.out -Wall -Wextra -Wpedantic -lpthread
 	
--- a/server/dragonbox/dragonbox.h
+++ b/server/dragonbox/dragonbox.h
--- a/server/dragonbox/dragonbox_to_chars.cpp
+++ b/server/dragonbox/dragonbox_to_chars.cpp
@ -0,0 +1,519 @@
+// Copyright 2020-2022 Junekey Jeon
+//
+// The contents of this file may be used under the terms of
+// the Apache License v2.0 with LLVM Exceptions.
+//
+//    (See accompanying file LICENSE-Apache or copy at
+//     https://llvm.org/foundation/relicensing/LICENSE.txt)
+//
+// Alternatively, the contents of this file may be used under the terms of
+// the Boost Software License, Version 1.0.
+//    (See accompanying file LICENSE-Boost or copy at
+//     https://www.boost.org/LICENSE_1_0.txt)
+//
+// Unless required by applicable law or agreed to in writing, this software
+// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.
+
+
+#include "dragonbox_to_chars.h"
+
+#if defined(__GNUC__) || defined(__clang__)
+    #define JKJ_FORCEINLINE inline __attribute__((always_inline))
+#elif defined(_MSC_VER)
+    #define JKJ_FORCEINLINE __forceinline
+#else
+    #define JKJ_FORCEINLINE inline
+#endif
+
+namespace jkj::dragonbox {
+    namespace to_chars_detail {
+        // These "//"'s are to prevent clang-format from ruining this nice alignment.
+        // Thanks to reddit user u/mcmcc:
+        // https://www.reddit.com/r/cpp/comments/so3wx9/dragonbox_110_is_released_a_fast_floattostring/hw8z26r/?context=3
+        static constexpr char radix_100_table[] = { // 200 chars: entry i (0..99) is the two-digit decimal text of i, stored at offsets 2*i and 2*i+1
+            '0', '0', '0', '1', '0', '2', '0', '3', '0', '4', //
+            '0', '5', '0', '6', '0', '7', '0', '8', '0', '9', //
+            '1', '0', '1', '1', '1', '2', '1', '3', '1', '4', //
+            '1', '5', '1', '6', '1', '7', '1', '8', '1', '9', //
+            '2', '0', '2', '1', '2', '2', '2', '3', '2', '4', //
+            '2', '5', '2', '6', '2', '7', '2', '8', '2', '9', //
+            '3', '0', '3', '1', '3', '2', '3', '3', '3', '4', //
+            '3', '5', '3', '6', '3', '7', '3', '8', '3', '9', //
+            '4', '0', '4', '1', '4', '2', '4', '3', '4', '4', //
+            '4', '5', '4', '6', '4', '7', '4', '8', '4', '9', //
+            '5', '0', '5', '1', '5', '2', '5', '3', '5', '4', //
+            '5', '5', '5', '6', '5', '7', '5', '8', '5', '9', //
+            '6', '0', '6', '1', '6', '2', '6', '3', '6', '4', //
+            '6', '5', '6', '6', '6', '7', '6', '8', '6', '9', //
+            '7', '0', '7', '1', '7', '2', '7', '3', '7', '4', //
+            '7', '5', '7', '6', '7', '7', '7', '8', '7', '9', //
+            '8', '0', '8', '1', '8', '2', '8', '3', '8', '4', //
+            '8', '5', '8', '6', '8', '7', '8', '8', '8', '9', //
+            '9', '0', '9', '1', '9', '2', '9', '3', '9', '4', //
+            '9', '5', '9', '6', '9', '7', '9', '8', '9', '9'  //
+        };
+        static constexpr char radix_100_head_table[] = { // 200 chars: entry i (0..99) at offset 2*i is the leading decimal digit of i followed by '.'
+            '0', '.', '1', '.', '2', '.', '3', '.', '4', '.', //
+            '5', '.', '6', '.', '7', '.', '8', '.', '9', '.', //
+            '1', '.', '1', '.', '1', '.', '1', '.', '1', '.', //
+            '1', '.', '1', '.', '1', '.', '1', '.', '1', '.', //
+            '2', '.', '2', '.', '2', '.', '2', '.', '2', '.', //
+            '2', '.', '2', '.', '2', '.', '2', '.', '2', '.', //
+            '3', '.', '3', '.', '3', '.', '3', '.', '3', '.', //
+            '3', '.', '3', '.', '3', '.', '3', '.', '3', '.', //
+            '4', '.', '4', '.', '4', '.', '4', '.', '4', '.', //
+            '4', '.', '4', '.', '4', '.', '4', '.', '4', '.', //
+            '5', '.', '5', '.', '5', '.', '5', '.', '5', '.', //
+            '5', '.', '5', '.', '5', '.', '5', '.', '5', '.', //
+            '6', '.', '6', '.', '6', '.', '6', '.', '6', '.', //
+            '6', '.', '6', '.', '6', '.', '6', '.', '6', '.', //
+            '7', '.', '7', '.', '7', '.', '7', '.', '7', '.', //
+            '7', '.', '7', '.', '7', '.', '7', '.', '7', '.', //
+            '8', '.', '8', '.', '8', '.', '8', '.', '8', '.', //
+            '8', '.', '8', '.', '8', '.', '8', '.', '8', '.', //
+            '9', '.', '9', '.', '9', '.', '9', '.', '9', '.', //
+            '9', '.', '9', '.', '9', '.', '9', '.', '9', '.'  //
+        };
+
+        // These digit generation routines are inspired by James Anhalt's itoa algorithm:
+        // https://github.com/jeaiii/itoa
+        // The main idea is for given n, find y such that floor(10^k * y / 2^32) = n holds,
+        // where k is an appropriate integer depending on the length of n.
+        // For example, if n = 1234567, we set k = 6. In this case, we have
+        // floor(y / 2^32) = 1,
+        // floor(10^2 * ((10^0 * y) mod 2^32) / 2^32) = 23,
+        // floor(10^2 * ((10^2 * y) mod 2^32) / 2^32) = 45, and
+        // floor(10^2 * ((10^4 * y) mod 2^32) / 2^32) = 67.
+        // See https://jk-jeon.github.io/posts/2022/02/jeaiii-algorithm/ for more explanation.
+
+        JKJ_FORCEINLINE static void print_9_digits(std::uint32_t s32, int& exponent,
+                                                   char*& buffer) noexcept { // Writes s32 as "d.ddd..." at buffer, advances buffer (by reference) past the last kept character, and adds (digit count - 1) to exponent (by reference).
+            // -- IEEE-754 binary32
+            // Since we do not cut trailing zeros in advance, s32 must be of 6~9 digits
+            // unless the original input was subnormal.
+            // In particular, when it is of 9 digits it shouldn't have any trailing zeros.
+            // -- IEEE-754 binary64
+            // In this case, s32 must be of 7~9 digits unless the input is subnormal,
+            // and it shouldn't have any trailing zeros if it is of 9 digits.
+            if (s32 >= 1'0000'0000) {
+                // 9 digits.
+                // 1441151882 = ceil(2^57 / 1'0000'0000) + 1
+                auto prod = s32 * std::uint64_t(1441151882);
+                prod >>= 25;
+                std::memcpy(buffer, radix_100_head_table + std::uint32_t(prod >> 32) * 2, 2); // leading digit and '.'
+
+                prod = std::uint32_t(prod) * std::uint64_t(100); // keep the low 32 bits (fraction), scale by 100: next two digits land in the high 32 bits
+                std::memcpy(buffer + 2, radix_100_table + std::uint32_t(prod >> 32) * 2, 2);
+                prod = std::uint32_t(prod) * std::uint64_t(100);
+                std::memcpy(buffer + 4, radix_100_table + std::uint32_t(prod >> 32) * 2, 2);
+                prod = std::uint32_t(prod) * std::uint64_t(100);
+                std::memcpy(buffer + 6, radix_100_table + std::uint32_t(prod >> 32) * 2, 2);
+                prod = std::uint32_t(prod) * std::uint64_t(100);
+                std::memcpy(buffer + 8, radix_100_table + std::uint32_t(prod >> 32) * 2, 2);
+
+                exponent += 8; // 9 digits: the decimal point sits 8 places left of the last digit
+                buffer += 10; // 1 digit + '.' + 8 digits; this branch never trims trailing zeros
+            }
+            else if (s32 >= 100'0000) {
+                // 7 or 8 digits.
+                // 281474978 = ceil(2^48 / 100'0000) + 1
+                auto prod = s32 * std::uint64_t(281474978);
+                prod >>= 16;
+                auto two_digits = std::uint32_t(prod >> 32);
+                // If s32 is of 8 digits, increase the exponent by 7.
+                // Otherwise, increase it by 6.
+                exponent += (6 + unsigned(two_digits >= 10));
+
+                // Write the first digit and the decimal point.
+                std::memcpy(buffer, radix_100_head_table + two_digits * 2, 2);
+                // This third character may be overwritten later but we don't care.
+                buffer[2] = radix_100_table[two_digits * 2 + 1];
+
+                // Remaining 6 digits are all zero?
+                if (std::uint32_t(prod) <= std::uint32_t((std::uint64_t(1) << 32) / 100'0000)) {
+                    // The number of characters actually written is:
+                    //   1, if only the first digit is nonzero, which means that either s32 is of 7
+                    //   digits or it is of 8 digits but the second digit is zero, or
+                    //   3, otherwise.
+                    // Note that buffer[2] is never zero if s32 is of 7 digits, because the input is
+                    // never zero.
+                    buffer += (1 + (unsigned(two_digits >= 10) & unsigned(buffer[2] > '0')) * 2);
+                }
+                else {
+                    // At least one of the remaining 6 digits are nonzero.
+                    // After this adjustment, now the first destination becomes buffer + 2.
+                    buffer += unsigned(two_digits >= 10);
+
+                    // Obtain the next two digits.
+                    prod = std::uint32_t(prod) * std::uint64_t(100);
+                    two_digits = std::uint32_t(prod >> 32);
+                    std::memcpy(buffer + 2, radix_100_table + two_digits * 2, 2);
+
+                    // Remaining 4 digits are all zero?
+                    if (std::uint32_t(prod) <= std::uint32_t((std::uint64_t(1) << 32) / 1'0000)) {
+                        buffer += (3 + unsigned(buffer[3] > '0')); // keep buffer[3] only when it is not a trailing '0'
+                    }
+                    else {
+                        // At least one of the remaining 4 digits are nonzero.
+
+                        // Obtain the next two digits.
+                        prod = std::uint32_t(prod) * std::uint64_t(100);
+                        two_digits = std::uint32_t(prod >> 32);
+                        std::memcpy(buffer + 4, radix_100_table + two_digits * 2, 2);
+
+                        // Remaining 2 digits are all zero?
+                        if (std::uint32_t(prod) <= std::uint32_t((std::uint64_t(1) << 32) / 100)) {
+                            buffer += (5 + unsigned(buffer[5] > '0'));
+                        }
+                        else {
+                            // Obtain the last two digits.
+                            prod = std::uint32_t(prod) * std::uint64_t(100);
+                            two_digits = std::uint32_t(prod >> 32);
+                            std::memcpy(buffer + 6, radix_100_table + two_digits * 2, 2);
+
+                            buffer += (7 + unsigned(buffer[7] > '0'));
+                        }
+                    }
+                }
+            }
+            else if (s32 >= 1'0000) {
+                // 5 or 6 digits.
+                // 429497 = ceil(2^32 / 1'0000)
+                auto prod = s32 * std::uint64_t(429497);
+                auto two_digits = std::uint32_t(prod >> 32);
+
+                // If s32 is of 6 digits, increase the exponent by 5.
+                // Otherwise, increase it by 4.
+                exponent += (4 + unsigned(two_digits >= 10));
+
+                // Write the first digit and the decimal point.
+                std::memcpy(buffer, radix_100_head_table + two_digits * 2, 2);
+                // This third character may be overwritten later but we don't care.
+                buffer[2] = radix_100_table[two_digits * 2 + 1];
+
+                // Remaining 4 digits are all zero?
+                if (std::uint32_t(prod) <= std::uint32_t((std::uint64_t(1) << 32) / 1'0000)) {
+                    // The number of characters actually written is 1 or 3, similarly to the case of
+                    // 7 or 8 digits.
+                    buffer += (1 + (unsigned(two_digits >= 10) & unsigned(buffer[2] > '0')) * 2);
+                }
+                else {
+                    // At least one of the remaining 4 digits are nonzero.
+                    // After this adjustment, now the first destination becomes buffer + 2.
+                    buffer += unsigned(two_digits >= 10);
+
+                    // Obtain the next two digits.
+                    prod = std::uint32_t(prod) * std::uint64_t(100);
+                    two_digits = std::uint32_t(prod >> 32);
+                    std::memcpy(buffer + 2, radix_100_table + two_digits * 2, 2);
+
+                    // Remaining 2 digits are all zero?
+                    if (std::uint32_t(prod) <= std::uint32_t((std::uint64_t(1) << 32) / 100)) {
+                        buffer += (3 + unsigned(buffer[3] > '0'));
+                    }
+                    else {
+                        // Obtain the last two digits.
+                        prod = std::uint32_t(prod) * std::uint64_t(100);
+                        two_digits = std::uint32_t(prod >> 32);
+                        std::memcpy(buffer + 4, radix_100_table + two_digits * 2, 2);
+
+                        buffer += (5 + unsigned(buffer[5] > '0'));
+                    }
+                }
+            }
+            else if (s32 >= 100) {
+                // 3 or 4 digits.
+                // 42949673 = ceil(2^32 / 100)
+                auto prod = s32 * std::uint64_t(42949673);
+                auto two_digits = std::uint32_t(prod >> 32);
+
+                // If s32 is of 4 digits, increase the exponent by 3.
+                // Otherwise, increase it by 2.
+                exponent += (2 + int(two_digits >= 10));
+
+                // Write the first digit and the decimal point.
+                std::memcpy(buffer, radix_100_head_table + two_digits * 2, 2);
+                // This third character may be overwritten later but we don't care.
+                buffer[2] = radix_100_table[two_digits * 2 + 1];
+
+                // Remaining 2 digits are all zero?
+                if (std::uint32_t(prod) <= std::uint32_t((std::uint64_t(1) << 32) / 100)) {
+                    // The number of characters actually written is 1 or 3, similarly to the case of
+                    // 7 or 8 digits.
+                    buffer += (1 + (unsigned(two_digits >= 10) & unsigned(buffer[2] > '0')) * 2);
+                }
+                else {
+                    // At least one of the remaining 2 digits are nonzero.
+                    // After this adjustment, now the first destination becomes buffer + 2.
+                    buffer += unsigned(two_digits >= 10);
+
+                    // Obtain the last two digits.
+                    prod = std::uint32_t(prod) * std::uint64_t(100);
+                    two_digits = std::uint32_t(prod >> 32);
+                    std::memcpy(buffer + 2, radix_100_table + two_digits * 2, 2);
+
+                    buffer += (3 + unsigned(buffer[3] > '0'));
+                }
+            }
+            else {
+                // 1 or 2 digits.
+                // If s32 is of 2 digits, increase the exponent by 1.
+                exponent += int(s32 >= 10);
+
+                // Write the first digit and the decimal point.
+                std::memcpy(buffer, radix_100_head_table + s32 * 2, 2); // s32 < 100 here, so it indexes the tables directly
+                // This third character may be overwritten later but we don't care.
+                buffer[2] = radix_100_table[s32 * 2 + 1];
+
+                // The number of characters actually written is 1 or 3, similarly to the case of
+                // 7 or 8 digits.
+                buffer += (1 + (unsigned(s32 >= 10) & unsigned(buffer[2] > '0')) * 2);
+            }
+        }
+
+        template <>
+        char* to_chars<float, default_float_traits<float>>(std::uint32_t s32, int exponent,
+                                                           char* buffer) noexcept { // Writes the decimal significand s32 and, when nonzero, the exponent into buffer; returns a pointer one past the last character written. No terminator is written here.
+            // Print significand. print_9_digits takes exponent and buffer by reference and updates both.
+            print_9_digits(s32, exponent, buffer);
+
+            // Print exponent and return: "E-<n>" when negative, "E<n>" when positive, nothing when the exponent is zero.
+            if (exponent < 0) {
+                std::memcpy(buffer, "E-", 2);
+                buffer += 2;
+                exponent = -exponent;
+            }
+            else if (exponent > 0) {
+                buffer[0] = 'E';
+                buffer += 1;
+            }
+            else {
+                return buffer; // exponent == 0: the significand alone is the full output
+            }
+
+            if (exponent >= 10) {
+                std::memcpy(buffer, &radix_100_table[exponent * 2], 2); // two-digit magnitude via table lookup; assumes exponent < 100 for float - TODO confirm
+                buffer += 2;
+            }
+            else {
+                buffer[0] = char('0' + exponent); // single-digit magnitude
+                buffer += 1;
+            }
+
+            return buffer;
+        }
+
+        template <>
+        char* to_chars<double, default_float_traits<double>>(std::uint64_t const significand,
+                                                             int exponent, char* buffer) noexcept { // Writes the decimal significand and, when nonzero, the exponent into buffer; returns a pointer one past the last character written. No terminator is written here.
+            // Print significand by decomposing it into a 9-digit block and a 8-digit block.
+            std::uint32_t first_block, second_block;
+            bool no_second_block;
+
+            if (significand >= 1'0000'0000) {
+                first_block = std::uint32_t(significand / 1'0000'0000);
+                second_block = std::uint32_t(significand) - first_block * 1'0000'0000; // remainder computed in 32-bit unsigned arithmetic; exact because the true remainder is below 10^8
+                exponent += 8; // the low block accounts for 8 decimal digits
+                no_second_block = (second_block == 0);
+            }
+            else {
+                first_block = std::uint32_t(significand);
+                no_second_block = true; // second_block is left unset and is not read on this path
+            }
+
+            if (no_second_block) {
+                print_9_digits(first_block, exponent, buffer); // updates exponent and buffer in place (both are reference parameters)
+            }
+            else {
+                // We proceed similarly to print_9_digits(), but since we do not need to remove
+                // trailing zeros, the procedure is a bit simpler.
+                if (first_block >= 1'0000'0000) {
+                    // The input is of 17 digits, thus there should be no trailing zero at all.
+                    // The first block is of 9 digits.
+                    // 1441151882 = ceil(2^57 / 1'0000'0000) + 1
+                    auto prod = first_block * std::uint64_t(1441151882);
+                    prod >>= 25;
+                    std::memcpy(buffer, radix_100_head_table + std::uint32_t(prod >> 32) * 2, 2);
+
+                    prod = std::uint32_t(prod) * std::uint64_t(100);
+                    std::memcpy(buffer + 2, radix_100_table + std::uint32_t(prod >> 32) * 2, 2);
+                    prod = std::uint32_t(prod) * std::uint64_t(100);
+                    std::memcpy(buffer + 4, radix_100_table + std::uint32_t(prod >> 32) * 2, 2);
+                    prod = std::uint32_t(prod) * std::uint64_t(100);
+                    std::memcpy(buffer + 6, radix_100_table + std::uint32_t(prod >> 32) * 2, 2);
+                    prod = std::uint32_t(prod) * std::uint64_t(100);
+                    std::memcpy(buffer + 8, radix_100_table + std::uint32_t(prod >> 32) * 2, 2);
+
+                    // The second block is of 8 digits.
+                    // 281474978 = ceil(2^48 / 100'0000) + 1
+                    prod = second_block * std::uint64_t(281474978);
+                    prod >>= 16;
+                    prod += 1; // NOTE(review): this bias is absent from the 7-8 digit path of print_9_digits; rationale not visible here - confirm against upstream Dragonbox
+                    std::memcpy(buffer + 10, radix_100_table + std::uint32_t(prod >> 32) * 2, 2);
+                    prod = std::uint32_t(prod) * std::uint64_t(100);
+                    std::memcpy(buffer + 12, radix_100_table + std::uint32_t(prod >> 32) * 2, 2);
+                    prod = std::uint32_t(prod) * std::uint64_t(100);
+                    std::memcpy(buffer + 14, radix_100_table + std::uint32_t(prod >> 32) * 2, 2);
+                    prod = std::uint32_t(prod) * std::uint64_t(100);
+                    std::memcpy(buffer + 16, radix_100_table + std::uint32_t(prod >> 32) * 2, 2);
+
+                    exponent += 8; // the 9-digit first block adds another 8 on top of the earlier += 8
+                    buffer += 18; // 1 digit + '.' + 16 digits
+                }
+                else {
+                    if (first_block >= 100'0000) {
+                        // 7 or 8 digits.
+                        // 281474978 = ceil(2^48 / 100'0000) + 1
+                        auto prod = first_block * std::uint64_t(281474978);
+                        prod >>= 16;
+                        auto two_digits = std::uint32_t(prod >> 32);
+
+                        std::memcpy(buffer, radix_100_head_table + two_digits * 2, 2);
+                        buffer[2] = radix_100_table[two_digits * 2 + 1];
+
+                        exponent += (6 + unsigned(two_digits >= 10));
+                        buffer += unsigned(two_digits >= 10);
+
+                        // Print remaining 6 digits.
+                        prod = std::uint32_t(prod) * std::uint64_t(100);
+                        std::memcpy(buffer + 2, radix_100_table + std::uint32_t(prod >> 32) * 2, 2);
+                        prod = std::uint32_t(prod) * std::uint64_t(100);
+                        std::memcpy(buffer + 4, radix_100_table + std::uint32_t(prod >> 32) * 2, 2);
+                        prod = std::uint32_t(prod) * std::uint64_t(100);
+                        std::memcpy(buffer + 6, radix_100_table + std::uint32_t(prod >> 32) * 2, 2);
+
+                        buffer += 8;
+                    }
+                    else if (first_block >= 1'0000) {
+                        // 5 or 6 digits.
+                        // 429497 = ceil(2^32 / 1'0000)
+                        auto prod = first_block * std::uint64_t(429497);
+                        auto two_digits = std::uint32_t(prod >> 32);
+
+                        std::memcpy(buffer, radix_100_head_table + two_digits * 2, 2);
+                        buffer[2] = radix_100_table[two_digits * 2 + 1];
+
+                        exponent += (4 + unsigned(two_digits >= 10));
+                        buffer += unsigned(two_digits >= 10);
+
+                        // Print remaining 4 digits.
+                        prod = std::uint32_t(prod) * std::uint64_t(100);
+                        std::memcpy(buffer + 2, radix_100_table + std::uint32_t(prod >> 32) * 2, 2);
+                        prod = std::uint32_t(prod) * std::uint64_t(100);
+                        std::memcpy(buffer + 4, radix_100_table + std::uint32_t(prod >> 32) * 2, 2);
+
+                        buffer += 6;
+                    }
+                    else if (first_block >= 100) {
+                        // 3 or 4 digits.
+                        // 42949673 = ceil(2^32 / 100)
+                        auto prod = first_block * std::uint64_t(42949673);
+                        auto two_digits = std::uint32_t(prod >> 32);
+
+                        std::memcpy(buffer, radix_100_head_table + two_digits * 2, 2);
+                        buffer[2] = radix_100_table[two_digits * 2 + 1];
+
+                        exponent += (2 + unsigned(two_digits >= 10));
+                        buffer += unsigned(two_digits >= 10);
+
+                        // Print remaining 2 digits.
+                        prod = std::uint32_t(prod) * std::uint64_t(100);
+                        std::memcpy(buffer + 2, radix_100_table + std::uint32_t(prod >> 32) * 2, 2);
+
+                        buffer += 4;
+                    }
+                    else {
+                        // 1 or 2 digits.
+                        std::memcpy(buffer, radix_100_head_table + first_block * 2, 2);
+                        buffer[2] = radix_100_table[first_block * 2 + 1];
+
+                        exponent += unsigned(first_block >= 10);
+                        buffer += (2 + unsigned(first_block >= 10));
+                    }
+
+                    // Next, print the second block.
+                    // The second block is of 8 digits, but we may have trailing zeros.
+                    // 281474978 = ceil(2^48 / 100'0000) + 1
+                    auto prod = second_block * std::uint64_t(281474978);
+                    prod >>= 16;
+                    prod += 1; // NOTE(review): same bias as the 17-digit path above; rationale not visible here - confirm against upstream Dragonbox
+                    auto two_digits = std::uint32_t(prod >> 32);
+                    std::memcpy(buffer, radix_100_table + two_digits * 2, 2);
+
+                    // Remaining 6 digits are all zero?
+                    if (std::uint32_t(prod) <= std::uint32_t((std::uint64_t(1) << 32) / 100'0000)) {
+                        buffer += (1 + unsigned(buffer[1] > '0')); // keep buffer[1] only when it is not a trailing '0'
+                    }
+                    else {
+                        // Obtain the next two digits.
+                        prod = std::uint32_t(prod) * std::uint64_t(100);
+                        two_digits = std::uint32_t(prod >> 32);
+                        std::memcpy(buffer + 2, radix_100_table + two_digits * 2, 2);
+
+                        // Remaining 4 digits are all zero?
+                        if (std::uint32_t(prod) <=
+                            std::uint32_t((std::uint64_t(1) << 32) / 1'0000)) {
+                            buffer += (3 + unsigned(buffer[3] > '0'));
+                        }
+                        else {
+                            // Obtain the next two digits.
+                            prod = std::uint32_t(prod) * std::uint64_t(100);
+                            two_digits = std::uint32_t(prod >> 32);
+                            std::memcpy(buffer + 4, radix_100_table + two_digits * 2, 2);
+
+                            // Remaining 2 digits are all zero?
+                            if (std::uint32_t(prod) <=
+                                std::uint32_t((std::uint64_t(1) << 32) / 100)) {
+                                buffer += (5 + unsigned(buffer[5] > '0'));
+                            }
+                            else {
+                                // Obtain the last two digits.
+                                prod = std::uint32_t(prod) * std::uint64_t(100);
+                                two_digits = std::uint32_t(prod >> 32);
+                                std::memcpy(buffer + 6, radix_100_table + two_digits * 2, 2);
+                                buffer += (7 + unsigned(buffer[7] > '0'));
+                            }
+                        }
+                    }
+                }
+            }
+
+            // Print exponent and return: "E-<n>" when negative, "E<n>" when positive, nothing when the exponent is zero.
+            if (exponent < 0) {
+                std::memcpy(buffer, "E-", 2);
+                buffer += 2;
+                exponent = -exponent;
+            }
+            else if (exponent > 0) {
+                buffer[0] = 'E';
+                buffer += 1;
+            }
+            else {
+                return buffer; // exponent == 0: the significand alone is the full output
+            }
+
+            if (exponent >= 100) {
+                // d1 = exponent / 10; d2 = exponent % 10;
+                // 6554 = ceil(2^16 / 10)
+                auto prod = std::uint32_t(exponent) * std::uint32_t(6554);
+                auto d1 = prod >> 16;
+                prod = std::uint16_t(prod) * std::uint32_t(5); // * 10
+                auto d2 = prod >> 15;                          // >> 16
+                std::memcpy(buffer, &radix_100_table[d1 * 2], 2); // d1 supplies the two leading digits of a three-digit exponent
+                buffer[2] = char('0' + d2);
+                buffer += 3;
+            }
+            else if (exponent >= 10) {
+                std::memcpy(buffer, &radix_100_table[exponent * 2], 2);
+                buffer += 2;
+            }
+            else {
+                buffer[0] = char('0' + exponent);
+                buffer += 1;
+            }
+
+            return buffer;
+        }
+    }
+}
--- a/server/dragonbox/dragonbox_to_chars.h
+++ b/server/dragonbox/dragonbox_to_chars.h
@ -0,0 +1,108 @@
+// Copyright 2020-2022 Junekey Jeon
+//
+// The contents of this file may be used under the terms of
+// the Apache License v2.0 with LLVM Exceptions.
+//
+//    (See accompanying file LICENSE-Apache or copy at
+//     https://llvm.org/foundation/relicensing/LICENSE.txt)
+//
+// Alternatively, the contents of this file may be used under the terms of
+// the Boost Software License, Version 1.0.
+//    (See accompanying file LICENSE-Boost or copy at
+//     https://www.boost.org/LICENSE_1_0.txt)
+//
+// Unless required by applicable law or agreed to in writing, this software
+// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.
+
+#ifndef JKJ_HEADER_DRAGONBOX_TO_CHARS
+#define JKJ_HEADER_DRAGONBOX_TO_CHARS
+
+#include "dragonbox.h"
+
+namespace jkj::dragonbox {
+    namespace to_chars_detail {
+        template <class Float, class FloatTraits>
+        extern char* to_chars(typename FloatTraits::carrier_uint significand, int exponent,
+                              char* buffer) noexcept; // declaration only; to_chars_n_impl below calls it with the significand/exponent returned by to_decimal
+
+        // Formats the value held in `br` at `buffer` and returns the position one past the
+        // last character written; no terminator is appended.
+        //   finite, non-zero               : optional '-', then the output of to_chars_detail::to_chars
+        //   finite, zero                   : optional '-', then "0E0"
+        //   not finite, significand all 0  : optional '-', then "Infinity"
+        //   not finite, anything else      : "NaN" (never signed)
+        // The policy holder is a template parameter to avoid needless ABI overhead incurred
+        // by tag dispatch.
+        template <class PolicyHolder, class Float, class FloatTraits>
+        char* to_chars_n_impl(float_bits<Float, FloatTraits> br, char* buffer) noexcept {
+            auto const exponent_bits = br.extract_exponent_bits();
+            auto const s = br.remove_exponent_bits(exponent_bits);
+            bool const finite = br.is_finite(exponent_bits);
+
+            // NaN is the only form that is written without looking at the sign.
+            if (!finite && !s.has_all_zero_significand_bits()) {
+                std::memcpy(buffer, "NaN", 3);
+                return buffer + 3;
+            }
+
+            if (s.is_negative()) {
+                *buffer = '-';
+                ++buffer;
+            }
+
+            if (!finite) {
+                std::memcpy(buffer, "Infinity", 8);
+                return buffer + 8;
+            }
+
+            if (!br.is_nonzero()) {
+                std::memcpy(buffer, "0E0", 3);
+                return buffer + 3;
+            }
+
+            // The sign is already in the buffer, so to_decimal is told to ignore it.
+            auto decimal = to_decimal<Float, FloatTraits>(
+                s, exponent_bits, policy::sign::ignore, policy::trailing_zero::ignore,
+                typename PolicyHolder::decimal_to_binary_rounding_policy{},
+                typename PolicyHolder::binary_to_decimal_rounding_policy{},
+                typename PolicyHolder::cache_policy{});
+            return to_chars_detail::to_chars<Float, FloatTraits>(decimal.significand,
+                                                                 decimal.exponent, buffer);
+        }
+    }
+
+    // Formats x at buffer without appending a null terminator and returns the
+    // next-to-end position (one past the last character written).
+    // Policies supplied by the caller are handed to make_policy_holder together with the
+    // defaults listed below: nearest_to_even, to_even and the full cache.
+    template <class Float, class FloatTraits = default_float_traits<Float>, class... Policies>
+    char* to_chars_n(Float x, char* buffer, Policies... policies) noexcept {
+        using namespace jkj::dragonbox::detail::policy_impl;
+
+        using default_policies =
+            base_default_pair_list<base_default_pair<decimal_to_binary_rounding::base,
+                                                     decimal_to_binary_rounding::nearest_to_even>,
+                                   base_default_pair<binary_to_decimal_rounding::base,
+                                                     binary_to_decimal_rounding::to_even>,
+                                   base_default_pair<cache::base, cache::full>>;
+        using policy_holder = decltype(make_policy_holder(default_policies{}, policies...));
+
+        float_bits<Float, FloatTraits> const bits(x);
+        return to_chars_detail::to_chars_n_impl<policy_holder>(bits, buffer);
+    }
+
+    // Same as to_chars_n, but also stores '\0' after the text.
+    // Returns the pointer handed back by to_chars_n, i.e. the address of that terminator.
+    template <class Float, class FloatTraits = default_float_traits<Float>, class... Policies>
+    char* to_chars(Float x, char* buffer, Policies... policies) noexcept {
+        char* const end = to_chars_n<Float, FloatTraits>(x, buffer, policies...);
+        *end = '\0';
+        return end;
+    }
+
+    // Maximum required buffer size (excluding null-terminator): 15 for ieee754_binary32, 24 for any other FloatFormat
+    template <class FloatFormat>
+    inline constexpr std::size_t max_output_string_length =
+        std::is_same_v<FloatFormat, ieee754_binary32>
+            ?
+            // sign(1) + significand(9) + decimal_point(1) + exp_marker(1) + exp_sign(1) + exp(2)
+            (1 + 9 + 1 + 1 + 1 + 2)
+            :
+            // every FloatFormat other than ieee754_binary32 is sized as binary64
+            // sign(1) + significand(17) + decimal_point(1) + exp_marker(1) + exp_sign(1) + exp(3)
+            (1 + 17 + 1 + 1 + 1 + 3);
+}
+
+#endif
--- a/server/gc.h
+++ b/server/gc.h
@ -0,0 +1,59 @@
+#ifndef __AQ_USE_THREADEDGC__
+#include <atomic>
+// Deferred-release collector. reg() queues a pointer together with its deallocator; the
+// thread started by the constructor runs daemon(), which calls gc() to release the
+// queued pointers once the queued size exceeds max_size or the force-clean timer
+// exceeds forced_clean.
+// NOTE(review): the member function gc() and the static pointer GC::gc share one name,
+// which a class cannot declare twice. Renaming either one also requires editing
+// libaquery.cpp, so both are kept as declared.
+class GC {
+private:
+	// Limits fixed at construction. interval and forced_clean are in microseconds
+	// (daemon() passes interval to std::chrono::microseconds).
+	size_t max_size, max_slots, 
+		   interval, forced_clean, 
+		   forceclean_timer = 0;
+	// Written by reg()/terminate_daemon() and read by the daemon thread, so these are
+	// atomics with defined start values; daemon() tests `running` before anything else
+	// has assigned it.
+	std::atomic<bool> running{false}, alive{false};
+//  ptr, dealloc, ref, sz
+	void *q = nullptr, *q_back = nullptr; // queue being filled / queue being released
+	void* handle = nullptr;               // std::thread* created by start_deamon()
+	std::atomic<uint32_t> slot_pos{0};    // next free index in q
+	std::atomic<uint32_t> alive_cnt{0};   // reg() calls currently inside the queue
+	std::atomic<uint64_t> current_size{0};
+	std::atomic<bool> lock{false};        // raised by gc() while it swaps the queues
+	// maybe use std::atomic<std::thread::id> instead
+protected:
+	void acquire_lock();
+	void release_lock();
+	void gc();
+	void daemon();
+	void start_deamon();
+	void terminate_daemon();
+
+public:
+	// Hands v over for release through f. sz is the amount added to the queued-size
+	// counter that is compared against max_size.
+	void reg(void* v, uint32_t sz = 1, 
+			void(*f)(void*) = free
+		);
+
+	// Starts the daemon thread and publishes this object through GC::gc.
+	GC(
+		uint32_t max_size = 0xfffffff, uint32_t max_slots = 4096, 
+		uint32_t interval = 10000, uint32_t forced_clean = 1000000 // one second
+	) : max_size(max_size), max_slots(max_slots), 
+		interval(interval), forced_clean(forced_clean){
+
+		start_deamon();
+		GC::gc = this;
+	} // 256 MB
+
+	~GC(){
+		terminate_daemon();
+	}
+	static GC* gc;
+    constexpr static void(*_free) (void*) = free;
+};
+
+#else
+// Stand-in used when __AQ_USE_THREADEDGC__ is defined: nothing is queued, reg() calls
+// the deallocator straight away.
+class GC {
+public:
+	// Takes (and ignores) the same four tuning values as the queued collector.
+	// A constructor with parameters cannot be defaulted, so it has an empty body.
+	GC(uint32_t = 0, uint32_t = 0, uint32_t = 0, uint32_t = 0) {}
+	// Releases v through f immediately. A null pointer or null deallocator is ignored,
+	// matching the queued collector's reg().
+	void reg(
+		void* v, uint32_t = 0, 
+		void(*f)(void*) = free
+	) const {
+		if (v != nullptr && f != nullptr)
+			f(v);
+	}
+	static GC* gc;
+    constexpr static void(*_free) (void*) = free;
+};
+#endif
--- a/server/jeaiii_to_text.h
+++ b/server/jeaiii_to_text.h
@ -0,0 +1,116 @@
+
+// Copyright (c) 2022 James Edward Anhalt III - https://github.com/jeaiii/itoa
+using u32 = decltype(0xffffffff); // whatever type the compiler gives this literal; the static_asserts below require it to be unsigned and 32 bits
+using u64 = decltype(0xffffffffffffffff); // same approach for the 64-bit literal
+
+static_assert(u32(-1) > 0, "u32 must be unsigned");
+static_assert(u32(0xffffffff) + u32(1) == u32(0), "u32 must be 32 bits");
+static_assert(u64(-1) > 0, "u64 must be unsigned");
+static_assert(u64(0xffffffffffffffff) + u32(1) == u32(0), "u64 must be 64 bits");
+
+constexpr auto digits_00_99 = // "00" through "99" back to back: the two characters for value k start at index 2 * k
+    "00010203040506070809" "10111213141516171819" "20212223242526272829" "30313233343536373839"	"40414243444546474849"
+    "50515253545556575859" "60616263646566676869" "70717273747576777879" "80818283848586878889"	"90919293949596979899";
+
+struct pair { char t, o; }; // two adjacent characters, copied as one unit by JEAIII_W
+
+#define JEAIII_W(I, U) *(pair*)&b[I] = *(pair*)&digits_00_99[(U) * 2] // copy the two table characters for U into b[I], b[I + 1]
+#define JEAIII_A(I, N) t = (u64(1) << (32 + N / 5 * N * 53 / 16)) / u32(1e##N) + 1 + N / 6 - N / 8, t *= u, t >>= N / 5 * N * 53 / 16, t += N / 6 * 4, JEAIII_W(I, t >> 32) // NOTE(review): appears to scale u by a fixed-point reciprocal of 10^N so that t >> 32 is the leading digit pair - confirm against upstream jeaiii/itoa
+#define JEAIII_S(I) b[I] = char(u64(10) * u32(t) >> 32) + '0' // one more digit: 10 * (low 32 bits of t), digit taken from bits 32 and up
+#define JEAIII_D(I) t = u64(100) * u32(t), JEAIII_W(I, t >> 32) // two more digits: t = 100 * (low 32 bits of t), pair taken from t >> 32
+
+#define JEAIII_C0(I) b[I] = char(u) + '0' // JEAIII_Cn writes n + 1 characters starting at b[I]
+#define JEAIII_C1(I) JEAIII_W(I, u)
+#define JEAIII_C2(I) JEAIII_A(I, 1), JEAIII_S(I + 2)
+#define JEAIII_C3(I) JEAIII_A(I, 2), JEAIII_D(I + 2)
+#define JEAIII_C4(I) JEAIII_A(I, 3), JEAIII_D(I + 2), JEAIII_S(I + 4)
+#define JEAIII_C5(I) JEAIII_A(I, 4), JEAIII_D(I + 2), JEAIII_D(I + 4)
+#define JEAIII_C6(I) JEAIII_A(I, 5), JEAIII_D(I + 2), JEAIII_D(I + 4), JEAIII_S(I + 6)
+#define JEAIII_C7(I) JEAIII_A(I, 6), JEAIII_D(I + 2), JEAIII_D(I + 4), JEAIII_D(I + 6)
+#define JEAIII_C8(I) JEAIII_A(I, 7), JEAIII_D(I + 2), JEAIII_D(I + 4), JEAIII_D(I + 6), JEAIII_S(I + 8)
+#define JEAIII_C9(I) JEAIII_A(I, 8), JEAIII_D(I + 2), JEAIII_D(I + 4), JEAIII_D(I + 6), JEAIII_D(I + 8)
+
+#define JEAIII_L(N, A, B) u < u32(1e##N) ? A : B // A when u is below 10^N, otherwise B
+#define JEAIII_L09(F) JEAIII_L(2, JEAIII_L(1, F(0), F(1)), JEAIII_L(6, JEAIII_L(4, JEAIII_L(3, F(2), F(3)), JEAIII_L(5, F(4), F(5))), JEAIII_L(8, JEAIII_L(7, F(6), F(7)), JEAIII_L(9, F(8), F(9))))) // nested comparisons against powers of ten select F(0)..F(9)
+#define JEAIII_L03(F) JEAIII_L(2, JEAIII_L(1, F(0), F(1)), JEAIII_L(3, F(2), F(3))) // same selection restricted to F(0)..F(3)
+
+#define JEAIII_K(N) (JEAIII_C##N(0), b + N + 1) // write u at b, yield the end position
+#define JEAIII_KX(N) (JEAIII_C##N(0), u = x, JEAIII_C7(N + 1), b + N + 9) // write u, then x as 8 characters
+#define JEAIII_KYX(N) (JEAIII_C##N(0), u = y, JEAIII_C7(N + 1), u = x, JEAIII_C7(N + 9), b + N + 17) // write u, then y and x as 8 characters each
+
+template<bool B, class T, class F> struct _cond { using type = F; }; // primary template: B is false, so the type is F
+template<class T, class F> struct _cond<true, T, F> { using type = T; }; // B is true, so the type is T
+template<bool B, class T, class F> using cond = typename _cond<B, T, F>::type; // cond<B, T, F> names T when B holds and F otherwise
+
+template<class T> inline char* to_text_from_integer(char* b, T i) // writes the decimal text of i at b and returns the position after the last character written
+{
+    u64 t = u64(i);
+
+    if (i < T(0))
+        t = u64(0) - t, b[0] = '-', ++b; // negate in unsigned arithmetic and emit the sign
+
+    u32 u = cond<T(1) != T(2), cond<sizeof(T) != 1, cond<sizeof(T) != sizeof(short), u32, unsigned short>, unsigned char>, bool>(t); // t converted through bool (when T(1) == T(2)), unsigned char, unsigned short or u32, chosen from sizeof(T)
+
+    // if our input type fits in 32 bits, or its value does, treat it as 32-bit (the line above ensures the compiler can still know the range limits of the input type)
+    // and optimize out cases for small integer types (if only c++ had a builtin way to get the unsigned type from a signed type)
+    if (sizeof(i) <= sizeof(u) || u == t)
+        return JEAIII_L09(JEAIII_K);
+
+    u32 x = t % 100000000u; // lowest 8 decimal digits
+    u = u32(t /= 100000000u);
+
+    // t / 10^8 (fits in 32 bit), t % 10^8 -> ~17.5 digits
+    if (u == t)
+        return JEAIII_L09(JEAIII_KX);
+
+    // t / 10^16 (1-4 digits), t / 10^8 % 10^8, t % 10^8
+    u32 y = t % 100000000u; // middle 8 decimal digits
+    u = u32(t / 100000000u);
+    return JEAIII_L03(JEAIII_KYX);
+}
+
+inline char* to_text(char text[], signed char i) { return to_text_from_integer(text, i); } // one overload per built-in integer type; each forwards to to_text_from_integer and returns its result
+inline char* to_text(char text[], unsigned char i) { return to_text_from_integer(text, i); }
+inline char* to_text(char text[], short i) { return to_text_from_integer(text, i); }
+inline char* to_text(char text[], unsigned short i) { return to_text_from_integer(text, i); }
+inline char* to_text(char text[], int i) { return to_text_from_integer(text, i); }
+inline char* to_text(char text[], unsigned int i) { return to_text_from_integer(text, i); }
+inline char* to_text(char text[], long i) { return to_text_from_integer(text, i); }
+inline char* to_text(char text[], unsigned long i) { return to_text_from_integer(text, i); }
+inline char* to_text(char text[], long long i) { return to_text_from_integer(text, i); }
+inline char* to_text(char text[], unsigned long long i) { return to_text_from_integer(text, i); }
+
+// Copyright (c) 2022 Bill Sun
+constexpr static __uint128_t _10_19 = 10000000000000000000ull, 
+    _10_37 = _10_19*_10_19 / 10;
+
+// Writes `chunk` as exactly 19 decimal digits (zero-padded on the left) at buf and
+// returns buf + 19. Every chunk after the leading one needs this, because to_text
+// drops leading zeros.
+inline char* jeaiii_pad19(char* buf, uint64_t chunk){
+    for (int i = 18; i >= 0; --i){
+        buf[i] = char('0' + chunk % 10);
+        chunk /= 10;
+    }
+    return buf + 19;
+}
+
+// Writes the decimal text of a 128-bit integer at buf and returns the position one past
+// the last character written; no terminator is appended.
+// The magnitude is split into base-10^19 chunks: high (0..3), mid and low. The leading
+// non-zero chunk is printed without padding, every following chunk with 19 digits.
+template<class T>
+char* jeaiii_i128(char* buf, T v){
+    __uint128_t magnitude = __uint128_t(v);
+    if (v < 0){
+        *(buf++) = '-';
+        // Negate in unsigned arithmetic so the most negative value does not overflow.
+        magnitude = 0 - magnitude;
+    }
+
+    const uint64_t low = uint64_t(magnitude % _10_19);
+    magnitude /= _10_19;
+    const uint64_t mid = uint64_t(magnitude % _10_19);
+    const uint64_t high = uint64_t(magnitude / _10_19); // at most 3 for a 128-bit value
+
+    if (high){
+        *(buf++) = char('0' + high);
+        buf = jeaiii_pad19(buf, mid);
+        return jeaiii_pad19(buf, low);
+    }
+    if (mid){
+        buf = to_text(buf, mid);
+        return jeaiii_pad19(buf, low);
+    }
+    return to_text(buf, low);
+}
--- a/server/libaquery.cpp
+++ b/server/libaquery.cpp
@ -1,20 +1,20 @@
 #include "pch_msc.hpp"

 #include "io.h"
-#include "table.h"
 #include <limits>

 #include <chrono>
 #include <ctime>

 #include "utils.h"
+#include "libaquery.h"
 #include <random>

 char* gbuf = nullptr;

 void setgbuf(char* buf) {
-	static char* b = 0;
-	if (buf == 0)
+	static char* b = nullptr;
+	if (buf == nullptr)
 		gbuf = b;
 	else {
 		gbuf = buf;
@ -63,6 +63,7 @@ T getInt(const char*& buf){
 	}
 	return ret;
 }
+
 template<class T> 
 char* intToString(T val, char* buf){

@ -275,6 +276,43 @@ inline const char* str(const bool& v) {
 	return v ? "true" : "false";
 }

+
+// Allocates the pointer -> deallocator table of the current session, then runs
+// init_session().
+Context::Context() {
+    using memory_map_t = std::unordered_map<void*, deallocator_t>;
+    this->current.memory_map = new memory_map_t;
+    this->init_session();
+}
+
+// Deletes the table allocated by the constructor. Entries still stored in it are not
+// passed to their deallocators here.
+Context::~Context() {
+    using memory_map_t = std::unordered_map<void*, deallocator_t>;
+    delete (memory_map_t*) this->current.memory_map;
+}
+
+// Prepares a new session: zeroes the statistics when log_level is LOG_INFO and empties
+// the pointer -> deallocator table without calling any deallocator.
+void Context::init_session(){
+    using memory_map_t = std::unordered_map<void*, deallocator_t>;
+    const bool reset_stats = (log_level == LOG_INFO);
+    if (reset_stats)
+        memset(&(this->current.stats), 0, sizeof(Session::Statistic));
+    ((memory_map_t*) this->current.memory_map)->clear();
+}
+
+// Ends the session: calls the stored deallocator on every tracked pointer, then empties
+// the table.
+void Context::end_session(){
+    using memory_map_t = std::unordered_map<void*, deallocator_t>;
+    auto& tracked = *((memory_map_t*) this->current.memory_map);
+    for (auto entry = tracked.begin(); entry != tracked.end(); ++entry)
+        entry->second(entry->first);
+    tracked.clear();
+}
+
+// Looks fname up in module_function_maps.
+// Returns the stored pointer, or nullptr when no entry has that name.
+void* Context::get_module_function(const char* fname){
+    using function_map_t = std::unordered_map<std::string, void*>;
+    const auto* registry = static_cast<function_map_t*>(this->module_function_maps);
+    const auto hit = registry->find(fname);
+    if (hit == registry->end())
+        return nullptr;
+    return hit->second;
+}
 // template<typename _Ty>
 // inline void vector_type<_Ty>::out(uint32_t n, const char* sep) const
 // {
@ -288,3 +326,123 @@ inline const char* str(const bool& v) {
 // 	}
 // 	std::cout << ')';
 // }
+
+#include "gc.h"
+#include <vector_type>
+#include <utility>
+#include <thread>
+#include <chrono>
+#ifndef __AQ_USE_THREADEDGC__
+
+struct gcmemory_t{ // one queued release request
+	void* memory; // pointer that will be passed to the deallocator
+	void (*deallocator)(void*); // function that releases `memory`
+};
+
+using memoryqueue_t = gcmemory_t*; // a queue is addressed as a plain array of entries
+void GC::acquire_lock() { // NOTE(review): gc.h declares `lock` as volatile bool, so the std::thread::id comparisons and assignment below do not type-check; nothing in this diff calls this function
+	auto this_tid = std::this_thread::get_id();
+	while(lock != this_tid) // repeat until this thread's id is what `lock` holds
+	{
+		while(lock != this_tid && lock != std::thread::id()) { // wait while `lock` holds some other thread's id
+			std::this_thread::sleep_for(std::chrono::milliseconds(0));
+		}
+		lock = this_tid; // plain store, not a compare-and-swap; the outer loop re-checks it
+	}
+}
+
+void GC::release_lock(){ // NOTE(review): same type mismatch as acquire_lock - `lock` is a volatile bool in gc.h
+	lock = std::thread::id(); // a default-constructed id is the value acquire_lock treats as "unowned"
+}
+
+// Releases every entry queued so far.
+// Under `lock` (and after all reg() calls already in progress have left) the two
+// buffers are exchanged and the counters reset, so reg() can immediately fill the
+// fresh buffer; the deallocators are then called outside the locked section.
+void GC::gc()
+{
+	if (slot_pos == 0)
+		return; // nothing queued
+
+	lock = true;            // keep new reg() calls out of the queue
+	while(alive_cnt > 0);   // wait for reg() calls that are already inside
+	const uint32_t _slot = slot_pos;
+	std::swap(q, q_back);   // the filled buffer becomes the back buffer
+	slot_pos = 0;
+	current_size = 0;
+	// Cleared while reg() is still locked out, so a registration that arrives right
+	// after the unlock sets it again instead of being overwritten.
+	running = false;
+	lock = false;
+
+	auto _q_back = static_cast<memoryqueue_t>(q_back);
+	for(uint32_t i = 0; i < _slot; ++i){
+		if (_q_back[i].memory != nullptr && _q_back[i].deallocator != nullptr)
+			_q_back[i].deallocator(_q_back[i].memory);
+	}
+	// Leave the back buffer zeroed for its next turn as the front buffer.
+	memset(_q_back, 0, sizeof(gcmemory_t) * _slot);
+}
+
+// Loop run by the collector thread until `alive` is cleared.
+// While `running` is set it sleeps `interval` microseconds per round and calls gc()
+// when the queued size exceeds max_size or the force-clean timer exceeds forced_clean.
+// Otherwise it sleeps 10 ms per round. The timer advances in both cases.
+void GC::daemon() {
+	using namespace std::chrono;
+
+	while (alive) {
+		if (!running) {
+			std::this_thread::sleep_for(10ms);
+			forceclean_timer += 10000; // 10 ms expressed in microseconds
+			continue;
+		}
+
+		const bool over_budget = current_size > max_size;
+		if (over_budget || forceclean_timer > forced_clean) {
+			gc();
+			forceclean_timer = 0;
+		}
+		std::this_thread::sleep_for(microseconds(interval));
+		forceclean_timer += interval;
+	}
+}
+
+// Allocates both queue buffers, resets the counters and flags, and starts the collector
+// thread. Called from the constructor.
+void GC::start_deamon() {
+	const size_t capacity = max_slots << 1;
+	// Both buffers hold gcmemory_t entries (gc() and terminate_daemon() treat them as
+	// such) and are value-initialised so that unused slots read as null.
+	q = new gcmemory_t[capacity]();
+	q_back = new gcmemory_t[capacity]();
+	lock = false;
+	slot_pos = 0;
+	current_size = 0;
+	alive_cnt = 0;
+	running = false; // daemon() tests this flag as soon as the thread starts
+	alive = true;
+	handle = new std::thread(&GC::daemon, this);
+}
+
+// Stops the collector thread and frees the queue buffers. Called from the destructor.
+// The thread is joined before the buffers are deleted, because daemon() may still be
+// inside gc() reading them. join() blocks until the thread has left its loop, so no
+// extra sleep is needed.
+// Entries still queued at this point are not passed to their deallocators.
+void GC::terminate_daemon() {
+	running = false;
+	alive = false;
+
+	auto _handle = static_cast<std::thread*>(handle);
+	if (_handle != nullptr) {
+		if (_handle->joinable()) {
+			_handle->join();
+		}
+		delete _handle;
+		handle = nullptr;
+	}
+
+	delete[] static_cast<memoryqueue_t>(q);
+	delete[] static_cast<memoryqueue_t>(q_back);
+	q = nullptr;
+	q_back = nullptr;
+}
+
+// Hands v over for release through f.
+//   v or f null          : ignored
+//   sz below the cut-off : released immediately
+//   queue full           : released immediately
+//   otherwise            : stored in the front queue and released by a later gc()
+void GC::reg(void* v, uint32_t sz, void(*f)(void*)) { //~ 40ns expected v. free ~ 75ns
+	if (v == nullptr || f == nullptr)
+		return;
+	// NOTE(review): the original compared against an undeclared `threshould`. 1 keeps
+	// the default sz of 1 on the queued path and releases only sz == 0 directly;
+	// confirm the intended cut-off.
+	constexpr uint32_t threshold = 1;
+	if (sz < threshold){
+		f(v);
+		return;
+	}
+
+	// Enter the queue: announce this call through alive_cnt first, then re-check the
+	// lock. If gc() raised it in between, step back out and wait, so gc() never swaps
+	// the buffers while a registration is in progress.
+	for (;;) {
+		while(lock);
+		++alive_cnt;
+		if (!lock)
+			break;
+		--alive_cnt;
+	}
+
+	auto _q = static_cast<memoryqueue_t>(q);
+	const uint32_t _slot = slot_pos++; // value before the increment: slots start at 0
+	const bool queued = _slot < (max_slots << 1);
+	if (queued) {
+		_q[_slot] = {v, f};
+		current_size += sz;
+	}
+	else {
+		--slot_pos; // undo the reservation; the buffer has no room left
+	}
+	--alive_cnt;
+	running = true;
+
+	if (!queued)
+		f(v);
+}
+
+#endif
+
+// Out-of-class definition of the static pointer declared in gc.h; the `static` keyword
+// belongs only on the in-class declaration.
+GC* GC::gc = nullptr;
--- a/server/libaquery.h
+++ b/server/libaquery.h
@ -1,6 +1,12 @@
 #ifndef _AQUERY_H
 #define _AQUERY_H

+#ifdef __INTELLISENSE__
+	#define __AQUERY_ITC_USE_SEMPH__
+	#define THREADING
+	#define __AQ_THREADED_GC__
+#endif
+
 #include "table.h"
 #include <unordered_map>
 #include <chrono>
@ -55,7 +61,10 @@ struct Context{
 #ifdef THREADING
 	void* thread_pool;
 #endif	
-	printf_type print = printf;
+#ifdef __AQ_THREADED_GC__
+	void* gc;
+#endif
+	printf_type print = &printf;
 	Context();
 	virtual ~Context();
 	template <class ...Types>
--- a/server/monetdb_conn.cpp
+++ b/server/monetdb_conn.cpp
@ -6,6 +6,7 @@
 #include "monetdb_conn.h"
 #include "monetdbe.h"
 #include "table.h"
+
 #undef static_assert

 const char* monetdbe_type_str[] = {
@ -121,6 +122,8 @@ bool Server::haserror(){
        return false;
    }
 }
+
+
 void Server::print_results(const char* sep, const char* end){

    if (!haserror()){
@ -138,6 +141,7 @@ void Server::print_results(const char* sep, const char* end){
                std::string(types::printf_str[types::monetdbe_type_aqtypes[cols[i]->type]]) 
                + (i < ncols - 1 ? sep : "");
            puts(printf_string[i].c_str());
+            puts(monetdbe_type_str[cols[i]->type]);
            col_data[i] = static_cast<char *>(cols[i]->data);
            szs [i] = monetdbe_type_szs[cols[i]->type];
            header_string = header_string + cols[i]->name + sep + '|' + sep;
@ -179,7 +183,7 @@ void* Server::getCol(int col_idx){
            auto _ret_col = static_cast<monetdbe_column*>(this->ret_col);
            cnt = _ret_col->count;
             printf("Dbg: Getting col %s, type: %s\n", 
-                 _ret_col->name, monetdbe_type_str[_ret_col->type]);
+                _ret_col->name, monetdbe_type_str[_ret_col->type]);
            return _ret_col->data;
        }
        else{
@ -198,10 +202,10 @@ Server::~Server(){

 bool Server::havehge() {
 #if defined(_MONETDBE_LIB_) and defined(HAVE_HGE)
-    puts("true");
+    // puts("true");
    return HAVE_HGE;
 #else
-    puts("false");
+    // puts("false");
    return false;
 #endif
 }
--- a/server/monetdb_conn.h
+++ b/server/monetdb_conn.h
@ -24,6 +24,7 @@ struct Server{
    static bool havehge();
    void test(const char*);
    void print_results(const char* sep = " ", const char* end = "\n");
+    friend void print_monetdb_results(Server* srv, const char* sep, const char* end, int limit);
    ~Server();
 };

--- a/server/server.cpp
+++ b/server/server.cpp
@ -1,6 +1,5 @@
 #include "pch_msc.hpp"

-#include "../csv.h"
 #include <iostream>
 #include <string>
 #include <chrono>
@ -10,28 +9,35 @@
 #ifdef THREADING
 #include "threading.h"
 #endif
+
 #ifdef _WIN32
 #include "winhelper.h"
 #else 
 #include <dlfcn.h>
 #include <fcntl.h>
 #include <sys/mman.h>
+
+// fast numeric to string conversion
+#include "jeaiii_to_text.h"
+#include "dragonbox/dragonbox_to_chars.h"
+
 struct SharedMemory
 {
    std::atomic<bool> a;
    int hFileMap;
    void* pData;
-    SharedMemory(const char* fname) {
+    explicit SharedMemory(const char* fname) {
        hFileMap = open(fname, O_RDWR, 0);
        if (hFileMap != -1)
-            pData = mmap(NULL, 8, PROT_READ | PROT_WRITE, MAP_SHARED, hFileMap, 0);
+            pData = mmap(nullptr, 8, PROT_READ | PROT_WRITE, MAP_SHARED, hFileMap, 0);
        else 
-            pData = 0;
+            pData = nullptr;
    }
-    void FreeMemoryMap() {
-
+    void FreeMemoryMap() const {
+        // nothing to do: POSIX removes the mapping automatically when the process exits
    }
 };
+
 #ifndef __USE_STD_SEMAPHORE__
 #ifdef __APPLE__
 #include <dispatch/dispatch.h>
@ -74,9 +80,10 @@ public:
 };
 #endif
 #endif
-
 #endif
+
 #ifdef __USE_STD_SEMAPHORE__
+#define __AQUERY_ITC_USE_SEMPH__
 #include <semaphore>
 class A_Semaphore {
 private:
@ -94,6 +101,7 @@ public:
    ~A_Semaphore() { }
 };
 #endif
+
 #ifdef __AQUERY_ITC_USE_SEMPH__
 A_Semaphore prompt{ true }, engine{ false };
 #define PROMPT_ACQUIRE() prompt.acquire()
@ -107,11 +115,9 @@ A_Semaphore prompt{ true }, engine{ false };
 #define ENGINE_RELEASE() 
 #endif

-#include "aggregations.h"
 typedef int (*code_snippet)(void*);
 typedef void (*module_init_fn)(Context*);

-int test_main();

 int n_recv = 0;
 char** n_recvd = nullptr;
@ -119,6 +125,7 @@ char** n_recvd = nullptr;
 __AQEXPORT__(void) wait_engine(){
    PROMPT_ACQUIRE();
 }
+
 __AQEXPORT__(void) wake_engine(){
    ENGINE_RELEASE();
 }
@ -152,42 +159,6 @@ __AQEXPORT__(bool) have_hge(){
 #endif
 }

-Context::Context() {
-    current.memory_map = new std::unordered_map<void*, deallocator_t>;
-    init_session();
-}
-
-Context::~Context() {
-    auto memmap = (std::unordered_map<void*, deallocator_t>*) this->current.memory_map;
-    delete memmap;
-}
-
-void Context::init_session(){
-    if (log_level == LOG_INFO){
-        memset(&(this->current.stats), 0, sizeof(Session::Statistic));
-    }
-    auto memmap = (std::unordered_map<void*, deallocator_t>*) this->current.memory_map;
-    memmap->clear();
-}
-
-void Context::end_session(){
-    auto memmap = (std::unordered_map<void*, deallocator_t>*) this->current.memory_map;
-    for (auto& mem : *memmap) {
-        mem.second(mem.first);
-    }
-    memmap->clear();
-}
-
-void* Context::get_module_function(const char* fname){
-    auto fmap = static_cast<std::unordered_map<std::string, void*>*>
-        (this->module_function_maps);
-    // printf("%p\n", fmap->find("mydiv")->second);
-    //  for (const auto& [key, value] : *fmap){
-    //      printf("%s %p\n", key.c_str(), value);
-    //  }
-    auto ret = fmap->find(fname);
-    return ret == fmap->end() ? nullptr : ret->second;
-}

 void initialize_module(const char* module_name, void* module_handle, Context* cxt){
    auto _init_module = reinterpret_cast<module_init_fn>(dlsym(module_handle, "init_session"));
@ -253,7 +224,7 @@ int dll_main(int argc, char** argv, Context* cxt){
                                timer.reset();
                                server->exec(n_recvd[i] + 1);
                                cfg->stats.monet_time += timer.elapsed();
-                                printf("Exec Q%d: %s", i, n_recvd[i]);
+                                // printf("Exec Q%d: %s", i, n_recvd[i]);
                            }
                            break;
                        case 'P': // Postprocessing procedure 
@ -313,7 +284,7 @@ int dll_main(int argc, char** argv, Context* cxt){
                        dlclose(handle);
                        handle = nullptr;
                    }
-                    printf("%ld, %ld", cfg->stats.monet_time, cfg->stats.postproc_time);
+                    printf("%lld, %lld", cfg->stats.monet_time, cfg->stats.postproc_time);
                    cxt->end_session();
                    n_recv = 0;
                }
@ -370,20 +341,21 @@ extern "C" int __DLLEXPORT__ main(int argc, char** argv) {
 #ifdef __AQ_BUILD_LAUNCHER__
   return launcher(argc, argv);
 #endif
-   puts("running");
+   // puts("running");
   Context* cxt = new Context();
-   cxt->log("%d %s\n", argc, argv[1]);
+   // cxt->log("%d %s\n", argc, argv[1]);

 #ifdef THREADING
    auto tp = new ThreadPool();
    cxt->thread_pool = tp;
 #endif
    
+#ifdef __AQ_THREADED_GC__
+    cxt->gc_thread = new std::thread(gc_thread, cxt);
+#endif    
   const char* shmname;
   if (argc < 0)
        return dll_main(argc, argv, cxt);
-   else if (argc <= 1)
-        return test_main();
   else
       shmname = argv[1];
   SharedMemory shm = SharedMemory(shmname);
@ -417,56 +389,3 @@ extern "C" int __DLLEXPORT__ main(int argc, char** argv) {
   return 0;
 }

-#include "utils.h"
-#include "table_ext_monetdb.hpp"
-int test_main()
-{
-    Context* cxt = new Context();
-    if (cxt->alt_server == 0)
-        cxt->alt_server = new Server(cxt);
-    Server* server = reinterpret_cast<Server*>(cxt->alt_server);
-
-    const char* qs[]= {
-        "QCREATE TABLE trade(stocksymbol INT, time INT, quantity INT, price INT);",
-        "QCOPY OFFSET 2 INTO trade FROM  'w:/gg/AQuery++/data/trade_numerical.csv'  ON SERVER    USING DELIMITERS  ',';",
-        "QSELECT stocksymbol, (SUM((quantity * price)) / SUM(quantity)) AS weighted_average  FROM trade GROUP BY stocksymbol  ;",
-        "Pdll_5lYrMY",
-        "QSELECT stocksymbol, price  FROM trade ORDER BY time  ;",
-        "Pdll_4Sg6Ri",
-        "QSELECT stocksymbol, quantity, price  FROM trade ORDER BY time  ;",
-        "Pdll_5h4kL2",
-        "QSELECT stocksymbol, price  FROM trade ORDER BY time  ;",
-        "Pdll_7tEWCO",
-        "QSELECT query_c.weighted_moving_averages, query_c.stocksymbol  FROM query_c;",
-        "Pdll_7FCPnF"
-    };
-    n_recv = sizeof(qs)/(sizeof (char*));
-	n_recvd = const_cast<char**>(qs);
-            void* handle = 0;
-                    handle = dlopen("./dll.so", RTLD_LAZY);
-                    cxt->init_session();
-                    for (int i = 0; i < n_recv; ++i)
-                    {
-                        //printf("%s, %d\n", n_recvd[i], n_recvd[i][0] == 'Q');
-                        switch (n_recvd[i][0]) {
-                        case 'Q': // SQL query for monetdbe
-                        {
-                            server->exec(n_recvd[i] + 1);
-                            printf("Exec Q%d: %s\n", i, n_recvd[i]);
-                        }
-                        break;
-                        case 'P': // Postprocessing procedure 
-                            if (handle && !server->haserror()) {
-                                code_snippet c = reinterpret_cast<code_snippet>(dlsym(handle, n_recvd[i] + 1));
-                                c(cxt);
-                            }
-                            break;
-                        }
-                    }
-                    n_recv = 0;
-
-    //static_assert(std::is_same_v<decltype(fill_integer_array<5, 1>()), std::integer_sequence<bool, 1,1,1,1,1>>, "");
-    
-    return 0;
-}
-
--- a/server/utils.h
+++ b/server/utils.h
@ -1,5 +1,9 @@
 #pragma once
+
 #include <ctime>
+#include <type_traits>
+#include<string>
+
 #if ((defined(_MSVC_LANG) && _MSVC_LANG >= 201703L) || __cplusplus >= 201703L)
 constexpr static bool cpp_17 = true;
 #else
@ -10,5 +14,19 @@ inline const char* str(const T& v) {
 	return "";
 }

-#include<string>
+// Writes the decimal text of t at buf, most significant digit first, with a leading '-'
+// for negative values and "0" for zero. No terminator is written.
+// Returns the position one past the last character written.
+template <class T>
+constexpr char* aq_itoa(T t, char* buf){
+	if constexpr (std::is_signed<T>::value){
+		if (t < 0){
+			*buf++ = '-';
+		}
+	}
+	char* const first_digit = buf;
+	// Digits come out least significant first. t is never negated (its remainder is),
+	// so the most negative value of T is handled without overflow.
+	do {
+		const int digit = static_cast<int>(t % 10);
+		*buf++ = static_cast<char>('0' + (digit < 0 ? -digit : digit));
+		t /= 10;
+	} while (t != 0);
+	// Put the digits into reading order.
+	for (char *lo = first_digit, *hi = buf - 1; lo < hi; ++lo, --hi){
+		const char tmp = *lo;
+		*lo = *hi;
+		*hi = tmp;
+	}
+	return buf;
+}
+
 extern std::string base62uuid(int l = 6);