From d6e3e4878ee72347e53254b528d4583624c6fdac Mon Sep 17 00:00:00 2001
From: Bill <sunyinqi0508@gmail.com>
Date: Mon, 28 Nov 2022 04:32:08 +0800
Subject: [PATCH] Optimized hashtable performance; Stored procedures

---
 Makefile                 |    2 +-
 README.md                |    4 +
 prompt.py                |   17 +-
 reconstruct/ast.py       |   19 +-
 server/aggregations.h    |    8 +-
 server/hasher.h          |   39 +-
 server/libaquery.h       |    7 +-
 server/server.cpp        |  179 ++++-
 server/table.h           |   17 +-
 server/unordered_dense.h | 1516 ++++++++++++++++++++++++++++++++++++++
 server/vector_type.hpp   |   32 +-
 11 files changed, 1778 insertions(+), 62 deletions(-)
 create mode 100644 server/unordered_dense.h
diff --git a/Makefile b/Makefile
index 21b55bd..c438529 100644
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,7 @@ OS_SUPPORT =
 MonetDB_LIB = 
 MonetDB_INC = 
 Defines = 
-CXXFLAGS = --std=c++1z
+CXXFLAGS = --std=c++2a
 ifeq ($(AQ_DEBUG), 1)
 	OPTFLAGS = -g3 -fsanitize=address -fsanitize=leak
 	LINKFLAGS = 
diff --git a/README.md b/README.md
index 14abd61..7d72430 100644
--- a/README.md
+++ b/README.md
@@ -343,3 +343,7 @@ SELECT * FROM my_table WHERE c1 > 10
 
 - [MonetDB](https://www.monetdb.org) <br>
   License (Mozilla Public License): https://github.com/MonetDB/MonetDB/blob/master/license.txt
+
+- [ankerl::unordered_dense](https://github.com/martinus/unordered_dense)<br>
+  Author: Martin Ankerl <br>
+  License (MIT): http://opensource.org/licenses/MIT <br>
diff --git a/prompt.py b/prompt.py
index 3afb22c..4e72ceb 100644
--- a/prompt.py
+++ b/prompt.py
@@ -613,16 +613,21 @@ def prompt(running = lambda:True, next = lambda:input('> '), state : Optional[Pr
             elif q.startswith('procedure'):
                 qs = re.split(r'[ \t\r\n]', q)
                 procedure_help = '''Usage: procedure <procedure_name> [record|stop|run|remove|save|load]'''
-                send_to_server = lambda str: state.send(1, ctypes.c_char_p(bytes(str, 'utf-8')))
+                def send_to_server(payload : str):  # Hands one UTF-8 encoded control message to the server through `state`.
+                    state.payload = (ctypes.c_char_p*1)(ctypes.c_char_p(bytes(payload, 'utf-8')))  # one-element char* array; stored on state, presumably so the buffer outlives this call (confirm)
+                    state.cfg.has_dll = 0  # cleared before sending; NOTE(review): presumably tells the server no query dll accompanies this message (confirm)
+                    state.send(1, state.payload)  # first argument is presumably the number of entries in the array (confirm against state.send)
+                    state.set_ready()
                 if len(qs) > 2:
                     if qs[2].lower() =='record':
-                        if state.current_procedure != qs[1]:             
+                        if state.current_procedure is not None and state.current_procedure != qs[1]:             
                             print(f'Cannot record 2 procedures at the same time. Stop recording {state.current_procedure} first.')
-                        elif not state.current_procedure:
+                        elif state.current_procedure is None:
                             state.current_procedure = qs[1]
-                            send_to_server(f'R\0{qs[1]}', 'utf-8')
+                            send_to_server(f'R\0{qs[1]}')
                     elif qs[2].lower() == 'stop':
                         send_to_server(f'RT\0{qs[1]}')
+                        state.current_procedure = None
                     else:
                         if state.current_procedure:
                             print(f'Procedure manipulation commands are disallowed during procedure recording.')
@@ -635,6 +640,10 @@ def prompt(running = lambda:True, next = lambda:input('> '), state : Optional[Pr
                             send_to_server(f'RS\0{qs[1]}')
                         elif qs[2].lower() == 'load':
                             send_to_server(f'RL\0{qs[1]}')
+                     
+                elif len(qs) > 1:
+                    if qs[1].lower() == 'display':
+                        send_to_server(f'Rd\0')
                 else:
                     print(procedure_help)
                 continue
diff --git a/reconstruct/ast.py b/reconstruct/ast.py
index 07efd77..870df5b 100644
--- a/reconstruct/ast.py
+++ b/reconstruct/ast.py
@@ -383,12 +383,13 @@ class projection(ast_node):
         if self.group_node is not None and self.group_node.use_sp_gb:
             gb_vartable : Dict[str, Union[str, int]] = deepcopy(self.pyname2cname)
             gb_cexprs : List[str] = []
-            
+            gb_colnames : List[str] = []
             for key, val in proj_map.items():
                 col_name = 'col_' + base62uuid(6)
                 self.context.emitc(f'decltype(auto) {col_name} = {self.out_table.contextname_cpp}->get_col<{key}>();')
                 gb_cexprs.append((col_name, val[2]))
-            self.group_node.finalize(gb_cexprs, gb_vartable)
+                gb_colnames.append(col_name)
+            self.group_node.finalize(gb_cexprs, gb_vartable, gb_colnames)
         else:
             for i, (key, val) in enumerate(proj_map.items()):
                 if type(val[1]) is int:
@@ -536,7 +537,7 @@ class groupby_c(ast_node):
     
     def produce(self, node : List[Tuple[expr, Set[ColRef]]]):
         self.context.headers.add('"./server/hasher.h"')
-        self.context.headers.add('unordered_map')
+        # self.context.headers.add('unordered_map')
         self.group = 'g' + base62uuid(7)
         self.group_type = 'record_type' + base62uuid(7)
         self.datasource = self.proj.datasource
@@ -565,8 +566,9 @@ class groupby_c(ast_node):
             [f'{c}[{scanner_itname}]' for c in g_contents_list]
         )
         self.context.emitc(f'typedef record<{",".join(g_contents_decltype)}> {self.group_type};')
-        self.context.emitc(f'unordered_map<{self.group_type}, vector_type<uint32_t>, '
+        self.context.emitc(f'ankerl::unordered_dense::map<{self.group_type}, vector_type<uint32_t>, '
             f'transTypes<{self.group_type}, hasher>> {self.group};')
+        self.context.emitc(f'{self.group}.reserve({first_col}.size);')
         self.n_grps = len(self.glist)
         self.scanner = scan(self, first_col + '.size', it_name=scanner_itname)
         self.scanner.add(f'{self.group}[forward_as_tuple({g_contents})].emplace_back({self.scanner.it_var});')
@@ -581,7 +583,10 @@ class groupby_c(ast_node):
     #     gscanner.add(f'{self.datasource.cxt_name}->order_by<{assumption.result()}>(&{val_var});')
     #     gscanner.finalize()
         
-    def finalize(self, cexprs : List[Tuple[str, expr]], var_table : Dict[str, Union[str, int]]):
+    def finalize(self, cexprs : List[Tuple[str, expr]], var_table : Dict[str, Union[str, int]], col_names : List[str]):
+        for c in col_names:
+            self.context.emitc(f'{c}.reserve({self.group}.size());')
+        
         gscanner = scan(self, self.group, loop_style = 'for_each')
         key_var = 'key_'+base62uuid(7)
         val_var = 'val_'+base62uuid(7)
@@ -713,10 +718,10 @@ class groupby(ast_node):
                 #     self.parent.var_table.
                 self.parent.col_ext.update(l[1])    
                 
-    def finalize(self, cexprs : List[Tuple[str, expr]], var_table : Dict[str, Union[str, int]]):
+    def finalize(self, cexprs : List[Tuple[str, expr]], var_table : Dict[str, Union[str, int]], col_names : List[str]):
         if self.use_sp_gb:
             self.dedicated_gb = groupby_c(self.parent, self.dedicated_glist)
-            self.dedicated_gb.finalize(cexprs, var_table)
+            self.dedicated_gb.finalize(cexprs, var_table, col_names)
 
 
 class join(ast_node):
diff --git a/server/aggregations.h b/server/aggregations.h
index cb4bcbe..0f1d8f8 100644
--- a/server/aggregations.h
+++ b/server/aggregations.h
@@ -31,7 +31,7 @@ double avg(const VT<T>& v) {
 
 template<class T, template<typename ...> class VT>
 VT<double> sqrt(const VT<T>& v) {
-	VT<double> ret{ v.size };
+	VT<double> ret(v.size);
 	for (uint32_t i = 0; i < v.size; ++i) {
 		ret[i] = sqrt(v[i]);
 	}
@@ -52,7 +52,7 @@ VT<T> truncate(const VT<T>& v, const uint32_t precision) {
 		return v.subvec_memcpy();
 	auto multiplier = pow(10, precision);
 	auto max_truncate = std::numeric_limits<T>::max()/multiplier;
-	VT<T> ret{ v.size };
+	VT<T> ret(v.size);
 	for (uint32_t i = 0; i < v.size; ++i) { // round or trunc??
 		ret[i] = v[i] < max_truncate ? round(v[i] * multiplier)/multiplier : v[i];
 	}
@@ -102,7 +102,7 @@ decayed_t<VT, T> maxs(const VT<T>& arr) {
 template<class T, template<typename ...> class VT>
 decayed_t<VT, T> minw(uint32_t w, const VT<T>& arr) {
 	const uint32_t& len = arr.size;
-	decayed_t<VT, T> ret{ len };
+	decayed_t<VT, T> ret(len);
 	std::deque<std::pair<T, uint32_t>> cache;
 	for (int i = 0; i < len; ++i) {
 		if (!cache.empty() && cache.front().second == i - w) cache.pop_front();
@@ -194,7 +194,7 @@ decayed_t<VT, types::GetFPType<types::GetLongType<T>>> avgw(uint32_t w, const VT
 	uint32_t i = 0;
 	types::GetLongType<T> s{};
 	w = w > len ? len : w;
-	if (len)	s = ret[i++] = arr[0];
+	if (len) s = ret[i++] = arr[0];
 	for (; i < w; ++i)
 		ret[i] = (s += arr[i]) / (FPType)(i + 1);
 	for (; i < len; ++i)
diff --git a/server/hasher.h b/server/hasher.h
index 70a97e8..30330dd 100644
--- a/server/hasher.h
+++ b/server/hasher.h
@@ -3,7 +3,10 @@
 #include <type_traits>
 #include <tuple>
 #include <functional>
+#include <string_view>
 #include "types.h"
+// #include "robin_hood.h"
+#include "unordered_dense.h"
 // only works for 64 bit systems
 constexpr size_t _FNV_offset_basis = 14695981039346656037ULL;
 constexpr size_t _FNV_prime = 1099511628211ULL;
@@ -14,7 +17,7 @@ inline size_t append_bytes(const unsigned char* _First) noexcept {
 		_Val ^= static_cast<size_t>(*_First);
 		_Val *= _FNV_prime;
 	}
-
+	
 	return _Val;
 }
 
@@ -65,37 +68,44 @@ struct hasher {
 #else
 		#define _current_type current_type
 #endif
-		return std::hash<_current_type>()(std::get<i>(record)) ^ hashi<i + 1>(record);
+		return ankerl::unordered_dense::hash<_current_type>()(std::get<i>(record)) ^ hashi<i + 1>(record);
 	}
 	size_t operator()(const std::tuple<Types...>& record) const {
 		return hashi(record);
 	}
 };
+template <class T> // Specialization of hasher for records with exactly one element type.
+struct hasher<T>{
+	size_t operator()(const std::tuple<T>& record) const { // hashes the sole tuple element directly
+		return ankerl::unordered_dense::hash<T>()(std::get<0>(record));
+	}
+};
 
-
-namespace std{
-
+namespace ankerl::unordered_dense{
 	template<>
 	struct hash<astring_view> {
 		size_t operator()(const astring_view& _Keyval) const noexcept {
-			return append_bytes(_Keyval.str);
+			
+			return ankerl::unordered_dense::hash<std::string_view>()(_Keyval.rstr);
+			//return append_bytes(_Keyval.str);
+			
 		}
 	};
 
 	template<>
 	struct hash<types::date_t> {
 		size_t operator() (const types::date_t& _Keyval) const noexcept {
-			return std::hash<unsigned int>()(*(unsigned int*)(&_Keyval));
+			return ankerl::unordered_dense::hash<unsigned int>()(*(unsigned int*)(&_Keyval));
 		}
 	};
 
 	template<>
 	struct hash<types::time_t> {
 		size_t operator() (const types::time_t& _Keyval) const noexcept {
-			return std::hash<unsigned int>()(_Keyval.ms) ^ 
-			std::hash<unsigned char>()(_Keyval.seconds) ^
-			std::hash<unsigned char>()(_Keyval.minutes) ^
-			std::hash<unsigned char>()(_Keyval.hours)
+			return ankerl::unordered_dense::hash<unsigned int>()(_Keyval.ms) ^ 
+			ankerl::unordered_dense::hash<unsigned char>()(_Keyval.seconds) ^
+			ankerl::unordered_dense::hash<unsigned char>()(_Keyval.minutes) ^
+			ankerl::unordered_dense::hash<unsigned char>()(_Keyval.hours)
 			;
 		}
 	};
@@ -103,8 +113,8 @@ namespace std{
 	template<>
 	struct hash<types::timestamp_t>{
 		size_t operator() (const types::timestamp_t& _Keyval) const noexcept {
-			return std::hash<types::date_t>()(_Keyval.date) ^ 
-				std::hash<types::time_t>()(_Keyval.time);
+			return ankerl::unordered_dense::hash<types::date_t>()(_Keyval.date) ^ 
+				ankerl::unordered_dense::hash<types::time_t>()(_Keyval.time);
 		}
 	};
 #ifdef __SIZEOF_INT128__
@@ -112,12 +122,11 @@ namespace std{
 	template<>
 	struct hash<int128_struct>{
 		size_t operator() (const int128_struct& _Keyval) const noexcept {
-			return std::hash<uint64_t>()(_Keyval.__struct.low) ^ std::hash<uint64_t>()(_Keyval.__struct.high);
+			return ankerl::unordered_dense::hash<uint64_t>()(_Keyval.__struct.low) ^ ankerl::unordered_dense::hash<uint64_t>()(_Keyval.__struct.high);
 		}
 	};
 #endif
 	template <class ...Types>
 	struct hash<std::tuple<Types...>> : public hasher<Types...>{ };
-
 }
 
diff --git a/server/libaquery.h b/server/libaquery.h
index 6bfc98c..981cd57 100644
--- a/server/libaquery.h
+++ b/server/libaquery.h
@@ -109,10 +109,9 @@ struct Context{
 	void init_session();
 	void end_session();
 	void* get_module_function(const char*);
-	std::unordered_map<const char*, void*> tables;
-    std::unordered_map<const char*, uColRef *> cols;
-    std::unordered_map<const char*, void*> loaded_modules;
-    std::unordered_map<const char*, StoredProcedure> stored_proc;
+	std::unordered_map<std::string, void*> tables;
+    std::unordered_map<std::string, uColRef *> cols;
+    std::unordered_map<std::string, StoredProcedure> stored_proc;
 };
 
 
diff --git a/server/server.cpp b/server/server.cpp
index dd55597..12f0aed 100644
--- a/server/server.cpp
+++ b/server/server.cpp
@@ -163,6 +163,20 @@ __AQEXPORT__(bool) have_hge(){
 
 using prt_fn_t = char* (*)(void*, char*);
 
+// Returns a malloc'ed, NUL-terminated copy of str (any type with size() and c_str()); the caller must free() it.
+template<class String_T>
+char* to_lpstr(const String_T& str){
+    auto ret = static_cast<char*>(malloc(str.size() + 1)); // + 1 for the terminator; the result is not null-checked
+    memcpy(ret, str.c_str(), str.size());
+    ret[str.size()] = '\0'; // terminate explicitly: only size() bytes were copied above
+    return ret;
+}
+char* copy_lpstr(const char* str){ // Returns a malloc'ed duplicate of the C string str; the caller must free() it.
+    auto len = strlen(str);
+    auto ret = static_cast<char*>(malloc(len + 1)); // the result is not null-checked
+    memcpy(ret, str, len + 1); // len + 1 also copies the NUL terminator
+    return ret;
+}
 
 constexpr prt_fn_t monetdbe_prtfns[] = {
 	aq_to_chars<bool>, aq_to_chars<int8_t>, aq_to_chars<int16_t>, aq_to_chars<int32_t>, 
@@ -270,7 +284,18 @@ int dll_main(int argc, char** argv, Context* cxt){
     aq_timer timer;
     Config *cfg = reinterpret_cast<Config *>(argv[0]);
     std::unordered_map<std::string, void*> user_module_map;
+    std::string pwd = std::filesystem::current_path().c_str();
+    auto sep = std::filesystem::path::preferred_separator;
+    pwd += sep;
+    std::string procedure_root = pwd + "procedures" + sep;
     std::string procedure_name = "";
+    StoredProcedure current_procedure;
+    vector_type<char *> recorded_queries;
+    vector_type<void *> recorded_libraries;
+    bool procedure_recording = false, 
+         procedure_replaying = false;
+    uint32_t procedure_module_cursor = 0;
+
     if (cxt->module_function_maps == nullptr)
         cxt->module_function_maps = new std::unordered_map<std::string, void*>();
     auto module_fn_map = 
@@ -291,12 +316,12 @@ int dll_main(int argc, char** argv, Context* cxt){
         puts(*(const char**)(alt_server->getCol(0)));
         cxt->alt_server = alt_server;
     }
-    bool rec = false;
     while(cfg->running){
         ENGINE_ACQUIRE();
         if (cfg->new_query) {
             cfg->stats.postproc_time = 0;
             cfg->stats.monet_time = 0;
+start:
 
             void *handle = nullptr;
             void *user_module_handle = nullptr;
@@ -306,7 +331,28 @@ int dll_main(int argc, char** argv, Context* cxt){
                 Server* server = reinterpret_cast<Server*>(cxt->alt_server);
                 if(n_recv > 0){
                     if (cfg->backend_type == BACKEND_AQuery || cfg->has_dll) {
-                        handle = dlopen("./dll.so", RTLD_NOW);
+                        const char* proc_name = "./dll.so";
+                        std::string dll_path;
+                        if (procedure_recording) {
+                            dll_path = procedure_root + 
+                                procedure_name + std::to_string(recorded_libraries.size) + ".so";
+                            
+                            try{
+                                if (std::filesystem::exists(dll_path))
+                                    std::filesystem::remove(dll_path);
+                                std::filesystem::copy_file(proc_name, dll_path);
+                            } catch(std::filesystem::filesystem_error& e){
+                                puts(e.what());
+                                dll_path = proc_name;
+                            }
+                            proc_name = dll_path.c_str();
+                            if(recorded_libraries.size)
+                                recorded_queries.emplace_back(copy_lpstr("N"));
+                        }
+                        handle = dlopen(proc_name, RTLD_NOW);
+                        if (procedure_recording) {
+                            recorded_libraries.emplace_back(handle);
+                        }
                     }
                     for (const auto& module : user_module_map){
                         initialize_module(module.first.c_str(), module.second, cxt);
@@ -314,18 +360,24 @@ int dll_main(int argc, char** argv, Context* cxt){
                     cxt->init_session();
                     for(int i = 0; i < n_recv; ++i)
                     {
-                        //printf("%s, %d\n", n_recvd[i], n_recvd[i][0] == 'Q');
+                        printf("%s, %d\n", n_recvd[i], n_recvd[i][0] == 'Q');
                         switch(n_recvd[i][0]){
                         case 'Q': // SQL query for monetdbe
                             {
+                                if(procedure_recording){
+                                    recorded_queries.emplace_back(copy_lpstr(n_recvd[i]));
+                                }
                                 timer.reset();
                                 server->exec(n_recvd[i] + 1);
                                 cfg->stats.monet_time += timer.elapsed();
-                                // printf("Exec Q%d: %s", i, n_recvd[i]);
+                                printf("Exec Q%d: %s", i, n_recvd[i]);
                             }
                             break;
                         case 'P': // Postprocessing procedure 
                             if(handle && !server->haserror()) {
+                                if (procedure_recording) {
+                                    recorded_queries.emplace_back(copy_lpstr(n_recvd[i]));
+                                }
                                 code_snippet c = reinterpret_cast<code_snippet>(dlsym(handle, n_recvd[i]+1));
                                 timer.reset();
                                 c(cxt);
@@ -359,6 +411,12 @@ int dll_main(int argc, char** argv, Context* cxt){
                         case 'O':
                             {
                                 if(!server->haserror()){
+                                    if (procedure_recording){
+                                        char* buf = (char*) malloc (sizeof(char) * 6);
+                                        memcpy(buf, n_recvd[i], 5);
+                                        buf[5] = '\0';
+                                        recorded_queries.emplace_back(buf);
+                                    }
                                     uint32_t limit;
                                     memcpy(&limit, n_recvd[i] + 1, sizeof(uint32_t));
                                     if (limit == 0)
@@ -379,36 +437,115 @@ int dll_main(int argc, char** argv, Context* cxt){
                                 user_module_map.erase(it);
                             }
                             break;
+                        case 'N':
+                            {
+                                if(procedure_module_cursor < current_procedure.postproc_modules)
+                                    handle = current_procedure.__rt_loaded_modules[procedure_module_cursor++];
+                            }
+                            break;
                         case 'R': //recorded procedure
                             {
-                                auto proc_name = n_recvd[i] + 1;
+                                auto proc_name = n_recvd[i] + 2;
                                 proc_name = *proc_name?proc_name : proc_name + 1;
-                                const auto& load_modules = [](StoredProcedure &p){
+                                puts(proc_name);
+                                const auto& load_modules = [&](StoredProcedure &p) { // Lazily dlopen()s p's modules; returns true on failure, false otherwise.
                                     if (!p.__rt_loaded_modules){
                                         p.__rt_loaded_modules = static_cast<void**>(
                                             malloc(sizeof(void*) * p.postproc_modules));
                                         for(uint32_t j = 0; j < p.postproc_modules; ++j){
-                                            p.__rt_loaded_modules[j] = dlopen(p.name, RTLD_NOW);
+                                            auto pj = dlopen((procedure_root + p.name + std::to_string(j) + ".so").c_str(), RTLD_NOW); // module j lives where recording copied it
+                                            if (pj == nullptr){ // dlopen could not load the recorded library
+                                                printf("Error: failed to load module %s\n", p.name);
+                                                return true;
+                                            }
+                                            p.__rt_loaded_modules[j] = pj;
                                         }
                                     }
+                                    return false;
+                                };
+                                const auto& save_proc_tofile = [&](const StoredProcedure& p)  { // Writes p to <procedure_root><procedure_name>.aqp; returns true on failure, false on success.
+                                    auto config_name = procedure_root + procedure_name + ".aqp"; // named after the captured procedure_name, not p.name
+                                    auto fp = fopen(config_name.c_str(), "wb");
+                                    if (fp == nullptr){
+                                        printf("Error: failed to open file %s\n", config_name.c_str());
+                                        return true;
+                                    }
+                                    fwrite(&p.cnt, sizeof(p.cnt), 1, fp); // header: query count, then module count, as raw in-memory bytes
+                                    fwrite(&p.postproc_modules, sizeof(p.postproc_modules), 1, fp);
+                                    for(uint32_t j = 0; j < p.cnt; ++j){
+                                        auto current_query = p.queries[j];
+                                        auto len_query = strlen(current_query); // NOTE(review): stops at the first zero byte, so a recorded 'O' entry holding a binary uint32 limit may be cut short (confirm)
+                                        fwrite(current_query, len_query + 1, 1, fp); // + 1 writes the NUL terminator too
+                                    }
+                                    fclose(fp);
+                                    return false;
+                                };
+                                const auto& load_proc_fromfile = [&](StoredProcedure& p)  { // Reads <procedure_root><p.name>.aqp into p.
+                                    // Returns true when the procedure and its modules were loaded, false otherwise.
+                                    auto config_name = procedure_root + p.name + ".aqp";
+                                    auto fp = fopen(config_name.c_str(), "rb");
+                                    if(fp == nullptr){
+                                        puts("ERROR: Procedure not found on disk.");
+                                        return false;
+                                    }
+                                    fread(&p.cnt, sizeof(p.cnt), 1, fp);
+                                    fread(&p.postproc_modules, sizeof(p.postproc_modules), 1, fp);
+                                    auto offset_now = ftell(fp);
+                                    fseek(fp, 0, SEEK_END);
+                                    auto queries_size = ftell(fp) - offset_now; // bytes left after the two header fields
+                                    fseek(fp, offset_now, SEEK_SET);
+                                    // At least one slot so queries[0] can own the text buffer even when cnt == 0.
+                                    p.queries = static_cast<char**>(malloc(sizeof(char*) * (p.cnt ? p.cnt : 1)));
+                                    p.queries[0] = static_cast<char*>(malloc(sizeof(char) * queries_size));
+                                    fread(p.queries[0], queries_size, 1, fp); // fill the buffer itself, not the pointer slot
+                                    fclose(fp);
+                                    // Queries are stored back to back, each NUL-terminated (see save_proc_tofile),
+                                    // so query j starts one byte past the terminator of query j-1.
+                                    for(uint32_t j = 1; j < p.cnt; ++j)
+                                        p.queries[j] = p.queries[j-1] + strlen(p.queries[j-1]) + 1;
+                                    // load_modules returns true on failure; invert so that true means success here.
+                                    return !load_modules(p);
                                 };
                                 switch(n_recvd[i][1]){
                                     case '\0':
+                                        current_procedure.name = copy_lpstr(proc_name);
+                                        current_procedure.cnt = 0;
+                                        current_procedure.queries = nullptr;
+                                        current_procedure.postproc_modules = 0;
+                                        current_procedure.__rt_loaded_modules = nullptr;
+                                        procedure_recording = true;
                                         procedure_name = proc_name;
                                     break;
                                     case 'T':
+                                        current_procedure.queries = recorded_queries.container;
+                                        current_procedure.cnt = recorded_queries.size;
+                                        current_procedure.name = copy_lpstr(proc_name);
+                                        current_procedure.postproc_modules = recorded_libraries.size;
+                                        current_procedure.__rt_loaded_modules = recorded_libraries.container;
+                                        recorded_queries.size = recorded_queries.capacity = 0;
+                                        recorded_queries.container = nullptr;
+                                        recorded_libraries.size = recorded_libraries.capacity = 0;
+                                        recorded_libraries.container = nullptr;
+                                        procedure_recording = false;
+                                        save_proc_tofile(current_procedure);
+                                        cxt->stored_proc.insert_or_assign(procedure_name, current_procedure);
                                         procedure_name = "";
                                     break;
                                     case 'E': // execute procedure
                                     {
-                                        auto _proc = cxt->stored_proc.find(procedure_name.c_str());
-                                        if (_proc == cxt->stored_proc.end())
-                                            printf("Procedure %s not found.\n", procedure_name.c_str());
+                                        auto _proc = cxt->stored_proc.find(proc_name);
+                                        if (_proc == cxt->stored_proc.end()){
+                                            printf("Procedure %s not found. Trying load from disk.\n", proc_name);
+                                            if (load_proc_fromfile(current_procedure)){
+                                                cxt->stored_proc.insert_or_assign(proc_name, current_procedure);
+                                            }
+                                        }
                                         else{
-                                            StoredProcedure &p = _proc->second;
-                                            n_recv = p.cnt;
-                                            n_recvd = p.queries;
-                                            load_modules(p);
+                                            current_procedure = _proc->second;
+                                            n_recv = current_procedure.cnt;
+                                            n_recvd = current_procedure.queries;
+                                            load_modules(current_procedure);
+                                            goto start; // yes, I know, refactor later!!
                                         }
                                     }
                                     break;
@@ -418,12 +555,22 @@ int dll_main(int argc, char** argv, Context* cxt){
                                     break;
                                     case 'L': //load procedure
                                     break;
+                                    case 'd': // display all procedures
+                                    for(const auto& p : cxt->stored_proc){
+                                        printf("Procedure: %s, %d queries, %d modules:\n", p.first.c_str(), 
+                                            p.second.cnt, p.second.postproc_modules);
+                                        for(uint32_t j = 0; j < p.second.cnt; ++j){
+                                            printf("\tQuery %d: %s\n", j, p.second.queries[j]);
+                                        }
+                                        puts("");
+                                    }
+                                    break;
                                 }
                             }
                             break;
                         }
                     }
-                    if(handle) {
+                    if(handle && procedure_replaying) {
                         dlclose(handle);
                         handle = nullptr;
                     }
@@ -486,7 +633,7 @@ extern "C" int __DLLEXPORT__ main(int argc, char** argv) {
 #endif
    // puts("running");
    Context* cxt = new Context();
-   cxt->aquery_root_path = std::filesystem::current_path().c_str();
+   cxt->aquery_root_path = to_lpstr(std::filesystem::current_path().string());
    // cxt->log("%d %s\n", argc, argv[1]);
 
 #ifdef THREADING
diff --git a/server/table.h b/server/table.h
index a32705c..8b038f0 100644
--- a/server/table.h
+++ b/server/table.h
@@ -145,9 +145,19 @@ public:
 	ColRef<_Ty>& operator =(ColRef<_Ty>&& vt) {
 		vector_type<_Ty>::operator=(std::move(vt));
 		return *this;
+	
 	}
-	ColView<_Ty> operator [](const vector_type<uint32_t>& idxs) const {
-		return ColView<_Ty>(*this, idxs);
+	// ColView<_Ty> operator [](vector_type<uint32_t>& idxs) const {
+	// 	return ColView<_Ty>(*this, std::move(idxs));
+	// }
+	// ColView<_Ty> operator [](const vector_type<uint32_t>& idxs) const {
+	// 	return ColView<_Ty>(*this, idxs);
+	// }
+	vector_type<_Ty> operator[](vector_type<uint32_t>& idxs) const { // Gathers the elements at positions idxs into a new vector_type (a copy, not a view).
+		vector_type<_Ty> ret(idxs.size); // NOTE(review): idxs is a non-const lvalue reference, so const or temporary index vectors cannot be passed
+		for (uint32_t i = 0; i < idxs.size; ++i)
+			ret.container[i] = this->container[idxs[i]]; // indices are used as-is, without bounds checking
+		return ret;
 	}
 	vector_type<_Ty> operator [](const std::vector<bool>& idxs) const {
 		vector_type<_Ty> ret (this->size);
@@ -226,7 +236,7 @@ class ColView : public vector_base<_Ty> {
 public:
 	typedef ColRef<_Ty> Decayed_t;
 	const uint32_t size;
-	const ColRef<_Ty> orig;
+	const ColRef<_Ty>& orig;
 	vector_type<uint32_t> idxs;
 	ColView(const ColRef<_Ty>& orig, vector_type<uint32_t>&& idxs) : orig(orig), size(idxs.size), idxs(std::move(idxs)) {}
 	ColView(const ColRef<_Ty>& orig, const vector_type<uint32_t>& idxs) : orig(orig), idxs(idxs), size(idxs.size) {}
@@ -274,6 +284,7 @@ public:
 			ret[i] = orig[idxs[i]];
 		return ret;
 	}
+
 	ColView<_Ty> subvec(uint32_t start, uint32_t end) const {
 		uint32_t len = end - start;
 		return ColView<_Ty>(orig, idxs.subvec(start, end));
diff --git a/server/unordered_dense.h b/server/unordered_dense.h
new file mode 100644
index 0000000..737d12b
--- /dev/null
+++ b/server/unordered_dense.h
@@ -0,0 +1,1516 @@
+///////////////////////// ankerl::unordered_dense::{map, set} /////////////////////////
+
+// A fast & densely stored hashmap and hashset based on robin-hood backward shift deletion.
+// Version 2.0.1
+// https://github.com/martinus/unordered_dense
+//
+// Licensed under the MIT License <http://opensource.org/licenses/MIT>.
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2022 Martin Leitner-Ankerl <martin.ankerl@gmail.com>
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#ifndef ANKERL_UNORDERED_DENSE_H
+#define ANKERL_UNORDERED_DENSE_H
+
+// see https://semver.org/spec/v2.0.0.html
+#define ANKERL_UNORDERED_DENSE_VERSION_MAJOR 2 // NOLINT(cppcoreguidelines-macro-usage) incompatible API changes
+#define ANKERL_UNORDERED_DENSE_VERSION_MINOR 0 // NOLINT(cppcoreguidelines-macro-usage) backwards compatible functionality
+#define ANKERL_UNORDERED_DENSE_VERSION_PATCH 1 // NOLINT(cppcoreguidelines-macro-usage) backwards compatible bug fixes
+
+// API versioning with inline namespace, see https://www.foonathan.net/2018/11/inline-namespaces/
+#define ANKERL_UNORDERED_DENSE_VERSION_CONCAT1(major, minor, patch) v##major##_##minor##_##patch
+#define ANKERL_UNORDERED_DENSE_VERSION_CONCAT(major, minor, patch) ANKERL_UNORDERED_DENSE_VERSION_CONCAT1(major, minor, patch)
+#define ANKERL_UNORDERED_DENSE_NAMESPACE   \
+    ANKERL_UNORDERED_DENSE_VERSION_CONCAT( \
+        ANKERL_UNORDERED_DENSE_VERSION_MAJOR, ANKERL_UNORDERED_DENSE_VERSION_MINOR, ANKERL_UNORDERED_DENSE_VERSION_PATCH)
+
+#if defined(_MSVC_LANG)
+#    define ANKERL_UNORDERED_DENSE_CPP_VERSION _MSVC_LANG
+#else
+#    define ANKERL_UNORDERED_DENSE_CPP_VERSION __cplusplus
+#endif
+
+#if defined(__GNUC__)
+// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
+#    define ANKERL_UNORDERED_DENSE_PACK(decl) decl __attribute__((__packed__))
+#elif defined(_MSC_VER)
+// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
+#    define ANKERL_UNORDERED_DENSE_PACK(decl) __pragma(pack(push, 1)) decl __pragma(pack(pop))
+#endif
+
+#if ANKERL_UNORDERED_DENSE_CPP_VERSION < 201703L
+#    error ankerl::unordered_dense requires C++17 or higher
+#else
+#    include <array>            // for array
+#    include <cstdint>          // for uint64_t, uint32_t, uint8_t, UINT64_C
+#    include <cstring>          // for size_t, memcpy, memset
+#    include <functional>       // for equal_to, hash
+#    include <initializer_list> // for initializer_list
+#    include <iterator>         // for pair, distance
+#    include <limits>           // for numeric_limits
+#    include <memory>           // for allocator, allocator_traits, shared_ptr
+#    include <stdexcept>        // for out_of_range
+#    include <string>           // for basic_string
+#    include <string_view>      // for basic_string_view, hash
+#    include <tuple>            // for forward_as_tuple
+#    include <type_traits>      // for enable_if_t, declval, conditional_t, ena...
+#    include <utility>          // for forward, exchange, pair, as_const, piece...
+#    include <vector>           // for vector
+
+#    define ANKERL_UNORDERED_DENSE_PMR 0 // NOLINT(cppcoreguidelines-macro-usage)
+#    if defined(__has_include)
+#        if __has_include(<memory_resource>)
+#            undef ANKERL_UNORDERED_DENSE_PMR
+#            define ANKERL_UNORDERED_DENSE_PMR 1 // NOLINT(cppcoreguidelines-macro-usage)
+#            include <memory_resource>           // for polymorphic_allocator
+#        endif
+#    endif
+
+#    if defined(_MSC_VER) && defined(_M_X64)
+#        include <intrin.h>
+#        pragma intrinsic(_umul128)
+#    endif
+
+#    if defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__clang__)
+#        define ANKERL_UNORDERED_DENSE_LIKELY(x) __builtin_expect(x, 1)   // NOLINT(cppcoreguidelines-macro-usage)
+#        define ANKERL_UNORDERED_DENSE_UNLIKELY(x) __builtin_expect(x, 0) // NOLINT(cppcoreguidelines-macro-usage)
+#    else
+#        define ANKERL_UNORDERED_DENSE_LIKELY(x) (x)   // NOLINT(cppcoreguidelines-macro-usage)
+#        define ANKERL_UNORDERED_DENSE_UNLIKELY(x) (x) // NOLINT(cppcoreguidelines-macro-usage)
+#    endif
+
+namespace ankerl::unordered_dense {
+inline namespace ANKERL_UNORDERED_DENSE_NAMESPACE {
+
+// hash ///////////////////////////////////////////////////////////////////////
+
+// This is a stripped-down implementation of wyhash: https://github.com/wangyi-fudan/wyhash
+// No big-endian support (because different values on different machines don't matter),
+// hardcodes the seed and the secret, reformats the code, and applies clang-tidy fixes.
+namespace detail::wyhash {
+
+static inline void mum(uint64_t* a, uint64_t* b) {
+#    if defined(__SIZEOF_INT128__)
+    __uint128_t r = *a;
+    r *= *b;
+    *a = static_cast<uint64_t>(r);
+    *b = static_cast<uint64_t>(r >> 64U);
+#    elif defined(_MSC_VER) && defined(_M_X64)
+    *a = _umul128(*a, *b, b);
+#    else
+    uint64_t ha = *a >> 32U;
+    uint64_t hb = *b >> 32U;
+    uint64_t la = static_cast<uint32_t>(*a);
+    uint64_t lb = static_cast<uint32_t>(*b);
+    uint64_t hi{};
+    uint64_t lo{};
+    uint64_t rh = ha * hb;
+    uint64_t rm0 = ha * lb;
+    uint64_t rm1 = hb * la;
+    uint64_t rl = la * lb;
+    uint64_t t = rl + (rm0 << 32U);
+    auto c = static_cast<uint64_t>(t < rl);
+    lo = t + (rm1 << 32U);
+    c += static_cast<uint64_t>(lo < t);
+    hi = rh + (rm0 >> 32U) + (rm1 >> 32U) + c;
+    *a = lo;
+    *b = hi;
+#    endif
+}
+
+// Multiply-and-xor mix function, aka MUM: folds the full 128-bit product of a and b into 64 bits.
+[[nodiscard]] static inline auto mix(uint64_t a, uint64_t b) -> uint64_t {
+    mum(&a, &b); // a now holds the low 64 bits of the product, b the high 64 bits
+    return a ^ b;
+}
+
+// read functions. WARNING: we don't care about endianness, so results are different on big endian!
+[[nodiscard]] static inline auto r8(const uint8_t* p) -> uint64_t {
+    uint64_t v{};
+    std::memcpy(&v, p, 8U);
+    return v;
+}
+
+[[nodiscard]] static inline auto r4(const uint8_t* p) -> uint64_t {
+    uint32_t v{};
+    std::memcpy(&v, p, 4);
+    return v;
+}
+
+// Reads k bytes (k must be 1, 2, or 3): packs the first, the middle (p[k/2]) and the last byte into the low 24 bits.
+[[nodiscard]] static inline auto r3(const uint8_t* p, size_t k) -> uint64_t {
+    return (static_cast<uint64_t>(p[0]) << 16U) | (static_cast<uint64_t>(p[k >> 1U]) << 8U) | p[k - 1];
+}
+
+[[maybe_unused]] [[nodiscard]] static inline auto hash(void const* key, size_t len) -> uint64_t {
+    static constexpr auto secret = std::array{UINT64_C(0xa0761d6478bd642f),
+                                              UINT64_C(0xe7037ed1a0b428db),
+                                              UINT64_C(0x8ebc6af09c88c6e3),
+                                              UINT64_C(0x589965cc75374cc3)};
+
+    auto const* p = static_cast<uint8_t const*>(key);
+    uint64_t seed = secret[0];
+    uint64_t a{};
+    uint64_t b{};
+    if (ANKERL_UNORDERED_DENSE_LIKELY(len <= 16)) {
+        if (ANKERL_UNORDERED_DENSE_LIKELY(len >= 4)) {
+            a = (r4(p) << 32U) | r4(p + ((len >> 3U) << 2U));
+            b = (r4(p + len - 4) << 32U) | r4(p + len - 4 - ((len >> 3U) << 2U));
+        } else if (ANKERL_UNORDERED_DENSE_LIKELY(len > 0)) {
+            a = r3(p, len);
+            b = 0;
+        } else {
+            a = 0;
+            b = 0;
+        }
+    } else {
+        size_t i = len;
+        if (ANKERL_UNORDERED_DENSE_UNLIKELY(i > 48)) {
+            uint64_t see1 = seed;
+            uint64_t see2 = seed;
+            do {
+                seed = mix(r8(p) ^ secret[1], r8(p + 8) ^ seed);
+                see1 = mix(r8(p + 16) ^ secret[2], r8(p + 24) ^ see1);
+                see2 = mix(r8(p + 32) ^ secret[3], r8(p + 40) ^ see2);
+                p += 48;
+                i -= 48;
+            } while (ANKERL_UNORDERED_DENSE_LIKELY(i > 48));
+            seed ^= see1 ^ see2;
+        }
+        while (ANKERL_UNORDERED_DENSE_UNLIKELY(i > 16)) {
+            seed = mix(r8(p) ^ secret[1], r8(p + 8) ^ seed);
+            i -= 16;
+            p += 16;
+        }
+        a = r8(p + i - 16);
+        b = r8(p + i - 8);
+    }
+
+    return mix(secret[1] ^ len, mix(a ^ secret[1], b ^ seed));
+}
+
+[[nodiscard]] static inline auto hash(uint64_t x) -> uint64_t {
+    return detail::wyhash::mix(x, UINT64_C(0x9E3779B97F4A7C15));
+}
+
+} // namespace detail::wyhash
+
+template <typename T, typename Enable = void>
+struct hash {
+    auto operator()(T const& obj) const noexcept(noexcept(std::declval<std::hash<T>>().operator()(std::declval<T const&>())))
+        -> uint64_t {
+        return std::hash<T>{}(obj);
+    }
+};
+
+template <typename CharT>
+struct hash<std::basic_string<CharT>> {
+    using is_avalanching = void;
+    auto operator()(std::basic_string<CharT> const& str) const noexcept -> uint64_t {
+        return detail::wyhash::hash(str.data(), sizeof(CharT) * str.size());
+    }
+};
+
+template <typename CharT>
+struct hash<std::basic_string_view<CharT>> {
+    using is_avalanching = void;
+    auto operator()(std::basic_string_view<CharT> const& sv) const noexcept -> uint64_t {
+        return detail::wyhash::hash(sv.data(), sizeof(CharT) * sv.size());
+    }
+};
+
+template <class T>
+struct hash<T*> {
+    using is_avalanching = void;
+    auto operator()(T* ptr) const noexcept -> uint64_t {
+        // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast)
+        return detail::wyhash::hash(reinterpret_cast<uintptr_t>(ptr));
+    }
+};
+
+template <class T>
+struct hash<std::unique_ptr<T>> {
+    using is_avalanching = void;
+    auto operator()(std::unique_ptr<T> const& ptr) const noexcept -> uint64_t {
+        // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast)
+        return detail::wyhash::hash(reinterpret_cast<uintptr_t>(ptr.get()));
+    }
+};
+
+template <class T>
+struct hash<std::shared_ptr<T>> {
+    using is_avalanching = void;
+    auto operator()(std::shared_ptr<T> const& ptr) const noexcept -> uint64_t {
+        // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast)
+        return detail::wyhash::hash(reinterpret_cast<uintptr_t>(ptr.get()));
+    }
+};
+
+template <typename Enum>
+struct hash<Enum, typename std::enable_if<std::is_enum<Enum>::value>::type> {
+    using is_avalanching = void;
+    auto operator()(Enum e) const noexcept -> uint64_t {
+        using underlying = typename std::underlying_type_t<Enum>;
+        return detail::wyhash::hash(static_cast<underlying>(e));
+    }
+};
+
+// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
+#    define ANKERL_UNORDERED_DENSE_HASH_STATICCAST(T)                    \
+        template <>                                                      \
+        struct hash<T> {                                                 \
+            using is_avalanching = void;                                 \
+            auto operator()(T const& obj) const noexcept -> uint64_t {   \
+                return detail::wyhash::hash(static_cast<uint64_t>(obj)); \
+            }                                                            \
+        }
+
+#    if defined(__GNUC__) && !defined(__clang__)
+#        pragma GCC diagnostic push
+#        pragma GCC diagnostic ignored "-Wuseless-cast"
+#    endif
+// see https://en.cppreference.com/w/cpp/utility/hash
+ANKERL_UNORDERED_DENSE_HASH_STATICCAST(bool);
+ANKERL_UNORDERED_DENSE_HASH_STATICCAST(char);
+ANKERL_UNORDERED_DENSE_HASH_STATICCAST(signed char);
+ANKERL_UNORDERED_DENSE_HASH_STATICCAST(unsigned char);
+#    if ANKERL_UNORDERED_DENSE_CPP_VERSION >= 202002L
+ANKERL_UNORDERED_DENSE_HASH_STATICCAST(char8_t);
+#    endif
+ANKERL_UNORDERED_DENSE_HASH_STATICCAST(char16_t);
+ANKERL_UNORDERED_DENSE_HASH_STATICCAST(char32_t);
+ANKERL_UNORDERED_DENSE_HASH_STATICCAST(wchar_t);
+ANKERL_UNORDERED_DENSE_HASH_STATICCAST(short);
+ANKERL_UNORDERED_DENSE_HASH_STATICCAST(unsigned short);
+ANKERL_UNORDERED_DENSE_HASH_STATICCAST(int);
+ANKERL_UNORDERED_DENSE_HASH_STATICCAST(unsigned int);
+ANKERL_UNORDERED_DENSE_HASH_STATICCAST(long);
+ANKERL_UNORDERED_DENSE_HASH_STATICCAST(long long);
+ANKERL_UNORDERED_DENSE_HASH_STATICCAST(unsigned long);
+ANKERL_UNORDERED_DENSE_HASH_STATICCAST(unsigned long long);
+
+#    if defined(__GNUC__) && !defined(__clang__)
+#        pragma GCC diagnostic pop
+#    endif
+
+// bucket_type //////////////////////////////////////////////////////////
+
+namespace bucket_type {
+
+struct standard {
+    static constexpr uint32_t dist_inc = 1U << 8U;             // one distance step; the low byte is left to the fingerprint
+    static constexpr uint32_t fingerprint_mask = dist_inc - 1; // 0xFF: selects the 1-byte fingerprint
+
+    uint32_t m_dist_and_fingerprint; // upper 3 bytes: distance to original bucket. lower byte: fingerprint from hash
+    uint32_t m_value_idx;            // index into the m_values vector.
+};
+
+ANKERL_UNORDERED_DENSE_PACK(struct big {
+    static constexpr uint32_t dist_inc = 1U << 8U;             // one distance step; the low byte is left to the fingerprint
+    static constexpr uint32_t fingerprint_mask = dist_inc - 1; // 0xFF: selects the 1-byte fingerprint
+
+    uint32_t m_dist_and_fingerprint; // upper 3 bytes: distance to original bucket. lower byte: fingerprint from hash
+    size_t m_value_idx;              // size_t-wide index into the m_values vector; the struct is declared packed.
+});
+
+} // namespace bucket_type
+
+namespace detail {
+
+struct nonesuch {};
+
+template <class Default, class AlwaysVoid, template <class...> class Op, class... Args>
+struct detector {
+    using value_t = std::false_type;
+    using type = Default;
+};
+
+template <class Default, template <class...> class Op, class... Args>
+struct detector<Default, std::void_t<Op<Args...>>, Op, Args...> {
+    using value_t = std::true_type;
+    using type = Op<Args...>;
+};
+
+template <template <class...> class Op, class... Args>
+using is_detected = typename detail::detector<detail::nonesuch, void, Op, Args...>::value_t;
+
+template <template <class...> class Op, class... Args>
+constexpr bool is_detected_v = is_detected<Op, Args...>::value;
+
+template <typename T>
+using detect_avalanching = typename T::is_avalanching;
+
+template <typename T>
+using detect_is_transparent = typename T::is_transparent;
+
+template <typename T>
+using detect_iterator = typename T::iterator;
+
+template <typename T>
+using detect_reserve = decltype(std::declval<T&>().reserve(size_t{}));
+
+// enable_if helpers
+
+template <typename Mapped>
+constexpr bool is_map_v = !std::is_void_v<Mapped>;
+
+template <typename Hash, typename KeyEqual>
+constexpr bool is_transparent_v = is_detected_v<detect_is_transparent, Hash>&& is_detected_v<detect_is_transparent, KeyEqual>;
+
+template <typename From, typename To1, typename To2>
+constexpr bool is_neither_convertible_v = !std::is_convertible_v<From, To1> && !std::is_convertible_v<From, To2>;
+
+template <typename T>
+constexpr bool has_reserve = is_detected_v<detect_reserve, T>;
+
+// This is it, the table. Doubles as map and set, and uses `void` for T when it's used as a set.
+template <class Key,
+          class T, // when void, treat it as a set.
+          class Hash,
+          class KeyEqual,
+          class AllocatorOrContainer,
+          class Bucket>
+class table {
+public:
+    using value_container_type = std::conditional_t<
+        is_detected_v<detect_iterator, AllocatorOrContainer>,
+        AllocatorOrContainer,
+        typename std::vector<typename std::conditional_t<std::is_void_v<T>, Key, std::pair<Key, T>>, AllocatorOrContainer>>;
+
+private:
+    using bucket_alloc =
+        typename std::allocator_traits<typename value_container_type::allocator_type>::template rebind_alloc<Bucket>;
+    using bucket_alloc_traits = std::allocator_traits<bucket_alloc>;
+
+    static constexpr uint8_t initial_shifts = 64 - 3; // 2^(64-m_shifts) number of buckets, i.e. 2^3 = 8 to start with
+    static constexpr float default_max_load_factor = 0.8F; // initial value of m_max_load_factor
+
+public:
+    using key_type = Key;
+    using mapped_type = T;
+    using value_type = typename value_container_type::value_type;
+    using size_type = typename value_container_type::size_type;
+    using difference_type = typename value_container_type::difference_type;
+    using hasher = Hash;
+    using key_equal = KeyEqual;
+    using allocator_type = typename value_container_type::allocator_type;
+    using reference = typename value_container_type::reference;
+    using const_reference = typename value_container_type::const_reference;
+    using pointer = typename value_container_type::pointer;
+    using const_pointer = typename value_container_type::const_pointer;
+    using iterator = typename value_container_type::iterator;
+    using const_iterator = typename value_container_type::const_iterator;
+    using bucket_type = Bucket;
+
+private:
+    using value_idx_type = decltype(Bucket::m_value_idx);
+    using dist_and_fingerprint_type = decltype(Bucket::m_dist_and_fingerprint);
+
+    static_assert(std::is_trivially_destructible_v<Bucket>, "assert there's no need to call destructor / std::destroy");
+    static_assert(std::is_trivially_copyable_v<Bucket>, "assert we can just memset / memcpy");
+
+    value_container_type m_values{}; // Contains all the key-value pairs in one densely stored container. No holes.
+    typename std::allocator_traits<bucket_alloc>::pointer m_buckets{};
+    size_t m_num_buckets = 0;
+    size_t m_max_bucket_capacity = 0;
+    float m_max_load_factor = default_max_load_factor;
+    Hash m_hash{};
+    KeyEqual m_equal{};
+    uint8_t m_shifts = initial_shifts;
+
+    [[nodiscard]] auto next(value_idx_type bucket_idx) const -> value_idx_type {
+        auto const successor = bucket_idx + 1U; // index of the following bucket; past the last one we wrap to bucket 0
+        bool const wraps = ANKERL_UNORDERED_DENSE_UNLIKELY(successor == m_num_buckets);
+        return wraps ? 0 : static_cast<value_idx_type>(successor);
+    }
+
+    // Helper to access bucket through pointer types
+    [[nodiscard]] static constexpr auto at(typename std::allocator_traits<bucket_alloc>::pointer bucket_ptr, size_t offset)
+        -> Bucket& {
+        return *(bucket_ptr + static_cast<typename std::allocator_traits<bucket_alloc>::difference_type>(offset));
+    }
+
+    // use the dist_inc and dist_dec functions so that uint16_t types work without warning
+    [[nodiscard]] static constexpr auto dist_inc(dist_and_fingerprint_type x) -> dist_and_fingerprint_type {
+        return static_cast<dist_and_fingerprint_type>(x + Bucket::dist_inc);
+    }
+
+    [[nodiscard]] static constexpr auto dist_dec(dist_and_fingerprint_type x) -> dist_and_fingerprint_type {
+        return static_cast<dist_and_fingerprint_type>(x - Bucket::dist_inc);
+    }
+
+    // The goal of mixed_hash is to always produce a high quality 64bit hash.
+    template <typename K>
+    [[nodiscard]] constexpr auto mixed_hash(K const& key) const -> uint64_t {
+        if constexpr (is_detected_v<detect_avalanching, Hash>) {
+            // we know that the hash is good because is_avalanching.
+            if constexpr (sizeof(decltype(m_hash(key))) < sizeof(uint64_t)) {
+                // 32bit hash and is_avalanching => multiply with a constant to avalanche bits upwards
+                return m_hash(key) * UINT64_C(0x9ddfea08eb382d69);
+            } else {
+                // 64bit and is_avalanching => only use the hash itself.
+                return m_hash(key);
+            }
+        } else {
+            // not is_avalanching => apply wyhash
+            return wyhash::hash(m_hash(key));
+        }
+    }
+
+    [[nodiscard]] constexpr auto dist_and_fingerprint_from_hash(uint64_t hash) const -> dist_and_fingerprint_type {
+        return Bucket::dist_inc | (static_cast<dist_and_fingerprint_type>(hash) & Bucket::fingerprint_mask);
+    }
+
+    [[nodiscard]] constexpr auto bucket_idx_from_hash(uint64_t hash) const -> value_idx_type {
+        return static_cast<value_idx_type>(hash >> m_shifts);
+    }
+
+    [[nodiscard]] static constexpr auto get_key(value_type const& vt) -> key_type const& {
+        if constexpr (std::is_void_v<T>) {
+            return vt;
+        } else {
+            return vt.first;
+        }
+    }
+
+    template <typename K>
+    [[nodiscard]] auto next_while_less(K const& key) const -> Bucket {
+        auto hash = mixed_hash(key);
+        auto dist_and_fingerprint = dist_and_fingerprint_from_hash(hash);
+        auto bucket_idx = bucket_idx_from_hash(hash);
+
+        while (dist_and_fingerprint < at(m_buckets, bucket_idx).m_dist_and_fingerprint) {
+            dist_and_fingerprint = dist_inc(dist_and_fingerprint);
+            bucket_idx = next(bucket_idx);
+        }
+        return {dist_and_fingerprint, bucket_idx};
+    }
+
+    void place_and_shift_up(Bucket bucket, value_idx_type place) {
+        // occupied slot: swap our entry in and carry the evicted one (distance + 1) onward; an empty slot ends the walk
+        for (; 0 != at(m_buckets, place).m_dist_and_fingerprint; place = next(place)) {
+            std::swap(bucket, at(m_buckets, place));
+            bucket.m_dist_and_fingerprint = dist_inc(bucket.m_dist_and_fingerprint);
+        }
+        at(m_buckets, place) = bucket;
+    }
+
+    [[nodiscard]] static constexpr auto calc_num_buckets(uint8_t shifts) -> size_t {
+        return (std::min)(max_bucket_count(), size_t{1} << (64U - shifts)); // 2^(64-shifts), capped; (std::min) defeats a min() macro
+    }
+
+    [[nodiscard]] constexpr auto calc_shifts_for_size(size_t s) const -> uint8_t {
+        auto shifts = initial_shifts;
+        while (shifts > 0 && static_cast<size_t>(static_cast<float>(calc_num_buckets(shifts)) * max_load_factor()) < s) {
+            --shifts;
+        }
+        return shifts;
+    }
+
+    // assumes m_values is already populated (copied from other), m_buckets is nullptr, m_shifts is initial_shifts
+    void copy_buckets(table const& other) {
+        if (!empty()) {
+            m_shifts = other.m_shifts;
+            allocate_buckets_from_shift();
+            std::memcpy(&*m_buckets, &*other.m_buckets, sizeof(Bucket) * bucket_count()); // &*: raw Bucket*, as in clear_buckets
+        }
+    }
+
+    /**
+     * True when no element can be added any more without increasing the size
+     */
+    [[nodiscard]] auto is_full() const -> bool {
+        return size() >= m_max_bucket_capacity;
+    }
+
+    void deallocate_buckets() {
+        auto ba = bucket_alloc(m_values.get_allocator());
+        if (nullptr != m_buckets) {
+            bucket_alloc_traits::deallocate(ba, m_buckets, bucket_count());
+        }
+        m_buckets = nullptr;
+        m_num_buckets = 0;
+        m_max_bucket_capacity = 0;
+    }
+
+    void allocate_buckets_from_shift() {
+        auto ba = bucket_alloc(m_values.get_allocator());
+        m_num_buckets = calc_num_buckets(m_shifts);
+        m_buckets = bucket_alloc_traits::allocate(ba, m_num_buckets);
+        if (m_num_buckets == max_bucket_count()) {
+            // reached the maximum, make sure we can use each bucket
+            m_max_bucket_capacity = max_bucket_count();
+        } else {
+            m_max_bucket_capacity = static_cast<value_idx_type>(static_cast<float>(m_num_buckets) * max_load_factor());
+        }
+    }
+
+    void clear_buckets() {
+        if (m_buckets != nullptr) {
+            std::memset(&*m_buckets, 0, sizeof(Bucket) * bucket_count());
+        }
+    }
+
+    void clear_and_fill_buckets_from_values() {
+        clear_buckets();
+        for (value_idx_type value_idx = 0, end_idx = static_cast<value_idx_type>(m_values.size()); value_idx < end_idx;
+             ++value_idx) {
+            auto const& key = get_key(m_values[value_idx]);
+            auto [dist_and_fingerprint, bucket] = next_while_less(key);
+
+            // we know for certain that key has not yet been inserted, so no need to check it.
+            place_and_shift_up({dist_and_fingerprint, value_idx}, bucket);
+        }
+    }
+
+    void increase_size() {
+        if (ANKERL_UNORDERED_DENSE_UNLIKELY(m_max_bucket_capacity == max_bucket_count())) {
+            throw std::overflow_error("ankerl::unordered_dense: reached max bucket size, cannot increase size");
+        }
+        --m_shifts;
+        deallocate_buckets();
+        allocate_buckets_from_shift();
+        clear_and_fill_buckets_from_values();
+    }
+
+    void do_erase(value_idx_type bucket_idx) {
+        auto const value_idx_to_remove = at(m_buckets, bucket_idx).m_value_idx;
+
+        // shift down until either empty or an element with correct spot is found
+        auto next_bucket_idx = next(bucket_idx);
+        while (at(m_buckets, next_bucket_idx).m_dist_and_fingerprint >= Bucket::dist_inc * 2) {
+            at(m_buckets, bucket_idx) = {dist_dec(at(m_buckets, next_bucket_idx).m_dist_and_fingerprint),
+                                         at(m_buckets, next_bucket_idx).m_value_idx};
+            bucket_idx = std::exchange(next_bucket_idx, next(next_bucket_idx));
+        }
+        at(m_buckets, bucket_idx) = {};
+
+        // update m_values
+        if (value_idx_to_remove != m_values.size() - 1) {
+            // no luck, we'll have to replace the value with the last one and update the index accordingly
+            auto& val = m_values[value_idx_to_remove];
+            val = std::move(m_values.back());
+
+            // update the values_idx of the moved entry. No need to play the info game, just look until we find the values_idx
+            auto mh = mixed_hash(get_key(val));
+            bucket_idx = bucket_idx_from_hash(mh);
+
+            auto const values_idx_back = static_cast<value_idx_type>(m_values.size() - 1);
+            while (values_idx_back != at(m_buckets, bucket_idx).m_value_idx) {
+                bucket_idx = next(bucket_idx);
+            }
+            at(m_buckets, bucket_idx).m_value_idx = value_idx_to_remove;
+        }
+        m_values.pop_back();
+    }
+
+    template <typename K>
+    auto do_erase_key(K&& key) -> size_t {
+        if (empty()) {
+            return 0;
+        }
+
+        auto [dist_and_fingerprint, bucket_idx] = next_while_less(key);
+
+        while (dist_and_fingerprint == at(m_buckets, bucket_idx).m_dist_and_fingerprint &&
+               !m_equal(key, get_key(m_values[at(m_buckets, bucket_idx).m_value_idx]))) {
+            dist_and_fingerprint = dist_inc(dist_and_fingerprint);
+            bucket_idx = next(bucket_idx);
+        }
+
+        if (dist_and_fingerprint != at(m_buckets, bucket_idx).m_dist_and_fingerprint) {
+            return 0;
+        }
+        do_erase(bucket_idx);
+        return 1;
+    }
+
+    template <class K, class M>
+    auto do_insert_or_assign(K&& key, M&& mapped) -> std::pair<iterator, bool> {
+        auto it_isinserted = try_emplace(std::forward<K>(key), std::forward<M>(mapped));
+        if (!it_isinserted.second) {
+            it_isinserted.first->second = std::forward<M>(mapped);
+        }
+        return it_isinserted;
+    }
+
+    template <typename K, typename... Args>
+    auto do_place_element(dist_and_fingerprint_type dist_and_fingerprint, value_idx_type bucket_idx, K&& key, Args&&... args)
+        -> std::pair<iterator, bool> {
+
+        // emplace the new value. If that throws an exception, no harm done; index is still in a valid state
+        m_values.emplace_back(std::piecewise_construct,
+                              std::forward_as_tuple(std::forward<K>(key)),
+                              std::forward_as_tuple(std::forward<Args>(args)...));
+
+        // place element and shift up until we find an empty spot
+        auto value_idx = static_cast<value_idx_type>(m_values.size() - 1);
+        place_and_shift_up({dist_and_fingerprint, value_idx}, bucket_idx);
+        return {begin() + static_cast<difference_type>(value_idx), true};
+    }
+
+    template <typename K, typename... Args>
+    auto do_try_emplace(K&& key, Args&&... args) -> std::pair<iterator, bool> {
+        if (ANKERL_UNORDERED_DENSE_UNLIKELY(is_full())) {
+            increase_size();
+        }
+
+        auto hash = mixed_hash(key);
+        auto dist_and_fingerprint = dist_and_fingerprint_from_hash(hash);
+        auto bucket_idx = bucket_idx_from_hash(hash);
+
+        while (true) {
+            auto* bucket = &at(m_buckets, bucket_idx);
+            if (dist_and_fingerprint == bucket->m_dist_and_fingerprint) {
+                if (m_equal(key, m_values[bucket->m_value_idx].first)) {
+                    return {begin() + static_cast<difference_type>(bucket->m_value_idx), false};
+                }
+            } else if (dist_and_fingerprint > bucket->m_dist_and_fingerprint) {
+                return do_place_element(dist_and_fingerprint, bucket_idx, std::forward<K>(key), std::forward<Args>(args)...);
+            }
+            dist_and_fingerprint = dist_inc(dist_and_fingerprint);
+            bucket_idx = next(bucket_idx);
+        }
+    }
+
+    template <typename K>
+    auto do_find(K const& key) -> iterator {
+        if (ANKERL_UNORDERED_DENSE_UNLIKELY(empty())) {
+            return end();
+        }
+
+        auto mh = mixed_hash(key);
+        auto dist_and_fingerprint = dist_and_fingerprint_from_hash(mh);
+        auto bucket_idx = bucket_idx_from_hash(mh);
+        auto* bucket = &at(m_buckets, bucket_idx);
+
+        // unrolled loop. *Always* check a few directly, then enter the loop. This is faster.
+        if (dist_and_fingerprint == bucket->m_dist_and_fingerprint && m_equal(key, get_key(m_values[bucket->m_value_idx]))) {
+            return begin() + static_cast<difference_type>(bucket->m_value_idx);
+        }
+        dist_and_fingerprint = dist_inc(dist_and_fingerprint);
+        bucket_idx = next(bucket_idx);
+        bucket = &at(m_buckets, bucket_idx);
+
+        if (dist_and_fingerprint == bucket->m_dist_and_fingerprint && m_equal(key, get_key(m_values[bucket->m_value_idx]))) {
+            return begin() + static_cast<difference_type>(bucket->m_value_idx);
+        }
+        dist_and_fingerprint = dist_inc(dist_and_fingerprint);
+        bucket_idx = next(bucket_idx);
+        bucket = &at(m_buckets, bucket_idx);
+
+        while (true) {
+            if (dist_and_fingerprint == bucket->m_dist_and_fingerprint) {
+                if (m_equal(key, get_key(m_values[bucket->m_value_idx]))) {
+                    return begin() + static_cast<difference_type>(bucket->m_value_idx);
+                }
+            } else if (dist_and_fingerprint > bucket->m_dist_and_fingerprint) {
+                return end();
+            }
+            dist_and_fingerprint = dist_inc(dist_and_fingerprint);
+            bucket_idx = next(bucket_idx);
+            bucket = &at(m_buckets, bucket_idx);
+        }
+    }
+
+    template <typename K>
+    auto do_find(K const& key) const -> const_iterator {
+        return const_cast<table*>(this)->do_find(key); // NOLINT(cppcoreguidelines-pro-type-const-cast)
+    }
+
+    template <typename K, typename Q = T, std::enable_if_t<is_map_v<Q>, bool> = true>
+    auto do_at(K const& key) -> Q& {
+        if (auto it = find(key); end() != it) {
+            return it->second;
+        }
+        throw std::out_of_range("ankerl::unordered_dense::map::at(): key not found");
+    }
+
+    template <typename K, typename Q = T, std::enable_if_t<is_map_v<Q>, bool> = true>
+    auto do_at(K const& key) const -> Q const& {
+        return const_cast<table*>(this)->at(key); // NOLINT(cppcoreguidelines-pro-type-const-cast)
+    }
+
+public:
+    // Default constructor: delegates to table(0), so no capacity is reserved up front.
+    table()
+        : table(0) {}
+
+    // Primary constructor. Initializes m_values from `alloc_or_container` and stores copies of the
+    // hash and equality functors. reserve(bucket_count) is only called for a non-zero bucket_count.
+    explicit table(size_t bucket_count,
+                   Hash const& hash = Hash(),
+                   KeyEqual const& equal = KeyEqual(),
+                   allocator_type const& alloc_or_container = allocator_type())
+        : m_values(alloc_or_container)
+        , m_hash(hash)
+        , m_equal(equal) {
+        if (0 != bucket_count) {
+            reserve(bucket_count);
+        }
+    }
+
+    // Convenience overload: bucket count + allocator, default-constructed Hash and KeyEqual.
+    table(size_t bucket_count, allocator_type const& alloc)
+        : table(bucket_count, Hash(), KeyEqual(), alloc) {}
+
+    // Convenience overload: bucket count + hash + allocator, default-constructed KeyEqual.
+    table(size_t bucket_count, Hash const& hash, allocator_type const& alloc)
+        : table(bucket_count, hash, KeyEqual(), alloc) {}
+
+    // Allocator-only constructor: same as the primary constructor with bucket_count == 0.
+    explicit table(allocator_type const& alloc)
+        : table(0, Hash(), KeyEqual(), alloc) {}
+
+    // Range constructor: sets up an empty table via the primary constructor, then inserts every
+    // element of [first, last) through insert(first, last).
+    template <class InputIt>
+    table(InputIt first,
+          InputIt last,
+          size_type bucket_count = 0,
+          Hash const& hash = Hash(),
+          KeyEqual const& equal = KeyEqual(),
+          allocator_type const& alloc = allocator_type())
+        : table(bucket_count, hash, equal, alloc) {
+        insert(first, last);
+    }
+
+    // Range constructor with allocator; Hash and KeyEqual are default-constructed.
+    template <class InputIt>
+    table(InputIt first, InputIt last, size_type bucket_count, allocator_type const& alloc)
+        : table(first, last, bucket_count, Hash(), KeyEqual(), alloc) {}
+
+    // Range constructor with hash and allocator; KeyEqual is default-constructed.
+    template <class InputIt>
+    table(InputIt first, InputIt last, size_type bucket_count, Hash const& hash, allocator_type const& alloc)
+        : table(first, last, bucket_count, hash, KeyEqual(), alloc) {}
+
+    // Copy constructor: copies `other` using the allocator returned by other.m_values.get_allocator().
+    table(table const& other)
+        : table(other, other.m_values.get_allocator()) {}
+
+    // Allocator-extended copy constructor: copies the values (with `alloc`), the max load factor and
+    // both functors, then duplicates the bucket array through copy_buckets(other).
+    table(table const& other, allocator_type const& alloc)
+        : m_values(other.m_values, alloc)
+        , m_max_load_factor(other.m_max_load_factor)
+        , m_hash(other.m_hash)
+        , m_equal(other.m_equal) {
+        copy_buckets(other);
+    }
+
+    // Move constructor: forwards to the allocator-extended move constructor with other's allocator.
+    table(table&& other) noexcept
+        : table(std::move(other), other.m_values.get_allocator()) {}
+
+    // Allocator-extended move constructor. Takes over the values container and the bucket array.
+    // `other` is reset member by member: null bucket pointer, zero bucket count and capacity,
+    // default_max_load_factor, value-initialized hash/equal, initial_shifts, and cleared values.
+    table(table&& other, allocator_type const& alloc) noexcept
+        : m_values(std::move(other.m_values), alloc)
+        , m_buckets(std::exchange(other.m_buckets, nullptr))
+        , m_num_buckets(std::exchange(other.m_num_buckets, 0))
+        , m_max_bucket_capacity(std::exchange(other.m_max_bucket_capacity, 0))
+        , m_max_load_factor(std::exchange(other.m_max_load_factor, default_max_load_factor))
+        , m_hash(std::exchange(other.m_hash, {}))
+        , m_equal(std::exchange(other.m_equal, {}))
+        , m_shifts(std::exchange(other.m_shifts, initial_shifts)) {
+        // make sure the moved-from container reports size() == 0, matching its emptied buckets
+        other.m_values.clear();
+    }
+
+    // Initializer-list constructor: builds an empty table via the primary constructor, then
+    // inserts all elements of `ilist`.
+    table(std::initializer_list<value_type> ilist,
+          size_t bucket_count = 0,
+          Hash const& hash = Hash(),
+          KeyEqual const& equal = KeyEqual(),
+          allocator_type const& alloc = allocator_type())
+        : table(bucket_count, hash, equal, alloc) {
+        insert(ilist);
+    }
+
+    // Initializer-list constructor with allocator; Hash and KeyEqual are default-constructed.
+    table(std::initializer_list<value_type> ilist, size_type bucket_count, allocator_type const& alloc)
+        : table(ilist, bucket_count, Hash(), KeyEqual(), alloc) {}
+
+    // Initializer-list constructor with hash and allocator; KeyEqual is default-constructed.
+    table(std::initializer_list<value_type> init, size_type bucket_count, Hash const& hash, allocator_type const& alloc)
+        : table(init, bucket_count, hash, KeyEqual(), alloc) {}
+
+    // Releases the bucket array; m_values cleans up after itself.
+    // m_buckets can be nullptr here (e.g. after this table was moved from, see the move
+    // constructor/assignment). An allocator's deallocate() is only required to accept pointers that
+    // came from allocate(), so the call is skipped when there is nothing to release.
+    ~table() {
+        if (nullptr != m_buckets) {
+            auto ba = bucket_alloc(m_values.get_allocator());
+            bucket_alloc_traits::deallocate(ba, m_buckets, bucket_count());
+        }
+    }
+
+    // Copy assignment. Self-assignment is a no-op. Otherwise the current bucket array is released
+    // first, then values, load factor and functors are copied, m_shifts is reset to initial_shifts
+    // and the buckets are duplicated from `other` via copy_buckets().
+    auto operator=(table const& other) -> table& {
+        if (this == &other) {
+            return *this;
+        }
+
+        // must happen before m_values is assigned: the assignment may bring in another allocator,
+        // and the old buckets have to be released with the allocator that created them
+        deallocate_buckets();
+
+        m_values = other.m_values;
+        m_max_load_factor = other.m_max_load_factor;
+        m_hash = other.m_hash;
+        m_equal = other.m_equal;
+        m_shifts = initial_shifts;
+        copy_buckets(other);
+        return *this;
+    }
+
+    // Move assignment. Self-assignment is a no-op. Otherwise the current buckets are released, all
+    // members are taken from `other`, and `other` is reset to an empty state (null buckets, zero
+    // counts, default load factor, value-initialized functors, initial shifts, cleared values).
+    // NOTE(review): the exception specification applies the noexcept *operator* to a constant
+    // boolean expression, which can never throw, so it always evaluates to noexcept(true)
+    // regardless of the traits' values.
+    auto operator=(table&& other) noexcept(
+        noexcept(std::is_nothrow_move_assignable_v<value_container_type>&& std::is_nothrow_move_assignable_v<Hash>&&
+                     std::is_nothrow_move_assignable_v<KeyEqual>)) -> table& {
+        if (this == &other) {
+            return *this;
+        }
+
+        // release first: after m_values is replaced its allocator might be a different one
+        deallocate_buckets();
+
+        m_values = std::move(other.m_values);
+        m_buckets = std::exchange(other.m_buckets, nullptr);
+        m_num_buckets = std::exchange(other.m_num_buckets, 0);
+        m_max_bucket_capacity = std::exchange(other.m_max_bucket_capacity, 0);
+        m_max_load_factor = std::exchange(other.m_max_load_factor, default_max_load_factor);
+        m_hash = std::exchange(other.m_hash, {});
+        m_equal = std::exchange(other.m_equal, {});
+        m_shifts = std::exchange(other.m_shifts, initial_shifts);
+        other.m_values.clear();
+        return *this;
+    }
+
+    // Replaces the current contents with the elements of `ilist`: clears the table, then inserts
+    // each element. Returns *this.
+    auto operator=(std::initializer_list<value_type> ilist) -> table& {
+        clear();
+        insert(ilist);
+        return *this;
+    }
+
+    // Returns a copy of the allocator used by the underlying values container.
+    auto get_allocator() const noexcept -> allocator_type {
+        return m_values.get_allocator();
+    }
+
+    // iterators //////////////////////////////////////////////////////////////
+
+    // All iterator accessors forward directly to the underlying values container (m_values),
+    // so iteration walks that container's element order.
+    auto begin() noexcept -> iterator {
+        return m_values.begin();
+    }
+
+    auto begin() const noexcept -> const_iterator {
+        return m_values.begin();
+    }
+
+    auto cbegin() const noexcept -> const_iterator {
+        return m_values.cbegin();
+    }
+
+    // One-past-the-last element of m_values.
+    auto end() noexcept -> iterator {
+        return m_values.end();
+    }
+
+    auto cend() const noexcept -> const_iterator {
+        return m_values.cend();
+    }
+
+    auto end() const noexcept -> const_iterator {
+        return m_values.end();
+    }
+
+    // capacity ///////////////////////////////////////////////////////////////
+
+    // True when the values container holds no elements.
+    [[nodiscard]] auto empty() const noexcept -> bool {
+        return m_values.empty();
+    }
+
+    // Number of stored elements, taken from the values container.
+    [[nodiscard]] auto size() const noexcept -> size_t {
+        return m_values.size();
+    }
+
+    // Upper bound on the number of elements, derived from the width of value_idx_type:
+    // 2^(bits) in general, but only 2^(bits - 1) when value_idx_type is as wide as size_t
+    // (shifting size_t{1} by its full width would not be representable).
+    [[nodiscard]] static constexpr auto max_size() noexcept -> size_t {
+        constexpr auto idx_bits = sizeof(value_idx_type) * 8;
+        constexpr bool idx_spans_size_t =
+            std::numeric_limits<value_idx_type>::max() == std::numeric_limits<size_t>::max();
+        return size_t{1} << (idx_spans_size_t ? idx_bits - 1 : idx_bits);
+    }
+
+    // modifiers //////////////////////////////////////////////////////////////
+
+    // Removes all elements: empties the values container and resets the buckets via clear_buckets().
+    void clear() {
+        m_values.clear();
+        clear_buckets();
+    }
+
+    // Single-element insert overloads. All of them forward to emplace(); the returned pair holds an
+    // iterator to the element and a bool that is true when a new element was inserted.
+    auto insert(value_type const& value) -> std::pair<iterator, bool> {
+        return emplace(value);
+    }
+
+    auto insert(value_type&& value) -> std::pair<iterator, bool> {
+        return emplace(std::move(value));
+    }
+
+    // Accepts anything value_type can be constructed from.
+    template <class P, std::enable_if_t<std::is_constructible_v<value_type, P&&>, bool> = true>
+    auto insert(P&& value) -> std::pair<iterator, bool> {
+        return emplace(std::forward<P>(value));
+    }
+
+    // Hinted overloads: the hint is accepted for interface compatibility and ignored.
+    auto insert(const_iterator /*hint*/, value_type const& value) -> iterator {
+        return insert(value).first;
+    }
+
+    auto insert(const_iterator /*hint*/, value_type&& value) -> iterator {
+        return insert(std::move(value)).first;
+    }
+
+    template <class P, std::enable_if_t<std::is_constructible_v<value_type, P&&>, bool> = true>
+    auto insert(const_iterator /*hint*/, P&& value) -> iterator {
+        return insert(std::forward<P>(value)).first;
+    }
+
+    // Inserts every element of the range [first, last), one insert() call per element.
+    template <class InputIt>
+    void insert(InputIt first, InputIt last) {
+        for (; first != last; ++first) {
+            insert(*first);
+        }
+    }
+
+    // Inserts all elements of `ilist` by forwarding to the iterator-range insert.
+    void insert(std::initializer_list<value_type> ilist) {
+        insert(ilist.begin(), ilist.end());
+    }
+
+    // nonstandard API: *this is emptied.
+    // Also see "A Standard flat_map" https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2022/p0429r9.pdf
+    // Only callable on rvalues (&&-qualified). Moves the values container out and returns it; the
+    // bucket array is not touched here.
+    auto extract() && -> value_container_type {
+        return std::move(m_values);
+    }
+
+    // nonstandard API:
+    // Discards the internally held container and replaces it with the one passed. Erases non-unique elements.
+    //
+    // Throws std::out_of_range when `container` holds more than max_size() elements.
+    // The bucket array is reallocated when there are no buckets yet, when the shift computed for
+    // the new size is smaller than m_shifts, or when the incoming container uses a different
+    // allocator; in every case the buckets are cleared and rebuilt from the new values.
+    // Of several elements with equal keys the first one encountered is kept. A later duplicate is
+    // overwritten by back() and the container shrinks by one, so the relative order of the
+    // remaining elements may change.
+    auto replace(value_container_type&& container) {
+        if (container.size() > max_size()) {
+            throw std::out_of_range("ankerl::unordered_dense::map::replace(): too many elements");
+        }
+
+        auto shifts = calc_shifts_for_size(container.size());
+        if (0 == m_num_buckets || shifts < m_shifts || container.get_allocator() != m_values.get_allocator()) {
+            m_shifts = shifts;
+            deallocate_buckets();
+            allocate_buckets_from_shift();
+        }
+        clear_buckets();
+
+        m_values = std::move(container);
+
+        // can't use clear_and_fill_buckets_from_values() because container elements might not be unique
+        auto value_idx = value_idx_type{};
+
+        // loop until we reach the end of the container. duplicated entries will be replaced with back().
+        while (value_idx != static_cast<value_idx_type>(m_values.size())) {
+            auto const& key = get_key(m_values[value_idx]);
+
+            auto hash = mixed_hash(key);
+            auto dist_and_fingerprint = dist_and_fingerprint_from_hash(hash);
+            auto bucket_idx = bucket_idx_from_hash(hash);
+
+            bool key_found = false;
+            while (true) {
+                auto const& bucket = at(m_buckets, bucket_idx);
+                if (dist_and_fingerprint > bucket.m_dist_and_fingerprint) {
+                    // probed past every bucket that could hold this key: it is new
+                    break;
+                }
+                // get_key() rather than `.first`: for sets value_type is the key itself and has no
+                // `.first` member, so the previous spelling only compiled for maps.
+                if (dist_and_fingerprint == bucket.m_dist_and_fingerprint &&
+                    m_equal(key, get_key(m_values[bucket.m_value_idx]))) {
+                    key_found = true;
+                    break;
+                }
+                dist_and_fingerprint = dist_inc(dist_and_fingerprint);
+                bucket_idx = next(bucket_idx);
+            }
+
+            if (key_found) {
+                // duplicate: fill the slot with the last element (unless it already is the last)
+                // and re-examine the same index on the next iteration
+                if (value_idx != static_cast<value_idx_type>(m_values.size() - 1)) {
+                    m_values[value_idx] = std::move(m_values.back());
+                }
+                m_values.pop_back();
+            } else {
+                place_and_shift_up({dist_and_fingerprint, value_idx}, bucket_idx);
+                ++value_idx;
+            }
+        }
+    }
+
+    // insert_or_assign overloads (maps only, enforced through is_map_v<Q>). Every overload forwards
+    // key and mapped value to do_insert_or_assign(); the non-hinted forms return its
+    // pair<iterator, bool>, the hinted forms return only the iterator.
+    template <class M, typename Q = T, std::enable_if_t<is_map_v<Q>, bool> = true>
+    auto insert_or_assign(Key const& key, M&& mapped) -> std::pair<iterator, bool> {
+        return do_insert_or_assign(key, std::forward<M>(mapped));
+    }
+
+    template <class M, typename Q = T, std::enable_if_t<is_map_v<Q>, bool> = true>
+    auto insert_or_assign(Key&& key, M&& mapped) -> std::pair<iterator, bool> {
+        return do_insert_or_assign(std::move(key), std::forward<M>(mapped));
+    }
+
+    // Heterogeneous-key overload, only available when Hash and KeyEqual are transparent.
+    template <typename K,
+              typename M,
+              typename Q = T,
+              typename H = Hash,
+              typename KE = KeyEqual,
+              std::enable_if_t<is_map_v<Q> && is_transparent_v<H, KE>, bool> = true>
+    auto insert_or_assign(K&& key, M&& mapped) -> std::pair<iterator, bool> {
+        return do_insert_or_assign(std::forward<K>(key), std::forward<M>(mapped));
+    }
+
+    // Hinted overloads: the hint is ignored.
+    template <class M, typename Q = T, std::enable_if_t<is_map_v<Q>, bool> = true>
+    auto insert_or_assign(const_iterator /*hint*/, Key const& key, M&& mapped) -> iterator {
+        return do_insert_or_assign(key, std::forward<M>(mapped)).first;
+    }
+
+    template <class M, typename Q = T, std::enable_if_t<is_map_v<Q>, bool> = true>
+    auto insert_or_assign(const_iterator /*hint*/, Key&& key, M&& mapped) -> iterator {
+        return do_insert_or_assign(std::move(key), std::forward<M>(mapped)).first;
+    }
+
+    // Hinted heterogeneous-key overload (transparent Hash/KeyEqual only).
+    template <typename K,
+              typename M,
+              typename Q = T,
+              typename H = Hash,
+              typename KE = KeyEqual,
+              std::enable_if_t<is_map_v<Q> && is_transparent_v<H, KE>, bool> = true>
+    auto insert_or_assign(const_iterator /*hint*/, K&& key, M&& mapped) -> iterator {
+        return do_insert_or_assign(std::forward<K>(key), std::forward<M>(mapped)).first;
+    }
+
+    // Set-only emplace for a single heterogeneous key (requires transparent Hash/KeyEqual).
+    // The key is looked up first, so no value_type is constructed when an equal key already exists.
+    // Returns {iterator to the element, true if it was newly inserted}.
+    template <class K,
+              typename Q = T,
+              typename H = Hash,
+              typename KE = KeyEqual,
+              std::enable_if_t<!is_map_v<Q> && is_transparent_v<H, KE>, bool> = true>
+    auto emplace(K&& key) -> std::pair<iterator, bool> {
+        if (is_full()) {
+            increase_size();
+        }
+
+        auto const hash = mixed_hash(key);
+        auto dist_and_fingerprint = dist_and_fingerprint_from_hash(hash);
+        auto bucket_idx = bucket_idx_from_hash(hash);
+
+        // probe while the inspected bucket could still belong to this key
+        for (auto const* bucket = &at(m_buckets, bucket_idx); dist_and_fingerprint <= bucket->m_dist_and_fingerprint;
+             bucket = &at(m_buckets, bucket_idx)) {
+            bool const same_fingerprint = dist_and_fingerprint == bucket->m_dist_and_fingerprint;
+            if (same_fingerprint && m_equal(key, m_values[bucket->m_value_idx])) {
+                // already present: hand back the existing element, nothing was constructed
+                return {begin() + static_cast<difference_type>(bucket->m_value_idx), false};
+            }
+            dist_and_fingerprint = dist_inc(dist_and_fingerprint);
+            bucket_idx = next(bucket_idx);
+        }
+
+        // new key: append the value before touching the buckets, so an exception thrown by the
+        // construction leaves the table in a valid state
+        m_values.emplace_back(std::forward<K>(key));
+        auto const new_idx = static_cast<value_idx_type>(m_values.size() - 1);
+        // place the bucket and shift up until an empty spot is found
+        place_and_shift_up({dist_and_fingerprint, new_idx}, bucket_idx);
+        return {begin() + static_cast<difference_type>(new_idx), true};
+    }
+
+    // General emplace. The value has to exist before its key can be inspected, so it is first
+    // constructed at the back of m_values; if an equal key turns out to be stored already, the
+    // freshly constructed element is popped again.
+    // Returns {iterator to the element with that key, true if a new element was inserted}.
+    template <class... Args>
+    auto emplace(Args&&... args) -> std::pair<iterator, bool> {
+        if (is_full()) {
+            increase_size();
+        }
+
+        auto& key = get_key(m_values.emplace_back(std::forward<Args>(args)...));
+        auto const hash = mixed_hash(key);
+        auto dist_and_fingerprint = dist_and_fingerprint_from_hash(hash);
+        auto bucket_idx = bucket_idx_from_hash(hash);
+
+        for (auto const* bucket = &at(m_buckets, bucket_idx); dist_and_fingerprint <= bucket->m_dist_and_fingerprint;
+             bucket = &at(m_buckets, bucket_idx)) {
+            bool const same_fingerprint = dist_and_fingerprint == bucket->m_dist_and_fingerprint;
+            if (same_fingerprint && m_equal(key, get_key(m_values[bucket->m_value_idx]))) {
+                // key already stored: drop the temporary element appended above
+                m_values.pop_back();
+                return {begin() + static_cast<difference_type>(bucket->m_value_idx), false};
+            }
+            dist_and_fingerprint = dist_inc(dist_and_fingerprint);
+            bucket_idx = next(bucket_idx);
+        }
+
+        // key is new: register the appended element, shifting buckets up until an empty spot is found
+        auto const new_idx = static_cast<value_idx_type>(m_values.size() - 1);
+        place_and_shift_up({dist_and_fingerprint, new_idx}, bucket_idx);
+
+        return {begin() + static_cast<difference_type>(new_idx), true};
+    }
+
+    // Hinted emplace: the hint is ignored; forwards to emplace() and returns only the iterator.
+    template <class... Args>
+    auto emplace_hint(const_iterator /*hint*/, Args&&... args) -> iterator {
+        return emplace(std::forward<Args>(args)...).first;
+    }
+
+    // try_emplace overloads (maps only, enforced through is_map_v<Q>). Every overload forwards the
+    // key and the mapped-value constructor arguments to do_try_emplace(); non-hinted forms return
+    // its pair<iterator, bool>, hinted forms return only the iterator and ignore the hint.
+    template <class... Args, typename Q = T, std::enable_if_t<is_map_v<Q>, bool> = true>
+    auto try_emplace(Key const& key, Args&&... args) -> std::pair<iterator, bool> {
+        return do_try_emplace(key, std::forward<Args>(args)...);
+    }
+
+    template <class... Args, typename Q = T, std::enable_if_t<is_map_v<Q>, bool> = true>
+    auto try_emplace(Key&& key, Args&&... args) -> std::pair<iterator, bool> {
+        return do_try_emplace(std::move(key), std::forward<Args>(args)...);
+    }
+
+    template <class... Args, typename Q = T, std::enable_if_t<is_map_v<Q>, bool> = true>
+    auto try_emplace(const_iterator /*hint*/, Key const& key, Args&&... args) -> iterator {
+        return do_try_emplace(key, std::forward<Args>(args)...).first;
+    }
+
+    template <class... Args, typename Q = T, std::enable_if_t<is_map_v<Q>, bool> = true>
+    auto try_emplace(const_iterator /*hint*/, Key&& key, Args&&... args) -> iterator {
+        return do_try_emplace(std::move(key), std::forward<Args>(args)...).first;
+    }
+
+    // Heterogeneous-key overload: needs transparent Hash/KeyEqual. K must not be convertible to
+    // iterator or const_iterator, which keeps this overload from competing with the hinted forms.
+    template <
+        typename K,
+        typename... Args,
+        typename Q = T,
+        typename H = Hash,
+        typename KE = KeyEqual,
+        std::enable_if_t<is_map_v<Q> && is_transparent_v<H, KE> && is_neither_convertible_v<K&&, iterator, const_iterator>,
+                         bool> = true>
+    auto try_emplace(K&& key, Args&&... args) -> std::pair<iterator, bool> {
+        return do_try_emplace(std::forward<K>(key), std::forward<Args>(args)...);
+    }
+
+    // Hinted heterogeneous-key overload, same constraints as above.
+    template <
+        typename K,
+        typename... Args,
+        typename Q = T,
+        typename H = Hash,
+        typename KE = KeyEqual,
+        std::enable_if_t<is_map_v<Q> && is_transparent_v<H, KE> && is_neither_convertible_v<K&&, iterator, const_iterator>,
+                         bool> = true>
+    auto try_emplace(const_iterator /*hint*/, K&& key, Args&&... args) -> iterator {
+        return do_try_emplace(std::forward<K>(key), std::forward<Args>(args)...).first;
+    }
+
+    // Erases the element `it` points to. The bucket referring to that element is located by
+    // probing from the key's home bucket until a bucket with the matching value index is found;
+    // the actual removal is done by do_erase(). Returns an iterator to the position the erased
+    // element occupied.
+    auto erase(iterator it) -> iterator {
+        auto const value_idx_to_remove = static_cast<value_idx_type>(it - cbegin());
+
+        auto bucket_idx = bucket_idx_from_hash(mixed_hash(get_key(*it)));
+        for (; at(m_buckets, bucket_idx).m_value_idx != value_idx_to_remove; bucket_idx = next(bucket_idx)) {
+            // keep probing: this bucket belongs to a different element
+        }
+
+        do_erase(bucket_idx);
+        return begin() + static_cast<difference_type>(value_idx_to_remove);
+    }
+
+    // const_iterator overload: converts `it` to a mutable iterator by its offset from cbegin() and
+    // forwards to erase(iterator).
+    auto erase(const_iterator it) -> iterator {
+        return erase(begin() + (it - cbegin()));
+    }
+
+    // Erases the elements in [first, last) and returns an iterator to the position of `first`.
+    // Works in two phases, split at `mid` = idx_first + min(range length, number of elements behind
+    // the range). Per the comments below, a single-element erase refills the hole from the end of
+    // the container (do_erase is not visible here), which is why the order of the two loops matters.
+    auto erase(const_iterator first, const_iterator last) -> iterator {
+        auto const idx_first = first - cbegin();
+        auto const idx_last = last - cbegin();
+        auto const first_to_last = std::distance(first, last);
+        auto const last_to_end = std::distance(last, cend());
+
+        // remove elements from left to right which moves elements from the end back
+        auto const mid = idx_first + std::min(first_to_last, last_to_end);
+        auto idx = idx_first;
+        while (idx != mid) {
+            erase(begin() + idx);
+            ++idx;
+        }
+
+        // all elements from the right are moved, now remove the last element until all done
+        idx = idx_last;
+        while (idx != mid) {
+            --idx;
+            erase(begin() + idx);
+        }
+
+        return begin() + idx_first;
+    }
+
+    // Erases by key; forwards to do_erase_key() and returns its size_t result.
+    auto erase(Key const& key) -> size_t {
+        return do_erase_key(key);
+    }
+
+    // Heterogeneous-key erase, only available when Hash and KeyEqual are transparent.
+    template <class K, class H = Hash, class KE = KeyEqual, std::enable_if_t<is_transparent_v<H, KE>, bool> = true>
+    auto erase(K&& key) -> size_t {
+        return do_erase_key(std::forward<K>(key));
+    }
+
+    // Exchanges the contents of *this and `other`.
+    // The block-scope using-declaration makes the unqualified call below resolve to a free swap
+    // (std::swap, or one found by ADL) rather than to this member function.
+    // NOTE(review): the exception specification applies the noexcept *operator* to a constant
+    // boolean expression, which can never throw, so it always evaluates to noexcept(true)
+    // regardless of the traits' values.
+    void swap(table& other) noexcept(noexcept(std::is_nothrow_swappable_v<value_container_type>&&
+                                                  std::is_nothrow_swappable_v<Hash>&& std::is_nothrow_swappable_v<KeyEqual>)) {
+        using std::swap;
+        swap(other, *this);
+    }
+
+    // lookup /////////////////////////////////////////////////////////////////
+
+    // Checked access (maps only). All overloads forward to do_at(), which throws
+    // std::out_of_range when the key is not present.
+    template <typename Q = T, std::enable_if_t<is_map_v<Q>, bool> = true>
+    auto at(key_type const& key) -> Q& {
+        return do_at(key);
+    }
+
+    // Heterogeneous-key overload, only available when Hash and KeyEqual are transparent.
+    template <typename K,
+              typename Q = T,
+              typename H = Hash,
+              typename KE = KeyEqual,
+              std::enable_if_t<is_map_v<Q> && is_transparent_v<H, KE>, bool> = true>
+    auto at(K const& key) -> Q& {
+        return do_at(key);
+    }
+
+    // Const overloads: same lookup, returning a reference to const.
+    template <typename Q = T, std::enable_if_t<is_map_v<Q>, bool> = true>
+    auto at(key_type const& key) const -> Q const& {
+        return do_at(key);
+    }
+
+    template <typename K,
+              typename Q = T,
+              typename H = Hash,
+              typename KE = KeyEqual,
+              std::enable_if_t<is_map_v<Q> && is_transparent_v<H, KE>, bool> = true>
+    auto at(K const& key) const -> Q const& {
+        return do_at(key);
+    }
+
+    // Subscript access (maps only): goes through try_emplace(key) with no mapped-value arguments
+    // and returns a reference to the mapped value of the resulting element.
+    template <typename Q = T, std::enable_if_t<is_map_v<Q>, bool> = true>
+    auto operator[](Key const& key) -> Q& {
+        return try_emplace(key).first->second;
+    }
+
+    template <typename Q = T, std::enable_if_t<is_map_v<Q>, bool> = true>
+    auto operator[](Key&& key) -> Q& {
+        return try_emplace(std::move(key)).first->second;
+    }
+
+    // Heterogeneous-key overload, only available when Hash and KeyEqual are transparent.
+    template <typename K,
+              typename Q = T,
+              typename H = Hash,
+              typename KE = KeyEqual,
+              std::enable_if_t<is_map_v<Q> && is_transparent_v<H, KE>, bool> = true>
+    auto operator[](K&& key) -> Q& {
+        return try_emplace(std::forward<K>(key)).first->second;
+    }
+
+    // Number of elements with the given key: 1 when find() locates it, 0 otherwise.
+    auto count(Key const& key) const -> size_t {
+        bool const present = find(key) != end();
+        return static_cast<size_t>(present);
+    }
+
+    // Heterogeneous-key overload, only available when Hash and KeyEqual are transparent.
+    template <class K, class H = Hash, class KE = KeyEqual, std::enable_if_t<is_transparent_v<H, KE>, bool> = true>
+    auto count(K const& key) const -> size_t {
+        bool const present = find(key) != end();
+        return static_cast<size_t>(present);
+    }
+
+    // Lookup by key. All overloads forward to do_find(); a result equal to end() means the key
+    // was not found.
+    auto find(Key const& key) -> iterator {
+        return do_find(key);
+    }
+
+    auto find(Key const& key) const -> const_iterator {
+        return do_find(key);
+    }
+
+    // Heterogeneous-key overloads, only available when Hash and KeyEqual are transparent.
+    template <class K, class H = Hash, class KE = KeyEqual, std::enable_if_t<is_transparent_v<H, KE>, bool> = true>
+    auto find(K const& key) -> iterator {
+        return do_find(key);
+    }
+
+    template <class K, class H = Hash, class KE = KeyEqual, std::enable_if_t<is_transparent_v<H, KE>, bool> = true>
+    auto find(K const& key) const -> const_iterator {
+        return do_find(key);
+    }
+
+    // True when find(key) locates an element.
+    auto contains(Key const& key) const -> bool {
+        return find(key) != end();
+    }
+
+    // Heterogeneous-key overload, only available when Hash and KeyEqual are transparent.
+    template <class K, class H = Hash, class KE = KeyEqual, std::enable_if_t<is_transparent_v<H, KE>, bool> = true>
+    auto contains(K const& key) const -> bool {
+        return find(key) != end();
+    }
+
+    // equal_range overloads. A lookup yields at most one element, so the result is either the
+    // empty range {end(), end()} or the one-element range {it, it + 1}.
+    auto equal_range(Key const& key) -> std::pair<iterator, iterator> {
+        auto const first = do_find(key);
+        if (first == end()) {
+            return {first, first};
+        }
+        return {first, first + 1};
+    }
+
+    auto equal_range(const Key& key) const -> std::pair<const_iterator, const_iterator> {
+        auto const first = do_find(key);
+        if (first == end()) {
+            return {first, first};
+        }
+        return {first, first + 1};
+    }
+
+    // Heterogeneous-key overloads, only available when Hash and KeyEqual are transparent.
+    template <class K, class H = Hash, class KE = KeyEqual, std::enable_if_t<is_transparent_v<H, KE>, bool> = true>
+    auto equal_range(K const& key) -> std::pair<iterator, iterator> {
+        auto const first = do_find(key);
+        if (first == end()) {
+            return {first, first};
+        }
+        return {first, first + 1};
+    }
+
+    template <class K, class H = Hash, class KE = KeyEqual, std::enable_if_t<is_transparent_v<H, KE>, bool> = true>
+    auto equal_range(K const& key) const -> std::pair<const_iterator, const_iterator> {
+        auto const first = do_find(key);
+        if (first == end()) {
+            return {first, first};
+        }
+        return {first, first + 1};
+    }
+
+    // bucket interface ///////////////////////////////////////////////////////
+
+    // Current number of buckets (m_num_buckets).
+    auto bucket_count() const noexcept -> size_t { // NOLINT(modernize-use-nodiscard)
+        return m_num_buckets;
+    }
+
+    // Upper bound on the bucket count; identical to max_size().
+    static constexpr auto max_bucket_count() noexcept -> size_t { // NOLINT(modernize-use-nodiscard)
+        return max_size();
+    }
+
+    // hash policy ////////////////////////////////////////////////////////////
+
+    // Ratio of stored elements to buckets; 0 when there are no buckets (avoids dividing by zero).
+    [[nodiscard]] auto load_factor() const -> float {
+        auto const buckets = bucket_count();
+        if (0 == buckets) {
+            return 0.0F;
+        }
+        return static_cast<float>(size()) / static_cast<float>(buckets);
+    }
+
+    // Currently configured maximum load factor.
+    [[nodiscard]] auto max_load_factor() const -> float {
+        return m_max_load_factor;
+    }
+
+    // Sets the maximum load factor. Unless the bucket count already equals max_bucket_count(),
+    // m_max_bucket_capacity is recomputed as bucket_count() * ml, truncated to value_idx_type.
+    // No rehash is triggered here.
+    void max_load_factor(float ml) {
+        m_max_load_factor = ml;
+        if (m_num_buckets != max_bucket_count()) {
+            m_max_bucket_capacity = static_cast<value_idx_type>(static_cast<float>(bucket_count()) * max_load_factor());
+        }
+    }
+
+    // Rebuilds the bucket array for max(min(count, max_size()), size()) elements. Nothing happens
+    // when the shift computed for that size equals the current one. Otherwise the old buckets are
+    // released, m_values is shrunk to fit, and new buckets are allocated and refilled from the
+    // stored values.
+    void rehash(size_t count) {
+        auto const wanted = std::max(std::min(count, max_size()), size());
+        auto const shifts = calc_shifts_for_size(wanted);
+        if (shifts == m_shifts) {
+            return;
+        }
+
+        m_shifts = shifts;
+        deallocate_buckets();
+        m_values.shrink_to_fit();
+        allocate_buckets_from_shift();
+        clear_and_fill_buckets_from_values();
+    }
+
+    // Prepares the table for `capa` elements (clamped to max_size()). The values container is
+    // reserved when it offers reserve(). Buckets are only rebuilt when none exist yet or when the
+    // shift computed for max(capa, size()) is smaller than the current m_shifts.
+    void reserve(size_t capa) {
+        capa = std::min(capa, max_size());
+        if constexpr (has_reserve<value_container_type>) {
+            // not every container has reserve() (std::deque doesn't), so only call it when available
+            m_values.reserve(capa);
+        }
+
+        auto const shifts = calc_shifts_for_size(std::max(capa, size()));
+        bool const needs_new_buckets = (0 == m_num_buckets) || (shifts < m_shifts);
+        if (!needs_new_buckets) {
+            return;
+        }
+
+        m_shifts = shifts;
+        deallocate_buckets();
+        allocate_buckets_from_shift();
+        clear_and_fill_buckets_from_values();
+    }
+
+    // observers //////////////////////////////////////////////////////////////
+
+    // Returns a copy of the stored hash functor.
+    auto hash_function() const -> hasher {
+        return m_hash;
+    }
+
+    // Returns a copy of the stored key-equality functor.
+    auto key_eq() const -> key_equal {
+        return m_equal;
+    }
+
+    // nonstandard API: expose the underlying values container
+    // (read-only reference; no copy is made)
+    [[nodiscard]] auto values() const noexcept -> value_container_type const& {
+        return m_values;
+    }
+
+    // non-member functions ///////////////////////////////////////////////////
+
+    // Equality: two tables are equal when they have the same size and every element of `b` has a
+    // counterpart in `a` with an equal key and, for maps, an equal mapped value. Element order
+    // does not matter. Comparing a table with itself short-circuits to true.
+    friend auto operator==(table const& a, table const& b) -> bool {
+        if (&a == &b) {
+            return true;
+        }
+        if (a.size() != b.size()) {
+            return false;
+        }
+
+        auto const a_end = a.end();
+        for (auto const& b_entry : b) {
+            auto const a_pos = a.find(get_key(b_entry));
+            if (a_end == a_pos) {
+                // key of b is missing in a
+                return false;
+            }
+            if constexpr (!std::is_void_v<T>) {
+                // map: the mapped values have to match as well
+                if (!(b_entry.second == a_pos->second)) {
+                    return false;
+                }
+            }
+        }
+        return true;
+    }
+
+    // Inequality is the negation of operator==.
+    friend auto operator!=(table const& a, table const& b) -> bool {
+        return !(a == b);
+    }
+};
+
+} // namespace detail
+
+// Hash map alias: detail::table with T as mapped type. By default the values are stored as
+// std::pair<Key, T> through std::allocator and buckets use bucket_type::standard.
+template <class Key,
+          class T,
+          class Hash = hash<Key>,
+          class KeyEqual = std::equal_to<Key>,
+          class AllocatorOrContainer = std::allocator<std::pair<Key, T>>,
+          class Bucket = bucket_type::standard>
+using map = detail::table<Key, T, Hash, KeyEqual, AllocatorOrContainer, Bucket>;
+
+// Hash set alias: detail::table with T = void, so the stored value is the key itself.
+template <class Key,
+          class Hash = hash<Key>,
+          class KeyEqual = std::equal_to<Key>,
+          class AllocatorOrContainer = std::allocator<Key>,
+          class Bucket = bucket_type::standard>
+using set = detail::table<Key, void, Hash, KeyEqual, AllocatorOrContainer, Bucket>;
+
+#    if ANKERL_UNORDERED_DENSE_PMR
+
+// Variants of map/set that allocate through std::pmr::polymorphic_allocator.
+// Only compiled when ANKERL_UNORDERED_DENSE_PMR is enabled.
+namespace pmr {
+
+template <class Key,
+          class T,
+          class Hash = hash<Key>,
+          class KeyEqual = std::equal_to<Key>,
+          class Bucket = bucket_type::standard>
+using map = detail::table<Key, T, Hash, KeyEqual, std::pmr::polymorphic_allocator<std::pair<Key, T>>, Bucket>;
+
+template <class Key, class Hash = hash<Key>, class KeyEqual = std::equal_to<Key>, class Bucket = bucket_type::standard>
+using set = detail::table<Key, void, Hash, KeyEqual, std::pmr::polymorphic_allocator<Key>, Bucket>;
+
+} // namespace pmr
+
+#    endif
+
+// deduction guides ///////////////////////////////////////////////////////////
+
+// deduction guides for alias templates are only possible since C++20
+// see https://en.cppreference.com/w/cpp/language/class_template_argument_deduction
+
+} // namespace ANKERL_UNORDERED_DENSE_NAMESPACE
+} // namespace ankerl::unordered_dense
+
+// std extensions /////////////////////////////////////////////////////////////
+
+namespace std { // NOLINT(cert-dcl58-cpp)
+
+// Erases every element of `map` for which `pred(element)` returns true and returns the number of
+// erased elements (the same contract as std::erase_if for the standard unordered containers).
+template <class Key, class T, class Hash, class KeyEqual, class AllocatorOrContainer, class Bucket, class Pred>
+auto erase_if(ankerl::unordered_dense::detail::table<Key, T, Hash, KeyEqual, AllocatorOrContainer, Bucket>& map, Pred pred)
+    -> size_t {
+    using map_t = ankerl::unordered_dense::detail::table<Key, T, Hash, KeyEqual, AllocatorOrContainer, Bucket>;
+
+    // going back to front because erase() invalidates the end iterator
+    auto const old_size = map.size();
+    auto idx = old_size;
+    while (idx) {
+        --idx;
+        auto it = map.begin() + static_cast<typename map_t::difference_type>(idx);
+        if (pred(*it)) {
+            map.erase(it);
+        }
+    }
+
+    // The map can only have shrunk, so subtract in this order; the reverse order wraps around
+    // the unsigned size_t as soon as at least one element was erased.
+    return old_size - map.size();
+}
+
+} // namespace std
+
+#endif
+#endif
diff --git a/server/vector_type.hpp b/server/vector_type.hpp
index f817051..829044d 100644
--- a/server/vector_type.hpp
+++ b/server/vector_type.hpp
@@ -36,7 +36,7 @@ public:
 		this->size = vt.size;
 		this->capacity = vt.capacity;
 		if (capacity) {
-			// puts("copy");
+			//puts("copy");
 			this->container = (_Ty*)malloc(size * sizeof(_Ty));
 			memcpy(container, vt.container, sizeof(_Ty) * size);
 		}
@@ -152,18 +152,34 @@ public:
 		else
 			return distinct_copy();
 	}
-	inline void grow() {
-		if (size >= capacity) { // geometric growth
-			uint32_t new_capacity = size + 1 + (size >> 1);
-			_Ty* n_container = (_Ty*)malloc(new_capacity * sizeof(_Ty));
-			memcpy(n_container, container, sizeof(_Ty) * size);
+	// TODO: think of situations where this is a temp!! (copy on write!!!)
+	// Ensures the buffer can hold more elements, reallocating it when needed.
+	//   _grow == true : `sz` is ignored and replaced by the current size; when size >= capacity
+	//                   the capacity becomes size + 1 + size / 2.
+	//   _grow == false: when sz >= capacity the capacity becomes exactly `sz`.
+	// After a reallocation the elements from index `size` up to the new capacity are zero-filled.
+	// NOTE(review): the result of realloc() is not checked before it is used.
+	// NOTE(review): the previous version only freed `container` when capacity != 0; realloc() is
+	// now applied to `container` unconditionally - confirm it is always nullptr or heap-owned here.
+	template <bool _grow = true>
+	inline void grow(uint32_t sz = 0) {
+		if constexpr (_grow)
+			sz = this->size;
+		if (sz >= capacity) { // geometric growth
+			uint32_t new_capacity;
+			if constexpr (_grow)
+				new_capacity = size + 1 + (size >> 1);
+			else	
+				new_capacity = sz;
+
+			_Ty* n_container = (_Ty*)realloc(container, new_capacity * sizeof(_Ty));
+			// memcpy(n_container, container, sizeof(_Ty) * size);
 			memset(n_container + size, 0, sizeof(_Ty) * (new_capacity - size));
-			if (capacity)
-				free(container);
+			// if (capacity)
+			// 	free(container);
 			container = n_container;
 			capacity = new_capacity;
 		}
 	}
+	// Sets the element count to `sz` and makes sure the buffer can hold it.
+	// `size` is assigned before grow<false>(sz) runs, so when grow() reallocates, its zero-fill
+	// covers new_capacity - size == 0 elements: newly exposed elements are not zeroed by this call.
+	inline void resize(const uint32_t sz){
+		size = sz;
+		grow<false>(sz);
+	}
+	// Makes sure the buffer can hold `sz` elements without changing `size`
+	// (reallocates to exactly `sz` when sz >= capacity).
+	inline void reserve(const uint32_t sz){
+		grow<false>(sz);
+	}
 	void emplace_back(const _Ty& _val) {
 		grow();
 		container[size++] = _val;