hashtable optimization, aggressive SIMD via OpenMP

master
bill 1 year ago
parent f9205dc2a6
commit 0815222e96

@ -93,7 +93,7 @@ ifeq ($(AQ_DEBUG), 1)
OPTFLAGS = -g3 #-static-libsan -fsanitize=address
# LINKFLAGS =
else
OPTFLAGS += -Ofast -DNDEBUG -fno-stack-protector
OPTFLAGS += -Ofast -DNDEBUG -fno-stack-protector -fopenmp
LINKFLAGS += -flto -s
endif

@ -0,0 +1,53 @@
"""Prepend a copyright header to recognised source files below the current directory.

Files named ``Makefile``/``Dockerfile`` and files ending in ``.py``/``.sh``
receive a ``#``-style header; C/C++ sources and headers receive a ``/* */``
header.  A file that already carries the header is left untouched, so the
script can be re-run safely.
"""
import os
from typing import Optional

# Licence text, one list entry per line; each entry becomes one comment line.
payload = ('''\
(C) Bill Sun 2022 - 2023
All rights reserved. (or some other license stuff)
''').strip().split('\n')


def comment_factory(mark: str, enclosure: str = '') -> bytes:
    """Render ``payload`` as an encoded comment block.

    Args:
        mark: Prefix placed in front of every payload line (``#`` or ``*``).
        enclosure: Extra character that opens and closes the block; ``'/'``
            together with ``mark='*'`` yields ``/*`` ... ``*/``.

    Returns:
        The comment block as bytes, terminated by one blank line.
    """
    lines = [f'{enclosure}{mark}']
    lines.extend(f'{mark} {text}' for text in payload)
    lines.append(f'{mark}{enclosure}')
    return ('\n'.join(lines) + '\n\n').encode()


py_payload = comment_factory('#')
c_payload = comment_factory('*', '/')

# File names (lower-cased) and extensions that select each header flavour.
HASH_COMMENT_NAMES = ('makefile', 'dockerfile')
HASH_COMMENT_EXTS = ('py', 'sh')
C_COMMENT_EXTS = ('cc', 'c', 'cpp', 'cxx', 'hpp', 'h')


def header_for(filename: str) -> Optional[bytes]:
    """Return the header matching ``filename``, or ``None`` if it is not handled."""
    lowered = filename.lower()
    if lowered in HASH_COMMENT_NAMES:
        return py_payload
    # rpartition keeps the original rule: everything after the LAST dot is
    # the extension, and a name without any dot has no extension at all.
    _, dot, ext = lowered.rpartition('.')
    if not dot:
        return None
    if ext in HASH_COMMENT_EXTS:
        return py_payload
    if ext in C_COMMENT_EXTS:
        return c_payload
    return None


def with_header(content: bytes, header: bytes) -> bytes:
    """Return ``content`` with ``header`` inserted, or ``content`` itself if present.

    A leading ``#!`` line is kept as the very first line, because the kernel
    only honours a shebang at offset 0; the header goes right after it.
    Files whose header was previously written at offset 0 are still
    recognised as already processed.
    """
    shebang = b''
    body = content
    if content.startswith(b'#!'):
        newline_at = content.find(b'\n')
        cut = len(content) if newline_at < 0 else newline_at + 1
        shebang, body = content[:cut], content[cut:]
        if not shebang.endswith(b'\n'):
            # A file consisting only of an unterminated shebang line.
            shebang += b'\n'
    if body.startswith(header):
        return content
    return shebang + header + body


def write_to_file(fullpath: str, header: bytes) -> None:
    """Insert ``header`` into the file at ``fullpath`` unless it is already there."""
    with open(fullpath, 'rb+') as f:
        content = f.read()
        updated = with_header(content, header)
        if updated == content:
            print('not processed', fullpath)
            return
        # The new content is never shorter than the old one, so rewriting
        # from offset 0 needs no truncate().
        f.seek(0)
        f.write(updated)
    print('processed', fullpath)


def main(root: str = '.') -> None:
    """Walk ``root`` level by level and stamp every recognised file."""
    pending = [root]
    while pending:
        upcoming = []
        for directory in pending:
            for entry in os.listdir(directory):
                fullpath = f'{directory}{os.sep}{entry}'
                if os.path.isdir(fullpath):
                    upcoming.append(fullpath)
                    continue
                header = header_for(entry)
                if header is not None:
                    write_to_file(fullpath, header)
        pending = upcoming


if __name__ == '__main__':
    main()

@ -9,3 +9,7 @@
## 2. ColRef supports multiple objects
- A.a = B.b then in projection A.a B.b will refer to same projection
- Colref::ProjEq(ColRef v) => this == v or v in this.proj_eqs
## 3. External Optimizing Tools
- Using mold/sold instead of the built-in linker will boost link speed
- BOLT can be used as a post-link binary optimizer

@ -566,6 +566,7 @@ class scan(ast_node):
self.parent.context.scans.append(self)
def produce(self, node):
self.start += '#pragma openmp simd\n'
if self.loop_style == scan.LoopStyle.foreach:
self.colref = node
self.start += f'for ({self.const}auto& {self.it_var} : {node}) {{\n'
@ -595,7 +596,7 @@ class scan(ast_node):
self.start +
self.front +
b +
'}'
'\n}'
) for b in self.body])
+
self.end
@ -606,7 +607,7 @@ class scan(ast_node):
self.start +
self.front +
'\n'.join(self.body) +
'}' +
'\n}' +
self.end
)
self.context.remove_scan(self, scan_assembly)
@ -657,11 +658,13 @@ class groupby_c(ast_node):
self.context.emitc(f'AQHashTable<{self.group_type}, '
f'transTypes<{self.group_type}, hasher>> {self.group} {{{self.total_sz}}};')
self.n_grps = len(self.glist)
self.scanner = scan(self, self.total_sz, it_name=scanner_itname)
self.scanner.add(f'{self.group}.hashtable_push(forward_as_tuple({g_contents}), {self.scanner.it_var});')
# self.scanner = scan(self, self.total_sz, it_name=scanner_itname)
# self.scanner.add(f'{self.group}.hashtable_push(forward_as_tuple({g_contents}), {self.scanner.it_var});')
self.context.emitc(f'{self.group}.hashtable_push_all({g_contents}, {self.total_sz});')
def consume(self, _):
self.scanner.finalize()
# self.scanner.finalize()
self.context.emitc('printf("ht_construct: %lld\\n", (chrono::high_resolution_clock::now() - timer).count()); timer = chrono::high_resolution_clock::now();')
self.context.emitc(f'auto {self.vecs} = {self.group}.ht_postproc({self.total_sz});')
self.context.emitc('printf("ht_postproc: %lld\\n", (chrono::high_resolution_clock::now() - timer).count()); timer = chrono::high_resolution_clock::now();')

@ -1,13 +1,13 @@
#ifndef _AQUERY_H
#define _AQUERY_H
enum Log_level {
enum Log_level : int {
LOG_INFO,
LOG_ERROR,
LOG_SILENT
};
enum Backend_Type {
enum Backend_Type : int {
BACKEND_AQuery,
BACKEND_MonetDB,
BACKEND_MariaDB

@ -14,7 +14,7 @@ struct AQQueryResult {
#ifndef __AQBACKEND_TYPE__
#define __AQBACKEND_TYPE__ 1
enum Backend_Type {
enum Backend_Type : int {
BACKEND_AQuery,
BACKEND_MonetDB,
BACKEND_MariaDB,

@ -1,3 +1,7 @@
/*
* (C) Bill Sun 2022 - 2023
*/
#pragma once
#include <type_traits>
@ -5,6 +9,7 @@
#include <functional>
#include <string_view>
#include "types.h"
#include "vector_type.hpp"
// #include "robin_hood.h"
#include "unordered_dense.h"
@ -138,68 +143,145 @@ namespace ankerl::unordered_dense{
struct hash<std::tuple<Types...>> : public hasher<Types...>{ };
}
// Hash set of group keys that additionally records, for every input row,
// which set entry the row mapped to, so that rows can later be bucketed
// per entry (see ht_postproc).
//
// Buffers (all raw malloc/calloc memory; nothing in this class frees them):
//   reversemap : sz entries, reversemap[row] = value returned by
//                hashtable_push for that row
//   mapbase    : second half of the same allocation (reversemap + sz),
//                filled with row indices by ht_postproc
//   ht_base    : sz zero-initialised counters, indexed by the value stored
//                in reversemap
// NOTE(review): hashtable_push on the base set is presumably a project
// extension returning the dense index of the key -- confirm in
// unordered_dense.h.
template <class Key, class Hash>
class AQHashTable : public ankerl::unordered_dense::set<Key, Hash> {
public:
uint32_t* reversemap, *mapbase, *ht_base;
// Allocates nothing; the three pointers are only assigned by the sized
// constructor or by init().
AQHashTable() = default;
// Reserves room for sz keys and allocates the bookkeeping buffers.
explicit AQHashTable(uint32_t sz)
: ankerl::unordered_dense::set<Key, Hash>{} {
this->reserve(sz);
this->m_values.reserve(sz);
// One allocation of 2 * sz entries: first half reversemap, second half mapbase.
reversemap = static_cast<uint32_t *>(malloc(sizeof(uint32_t) * sz * 2));
mapbase = reversemap + sz;
// calloc so every counter starts at zero.
ht_base = static_cast<uint32_t *>(calloc(sz, sizeof(uint32_t)));
}
// Same buffer setup as the sized constructor, for a default-constructed
// table. Unlike the constructor it does not call m_values.reserve().
void init(uint32_t sz) {
ankerl::unordered_dense::set<Key, Hash>::reserve(sz);
reversemap = static_cast<uint32_t *>(malloc(sizeof(uint32_t) * sz * 2));
mapbase = reversemap + sz;
ht_base = static_cast<uint32_t *>(calloc(sz, sizeof(uint32_t)));
}
// Pushes rows 0..len-1, building each key from keys[i]..., then counts
// how many rows landed on each returned index.
// NOTE(review): the parameter pack precedes `len`, which makes it a
// non-deduced context; callers presumably have to spell Keys_t
// explicitly -- confirm against the generated call sites.
template<typename... Keys_t>
inline void hashtable_push_all(Keys_t& ... keys, uint32_t len) {
for(uint32_t i = 0; i < len; ++i)
reversemap[i] = ankerl::unordered_dense::set<Key, Hash>::hashtable_push(keys[i]...);
// Counting is done in a second pass over reversemap.
for(uint32_t i = 0; i < len; ++i)
++ht_base[reversemap[i]];
}
// Single-row variant: records the index returned for row i and bumps its
// counter immediately.
inline void hashtable_push(Key&& k, uint32_t i){
reversemap[i] = ankerl::unordered_dense::set<Key, Hash>::hashtable_push(k);
++ht_base[reversemap[i]]; // do this separately?
}
// Buckets the first sz row indices by the index stored in reversemap.
// Returns a malloc'ed array with one vector_type<uint32_t> per set entry;
// entry i is initialised via init_from(count_i, pointer into mapbase).
// NOTE(review): vecs[0] is accessed unconditionally, so this assumes the
// set holds at least one key -- confirm callers never pass an empty table.
auto ht_postproc(uint32_t sz) {
// NOTE(review): arr_values is not used below.
auto& arr_values = this->values();
const auto& len = this->size();
auto vecs = static_cast<vector_type<uint32_t>*>(malloc(sizeof(vector_type<uint32_t>) * len));
vecs[0].init_from(ht_base[0], mapbase);
for (uint32_t i = 1; i < len; ++i) {
// ht_base[i - 1] already holds the running total of entries 0..i-1, which
// is where entry i's slice of mapbase starts.
vecs[i].init_from(ht_base[i], mapbase + ht_base[i - 1]);
// Turn the per-entry count into an inclusive prefix sum.
ht_base[i] += ht_base[i - 1];
}
for (uint32_t i = 0; i < sz; ++i) {
auto id = reversemap[i];
// Pre-decrementing the prefix sum fills each entry's slice from its last
// slot towards its first.
mapbase[--ht_base[id]] = i;
}
return vecs;
}
};
template <
typename ValueType = bool,
typename ValueType = uint32_t,
int PerfectHashingThreshold = 12
>
struct PerfectHashTable {
// static int m_PerfectHashingThreshold = 12;
using key_t = std::conditional_t<PerfectHashingThreshold <= 8, uint8_t,
std::conditional_t<PerfectHashingThreshold <= 16, uint16_t,
std::conditional_t<PerfectHashingThreshold <= 32, uint32_t,
uint64_t
std::conditional_t<PerfectHashingThreshold <= 16, uint16_t,
std::conditional_t<PerfectHashingThreshold <= 32, uint32_t,
uint64_t
>>>;
int n_cols, n_rows = 0;
// char bits[32];
ValueType table[1 << PerfectHashingThreshold];
// PerfectHashTable(int n_cols, char* bits) {
// this->n_cols = n_cols;
// memcpy(this->bits, bits, 32);
// }
// template<typename ... Types, template <typename> class VT>
// PerfectHashTable(VT<Types> ... args) {
// }
constexpr static uint32_t tbl_sz = 1 << PerfectHashingThreshold;
template <typename ... Types, template <typename> class VT>
// std::enable_if_t<std::is_same_v<ValueType, bool>, void>
void
static vector_type<uint32_t>*
construct(VT<Types>&... args) { // construct a hash set
((this->n_cols = args.size), ...);
AQTmr();
int n_cols, n_rows = 0;
((n_cols = args.size), ...);
static_assert(
(sizeof...(Types) < PerfectHashingThreshold) &&
//(sizeof(Types) + ...) < PerfectHashingThreshold &&
(std::is_integral_v<Types> && ...),
"Types must be integral and less than 12 wide in total."
);
// this should be an attrib of VT.
key_t* // this better be automatically determined by Threshould
hash_values = static_cast<key_t*>(
calloc(this->n_cols, sizeof(key_t))
);
//new short[this->n_cols] {0}; // use calloc/delete
key_t*
hash_values = static_cast<key_t*>(
calloc(n_cols, sizeof(key_t))
);
auto get_hash = [&hash_values](auto& arg, int idx) {
uint32_t i = 0;
if(idx > 0)
for (auto& a : arg) {
if (idx > 0) {
#pragma omp simd
for (uint32_t i = 0; i < arg.size; ++i) {
hash_values[i] =
(hash_values[i] << arg.stats.bits) +
(a - arg.stats.minima);
++i;
(arg.container[i] - arg.stats.minima);
}
else
for (auto& a : arg) {
hash_values[i] = a - arg.stats.minima;
++i;
}
else {
#pragma omp simd
for (uint32_t i = 0; i < arg.size; ++i) {
hash_values[i] = arg.container[i] - arg.stats.minima;
}
}
};
};
int idx = 0;
(get_hash(args, idx++), ...);
for (uint32_t i = 0; i < this->n_cols; ++i) {
this->table[hash_values[i]] = true;
// problem: random memory access
uint32_t cnt[tbl_sz];
uint32_t n_grps = 0;
memset(cnt, 0, tbl_sz * sizeof(tbl_sz));
#pragma omp simd
for (uint32_t i = 0; i < n_cols; ++i) {
++cnt[hash_values[i]];
}
ValueType grp_ids[tbl_sz];
#pragma omp simd
for (ValueType i = 0; i < tbl_sz; ++i) {
if (cnt[i] != 0) {
cnt[n_grps] = cnt[i];
grp_ids[i] = n_grps++;
}
}
uint32_t* idxs = static_cast<uint32_t*>(
malloc(n_cols * sizeof(uint32_t))
);
uint32_t** idxs_ptr = static_cast<uint32_t**>(
malloc(n_grps * sizeof(uint32_t*))
);
idxs_ptr[0] = idxs;
#ifdef _MSCVER
#pragma omp simd
#endif
for (int i = 1; i < n_grps; ++i) {
idxs_ptr[i] = idxs_ptr[i - 1] + cnt[i - 1];
}
#pragma omp simd
for (int i = 0; i < n_cols; ++i) {
*(idxs_ptr[grp_ids[hash_values[i]]]++) = i;
}
vector_type<uint32_t>* idxs_vec = static_cast<vector_type<uint32_t>*>(
malloc(n_grps * sizeof(vector_type<uint32_t>))
);
#pragma omp simd
for (int i = 0; i < n_grps; ++i) {
idxs_vec[i].container = idxs_ptr[i];
idxs_vec[i].size = cnt[i];
}
// delete[] hash_values;
free(hash_values); // dispatch to gc
free(hash_values);
return idxs_vec;
}
};

@ -57,7 +57,7 @@ char* intToString(T val, char* buf){
}
enum Log_level {
enum Log_level : int {
LOG_INFO,
LOG_ERROR,
LOG_SILENT
@ -65,7 +65,7 @@ enum Log_level {
#ifndef __AQBACKEND_TYPE__
#define __AQBACKEND_TYPE__ 1
enum Backend_Type {
enum Backend_Type : int {
BACKEND_AQuery,
BACKEND_MonetDB,
BACKEND_MariaDB,

@ -1,3 +1,7 @@
/*
* (C) Bill Sun 2022 - 2023
*/
#include "pch_msc.hpp"
// Non-standard Extensions for MonetDBe, may break concurrency control!

@ -80,7 +80,7 @@ extern "C" void __DLLEXPORT__ receive_args(int argc, char**argv){
n_recvd = argv;
}
enum BinaryInfo_t {
enum BinaryInfo_t : int { // For ABI consistency between compiler
MSVC, MSYS, GCC, CLANG, AppleClang
};

@ -35,7 +35,7 @@ struct ColRef_cstorage {
void* container;
unsigned int size, capacity;
const char* name;
int ty; // what if enum is not int?
int ty;
};
template <template <class...> class VT, class T,

@ -32,7 +32,7 @@ constexpr static bool is_vector_type = is_vector_impl<T>::value;
template <class T>
constexpr size_t aq_szof = sizeof(T);
template <>
inline constexpr size_t aq_szof<void> = 0;
constexpr size_t aq_szof<void> = 0;
template <class T1, class T2>
struct aqis_same_impl {
constexpr static bool value =
@ -68,7 +68,7 @@ template <class T1, class T2>
constexpr bool aqis_same<T1, T2> = aqis_same_impl<T1, T2>::value;
namespace types {
enum Type_t {
enum Type_t : int {
__AQUERY_TYPES__
};
static constexpr const char* printf_str[] = { "%d", "%f", "%s", "%lf", "%Lf", "%ld", "%s", "%hi", "%s", "%s", "%hhd",

@ -1,5 +1,5 @@
/*
* Bill Sun 2022
* (C) Bill Sun 2022 - 2023
*/
@ -13,7 +13,11 @@
#include <initializer_list>
#include <unordered_set>
#include <iostream>
#include "hasher.h"
template <typename _Ty>
class vector_type;
// #include "hasher.h"
#include "types.h"
#include "gc.h"
#pragma pack(push, 1)
@ -484,56 +488,6 @@ public:
};
#pragma pack(pop)
// Hash set of group keys that also records, per input row, the value
// returned by the base set's hashtable_push, and can bucket row indices by
// that value afterwards (ht_postproc).
// reversemap and mapbase share one malloc of 2 * sz entries; ht_base is a
// calloc'ed array of sz counters. Nothing in this class frees them.
// NOTE(review): hashtable_push on the base set is presumably a project
// extension returning the dense index of the key -- confirm.
template <class Key, class Hash>
class AQHashTable : public ankerl::unordered_dense::set<Key, Hash> {
public:
uint32_t* reversemap, *mapbase, *ht_base;
// Allocates nothing; the buffers are set up by the sized constructor or init().
AQHashTable() = default;
// Reserves room for sz keys and allocates the bookkeeping buffers.
explicit AQHashTable(uint32_t sz)
: ankerl::unordered_dense::set<Key, Hash>{} {
this->reserve(sz);
this->m_values.reserve(sz);
// First half of the allocation is reversemap, second half is mapbase.
reversemap = static_cast<uint32_t *>(malloc(sizeof(uint32_t) * sz * 2));
mapbase = reversemap + sz;
// Counters start at zero.
ht_base = static_cast<uint32_t *>(calloc(sz, sizeof(uint32_t)));
}
// Buffer setup for a default-constructed table (no m_values.reserve() here).
void init(uint32_t sz) {
ankerl::unordered_dense::set<Key, Hash>::reserve(sz);
reversemap = static_cast<uint32_t *>(malloc(sizeof(uint32_t) * sz * 2));
mapbase = reversemap + sz;
ht_base = static_cast<uint32_t *>(calloc(sz, sizeof(uint32_t)));
}
// Pushes rows 0..len-1 (key built from keys[i]...), then counts rows per
// returned index in a second pass.
// NOTE(review): the pack precedes `len` and is therefore not deducible;
// confirm how call sites supply Keys_t.
template<typename... Keys_t>
inline void hashtable_push_all(Keys_t& ... keys, uint32_t len) {
for(uint32_t i = 0; i < len; ++i)
reversemap[i] = ankerl::unordered_dense::set<Key, Hash>::hashtable_push(keys[i]...);
for(uint32_t i = 0; i < len; ++i)
++ht_base[reversemap[i]];
}
// Single-row variant: stores the returned index for row i and bumps its counter.
inline void hashtable_push(Key&& k, uint32_t i){
reversemap[i] = ankerl::unordered_dense::set<Key, Hash>::hashtable_push(k);
++ht_base[reversemap[i]]; // do this separately?
}
// Buckets the first sz row indices by reversemap value and returns a
// malloc'ed array of one vector_type<uint32_t> per set entry.
// NOTE(review): vecs[0] is touched unconditionally -- assumes a non-empty set.
auto ht_postproc(uint32_t sz) {
// NOTE(review): arr_values is not used below.
auto& arr_values = this->values();
const auto& len = this->size();
auto vecs = static_cast<vector_type<uint32_t>*>(malloc(sizeof(vector_type<uint32_t>) * len));
vecs[0].init_from(ht_base[0], mapbase);
for (uint32_t i = 1; i < len; ++i) {
// Slice i starts at the running total of entries 0..i-1.
vecs[i].init_from(ht_base[i], mapbase + ht_base[i - 1]);
// Count becomes an inclusive prefix sum.
ht_base[i] += ht_base[i - 1];
}
for (uint32_t i = 0; i < sz; ++i) {
auto id = reversemap[i];
// Fill each slice from its last slot towards its first.
mapbase[--ht_base[id]] = i;
}
return vecs;
}
};
template<>
vector_type<std::string_view>::vector_type(const uint32_t size, void* data);

Loading…
Cancel
Save