From 6adb7900cbe98a14b98c81375ab1ec74dc2d4fe7 Mon Sep 17 00:00:00 2001
From: taozizhuo
Date: Mon, 5 Dec 2022 14:28:29 -0800
Subject: [PATCH 1/7] add benchmark queries

---
 benchmark/quries/Aquery/load_data.a | 6 ++++++
 benchmark/quries/Aquery/q0.a        | 5 +++++
 benchmark/quries/Aquery/q1.a        | 7 +++++++
 benchmark/quries/Aquery/q10.a       | 4 ++++
 benchmark/quries/Aquery/q2.a        | 4 ++++
 benchmark/quries/Aquery/q3.a        | 7 +++++++
 benchmark/quries/Aquery/q4.a        | 5 +++++
 benchmark/quries/Aquery/q7.a        | 5 +++++
 benchmark/quries/Aquery/q8.a        | 6 ++++++
 benchmark/quries/Aquery/q9.a        | 6 ++++++
 benchmark/quries/Clickhouse/q0      | 3 +++
 benchmark/quries/Clickhouse/q1      | 4 ++++
 benchmark/quries/Clickhouse/q10     | 8 ++++++++
 benchmark/quries/Clickhouse/q2      | 2 ++
 benchmark/quries/Clickhouse/q3      | 4 ++++
 benchmark/quries/Clickhouse/q4      | 2 ++
 benchmark/quries/Clickhouse/q7      | 5 +++++
 benchmark/quries/Clickhouse/q8      | 3 +++
 benchmark/quries/Clickhouse/q9      | 3 +++
 benchmark/quries/Timescaledb/q0     | 3 +++
 benchmark/quries/Timescaledb/q1     | 4 ++++
 benchmark/quries/Timescaledb/q10    | 7 +++++++
 benchmark/quries/Timescaledb/q2     | 2 ++
 benchmark/quries/Timescaledb/q3     | 4 ++++
 benchmark/quries/Timescaledb/q4     | 2 ++
 benchmark/quries/Timescaledb/q7     | 5 +++++
 benchmark/quries/Timescaledb/q8     | 3 +++
 benchmark/quries/Timescaledb/q9     | 3 +++
 28 files changed, 122 insertions(+)
 create mode 100644 benchmark/quries/Aquery/load_data.a
 create mode 100644 benchmark/quries/Aquery/q0.a
 create mode 100644 benchmark/quries/Aquery/q1.a
 create mode 100644 benchmark/quries/Aquery/q10.a
 create mode 100644 benchmark/quries/Aquery/q2.a
 create mode 100644 benchmark/quries/Aquery/q3.a
 create mode 100644 benchmark/quries/Aquery/q4.a
 create mode 100644 benchmark/quries/Aquery/q7.a
 create mode 100644 benchmark/quries/Aquery/q8.a
 create mode 100644 benchmark/quries/Aquery/q9.a
 create mode 100644 benchmark/quries/Clickhouse/q0
 create mode 100644 benchmark/quries/Clickhouse/q1
 create mode 100644 benchmark/quries/Clickhouse/q10
 create mode 100644 benchmark/quries/Clickhouse/q2
 create mode 100644 benchmark/quries/Clickhouse/q3
 create mode 100644 benchmark/quries/Clickhouse/q4
 create mode 100644 benchmark/quries/Clickhouse/q7
 create mode 100644 benchmark/quries/Clickhouse/q8
 create mode 100644 benchmark/quries/Clickhouse/q9
 create mode 100644 benchmark/quries/Timescaledb/q0
 create mode 100644 benchmark/quries/Timescaledb/q1
 create mode 100644 benchmark/quries/Timescaledb/q10
 create mode 100644 benchmark/quries/Timescaledb/q2
 create mode 100644 benchmark/quries/Timescaledb/q3
 create mode 100644 benchmark/quries/Timescaledb/q4
 create mode 100644 benchmark/quries/Timescaledb/q7
 create mode 100644 benchmark/quries/Timescaledb/q8
 create mode 100644 benchmark/quries/Timescaledb/q9

diff --git a/benchmark/quries/Aquery/load_data.a b/benchmark/quries/Aquery/load_data.a
new file mode 100644
index 0000000..54bc36f
--- /dev/null
+++ b/benchmark/quries/Aquery/load_data.a
@@ -0,0 +1,6 @@
+CREATE TABLE trade01m(stocksymbol STRING, time INT, quantity INT, price INT)
+load data infile "../tables/trade01m.csv" into table trade01m fields terminated by ','
+CREATE TABLE trade1m(stocksymbol STRING, time INT, quantity INT, price INT)
+load data infile "../tables/trade1m.csv" into table trade1m fields terminated by ','
+CREATE TABLE trade10m(stocksymbol STRING, time INT, quantity INT, price INT)
+load data infile "../tables/trade10m.csv" into table trade10m fields terminated by ','
\ No newline at end of file
diff --git a/benchmark/quries/Aquery/q0.a b/benchmark/quries/Aquery/q0.a
new file mode 100644
index 0000000..a18deec
--- /dev/null
+++ b/benchmark/quries/Aquery/q0.a
@@ -0,0 +1,5 @@
+-- select rows
+
+CREATE TABLE res0 AS
+SELECT * FROM trade10m
+
\ No newline at end of file
diff --git a/benchmark/quries/Aquery/q1.a b/benchmark/quries/Aquery/q1.a
new file mode 100644
index 0000000..f3077a9
--- /dev/null
+++ b/benchmark/quries/Aquery/q1.a
@@ -0,0 +1,7 @@
+-- groupby_multi_different_functions
+
+CREATE TABLE res1 AS
+SELECT avg(quantity) AS avg_quan, min(price) AS min_p
+FROM trade1m
+GROUP BY stocksymbol, time
+
\ No newline at end of file
diff --git a/benchmark/quries/Aquery/q10.a b/benchmark/quries/Aquery/q10.a
new file mode 100644
index 0000000..8c891ba
--- /dev/null
+++ b/benchmark/quries/Aquery/q10.a
@@ -0,0 +1,4 @@
+SELECT stocksymbol, MAX(stddevs(3, price))
+FROM trade1m
+ASSUMING ASC time
+GROUP BY stocksymbol
\ No newline at end of file
diff --git a/benchmark/quries/Aquery/q2.a b/benchmark/quries/Aquery/q2.a
new file mode 100644
index 0000000..28e6368
--- /dev/null
+++ b/benchmark/quries/Aquery/q2.a
@@ -0,0 +1,4 @@
+-- count values
+
+SELECT COUNT(*) FROM trade10m
+
\ No newline at end of file
diff --git a/benchmark/quries/Aquery/q3.a b/benchmark/quries/Aquery/q3.a
new file mode 100644
index 0000000..c6f7a5b
--- /dev/null
+++ b/benchmark/quries/Aquery/q3.a
@@ -0,0 +1,7 @@
+-- group by multiple keys
+
+create table res3 AS
+SELECT sum(quantity) as sum_quantity
+FROM trade01m
+GROUP BY stocksymbol, price
+
\ No newline at end of file
diff --git a/benchmark/quries/Aquery/q4.a b/benchmark/quries/Aquery/q4.a
new file mode 100644
index 0000000..bab175f
--- /dev/null
+++ b/benchmark/quries/Aquery/q4.a
@@ -0,0 +1,5 @@
+-- append tables
+
+CREATE TABLE res4 AS
+SELECT * FROM trade10m UNION ALL SELECT * FROM trade10m
+
\ No newline at end of file
diff --git a/benchmark/quries/Aquery/q7.a b/benchmark/quries/Aquery/q7.a
new file mode 100644
index 0000000..7e384c8
--- /dev/null
+++ b/benchmark/quries/Aquery/q7.a
@@ -0,0 +1,5 @@
+CREATE table res7 AS
+SELECT stocksymbol, avgs(5, price)
+FROM trade10m
+ASSUMING ASC time
+GROUP BY stocksymbol
\ No newline at end of file
diff --git a/benchmark/quries/Aquery/q8.a b/benchmark/quries/Aquery/q8.a
new file mode 100644
index 0000000..6642520
--- /dev/null
+++ b/benchmark/quries/Aquery/q8.a
@@ -0,0 +1,6 @@
+
+CREATE TABLE res8 AS
+SELECT stocksymbol, quantity, price
+FROM trade10m
+WHERE time >= 5288 and time <= 7000
+
\ No newline at end of file
diff --git a/benchmark/quries/Aquery/q9.a b/benchmark/quries/Aquery/q9.a
new file mode 100644
index 0000000..7348b8e
--- /dev/null
+++ b/benchmark/quries/Aquery/q9.a
@@ -0,0 +1,6 @@
+
+CREATE TABLE res9 AS
+SELECT stocksymbol, MAX(price) - MIN(price)
+FROM trade10m
+GROUP BY stocksymbol
+
\ No newline at end of file
diff --git a/benchmark/quries/Clickhouse/q0 b/benchmark/quries/Clickhouse/q0
new file mode 100644
index 0000000..e06e534
--- /dev/null
+++ b/benchmark/quries/Clickhouse/q0
@@ -0,0 +1,3 @@
+-- q0 select rows
+CREATE TABLE res0 (a String, b Int32, c Int32, d Int32) ENGINE = MergeTree() ORDER BY b AS
+SELECT * FROM benchmark.trade10m
\ No newline at end of file
diff --git a/benchmark/quries/Clickhouse/q1 b/benchmark/quries/Clickhouse/q1
new file mode 100644
index 0000000..21ef83b
--- /dev/null
+++ b/benchmark/quries/Clickhouse/q1
@@ -0,0 +1,4 @@
+-- groupby_multi_different_functions
+SELECT avg(quantity), min(price)
+FROM benchmark.trade10m
+GROUP BY stocksymbol, time
\ No newline at end of file
diff --git a/benchmark/quries/Clickhouse/q10 b/benchmark/quries/Clickhouse/q10
new file mode 100644
index 0000000..c251cb6
--- /dev/null
+++ b/benchmark/quries/Clickhouse/q10
@@ -0,0 +1,8 @@
+-- max rolling std
+select
+    stocksymbol,
+    max(stddevPop(price)) over
+        (partition by stocksymbol rows between 2 preceding AND CURRENT row) as maxRollingStd
+from
+(SELECT * FROM benchmark.trade01m ORDER BY time)
+GROUP BY stocksymbol
\ No newline at end of file
diff --git a/benchmark/quries/Clickhouse/q2 b/benchmark/quries/Clickhouse/q2
new file mode 100644
index 0000000..1267934
--- /dev/null
+++ b/benchmark/quries/Clickhouse/q2
@@ -0,0 +1,2 @@
+-- count values
+SELECT COUNT(*) FROM benchmark.trade10m
\ No newline at end of file
diff --git a/benchmark/quries/Clickhouse/q3 b/benchmark/quries/Clickhouse/q3
new file mode 100644
index 0000000..79ea85e
--- /dev/null
+++ b/benchmark/quries/Clickhouse/q3
@@ -0,0 +1,4 @@
+-- group by multiple keys
+SELECT sum(quantity)
+FROM benchmark.trade10m
+GROUP BY stocksymbol, price
\ No newline at end of file
diff --git a/benchmark/quries/Clickhouse/q4 b/benchmark/quries/Clickhouse/q4
new file mode 100644
index 0000000..016f3fc
--- /dev/null
+++ b/benchmark/quries/Clickhouse/q4
@@ -0,0 +1,2 @@
+-- append two tables
+SELECT * FROM benchmark.trade10m UNION ALL SELECT * FROM benchmark.trade10m
\ No newline at end of file
diff --git a/benchmark/quries/Clickhouse/q7 b/benchmark/quries/Clickhouse/q7
new file mode 100644
index 0000000..ed57058
--- /dev/null
+++ b/benchmark/quries/Clickhouse/q7
@@ -0,0 +1,5 @@
+-- moving_avg
+SELECT stocksymbol, groupArrayMovingAvg(5)(price) AS moving_avg_price
+FROM
+(SELECT * FROM benchmark.trade01m ORDER BY time)
+GROUP BY stocksymbol
\ No newline at end of file
diff --git a/benchmark/quries/Clickhouse/q8 b/benchmark/quries/Clickhouse/q8
new file mode 100644
index 0000000..550abbd
--- /dev/null
+++ b/benchmark/quries/Clickhouse/q8
@@ -0,0 +1,3 @@
+SELECT stocksymbol, quantity, price
+FROM benchmark.trade10m
+WHERE time >= 5288 and time <= 7000
\ No newline at end of file
diff --git a/benchmark/quries/Clickhouse/q9 b/benchmark/quries/Clickhouse/q9
new file mode 100644
index 0000000..48312c9
--- /dev/null
+++ b/benchmark/quries/Clickhouse/q9
@@ -0,0 +1,3 @@
+SELECT stocksymbol, MAX(price) - MIN(price)
+FROM benchmark.trade1m
+GROUP BY stocksymbol
\ No newline at end of file
diff --git a/benchmark/quries/Timescaledb/q0 b/benchmark/quries/Timescaledb/q0
new file mode 100644
index 0000000..b6dec8f
--- /dev/null
+++ b/benchmark/quries/Timescaledb/q0
@@ -0,0 +1,3 @@
+-- select rows
+CREATE TABLE res0 AS
+SELECT * FROM trade10m;
\ No newline at end of file
diff --git a/benchmark/quries/Timescaledb/q1 b/benchmark/quries/Timescaledb/q1
new file mode 100644
index 0000000..0ac4c46
--- /dev/null
+++ b/benchmark/quries/Timescaledb/q1
@@ -0,0 +1,4 @@
+-- groupby_multi_different_functions
+SELECT avg(quantity), min(price)
+FROM trade10m
+GROUP BY stocksymbol, time;
\ No newline at end of file
diff --git a/benchmark/quries/Timescaledb/q10 b/benchmark/quries/Timescaledb/q10
new file mode 100644
index 0000000..6d4b326
--- /dev/null
+++ b/benchmark/quries/Timescaledb/q10
@@ -0,0 +1,7 @@
+select
+    stocksymbol,
+    max(stddev(price)) over
+        (partition by stocksymbol rows between 2 preceding AND CURRENT row) as maxRollingStd
+from
+(SELECT * FROM trade01m ORDER BY time) as t
+GROUP BY stocksymbol;
\ No newline at end of file
diff --git a/benchmark/quries/Timescaledb/q2 b/benchmark/quries/Timescaledb/q2
new file mode 100644
index 0000000..b1f00f6
--- /dev/null
+++ b/benchmark/quries/Timescaledb/q2
@@ -0,0 +1,2 @@
+-- count values
+SELECT COUNT(*) FROM trade10m;
\ No newline at end of file
diff --git a/benchmark/quries/Timescaledb/q3 b/benchmark/quries/Timescaledb/q3
new file mode 100644
index 0000000..0176182
--- /dev/null
+++ b/benchmark/quries/Timescaledb/q3
@@ -0,0 +1,4 @@
+-- group by multiple keys
+SELECT sum(quantity)
+FROM trade10m
+GROUP BY stocksymbol, price;
\ No newline at end of file
diff --git a/benchmark/quries/Timescaledb/q4 b/benchmark/quries/Timescaledb/q4
new file mode 100644
index 0000000..a3e7f14
--- /dev/null
+++ b/benchmark/quries/Timescaledb/q4
@@ -0,0 +1,2 @@
+-- append tables
+SELECT * FROM trade10m UNION ALL SELECT * FROM trade10m;
\ No newline at end of file
diff --git a/benchmark/quries/Timescaledb/q7 b/benchmark/quries/Timescaledb/q7
new file mode 100644
index 0000000..c0aa976
--- /dev/null
+++ b/benchmark/quries/Timescaledb/q7
@@ -0,0 +1,5 @@
+select
+    stocksymbol,
+    coalesce(avg(price) over
+        (partition by stocksymbol order by time rows between 4 preceding AND CURRENT row), price) as rollingAvg
+from trade10m;
\ No newline at end of file
diff --git a/benchmark/quries/Timescaledb/q8 b/benchmark/quries/Timescaledb/q8
new file mode 100644
index 0000000..db6be13
--- /dev/null
+++ b/benchmark/quries/Timescaledb/q8
@@ -0,0 +1,3 @@
+SELECT stocksymbol, quantity, price
+FROM trade01m
+WHERE time >= 5288 and time <= 7000
\ No newline at end of file
diff --git a/benchmark/quries/Timescaledb/q9 b/benchmark/quries/Timescaledb/q9
new file mode 100644
index 0000000..e8c0b92
--- /dev/null
+++ b/benchmark/quries/Timescaledb/q9
@@ -0,0 +1,3 @@
+SELECT stocksymbol, MAX(price) - MIN(price)
+FROM trade01m
+GROUP BY stocksymbol;
\ No newline at end of file
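
Note on the windowed queries above: q7 and q10 are the only order-sensitive queries in the set. AQuery states the ordering with ASSUMING ASC time and a moving aggregate (avgs, stddevs); ClickHouse and TimescaleDB sort a subquery and use groupArrayMovingAvg / a window frame instead. The semantics all three benchmark is a per-symbol w-row trailing average, partial at the front of the window. A minimal sketch of exactly that, using plain std::vector as a stand-in for the engines' column types (the function name moving_avg is illustrative only, not an API of any of the three systems):

// 5-row trailing average, i.e. "rows between 4 preceding and current row".
#include <cstdint>
#include <vector>

std::vector<double> moving_avg(uint32_t w, const std::vector<int>& price) {
    std::vector<double> out(price.size());
    double s = 0;                        // running sum of the current window
    for (uint32_t i = 0; i < price.size(); ++i) {
        s += price[i];
        if (i >= w) s -= price[i - w];   // drop the element that left the window
        out[i] = s / (i < w ? i + 1 : w); // partial average until the window fills
    }
    return out;
}

Calling moving_avg(5, prices) on one symbol's time-ordered prices reproduces what q7 asks each system to compute for every group.
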
From 4942dc1f5044bdd79b72d554f1b04cb0c6587762 Mon Sep 17 00:00:00 2001
From: Bill
Date: Tue, 6 Dec 2022 11:34:39 +0800
Subject: [PATCH 2/7] fixes on stored proc

---
 Makefile          |  4 ++--
 aquery_config.py  |  2 +-
 build.py          |  4 ++--
 server/server.cpp | 26 ++++++++++++++++++--------
 4 files changed, 23 insertions(+), 13 deletions(-)

diff --git a/Makefile b/Makefile
index c438529..4240bf6 100644
--- a/Makefile
+++ b/Makefile
@@ -4,10 +4,10 @@ MonetDB_INC =
 Defines =
 CXXFLAGS = --std=c++2a
 ifeq ($(AQ_DEBUG), 1)
-	OPTFLAGS = -g3 -fsanitize=address -fsanitize=leak
+	OPTFLAGS = -g3 #-fsanitize=address
 	LINKFLAGS =
 else
-	OPTFLAGS = -O3 -DNDEBUG -fno-stack-protector
+	OPTFLAGS = -Ofast -DNDEBUG -fno-stack-protector
 	LINKFLAGS = -flto -s
 endif
 SHAREDFLAGS = -shared
diff --git a/aquery_config.py b/aquery_config.py
index 094bc47..df2511a 100644
--- a/aquery_config.py
+++ b/aquery_config.py
@@ -2,7 +2,7 @@
 
 ## GLOBAL CONFIGURATION FLAGS
 
-version_string = '0.5.4a'
+version_string = '0.6.0a'
 add_path_to_ldpath = True
 rebuild_backend = False
 run_backend = True
diff --git a/build.py b/build.py
index ec59122..e8c5255 100644
--- a/build.py
+++ b/build.py
@@ -117,7 +117,7 @@ class build_manager:
             else:
                 mgr.cxx = os.environ['CXX']
             if 'AQ_DEBUG' not in os.environ:
-                os.environ['AQ_DEBUG'] = '0' if mgr.OptimizationLv else '1'
+                os.environ['AQ_DEBUG'] = ('0' if mgr.OptimizationLv != '0' else '1')
 
     def libaquery_a(self):
         self.build_cmd = [['rm', 'libaquery.a'],['make', 'libaquery']]
@@ -184,7 +184,7 @@ class build_manager:
     def __init__(self) -> None:
         self.method = 'make'
         self.cxx = ''
-        self.OptimizationLv = '0' # [O0, O1, O2, O3, Ofast]
+        self.OptimizationLv = '4' # [O0, O1, O2, O3, Ofast]
         self.Platform = 'amd64'
         self.PCH = os.environ['PCH'] if 'PCH' in os.environ else 1
         self.StaticLib = 1
diff --git a/server/server.cpp b/server/server.cpp
index 64f6544..f2e8c77 100644
--- a/server/server.cpp
+++ b/server/server.cpp
@@ -503,6 +503,7 @@ start:
 	};
 	const auto& load_proc_fromfile = [&](StoredProcedure& p) {
 		auto config_name = procedure_root + p.name + ".aqp";
+		puts(p.name);
 		auto fp = fopen(config_name.c_str(), "rb");
 		if(fp == nullptr){
 			puts("ERROR: Procedure not found on disk.");
@@ -517,12 +518,14 @@ start:
 		p.queries = static_cast<char**>(malloc(sizeof(char*) * p.cnt));
 		p.queries[0] = static_cast<char*>(malloc(sizeof(char) * queries_size));
-		fread(&p.queries[0], queries_size, 1, fp);
+		fread(p.queries[0], 1, queries_size, fp);
 		for(uint32_t j = 1; j < p.cnt; ++j){
 			p.queries[j] = p.queries[j-1];
-			while(*p.queries[j] != '\0')
+			while(*(p.queries[j]) != '\0')
 				++p.queries[j];
+			++p.queries[j];
+			puts(p.queries[j-1]);
 		}
 		fclose(fp);
 		return load_modules(p);
@@ -553,18 +556,22 @@ start:
 			auto _proc = cxt->stored_proc.find(proc_name);
 			if (_proc == cxt->stored_proc.end()){
 				printf("Procedure %s not found. Trying load from disk.\n", proc_name);
-				if (load_proc_fromfile(current_procedure)){
+				current_procedure.name = copy_lpstr(proc_name);
+				if (!load_proc_fromfile(current_procedure)){
 					cxt->stored_proc.insert_or_assign(proc_name, current_procedure);
 				}
+				else {
+					continue;
+				}
 			}
 			else{
 				current_procedure = _proc->second;
-				n_recv = current_procedure.cnt;
-				n_recvd = current_procedure.queries;
-				load_modules(current_procedure);
-				procedure_replaying = true;
-				goto start; // yes, I know, refactor later!!
 			}
+			n_recv = current_procedure.cnt;
+			n_recvd = current_procedure.queries;
+			load_modules(current_procedure);
+			procedure_replaying = true;
+			goto start; // yes, I know, refactor later!!
 		}
 		break;
 		case 'D': // delete procedure
@@ -572,6 +579,9 @@ start:
 			break;
 		case 'S': //save procedure
 			break;
 		case 'L': //load procedure
+			if (!load_proc_fromfile(current_procedure)) {
+				cxt->stored_proc.insert_or_assign(proc_name, current_procedure);
+			}
 			break;
 		case 'd': // display all procedures
 			for(const auto& p : cxt->stored_proc){
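
The server.cpp hunk above fixes two bugs in how a saved .aqp procedure is deserialized. The file stores p.cnt queries back-to-back as NUL-terminated strings in one blob: the old fread(&p.queries[0], ...) clobbered the pointer array itself instead of filling the allocated text buffer, and the old loop stopped *on* each '\0' rather than stepping past it, so every subsequent pointer aliased the tail of the previous query. A minimal sketch of the corrected pointer fix-up over that layout (standalone, with std::vector standing in for the malloc'd pointer array):

// Split a buffer holding `cnt` NUL-terminated strings into per-query pointers.
#include <cstdint>
#include <cstring>
#include <vector>

std::vector<const char*> split_queries(const char* buf, uint32_t cnt) {
    std::vector<const char*> q(cnt);
    const char* p = buf;
    for (uint32_t j = 0; j < cnt; ++j) {
        q[j] = p;                 // j-th query starts here
        p += std::strlen(p) + 1;  // skip the text *and* its '\0' terminator
    }
    return q;
}

Dropping the "+ 1" reproduces the original bug: q[1] would point at the terminator of q[0], and every later query would come out empty or shifted.
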
From eebf507c6a9390ebaa74aefc8bb5332c67346a7a Mon Sep 17 00:00:00 2001
From: Bill
Date: Fri, 23 Dec 2022 15:04:39 +0800
Subject: [PATCH 3/7] WIP: group by optimizations

---
 README.md                | 65 +++++++++++++++++-----------------
 server/aggregations.h    | 15 ++++++++++
 server/hasher.h          |  4 +++
 server/monetdb_conn.cpp  |  7 ++++-
 server/server.cpp        | 27 ++++++++++++-----
 server/table.h           |  1 +
 server/unordered_dense.h | 12 ++++++++
 7 files changed, 86 insertions(+), 45 deletions(-)

diff --git a/README.md b/README.md
index ef96a71..3624a73 100644
--- a/README.md
+++ b/README.md
@@ -1,31 +1,20 @@
-
 # AQuery++ Database
-
-### Please try the latest code in dev branch if you encounter any problem. Use `git checkout dev` to switch branches.
-
 ## Introduction
 AQuery++ Database is a cross-platform, In-Memory Column-Store Database that incorporates compiled query execution. (**Note**: If you encounter any problems, feel free to contact me via ys3540@nyu.edu)
+# Architecture
+![Architecture](./docs/arch-hybrid.svg)
-## Docker (Recommended):
- - See installation instructions from [docker.com](https://www.docker.com). Run **docker desktop** to start docker engine.
- - In AQuery root directory, type `make docker` to build the docker image from scratch.
- - For Arm-based Mac users, you would have to build and run the **x86_64** docker image because MonetDB doesn't offer official binaries for arm64 Linux. (Run `docker buildx build --platform=linux/amd64 -t aquery .` instead of `make docker`)
- - Finally run the image in **interactive** mode (`docker run --name aquery -it aquery`)
- - When you need to access the container again run `docker start -ai aquery`
- - If there is a need to access the system shell within AQuery, type `dbg` to activate python interpreter and type `os.system('sh')` to launch a shell.
- - Docker image is available on [Docker Hub](https://hub.docker.com/repository/docker/sunyinqi0508/aquery) but building image yourself is highly recommended (see [#2](../../issues/2))
-## CIMS Computer Lab (Only for NYU affiliates who have access)
- 1. Clone this git repo in CIMS.
- 2. Download the [patch](https://drive.google.com/file/d/1YkykhM6u0acZ-btQb4EUn4jAEXPT81cN/view?usp=sharing)
- 3. Decompress the patch to any directory and execute script inside by typing (`source ./cims.sh`). Please use the source command or `. ./cims.sh` (dot space) to execute the script because it contains configurations for environment variables. Also note that this script can only work with bash and compatible shells (e.g. dash, zsh. but not csh)
- 4. Execute `python3 ./prompt.py`
+## AQuery Compiler
+- The query is first processed by the AQuery Compiler which is composed of a frontend that parses the query into AST and a backend that generates target code that delivers the query.
+- Front end of AQuery++ Compiler is built on top of [mo-sql-parsing](https://github.com/klahnakoski/mo-sql-parsing) with modifications to handle AQuery dialect and extension.
+- Backend of AQuery++ Compiler generates target code dependent on the Execution Engine. It can either be the C++ code for AQuery Execution Engine or sql and C++ post-processor for Hybrid Engine or k9 for the k9 Engine.
+## Execution Engines
+- AQuery++ supports different execution engines thanks to the decoupled compiler structure.
+- Hybrid Execution Engine: decouples the query into two parts. The sql-compliant part is executed by an Embedded version of Monetdb and everything else is executed by a post-process module which is generated by AQuery++ Compiler in C++ and then compiled and executed.
+- AQuery Library: A set of header based libraries that provide column arithmetic and operations inspired by array programming languages like kdb. This library is used by C++ post-processor code which can significantly reduce the complexity of generated code, reducing compile time while maintaining the best performance. The set of libraries can also be used by UDFs as well as User modules which makes it easier for users to write simple, efficient yet powerful extensions.
-## Singularity Container
- 1. build container `singularity build aquery.sif aquery.def`
- 2. execute container `singularity exec aquery.sif sh`
- 3. run AQuery `python3 ./prompt.py`
-# Native Installation:
+# Installation:
 ## Requirements
 1. Recent version of Linux, Windows or MacOS, with recent C++ compiler that has C++17 (1z) support. (however c++20 is recommended if available for heterogeneous lookup on unordered containers)
    - GCC: 9.0 or above (g++ 7.x, 8.x fail to handle fold-expressions due to a compiler bug)
    - Clang: 5.0 or above
 2. Monetdb for Hybrid Engine
    - On MacOS, Monetdb can be easily installed in homebrew `brew install monetdb`.
@@ -38,9 +27,5 @@
 3. Python 3.6 or above and install required packages in requirements.txt by `python3 -m pip install -r requirements.txt`
-
-## Installation
-AQuery is tested on mainstream operating systems such as Windows, macOS and Linux
-
 ### Windows
 There're multiple options to run AQuery on Windows. But for better consistency I recommend using a simulated Linux environment such as **Windows Subsystem for Linux** (1 or 2), **Docker** or **Linux Virtual Machines**. You can also use the native toolchain from Microsoft Visual Studio or gcc from Winlabs/Cygwin/MinGW.
@@ -97,7 +82,24 @@
 In this case, upgrade anaconda or your compiler or use the python from your OS or package manager instead. Or (**NOT recommended**) copy/link the library from your system (e.g. /usr/lib/x86_64-linux-gnu/libstdc++.so.6) to anaconda's library directory (e.g. ~/Anaconda3/lib/).
 
+## Docker (Recommended):
+ - See installation instructions from [docker.com](https://www.docker.com). Run **docker desktop** to start docker engine.
+ - In AQuery root directory, type `make docker` to build the docker image from scratch.
+ - For Arm-based Mac users, you would have to build and run the **x86_64** docker image because MonetDB doesn't offer official binaries for arm64 Linux. (Run `docker buildx build --platform=linux/amd64 -t aquery .` instead of `make docker`)
+ - Finally run the image in **interactive** mode (`docker run --name aquery -it aquery`)
+ - When you need to access the container again run `docker start -ai aquery`
+ - If there is a need to access the system shell within AQuery, type `dbg` to activate python interpreter and type `os.system('sh')` to launch a shell.
+ - Docker image is available on [Docker Hub](https://hub.docker.com/repository/docker/sunyinqi0508/aquery) but building image yourself is highly recommended (see [#2](../../issues/2))
+## CIMS Computer Lab (Only for NYU affiliates who have access)
+ 1. Clone this git repo in CIMS.
+ 2. Download the [patch](https://drive.google.com/file/d/1YkykhM6u0acZ-btQb4EUn4jAEXPT81cN/view?usp=sharing)
+ 3. Decompress the patch to any directory and execute script inside by typing (`source ./cims.sh`). Please use the source command or `. ./cims.sh` (dot space) to execute the script because it contains configurations for environment variables. Also note that this script can only work with bash and compatible shells (e.g. dash, zsh. but not csh)
+ 4. Execute `python3 ./prompt.py`
+## Singularity Container
+ 1. build container `singularity build aquery.sif aquery.def`
+ 2. execute container `singularity exec aquery.sif sh`
+ 3. run AQuery `python3 ./prompt.py`
 # Usage
 `python3 prompt.py` will launch the interactive command prompt. The server binary will be automatically rebuilt and started.
 ### Commands:
@@ -268,17 +270,6 @@ SELECT * FROM my_table WHERE c1 > 10
 - `sqrt(x), trunc(x), and other builtin math functions`: value-wise math operations. `sqrt(x)[i] = sqrt(x[i])`
 - `pack(cols, ...)`: pack multiple columns with exact same type into a single column.
 
-# Architecture
-![Architecture](./docs/arch-hybrid.svg)
-
-## AQuery Compiler
-- The query is first processed by the AQuery Compiler which is composed of a frontend that parses the query into AST and a backend that generates target code that delivers the query.
-- Front end of AQuery++ Compiler is built on top of [mo-sql-parsing](https://github.com/klahnakoski/mo-sql-parsing) with modifications to handle AQuery dialect and extension.
-- Backend of AQuery++ Compiler generates target code dependent on the Execution Engine. It can either be the C++ code for AQuery Execution Engine or sql and C++ post-processor for Hybrid Engine or k9 for the k9 Engine.
-## Execution Engines
-- AQuery++ supports different execution engines thanks to the decoupled compiler structure.
-- Hybrid Execution Engine: decouples the query into two parts. The sql-compliant part is executed by an Embedded version of Monetdb and everything else is executed by a post-process module which is generated by AQuery++ Compiler in C++ and then compiled and executed.
-- AQuery Library: A set of header based libraries that provide column arithmetic and operations inspired by array programming languages like kdb. This library is used by C++ post-processor code which can significantly reduce the complexity of generated code, reducing compile time while maintaining the best performance. The set of libraries can also be used by UDFs as well as User modules which makes it easier for users to write simple but powerful extensions.
 
 # Roadmap
 - [x] SQL Parser -> AQuery Parser (Front End)
diff --git a/server/aggregations.h b/server/aggregations.h
index 0f1d8f8..bb8ca0e 100644
--- a/server/aggregations.h
+++ b/server/aggregations.h
@@ -186,6 +186,21 @@ decayed_t<VT, types::GetLongType<T>> sumw(uint32_t w, const VT<T>& arr) {
 	return ret;
 }
 
+template<class T, template<typename ...> class VT>
+void avgw(uint32_t w, const VT<T>& arr,
+	decayed_t<VT, types::GetFPType<types::GetLongType<T>>>& ret) {
+	typedef types::GetFPType<types::GetLongType<T>> FPType;
+	const uint32_t& len = arr.size;
+	uint32_t i = 0;
+	types::GetLongType<T> s{};
+	w = w > len ? len : w;
+	if (len) s = ret[i++] = arr[0];
+	for (; i < w; ++i)
+		ret[i] = (s += arr[i]) / (FPType)(i + 1);
+	for (; i < len; ++i)
+		ret[i] = ret[i - 1] + (arr[i] - arr[i - w]) / (FPType)w;
+}
+
 template<class T, template<typename ...> class VT>
 decayed_t<VT, types::GetFPType<types::GetLongType<T>>> avgw(uint32_t w, const VT<T>& arr) {
 	typedef types::GetFPType<types::GetLongType<T>> FPType;
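
The new avgw overload above differs from the existing one only in that it writes into a caller-provided destination instead of allocating its own result column — that is what later lets the group-by code point each group at a slice of one preallocated buffer. The algorithm itself is the standard O(len) rolling mean: prefix averages while the window warms up, then ret[i] = ret[i-1] + (arr[i] - arr[i-w]) / w. A self-contained sketch of the same recurrence with std::vector standing in for the column types:

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    std::vector<int> arr{4, 8, 6, 2, 10};
    const uint32_t w = 3;
    std::vector<double> ret(arr.size());
    double s = 0;
    uint32_t i = 0;
    for (; i < w && i < arr.size(); ++i)   // warm-up: prefix averages
        ret[i] = (s += arr[i]) / double(i + 1);
    for (; i < arr.size(); ++i)            // steady state: slide the window
        ret[i] = ret[i - 1] + (arr[i] - arr[i - w]) / double(w);
    for (double v : ret) std::printf("%g ", v);  // prints: 4 6 6 5.33333 6
}

Each steady-state step does one add and one subtract regardless of w, versus the O(w) rescan a naive window sum would need.
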
diff --git a/server/hasher.h b/server/hasher.h
index 22a98e2..0675f96 100644
--- a/server/hasher.h
+++ b/server/hasher.h
@@ -132,3 +132,7 @@ namespace ankerl::unordered_dense{
 	struct hash> : public hasher{ };
 }
+struct aq_hashtable_value_t{
+	uint32_t id;
+	uint32_t cnt;
+};
\ No newline at end of file
diff --git a/server/monetdb_conn.cpp b/server/monetdb_conn.cpp
index a7827ae..c577c8b 100644
--- a/server/monetdb_conn.cpp
+++ b/server/monetdb_conn.cpp
@@ -6,6 +6,8 @@
 #include "monetdb_conn.h"
 #include "monetdbe.h"
 #include "table.h"
+#include <thread>
+
 #undef ERROR
 #undef static_assert
@@ -86,7 +88,10 @@ void Server::connect(Context *cxt){
 	}
 
 	server = (monetdbe_database*)malloc(sizeof(monetdbe_database));
-	auto ret = monetdbe_open(server, nullptr, nullptr);
+	monetdbe_options ops;
+	AQ_ZeroMemory(ops);
+	ops.nr_threads = std::thread::hardware_concurrency();
+	auto ret = monetdbe_open(server, nullptr, &ops);
 	if (ret == 0){
 		status = true;
 		this->server = server;
diff --git a/server/server.cpp b/server/server.cpp
index f2e8c77..0176e5b 100644
--- a/server/server.cpp
+++ b/server/server.cpp
@@ -191,6 +191,21 @@ constexpr prt_fn_t monetdbe_prtfns[] = {
 	aq_to_chars
 };
 
+#ifndef __AQ_USE_THREADEDGC__
+void aq_init_gc(void *handle, Context* cxt)
+{
+	typedef void (*aq_gc_init_t) (Context*);
+	if (handle && cxt){
+		auto sym = dlsym(handle, "__AQ_Init_GC__");
+		if(sym){
+			((aq_gc_init_t)sym)(cxt);
+		}
+	}
+}
+#else //__AQ_USE_THREADEDGC__
+#define aq_init_gc(h, c)
+#endif //__AQ_USE_THREADEDGC__
+
 #include "monetdbe.h"
 #undef max
 #undef min
@@ -363,12 +378,7 @@ start:
 				recorded_queries.emplace_back(copy_lpstr("N"));
 			}
 			handle = dlopen(proc_name, RTLD_NOW);
-#ifndef __AQ_USE_THREADEDGC__
-			{
-				typedef void (*aq_gc_init_t) (Context*);
-				((aq_gc_init_t)dlsym(handle, "__AQ_Init_GC__"))(cxt);
-			}
-#endif
+			aq_init_gc(handle, cxt);
 			if (procedure_recording) {
 				recorded_libraries.emplace_back(handle);
 			}
@@ -474,11 +484,13 @@ start:
 				p.__rt_loaded_modules = static_cast<void**>(
 					malloc(sizeof(void*) * p.postproc_modules));
 				for(uint32_t j = 0; j < p.postproc_modules; ++j){
-					auto pj = dlopen(p.name, RTLD_NOW);
+					auto pj = dlopen((procedure_root + p.name + std::to_string(j) + ".so").c_str(), RTLD_NOW);
 					if (pj == nullptr){
 						printf("Error: failed to load module %s\n", p.name);
 						return true;
 					}
+					aq_init_gc(pj, cxt);
+
 					p.__rt_loaded_modules[j] = pj;
 				}
 			}
@@ -528,6 +540,7 @@ start:
 			puts(p.queries[j-1]);
 		}
 		fclose(fp);
+		p.__rt_loaded_modules = 0;
 		return load_modules(p);
 	};
 	switch(n_recvd[i][1]){
diff --git a/server/table.h b/server/table.h
index 3a33136..9de4487 100644
--- a/server/table.h
+++ b/server/table.h
@@ -289,6 +289,7 @@ public:
 		uint32_t len = end - start;
 		return ColView<_Ty>(orig, idxs.subvec(start, end));
 	}
+
 	ColRef<_Ty> subvec_deep(uint32_t start, uint32_t end) const {
 		uint32_t len = end - start;
 		ColRef<_Ty> subvec(len);
diff --git a/server/unordered_dense.h b/server/unordered_dense.h
index 737d12b..d81a134 100644
--- a/server/unordered_dense.h
+++ b/server/unordered_dense.h
@@ -1059,6 +1059,18 @@ public:
 		return do_insert_or_assign(std::move(key), std::forward<M>(mapped)).first;
 	}
 
+	template <typename K, typename M>
+	auto hashtable_push(K&& key, M& mapped) {
+		++ mapped.id;
+		++ mapped.cnt;
+		auto it_isinserted = try_emplace(std::forward<K>(key), std::forward<M>(mapped));
+		if (!it_isinserted.second) {
+			--mapped.cnt;
+			return it_isinserted.first->second.id;
+		}
+		return mapped.id;
+	}
+
 	template <
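
The aq_hashtable_value_t / hashtable_push pair added here replaces the old "map each key to a vector of row ids" grouping with single-pass group numbering: the first row carrying a key claims an id, later rows with the same key get that id back, and cnt is rolled back on duplicates so it tracks the number of distinct keys. A compressed illustration of the underlying idea — not the exact implementation, with std::unordered_map standing in for the ankerl map and a plain size-based id instead of the threaded {id, cnt} counter:

#include <cstdint>
#include <string>
#include <unordered_map>
#include <vector>

int main() {
    std::unordered_map<std::string, uint32_t> group_of;
    std::vector<uint32_t> gids;   // group id for every input row
    for (const char* key : {"msft", "appl", "msft", "msft", "appl"}) {
        // first sighting of a key mints the next dense id; repeats reuse it
        auto [it, fresh] = group_of.try_emplace(key, (uint32_t)group_of.size());
        gids.push_back(it->second);
    }
    // gids == {0, 1, 0, 0, 1}: every row knows its group, no sort, no per-key vector
    return 0;
}

Compared to the old scheme, no per-group vector_type is grown during the scan; the per-group row lists can be rebuilt afterwards from gids (which is what the next patch does).
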
Date: Mon, 9 Jan 2023 22:59:41 +0800
Subject: [PATCH 4/7] group by optimization

---
 engine/types.py          |   6 +--
 proctool.py              |   2 +-
 reconstruct/ast.py       | 105 +++++++++++++++++++++++++++------------
 server/hasher.h          |   4 ---
 server/server.cpp        |   1 +
 server/unordered_dense.h |  64 ++++++++++++------------
 server/vector_type.hpp   |  57 +++++++++++++++++++++
 7 files changed, 168 insertions(+), 71 deletions(-)

diff --git a/engine/types.py b/engine/types.py
index 5baf47f..31c5b37 100644
--- a/engine/types.py
+++ b/engine/types.py
@@ -107,9 +107,9 @@ ULongT = Types(8, name = 'uint64', sqlname = 'UINT64', fp_type=DoubleT)
 UIntT = Types(7, name = 'uint32', sqlname = 'UINT32', long_type=ULongT, fp_type=FloatT)
 UShortT = Types(6, name = 'uint16', sqlname = 'UINT16', long_type=ULongT, fp_type=FloatT)
 UByteT = Types(5, name = 'uint8', sqlname = 'UINT8', long_type=ULongT, fp_type=FloatT)
-StrT = Types(200, name = 'str', cname = 'const char*', sqlname='TEXT', ctype_name = 'types::ASTR')
-TextT = Types(200, name = 'text', cname = 'const char*', sqlname='TEXT', ctype_name = 'types::ASTR')
-VarcharT = Types(200, name = 'varchar', cname = 'const char*', sqlname='VARCHAR', ctype_name = 'types::ASTR')
+StrT = Types(200, name = 'str', cname = 'string_view', sqlname='TEXT', ctype_name = 'types::ASTR')
+TextT = Types(200, name = 'text', cname = 'string_view', sqlname='TEXT', ctype_name = 'types::ASTR')
+VarcharT = Types(200, name = 'varchar', cname = 'string_view', sqlname='VARCHAR', ctype_name = 'types::ASTR')
 VoidT = Types(200, name = 'void', cname = 'void', sqlname='Null', ctype_name = 'types::None')
 
 class VectorT(Types):
diff --git a/proctool.py b/proctool.py
index 81035bf..1ff726c 100644
--- a/proctool.py
+++ b/proctool.py
@@ -2,7 +2,7 @@ import struct
 import readline
 from typing import List
 
-name : str = input()
+name : str = input('Filename (in path ./procedures/.aqp):')
 
 def write():
     s : str = input()
diff --git a/reconstruct/ast.py b/reconstruct/ast.py
index e9348ac..37c5e52 100644
--- a/reconstruct/ast.py
+++ b/reconstruct/ast.py
@@ -339,8 +339,8 @@ class projection(ast_node):
                 return ', '.join([self.pyname2cname[n.name] for n in lst_names])
             else:
                 return self.pyname2cname[proj_name]
-
-        for key, val in proj_map.items():
+        gb_tovec = [False] * len(proj_map)
+        for i, (key, val) in enumerate(proj_map.items()):
             if type(val[1]) is str:
                 x = True
                 y = get_proj_name
@@ -357,22 +357,27 @@ class projection(ast_node):
                     out_typenames[key] = decltypestring
                 else:
                     out_typenames[key] = val[0].cname
-                if (type(val[2].udf_called) is udf and # should bulkret also be colref?
+                elemental_ret_udf = (
+                    type(val[2].udf_called) is udf and # should bulkret also be colref?
                     val[2].udf_called.return_pattern == udf.ReturnPattern.elemental_return
-                    or
-                    self.group_node and
-                    (self.group_node.use_sp_gb and
+                )
+                folding_vector_groups = (
+                    self.group_node and
+                    (
+                        self.group_node.use_sp_gb and
                         val[2].cols_mentioned.intersection(
                             self.datasource.all_cols().difference(
                                 self.datasource.get_joint_cols(self.group_node.refs)
-                        ))
-                    ) and val[2].is_compound # compound val not in key
-                    # or
-                    # val[2].is_compound > 1
-                    # (not self.group_node and val[2].is_compound)
-                ):
-                    out_typenames[key] = f'vector_type<{out_typenames[key]}>'
-                    self.out_table.columns[key].compound = True
+                            )
+                        )
+                    ) and
+                    val[2].is_compound # compound val not in key
+                )
+                if (elemental_ret_udf or folding_vector_groups):
+                    out_typenames[key] = f'vector_type<{out_typenames[key]}>'
+                    self.out_table.columns[key].compound = True
+                    if self.group_node is not None and self.group_node.use_sp_gb:
+                        gb_tovec[i] = True
         outtable_col_nameslist = ', '.join([f'"{c.name}"' for c in self.out_table.columns])
         self.outtable_col_names = 'names_' + base62uuid(4)
         self.context.emitc(f'const char* {self.outtable_col_names}[] = {{{outtable_col_nameslist}}};')
@@ -384,12 +389,14 @@ class projection(ast_node):
             gb_vartable : Dict[str, Union[str, int]] = deepcopy(self.pyname2cname)
             gb_cexprs : List[str] = []
             gb_colnames : List[str] = []
+            gb_types : List[Types] = []
             for key, val in proj_map.items():
                 col_name = 'col_' + base62uuid(6)
                 self.context.emitc(f'decltype(auto) {col_name} = {self.out_table.contextname_cpp}->get_col<{key}>();')
                 gb_cexprs.append((col_name, val[2]))
                 gb_colnames.append(col_name)
-            self.group_node.finalize(gb_cexprs, gb_vartable, gb_colnames)
+                gb_types.append(val[0])
+            self.group_node.finalize(gb_cexprs, gb_vartable, gb_colnames, gb_types, gb_tovec)
         else:
             for i, (key, val) in enumerate(proj_map.items()):
                 if type(val[1]) is int:
@@ -533,6 +540,7 @@ class groupby_c(ast_node):
     def init(self, node : List[Tuple[expr, Set[ColRef]]]):
         self.proj : projection = self.parent
        self.glist : List[Tuple[expr, Set[ColRef]]] = node
+        self.vecs : str = 'vecs_' + base62uuid(3)
         return super().init(node)
 
     def produce(self, node : List[Tuple[expr, Set[ColRef]]]):
@@ -561,21 +569,22 @@ class groupby_c(ast_node):
                 e = g_str
             g_contents_list.append(e)
         first_col = g_contents_list[0]
+        self.total_sz = 'len_' + base62uuid(4)
+        self.context.emitc(f'uint32_t {self.total_sz} = {first_col}.size;')
         g_contents_decltype = [f'decays' for c in g_contents_list]
         g_contents = ', '.join(
             [f'{c}[{scanner_itname}]' for c in g_contents_list]
         )
         self.context.emitc(f'typedef record<{",".join(g_contents_decltype)}> {self.group_type};')
-        self.context.emitc(f'ankerl::unordered_dense::map<{self.group_type}, vector_type, '
-                           f'transTypes<{self.group_type}, hasher>> {self.group};')
-        self.context.emitc(f'{self.group}.reserve({first_col}.size);')
+        self.context.emitc(f'AQHashTable<{self.group_type}, '
+                           f'transTypes<{self.group_type}, hasher>> {self.group} {{{self.total_sz}}};')
         self.n_grps = len(self.glist)
-        self.scanner = scan(self, first_col + '.size', it_name=scanner_itname)
-        self.scanner.add(f'{self.group}[forward_as_tuple({g_contents})].emplace_back({self.scanner.it_var});')
+        self.scanner = scan(self, self.total_sz, it_name=scanner_itname)
+        self.scanner.add(f'{self.group}.hashtable_push(forward_as_tuple({g_contents}), {self.scanner.it_var});')
 
     def consume(self, _):
         self.scanner.finalize()
-
+        self.context.emitc(f'auto {self.vecs} = {self.group}.ht_postproc({self.total_sz});')
     # def deal_with_assumptions(self, assumption:assumption, out:TableInfo):
     #     gscanner = scan(self, self.group)
     #     val_var = 'val_'+base62uuid(7)
     #     gscanner.add(f'{self.datasource.cxt_name}->order_by<{assumption.result()}>(&{val_var});')
     #     gscanner.finalize()
 
-    def finalize(self, cexprs : List[Tuple[str, expr]], var_table : Dict[str, Union[str, int]], col_names : List[str]):
-        for c in col_names:
+    def finalize(self, cexprs : List[Tuple[str, expr]], var_table : Dict[str, Union[str, int]],
+                 col_names : List[str], col_types : List[Types], col_tovec : List[bool]):
+        tovec_columns = set()
+        for i, c in enumerate(col_names):
             self.context.emitc(f'{c}.reserve({self.group}.size());')
+            if col_tovec[i]: # and type(col_types[i]) is VectorT:
+                typename : Types = col_types[i] # .inner_type
+                self.context.emitc(f'auto buf_{c} = static_cast<{typename.cname} *>(malloc({self.total_sz} * sizeof({typename.cname})));')
+                tovec_columns.add(c)
+        self.arr_len = 'arrlen_' + base62uuid(3)
+        self.arr_values = 'arrvals_' + base62uuid(3)
 
-        gscanner = scan(self, self.group, loop_style = 'for_each')
+        if len(tovec_columns):
+            self.context.emitc(f'auto {self.arr_len} = {self.group}.size();')
+            self.context.emitc(f'auto {self.arr_values} = {self.group}.values();')
+            preproc_scanner = scan(self, self.arr_len)
+            preproc_scanner_it = preproc_scanner.it_var
+            for c in tovec_columns:
+                preproc_scanner.add(f'{c}[{preproc_scanner_it}].init_from'
+                                    f'({self.vecs}[{preproc_scanner_it}].size,'
+                                    f' {"buf_" + c} + {self.group}.ht_base'
+                                    f'[{preproc_scanner_it}]);'
+                                    )
+            preproc_scanner.finalize()
 
+        # gscanner = scan(self, self.group, loop_style = 'for_each')
+        gscanner = scan(self, self.arr_len)
         key_var = 'key_'+base62uuid(7)
         val_var = 'val_'+base62uuid(7)
-        gscanner.add(f'auto &{key_var} = {gscanner.it_var}.first;', position = 'front')
-        gscanner.add(f'auto &{val_var} = {gscanner.it_var}.second;', position = 'front')
+        # gscanner.add(f'auto &{key_var} = {gscanner.it_var}.first;', position = 'front')
+        # gscanner.add(f'auto &{val_var} = {gscanner.it_var}.second;', position = 'front')
+        gscanner.add(f'auto &{key_var} = {self.arr_values}[{gscanner.it_var}];', position = 'front')
+        gscanner.add(f'auto &{val_var} = {self.vecs}[{gscanner.it_var}];', position = 'front')
         len_var = None
         def define_len_var():
             nonlocal len_var
@@ -627,7 +660,7 @@ class groupby_c(ast_node):
                 materialize_builtin = materialize_builtin,
                 count=lambda:f'{val_var}.size')
 
-        for ce in cexprs:
+        for i, ce in enumerate(cexprs):
             ex = ce[1]
             materialize_builtin = {}
             if type(ex.udf_called) is udf:
@@ -640,7 +673,16 @@ class groupby_c(ast_node):
                     materialize_builtin['_builtin_ret'] = f'{ce[0]}.back()'
                     gscanner.add(f'{ex.eval(c_code = True, y=get_var_names, materialize_builtin = materialize_builtin)};\n')
                     continue
-            gscanner.add(f'{ce[0]}.emplace_back({get_var_names_ex(ex)});\n')
+            if col_tovec[i]:
+                if ex.opname == 'avgs':
+                    patch_expr = get_var_names_ex(ex)
+                    patch_expr = patch_expr[:patch_expr.rindex(')')]
+                    patch_expr += ', ' + f'{ce[0]}[{gscanner.it_var}]' + ')'
+                    gscanner.add(f'{patch_expr};\n')
+                else:
+                    gscanner.add(f'{ce[0]}[{gscanner.it_var}] = {get_var_names_ex(ex)};\n')
+            else:
+                gscanner.add(f'{ce[0]}.emplace_back({get_var_names_ex(ex)});\n')
 
         gscanner.finalize()
 
@@ -718,10 +760,11 @@ class groupby(ast_node):
             # self.parent.var_table.
             self.parent.col_ext.update(l[1])
 
-    def finalize(self, cexprs : List[Tuple[str, expr]], var_table : Dict[str, Union[str, int]], col_names : List[str]):
+    def finalize(self, cexprs : List[Tuple[str, expr]], var_table : Dict[str, Union[str, int]],
+                 col_names : List[str], col_types : List[Types], col_tovec : List[bool]):
         if self.use_sp_gb:
             self.dedicated_gb = groupby_c(self.parent, self.dedicated_glist)
-            self.dedicated_gb.finalize(cexprs, var_table, col_names)
+            self.dedicated_gb.finalize(cexprs, var_table, col_names, col_types, col_tovec)
 
 class join(ast_node):
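
The regenerated group-by code above is a two-pass materialization: the scan pushes every row's key through hashtable_push to get a group id, ht_postproc then rebuilds the per-group row-index lists, and ht_base gives each group's base offset into one malloc'd buffer so vector results like avgs can be written in place via init_from. The core of that second pass is a counting-sort-style gather. A distilled sketch of it, independent of AQHashTable (gather_groups and its parameters are illustrative names, not the generated code's identifiers):

#include <cstdint>
#include <vector>

// Row i belongs to group gids[i]; ids are dense in [0, n_groups).
std::vector<std::vector<uint32_t>> gather_groups(const std::vector<uint32_t>& gids,
                                                 uint32_t n_groups) {
    std::vector<uint32_t> cnt(n_groups, 0);
    for (uint32_t g : gids) ++cnt[g];               // pass 1: group sizes
    std::vector<std::vector<uint32_t>> vecs(n_groups);
    for (uint32_t g = 0; g < n_groups; ++g)
        vecs[g].reserve(cnt[g]);
    for (uint32_t i = 0; i < gids.size(); ++i)      // pass 2: scatter row indices
        vecs[gids[i]].push_back(i);
    return vecs;                                    // vecs[g] = row ids of group g
}

A prefix sum over cnt yields the same base offsets that ht_base exposes, which is what makes the single shared output buffer per vector column possible.
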
diff --git a/server/hasher.h b/server/hasher.h
index b632319..22a98e2 100644
--- a/server/hasher.h
+++ b/server/hasher.h
@@ -132,7 +132,3 @@ namespace ankerl::unordered_dense{
 	struct hash> : public hasher{ };
 }
-struct aq_hashtable_value_t {
-	uint32_t id;
-	uint32_t cnt;
-};
\ No newline at end of file
diff --git a/server/server.cpp b/server/server.cpp
index 507e944..3fcbe9b 100644
--- a/server/server.cpp
+++ b/server/server.cpp
@@ -295,6 +295,7 @@ void initialize_module(const char* module_name, void* module_handle, Context* cxt){
 		printf("Warning: module %s have no session support.\n", module_name);
 	}
 }
+
 #pragma endregion
 int dll_main(int argc, char** argv, Context* cxt){
 	aq_timer timer;
diff --git a/server/unordered_dense.h b/server/unordered_dense.h
index 828615d..03b6fc0 100644
--- a/server/unordered_dense.h
+++ b/server/unordered_dense.h
@@ -1062,7 +1062,7 @@ public:
     // template
    // bool hashtable_push(K&& key) {
    //     auto it_isinserted = try_emplace(std::forward(key), 1);
-    //     if (!it_isinserted.second)
+    //     if (!it_isinserted.second) 
    //         ++ it_isinserted.first->second;
    //     return it_isinserted.second;
    // }
@@ -1113,8 +1113,8 @@
     template <typename K,
-              typename KE = KeyEqual,
-              std::enable_if_t<is_transparent_v && is_transparent_v, bool> = true>
+              typename KE = KeyEqual>//,
+              //std::enable_if_t && is_transparent_v, bool> = true>
     auto hashtable_push(K&& key) -> unsigned {
         if (is_full()) {
             increase_size();
         }
@@ -1141,35 +1141,35 @@
         return static_cast(value_idx);
     }
 
-    template
-    auto hashtable_push(Args&&... args) -> unsigned {
-        if (is_full()) {
-            increase_size();
-        }
-
-        // we have to instantiate the value_type to be able to access the key.
-        // 1. emplace_back the object so it is constructed. 2. If the key is already there, pop it later in the loop.
-        auto& key = get_key(m_values.emplace_back(std::forward(args)...));
-        auto hash = mixed_hash(key);
-        auto dist_and_fingerprint = dist_and_fingerprint_from_hash(hash);
-        auto bucket_idx = bucket_idx_from_hash(hash);
-
-        while (dist_and_fingerprint <= at(m_buckets, bucket_idx).m_dist_and_fingerprint) {
-            if (dist_and_fingerprint == at(m_buckets, bucket_idx).m_dist_and_fingerprint &&
-                m_equal(key, get_key(m_values[at(m_buckets, bucket_idx).m_value_idx]))) {
-                m_values.pop_back(); // value was already there, so get rid of it
-                return static_cast(at(m_buckets, bucket_idx).m_value_idx);
-            }
-            dist_and_fingerprint = dist_inc(dist_and_fingerprint);
-            bucket_idx = next(bucket_idx);
-        }
-
-        // value is new, place the bucket and shift up until we find an empty spot
-        auto value_idx = static_cast(m_values.size() - 1);
-        place_and_shift_up({dist_and_fingerprint, value_idx}, bucket_idx);
-
-        return static_cast(value_idx);
-    }
+    // template
+    // auto hashtable_push(Args&&... args) -> unsigned {
+    //     if (is_full()) {
+    //         increase_size();
+    //     }
+
+    //     // we have to instantiate the value_type to be able to access the key.
+    //     // 1. emplace_back the object so it is constructed. 2. If the key is already there, pop it later in the loop.
+    //     auto& key = get_key(m_values.emplace_back(std::forward(args)...));
+    //     auto hash = mixed_hash(key);
+    //     auto dist_and_fingerprint = dist_and_fingerprint_from_hash(hash);
+    //     auto bucket_idx = bucket_idx_from_hash(hash);
+
+    //     while (dist_and_fingerprint <= at(m_buckets, bucket_idx).m_dist_and_fingerprint) {
+    //         if (dist_and_fingerprint == at(m_buckets, bucket_idx).m_dist_and_fingerprint &&
+    //             m_equal(key, get_key(m_values[at(m_buckets, bucket_idx).m_value_idx]))) {
+    //             m_values.pop_back(); // value was already there, so get rid of it
+    //             return static_cast(at(m_buckets, bucket_idx).m_value_idx);
+    //         }
+    //         dist_and_fingerprint = dist_inc(dist_and_fingerprint);
+    //         bucket_idx = next(bucket_idx);
+    //     }
+
+    //     // value is new, place the bucket and shift up until we find an empty spot
+    //     auto value_idx = static_cast(m_values.size() - 1);
+    //     place_and_shift_up({dist_and_fingerprint, value_idx}, bucket_idx);
+
+    //     return static_cast(value_idx);
+    // }
 
     template
     auto emplace(Args&&... args) -> std::pair {
         if (is_full()) {
diff --git a/server/vector_type.hpp b/server/vector_type.hpp
index a5a3b40..dc109b1 100644
--- a/server/vector_type.hpp
+++ b/server/vector_type.hpp
@@ -427,6 +427,19 @@ constexpr vector_type::vector_type(const uint32_t size, void*
 // }
 // }
 
+// template