read complex data from csv

dev
Bill 2 years ago
parent 818ab3b2e5
commit 6478deb7da

@ -147,8 +147,7 @@ See files in ./tests/ for more examples.
## Execution Engines
- AQuery++ supports different execution engines thanks to the decoupled compiler structure.
- Hybrid Execution Engine: decouples the query into two parts. The sql-compliant part is executed by an Embedded version of Monetdb and everything else is executed by a post-process module which is generated by AQuery++ Compiler in C++ and then compiled and executed.
- AQuery Execution Engine: executes queries by compiling the query plan to C++ code. Doesn't support joins and udf functions.
- K9 Execution Engine: (discontinued).
- AQuery Library: A set of header based libraries that provide column arithmetic and operations inspired by array programming languages like kdb. This library is used by C++ post-processor code which can significantly reduce the complexity of generated code, reducing compile time while maintaining the best performance. The set of libraries can also be used by UDFs as well as User modules which makes it easier for users to write simple but powerful extensions.
# Roadmap
- [x] SQL Parser -> AQuery Parser (Front End)
@ -156,14 +155,21 @@ See files in ./tests/ for more examples.
- [x] Schema and Data Model
- [x] Data acquisition/output from/to csv file
- [ ] Execution Engine
- [x] Projections and single-group Aggregations
- [x] Group by Aggregations
- [x] Filters
- [x] Order by
- [x] Assumption
- [x] Flatten
- [x] Join (Hybrid Engine only)
- [ ] Subqueries
- [x] Single Query
- [x] Projections and single-group Aggregations
- [x] Group by Aggregations
- [x] Filters
- [x] Order by
- [x] Assumption
- [x] Flatten
- [x] Join (Hybrid Engine only)
- [ ] Subquery
- [ ] With Clause
- [ ] From subquery
- [ ] Select subquery
- [ ] Where subquery
- [ ] Subquery in group by
- [ ] Subquery in order by
- [x] Query Optimization
- [x] Selection/Order by push-down
- [x] Join Optimization (Only in Hybrid Engine)

@ -8,6 +8,7 @@
#
from sre_parse import WHITESPACE
from mo_parsing.helpers import restOfLine
from mo_parsing.infix import delimited_list
from mo_parsing.whitespaces import NO_WHITESPACE, Whitespace
@ -655,7 +656,8 @@ def parser(literal_string, ident, sqlserver=False):
) / to_json_call
load_data = (
keyword("data") ("file_type")
Optional(keyword("complex")("complex"))
+ keyword("data") ("file_type")
+ keyword("infile")("loc")
+ literal_string ("file")
+ INTO
@ -667,6 +669,12 @@ def parser(literal_string, ident, sqlserver=False):
+ keyword("by").suppress()
+ literal_string ("term")
)
+ Optional(
keyword("element").suppress()
+ keyword("terminated").suppress()
+ keyword("by").suppress()
+ literal_string ("ele")
)
)
module_func_def = (

61
csv.h

@ -1,4 +1,4 @@
// Copyright: (2012-2015) Ben Strasser <code@ben-strasser.net>
// Copyright: (2012-2015) Ben Strasser <code@ben-strasser.net>, 2022 Bill Sun
// License: BSD-3
//
// All rights reserved.
@ -49,6 +49,7 @@
#include <cerrno>
#include <istream>
#include <limits>
#include "server/vector_type.hpp"
namespace io{
////////////////////////////////////////////////////////////////////////////
@ -974,8 +975,7 @@ namespace io{
return;
}
x = 10*x+y;
}else
throw error::no_digit();
}
++col;
}
}
@ -1005,8 +1005,7 @@ namespace io{
return;
}
x = 10*x-y;
}else
throw error::no_digit();
}
++col;
}
return;
@ -1080,19 +1079,37 @@ namespace io{
}
x *= base;
}
}else{
if(*col != '\0')
throw error::no_digit();
}
if(is_neg)
x = -x;
}
template<class overflow_policy> void parse(char*col, float&x) { parse_float(col, x); }
template<class overflow_policy> void parse(char*col, double&x) { parse_float(col, x); }
template<class overflow_policy> void parse(char*col, long double&x) { parse_float(col, x); }
// Parses a sep2-separated list (e.g. "1;2;3") out of one csv cell into the
// AQuery vector_type container x, using the scalar ::io::detail::parse for
// each element. Termination of each element is done in place by temporarily
// writing '\0' into the cell buffer and restoring the byte afterwards, so
// the buffer is left unchanged on exit.
//
// NOTE(review): the second skip-loop advances *past* the separator (and any
// space/tab/CR/LF) before the NUL is written, so the text handed to the
// scalar parser still contains the trailing separator/whitespace
// (e.g. "1;" for input "1;2"). This only works if the scalar numeric
// parsers tolerate trailing non-digit characters — confirm against the
// integer/float parsers, whose no_digit throws were relaxed in this same
// change.
template<class overflow_policy, class T, char sep2 = ';'>
void parse_vector(char* col, vector_type<T>& x) {
        while (*col != '\0') {
                char* next_col = col;
                // advance to the element separator (or end of cell)
                while (*next_col != sep2 && *next_col != '\0')
                        ++next_col;
                // skip the separator and any surrounding whitespace/newlines
                while (*next_col == ' ' || *next_col == '\t' ||
                        *next_col == sep2 || *next_col == '\r' ||
                        *next_col == '\n')
                        ++next_col;
                // temporarily NUL-terminate so the scalar parser sees one element
                char _next_end = *next_col;
                *next_col = '\0';
                T y;
                ::io::detail::parse<overflow_policy>(col, y);
                x.emplace_back(y);
                col = next_col;
                // restore the byte we overwrote
                *next_col = _next_end;
        }
}
template<class overflow_policy, class T>
void parse(char*col, T&x){
// Mute unused variable compiler warning
@ -1108,6 +1125,7 @@ namespace io{
}
template<unsigned column_count,
char sep2 = -2,
class trim_policy = trim_chars<' ', '\t'>,
class quote_policy = no_quote_escape<','>,
class overflow_policy = throw_on_overflow,
@ -1234,7 +1252,23 @@ namespace io{
parse_helper(r+1, cols...);
}
// Overload of the recursive column dispatcher for vector-typed columns:
// parses cell r of the current row as a sep2-separated element list into t,
// then recurses on the remaining columns. Empty (null) cells are skipped,
// leaving t untouched. Mirrors the scalar parse_helper overload: on failure
// the nested catches enrich the exception with the offending cell text and
// the column name before re-throwing.
template<class T, class ...ColType>
void parse_helper(std::size_t r, vector_type<T>&t, ColType&...cols){
        if(row[r]){
                try{
                        try{
                                // element-wise parse of the raw cell text
                                ::io::detail::parse_vector<overflow_policy, T, sep2>(row[r], t);
                        }catch(error::with_column_content&err){
                                // attach the cell content for diagnostics
                                err.set_column_content(row[r]);
                                throw;
                        }
                }catch(error::with_column_name&err){
                        // attach the column name for diagnostics
                        err.set_column_name(column_names[r].c_str());
                        throw;
                }
        }
        parse_helper(r+1, cols...);
}
public:
template<class ...ColType>
bool read_row(ColType& ...cols){
@ -1269,5 +1303,12 @@ namespace io{
}
};
}
// Reader preset used by AQuery's generated loader code: sep1 (default ',')
// separates fields, sep2 (default ';') separates elements inside a
// vector-typed cell and is forwarded to CSVReader's per-column vector
// parser; leading/trailing spaces and tabs are trimmed, overflow is
// ignored, and empty lines are treated as comments.
template <unsigned column_count, char sep1 = ',', char sep2 = ';'>
using AQCSVReader = io::CSVReader<column_count, sep2,
        io::trim_chars<' ', '\t'>, io::no_quote_escape<sep1>,
        io::ignore_overflow, io::empty_line_comment
>;
#endif

@ -1,8 +1,9 @@
from copy import deepcopy
from engine.utils import base62uuid, defval
from aquery_config import have_hge
from typing import Dict, List
from aquery_config import have_hge
from engine.utils import base62uuid, defval
type_table: Dict[str, "Types"] = {}
class Types:
@ -65,10 +66,10 @@ class Types:
return self.sqlname
@staticmethod
def decode(aquery_type : str, vector_type:str = 'ColRef') -> "Types":
if (aquery_type.startswith('vec')):
def decode(aquery_type : str, vector_type:str = 'vector_type') -> "Types":
if (aquery_type.lower().startswith('vec')):
return VectorT(Types.decode(aquery_type[3:]), vector_type)
return type_table[aquery_type]
return type_table[aquery_type.lower()]
class TypeCollection:
def __init__(self, sz, deftype, fptype = None, utype = None, *, collection = None) -> None:
@ -121,7 +122,7 @@ class VectorT(Types):
return f'{self.vector_type}<{self.inner_type.name}>'
@property
def sqlname(self) -> str:
return 'BIGINT'
return 'HUGEINT' # Store vector_type into 16 bit integers
@property
def cname(self) -> str:
return f'{self.vector_type}<{self.inner_type.cname}>'
@ -142,7 +143,7 @@ fp_types : Dict[str, Types] = _ty_make_dict('t.sqlname.lower()', FloatT, DoubleT
temporal_types : Dict[str, Types] = _ty_make_dict('t.sqlname.lower()', DateT, TimeT, TimeStampT)
builtin_types : Dict[str, Types] = {
'string' : StrT,
**_ty_make_dict('t.sqlname.lower()', AnyT, TextT, VarcharT),
**_ty_make_dict('t.sqlname.lower()', AnyT, TextT, VarcharT, HgeT),
**int_types, **fp_types, **temporal_types}
def get_int128_support():
@ -365,3 +366,5 @@ user_module_func = {}
builtin_operators : Dict[str, OperatorBase] = {**builtin_binary_arith, **builtin_binary_logical,
**builtin_unary_arith, **builtin_unary_logical, **builtin_unary_special, **builtin_func, **builtin_cstdlib,
**user_module_func}
type_table = {**builtin_types, **type_table}

@ -1,6 +1,6 @@
from collections import OrderedDict
from collections.abc import MutableMapping, Mapping
import uuid
from collections import OrderedDict
from collections.abc import Mapping, MutableMapping
lower_alp = 'abcdefghijklmnopqrstuvwxyz'
upper_alp = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
@ -107,6 +107,8 @@ def defval(val, default):
# escape must be readonly
from typing import Mapping, Set
def remove_last(pattern : str, string : str, escape : Set[str] = set()) -> str:
idx = string.rfind(pattern)
if idx == -1:
@ -126,9 +128,11 @@ class _Counter:
return cnt
import re
ws = re.compile(r'\s+')
import os
def add_dll_dir(dll: str):
import sys
if sys.version_info.major >= 3 and sys.version_info.minor >7 and os.name == 'nt':

@ -1,4 +1,5 @@
import aquery_config
help_message = '''\
======================================================
AQUERY COMMANDLINE HELP
@ -82,31 +83,31 @@ if __name__ == '__main__':
import os
from dataclasses import dataclass
import atexit
import ctypes
import enum
import time
import mmap
import os
# import dbconn
import re
import subprocess
import sys
import threading
import time
from dataclasses import dataclass
from typing import Callable, List, Optional
import numpy as np
from mo_parsing import ParseException
import aquery_parser as parser
import engine
import engine.projection
import engine.ddl
import engine.projection
import reconstruct as xengine
import subprocess
import mmap
import sys
from engine.utils import base62uuid
import atexit
import threading
import ctypes
import numpy as np
from engine.utils import ws
from engine.utils import add_dll_dir
from engine.utils import nullstream
from build import build_manager
from engine.utils import add_dll_dir, base62uuid, nullstream, ws
## CLASSES BEGIN
class RunType(enum.Enum):
@ -407,7 +408,7 @@ def prompt(running = lambda:True, next = lambda:input('> '), state = None):
for t in cxt.tables:
lst_cols = []
for c in t.columns:
lst_cols.append(f'{c.name} : {c.type}')
lst_cols.append(f'{c.name} : {c.type.name}')
print(f'{t.table_name} ({", ".join(lst_cols)})')
continue
elif q.startswith('help'):
@ -605,7 +606,8 @@ def prompt(running = lambda:True, next = lambda:input('> '), state = None):
print("\nBye.")
raise
except ValueError as e:
import code, traceback
import code
import traceback
__stdin = os.dup(0)
raise_exception = True
sh = code.InteractiveConsole({**globals(), **locals()})

@ -1,4 +1,5 @@
from reconstruct.ast import Context, ast_node
saved_cxt = None
def initialize(cxt = None, keep = False):

@ -1,12 +1,13 @@
from copy import deepcopy
from dataclasses import dataclass
from enum import Enum, auto
from typing import Set, Tuple, Dict, Union, List, Optional
from typing import Dict, List, Optional, Set, Tuple, Union
from engine.types import *
from engine.utils import enlist, base62uuid, base62alp, get_legal_name
from reconstruct.storage import Context, TableInfo, ColRef
from engine.utils import base62alp, base62uuid, enlist, get_legal_name
from reconstruct.storage import ColRef, Context, TableInfo
class ast_node:
header = []
types = dict()
@ -70,7 +71,11 @@ class projection(ast_node):
elif 'select_distinct' in node:
p = node['select_distinct']
self.distinct = True
if 'with' in node:
self.with_clause = projection(self, node['value'])
else:
self.with_clause = None
self.projections = p if type(p) is list else [p]
if self.parent is None:
self.context.sql_begin()
@ -951,6 +956,9 @@ class load(ast_node):
if node['load']['file_type'] == 'module':
self.produce = self.produce_module
self.module = True
elif 'complex' in node['load']:
self.produce = self.produce_cpp
self.consume = lambda *_: None
elif self.context.dialect == 'MonetDB':
self.produce = self.produce_monetdb
else:
@ -1019,7 +1027,56 @@ class load(ast_node):
self.sql = f'{s1} \'{p}\' {s2} '
if 'term' in node:
self.sql += f' {s3} \'{node["term"]["literal"]}\''
def produce_cpp(self, node):
    '''Code-generation path for ``LOAD COMPLEX DATA``: instead of letting the
    embedded MonetDB ingest the csv directly (it has no vector-typed
    columns), emit C++ post-processor code that reads the file with
    AQCSVReader and appends the rows into the server-side table.

    Generated code layout:
      1. SQL: SELECT every column of the target table (exposes the table
         through the server connection).
      2. C++: build a TableInfo mirror and bind each column to the server
         buffers via initfrom.
      3. C++: loop over csv rows, parsing each field into a per-column
         temporary, and emplace them into the columns.
      4. C++: append the populated table back into MonetDB.
    '''
    self.context.has_dll = True
    self.context.headers.add('"csv.h"')
    node = node['load']
    # unique name for the generated post-processing function
    self.postproc_fname = 'ld_' + base62uuid(5)
    self.context.postproc_begin(self.postproc_fname)
    table:TableInfo = self.context.tables_byname[node['table']]
    # NOTE(review): the f-string already ends with ';' and emit() appends
    # another ';\n' below, producing ';;' in the SQL stream — confirm the
    # server tolerates the empty statement.
    self.sql = F"SELECT {', '.join([c.name for c in table.columns])} FROM {table.table_name};"
    self.emit(self.sql+';\n')
    self.context.sql_end()
    # row count of the freshly selected table, as reported by the server
    length_name = 'len_' + base62uuid(6)
    self.context.emitc(f'auto {length_name} = server->cnt;')
    out_typenames = [t.type.cname for t in table.columns]
    outtable_col_nameslist = ', '.join([f'"{c.name}"' for c in table.columns])
    self.outtable_col_names = 'names_' + base62uuid(4)
    self.context.emitc(f'const char* {self.outtable_col_names}[] = {{{outtable_col_nameslist}}};')
    self.out_table = 'tbl_' + base62uuid(4)
    self.context.emitc(f'auto {self.out_table} = new TableInfo<{",".join(out_typenames)}>("{table.table_name}", {self.outtable_col_names});')
    # bind each output column to the corresponding server-side buffer
    for i, c in enumerate(table.columns):
        c.cxt_name = 'c_' + base62uuid(6)
        self.context.emitc(f'decltype(auto) {c.cxt_name} = {self.out_table}->get_col<{i}>();')
        self.context.emitc(f'{c.cxt_name}.initfrom({length_name}, server->getCol({i}), "{table.columns[i].name}");')
    csv_reader_name = 'csv_reader_' + base62uuid(6)
    col_types = [c.type.cname for c in table.columns]
    col_tmp_names = ['tmp_'+base62uuid(8) for _ in range(len(table.columns))]
    #col_names = ','.join([f'"{c.name}"' for c in table.columns])
    # field/element terminators from the FIELDS/ELEMENT TERMINATED BY clauses,
    # defaulting to ',' and ';'.
    # NOTE(review): .strip()[0] below raises IndexError when the literal is
    # empty or all-whitespace — TODO guard before indexing.
    term_field = ',' if 'term' not in node else node['term']['literal']
    term_ele = ';' if 'ele' not in node else node['ele']['literal']
    self.context.emitc(f'AQCSVReader<{len(col_types)}, \'{term_field.strip()[0]}\', \'{term_ele.strip()[0]}\'> {csv_reader_name}("{node["file"]["literal"]}");')
    # self.context.emitc(f'{csv_reader_name}.read_header(io::ignore_extra_column, {col_names});')
    # skip the header line of the csv
    self.context.emitc(f'{csv_reader_name}.next_line();')
    # declare one temporary per column for read_row to fill
    for t, n in zip(col_types, col_tmp_names):
        self.context.emitc(f'{t} {n};')
    self.context.emitc(f'while({csv_reader_name}.read_row({",".join(col_tmp_names)})) {{ \n')
    for i, c in enumerate(table.columns):
        # NOTE(review): the generated print() of each parsed value looks
        # like leftover debug output — consider removing.
        self.context.emitc(f'print({col_tmp_names[i]});')
        self.context.emitc(f'{c.cxt_name}.emplace_back({col_tmp_names[i]});')
    self.context.emitc('}')
    # NOTE(review): whole-table print also looks like debug output.
    self.context.emitc(f'print(*{self.out_table});')
    self.context.emitc(f'{self.out_table}->monetdb_append_table(cxt->alt_server, "{table.table_name}");')
    self.context.postproc_end(self.postproc_fname)
class outfile(ast_node):
name="_outfile"
def __init__(self, parent, node, context = None, *, sql = None):
@ -1121,7 +1178,7 @@ class udf(ast_node):
def produce(self, node):
from engine.utils import get_legal_name, check_legal_name
from engine.utils import check_legal_name, get_legal_name
node = node[self.name]
# register udf
self.agg = 'Agg' in node
@ -1216,7 +1273,7 @@ class udf(ast_node):
def consume(self, node):
from engine.utils import get_legal_name, check_legal_name
from engine.utils import check_legal_name, get_legal_name
node = node[self.name]
if 'params' in node:
@ -1339,4 +1396,5 @@ def include(objs):
import sys
include(sys.modules[__name__])

@ -1,7 +1,8 @@
from typing import Optional, Set
from engine.types import *
from reconstruct.ast import ast_node
from reconstruct.storage import ColRef, Context
from engine.types import *
# TODO: Decouple expr and upgrade architecture
# C_CODE : get ccode/sql code?
@ -31,6 +32,7 @@ class expr(ast_node):
def __init__(self, parent, node, *, c_code = None, supress_undefined = False):
from reconstruct.ast import projection, udf
# gen2 expr have multi-passes
# first pass parse json into expr tree
# generate target code in later passes upon need
@ -78,7 +80,7 @@ class expr(ast_node):
ast_node.__init__(self, parent, node, None)
def init(self, _):
from reconstruct.ast import projection, _tmp_join_union
from reconstruct.ast import _tmp_join_union, projection
parent = self.parent
self.is_compound = parent.is_compound if type(parent) is expr else False
if type(parent) in [projection, expr, _tmp_join_union]:

@ -1,12 +1,14 @@
from typing import Dict, List, Set
from engine.types import *
from engine.utils import CaseInsensitiveDict, base62uuid, enlist
from typing import List, Dict, Set
class ColRef:
def __init__(self, _ty, cobj, table:'TableInfo', name, id, compound = False, _ty_args = None):
self.type : Types = AnyT
if type(_ty) is str:
self.type = builtin_types[_ty.lower()]
self.type = Types.decode(_ty)
if _ty_args:
self.type = self.type(enlist(_ty_args))
elif type(_ty) is Types:
@ -17,6 +19,7 @@ class ColRef:
self.alias = set()
self.id = id # position in table
self.compound = compound # compound field (list as a field)
self.cxt_name = ''
# e.g. order by, group by, filter by expressions
self.__arr__ = (_ty, cobj, table, name, id)

@ -159,6 +159,10 @@ public:
grow();
container[size++] = _val;
}
// Lvalue overload that MOVES from its argument (note: _Ty&, not const _Ty&),
// leaving _val in a moved-from state. NOTE(review): this appears deliberate —
// the generated loader code re-reads into the same temporary each loop
// iteration, and the element parser appends without clearing, so draining
// the source here prevents accumulation — but it will surprise callers
// expecting copy semantics; confirm all call sites intend the move.
void emplace_back(_Ty& _val) {
grow();
container[size++] = std::move(_val);
}
void emplace_back(_Ty&& _val) {
grow();
container[size++] = std::move(_val);

@ -0,0 +1,3 @@
create table f (a float, b vecfloat, c int)
load complex data infile 'data/test_complex.csv' into table f fields terminated by ',' element terminated by ';'
select * from f

@ -17,4 +17,10 @@ LOAD DATA INFILE "data/ticks.csv" INTO TABLE TICKS FIELDS TERMINATED BY ","
SELECT max(endofdayprice/prev(endofdayprice)) as Max_Ratio
FROM ticks
ASSUMING ASC date
WHERE ID = "3001"
WHERE ID = "3001"
CREATE TABLE ticks2(ID VARCHAR(20), max REAL, min REAL)
INSERT INTO ticks2 SELECT ID AS ID, max(ratios(endofdayprice)) AS max, min(ratios(endofdayprice)) AS min from ticks group by ID;
SELECT ID, max, min
FROM ticks2;
Loading…
Cancel
Save