From ff63be720c6cc2c509a200f4a7f58a08eb2ae782 Mon Sep 17 00:00:00 2001 From: Bill Date: Tue, 20 Sep 2022 02:21:49 +0800 Subject: [PATCH] Bug fixes for alias&join. Add test in presentation. --- .gitignore | 3 + aquery_config.py | 2 +- aquery_parser/keywords.py | 5 +- aquery_parser/sql_parser.py | 16 ++- datagen.cpp | 47 ++++++- engine/projection.py | 6 +- reconstruct/ast.py | 54 +++++--- reconstruct/expr.py | 230 +++++++++++++++++++---------------- server/io.cpp | 6 +- server/types.h | 2 +- server/utils.h | 5 +- tests/best_profit.a | 41 +++++++ tests/datagen_jose/README | 26 ++++ tests/datagen_jose/RandGen.H | 61 ++++++++++ tests/datagen_jose/Time.C | 71 +++++++++++ tests/datagen_jose/Time.H | 43 +++++++ tests/datagen_jose/cal.C | 79 ++++++++++++ tests/datagen_jose/cal.H | 26 ++++ tests/datagen_jose/cal.cpp | 76 ++++++++++++ tests/datagen_jose/gen.C | 58 +++++++++ tests/datagen_jose/histgen.C | 198 ++++++++++++++++++++++++++++++ tests/datagen_jose/makefile | 20 +++ tests/datagen_jose/tickgen.C | 197 ++++++++++++++++++++++++++++++ tests/stock.a | 1 + 24 files changed, 1131 insertions(+), 142 deletions(-) create mode 100644 tests/best_profit.a create mode 100644 tests/datagen_jose/README create mode 100644 tests/datagen_jose/RandGen.H create mode 100644 tests/datagen_jose/Time.C create mode 100644 tests/datagen_jose/Time.H create mode 100644 tests/datagen_jose/cal.C create mode 100644 tests/datagen_jose/cal.H create mode 100644 tests/datagen_jose/cal.cpp create mode 100644 tests/datagen_jose/gen.C create mode 100644 tests/datagen_jose/histgen.C create mode 100644 tests/datagen_jose/makefile create mode 100644 tests/datagen_jose/tickgen.C diff --git a/.gitignore b/.gitignore index a2ad2b0..3ef09c6 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ +tests/datagen_jose/histgen +tests/datagen_jose/tickgen +datagen *.dSYM testmain.lib testmain.exp diff --git a/aquery_config.py b/aquery_config.py index 0470d20..3330b6e 100644 --- a/aquery_config.py +++ b/aquery_config.py @@ -2,7 +2,7 @@ ## GLOBAL CONFIGURATION FLAGS -version_string = '0.4.4a' +version_string = '0.4.5a' add_path_to_ldpath = True rebuild_backend = False run_backend = True diff --git a/aquery_parser/keywords.py b/aquery_parser/keywords.py index 479081b..5ae05bf 100644 --- a/aquery_parser/keywords.py +++ b/aquery_parser/keywords.py @@ -44,6 +44,7 @@ LEFT = keyword("left") LIKE = keyword("like") LIMIT = keyword("limit").suppress() MINUS = keyword("minus") +NATURAL = keyword("natural") OFFSET = keyword("offset").suppress() ON = keyword("on").suppress() ORDER = keyword("order").suppress() @@ -145,7 +146,7 @@ VIEW = keyword("view") joins = ( ( - Optional(CROSS | OUTER | INNER | ((FULL | LEFT | RIGHT) + Optional(INNER | OUTER))) + Optional(CROSS | OUTER | INNER | NATURAL | ((FULL | LEFT | RIGHT) + Optional(INNER | OUTER))) + JOIN + Optional(LATERAL) ) @@ -214,6 +215,7 @@ RESERVED = MatchFirst([ LIKE, LIMIT, MINUS, + NATURAL, NOCASE, NOT, NULL, @@ -253,6 +255,7 @@ EQ = Char("=").suppress() join_keywords = { "join", + "natural join", "full join", "cross join", "inner join", diff --git a/aquery_parser/sql_parser.py b/aquery_parser/sql_parser.py index c07aea3..45bbe28 100644 --- a/aquery_parser/sql_parser.py +++ b/aquery_parser/sql_parser.py @@ -323,9 +323,12 @@ def parser(literal_string, ident, sqlserver=False): table_source = Forward() + assumption = Group((ASC|DESC) ("sort") + var_name("value")) + assumptions = Optional(ASSUMING.suppress() + Group(delimited_list(assumption))) + join = ( Group(joins)("op") - + table_source("join") + + (table_source )("join") + Optional((ON + expr("on")) | (USING + expr("using"))) | ( Group(WINDOW)("op") @@ -403,7 +406,12 @@ def parser(literal_string, ident, sqlserver=False): | selection + Optional(INTO + table_source("into")) + Optional( - (FROM + delimited_list(table_source) + ZeroOrMore(join))("from") + ( + FROM + + (delimited_list(table_source) + + ZeroOrMore(join))("table_source") + + Optional(assumptions) ("assumptions") + )("from") + Optional(WHERE + expr("where")) + Optional(GROUP_BY + delimited_list(Group(named_column))("groupby")) + Optional(HAVING + expr("having")) @@ -443,12 +451,8 @@ def parser(literal_string, ident, sqlserver=False): + RB, ) - assumption = Group((ASC|DESC) ("sort") + var_name("value")) - assumptions = (ASSUMING + Group(delimited_list(assumption))("assumptions")) - table_source << Group( ((LB + query + RB) | stack | call_function | var_name)("value") - + Optional(assumptions) + Optional(flag("with ordinality")) + Optional(tablesample) + alias diff --git a/datagen.cpp b/datagen.cpp index a94d3e6..88f5a48 100644 --- a/datagen.cpp +++ b/datagen.cpp @@ -37,7 +37,7 @@ void permutation(int *v, int n) { } } -int main(int argc, char* argv[]) +int gen_trade_data(int argc, char* argv[]) { using std::vector; float frac = .3; @@ -108,3 +108,48 @@ int main(int argc, char* argv[]) fclose(fp); return 0; } +#include "./server/utils.h" +#include "./server/types.h" +#include +types::date_t rand_date(){ + unsigned char d = ui(engine) % 28 + 1; + unsigned char m = ui(engine) % 12 + 1; + short y = ui(engine) % 40 + 1990; + if (ui(engine) % 2) return types::date_t((unsigned char)10, (unsigned char)1, 2003); + return types::date_t{d, m, y}; +} +int gen_stock_data(int argc, char* argv[]){ + using std::string; + using namespace types; + int n_stocks = 5; + int n_data = 1000; + string* IDs = new string[n_stocks + 1]; + string* names = new string[n_stocks + 1]; + for(int i = 0; i < n_stocks; ++i){ + IDs[i] = base62uuid(); + names[i] = base62uuid(); + } + IDs[n_stocks] = "S"; + names[n_stocks] = "x"; + FILE* fp = fopen("./data/stock.csv", "w"); + fprintf(fp, "ID, timestamp, tradeDate, price\n"); + char date_str_buf [types::date_t::string_length()]; + int* timestamps = new int[n_data]; + for(int i = 0; i < n_data; ++i) timestamps[i] = i+1; + permutation(timestamps, n_data); + for(int i = 0; i < n_data; ++i){ + auto date = rand_date().toString(date_str_buf + date_t::string_length()); + fprintf(fp, "%s,%d,%s,%d\n", IDs[ui(engine)%(n_stocks + 1)].c_str(), timestamps[i], date, ui(engine) % 1000); + } + fclose(fp); + fp = fopen("./data/base.csv", "w"); + fprintf(fp, "ID, name\n"); + for(int i = 0; i < n_stocks + 1; ++ i){ + fprintf(fp, "%s,%s\n", IDs[i].c_str(), names[i].c_str()); + } + fclose(fp); +} + +int main(int argc, char* argv[]){ + gen_stock_data(argc, argv); +} diff --git a/engine/projection.py b/engine/projection.py index fa199ed..f813005 100644 --- a/engine/projection.py +++ b/engine/projection.py @@ -29,7 +29,7 @@ class projection(ast_node): def spawn(self, node): self.datasource = None if 'from' in node: - from_clause = node['from'] + from_clause = node['from']['table_source'] if type(from_clause) is list: # from joins join(self, from_clause) @@ -47,8 +47,8 @@ class projection(ast_node): self.datasource = self.context.tables_byname[value] if 'name' in value: self.datasource.add_alias(value['name']) - if 'assumptions' in from_clause: - self.assumptions = enlist(from_clause['assumptions']) + if 'assuming' in node['from']: + self.assumptions = enlist(node['from']['assuming']) elif type(from_clause) is str: self.datasource = self.context.tables_byname[from_clause] diff --git a/reconstruct/ast.py b/reconstruct/ast.py index a549b03..a66da39 100644 --- a/reconstruct/ast.py +++ b/reconstruct/ast.py @@ -73,7 +73,7 @@ class projection(ast_node): self.datasource = join(self, [], self.context) # datasource is Join instead of TableInfo self.assumptions = [] if 'from' in node: - from_clause = node['from'] + from_clause = node['from']['table_source'] self.datasource = join(self, from_clause) if 'assumptions' in from_clause: self.assumptions = enlist(from_clause['assumptions']) @@ -129,12 +129,17 @@ class projection(ast_node): if not proj_expr.is_special: y = lambda x:x name = eval('f\'' + name + '\'') + offset = len(col_exprs) if name not in self.var_table: - self.var_table[name] = len(col_exprs) - proj_map[i] = [this_type, len(col_exprs), proj_expr] + self.var_table[name] = offset + if proj_expr.is_ColExpr and type(proj_expr.raw_col) is ColRef: + for n in (proj_expr.raw_col.table.alias): + self.var_table[f'{n}.'+name] = offset + proj_map[i] = [this_type, offset, proj_expr] col_expr = name + ' AS ' + alias if alias else name if alias: - self.var_table[alias] = len(col_exprs) + self.var_table[alias] = offset + col_exprs.append((col_expr, proj_expr.type)) else: self.context.headers.add('"./server/aggregations.h"') @@ -164,10 +169,12 @@ class projection(ast_node): self.add(', '.join([c[0] for c in col_exprs] + col_ext_names)) _base_offset = len(col_exprs) - for i, col in enumerate(col_ext_names): - if col not in self.var_table: - self.var_table[col] = i + _base_offset - + for i, col in enumerate(self.col_ext): + if col.name not in self.var_table: + offset = i + _base_offset + self.var_table[col.name] = offset + for n in (col.table.alias): + self.var_table[f'{n}.'+col.name] = offset def finialize(astnode:ast_node): if(astnode is not None): @@ -548,24 +555,25 @@ class join(ast_node): self.tables_dir = dict() self.rec = None self.top_level = self.parent and type(self.parent) is projection + self.have_sep = False # self.tmp_name = 'join_' + base62uuid(4) # self.datasource = TableInfo(self.tmp_name, [], self.context) def append(self, tbls, __alias = ''): - alias = lambda t : '(' + t + ') ' + __alias if len(__alias) else t + alias = lambda t : t + ' ' + __alias if len(__alias) else t if type(tbls) is join: - self.joins.append(alias(tbls.__str__())) + self.joins.append((alias(tbls.__str__()), tbls.have_sep)) self.tables += tbls.tables self.tables_dir = {**self.tables_dir, **tbls.tables_dir} elif type(tbls) is TableInfo: - self.joins.append(alias(tbls.table_name)) + self.joins.append((alias(tbls.table_name), False)) self.tables.append(tbls) self.tables_dir[tbls.table_name] = tbls for a in tbls.alias: self.tables_dir[a] = tbls elif type(tbls) is projection: - self.joins.append(alias(tbls.finalize())) + self.joins.append((alias(tbls.finalize()), False)) def produce(self, node): if type(node) is list: @@ -589,13 +597,14 @@ class join(ast_node): tbl.add_alias(node['name']) self.append(tbl, alias) else: - keys = node.keys() + keys = list(node.keys()) if keys[0].lower().endswith('join'): + self.have_sep = True j = join(self, node[keys[0]]) tablename = f' {keys[0]} {j}' - if keys[1].lower() == 'on': + if len(keys) > 1 and keys[1].lower() == 'on': tablename += f' on {expr(self, node[keys[1]])}' - self.joins.append(tablename) + self.joins.append((tablename, self.have_sep)) self.tables += j.tables self.tables_dir = {**self.tables_dir, **j.tables_dir} @@ -622,18 +631,27 @@ class join(ast_node): if datasource is None: raise ValueError(f'Table name/alias not defined{parsedColExpr[0]}') else: - return datasource.parse_col_names(parsedColExpr[1]) + datasource.rec = self.rec + ret = datasource.parse_col_names(parsedColExpr[1]) + datasource.rec = None + return ret + @property def all_cols(self): return set([c for t in self.tables for c in t.columns]) def consume(self, node): - self.sql = ', '.join(self.joins) + self.sql = '' + for j in self.joins: + if not self.sql or j[1]: + self.sql += j[0] + else: + self.sql += ', ' + j[0] if node and self.sql and self.top_level: self.sql = ' FROM ' + self.sql return super().consume(node) def __str__(self): - return ', '.join(self.joins) + return self.sql def __repr__(self): return self.__str__() diff --git a/reconstruct/expr.py b/reconstruct/expr.py index ee9fcac..91ed2f9 100644 --- a/reconstruct/expr.py +++ b/reconstruct/expr.py @@ -87,116 +87,119 @@ class expr(ast_node): from reconstruct.ast import udf if type(node) is dict: - if len(node) > 1: - print(f'Parser Error: {node} has more than 1 dict entry.') + if 'literal' in node: + node = node['literal'] + else: + if len(node) > 1: + print(f'Parser Error: {node} has more than 1 dict entry.') - for key, val in node.items(): - if key in self.operators: - if key in builtin_func: - if self.is_agg_func: - self.root.is_special = True # Nested Aggregation - else: - self.is_agg_func = True - - op = self.operators[key] - count_distinct = False - if key == 'count' and type(val) is dict and 'distinct' in val: - count_distinct = True - val = val['distinct'] - val = enlist(val) - exp_vals = [expr(self, v, c_code = self.c_code) for v in val] - self.children = exp_vals - self.opname = key - - str_vals = [e.sql for e in exp_vals] - type_vals = [e.type for e in exp_vals] - is_compound = any([e.is_compound for e in exp_vals]) - if key in self.ext_aggfuncs: - self.is_compound = False - else: - self.is_compound = is_compound - try: - self.type = op.return_type(*type_vals) - except AttributeError as e: - if type(self.root) is not udf: - # TODO: do something when this is not an error - # print(f'alert: {e}') - pass - self.type = AnyT - - if count_distinct: # inject distinct col later - self.sql = f'{{{op(self.c_code, *str_vals, True)}}}' - else: - self.sql = op(self.c_code, *str_vals) + for key, val in node.items(): + if key in self.operators: + if key in builtin_func: + if self.is_agg_func: + self.root.is_special = True # Nested Aggregation + else: + self.is_agg_func = True - special_func = [*self.context.udf_map.keys(), *self.context.module_map.keys(), - "maxs", "mins", "avgs", "sums", "deltas"] - if self.context.special_gb: - special_func = [*special_func, *self.ext_aggfuncs] + op = self.operators[key] + count_distinct = False + if key == 'count' and type(val) is dict and 'distinct' in val: + count_distinct = True + val = val['distinct'] + val = enlist(val) + exp_vals = [expr(self, v, c_code = self.c_code) for v in val] + self.children = exp_vals + self.opname = key - if key in special_func and not self.is_special: - self.is_special = True - if key in self.context.udf_map: - self.root.udf_called = self.context.udf_map[key] - if self.is_udfexpr and key == self.root.udf.name: - self.root.is_recursive_call_inudf = True - elif key in user_module_func.keys(): - udf.try_init_udf(self.context) - # TODO: make udf_called a set! - p = self.parent - while type(p) is expr and not p.udf_called: - p.udf_called = self.udf_called - p = p.parent - p = self.parent - while type(p) is expr and not p.is_special: - p.is_special = True - p = p.parent + str_vals = [e.sql for e in exp_vals] + type_vals = [e.type for e in exp_vals] + is_compound = any([e.is_compound for e in exp_vals]) + if key in self.ext_aggfuncs: + self.is_compound = False + else: + self.is_compound = is_compound + try: + self.type = op.return_type(*type_vals) + except AttributeError as e: + if type(self.root) is not udf: + # TODO: do something when this is not an error + # print(f'alert: {e}') + pass + self.type = AnyT + + if count_distinct: # inject distinct col later + self.sql = f'{{{op(self.c_code, *str_vals, True)}}}' + else: + self.sql = op(self.c_code, *str_vals) + + special_func = [*self.context.udf_map.keys(), *self.context.module_map.keys(), + "maxs", "mins", "avgs", "sums", "deltas", "last"] + if self.context.special_gb: + special_func = [*special_func, *self.ext_aggfuncs] + + if key in special_func and not self.is_special: + self.is_special = True + if key in self.context.udf_map: + self.root.udf_called = self.context.udf_map[key] + if self.is_udfexpr and key == self.root.udf.name: + self.root.is_recursive_call_inudf = True + elif key in user_module_func.keys(): + udf.try_init_udf(self.context) + # TODO: make udf_called a set! + p = self.parent + while type(p) is expr and not p.udf_called: + p.udf_called = self.udf_called + p = p.parent + p = self.parent + while type(p) is expr and not p.is_special: + p.is_special = True + p = p.parent - need_decltypestr = any([e.need_decltypestr for e in exp_vals]) - if need_decltypestr or (self.udf_called and type(op) is udf): - decltypestr_vals = [e.udf_decltypecall for e in exp_vals] - self.udf_decltypecall = op(self.c_code, *decltypestr_vals) + need_decltypestr = any([e.need_decltypestr for e in exp_vals]) + if need_decltypestr or (self.udf_called and type(op) is udf): + decltypestr_vals = [e.udf_decltypecall for e in exp_vals] + self.udf_decltypecall = op(self.c_code, *decltypestr_vals) - if self.udf_called and type(op) is udf: - self.udf_decltypecall = op.decltypecall(self.c_code, *decltypestr_vals) - - elif self.is_udfexpr: - var_table = self.root.udf.var_table - vec = key.split('.') - _vars = [*var_table, *self.builtin_vars] - def get_vname (node): - if node in self.builtin_vars: - self.root.udf.builtin[node].enabled = True - self.builtin_var = node - return node + if self.udf_called and type(op) is udf: + self.udf_decltypecall = op.decltypecall(self.c_code, *decltypestr_vals) + + elif self.is_udfexpr: + var_table = self.root.udf.var_table + vec = key.split('.') + _vars = [*var_table, *self.builtin_vars] + def get_vname (node): + if node in self.builtin_vars: + self.root.udf.builtin[node].enabled = True + self.builtin_var = node + return node + else: + return var_table[node] + if vec[0] not in _vars: + # print(f'Use of undefined variable {vec[0]}') + # TODO: do something when this is not an error + pass else: - return var_table[node] - if vec[0] not in _vars: - # print(f'Use of undefined variable {vec[0]}') - # TODO: do something when this is not an error - pass + vname = get_vname(vec[0]) + val = enlist(val) + if(len(val) > 2): + print('Warning: more than 2 indexes found for subvec operator.') + ex = [expr(self, v, c_code = self.c_code) for v in val] + idxs = ', '.join([e.sql for e in ex]) + self.sql = f'{vname}.subvec({idxs})' + if any([e.need_decltypestr for e in ex]): + self.udf_decltypecall = f'{vname}.subvec({[", ".join([e.udf_decltypecall for e in ex])]})' + if key == 'get' and len(val) > 1: + ex_vname = expr(self, val[0], c_code=self.c_code) + self.sql = f'{ex_vname.sql}[{expr(self, val[1], c_code=self.c_code).sql}]' + if hasattr(ex_vname, 'builtin_var'): + if not hasattr(self, 'builtin_var'): + self.builtin_var = [] + self.builtin_var = [*self.builtin_var, *ex_vname.builtin_var] + self.udf_decltypecall = ex_vname.sql else: - vname = get_vname(vec[0]) - val = enlist(val) - if(len(val) > 2): - print('Warning: more than 2 indexes found for subvec operator.') - ex = [expr(self, v, c_code = self.c_code) for v in val] - idxs = ', '.join([e.sql for e in ex]) - self.sql = f'{vname}.subvec({idxs})' - if any([e.need_decltypestr for e in ex]): - self.udf_decltypecall = f'{vname}.subvec({[", ".join([e.udf_decltypecall for e in ex])]})' - if key == 'get' and len(val) > 1: - ex_vname = expr(self, val[0], c_code=self.c_code) - self.sql = f'{ex_vname.sql}[{expr(self, val[1], c_code=self.c_code).sql}]' - if hasattr(ex_vname, 'builtin_var'): - if not hasattr(self, 'builtin_var'): - self.builtin_var = [] - self.builtin_var = [*self.builtin_var, *ex_vname.builtin_var] - self.udf_decltypecall = ex_vname.sql - else: - print(f'Undefined expr: {key}{val}') + print(f'Undefined expr: {key}{val}') - elif type(node) is str: + if type(node) is str: if self.is_udfexpr: curr_udf : udf = self.root.udf var_table = curr_udf.var_table @@ -231,14 +234,29 @@ class expr(ast_node): self.raw_col = self.raw_col if type(self.raw_col) is ColRef else None if self.raw_col is not None: self.is_ColExpr = True - self.sql = self.raw_col.name + table_name = '' + if '.' in node: + table_name = self.raw_col.table.table_name + if self.raw_col.table.alias: + alias = iter(self.raw_col.table.alias) + try: + a = next(alias) + while(not a or a == table_name): + a = next(alias) + if (a and a != table_name): + table_name = a + except StopIteration: + pass + if table_name: + table_name = table_name + '.' + self.sql = table_name + self.raw_col.name self.type = self.raw_col.type self.is_compound = True self.opname = self.raw_col else: - self.sql = node + self.sql = '\'' + node + '\'' self.type = StrT - self.opname = node + self.opname = self.sql if self.c_code and self.datasource is not None: self.sql = f'{{y(\"{self.sql}\")}}' elif type(node) is bool: @@ -248,7 +266,7 @@ class expr(ast_node): self.sql = '1' if node else '0' else: self.sql = 'TRUE' if node else 'FALSE' - else: + elif type(node) is not dict: self.sql = f'{node}' self.opname = node if type(node) is int: diff --git a/server/io.cpp b/server/io.cpp index 9a527d4..c34dc42 100644 --- a/server/io.cpp +++ b/server/io.cpp @@ -181,7 +181,7 @@ namespace types { return !operator==(other); } bool time_t::validate() const{ - return hours < 24 && minutes < 60 && seconds < 60 && ms < 1000; + return hours < 24 && minutes < 60 && seconds < 60 && ms < 1000000; } timestamp_t::timestamp_t(const char* str) { fromString(str); } @@ -244,13 +244,13 @@ std::ostream& operator<<(std::ostream& os, types::timestamp_t & v) using std::string; -string base62uuid(int l = 8) { +string base62uuid(int l) { using namespace std; constexpr static const char* base62alp = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"; static mt19937_64 engine(chrono::system_clock::now().time_since_epoch().count()); static uniform_int_distribution u(0x10000, 0xfffff); uint64_t uuid = (u(engine) << 32ull) + (chrono::system_clock::now().time_since_epoch().count() & 0xffffffff); - printf("%llu\n", uuid); + //printf("%llu\n", uuid); string ret; while (uuid && l-- >= 0) { ret = string("") + base62alp[uuid % 62] + ret; diff --git a/server/types.h b/server/types.h index db79abe..43d2d1a 100644 --- a/server/types.h +++ b/server/types.h @@ -69,7 +69,7 @@ namespace types { time_t& fromString(const char*); bool validate() const; constexpr static unsigned string_length() { - return 13; + return 16; }; char* toString(char* buf) const; bool operator > (const time_t&) const; diff --git a/server/utils.h b/server/utils.h index cb58974..e54ed3c 100644 --- a/server/utils.h +++ b/server/utils.h @@ -1,5 +1,4 @@ #pragma once -#include #include #if ((defined(_MSVC_LANG) && _MSVC_LANG >= 201703L) || __cplusplus >= 201703L) constexpr static bool cpp_17 = true; @@ -13,4 +12,6 @@ inline const char* str(const T& v) { template <> inline const char* str(const bool& v) { return v ? "true" : "false"; -} \ No newline at end of file +} +#include +extern std::string base62uuid(int l = 6); \ No newline at end of file diff --git a/tests/best_profit.a b/tests/best_profit.a new file mode 100644 index 0000000..912399f --- /dev/null +++ b/tests/best_profit.a @@ -0,0 +1,41 @@ +-- please run datagen.get_stock_data() to generate data/stock.csv first + +-- create table ticks(ID varchar(10), timestamp int, tradeDate date, price int); + +-- LOAD DATA INFILE "data/stock.csv" +-- INTO TABLE ticks +-- FIELDS TERMINATED BY "," + +-- SELECT max(price-mins(price)) +-- FROM ticks ASSUMING ASC timestamp +-- WHERE ID="S" +-- AND tradeDate='2003-01-10' + +-- create table base(ID varchar(10), name varchar(10)); + +-- LOAD DATA INFILE "data/base.csv" +-- INTO TABLE base +-- FIELDS TERMINATED BY "," + +-- SELECT last(price) +-- FROM ticks t, base b +-- ASSUMING ASC name, ASC timestamp +-- WHERE t.ID=b.ID +-- AND name="x" + +create table TradedStocks(ID varchar(15), SeqNo int, TradeDate date, TimeStamp time, Type varchar(5)); +create table HistoricQuotes(ID varchar(15), TradeDate date, HighPrice real, LowPrice real, ClosePrice real, OpenPrice real, volume bigint); + +LOAD DATA INFILE "data/tick-price-file.csv" +INTO TABLE TradedStocks +FIELDS TERMINATED BY "|" +LOAD DATA INFILE "data/hist-price-file.csv" +INTO TABLE HistoricQuotes +FIELDS TERMINATED BY "|" + + +SELECT ts.ID, avgs(10, hq.ClosePrice) +FROM TradedStocks AS ts NATURAL JOIN + HistoricQuotes AS hq + ASSUMING ASC hq.TradeDate +GROUP BY ts.ID diff --git a/tests/datagen_jose/README b/tests/datagen_jose/README new file mode 100644 index 0000000..65c798c --- /dev/null +++ b/tests/datagen_jose/README @@ -0,0 +1,26 @@ +There are 2 programs included in this package: +1. histgen - generation program for historical data +2. tickgen - generation program for tick data + +histgen takes 2 parameters + a) Scale-factor: Number of securities for which data is to be generated + b) Depth of history: The number of days for which data is to be generated +and genertes 2 files + a) hist-base-file: contains all the static data + b) hist-price-file: contains the pricing info + c) hist-split-file: contains the split info + +tickgen also takes 2 parameters + a) Scale-factor: Number of securities for which data is to be generated + b) Ticks per day: A measure of the activity on a per security basis. e.g 100 would mean + that each security ticks about 100 time in a trading day +and generates 2 files + a) tick-base-file + b) tick-price-file + +run the programs without args to see the parameters required. + +------------------ +Make of programs: + +run the make program. It requires the C++ compiler (CC) to be in your path. diff --git a/tests/datagen_jose/RandGen.H b/tests/datagen_jose/RandGen.H new file mode 100644 index 0000000..4b9522e --- /dev/null +++ b/tests/datagen_jose/RandGen.H @@ -0,0 +1,61 @@ +// cmvc_id = %Z% %W% %I% %E% %U% + +#ifndef RandGenHEADER +#define RandGenHEADER + +/////////////////////////////////////////////////////////////////////////////// +// +// Copyright (c) 1997 Morgan Stanley & Co. Incorporated, All Rights Reserved +// +// Unpublished copyright. All rights reserved. This material contains +// proprietary information that shall be used or copied only within Morgan +// Stanley, except with written permission of Morgan Stanley. +// +// Module Name : RandGen.H +// Version : %Z% %W% %I% %E% %U% +// Project : DataGen +// Description : Random Number generator +// +/////////////////////////////////////////////////////////////////////////////// + +#include +#include +#include +using namespace std; + +class RandNumGen +{ +public: + RandNumGen(void) + { + struct timeval tv; + gettimeofday(&tv,0 ); + srand(tv.tv_sec + tv.tv_usec); + } + + RandNumGen(unsigned long seed_) + { + srand(seed_); + } + + ~RandNumGen(){} + + inline unsigned long operator() (void); + inline int operator() (int min_, int max_); +}; + +// Implementation of inline functions + +inline unsigned long RandNumGen::operator() (void) +{ + return rand(); +} + +inline int RandNumGen::operator() (int min_, int max_) +{ + unsigned long t = (*this)(); + int r = max_ - min_; + return (min_ + t % r); +} +#endif + diff --git a/tests/datagen_jose/Time.C b/tests/datagen_jose/Time.C new file mode 100644 index 0000000..5f852cb --- /dev/null +++ b/tests/datagen_jose/Time.C @@ -0,0 +1,71 @@ +// cmvc_id = %Z% %W% %I% %E% %U% + +#ifndef TimeIMPLEMENTATION +#define TimeIMPLEMENTATION + +/////////////////////////////////////////////////////////////////////////////// +// +// Copyright (c) 1997 Morgan Stanley & Co. Incorporated, All Rights Reserved +// +// Unpublished copyright. All rights reserved. This material contains +// proprietary information that shall be used or copied only within Morgan +// Stanley, except with written permission of Morgan Stanley. +// +// Module Name : Time.C +// Version : %Z% %W% %I% %E% %U% +// Project : +// Description : +// +/////////////////////////////////////////////////////////////////////////////// +#include +#include "Time.H" + +Time::Time(char *startTime_) +{ + sscanf(startTime_,"%d:%d:%d", &_hrs, &_mins, &_secs); + cout << "Hrs: " << _hrs << ",Mins: " << _mins << ",Secs: " << _secs << endl; +} + +Time::~Time() +{} + +Time &Time::operator++ (int) +{ + _secs++; + adjust(); + return *this; +} + +void Time::adjust(void) +{ + if (_secs >= 60) + { + _mins += _secs/60; + _secs = _secs%60; + } + + if (_mins >= 60) + { + _hrs += _mins/60; + _mins = _mins%60; + } + if (_hrs >= 24) _hrs = _hrs = _hrs%24; +} + +ostream &operator<< (ostream &os_, Time &that_) +{ + if (that_.hrs() < 10) os_ << "0"; + os_ << that_.hrs(); + os_ << ":" ; + + if (that_.mins() < 10) os_ << "0"; + os_ << that_.mins(); + os_ << ":" ; + + if (that_.secs() < 10) os_ << "0"; + os_ << that_.secs(); + return os_; +} + + +#endif diff --git a/tests/datagen_jose/Time.H b/tests/datagen_jose/Time.H new file mode 100644 index 0000000..0f8fc16 --- /dev/null +++ b/tests/datagen_jose/Time.H @@ -0,0 +1,43 @@ +// cmvc_id = %Z% %W% %I% %E% %U% + +#ifndef TimeHEADER +#define TimeHEADER + +/////////////////////////////////////////////////////////////////////////////// +// +// Copyright (c) 1997 Morgan Stanley & Co. Incorporated, All Rights Reserved +// +// Unpublished copyright. All rights reserved. This material contains +// proprietary information that shall be used or copied only within Morgan +// Stanley, except with written permission of Morgan Stanley. +// +// Module Name : Time.H +// Version : %Z% %W% %I% %E% %U% +// Project : +// Description : +// +/////////////////////////////////////////////////////////////////////////////// +#include + +using namespace std; + +class Time +{ +public: + Time(char *startTime_); + ~Time(); + + Time &operator++(int); + void adjust(void); + + int hrs(void) { return _hrs; } + int mins(void) { return _mins; } + int secs(void) { return _secs; } + +friend ostream &operator<< (ostream &os_, Time &that_); + +private: + int _hrs, _mins, _secs; +}; +#endif + diff --git a/tests/datagen_jose/cal.C b/tests/datagen_jose/cal.C new file mode 100644 index 0000000..cfff4e8 --- /dev/null +++ b/tests/datagen_jose/cal.C @@ -0,0 +1,79 @@ +#include +#include +#include + +#include "cal.H" +using namespace std; + +Calendar::Calendar(void) +{ + time_t clk = time(0); + struct tm *now = localtime(&clk); + _currdate = asJulianNumber(now->tm_mon+1, now->tm_mday, now->tm_year+1900); +} + +Calendar::~Calendar() +{} + +// year_ in yyyy format +unsigned int Calendar::asJulianNumber(int month_,int day_,int year_) +{ + unsigned long c,ya; + + if (month_>2) month_-=3; + else { month_+=9; year_--; } + c=year_/100; + ya=year_-100*c; + return ((146097*c)>>2)+((1461*ya)>>2)+(153*month_+2)/5+day_+1721119; +} + +void Calendar::split(int& month_,int& day_,int& year_) +{ + unsigned long d; + unsigned long j=_currdate-1721119; + year_=(int) (((j<<2)-1)/146097); + j=(j<<2)-1-146097*year_; + d=(j>>2); + j=((d<<2)+3)/1461; + d=(d<<2)+3-1461*j; + d=(d+4)>>2; + month_=(int)(5*d-3)/153; + d=5*d-3-153*month_; + day_=(int)((d+5)/5); + year_=(int)(100*year_+j); + if (month_<10) month_+=3; + else { month_-=9; year_++; } +} + +int Calendar::dayInWeek(void) +{ + return ((((_currdate+1)%7)+6)%7)+1; +} + +Calendar &Calendar::nextWeekday(void) +{ + (*this) += 1; + while (!isWeekday()) (*this)+= 1; + return *this; +} + +int Calendar::isWeekday(void) +{ + return (dayInWeek()<6)?1:0; +} + +Calendar &Calendar::operator+= (int incr_) +{ + _currdate += incr_; + return *this; +} + +ostream &operator<< (ostream &os_, Calendar &that_) +{ + int mo, day, year; + that_.split(mo,day,year); + os_ << year << "-" << mo << "-" << day; + // the below is a pain for monetdb + //os_ << mo << "/" << day << "/" << year; + return os_; +} diff --git a/tests/datagen_jose/cal.H b/tests/datagen_jose/cal.H new file mode 100644 index 0000000..78cf6cb --- /dev/null +++ b/tests/datagen_jose/cal.H @@ -0,0 +1,26 @@ +#ifndef _CAL_H_ +#define _CAL_H_ + +#include +using namespace std; + +class Calendar +{ +public: + Calendar(void); + ~Calendar(); + + unsigned int asJulianNumber(int month_, int day_, int year_); + int isWeekday(void); + Calendar &operator+= (int incr_); + +friend ostream &operator<< (ostream &os_,Calendar &that_); + + int dayInWeek(void); + Calendar &nextWeekday(void); + void split(int &mo_, int &day_, int &year_); +private: + unsigned int _currdate; +}; +#endif + diff --git a/tests/datagen_jose/cal.cpp b/tests/datagen_jose/cal.cpp new file mode 100644 index 0000000..56cd2c6 --- /dev/null +++ b/tests/datagen_jose/cal.cpp @@ -0,0 +1,76 @@ +#include +#include +#include + +#include "cal.H" + +Calendar::Calendar(void) +{ + time_t clk = time(0); + struct tm *now = localtime(&clk); + _currdate = asJulianNumber(now->tm_mon+1, now->tm_mday, now->tm_year+1900); +} + +Calendar::~Calendar() +{} + +// year_ in yyyy format +unsigned int Calendar::asJulianNumber(int month_,int day_,int year_) +{ + unsigned long c,ya; + + if (month_>2) month_-=3; + else { month_+=9; year_--; } + c=year_/100; + ya=year_-100*c; + return ((146097*c)>>2)+((1461*ya)>>2)+(153*month_+2)/5+day_+1721119; +} + +void Calendar::split(int& month_,int& day_,int& year_) +{ + unsigned long d; + unsigned long j=_currdate-1721119; + year_=(int) (((j<<2)-1)/146097); + j=(j<<2)-1-146097*year_; + d=(j>>2); + j=((d<<2)+3)/1461; + d=(d<<2)+3-1461*j; + d=(d+4)>>2; + month_=(int)(5*d-3)/153; + d=5*d-3-153*month_; + day_=(int)((d+5)/5); + year_=(int)(100*year_+j); + if (month_<10) month_+=3; + else { month_-=9; year_++; } +} + +int Calendar::dayInWeek(void) +{ + return ((((_currdate+1)%7)+6)%7)+1; +} + +Calendar &Calendar::nextWeekday(void) +{ + (*this) += 1; + while (!isWeekday()) (*this)+= 1; + return *this; +} + +int Calendar::isWeekday(void) +{ + return (dayInWeek()<6)?1:0; +} + +Calendar &Calendar::operator+= (int incr_) +{ + _currdate += incr_; + return *this; +} + +ostream &operator<< (ostream &os_, Calendar &that_) +{ + int mo, day, year; + that_.split(mo,day,year); + os_ << mo << "/" << day << "/" << year; + return os_; +} diff --git a/tests/datagen_jose/gen.C b/tests/datagen_jose/gen.C new file mode 100644 index 0000000..aeb34f6 --- /dev/null +++ b/tests/datagen_jose/gen.C @@ -0,0 +1,58 @@ +// cmvc_id = %Z% %W% %I% %E% %U% + +#ifndef genIMPLEMENTATION +#define genIMPLEMENTATION + +#include +#include "RandGen.H" + +int num[6]; +int nelems=0; + +int member(int a_) +{ + for (int i=0; i" << endl; + return 1; + } + + int k = atoi(av[1]); + for(int i=0; i +#include +#include +#include +#include +#include + +#include "RandGen.H" +#include "cal.H" +using namespace std; +inline int max(int a, int b) +{ + return (a>b)?a:b; +} + +inline int min(int a, int b) +{ + return (a [depth - in days. Default = 4000days]" << endl; + return 1; + } + + int scale=0, ndays = 4000; + if (ac >= 2) scale = atoi(av[1]); + if (ac >= 3) ndays = atoi(av[2]); + + // Generation of base info + int nex = 5; + char *ex[] = { "NY", "O", "AM", "LN", "TK"}; + + int nsic = 10; + char *sic[] = { "COMPUTERS", "CHEMICALS", "FINANCIAL", "INDUSTRIAL", "PHARMACEUTICALS", + "MEDICAL", "BANKING", "SOFTWARE", "ENTERTAINMENT", "CONSTRUCTION" }; + + char *cu[] = { "USD", "DEM", "JPY", "FFR", "GBP"}; + int ncu = 5; + + char *spr[] = { "AAA", "AA", "A", "BBB", "BB", "B", "CCC", "CC", "C"}; + int nspr = 9; + + + unsigned int rnum; + char id[100]; + char descr[256]; + // the below is a pain with monetdb, changing to better date format + //char *crdate = "3/11/1999"; + char *crdate = "1999-11-03"; + + basefile << "Id|Ex|Descr|SIC|SPR|Cu|CreateDate" << endl; + for (i=0; i 2.0*minop[k]) + { + int splitfactor = rg(1,4); + op[k] /= (double)splitfactor; + vs[k] *= splitfactor; + + splitfile << id; + splitfile << "|" << cal; + splitfile << "|" << cal; + splitfile << "|" << splitfactor; + splitfile << endl; + } + + // check dividends + if (op[k] > minop[k]) + { + // dividend as a fraction of current closing price + double dividend = (rg(1, 100) / 100.0) * cp; + + dividendfile << id; + dividendfile << "|" << cal; + dividendfile << "|" << dividend; + // assumes announced and disbursed same day, + // queries can be trivially modified to do away with this assumption + dividendfile << "|" << cal; + dividendfile << endl; + } + } + + + } + splitfile.close(); + pricefile.close(); + dividendfile.close(); +} + + +#endif diff --git a/tests/datagen_jose/makefile b/tests/datagen_jose/makefile new file mode 100644 index 0000000..a4c16cc --- /dev/null +++ b/tests/datagen_jose/makefile @@ -0,0 +1,20 @@ +.PHONY: clean + +all: histgen tickgen + +clean: + rm -rf *.o histgen tickgen + +%.o: %.C + g++-12 -Ofast -march=native -g -c $< + +tickgen: cal.o Time.o tickgen.o + g++-12 -lstdc++ -Ofast -march=native -flto -o tickgen cal.o Time.o tickgen.o + +histgen: cal.o histgen.o + g++-12 -lstdc++ -Ofast -flto -march=native -o histgen cal.o histgen.o + +timetest: Time.o timetest.o + g++-12 -lstdc++ -g -o timetest Time.o timetest.o + + diff --git a/tests/datagen_jose/tickgen.C b/tests/datagen_jose/tickgen.C new file mode 100644 index 0000000..d332633 --- /dev/null +++ b/tests/datagen_jose/tickgen.C @@ -0,0 +1,197 @@ +// cmvc_id = %Z% %W% %I% %E% %U% + +#ifndef histgenIMPLEMENTATION +#define histgenIMPLEMENTATION + +#include +#include +#include +#include +#include +#include + +#include "RandGen.H" +#include "cal.H" +#include "Time.H" + +using namespace std; + +inline int max(int a, int b) +{ + return (a>b)?a:b; +} + +inline int min(int a, int b) +{ + return (a [t -ticks per day.Default=100] [d - no of days.Default=90]" << endl; + return 1; + } + + int n=0,t=100,days=90; + + if (ac >= 2) n = atoi(av[1]); + if (ac >= 3) t = atoi(av[2]); + if (ac >= 4) days = atoi(av[3]); + + + int tps = (n*t)/28800; // ticks per second + tps++; + cout << "Ticks per second: " << tps << endl; + + // Generation of base info + int nex = 5; + char *ex[] = { "NY", "O", "AM", "LN", "TK"}; + + int nsic = 10; + char *sic[] = { "COMPUTERS", "CHEMICALS", "FINANCIAL", "INDUSTRIAL", "PHARMACEUTICALS", + "MEDICAL", "BANKING", "SOFTWARE", "ENTERTAINMENT", "CONSTRUCTION" }; + + char *cu[] = { "USD", "DEM", "JPY", "FFR", "GBP"}; + int ncu = 5; + + char *spr[] = { "AAA", "AA", "A", "BBB", "BB", "B", "CCC", "CC", "C"}; + int nspr = 9; + + + unsigned int rnum; + char id[100]; + char descr[256]; + char *crdate = "3/11/1999"; + + basefile << "Id | Ex | Descr | SIC | Cu" << endl; + + for (i=0; i 5)? +1:-1; + bp[sec] = (dir*rg(0,3)*tick)+bp[sec]; + int bs = rg(1,100) * 100; + pricefile << id; + pricefile << "|" << ++seq[sec]; + pricefile << "|" << cal; + pricefile << "|" << tm; + pricefile << "|Q" << endl; + break; + } + case 3: // cancel/correct as a trade + { + if (rg(0,100) < 5) break; + double tp = bp[sec]; + int ts = rg(1,100) * 100; + pricefile << id; + pricefile << "|" << ++seq[sec]; + pricefile << "|" << cal; + pricefile << "|" << tm; + pricefile << "|CT" << endl; + break; + } + default: + break; + } + } + // Go to the next second + tm++; + } + cal.nextWeekday(); + } + pricefile.close(); +} + + +#endif diff --git a/tests/stock.a b/tests/stock.a index a2ba8b5..ff7b6df 100644 --- a/tests/stock.a +++ b/tests/stock.a @@ -30,3 +30,4 @@ SELECT price, timestamp FROM stocks where price - timestamp > 1 and not (price*t SELECT max(price-mins(price)) FROM stocks ASSUMING DESC timestamp +