Bug fixes for alias&join. Add test in presentation.

dev
Bill 2 years ago
parent 2614d010da
commit ff63be720c

3
.gitignore vendored

@ -1,3 +1,6 @@
tests/datagen_jose/histgen
tests/datagen_jose/tickgen
datagen
*.dSYM
testmain.lib
testmain.exp

@ -2,7 +2,7 @@
## GLOBAL CONFIGURATION FLAGS
version_string = '0.4.4a'
version_string = '0.4.5a'
add_path_to_ldpath = True
rebuild_backend = False
run_backend = True

@ -44,6 +44,7 @@ LEFT = keyword("left")
LIKE = keyword("like")
LIMIT = keyword("limit").suppress()
MINUS = keyword("minus")
NATURAL = keyword("natural")
OFFSET = keyword("offset").suppress()
ON = keyword("on").suppress()
ORDER = keyword("order").suppress()
@ -145,7 +146,7 @@ VIEW = keyword("view")
joins = (
(
Optional(CROSS | OUTER | INNER | ((FULL | LEFT | RIGHT) + Optional(INNER | OUTER)))
Optional(CROSS | OUTER | INNER | NATURAL | ((FULL | LEFT | RIGHT) + Optional(INNER | OUTER)))
+ JOIN
+ Optional(LATERAL)
)
@ -214,6 +215,7 @@ RESERVED = MatchFirst([
LIKE,
LIMIT,
MINUS,
NATURAL,
NOCASE,
NOT,
NULL,
@ -253,6 +255,7 @@ EQ = Char("=").suppress()
join_keywords = {
"join",
"natural join",
"full join",
"cross join",
"inner join",

@ -323,9 +323,12 @@ def parser(literal_string, ident, sqlserver=False):
table_source = Forward()
assumption = Group((ASC|DESC) ("sort") + var_name("value"))
assumptions = Optional(ASSUMING.suppress() + Group(delimited_list(assumption)))
join = (
Group(joins)("op")
+ table_source("join")
+ (table_source )("join")
+ Optional((ON + expr("on")) | (USING + expr("using")))
| (
Group(WINDOW)("op")
@ -403,7 +406,12 @@ def parser(literal_string, ident, sqlserver=False):
| selection
+ Optional(INTO + table_source("into"))
+ Optional(
(FROM + delimited_list(table_source) + ZeroOrMore(join))("from")
(
FROM
+ (delimited_list(table_source)
+ ZeroOrMore(join))("table_source")
+ Optional(assumptions) ("assumptions")
)("from")
+ Optional(WHERE + expr("where"))
+ Optional(GROUP_BY + delimited_list(Group(named_column))("groupby"))
+ Optional(HAVING + expr("having"))
@ -443,12 +451,8 @@ def parser(literal_string, ident, sqlserver=False):
+ RB,
)
assumption = Group((ASC|DESC) ("sort") + var_name("value"))
assumptions = (ASSUMING + Group(delimited_list(assumption))("assumptions"))
table_source << Group(
((LB + query + RB) | stack | call_function | var_name)("value")
+ Optional(assumptions)
+ Optional(flag("with ordinality"))
+ Optional(tablesample)
+ alias

@ -37,7 +37,7 @@ void permutation(int *v, int n) {
}
}
int main(int argc, char* argv[])
int gen_trade_data(int argc, char* argv[])
{
using std::vector;
float frac = .3;
@ -108,3 +108,48 @@ int main(int argc, char* argv[])
fclose(fp);
return 0;
}
#include "./server/utils.h"
#include "./server/types.h"
#include <string>
types::date_t rand_date(){
unsigned char d = ui(engine) % 28 + 1;
unsigned char m = ui(engine) % 12 + 1;
short y = ui(engine) % 40 + 1990;
if (ui(engine) % 2) return types::date_t((unsigned char)10, (unsigned char)1, 2003);
return types::date_t{d, m, y};
}
int gen_stock_data(int argc, char* argv[]){
using std::string;
using namespace types;
int n_stocks = 5;
int n_data = 1000;
string* IDs = new string[n_stocks + 1];
string* names = new string[n_stocks + 1];
for(int i = 0; i < n_stocks; ++i){
IDs[i] = base62uuid();
names[i] = base62uuid();
}
IDs[n_stocks] = "S";
names[n_stocks] = "x";
FILE* fp = fopen("./data/stock.csv", "w");
fprintf(fp, "ID, timestamp, tradeDate, price\n");
char date_str_buf [types::date_t::string_length()];
int* timestamps = new int[n_data];
for(int i = 0; i < n_data; ++i) timestamps[i] = i+1;
permutation(timestamps, n_data);
for(int i = 0; i < n_data; ++i){
auto date = rand_date().toString(date_str_buf + date_t::string_length());
fprintf(fp, "%s,%d,%s,%d\n", IDs[ui(engine)%(n_stocks + 1)].c_str(), timestamps[i], date, ui(engine) % 1000);
}
fclose(fp);
fp = fopen("./data/base.csv", "w");
fprintf(fp, "ID, name\n");
for(int i = 0; i < n_stocks + 1; ++ i){
fprintf(fp, "%s,%s\n", IDs[i].c_str(), names[i].c_str());
}
fclose(fp);
}
int main(int argc, char* argv[]){
gen_stock_data(argc, argv);
}

@ -29,7 +29,7 @@ class projection(ast_node):
def spawn(self, node):
self.datasource = None
if 'from' in node:
from_clause = node['from']
from_clause = node['from']['table_source']
if type(from_clause) is list:
# from joins
join(self, from_clause)
@ -47,8 +47,8 @@ class projection(ast_node):
self.datasource = self.context.tables_byname[value]
if 'name' in value:
self.datasource.add_alias(value['name'])
if 'assumptions' in from_clause:
self.assumptions = enlist(from_clause['assumptions'])
if 'assuming' in node['from']:
self.assumptions = enlist(node['from']['assuming'])
elif type(from_clause) is str:
self.datasource = self.context.tables_byname[from_clause]

@ -73,7 +73,7 @@ class projection(ast_node):
self.datasource = join(self, [], self.context) # datasource is Join instead of TableInfo
self.assumptions = []
if 'from' in node:
from_clause = node['from']
from_clause = node['from']['table_source']
self.datasource = join(self, from_clause)
if 'assumptions' in from_clause:
self.assumptions = enlist(from_clause['assumptions'])
@ -129,12 +129,17 @@ class projection(ast_node):
if not proj_expr.is_special:
y = lambda x:x
name = eval('f\'' + name + '\'')
offset = len(col_exprs)
if name not in self.var_table:
self.var_table[name] = len(col_exprs)
proj_map[i] = [this_type, len(col_exprs), proj_expr]
self.var_table[name] = offset
if proj_expr.is_ColExpr and type(proj_expr.raw_col) is ColRef:
for n in (proj_expr.raw_col.table.alias):
self.var_table[f'{n}.'+name] = offset
proj_map[i] = [this_type, offset, proj_expr]
col_expr = name + ' AS ' + alias if alias else name
if alias:
self.var_table[alias] = len(col_exprs)
self.var_table[alias] = offset
col_exprs.append((col_expr, proj_expr.type))
else:
self.context.headers.add('"./server/aggregations.h"')
@ -164,10 +169,12 @@ class projection(ast_node):
self.add(', '.join([c[0] for c in col_exprs] + col_ext_names))
_base_offset = len(col_exprs)
for i, col in enumerate(col_ext_names):
if col not in self.var_table:
self.var_table[col] = i + _base_offset
for i, col in enumerate(self.col_ext):
if col.name not in self.var_table:
offset = i + _base_offset
self.var_table[col.name] = offset
for n in (col.table.alias):
self.var_table[f'{n}.'+col.name] = offset
def finialize(astnode:ast_node):
if(astnode is not None):
@ -548,24 +555,25 @@ class join(ast_node):
self.tables_dir = dict()
self.rec = None
self.top_level = self.parent and type(self.parent) is projection
self.have_sep = False
# self.tmp_name = 'join_' + base62uuid(4)
# self.datasource = TableInfo(self.tmp_name, [], self.context)
def append(self, tbls, __alias = ''):
alias = lambda t : '(' + t + ') ' + __alias if len(__alias) else t
alias = lambda t : t + ' ' + __alias if len(__alias) else t
if type(tbls) is join:
self.joins.append(alias(tbls.__str__()))
self.joins.append((alias(tbls.__str__()), tbls.have_sep))
self.tables += tbls.tables
self.tables_dir = {**self.tables_dir, **tbls.tables_dir}
elif type(tbls) is TableInfo:
self.joins.append(alias(tbls.table_name))
self.joins.append((alias(tbls.table_name), False))
self.tables.append(tbls)
self.tables_dir[tbls.table_name] = tbls
for a in tbls.alias:
self.tables_dir[a] = tbls
elif type(tbls) is projection:
self.joins.append(alias(tbls.finalize()))
self.joins.append((alias(tbls.finalize()), False))
def produce(self, node):
if type(node) is list:
@ -589,13 +597,14 @@ class join(ast_node):
tbl.add_alias(node['name'])
self.append(tbl, alias)
else:
keys = node.keys()
keys = list(node.keys())
if keys[0].lower().endswith('join'):
self.have_sep = True
j = join(self, node[keys[0]])
tablename = f' {keys[0]} {j}'
if keys[1].lower() == 'on':
if len(keys) > 1 and keys[1].lower() == 'on':
tablename += f' on {expr(self, node[keys[1]])}'
self.joins.append(tablename)
self.joins.append((tablename, self.have_sep))
self.tables += j.tables
self.tables_dir = {**self.tables_dir, **j.tables_dir}
@ -622,18 +631,27 @@ class join(ast_node):
if datasource is None:
raise ValueError(f'Table name/alias not defined{parsedColExpr[0]}')
else:
return datasource.parse_col_names(parsedColExpr[1])
datasource.rec = self.rec
ret = datasource.parse_col_names(parsedColExpr[1])
datasource.rec = None
return ret
@property
def all_cols(self):
return set([c for t in self.tables for c in t.columns])
def consume(self, node):
self.sql = ', '.join(self.joins)
self.sql = ''
for j in self.joins:
if not self.sql or j[1]:
self.sql += j[0]
else:
self.sql += ', ' + j[0]
if node and self.sql and self.top_level:
self.sql = ' FROM ' + self.sql
return super().consume(node)
def __str__(self):
return ', '.join(self.joins)
return self.sql
def __repr__(self):
return self.__str__()

@ -87,116 +87,119 @@ class expr(ast_node):
from reconstruct.ast import udf
if type(node) is dict:
if len(node) > 1:
print(f'Parser Error: {node} has more than 1 dict entry.')
if 'literal' in node:
node = node['literal']
else:
if len(node) > 1:
print(f'Parser Error: {node} has more than 1 dict entry.')
for key, val in node.items():
if key in self.operators:
if key in builtin_func:
if self.is_agg_func:
self.root.is_special = True # Nested Aggregation
else:
self.is_agg_func = True
op = self.operators[key]
count_distinct = False
if key == 'count' and type(val) is dict and 'distinct' in val:
count_distinct = True
val = val['distinct']
val = enlist(val)
exp_vals = [expr(self, v, c_code = self.c_code) for v in val]
self.children = exp_vals
self.opname = key
str_vals = [e.sql for e in exp_vals]
type_vals = [e.type for e in exp_vals]
is_compound = any([e.is_compound for e in exp_vals])
if key in self.ext_aggfuncs:
self.is_compound = False
else:
self.is_compound = is_compound
try:
self.type = op.return_type(*type_vals)
except AttributeError as e:
if type(self.root) is not udf:
# TODO: do something when this is not an error
# print(f'alert: {e}')
pass
self.type = AnyT
if count_distinct: # inject distinct col later
self.sql = f'{{{op(self.c_code, *str_vals, True)}}}'
else:
self.sql = op(self.c_code, *str_vals)
for key, val in node.items():
if key in self.operators:
if key in builtin_func:
if self.is_agg_func:
self.root.is_special = True # Nested Aggregation
else:
self.is_agg_func = True
special_func = [*self.context.udf_map.keys(), *self.context.module_map.keys(),
"maxs", "mins", "avgs", "sums", "deltas"]
if self.context.special_gb:
special_func = [*special_func, *self.ext_aggfuncs]
op = self.operators[key]
count_distinct = False
if key == 'count' and type(val) is dict and 'distinct' in val:
count_distinct = True
val = val['distinct']
val = enlist(val)
exp_vals = [expr(self, v, c_code = self.c_code) for v in val]
self.children = exp_vals
self.opname = key
if key in special_func and not self.is_special:
self.is_special = True
if key in self.context.udf_map:
self.root.udf_called = self.context.udf_map[key]
if self.is_udfexpr and key == self.root.udf.name:
self.root.is_recursive_call_inudf = True
elif key in user_module_func.keys():
udf.try_init_udf(self.context)
# TODO: make udf_called a set!
p = self.parent
while type(p) is expr and not p.udf_called:
p.udf_called = self.udf_called
p = p.parent
p = self.parent
while type(p) is expr and not p.is_special:
p.is_special = True
p = p.parent
str_vals = [e.sql for e in exp_vals]
type_vals = [e.type for e in exp_vals]
is_compound = any([e.is_compound for e in exp_vals])
if key in self.ext_aggfuncs:
self.is_compound = False
else:
self.is_compound = is_compound
try:
self.type = op.return_type(*type_vals)
except AttributeError as e:
if type(self.root) is not udf:
# TODO: do something when this is not an error
# print(f'alert: {e}')
pass
self.type = AnyT
if count_distinct: # inject distinct col later
self.sql = f'{{{op(self.c_code, *str_vals, True)}}}'
else:
self.sql = op(self.c_code, *str_vals)
special_func = [*self.context.udf_map.keys(), *self.context.module_map.keys(),
"maxs", "mins", "avgs", "sums", "deltas", "last"]
if self.context.special_gb:
special_func = [*special_func, *self.ext_aggfuncs]
if key in special_func and not self.is_special:
self.is_special = True
if key in self.context.udf_map:
self.root.udf_called = self.context.udf_map[key]
if self.is_udfexpr and key == self.root.udf.name:
self.root.is_recursive_call_inudf = True
elif key in user_module_func.keys():
udf.try_init_udf(self.context)
# TODO: make udf_called a set!
p = self.parent
while type(p) is expr and not p.udf_called:
p.udf_called = self.udf_called
p = p.parent
p = self.parent
while type(p) is expr and not p.is_special:
p.is_special = True
p = p.parent
need_decltypestr = any([e.need_decltypestr for e in exp_vals])
if need_decltypestr or (self.udf_called and type(op) is udf):
decltypestr_vals = [e.udf_decltypecall for e in exp_vals]
self.udf_decltypecall = op(self.c_code, *decltypestr_vals)
need_decltypestr = any([e.need_decltypestr for e in exp_vals])
if need_decltypestr or (self.udf_called and type(op) is udf):
decltypestr_vals = [e.udf_decltypecall for e in exp_vals]
self.udf_decltypecall = op(self.c_code, *decltypestr_vals)
if self.udf_called and type(op) is udf:
self.udf_decltypecall = op.decltypecall(self.c_code, *decltypestr_vals)
elif self.is_udfexpr:
var_table = self.root.udf.var_table
vec = key.split('.')
_vars = [*var_table, *self.builtin_vars]
def get_vname (node):
if node in self.builtin_vars:
self.root.udf.builtin[node].enabled = True
self.builtin_var = node
return node
if self.udf_called and type(op) is udf:
self.udf_decltypecall = op.decltypecall(self.c_code, *decltypestr_vals)
elif self.is_udfexpr:
var_table = self.root.udf.var_table
vec = key.split('.')
_vars = [*var_table, *self.builtin_vars]
def get_vname (node):
if node in self.builtin_vars:
self.root.udf.builtin[node].enabled = True
self.builtin_var = node
return node
else:
return var_table[node]
if vec[0] not in _vars:
# print(f'Use of undefined variable {vec[0]}')
# TODO: do something when this is not an error
pass
else:
return var_table[node]
if vec[0] not in _vars:
# print(f'Use of undefined variable {vec[0]}')
# TODO: do something when this is not an error
pass
vname = get_vname(vec[0])
val = enlist(val)
if(len(val) > 2):
print('Warning: more than 2 indexes found for subvec operator.')
ex = [expr(self, v, c_code = self.c_code) for v in val]
idxs = ', '.join([e.sql for e in ex])
self.sql = f'{vname}.subvec({idxs})'
if any([e.need_decltypestr for e in ex]):
self.udf_decltypecall = f'{vname}.subvec({[", ".join([e.udf_decltypecall for e in ex])]})'
if key == 'get' and len(val) > 1:
ex_vname = expr(self, val[0], c_code=self.c_code)
self.sql = f'{ex_vname.sql}[{expr(self, val[1], c_code=self.c_code).sql}]'
if hasattr(ex_vname, 'builtin_var'):
if not hasattr(self, 'builtin_var'):
self.builtin_var = []
self.builtin_var = [*self.builtin_var, *ex_vname.builtin_var]
self.udf_decltypecall = ex_vname.sql
else:
vname = get_vname(vec[0])
val = enlist(val)
if(len(val) > 2):
print('Warning: more than 2 indexes found for subvec operator.')
ex = [expr(self, v, c_code = self.c_code) for v in val]
idxs = ', '.join([e.sql for e in ex])
self.sql = f'{vname}.subvec({idxs})'
if any([e.need_decltypestr for e in ex]):
self.udf_decltypecall = f'{vname}.subvec({[", ".join([e.udf_decltypecall for e in ex])]})'
if key == 'get' and len(val) > 1:
ex_vname = expr(self, val[0], c_code=self.c_code)
self.sql = f'{ex_vname.sql}[{expr(self, val[1], c_code=self.c_code).sql}]'
if hasattr(ex_vname, 'builtin_var'):
if not hasattr(self, 'builtin_var'):
self.builtin_var = []
self.builtin_var = [*self.builtin_var, *ex_vname.builtin_var]
self.udf_decltypecall = ex_vname.sql
else:
print(f'Undefined expr: {key}{val}')
print(f'Undefined expr: {key}{val}')
elif type(node) is str:
if type(node) is str:
if self.is_udfexpr:
curr_udf : udf = self.root.udf
var_table = curr_udf.var_table
@ -231,14 +234,29 @@ class expr(ast_node):
self.raw_col = self.raw_col if type(self.raw_col) is ColRef else None
if self.raw_col is not None:
self.is_ColExpr = True
self.sql = self.raw_col.name
table_name = ''
if '.' in node:
table_name = self.raw_col.table.table_name
if self.raw_col.table.alias:
alias = iter(self.raw_col.table.alias)
try:
a = next(alias)
while(not a or a == table_name):
a = next(alias)
if (a and a != table_name):
table_name = a
except StopIteration:
pass
if table_name:
table_name = table_name + '.'
self.sql = table_name + self.raw_col.name
self.type = self.raw_col.type
self.is_compound = True
self.opname = self.raw_col
else:
self.sql = node
self.sql = '\'' + node + '\''
self.type = StrT
self.opname = node
self.opname = self.sql
if self.c_code and self.datasource is not None:
self.sql = f'{{y(\"{self.sql}\")}}'
elif type(node) is bool:
@ -248,7 +266,7 @@ class expr(ast_node):
self.sql = '1' if node else '0'
else:
self.sql = 'TRUE' if node else 'FALSE'
else:
elif type(node) is not dict:
self.sql = f'{node}'
self.opname = node
if type(node) is int:

@ -181,7 +181,7 @@ namespace types {
return !operator==(other);
}
bool time_t::validate() const{
return hours < 24 && minutes < 60 && seconds < 60 && ms < 1000;
return hours < 24 && minutes < 60 && seconds < 60 && ms < 1000000;
}
timestamp_t::timestamp_t(const char* str) { fromString(str); }
@ -244,13 +244,13 @@ std::ostream& operator<<(std::ostream& os, types::timestamp_t & v)
using std::string;
string base62uuid(int l = 8) {
string base62uuid(int l) {
using namespace std;
constexpr static const char* base62alp = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
static mt19937_64 engine(chrono::system_clock::now().time_since_epoch().count());
static uniform_int_distribution<uint64_t> u(0x10000, 0xfffff);
uint64_t uuid = (u(engine) << 32ull) + (chrono::system_clock::now().time_since_epoch().count() & 0xffffffff);
printf("%llu\n", uuid);
//printf("%llu\n", uuid);
string ret;
while (uuid && l-- >= 0) {
ret = string("") + base62alp[uuid % 62] + ret;

@ -69,7 +69,7 @@ namespace types {
time_t& fromString(const char*);
bool validate() const;
constexpr static unsigned string_length() {
return 13;
return 16;
};
char* toString(char* buf) const;
bool operator > (const time_t&) const;

@ -1,5 +1,4 @@
#pragma once
#include <string>
#include <ctime>
#if ((defined(_MSVC_LANG) && _MSVC_LANG >= 201703L) || __cplusplus >= 201703L)
constexpr static bool cpp_17 = true;
@ -13,4 +12,6 @@ inline const char* str(const T& v) {
template <>
inline const char* str(const bool& v) {
return v ? "true" : "false";
}
}
#include<string>
extern std::string base62uuid(int l = 6);

@ -0,0 +1,41 @@
-- please run datagen.get_stock_data() to generate data/stock.csv first
-- create table ticks(ID varchar(10), timestamp int, tradeDate date, price int);
-- LOAD DATA INFILE "data/stock.csv"
-- INTO TABLE ticks
-- FIELDS TERMINATED BY ","
-- SELECT max(price-mins(price))
-- FROM ticks ASSUMING ASC timestamp
-- WHERE ID="S"
-- AND tradeDate='2003-01-10'
-- create table base(ID varchar(10), name varchar(10));
-- LOAD DATA INFILE "data/base.csv"
-- INTO TABLE base
-- FIELDS TERMINATED BY ","
-- SELECT last(price)
-- FROM ticks t, base b
-- ASSUMING ASC name, ASC timestamp
-- WHERE t.ID=b.ID
-- AND name="x"
create table TradedStocks(ID varchar(15), SeqNo int, TradeDate date, TimeStamp time, Type varchar(5));
create table HistoricQuotes(ID varchar(15), TradeDate date, HighPrice real, LowPrice real, ClosePrice real, OpenPrice real, volume bigint);
LOAD DATA INFILE "data/tick-price-file.csv"
INTO TABLE TradedStocks
FIELDS TERMINATED BY "|"
LOAD DATA INFILE "data/hist-price-file.csv"
INTO TABLE HistoricQuotes
FIELDS TERMINATED BY "|"
SELECT ts.ID, avgs(10, hq.ClosePrice)
FROM TradedStocks AS ts NATURAL JOIN
HistoricQuotes AS hq
ASSUMING ASC hq.TradeDate
GROUP BY ts.ID

@ -0,0 +1,26 @@
There are 2 programs included in this package:
1. histgen - generation program for historical data
2. tickgen - generation program for tick data
histgen takes 2 parameters
a) Scale-factor: Number of securities for which data is to be generated
b) Depth of history: The number of days for which data is to be generated
and genertes 2 files
a) hist-base-file: contains all the static data
b) hist-price-file: contains the pricing info
c) hist-split-file: contains the split info
tickgen also takes 2 parameters
a) Scale-factor: Number of securities for which data is to be generated
b) Ticks per day: A measure of the activity on a per security basis. e.g 100 would mean
that each security ticks about 100 time in a trading day
and generates 2 files
a) tick-base-file
b) tick-price-file
run the programs without args to see the parameters required.
------------------
Make of programs:
run the make program. It requires the C++ compiler (CC) to be in your path.

@ -0,0 +1,61 @@
// cmvc_id = %Z% %W% %I% %E% %U%
#ifndef RandGenHEADER
#define RandGenHEADER
///////////////////////////////////////////////////////////////////////////////
//
// Copyright (c) 1997 Morgan Stanley & Co. Incorporated, All Rights Reserved
//
// Unpublished copyright. All rights reserved. This material contains
// proprietary information that shall be used or copied only within Morgan
// Stanley, except with written permission of Morgan Stanley.
//
// Module Name : RandGen.H
// Version : %Z% %W% %I% %E% %U%
// Project : DataGen
// Description : Random Number generator
//
///////////////////////////////////////////////////////////////////////////////
#include <stdlib.h>
#include <iostream>
#include <sys/time.h>
using namespace std;
class RandNumGen
{
public:
RandNumGen(void)
{
struct timeval tv;
gettimeofday(&tv,0 );
srand(tv.tv_sec + tv.tv_usec);
}
RandNumGen(unsigned long seed_)
{
srand(seed_);
}
~RandNumGen(){}
inline unsigned long operator() (void);
inline int operator() (int min_, int max_);
};
// Implementation of inline functions
inline unsigned long RandNumGen::operator() (void)
{
return rand();
}
inline int RandNumGen::operator() (int min_, int max_)
{
unsigned long t = (*this)();
int r = max_ - min_;
return (min_ + t % r);
}
#endif

@ -0,0 +1,71 @@
// cmvc_id = %Z% %W% %I% %E% %U%
#ifndef TimeIMPLEMENTATION
#define TimeIMPLEMENTATION
///////////////////////////////////////////////////////////////////////////////
//
// Copyright (c) 1997 Morgan Stanley & Co. Incorporated, All Rights Reserved
//
// Unpublished copyright. All rights reserved. This material contains
// proprietary information that shall be used or copied only within Morgan
// Stanley, except with written permission of Morgan Stanley.
//
// Module Name : Time.C
// Version : %Z% %W% %I% %E% %U%
// Project :
// Description :
//
///////////////////////////////////////////////////////////////////////////////
#include <stdio.h>
#include "Time.H"
Time::Time(char *startTime_)
{
sscanf(startTime_,"%d:%d:%d", &_hrs, &_mins, &_secs);
cout << "Hrs: " << _hrs << ",Mins: " << _mins << ",Secs: " << _secs << endl;
}
Time::~Time()
{}
Time &Time::operator++ (int)
{
_secs++;
adjust();
return *this;
}
void Time::adjust(void)
{
if (_secs >= 60)
{
_mins += _secs/60;
_secs = _secs%60;
}
if (_mins >= 60)
{
_hrs += _mins/60;
_mins = _mins%60;
}
if (_hrs >= 24) _hrs = _hrs = _hrs%24;
}
ostream &operator<< (ostream &os_, Time &that_)
{
if (that_.hrs() < 10) os_ << "0";
os_ << that_.hrs();
os_ << ":" ;
if (that_.mins() < 10) os_ << "0";
os_ << that_.mins();
os_ << ":" ;
if (that_.secs() < 10) os_ << "0";
os_ << that_.secs();
return os_;
}
#endif

@ -0,0 +1,43 @@
// cmvc_id = %Z% %W% %I% %E% %U%
#ifndef TimeHEADER
#define TimeHEADER
///////////////////////////////////////////////////////////////////////////////
//
// Copyright (c) 1997 Morgan Stanley & Co. Incorporated, All Rights Reserved
//
// Unpublished copyright. All rights reserved. This material contains
// proprietary information that shall be used or copied only within Morgan
// Stanley, except with written permission of Morgan Stanley.
//
// Module Name : Time.H
// Version : %Z% %W% %I% %E% %U%
// Project :
// Description :
//
///////////////////////////////////////////////////////////////////////////////
#include <iostream>
using namespace std;
class Time
{
public:
Time(char *startTime_);
~Time();
Time &operator++(int);
void adjust(void);
int hrs(void) { return _hrs; }
int mins(void) { return _mins; }
int secs(void) { return _secs; }
friend ostream &operator<< (ostream &os_, Time &that_);
private:
int _hrs, _mins, _secs;
};
#endif

@ -0,0 +1,79 @@
#include <sys/types.h>
#include <time.h>
#include <iostream>
#include "cal.H"
using namespace std;
Calendar::Calendar(void)
{
time_t clk = time(0);
struct tm *now = localtime(&clk);
_currdate = asJulianNumber(now->tm_mon+1, now->tm_mday, now->tm_year+1900);
}
Calendar::~Calendar()
{}
// year_ in yyyy format
unsigned int Calendar::asJulianNumber(int month_,int day_,int year_)
{
unsigned long c,ya;
if (month_>2) month_-=3;
else { month_+=9; year_--; }
c=year_/100;
ya=year_-100*c;
return ((146097*c)>>2)+((1461*ya)>>2)+(153*month_+2)/5+day_+1721119;
}
void Calendar::split(int& month_,int& day_,int& year_)
{
unsigned long d;
unsigned long j=_currdate-1721119;
year_=(int) (((j<<2)-1)/146097);
j=(j<<2)-1-146097*year_;
d=(j>>2);
j=((d<<2)+3)/1461;
d=(d<<2)+3-1461*j;
d=(d+4)>>2;
month_=(int)(5*d-3)/153;
d=5*d-3-153*month_;
day_=(int)((d+5)/5);
year_=(int)(100*year_+j);
if (month_<10) month_+=3;
else { month_-=9; year_++; }
}
int Calendar::dayInWeek(void)
{
return ((((_currdate+1)%7)+6)%7)+1;
}
Calendar &Calendar::nextWeekday(void)
{
(*this) += 1;
while (!isWeekday()) (*this)+= 1;
return *this;
}
int Calendar::isWeekday(void)
{
return (dayInWeek()<6)?1:0;
}
Calendar &Calendar::operator+= (int incr_)
{
_currdate += incr_;
return *this;
}
ostream &operator<< (ostream &os_, Calendar &that_)
{
int mo, day, year;
that_.split(mo,day,year);
os_ << year << "-" << mo << "-" << day;
// the below is a pain for monetdb
//os_ << mo << "/" << day << "/" << year;
return os_;
}

@ -0,0 +1,26 @@
#ifndef _CAL_H_
#define _CAL_H_
#include <iostream>
using namespace std;
class Calendar
{
public:
Calendar(void);
~Calendar();
unsigned int asJulianNumber(int month_, int day_, int year_);
int isWeekday(void);
Calendar &operator+= (int incr_);
friend ostream &operator<< (ostream &os_,Calendar &that_);
int dayInWeek(void);
Calendar &nextWeekday(void);
void split(int &mo_, int &day_, int &year_);
private:
unsigned int _currdate;
};
#endif

@ -0,0 +1,76 @@
#include <sys/types.h>
#include <time.h>
#include <iostream>
#include "cal.H"
Calendar::Calendar(void)
{
time_t clk = time(0);
struct tm *now = localtime(&clk);
_currdate = asJulianNumber(now->tm_mon+1, now->tm_mday, now->tm_year+1900);
}
Calendar::~Calendar()
{}
// year_ in yyyy format
unsigned int Calendar::asJulianNumber(int month_,int day_,int year_)
{
unsigned long c,ya;
if (month_>2) month_-=3;
else { month_+=9; year_--; }
c=year_/100;
ya=year_-100*c;
return ((146097*c)>>2)+((1461*ya)>>2)+(153*month_+2)/5+day_+1721119;
}
void Calendar::split(int& month_,int& day_,int& year_)
{
unsigned long d;
unsigned long j=_currdate-1721119;
year_=(int) (((j<<2)-1)/146097);
j=(j<<2)-1-146097*year_;
d=(j>>2);
j=((d<<2)+3)/1461;
d=(d<<2)+3-1461*j;
d=(d+4)>>2;
month_=(int)(5*d-3)/153;
d=5*d-3-153*month_;
day_=(int)((d+5)/5);
year_=(int)(100*year_+j);
if (month_<10) month_+=3;
else { month_-=9; year_++; }
}
int Calendar::dayInWeek(void)
{
return ((((_currdate+1)%7)+6)%7)+1;
}
Calendar &Calendar::nextWeekday(void)
{
(*this) += 1;
while (!isWeekday()) (*this)+= 1;
return *this;
}
int Calendar::isWeekday(void)
{
return (dayInWeek()<6)?1:0;
}
Calendar &Calendar::operator+= (int incr_)
{
_currdate += incr_;
return *this;
}
ostream &operator<< (ostream &os_, Calendar &that_)
{
int mo, day, year;
that_.split(mo,day,year);
os_ << mo << "/" << day << "/" << year;
return os_;
}

@ -0,0 +1,58 @@
// cmvc_id = %Z% %W% %I% %E% %U%
#ifndef genIMPLEMENTATION
#define genIMPLEMENTATION
#include <iostream.h>
#include "RandGen.H"
int num[6];
int nelems=0;
int member(int a_)
{
for (int i=0; i<nelems; i++)
{
if (num[i] == a_) return 1;
}
return 0;
}
int gen(int flag_)
{
RandNumGen rg;
if (flag_ == 0)
{
for (int i=0;i<6;i++) num[i] = 0;
nelems=0;
return 0;
}
int rn;
while (1)
{
rn = rg(1,52);
if (member(rn) == 0) break;
}
num[nelems++] = rn;
return rn;
}
int main(int ac, char *av[])
{
if (ac < 2)
{
cerr << "Usage: " << av[0] << " <number>" << endl;
return 1;
}
int k = atoi(av[1]);
for(int i=0; i<k; i++)
{
gen(0);
for (int j=0;j<6;j++) cout << gen(1) << " ";
cout << endl;
}
}
#endif

@ -0,0 +1,198 @@
// cmvc_id = %Z% %W% %I% %E% %U%
#ifndef histgenIMPLEMENTATION
#define histgenIMPLEMENTATION
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <iostream>
#include <fstream>
#include <sys/time.h>
#include "RandGen.H"
#include "cal.H"
using namespace std;
inline int max(int a, int b)
{
return (a>b)?a:b;
}
inline int min(int a, int b)
{
return (a<b)?a:b;
}
int main(int ac, char *av[])
{
RandNumGen rg;
int i, j, d, k;
ofstream basefile("../../data/hist-base-file.csv");
ofstream pricefile("../../data/hist-price-file.csv");
ofstream splitfile("../../data/hist-split-file.csv");
ofstream dividendfile("../../data/hist-dividend-file.csv");
if (!basefile)
{
cerr << "Cannot open base-file" << endl;
return 1;
}
if (!pricefile)
{
cerr << "Cannot open price-file" << endl;
return 1;
}
if (!splitfile)
{
cerr << "Cannot open split-file" << endl;
return 1;
}
if (!dividendfile)
{
cerr << "Cannot open dividend-file" << endl;
return 1;
}
if (ac < 2)
{
cerr << "Usage: " << av[0] << " <scale - no of elements> [depth - in days. Default = 4000days]" << endl;
return 1;
}
int scale=0, ndays = 4000;
if (ac >= 2) scale = atoi(av[1]);
if (ac >= 3) ndays = atoi(av[2]);
// Generation of base info
int nex = 5;
char *ex[] = { "NY", "O", "AM", "LN", "TK"};
int nsic = 10;
char *sic[] = { "COMPUTERS", "CHEMICALS", "FINANCIAL", "INDUSTRIAL", "PHARMACEUTICALS",
"MEDICAL", "BANKING", "SOFTWARE", "ENTERTAINMENT", "CONSTRUCTION" };
char *cu[] = { "USD", "DEM", "JPY", "FFR", "GBP"};
int ncu = 5;
char *spr[] = { "AAA", "AA", "A", "BBB", "BB", "B", "CCC", "CC", "C"};
int nspr = 9;
unsigned int rnum;
char id[100];
char descr[256];
// the below is a pain with monetdb, changing to better date format
//char *crdate = "3/11/1999";
char *crdate = "1999-11-03";
basefile << "Id|Ex|Descr|SIC|SPR|Cu|CreateDate" << endl;
for (i=0; i<scale; i++)
{
sprintf(id,"Security_%d", i);
sprintf(descr, "'Financial security number: %d'", i);
basefile << id;
basefile << "|" << ex[rg(0,nex)];
basefile << "|" << descr;
basefile << "|" << sic[rg(0,nsic)];
basefile << "|" << spr[rg(0,nspr)];
basefile << "|" << cu[rg(0, ncu)];
basefile << "|" << crdate;
basefile << endl;
}
basefile.close();
// generation of price info
double *minop = new double[scale];
for (i=0; i<scale; i++) minop[i] = 0.0;
double *op = new double[scale];
for (i=0; i<scale; i++) op[i] = 0.0;
unsigned long *vs = new unsigned long[scale];
for (i=0; i<scale; i++) vs[i] = 0;
double cp, hp, lp;
Calendar cal;
pricefile << "Id|TradeDate|HighPrice|LowPrice|ClosePrice|OpenPrice|Volume" << endl;
splitfile << "Id|SplitDate|EntryDate|SplitFactor" << endl;
dividendfile << "Id|XdivDate|DivAmt|AnnounceDate" << endl;
for (d=0;d<ndays; d++)
{
cal.nextWeekday();
for (k=0;k<scale;k++)
{
sprintf(id,"Security_%d", k);
if (op[k]==0.0) op[k] = rg(0,100);
if (minop[k]==0.0) minop[k] = op[k];
if (vs[k]==0) vs[k] = rg();
else vs[k] = vs[k]*(100.0+rg(-10,+10))/100.0;
int skew = rg(0,+2);
double f = (100.0 + rg(-2,3+skew))/100.0;
cp = op[k] * f;
hp = max(op[k], cp) * (100.0+rg(0,+10))/100.0;
lp = min(op[k], cp) * (100.0-rg(0,+10))/100.0;
pricefile << id;
pricefile << "|" << cal;
pricefile << "|" << hp;
pricefile << "|" << lp;
pricefile << "|" << cp;
pricefile << "|" << op[k];
pricefile << "|" << vs[k];
pricefile << endl;
op[k] = cp;
// check splits
if (op[k] > 2.0*minop[k])
{
int splitfactor = rg(1,4);
op[k] /= (double)splitfactor;
vs[k] *= splitfactor;
splitfile << id;
splitfile << "|" << cal;
splitfile << "|" << cal;
splitfile << "|" << splitfactor;
splitfile << endl;
}
// check dividends
if (op[k] > minop[k])
{
// dividend as a fraction of current closing price
double dividend = (rg(1, 100) / 100.0) * cp;
dividendfile << id;
dividendfile << "|" << cal;
dividendfile << "|" << dividend;
// assumes announced and disbursed same day,
// queries can be trivially modified to do away with this assumption
dividendfile << "|" << cal;
dividendfile << endl;
}
}
}
splitfile.close();
pricefile.close();
dividendfile.close();
}
#endif

@ -0,0 +1,20 @@
.PHONY: clean
all: histgen tickgen
clean:
rm -rf *.o histgen tickgen
%.o: %.C
g++-12 -Ofast -march=native -g -c $<
tickgen: cal.o Time.o tickgen.o
g++-12 -lstdc++ -Ofast -march=native -flto -o tickgen cal.o Time.o tickgen.o
histgen: cal.o histgen.o
g++-12 -lstdc++ -Ofast -flto -march=native -o histgen cal.o histgen.o
timetest: Time.o timetest.o
g++-12 -lstdc++ -g -o timetest Time.o timetest.o

@ -0,0 +1,197 @@
// cmvc_id = %Z% %W% %I% %E% %U%
#ifndef histgenIMPLEMENTATION
#define histgenIMPLEMENTATION
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <iostream>
#include <fstream>
#include <sys/time.h>
#include "RandGen.H"
#include "cal.H"
#include "Time.H"
using namespace std;
inline int max(int a, int b)
{
return (a>b)?a:b;
}
inline int min(int a, int b)
{
return (a<b)?a:b;
}
int main(int ac, char *av[])
{
RandNumGen rg;
int i, j, d, k;
ofstream basefile("../../data/tick-base-file.csv");
ofstream pricefile("../../data/tick-price-file.csv");
if (!basefile)
{
cerr << "Cannot open base-file" << endl;
return 1;
}
if (!pricefile)
{
cerr << "Cannot open price-file" << endl;
return 1;
}
if (ac < 2)
{
cerr << "Usage: " << av[0] << " <n -scale> [t -ticks per day.Default=100] [d - no of days.Default=90]" << endl;
return 1;
}
int n=0,t=100,days=90;
if (ac >= 2) n = atoi(av[1]);
if (ac >= 3) t = atoi(av[2]);
if (ac >= 4) days = atoi(av[3]);
int tps = (n*t)/28800; // ticks per second
tps++;
cout << "Ticks per second: " << tps << endl;
// Generation of base info
int nex = 5;
char *ex[] = { "NY", "O", "AM", "LN", "TK"};
int nsic = 10;
char *sic[] = { "COMPUTERS", "CHEMICALS", "FINANCIAL", "INDUSTRIAL", "PHARMACEUTICALS",
"MEDICAL", "BANKING", "SOFTWARE", "ENTERTAINMENT", "CONSTRUCTION" };
char *cu[] = { "USD", "DEM", "JPY", "FFR", "GBP"};
int ncu = 5;
char *spr[] = { "AAA", "AA", "A", "BBB", "BB", "B", "CCC", "CC", "C"};
int nspr = 9;
unsigned int rnum;
char id[100];
char descr[256];
char *crdate = "3/11/1999";
basefile << "Id | Ex | Descr | SIC | Cu" << endl;
for (i=0; i<n; i++)
{
sprintf(id,"Security_%d", i);
sprintf(descr, "'Financial security number: %d'", i);
basefile << id;
basefile << " | " << ex[rg(0,nex)];
basefile << " | " << descr;
basefile << " | " << sic[rg(0,nsic)];
basefile << " | " << cu[rg(0, ncu)];
basefile << endl;
}
basefile.close();
// generation of price info
double tick=1.0/32.0;
// 1. gen the starting price of each security
double *bp = new double[n];
int *seq = new int[n];
for (i=0;i<n;i++)
{
bp[i] = rg(0,100);
seq[i] = 0;
}
Calendar cal;
Time tm("9:00:00");
cal.nextWeekday();
pricefile << "Id | SeqNo | TradeDate | TimeStamp | Type" << endl;
for (k=0;k<days; k++)
{
// for each second of the business day - 8*60*60
for(i=0;i<28800;i++)
{
// generate the required ticks
for(j=0;j<tps;j++)
{
//1. select a security
int sec = rg(0,n);
sprintf(id, "Security_%d", sec);
//2. select if it is a trade, ask, bid
int tqb = rg(0,4);
switch (tqb)
{
case 0: // trade
{
double tp = bp[sec];
int ts = rg(1,100) * 100;
pricefile << id;
pricefile << "|" << ++seq[sec];
pricefile << "|" << cal;
pricefile << "|" << tm;
pricefile << "|T" << endl;
break;
}
case 1: // ask
{
double ap = rg(0,4)*tick+bp[sec];
int as = rg(1,100) * 100;
pricefile << id;
pricefile << "|" << ++seq[sec];
pricefile << "|" << cal;
pricefile << "|" << tm;
pricefile << "|Q" << endl;
break;
}
case 2: // bid
{
int dir = (rg(3,10) > 5)? +1:-1;
bp[sec] = (dir*rg(0,3)*tick)+bp[sec];
int bs = rg(1,100) * 100;
pricefile << id;
pricefile << "|" << ++seq[sec];
pricefile << "|" << cal;
pricefile << "|" << tm;
pricefile << "|Q" << endl;
break;
}
case 3: // cancel/correct as a trade
{
if (rg(0,100) < 5) break;
double tp = bp[sec];
int ts = rg(1,100) * 100;
pricefile << id;
pricefile << "|" << ++seq[sec];
pricefile << "|" << cal;
pricefile << "|" << tm;
pricefile << "|CT" << endl;
break;
}
default:
break;
}
}
// Go to the next second
tm++;
}
cal.nextWeekday();
}
pricefile.close();
}
#endif

@ -30,3 +30,4 @@ SELECT price, timestamp FROM stocks where price - timestamp > 1 and not (price*t
SELECT max(price-mins(price))
FROM stocks
ASSUMING DESC timestamp

Loading…
Cancel
Save