From dda68bd9e1b3b2ac8ffe5ce92a41b1c38b19b298 Mon Sep 17 00:00:00 2001 From: Bill Date: Wed, 5 Oct 2022 03:51:57 +0800 Subject: [PATCH] bug fixes and clarification --- README.md | 18 ++++++++++++++++-- prompt.py | 6 ++++-- reconstruct/ast.py | 38 +++++++++++++++++++++++++++++--------- reconstruct/expr.py | 2 +- reconstruct/storage.py | 6 ++++-- requirements.txt | 1 + test.aquery | 2 +- tests/joins.a | 2 ++ tests/q4.a | 12 ++++++++++++ 9 files changed, 70 insertions(+), 17 deletions(-) create mode 100644 tests/q4.a diff --git a/README.md b/README.md index 4782c16..760acf3 100644 --- a/README.md +++ b/README.md @@ -114,6 +114,20 @@ See ./tests/ for more examples. - A series of commands can be put in a script file and execute using `script` command. - Can be executed using `script` command - See `test.aquery` as an example + +# User Manual +## Data Types +- String Types: `STRING` and `TEXT` are variable-length strings with unlimited length. `VARCHAR(n)` is for strings with upper-bound limits. +- Integer Types: `INT` and `INTEGER` are 32-bit integers, `SMALLINT` is for 16-bit integers, `TINYINT` is for 8-bit integers and `BIGINT` is 64-bit integers. On Linux and macOS, `HUGEINT` is 128-bit integers. +- Floating-Point Types: `REAL` denotes 32-bit floating point numbers while `DOUBLE` denotes 64-bit floating point numbers. +- Temporal Types: `DATE` only supports the format of `yyyy-mm-dd`, and `TIME` uses 24-hour format and has the form of `hh:mm:ss:ms`, where the milliseconds part can range from 0 to 999; `TIMESTAMP` has the format of `yyyy-mm-dd hh:mm:ss:ms`. When importing data from CSV files, please make sure the spreadsheet software (if it was used) doesn't change the format of the date and timestamp by double-checking the file with a plain-text editor. +- Boolean Type: `BOOLEAN` is a boolean type with values `TRUE` and `FALSE`. 
+ +## Load Data: +- Use query like `LOAD DATA INFILE <filename> INTO <table_name> [OPTIONS <options>]` +- File name is the relative path to the AQuery root directory (where prompt.py resides) +- File name can also be an absolute path. +- See `data/q1.sql` for more information # Architecture ![Architecture](./docs/arch-hybrid.svg) @@ -123,8 +137,8 @@ See ./tests/ for more examples. - Backend of AQuery++ Compiler generates target code dependent on the Execution Engine. It can either be the C++ code for AQuery Execution Engine or sql and C++ post-processor for Hybrid Engine or k9 for the k9 Engine. ## Execution Engines - AQuery++ supports different execution engines thanks to the decoupled compiler structure. -- AQuery Execution Engine: executes queries by compiling the query plan to C++ code. Doesn't support joins and udf functions. - Hybrid Execution Engine: decouples the query into two parts. The sql-compliant part is executed by an Embedded version of Monetdb and everything else is executed by a post-process module which is generated by AQuery++ Compiler in C++ and then compiled and executed. +- AQuery Execution Engine: executes queries by compiling the query plan to C++ code. Doesn't support joins and udf functions. - K9 Execution Engine: (discontinued). # Roadmap @@ -160,4 +174,4 @@ See ./tests/ for more examples. 
- [x] Functionality: Basic helper functions in aquery - [ ] Bug: Join-Aware Column management - [ ] Bug: Order By after Group By -- [ ] Functionality: Having clause \ No newline at end of file +- [ ] Functionality: Having clause diff --git a/prompt.py b/prompt.py index 73a52d8..356026a 100644 --- a/prompt.py +++ b/prompt.py @@ -390,6 +390,8 @@ def prompt(running = lambda:True, next = lambda:input('> '), state = None): print('stdin inreadable, Exiting...') exit(0) q = og_q.lower().strip() + if (not re.sub(r'[ \r\n\t;]', '', q)): + continue if False and q == 'exec': # generate build and run (AQuery Engine) state.cfg.backend_type = Backend_Type.BACKEND_AQuery.value cxt = engine.exec(state.stmts, cxt, keep) @@ -483,7 +485,7 @@ def prompt(running = lambda:True, next = lambda:input('> '), state = None): continue elif q == 'format' or q == 'fmt': subprocess.call(['clang-format', 'out.cpp']) - elif q == 'exit' or q == 'exit()': + elif q == 'exit' or q == 'exit()' or q == 'quit' or q == 'quit()' or q == '\\q': rm(state) exit() elif q == 'r': # build and run @@ -553,7 +555,7 @@ def prompt(running = lambda:True, next = lambda:input('> '), state = None): state.stmts = parser.parse(contents) state.currstats.parse_time = state.currstats.stop() continue - state.stmts = parser.parse(q) + state.stmts = parser.parse(og_q.strip()) cxt.Info(state.stmts) state.currstats.parse_time = state.currstats.stop() except ParseException as e: diff --git a/reconstruct/ast.py b/reconstruct/ast.py index 11e4c37..7efdef0 100644 --- a/reconstruct/ast.py +++ b/reconstruct/ast.py @@ -144,14 +144,20 @@ class projection(ast_node): alias = proj['name'] if not proj_expr.is_special: - if proj_expr.node == '*': + if str(proj_expr.node).strip().endswith('*'): + _datasource = self.datasource + if '.' 
in proj_expr.node: + tbl = proj_expr.node.split('.')[0] + if tbl in self.datasource.tables_dir: + _datasource = self.datasource.tables_dir[tbl] + _datasource = _datasource.all_cols(ordered = True, stripped = True) name = [(c.get_name() if self.datasource.single_table else c.get_full_name() - ) for c in self.datasource.rec] - this_type = [c.type for c in self.datasource.rec] - compound = [c.compound for c in self.datasource.rec] - proj_expr = [expr(self, c.name) for c in self.datasource.rec] + ) for c in _datasource] + this_type = [c.type for c in _datasource] + compound = [c.compound for c in _datasource] + proj_expr = [expr(self, c.name) for c in _datasource] else: y = lambda x:x count = lambda : 'count(*)' @@ -185,7 +191,7 @@ class projection(ast_node): this_type = enlist(this_type) elif type(proj) is str: - col = self.datasource.get_col(proj) + col = self.datasource.get_cols(proj) this_type = col.type disp_name = proj print('Unknown behavior:', proj, 'is str') @@ -619,6 +625,15 @@ class join(ast_node): for col in cols: joint_cols |= self.joint_cols.get(col, set()) return joint_cols + + def strip_joint_cols(self, cols : Set[ColRef]): + stripped = type(cols)(cols) + for c in stripped: + jc = self.get_joint_cols([c]) + for j in jc: + if j != c and j in stripped: + stripped.remove(j) + return stripped def init(self, _): self.joins : List[join] = [] @@ -724,6 +739,8 @@ class join(ast_node): print(f'Error: table {node} not found.') def get_cols(self, colExpr: str) -> Optional[ColRef]: + if colExpr == '*': + return self.all_cols(ordered = True, stripped = True) for t in self.tables: if colExpr in t.columns_byname: col = t.columns_byname[colExpr] @@ -751,13 +768,16 @@ class join(ast_node): return len(self.tables) == 1 # @property - def all_cols(self): - ret = set() + def all_cols(self, ordered = False, stripped = True): + from ordered_set import OrderedSet + ret = OrderedSet() if ordered else set() for table in self.tables: rec = table.rec table.rec = self.rec - 
ret.update(table.all_cols()) + ret.update(table.all_cols(ordered = ordered)) table.rec = rec + if stripped: + return self.strip_joint_cols(ret) return ret # TODO: join condition awareness diff --git a/reconstruct/expr.py b/reconstruct/expr.py index ea2480c..ec8897c 100644 --- a/reconstruct/expr.py +++ b/reconstruct/expr.py @@ -257,7 +257,7 @@ class expr(ast_node): if (node == '*' and not (type(self.parent) is expr and 'count' in self.parent.node)): - self.datasource.all_cols() + self.datasource.all_cols(ordered = True) else: self.raw_col = self.datasource.parse_col_names(node) self.raw_col = self.raw_col if type(self.raw_col) is ColRef else None diff --git a/reconstruct/storage.py b/reconstruct/storage.py index ec5277f..9c9ddb5 100644 --- a/reconstruct/storage.py +++ b/reconstruct/storage.py @@ -108,10 +108,12 @@ class TableInfo: else: return datasource.parse_col_names(parsedColExpr[1]) - def all_cols(self): + def all_cols(self, ordered = False): + from ordered_set import OrderedSet + _ret_set_t = OrderedSet if ordered else set if type(self.rec) is set: self.rec.update(self.columns) - return set(self.columns) + return _ret_set_t(self.columns) @property def single_table(self): diff --git a/requirements.txt b/requirements.txt index 766c81c..64088ad 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,6 +2,7 @@ mo-future==6.2.21303 mo-dots==9.173.22126 mo-parsing==8.183.22158 mo-imports==7.169.22121 +ordered-set dataclasses; python_version < '3.7' vswhere; sys_platform == 'win32' numpy diff --git a/test.aquery b/test.aquery index ebfbd02..a3aa27b 100644 --- a/test.aquery +++ b/test.aquery @@ -2,7 +2,7 @@ # stats on -select "hello world" +select "Hello, World!" 
xexec echo Testing Insert, Filters and Nested Aggregation diff --git a/tests/joins.a b/tests/joins.a index 525a46f..08ec70e 100644 --- a/tests/joins.a +++ b/tests/joins.a @@ -32,3 +32,5 @@ FIELDS TERMINATED BY "," select sd(a) + sales from tt, sale1 where tt.a = sale1.Mont +select * from tt, sale1 where tt.a = sale1.Mont + diff --git a/tests/q4.a b/tests/q4.a new file mode 100644 index 0000000..005cbdf --- /dev/null +++ b/tests/q4.a @@ -0,0 +1,12 @@ +create table ticks(id varchar(20), timestamp int, tradeDate date, price int) + +load data infile "data/stocksym_price.csv" +into table ticks fields terminated by "," + +-- select max(price - mins(price)) +-- from ticks assuming asc timestamp +-- where ID = "S" and tradeDate= '01-10-22'; + +select max(price - mins(price)) +from ticks assuming asc timestamp +where ID = "S" and tradeDate= '2022-10-01';