From dda68bd9e1b3b2ac8ffe5ce92a41b1c38b19b298 Mon Sep 17 00:00:00 2001
From: Bill <sunyinqi0508@gmail.com>
Date: Wed, 5 Oct 2022 03:51:57 +0800
Subject: [PATCH] bug fixes and clarification

---
 README.md              | 18 ++++++++++++++++--
 prompt.py              |  6 ++++--
 reconstruct/ast.py     | 38 +++++++++++++++++++++++++++++---------
 reconstruct/expr.py    |  2 +-
 reconstruct/storage.py |  6 ++++--
 requirements.txt       |  1 +
 test.aquery            |  2 +-
 tests/joins.a          |  2 ++
 tests/q4.a             | 12 ++++++++++++
 9 files changed, 70 insertions(+), 17 deletions(-)
 create mode 100644 tests/q4.a
diff --git a/README.md b/README.md
index 4782c16..760acf3 100644
--- a/README.md
+++ b/README.md
@@ -114,6 +114,20 @@ See ./tests/ for more examples.
 - A series of commands can be put in a script file and execute using `script` command.
 - Can be executed using `script` command
 - See `test.aquery` as an example
+
+# User Manual
+## Data Types
+- String Types: `STRING` and `TEXT` are variable-length strings with unlimited length. `VARCHAR(n)` is for strings with upper-bound limits.
+- Integer Types: `INT` and `INTEGER` are 32-bit integers, `SMALLINT` is for 16-bit integers, `TINYINT` is for 8-bit integers and `BIGINT` is 64-bit integers. On Linux and macOS, `HGEINT` is 128-bit integers. 
+- Floating-Point Types: `REAL` denotes 32-bit floating point numbers while `DOUBLE` denotes 64-bit floating point numbers. 
+- Temporal Types: `DATE` only supports the format `yyyy-mm-dd`; `TIME` uses a 24-hour format of the form `hh:mm:ss:ms`, where the milliseconds part ranges from 0 to 999; and `TIMESTAMP` has the format `yyyy-mm-dd hh:mm:ss:ms`. When importing data from CSV files, please make sure the spreadsheet software (if any was used) hasn't changed the format of dates and timestamps by double-checking the file with a plain-text editor.
+- Boolean Type: `BOOLEAN` is a boolean type with values `TRUE` and `FALSE`.
+
+## Load Data:
+- Use a query like `LOAD DATA INFILE <filename> INTO <table_name> [OPTIONS <options>]`
+- The file name is a path relative to the AQuery root directory (where prompt.py resides)
+- The file name can also be an absolute path.
+- See `data/q1.sql` for more information 
 # Architecture 
 ![Architecture](./docs/arch-hybrid.svg)
 
@@ -123,8 +137,8 @@ See ./tests/ for more examples.
 - Backend of AQuery++ Compiler generates target code dependent on the Execution Engine. It can either be the C++ code for AQuery Execution Engine or sql and C++ post-processor for Hybrid Engine or k9 for the k9 Engine.
 ## Execution Engines
 - AQuery++ supports different execution engines thanks to the decoupled compiler structure.
-- AQuery Execution Engine: executes queries by compiling the query plan to C++ code. Doesn't support joins and udf functions. 
 - Hybrid Execution Engine: decouples the query into two parts. The sql-compliant part is executed by an Embedded version of Monetdb and everything else is executed by a post-process module which is generated by AQuery++ Compiler in C++ and then compiled and executed.
+- AQuery Execution Engine: executes queries by compiling the query plan to C++ code. Doesn't support joins and udf functions. 
 - K9 Execution Engine: (discontinued).
   
 # Roadmap
@@ -160,4 +174,4 @@ See ./tests/ for more examples.
 - [x] Functionality: Basic helper functions in aquery 
 - [ ] Bug: Join-Aware Column management
 - [ ] Bug: Order By after Group By
-- [ ] Functionality: Having clause
\ No newline at end of file
+- [ ] Functionality: Having clause
diff --git a/prompt.py b/prompt.py
index 73a52d8..356026a 100644
--- a/prompt.py
+++ b/prompt.py
@@ -390,6 +390,8 @@ def prompt(running = lambda:True, next = lambda:input('> '), state = None):
                 print('stdin inreadable, Exiting...')
                 exit(0)
             q = og_q.lower().strip()
+            if (not re.sub(r'[ \r\n\t;]', '', q)):
+                continue
             if False and q == 'exec': # generate build and run (AQuery Engine)
                 state.cfg.backend_type = Backend_Type.BACKEND_AQuery.value
                 cxt = engine.exec(state.stmts, cxt, keep)
@@ -483,7 +485,7 @@ def prompt(running = lambda:True, next = lambda:input('> '), state = None):
                 continue
             elif q == 'format' or q == 'fmt':
                 subprocess.call(['clang-format', 'out.cpp'])
-            elif q == 'exit' or q == 'exit()':
+            elif q == 'exit' or q == 'exit()' or q == 'quit' or q == 'quit()' or q == '\\q':
                 rm(state)
                 exit()
             elif q == 'r': # build and run
@@ -553,7 +555,7 @@ def prompt(running = lambda:True, next = lambda:input('> '), state = None):
                 state.stmts = parser.parse(contents)
                 state.currstats.parse_time = state.currstats.stop()
                 continue
-            state.stmts = parser.parse(q)
+            state.stmts = parser.parse(og_q.strip())
             cxt.Info(state.stmts)
             state.currstats.parse_time = state.currstats.stop()
         except ParseException as e:
diff --git a/reconstruct/ast.py b/reconstruct/ast.py
index 11e4c37..7efdef0 100644
--- a/reconstruct/ast.py
+++ b/reconstruct/ast.py
@@ -144,14 +144,20 @@ class projection(ast_node):
                     alias = proj['name']
                         
                 if not proj_expr.is_special:
-                    if proj_expr.node == '*':
+                    if str(proj_expr.node).strip().endswith('*'):
+                        _datasource = self.datasource
+                        if '.' in proj_expr.node:
+                            tbl = proj_expr.node.split('.')[0]
+                            if tbl in self.datasource.tables_dir:
+                                _datasource = self.datasource.tables_dir[tbl]
+                        _datasource = _datasource.all_cols(ordered = True, stripped = True)
                         name = [(c.get_name()
                                  if self.datasource.single_table
                                  else c.get_full_name()
-                                 ) for c in self.datasource.rec]
-                        this_type = [c.type for c in self.datasource.rec]
-                        compound = [c.compound for c in self.datasource.rec]
-                        proj_expr = [expr(self, c.name) for c in self.datasource.rec]
+                                 ) for c in _datasource]
+                        this_type = [c.type for c in _datasource]
+                        compound = [c.compound for c in _datasource]
+                        proj_expr = [expr(self, c.name) for c in _datasource]
                     else:
                         y = lambda x:x
                         count = lambda : 'count(*)'
@@ -185,7 +191,7 @@ class projection(ast_node):
                 this_type = enlist(this_type)
                 
             elif type(proj) is str:
-                col = self.datasource.get_col(proj)
+                col = self.datasource.get_cols(proj)
                 this_type = col.type
                 disp_name = proj
                 print('Unknown behavior:', proj, 'is str')    
@@ -619,6 +625,15 @@ class join(ast_node):
         for col in cols:
             joint_cols |= self.joint_cols.get(col, set())
         return joint_cols
+
+    def strip_joint_cols(self, cols : Set[ColRef]):
+        """Return a copy of `cols` minus the other columns get_joint_cols reports for each kept column."""
+        stripped = type(cols)(cols)
+        for c in cols: # walk the untouched input: mutating a set while iterating it raises RuntimeError (and skips items in list-backed sets)
+            if c in stripped: # a column already dropped as a duplicate must not evict the one that was kept
+                for j in self.get_joint_cols([c]):
+                    if j != c: stripped.discard(j) # discard is a no-op when j is absent
+        return stripped
     
     def init(self, _):
         self.joins : List[join] = []
@@ -724,6 +739,8 @@ class join(ast_node):
                 print(f'Error: table {node} not found.')
     
     def get_cols(self, colExpr: str) -> Optional[ColRef]:
+        if colExpr == '*':
+            return self.all_cols(ordered = True, stripped = True)
         for t in self.tables:
             if colExpr in t.columns_byname:
                 col = t.columns_byname[colExpr]
@@ -751,13 +768,16 @@ class join(ast_node):
         return len(self.tables) == 1
     
 #    @property
-    def all_cols(self):
-        ret = set()
+    def all_cols(self, ordered = False, stripped = True): # gather the columns of every table in this join
+        from ordered_set import OrderedSet # third-party package; this patch adds it to requirements.txt
+        ret = OrderedSet() if ordered else set() # OrderedSet preserves insertion order when `ordered` is requested
         for table in self.tables:
             rec = table.rec
             table.rec = self.rec
-            ret.update(table.all_cols())
+            ret.update(table.all_cols(ordered = ordered)) # table.rec is temporarily pointed at self.rec around this call
             table.rec = rec
+        if stripped: # by default the result is passed through strip_joint_cols before returning
+            return self.strip_joint_cols(ret)
         return ret
     
     # TODO: join condition awareness
diff --git a/reconstruct/expr.py b/reconstruct/expr.py
index ea2480c..ec8897c 100644
--- a/reconstruct/expr.py
+++ b/reconstruct/expr.py
@@ -257,7 +257,7 @@ class expr(ast_node):
                     if (node == '*' and 
                         not (type(self.parent) is expr 
                              and 'count' in self.parent.node)):
-                        self.datasource.all_cols()
+                        self.datasource.all_cols(ordered = True)
                     else:
                         self.raw_col = self.datasource.parse_col_names(node)
                         self.raw_col = self.raw_col if type(self.raw_col) is ColRef else None
diff --git a/reconstruct/storage.py b/reconstruct/storage.py
index ec5277f..9c9ddb5 100644
--- a/reconstruct/storage.py
+++ b/reconstruct/storage.py
@@ -108,10 +108,12 @@ class TableInfo:
             else:
                 return datasource.parse_col_names(parsedColExpr[1])
     
-    def all_cols(self):
+    def all_cols(self, ordered = False, stripped = True): # `stripped` is accepted (and unused) for signature parity with join.all_cols
+        from ordered_set import OrderedSet # third-party package; this patch adds it to requirements.txt
+        _ret_set_t = OrderedSet if ordered else set # result container: order-preserving only when `ordered` is set
         if type(self.rec) is set:
             self.rec.update(self.columns)
-        return set(self.columns)
+        return _ret_set_t(self.columns) # always a fresh container built from self.columns
             
     @property
     def single_table(self):
diff --git a/requirements.txt b/requirements.txt
index 766c81c..64088ad 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,6 +2,7 @@ mo-future==6.2.21303
 mo-dots==9.173.22126
 mo-parsing==8.183.22158
 mo-imports==7.169.22121
+ordered-set
 dataclasses; python_version < '3.7'
 vswhere; sys_platform == 'win32'
 numpy
diff --git a/test.aquery b/test.aquery
index ebfbd02..a3aa27b 100644
--- a/test.aquery
+++ b/test.aquery
@@ -2,7 +2,7 @@
 
 # stats on 
 
-select "hello world"
+select "Hello, World!"
 xexec
 
 echo Testing Insert, Filters and Nested Aggregation
diff --git a/tests/joins.a b/tests/joins.a
index 525a46f..08ec70e 100644
--- a/tests/joins.a
+++ b/tests/joins.a
@@ -32,3 +32,5 @@ FIELDS TERMINATED BY ","
 
 select sd(a) + sales from tt, sale1 where tt.a = sale1.Mont
 
+select * from tt, sale1 where tt.a = sale1.Mont
+
diff --git a/tests/q4.a b/tests/q4.a
new file mode 100644
index 0000000..005cbdf
--- /dev/null
+++ b/tests/q4.a
@@ -0,0 +1,12 @@
+create table ticks(id varchar(20), timestamp int, tradeDate date, price int)
+
+load data infile "data/stocksym_price.csv"
+into table ticks fields terminated by ","
+
+-- select max(price - mins(price))
+-- from ticks assuming asc timestamp
+-- where ID = "S" and tradeDate= '01-10-22';
+
+select max(price - mins(price))
+from ticks assuming asc timestamp
+where ID = "S" and tradeDate= '2022-10-01';