From 52ffb1ad25019c80b3aa443c0f2a69e90de7a096 Mon Sep 17 00:00:00 2001
From: sam
Date: Fri, 7 Oct 2022 01:52:21 -0400
Subject: [PATCH] Add incremental random forest (IRF) module to the SDK

---
 aquery_parser/sql_parser.py     |  17 +-
 engine/types.py                 |   3 +-
 reconstruct/ast.py              |   2 +-
 sdk/DecisionTree.h              |  57 +++
 sdk/Evaluation.cpp              | 278 ++++++++++++
 sdk/Evaluation.h                |  24 +
 sdk/RF.cpp                      | 139 ++++++
 sdk/RF.h                        |  46 ++
 sdk/aquery.h                    |  23 +-
 sdk/incrementalDecisionTree.cpp | 774 ++++++++++++++++++++++++++++++++
 sdk/irf.cpp                     |  53 +++
 tests/modules.a                 |  24 +-
 12 files changed, 1421 insertions(+), 19 deletions(-)
 create mode 100644 sdk/DecisionTree.h
 create mode 100644 sdk/Evaluation.cpp
 create mode 100644 sdk/Evaluation.h
 create mode 100644 sdk/RF.cpp
 create mode 100644 sdk/RF.h
 create mode 100644 sdk/incrementalDecisionTree.cpp
 create mode 100644 sdk/irf.cpp

diff --git a/aquery_parser/sql_parser.py b/aquery_parser/sql_parser.py
index 45bbe28..9c08db6 100644
--- a/aquery_parser/sql_parser.py
+++ b/aquery_parser/sql_parser.py
@@ -672,13 +672,15 @@ def parser(literal_string, ident, sqlserver=False):
     module_func_def = (
         var_name("fname")
         + LB
-        + delimited_list(
-            (
-                var_name("arg")
-                + COLON
-                + var_name("type")
-            )("vars")
-        )
+        + Optional(
+            delimited_list(
+                (
+                    var_name("arg")
+                    + COLON
+                    + var_name("type")
+                )("vars")
+            )
+        )
         + RB
         + LAMBDA
         + var_name("ret_type")
@@ -725,3 +727,4 @@ def parser(literal_string, ident, sqlserver=False):
     )
 
     return stmts.finalize()
+
diff --git a/engine/types.py b/engine/types.py
index 74541c6..7f6491f 100644
--- a/engine/types.py
+++ b/engine/types.py
@@ -98,7 +98,7 @@ FloatT = Types(16, name = 'float', cname = 'float', sqlname = 'REAL',
 HgeT = Types(9, name = 'int128',cname='__int128_t', sqlname = 'HUGEINT', fp_type = DoubleT)
 UHgeT = Types(10, name = 'uint128', cname='__uint128_t', sqlname = 'HUGEINT', fp_type = DoubleT)
 LongT = Types(4, name = 'int64', sqlname = 'BIGINT', fp_type = DoubleT)
-BoolT = Types(0, name = 'bool', sqlname = 'BOOL', long_type=LongT, fp_type=FloatT)
+BoolT = Types(0, name = 'bool', cname='bool', sqlname = 'BOOL', long_type=LongT, fp_type=FloatT)
 ByteT = Types(1, name = 'int8', sqlname = 'TINYINT', long_type=LongT, fp_type=FloatT)
 ShortT = Types(2, name = 'int16', sqlname='SMALLINT', long_type=LongT, fp_type=FloatT)
 IntT = Types(3, name = 'int', cname = 'int', long_type=LongT, fp_type=FloatT)
@@ -338,4 +338,3 @@ user_module_func = {}
 builtin_operators : 
Dict[str, OperatorBase] = {**builtin_binary_arith, **builtin_binary_logical, **builtin_unary_arith, **builtin_unary_logical, **builtin_unary_special, **builtin_func, **builtin_cstdlib, **user_module_func}
- 
\ No newline at end of file
diff --git a/reconstruct/ast.py b/reconstruct/ast.py
index da8e01c..3f56048 100644
--- a/reconstruct/ast.py
+++ b/reconstruct/ast.py
@@ -916,7 +916,7 @@ class insert(ast_node):
         #     raise ValueError("Column Mismatch")
         list_values = []
-        for i, s in enumerate(values):
+        for i, s in enumerate(enlist(values)):
             if 'value' in s:
                 list_values.append(f"{s['value']}")
             else:
diff --git a/sdk/DecisionTree.h b/sdk/DecisionTree.h
new file mode 100644
index 0000000..44ee6c5
--- /dev/null
+++ b/sdk/DecisionTree.h
@@ -0,0 +1,57 @@
+#ifndef CART_H
+#define CART_H
+
+#include "Evaluation.h"
+
+struct minEval;
+
+struct DR;
+
+struct DT;
+
+//enum Evaluation {gini, entropy, logLoss};
+
+class DecisionTree{
+public:
+
+DT* DTree = nullptr;
+int maxHeight;
+long feature;
+long maxFeature;
+long seed;
+long classes;
+int* Sparse;
+double forgetRate;
+Evaluation evalue;
+long Rebuild;
+long roundNo;
+long called;
+long retain;
+
+DecisionTree(int height, long f, int* sparse, double forget, long maxFeature, long noClasses, Evaluation e, long r, long rb);
+
+void Stablelize();
+
+void Free();
+
+minEval findMinGiniDense(double** data, long* result, long* totalT, long size, long col);
+
+minEval findMinGiniSparse(double** data, long* result, long* totalT, long size, long col, DT* current);
+
+minEval incrementalMinGiniDense(double** data, long* result, long size, long col, long*** count, double** record, long* max, long newCount, long forgetSize, bool isRoot);
+
+minEval incrementalMinGiniSparse(double** dataNew, long* resultNew, long sizeNew, long sizeOld, DT* current, long col, long forgetSize, bool isRoot);
+
+long* fitThenPredict(double** trainData, long* trainResult, long trainSize, double** testData, long testSize);
+
+void fit(double** data, long* result, long size);
+
+void Update(double** data, long* result, long size, DT* current);
+
+void IncrementalUpdate(double** data, long* result, long size, DT* current);
+
+long Test(double* data, DT* root);
+
+void print(DT* root);
+};
+#endif
diff --git a/sdk/Evaluation.cpp b/sdk/Evaluation.cpp
new file mode 100644
index 0000000..3683597
--- /dev/null
+++ b/sdk/Evaluation.cpp
@@ -0,0 +1,278 @@
+#include "Evaluation.h"
+#include <cfloat>
+#include <cmath>
+#include <cstdlib>
+
+struct minEval{
+	double value;
+	double values;
+
+	double eval;
+	long left; // how many on its left
+	double* record;
+	long max;
+	long** count;
+	long* sorted; // sorted d
+};
+
+minEval giniSparse(double** data, long* result, long* d, long size, long col, long classes, long* totalT){
+	double max = data[d[size-1]][col];
+	minEval ret;
+	ret.eval = DBL_MAX;
+
+	long i, j;
+	long count[classes];
+	long total = 0;
+	for(i=0; i<classes; i++)count[i] = 0;
+	// scan the column in sorted order; each boundary between two
+	// distinct values is a candidate split point
+	for(i=0; i<size-1; i++){
+		count[result[d[i]]]++;
+		total++;
+		if(data[d[i+1]][col]==data[d[i]][col])continue;
+		if(data[d[i]][col]==max)break;
+		double c = (data[d[i]][col]+data[d[i+1]][col])/2;
+		double gini1 = 1.0, gini2 = 1.0;
+		for(j=0; j<classes; j++){
+			double p1 = ((double)count[j])/total;
+			double p2 = ((double)(totalT[j]-count[j]))/(size-total);
+			gini1 -= p1*p1;
+			gini2 -= p2*p2;
+		}
+		// size-weighted impurity of the two children
+		gini1 = (gini1*total+gini2*(size-total))/size;
+		if(ret.eval>gini1){
+			ret.eval = gini1;
+			ret.value = c;
+			ret.left = total;
+		}
+	}
+	return ret;
+}
+
+minEval entropySparse(double** data, long* result, long* d, long size, long col, long classes, long* totalT){
+	double max = data[d[size-1]][col];
+	minEval ret;
+	ret.eval = DBL_MAX;
+
+	long i, j;
+	long count[classes];
+	long total = 0;
+	for(i=0; i<classes; i++)count[i] = 0;
+	for(i=0; i<size-1; i++){
+		count[result[d[i]]]++;
+		total++;
+		if(data[d[i+1]][col]==data[d[i]][col])continue;
+		if(data[d[i]][col]==max)break;
+		double c = (data[d[i]][col]+data[d[i+1]][col])/2;
+		double entropy1 = 0.0, entropy2 = 0.0;
+		for(j=0; j<classes; j++){
+			double p1 = ((double)count[j])/total;
+			double p2 = ((double)(totalT[j]-count[j]))/(size-total);
+			if(p1>0)entropy1 -= p1*log2(p1);
+			if(p2>0)entropy2 -= p2*log2(p2);
+		}
+		entropy1 = (entropy1*total+entropy2*(size-total))/size;
+		if(ret.eval>entropy1){
+			ret.eval = entropy1;
+			ret.value = c;
+			ret.left = total;
+		}
+	}
+	return ret;
+}
+
+minEval giniSparseIncremental(long sizeTotal, long classes, double* newSortedData, long* newSortedResult, long* T){
+	long l, r, i, j;
+	minEval ret;
+	ret.eval = DBL_MAX;
+	double gini1, gini2;
+	long count[classes];
+	long total 
= 0; + for(i=0; igini1){ + ret.eval = gini1; + ret.value = c; + } + } + return ret; +} + +minEval entropySparseIncremental(long sizeTotal, long classes, double* newSortedData, long* newSortedResult, long* T){ + long l, r, i, j; + minEval ret; + ret.eval = DBL_MAX; + double e1, e2; + long count[classes]; + long total = 0; + for(i=0; ie1){ + ret.eval = e1; + ret.value = c; + } + } + return ret; +} + +minEval giniDense(long max, long size, long classes, long** rem, long* d, double* record, long* totalT){ + minEval ret; + ret.eval = DBL_MAX; + + double gini1, gini2; + long *t, *t2, *r, *r2, i, j; + for(i=0;i0){ + t2 = rem[d[i-1]]; + for(j=0;j<=classes;j++){ + t[j]+=t2[j]; + } + } + if(t[classes]>=size)break; + gini1 = 1.0; + gini2 = 1.0; + for(j=0;j0){ + t2 = rem[d[i-1]]; + for(j=0;j<=classes;j++){ + t[j]+=t2[j]; + } + } + if(t[classes]>=size)break; + entropy1 = 0; + entropy2 = 0; + for(j=0;j +#include +#include + +struct DT{ + int height; + long* featureId; + DT* left = nullptr; + DT* right = nullptr; + + // split info + bool terminate; + double dpoint; + long feature; + long result; + + // Sparse data record + double** sortedData; // for each feature, sorted data + long** sortedResult; + + // Dense data record + long*** count = nullptr;// for each feature, number of data belongs to each class and dense value + double** record = nullptr;// for each feature, record each dense data + long* max = nullptr;// number of dense value of each feature + + //long* T; // number of data in each class in this node + double** dataRecord = nullptr;// Record the data + long* resultRecord = nullptr;// Record the result + long size = 0;// Size of the dataset +}; + +RandomForest::RandomForest(long mTree, long actTree, long rTime, int h, long feature, int* s, double forg, long maxF, long noC, Evaluation eval, long r, long rb){ + srand((long)clock()); + Rebuild = rb; + if(actTree<1)actTree=1; + noTree = actTree; + activeTree = actTree; + treePointer = 0; + if(mTreefit(newData, newResult, size); + } + timer++; +} + +long* RandomForest::fitThenPredict(double** trainData, long* trainResult, long trainSize, double** testData, long testSize){ + fit(trainData, trainResult, trainSize); + long* testResult = (long*)malloc(testSize*sizeof(long)); + for(long i=0; iFree(); + delete DTrees[(treePointer+activeTree)%maxTree]; + }else{ + noTree++; + } + DTrees[(treePointer+activeTree)%maxTree] = new DecisionTree(height, f, sparse, forget, maxFeature, noClasses, e, retain, Rebuild); + long size = DTrees[(treePointer+activeTree-1)%maxTree]->DTree->size; + double** newData = new double*[size]; + long* newResult = new long[size]; + for(long j = 0; jDTree->dataRecord[j][k]; + } + newResult[j] = DTrees[(treePointer+activeTree-1)%maxTree]->DTree->resultRecord[j]; + } + + DTrees[(treePointer+activeTree)%maxTree]->fit(newData, newResult, size); + DTrees[treePointer]->Stablelize(); + if(++treePointer==maxTree)treePointer=0; +} + + +long RandomForest::Test(double* data){ + long i; + long predict[noClasses]; + for(i=0; iTest(data, DTrees[i]->DTree)]++; + } + + long ret = 0; + for(i=1; ipredict[ret])ret = i; + } + + return ret; +} diff --git a/sdk/RF.h b/sdk/RF.h new file mode 100644 index 0000000..d0eee67 --- /dev/null +++ b/sdk/RF.h @@ -0,0 +1,46 @@ +#ifndef RF_H +#define RF_H + +#include "DecisionTree.h" + +struct minEval; + +struct DR; + +struct DT; + +//enum Evaluation {gini, entropy, logLoss}; + +class RandomForest{ +public: + +long noTree; +long maxTree; +long activeTree; +long treePointer; +long rotateTime; +long timer; +long retain; 
+DecisionTree** DTrees = nullptr; + +long height; +long Rebuild; +long f; +int* sparse; +double forget; +long maxFeature; +long noClasses; +Evaluation e; + + +RandomForest(long maxTree, long activeTree, long rotateTime, int height, long f, int* sparse, double forget, long maxFeature=0, long noClasses=2, Evaluation e=Evaluation::gini, long r=-1, long rb=2147483647); + +void fit(double** data, long* result, long size); + +long* fitThenPredict(double** trainData, long* trainResult, long trainSize, double** testData, long testSize); + +void Rotate(); + +long Test(double* data); +}; +#endif diff --git a/sdk/aquery.h b/sdk/aquery.h index 15848f9..261b204 100644 --- a/sdk/aquery.h +++ b/sdk/aquery.h @@ -86,11 +86,26 @@ __AQEXPORT__(void) init_session(Context* cxt); #else void* memcpy(void*, const void*, unsigned long long); #endif + +struct vectortype_storage{ + void* container = nullptr; + unsigned int size = 0, capacity = 0; + vectortype_storage(void* container, unsigned int size, unsigned int capacity) : + container(container), size(size), capacity(capacity) {} + vectortype_storage() = default; + template class VT> + vectortype_storage(const VT& vt) { + memcpy(this, &vt, sizeof(vectortype_storage)); + } +}; struct ColRef_storage { - void* container; - unsigned int capacity, size; - const char* name; - int ty; // what if enum is not int? + void* container = nullptr; + unsigned int size = 0, capacity = 0; + const char* name = nullptr; + int ty = 0; // what if enum is not int? + ColRef_storage(void* container, unsigned int size, unsigned int capacity, const char* name, int ty) : + container(container), size(size), capacity(capacity), name(name), ty(ty) {} + ColRef_storage() = default; template class VT> ColRef_storage(const VT& vt) { memcpy(this, &vt, sizeof(ColRef_storage)); diff --git a/sdk/incrementalDecisionTree.cpp b/sdk/incrementalDecisionTree.cpp new file mode 100644 index 0000000..47560be --- /dev/null +++ b/sdk/incrementalDecisionTree.cpp @@ -0,0 +1,774 @@ +#include +#include +#include +#include +#include "DecisionTree.h" +#include "Evaluation.h" +#include +#include +#include + +struct minEval{ + double value; + int* values; + + double eval; + long left; // how many on its left + double* record; + long max; + long** count; +}; + +struct DT{ + int height; + long* featureId; + DT* left = nullptr; + DT* right = nullptr; + + // split info + bool terminate; + double dpoint; + long feature; + long result; + + // Sparse data record + double** sortedData; // for each feature, sorted data + long** sortedResult; + + // Dense data record + long*** count = nullptr;// for each feature, number of data belongs to each class and dense value + double** record = nullptr;// for each feature, record each dense data + long* max = nullptr;// number of dense value of each feature + + //long* T; // number of data in each class in this node + double** dataRecord = nullptr;// Record the data + long* resultRecord = nullptr;// Record the result + long size = 0;// Size of the dataset +}; +long seed = (long)clock(); +long* Rands(long feature, long maxFeature){ + //srand(seed++); + long i; + long* ret = (long*) malloc(feature*sizeof(long)); + for(i =0; icount = (long***)malloc(f*sizeof(long**)); + for(i=0; icount[i]=nullptr; + t->record = (double**)malloc(f*sizeof(double*)); + for(i=0; irecord[i]=nullptr; + t->max = (long*)malloc(f*sizeof(long)); + t->max[0] = -1; + t->featureId = Rands(f, maxF); + //t->T = (long*)malloc(classes*sizeof(long)); + t->sortedData = (double**) malloc(f*sizeof(double*)); + for(i=0; 
isortedData[i]=nullptr; + t->sortedResult = (long**) malloc(f*sizeof(long*)); + for(i=0; isortedResult[i]=nullptr; + t->dataRecord = nullptr; + t->resultRecord = nullptr; + t->height = currentHeight; + t->feature = -1; + t->size = 0; + if(currentHeight>height){ + t->right = nullptr; + t->left = nullptr; + return; + } + + t->left = (DT*)malloc(sizeof(DT)); + t->right = (DT*)malloc(sizeof(DT)); + createTree(t->left, currentHeight+1, height, f, maxF, classes); + createTree(t->right, currentHeight+1, height, f, maxF, classes); +} + +void stableTree(DT* t, long f){ + long i, j; + for(i=0; icount[i]==nullptr)continue; + for(j=0; jmax[i]; j++){ + free(t->count[i][j]); + } + free(t->count[i]); + } + free(t->count); + for(i=0; irecord[i]==nullptr)continue; + free(t->record[i]); + } + free(t->record); + free(t->max); + free(t->featureId); + for(i=0; isortedData[i]==nullptr)continue; + free(t->sortedData[i]); + } + free(t->sortedData); + for(i=0; isortedResult[i]==nullptr)continue; + free(t->sortedResult[i]); + } + free(t->sortedResult); + free(t->dataRecord); + free(t->resultRecord); + if(t->right!=nullptr)stableTree(t->right, f); + if(t->left!=nullptr)stableTree(t->left, f); +} + +void freeTree(DT* t){ + if(t->left != nullptr)freeTree(t->left); + if(t->right != nullptr)freeTree(t->right); + free(t); +} + +DecisionTree::DecisionTree(int height, long f, int* sparse, double forget=0.1, long maxF=0, long noClasses=2, Evaluation e=Evaluation::gini, long r=-1, long rb=1){ + evalue = e; + called = 0; + long i; + // Max tree height + maxHeight = height; + // Number of features + feature = f; + // If each feature is sparse or dense, 0 for dense, 1 for sparse, >2 for number of category + Sparse = (int*)malloc(f*sizeof(int)); + for(i = 0; ifeature = -1; + // The number of feature that is considered in each node + if(maxF>=f){ + maxFeature = f; + }else if(maxF<=0){ + maxFeature = (long)round(sqrt(f)); + }else{ + maxFeature = maxF; + } + forgetRate = std::min(1.0, forget); + retain = r; + createTree(DTree, 0, maxHeight, f, maxFeature, noClasses); + // Randomly generate the features + //DTree->featureId = Rands(); + //DTree->sorted = (long**) malloc(f*sizeof(long*)); + // Number of classes of this dataset + Rebuild = rb; + roundNo = 0; + classes = std::max(noClasses, (long)2); + //DTree->T = (long*) malloc(noClasses*sizeof(long)); + /*for(long i = 0; iT[i]=0; + }*/ +} + +void DecisionTree::Stablelize(){ + free(Sparse); + stableTree(DTree, feature); +} + +void DecisionTree::Free(){ + freeTree(DTree); +} + +minEval DecisionTree::incrementalMinGiniSparse(double** dataTotal, long* resultTotal, long sizeTotal, long sizeNew, DT* current, long col, long forgetSize, bool isRoot){ + long i, j; + if(isRoot){sizeNew=sizeTotal-forgetSize;} + long newD[sizeNew]; + for(i=0; isortedData[col]; + long* oldResult = current->sortedResult[col]; + for(i=0; isortedData[col] = newSortedData; + current->sortedResult[col] = newSortedResult; + free(oldData); + free(oldResult); + + minEval ret; + if(evalue == Evaluation::gini){ + ret = giniSparseIncremental(sizeTotal, classes, newSortedData, newSortedResult, T); + }else if(evalue == Evaluation::entropy or evalue == Evaluation::logLoss){ + ret = entropySparseIncremental(sizeTotal, classes, newSortedData, newSortedResult, T); + } + ret.values = nullptr; + return ret; +} +minEval DecisionTree::incrementalMinGiniDense(double** data, long* result, long size, long col, long*** count, double** record, long* max, long newSize, long forgetSize, bool isRoot){ + // newSize is before forget + long 
low = 0; + if(isRoot)size=newSize-forgetSize; + long i, j, k; + long newMax = 0; + long maxLocal = max[col]; + long **newCount=(long**)malloc(size*sizeof(long*)); + for(i=0;icurrentMinMax){ + currentMinMax = record[col][j]; + for(k=0;k<=classes;k++)newCount[newMax][k]=count[col][j][k]; + } + } + for(j=0;j0){ + max[col]+=newMax; + long** updateCount = (long**)malloc(max[col]*sizeof(long*)); + double* updateRecord = (double*)malloc(max[col]*sizeof(double)); + for(i=0; i=newMax){ + updateCount[i] = count[col][i-newMax]; + updateRecord[i] = record[col][i-newMax]; + } + else{ + updateCount[i] = newCount[i]; + updateRecord[i] = newRecord[i]; + } + } + free(count[col]); + free(record[col]); + count[col]=updateCount; + record[col]=updateRecord; + } + for(i=newMax; isortedData[col] != nullptr)free(current->sortedData[col]); + if(current->sortedResult[col] != nullptr)free(current->sortedResult[col]); + current->sortedData[col] = (double*) malloc(size*sizeof(double)); + current->sortedResult[col] = (long*) malloc(size*sizeof(long)); + for(i=0;isortedData[col][i] = data[d[i]][col]; + current->sortedResult[col][i] = result[d[i]]; + } + free(d); + ret.values = nullptr; + return ret; + +} + +minEval DecisionTree::findMinGiniDense(double** data, long* result, long* totalT, long size, long col){ + long low = 0; + long i, j, k, max=0; + long** count = (long**)malloc(size*sizeof(long*)); + // size2 and count2 are after forget + double* record = (double*)malloc(size*sizeof(double)); + bool find; + for(i=0;isize==0){ + Update(data, result, size, DTree); + }else{ + IncrementalUpdate(data, result, size, DTree); + } + /* + if(Rebuild and called==10){ + called = 0; + Rebuild = false; + }else if(Rebuild){ + called = 11; + }else{ + called++; + }*/ +} + +long* DecisionTree::fitThenPredict(double** trainData, long* trainResult, long trainSize, double** testData, long testSize){ + fit(trainData, trainResult, trainSize); + long* testResult = (long*)malloc(testSize*sizeof(long)); + for(long i=0; i0 and current->size+size>retain) forgetSize = std::min(current->size+size - retain, current->size); + else if(retain<0) forgetSize = (long)current->size*forgetRate; + long* index = new long[current->size]; + double** dataNew; + long* resultNew; + if(current->height == 0){ + dataNew = (double**)malloc((size+current->size-forgetSize)*sizeof(double*)); + resultNew = (long*)malloc((size+current->size-forgetSize)*sizeof(long)); + for(i=0;isize; i++){ + index[i] = i; + } + std::random_shuffle(index, index+current->size); + long x = 0; + for(i=0;isize;i++){ + if(i>=current->size-forgetSize){ + current->dataRecord[index[i]][feature-1] = DBL_MAX; + + }else{ + dataNew[i+size] = current->dataRecord[index[i]]; + resultNew[i+size] = current->resultRecord[index[i]]; + } + } + }else{ + forgetSize = 0; + dataNew = (double**)malloc((size+current->size)*sizeof(double*)); + resultNew = (long*)malloc((size+current->size)*sizeof(long)); + for(i=0;isize;i++){ + if(current->dataRecord[i][feature-1] == DBL_MAX){ + forgetSize++; + continue; + }else{ + dataNew[i+size-forgetSize] = current->dataRecord[i]; + resultNew[i+size-forgetSize] = current->resultRecord[i]; + } + } + } + free(data); + free(result); + current->size -= forgetSize; + current->size += size; + // end condition + if(current->terminate or roundNo%Rebuild==0){ + if(current->height == 0){ + for(i=0; idataRecord[index[current->size-size+i]]); + } + } + delete(index); + Update(dataNew, resultNew, current->size, current); + return; + } + // find min gini + minEval c, cMin; + long cFeature; + 
cMin.eval = DBL_MAX; + cMin.values = nullptr; + // TODO + for(i=0;ifeatureId[i]]==1){ + c = incrementalMinGiniSparse(dataNew, resultNew, current->size+forgetSize, size, current, current->featureId[i], forgetSize, false); + } + else if(Sparse[current->featureId[i]]==0){ + c = incrementalMinGiniDense(dataNew, resultNew, size, current->featureId[i], current->count, current->record, current->max, current->size+forgetSize, forgetSize, false); + }else{ + //c = incrementalMinGiniCategorical(); + } + if(c.evalfeatureId[i]; + }else if(c.values!=nullptr)free(c.values); + } + if(cMin.eval==DBL_MAX){ + current->terminate = true; + long t[classes]; + for(i=0;iresult = std::distance(t, std::max_element(t, t+classes)); + return; + } + //diverse data + long ptL=0, ptR=0; + double* t; + long currentSize = current->size; + //TODO:Discrete + // Same diverse point as last time + if(current->dpoint==cMin.value and current->feature==cFeature){ + long xxx = current->left->size; + /*for(i=0; ifeature]<=current->dpoint){ + ptL++; + }else{ + ptR++; + } + }*/ + ptL = size; + ptR = size; + long* resultL = (long*)malloc((ptL)*sizeof(long)); + long* resultR = (long*)malloc((ptR)*sizeof(long)); + double** dataL = (double**)malloc((ptL)*sizeof(double*)); + double** dataR = (double**)malloc((ptR)*sizeof(double*)); + ptL = 0; + ptR = 0; + for(i=0; ifeature]<=current->dpoint){ + dataL[ptL] = dataNew[i]; + resultL[ptL] = resultNew[i]; + ptL++; + }else{ + dataR[ptR] = dataNew[i]; + resultR[ptR] = resultNew[i]; + ptR++; + } + } + IncrementalUpdate(dataL, resultL, ptL, current->left); + IncrementalUpdate(dataR, resultR, ptR, current->right); + + if(current->height == 0){ + for(i=0; idataRecord[index[current->size-size+i]]); + } + } + delete(index); + free(current->dataRecord); + free(current->resultRecord); + current->dataRecord = dataNew; + current->resultRecord = resultNew; + return; + } + + // Different diverse point + current->feature = cFeature; + current->dpoint = cMin.value; + /*for(i=0; ifeature]<=current->dpoint){ + ptL++; + }else{ + ptR++; + } + }*/ + long* resultL = (long*)malloc(currentSize*sizeof(long)); + long* resultR = (long*)malloc(currentSize*sizeof(long)); + double** dataL = (double**)malloc(currentSize*sizeof(double*)); + double** dataR = (double**)malloc(currentSize*sizeof(double*)); + ptL = 0; + ptR = 0; + for(i=0; ifeature]<=current->dpoint){ + dataL[ptL] = dataNew[i]; + resultL[ptL] = resultNew[i]; + ptL++; + }else{ + dataR[ptR] = dataNew[i]; + resultR[ptR] = resultNew[i]; + ptR++; + } + } + Update(dataL, resultL, ptL, current->left); + Update(dataR, resultR, ptR, current->right); + + if(current->height == 0){ + for(i=0; idataRecord[index[current->size-size+i]]); + } + } + + delete(index); + free(current->dataRecord); + free(current->resultRecord); + current->dataRecord = dataNew; + current->resultRecord = resultNew; +} +void DecisionTree::Update(double** data, long* result, long size, DT* current){ + long low = 0; + long i, j; + // end condition + if(current->dataRecord!=nullptr)free(current->dataRecord); + current->dataRecord = data; + if(current->resultRecord!=nullptr)free(current->resultRecord); + current->resultRecord = result; + current->size = size; + if(current->height == maxHeight){ + current->terminate = true; + long t[classes]; + for(i=0;iresult = std::distance(t, std::max_element(t, t+classes)); + return; + } + long T[classes]; + for(i=0;iterminate = true; + current->result = i; + return; + } + } + // find min Evaluation + minEval c, cMin; + long cFeature, oldMax, col, left=0; + cMin.eval = 
DBL_MAX;
+	cMin.values = nullptr;
+	//TODO
+	for(i=0;i<maxFeature;i++){
+		col = current->featureId[i];
+		if(Sparse[current->featureId[i]]==1){
+			c = findMinGiniSparse(data, result, T, size, col, current);
+		}
+		else if(Sparse[current->featureId[i]]==0){
+			c = findMinGiniDense(data, result, T, size, col);
+			if(current->count[col]!=nullptr){
+				for(j=0; j<current->max[col]; j++){
+					if(current->count[col][j]!=nullptr)free(current->count[col][j]);
+				}
+				free(current->count[col]);
+				free(current->record[col]);
+			}
+			current->count[col] = c.count;
+			current->record[col] = c.record;
+			current->max[col] = c.max;
+		}else{
+
+		}
+		if(c.eval<cMin.eval){
+			cMin = c;
+			cFeature = current->featureId[i];
+			left = c.left;
+		}else if(c.values!=nullptr){
+			free(c.values);
+		}
+	}
+	if(cMin.eval == DBL_MAX){
+		current->terminate = true;
+		long max = 0;
+		for(i=1;i<classes;i++){
+			if(T[i]>T[max])max = i;
+		}
+		current->result = max;
+		return;
+	}
+	//diverse data
+	current->terminate = false;
+	current->feature = cFeature;
+	current->dpoint = cMin.value;
+	long ptL=0, ptR=0;
+	//TODO:Discrete
+	long* resultL = new long[left];
+	long* resultR = new long[size-left];
+	double** dataL = new double*[left];
+	double** dataR = new double*[size-left];
+	for(i=low; i<size; i++){
+		if(data[i][current->feature]<=current->dpoint){
+			dataL[ptL] = data[i];
+			resultL[ptL] = result[i];
+			ptL++;
+		}else{
+			dataR[ptR] = data[i];
+			resultR[ptR] = result[i];
+			ptR++;
+		}
+	}
+	Update(dataL, resultL, ptL, current->left);
+	Update(dataR, resultR, ptR, current->right);
+}
+
+long DecisionTree::Test(double* data, DT* root){
+	if(root->terminate)return root->result;
+	if(data[root->feature]<=root->dpoint)return Test(data, root->left);
+	return Test(data, root->right);
+}
+
+void DecisionTree::print(DT* root){
+	int x;
+	//std::cin>>x;
+	if(root->terminate){
+		printf("%ld", root->result);
+		return;
+	}
+	printf("([%ld, %f]:", root->feature, root->dpoint);
+	print(root->left);
+	printf(", ");
+	print(root->right);
+	printf(")");
+}
diff --git a/sdk/irf.cpp b/sdk/irf.cpp
new file mode 100644
index 0000000..8089c9f
--- /dev/null
+++ b/sdk/irf.cpp
@@ -0,0 +1,53 @@
+#include "DecisionTree.h"
+#include "aquery.h"
+// __AQ_NO_SESSION__
+#include "../server/table.h"
+
+DecisionTree* dt = nullptr;
+long pt = 0;
+double** data = nullptr;
+long* result = nullptr;
+
+__AQEXPORT__(bool) newtree(int height, long f, ColRef<int> sparse, double forget, long maxf, long noclasses, Evaluation e, long r, long rb){
+	if(sparse.size!=f)return 0;
+	int* issparse = (int*)malloc(f*sizeof(int));
+	for(long i=0; i<f; i++)
+		issparse[i] = sparse.container[i];
+	dt = new DecisionTree(height, f, issparse, forget, maxf, noclasses, e, r, rb);
+	return 1;
+}
+
+__AQEXPORT__(bool) additem(ColRef<double> X, long y, long size){
+	long j = 0;
+	if(size>0){
+		free(data);
+		free(result);
+		pt = 0;
+		data=(double**)malloc(size*sizeof(double*));
+		result=(long*)malloc(size*sizeof(long));
+	}
+	data[pt] = (double*)malloc(X.size*sizeof(double));
+	for(j=0; j<X.size; j++)
+		data[pt][j] = X.container[j];
+	result[pt] = y;
+	pt++;
+	return 1;
+}
+
+__AQEXPORT__(bool) fit(){
+	dt->fit(data, result, pt);
+	return 1;
+}
+
+__AQEXPORT__(ColRef_storage) predict(){
+	int* result = (int*)malloc(pt*sizeof(int));
+	for(long i=0; i<pt; i++){
+		result[i] = dt->Test(data[i], dt->DTree);
+	}
+	ColRef_storage ret(result, pt, pt, "prediction", 0);
+	return ret;
+}
+
+
diff --git a/tests/modules.a b/tests/modules.a
--- a/tests/modules.a
+++ b/tests/modules.a
@@ -1,8 +1,22 @@
-LOAD MODULE FROM "./test.so"
+LOAD MODULE FROM "./libirf.so"
 FUNCTIONS (
- mydiv(a:int, b:int) -> double,
- mulvec(a:int, b:vecfloat) -> vecfloat
+ newtree(height:int, f:int64, 
sparse:vecint, forget:double, maxf:int64, noclasses:int64, e:int, r:int64, rb:int64) -> bool, + additem(X:vecdouble, y:int64, size:int64) -> bool, + fit() -> bool, + predict() -> vecint ); -select mydiv(2,3); - +create table tb(x int); +create table tb2(x double, y double, z double); +insert into tb values (0); +insert into tb values (0); +insert into tb values (0); +select newtree(5, 3, tb.x, 0, 3, 2, 0, 100, 1) from tb; +insert into tb2 values (1, 0, 1); +insert into tb2 values (0, 1, 1); +insert into tb2 values (1, 1, 1); +select additem(tb2.x, 1, 3) from tb2; +select additem(tb2.y, 0, -1) from tb2; +select additem(tb2.z, 1, -1) from tb2; +select fit(); +select predict();
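
For reference, a minimal standalone driver for the new SDK classes, based only on the signatures declared in sdk/RF.h and sdk/DecisionTree.h above. The toy dataset, all parameter values, and the file/command names below are illustrative assumptions, not anything this patch ships:

#include "RF.h"
#include <cstdio>
#include <cstdlib>

int main(){
	const long n = 6, f = 2;
	// 0 marks a dense feature, 1 a sparse one (see the Sparse comment in
	// the DecisionTree constructor); both features are dense here.
	int sparse[2] = {0, 0};

	// The trees keep and later free the buffers handed to fit()
	// (DecisionTree::Update stores them in dataRecord/resultRecord),
	// so the dataset is heap-allocated, mirroring what sdk/irf.cpp does.
	double** data = (double**)malloc(n*sizeof(double*));
	long* labels = (long*)malloc(n*sizeof(long));
	for(long i=0; i<n; i++){
		data[i] = (double*)malloc(f*sizeof(double));
		data[i][0] = i/(double)n;        // rises with i
		data[i][1] = (n-i)/(double)n;    // falls with i
		labels[i] = data[i][0] > 0.5 ? 1 : 0;
	}

	// maxTree=4, activeTree=2, rotateTime=8, height=5, f features,
	// forget=0.1; the remaining parameters keep the defaults from RF.h
	// (maxFeature=0 -> round(sqrt(f)), noClasses=2, gini, r=-1, rb=2147483647).
	RandomForest rf(4, 2, 8, 5, f, sparse, 0.1);
	rf.fit(data, labels, n);

	double probe[2] = {0.9, 0.1};
	printf("predicted class: %ld\n", rf.Test(probe));
	return 0;
}

Something along the lines of `c++ -std=c++14 rf_demo.cpp RF.cpp incrementalDecisionTree.cpp Evaluation.cpp -o rf_demo` should link it from inside sdk/, since DecisionTree's member functions live in incrementalDecisionTree.cpp and the impurity helpers in Evaluation.cpp; through AQuery itself, the same flow is exercised by the newtree/additem/fit/predict calls in tests/modules.a above.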