// NOTE(review): this text appears to be extraction-damaged. The source is
// collapsed onto seven physical lines, the system #include directives have lost
// their targets, and many spans that would begin at a '<' and end at a later '>'
// (loop bounds, comparisons, "ptr->" prefixes, in places whole function bodies
// and signatures) are missing. With the original line breaks gone, every "//"
// comment also swallows the remainder of its physical line. Restore the file
// from version control before compiling or changing logic; the notes added here
// only map what is still visible and flag items worth checking.
//
// Physical line 1, in order of appearance:
//  - #include directives; only "DecisionTree.h" and "Evaluation.h" keep a target.
//  - File-scope std::random_device `rd` and std::mt19937 `g` seeded from it.
//  - struct minEval: value, values, eval, left, record, max, count.
//  - struct DT, one tree node: height, featureId, left/right children, the split
//    fields (terminate, dpoint, feature, result), per-feature sortedData /
//    sortedResult, per-feature count / record / max, and the node's
//    dataRecord / resultRecord / size.
//  - Global `seed`, initialised from clock(); in the visible text it is only
//    referenced by a commented-out srand call.
//  - Rands(feature, maxFeature): mallocs `feature` longs; the rest of its body is
//    missing.
//  - The tail of what is presumably createTree (signature missing; the name is
//    taken from the recursive calls): mallocs the f-entry count, record, max,
//    sortedData and sortedResult arrays, sets max[0] to -1, takes featureId from
//    Rands(f, maxF), clears dataRecord / resultRecord, stores the height, sets
//    feature to -1 and size to 0. When currentHeight > height both children are
//    set to nullptr; otherwise both children are malloc'ed and built recursively
//    at currentHeight+1.
//    NOTE(review): nodes come from malloc, so DT's default member initialisers
//    do not run, and the visible text never assigns terminate, dpoint or result
//    here - confirm they are always written before being read.
#include #include #include #include #include "DecisionTree.h" #include "Evaluation.h" #include #include #include #include std::random_device rd; std::mt19937 g(rd()); struct minEval{ double value; int* values; double eval; long left; // how many on its left double* record; long max; long** count; }; struct DT{ int height; long* featureId; DT* left = nullptr; DT* right = nullptr; // split info bool terminate; double dpoint; long feature; long result; // Sparse data record double** sortedData; // for each feature, sorted data long** sortedResult; // Dense data record long*** count = nullptr;// for each feature, number of data belongs to each class and dense value double** record = nullptr;// for each feature, record each dense data long* max = nullptr;// number of dense value of each feature //long* T; // number of data in each class in this node double** dataRecord = nullptr;// Record the data long* resultRecord = nullptr;// Record the result long size = 0;// Size of the dataset }; long seed = (long)clock(); long* Rands(long feature, long maxFeature){ //srand(seed++); long i; long* ret = (long*) malloc(feature*sizeof(long)); for(i =0; icount = (long***)malloc(f*sizeof(long**)); for(i=0; icount[i]=nullptr; t->record = (double**)malloc(f*sizeof(double*)); for(i=0; irecord[i]=nullptr; t->max = (long*)malloc(f*sizeof(long)); t->max[0] = -1; t->featureId = Rands(f, maxF); //t->T = (long*)malloc(classes*sizeof(long)); t->sortedData = (double**) malloc(f*sizeof(double*)); for(i=0; isortedData[i]=nullptr; t->sortedResult = (long**) malloc(f*sizeof(long*)); for(i=0; isortedResult[i]=nullptr; t->dataRecord = nullptr; t->resultRecord = nullptr; t->height = currentHeight; t->feature = -1; t->size = 0; if(currentHeight>height){ t->right = nullptr; t->left = nullptr; return; } t->left = (DT*)malloc(sizeof(DT)); t->right = (DT*)malloc(sizeof(DT)); createTree(t->left, currentHeight+1, height, f, maxF, classes); createTree(t->right, currentHeight+1, height, f, maxF, classes); } void 
// Physical line 2, in order of appearance:
//  - stableTree(t, f) (its `void` sits at the end of line 1): frees a node's
//    count rows (inner arrays first, null rows skipped), record rows, max,
//    featureId, sortedData rows, sortedResult rows, dataRecord and resultRecord,
//    then recurses into the right and left children. The loop headers are
//    damaged. The freed pointers are not reset afterwards.
//  - freeTree(t): frees the left subtree, the right subtree, then the node.
//  - DecisionTree constructor: stores e, height, f, r and rb; mallocs Sparse
//    with f ints (the copy loop and whatever follows it up to "feature = -1" are
//    damaged); picks maxFeature = f when maxF >= f, round(sqrt(f)) when
//    maxF <= 0, otherwise maxF; sets forgetRate = min(1.0, forget); builds the
//    tree with createTree; sets classes = max(noClasses, 2).
//    NOTE(review): the default arguments are written on this out-of-class
//    definition - confirm DecisionTree.h does not specify them as well.
//  - Stablelize(): frees Sparse, then runs stableTree on DTree.
//  - Free(): runs freeTree on DTree.
//  - The start of incrementalMinGiniSparse's parameter list (continues below).
stableTree(DT* t, long f){ long i, j; for(i=0; icount[i]==nullptr)continue; for(j=0; jmax[i]; j++){ free(t->count[i][j]); } free(t->count[i]); } free(t->count); for(i=0; irecord[i]==nullptr)continue; free(t->record[i]); } free(t->record); free(t->max); free(t->featureId); for(i=0; isortedData[i]==nullptr)continue; free(t->sortedData[i]); } free(t->sortedData); for(i=0; isortedResult[i]==nullptr)continue; free(t->sortedResult[i]); } free(t->sortedResult); free(t->dataRecord); free(t->resultRecord); if(t->right!=nullptr)stableTree(t->right, f); if(t->left!=nullptr)stableTree(t->left, f); } void freeTree(DT* t){ if(t->left != nullptr)freeTree(t->left); if(t->right != nullptr)freeTree(t->right); free(t); } DecisionTree::DecisionTree(int height, long f, int* sparse, double forget=0.1, long maxF=0, long noClasses=2, Evaluation e=Evaluation::gini, long r=-1, long rb=1){ evalue = e; called = 0; long i; // Max tree height maxHeight = height; // Number of features feature = f; // If each feature is sparse or dense, 0 for dense, 1 for sparse, >2 for number of category Sparse = (int*)malloc(f*sizeof(int)); for(i = 0; ifeature = -1; // The number of feature that is considered in each node if(maxF>=f){ maxFeature = f; }else if(maxF<=0){ maxFeature = (long)round(sqrt(f)); }else{ maxFeature = maxF; } forgetRate = std::min(1.0, forget); retain = r; createTree(DTree, 0, maxHeight, f, maxFeature, noClasses); // Randomly generate the features //DTree->featureId = Rands(); //DTree->sorted = (long**) malloc(f*sizeof(long*)); // Number of classes of this dataset Rebuild = rb; roundNo = 0; classes = std::max(noClasses, (long)2); //DTree->T = (long*) malloc(noClasses*sizeof(long)); /*for(long i = 0; iT[i]=0; }*/ } void DecisionTree::Stablelize(){ free(Sparse); stableTree(DTree, feature); } void DecisionTree::Free(){ freeTree(DTree); } minEval DecisionTree::incrementalMinGiniSparse(double** dataTotal, long* resultTotal, long sizeTotal, long sizeNew, DT* current, long col, long forgetSize, 
// Physical line 3, in order of appearance:
//  - Rest of incrementalMinGiniSparse: when isRoot, sizeNew becomes
//    sizeTotal - forgetSize; `long newD[sizeNew]` is a variable-length array
//    (a compiler extension in C++); the middle of the body is damaged; the
//    node's sortedData[col] / sortedResult[col] are replaced by newSortedData /
//    newSortedResult and the old arrays freed; the score comes from
//    giniSparseIncremental or entropySparseIncremental depending on evalue;
//    ret.values is set to nullptr before returning.
//    NOTE(review): `ret` is left unassigned when evalue matches neither branch.
//  - incrementalMinGiniDense: when isRoot, size becomes newSize - forgetSize;
//    most of the body is damaged. The visible remainder appears to grow
//    count[col] / record[col] by newMax entries, with the new entries placed
//    ahead of the existing ones, and frees the previous arrays.
//  - A fragment that uses `current` and `d`, which are not parameters of
//    incrementalMinGiniDense - presumably the tail of findMinGiniSparse (Update
//    calls it with a `current` argument): it refreshes
//    current->sortedData[col] / sortedResult[col] from data and result in the
//    order given by d, frees d, and returns ret with values set to nullptr.
//  - The start of findMinGiniDense's signature (continues below).
bool isRoot){ long i, j; if(isRoot){sizeNew=sizeTotal-forgetSize;} long newD[sizeNew]; for(i=0; isortedData[col]; long* oldResult = current->sortedResult[col]; for(i=0; isortedData[col] = newSortedData; current->sortedResult[col] = newSortedResult; free(oldData); free(oldResult); minEval ret; if(evalue == Evaluation::gini){ ret = giniSparseIncremental(sizeTotal, classes, newSortedData, newSortedResult, T); }else if(evalue == Evaluation::entropy or evalue == Evaluation::logLoss){ ret = entropySparseIncremental(sizeTotal, classes, newSortedData, newSortedResult, T); } ret.values = nullptr; return ret; } minEval DecisionTree::incrementalMinGiniDense(double** data, long* result, long size, long col, long*** count, double** record, long* max, long newSize, long forgetSize, bool isRoot){ // newSize is before forget long low = 0; if(isRoot)size=newSize-forgetSize; long i, j, k; long newMax = 0; long maxLocal = max[col]; long **newCount=(long**)malloc(size*sizeof(long*)); for(i=0;icurrentMinMax){ currentMinMax = record[col][j]; for(k=0;k<=classes;k++)newCount[newMax][k]=count[col][j][k]; } } for(j=0;j0){ max[col]+=newMax; long** updateCount = (long**)malloc(max[col]*sizeof(long*)); double* updateRecord = (double*)malloc(max[col]*sizeof(double)); for(i=0; i=newMax){ updateCount[i] = count[col][i-newMax]; updateRecord[i] = record[col][i-newMax]; } else{ updateCount[i] = newCount[i]; updateRecord[i] = newRecord[i]; } } free(count[col]); free(record[col]); count[col]=updateCount; record[col]=updateRecord; } for(i=newMax; isortedData[col] != nullptr)free(current->sortedData[col]); if(current->sortedResult[col] != nullptr)free(current->sortedResult[col]); current->sortedData[col] = (double*) malloc(size*sizeof(double)); current->sortedResult[col] = (long*) malloc(size*sizeof(long)); for(i=0;isortedData[col][i] = data[d[i]][col]; current->sortedResult[col][i] = result[d[i]]; } free(d); ret.values = nullptr; return ret; } minEval DecisionTree::findMinGiniDense(double** data, long* 
// Physical line 4, in order of appearance:
//  - findMinGiniDense: allocates `count` (size pointers) and `record` (size
//    doubles); the rest of its body is missing.
//  - A fragment, presumably the body of fit (fitThenPredict calls fit with the
//    same three arguments): calls Update when DTree->size is 0, otherwise
//    IncrementalUpdate.
//  - fitThenPredict: fits on the training set and mallocs testSize longs for
//    the predictions; the prediction loop and the return are missing.
//  - A fragment, presumably the opening of IncrementalUpdate (name taken from
//    the recursive calls further down): computes forgetSize from retain (capped
//    at current->size) or, when retain < 0, from current->size * forgetRate.
//    At height 0 it shuffles the indices of the stored records, marks the last
//    forgetSize of them by writing DBL_MAX into column feature-1, and copies
//    the kept ones into dataNew / resultNew at offset size. At deeper nodes it
//    skips records carrying that DBL_MAX mark and counts them as forgotten.
//    It then frees data and result, adjusts current->size, and opens the
//    rebuild branch taken when the node is terminal or roundNo % Rebuild is 0.
//    NOTE(review): `index` comes from new[] but is released with
//    `delete(index)` further down; new[] storage needs delete[].
result, long* totalT, long size, long col){ long low = 0; long i, j, k, max=0; long** count = (long**)malloc(size*sizeof(long*)); // size2 and count2 are after forget double* record = (double*)malloc(size*sizeof(double)); bool find; for(i=0;isize==0){ Update(data, result, size, DTree); }else{ IncrementalUpdate(data, result, size, DTree); } /* if(Rebuild and called==10){ called = 0; Rebuild = false; }else if(Rebuild){ called = 11; }else{ called++; }*/ } long* DecisionTree::fitThenPredict(double** trainData, long* trainResult, long trainSize, double** testData, long testSize){ fit(trainData, trainResult, trainSize); long* testResult = (long*)malloc(testSize*sizeof(long)); for(long i=0; i0 and current->size+size>retain) forgetSize = std::min(current->size+size - retain, current->size); else if(retain<0) forgetSize = (long)current->size*forgetRate; long* index = new long[current->size]; double** dataNew; long* resultNew; if(current->height == 0){ dataNew = (double**)malloc((size+current->size-forgetSize)*sizeof(double*)); resultNew = (long*)malloc((size+current->size-forgetSize)*sizeof(long)); for(i=0;isize; i++){ index[i] = i; } std::shuffle(index, index+current->size, g); long x = 0; for(i=0;isize;i++){ if(i>=current->size-forgetSize){ current->dataRecord[index[i]][feature-1] = DBL_MAX; }else{ dataNew[i+size] = current->dataRecord[index[i]]; resultNew[i+size] = current->resultRecord[index[i]]; } } }else{ forgetSize = 0; dataNew = (double**)malloc((size+current->size)*sizeof(double*)); resultNew = (long*)malloc((size+current->size)*sizeof(long)); for(i=0;isize;i++){ if(current->dataRecord[i][feature-1] == DBL_MAX){ forgetSize++; continue; }else{ dataNew[i+size-forgetSize] = current->dataRecord[i]; resultNew[i+size-forgetSize] = current->resultRecord[i]; } } } free(data); free(result); current->size -= forgetSize; current->size += size; // end condition if(current->terminate or roundNo%Rebuild==0){ if(current->height == 0){ for(i=0; 
// Physical line 5, continuing the IncrementalUpdate fragment:
//  - End of the rebuild branch: at height 0 a damaged loop touches
//    dataRecord[index[...]] (presumably releasing the forgotten rows - confirm),
//    then dataNew / resultNew are handed to Update and the function returns.
//  - Per-feature search: incrementalMinGiniSparse is called when Sparse[...] is
//    1 and incrementalMinGiniDense when it is 0; the remaining else branch only
//    holds a commented-out call. The comparison that tracks the best candidate
//    is damaged.
//  - If cMin.eval is still DBL_MAX the node is marked terminal and result is set
//    to the index of the largest entry of t (the loop filling t is damaged).
//    NOTE(review): no release of `index` and no hand-off of dataNew / resultNew
//    is visible on this early return - possible leak; confirm against the
//    undamaged source. `long t[classes]` is a variable-length array.
//  - When the best split equals the stored one (same dpoint and feature), four
//    arrays of `size` entries are malloc'ed, rows are partitioned on
//    data[feature] <= dpoint (loop header damaged), both children are updated
//    with IncrementalUpdate, and dataNew / resultNew replace the node's
//    dataRecord / resultRecord.
//  - Otherwise feature and dpoint are overwritten (the dpoint assignment
//    continues on the next physical line).
idataRecord[index[current->size-size+i]]); } } delete(index); Update(dataNew, resultNew, current->size, current); return; } // find min gini minEval c, cMin; long cFeature; cMin.eval = DBL_MAX; cMin.values = nullptr; // TODO for(i=0;ifeatureId[i]]==1){ c = incrementalMinGiniSparse(dataNew, resultNew, current->size+forgetSize, size, current, current->featureId[i], forgetSize, false); } else if(Sparse[current->featureId[i]]==0){ c = incrementalMinGiniDense(dataNew, resultNew, size, current->featureId[i], current->count, current->record, current->max, current->size+forgetSize, forgetSize, false); }else{ //c = incrementalMinGiniCategorical(); } if(c.evalfeatureId[i]; }else if(c.values!=nullptr)free(c.values); } if(cMin.eval==DBL_MAX){ current->terminate = true; long t[classes]; for(i=0;iresult = std::distance(t, std::max_element(t, t+classes)); return; } //diverse data long ptL=0, ptR=0; double* t; long currentSize = current->size; //TODO:Discrete // Same diverse point as last time if(current->dpoint==cMin.value and current->feature==cFeature){ long xxx = current->left->size; /*for(i=0; ifeature]<=current->dpoint){ ptL++; }else{ ptR++; } }*/ ptL = size; ptR = size; long* resultL = (long*)malloc((ptL)*sizeof(long)); long* resultR = (long*)malloc((ptR)*sizeof(long)); double** dataL = (double**)malloc((ptL)*sizeof(double*)); double** dataR = (double**)malloc((ptR)*sizeof(double*)); ptL = 0; ptR = 0; for(i=0; ifeature]<=current->dpoint){ dataL[ptL] = dataNew[i]; resultL[ptL] = resultNew[i]; ptL++; }else{ dataR[ptR] = dataNew[i]; resultR[ptR] = resultNew[i]; ptR++; } } IncrementalUpdate(dataL, resultL, ptL, current->left); IncrementalUpdate(dataR, resultR, ptR, current->right); if(current->height == 0){ for(i=0; idataRecord[index[current->size-size+i]]); } } delete(index); free(current->dataRecord); free(current->resultRecord); current->dataRecord = dataNew; current->resultRecord = resultNew; return; } // Different diverse point current->feature = cFeature; current->dpoint 
// Physical line 6, in order of appearance:
//  - End of the IncrementalUpdate fragment (split point changed): four arrays of
//    currentSize entries are malloc'ed, rows of dataNew / resultNew are
//    partitioned on data[feature] <= dpoint (loop header damaged), both children
//    are rebuilt with Update, and dataNew / resultNew replace the node's
//    dataRecord / resultRecord.
//  - DecisionTree::Update(data, result, size, current): stores data / result as
//    the node's records (freeing any previous arrays) and stores size. At
//    maxHeight the node becomes terminal with result set to the index of the
//    largest entry of t (the loop filling t is damaged). Otherwise a damaged
//    section over T contains an early exit that makes the node terminal with
//    result i. Then, per candidate feature: findMinGiniSparse when Sparse is 1;
//    findMinGiniDense when Sparse is 0, after which the node's count[col] /
//    record[col] / max[col] are replaced by c.count / c.record / c.max and the
//    previous arrays freed. `long t[classes]` and `long T[classes]` are
//    variable-length arrays.
= cMin.value; /*for(i=0; ifeature]<=current->dpoint){ ptL++; }else{ ptR++; } }*/ long* resultL = (long*)malloc(currentSize*sizeof(long)); long* resultR = (long*)malloc(currentSize*sizeof(long)); double** dataL = (double**)malloc(currentSize*sizeof(double*)); double** dataR = (double**)malloc(currentSize*sizeof(double*)); ptL = 0; ptR = 0; for(i=0; ifeature]<=current->dpoint){ dataL[ptL] = dataNew[i]; resultL[ptL] = resultNew[i]; ptL++; }else{ dataR[ptR] = dataNew[i]; resultR[ptR] = resultNew[i]; ptR++; } } Update(dataL, resultL, ptL, current->left); Update(dataR, resultR, ptR, current->right); if(current->height == 0){ for(i=0; idataRecord[index[current->size-size+i]]); } } delete(index); free(current->dataRecord); free(current->resultRecord); current->dataRecord = dataNew; current->resultRecord = resultNew; } void DecisionTree::Update(double** data, long* result, long size, DT* current){ long low = 0; long i, j; // end condition if(current->dataRecord!=nullptr)free(current->dataRecord); current->dataRecord = data; if(current->resultRecord!=nullptr)free(current->resultRecord); current->resultRecord = result; current->size = size; if(current->height == maxHeight){ current->terminate = true; long t[classes]; for(i=0;iresult = std::distance(t, std::max_element(t, t+classes)); return; } long T[classes]; for(i=0;iterminate = true; current->result = i; return; } } // find min Evaluation minEval c, cMin; long cFeature, oldMax, col, left=0; cMin.eval = DBL_MAX; cMin.values = nullptr; //TODO for(i=0;ifeatureId[i]; if(Sparse[current->featureId[i]]==1){ c = findMinGiniSparse(data, result, T, size, col, current); } else if(Sparse[current->featureId[i]]==0){ c = findMinGiniDense(data, result, T, size, col); if(current->count[col]!=nullptr){ for(j=0; jmax[col]; j++){ if(current->count[col][j]!=nullptr)free(current->count[col][j]); } free(current->count[col]); free(current->record[col]); } current->count[col] = c.count; current->record[col] = c.record; current->max[col] = c.max; 
// Physical line 7, in order of appearance:
//  - End of Update: keeps the best candidate (comparison damaged) and frees the
//    values of rejected ones; with no candidate the node becomes terminal with
//    result = max (the loop computing max is damaged); otherwise it clears
//    terminate, records feature / dpoint, allocates the left and right arrays
//    with new[] (sizes left and size-left), partitions rows on
//    data[i][feature] <= dpoint (loop header damaged) and recurses into both
//    children with Update.
//    NOTE(review): these new[] arrays become the children's dataRecord /
//    resultRecord, which Update and stableTree release with free(); the
//    allocation and release functions need to match.
//  - Test(data, root): returns root->result at a terminal node; otherwise
//    follows the left child when data[root->feature] <= root->dpoint and the
//    right child when not.
//  - print(root): prints a terminal node's result with "%ld"; prints an inner
//    node as "([feature, dpoint]:" followed by the left subtree, ", ", the right
//    subtree and ")". `x` is only used by a commented-out read.
}else{ } if(c.evalfeatureId[i]; left = c.left; }else if(c.values!=nullptr){ free(c.values); } } if(cMin.eval == DBL_MAX){ current->terminate = true; long max = 0; for(i=1;iresult = max; return; } //diverse data current->terminate = false; current->feature = cFeature; current->dpoint = cMin.value; long ptL=0, ptR=0; //TODO:Discrete long* resultL = new long[left]; long* resultR = new long[size-left]; double** dataL = new double*[left]; double** dataR = new double*[size-left]; for(i=low; ifeature]<=current->dpoint){ dataL[ptL] = data[i]; resultL[ptL] = result[i]; ptL++; }else{ dataR[ptR] = data[i]; resultR[ptR] = result[i]; ptR++; } } Update(dataL, resultL, ptL, current->left); Update(dataR, resultR, ptR, current->right); } long DecisionTree::Test(double* data, DT* root){ if(root->terminate)return root->result; if(data[root->feature]<=root->dpoint)return Test(data, root->left); return Test(data, root->right); } void DecisionTree::print(DT* root){ int x; //std::cin>>x; if(root->terminate){ printf("%ld", root->result); return; } printf("([%ld, %f]:", root->feature, root->dpoint); print(root->left); printf(", "); print(root->right); printf(")"); }