// ---------------------------------------------------------------------------
// DecisionTree implementation (incremental decision tree with optional forgetting).
//
// NOTE(review): this copy of the file is damaged. Line breaks were collapsed and
// every span from a '<' up to the following '>' appears to have been removed:
// the #include targets are gone, loop headers such as `for(i=0; icount[i]==nullptr)`
// have lost their condition/increment, and some definitions have lost their
// signature or part of their body (tail of Rands, signature of createNode,
// header of the retain/forget routine, header of IncrementalUpdate, interiors of
// the *MinGini* functions). As laid out here each `//` comment also swallows the
// rest of its physical line. Restore the file from version control before
// changing any code; the notes below describe only what is still visible.
//
// Data structures
//   minEval - result of a split search: `value` is copied into DT::dpoint and
//             `eval` is the score compared against DBL_MAX-initialised minima;
//             `count`/`record`/`max` are copied into the node for dense features
//             (see Update). `left` is annotated "how many on its left".
//   DT      - tree node: height, candidate feature ids, children, split
//             (feature, dpoint), predicted class (`result`), per-feature sorted
//             arrays for sparse columns, per-feature count/record/max tables for
//             dense columns, and the data/result pointers stored in the node.
//
// Free functions
//   Rands       - calls srand(seed) and allocates `feature` longs; the rest of
//                 the body is missing from this copy.
//   createNode  - (name taken from its call sites) marks the node created,
//                 allocates the per-feature tables with every slot nullptr,
//                 and malloc's two child nodes flagged created=false at
//                 height+1.
//   stableTree  - for a created node, frees the per-feature tables plus
//                 dataRecord/resultRecord, then recurses right and left.
//   freeTree    - frees the children of created nodes, then the node itself.
//
// DecisionTree members visible here
//   constructor - stores the evaluation kind and rates, clamps maxF to f (or
//                 uses round(sqrt(f)) when maxF<=0), builds the root via
//                 createNode, sets classes = max(noClasses, 2).
//   Stablelize  - frees Sparse and the root's tables/featureId, then calls
//                 stableTree on both children.
//   Free        - frees the root's dataRecord/resultRecord and calls freeTree.
//   incrementalMinGiniSparse / incrementalMinGiniDense /
//   findMinGiniSparse / findMinGiniDense
//               - per-column split search; the visible parts dispatch on
//                 `evalue` to giniSparseIncremental/entropySparseIncremental or
//                 giniDense/entropyDense (declared elsewhere, presumably in
//                 Evaluation.h) and return with ret.values = nullptr.
//   (header lost) - routine that adjusts retain, increaseRate, forgetRate,
//                 roundNo and lastT/lastAll, and calls Update on the root when
//                 the root is empty.
//   IncrementalUpdate - (header lost) merges new rows with the rows kept in the
//                 node, optionally forgetting old rows, then either recurses
//                 with the same split or rebuilds the children via Update.
//   Update      - stores data/result in the node (freeing the previous
//                 pointers), terminates at maxHeight, otherwise searches
//                 featureId columns for the best split and recurses with rows
//                 where data[feature] <= dpoint going left.
//   Test        - walks the tree and returns `result` at a terminal node or at
//                 maxHeight.
//   print       - prints a terminal node as its result, an inner node as
//                 "([feature, dpoint]:left, right)".
//
// NOTE(review): items to confirm once the file is restored
//   * Update allocates dataL/dataR/resultL/resultR with new[], and those
//     buffers become dataRecord/resultRecord of the children, which are
//     released with free() elsewhere in this file - mixed allocators.
//   * DT nodes come from malloc, so the in-class initialisers of DT are not
//     applied; only fields explicitly set in createNode are initialised.
//   * Rands re-seeds with the same global `seed` on every call.
//   * incrementalMinGiniDense allocates allForget with new[]; its release is
//     not visible in this copy.
//   * Arrays such as `long T[classes]` and `long newD[sizeNew]` are
//     variable-length arrays (a compiler extension in C++).
// ---------------------------------------------------------------------------
#include #include #include #include #include "DecisionTree.h" #include "Evaluation.h" #include #include #include #include std::random_device rd; std::mt19937 g(rd()); struct minEval{ double value; int* values = nullptr; double eval; long left; // how many on its left double* record; long max; long** count; }; struct DT{ long height; long* featureId; DT* left = nullptr; DT* right = nullptr; bool created; // split info bool terminate; double dpoint; long feature; long result; // Sparse data record double** sortedData; // for each feature, sorted data long** sortedResult; // Dense data record long*** count = nullptr;// for each feature, number of data belongs to each class and dense value double** record = nullptr;// for each feature, record each dense data long* max = nullptr;// number of dense value of each feature //long* T; // number of data in each class in this node double** dataRecord = nullptr;// Record the data long* resultRecord = nullptr;// Record the result long size = 0;// Size of the dataset }; long seed = (long)clock(); long* Rands(long feature, long maxFeature){ srand(seed); long i; long* ret = (long*) malloc(feature*sizeof(long)); for(i =0; icreated = true; long i; t->count = (long***)malloc(f*sizeof(long**)); for(i=0; icount[i]=nullptr; t->record = (double**)malloc(f*sizeof(double*)); for(i=0; irecord[i]=nullptr; t->max = (long*)malloc(f*sizeof(long)); t->max[0] = -1; t->sortedData = (double**) malloc(f*sizeof(double*)); for(i=0; isortedData[i]=nullptr; t->sortedResult = (long**) malloc(f*sizeof(long*)); for(i=0; isortedResult[i]=nullptr; t->dataRecord = nullptr; t->resultRecord = nullptr; t->height = currentHeight; t->feature = -1; t->size = 0; t->left = (DT*)malloc(sizeof(DT)); t->right = (DT*)malloc(sizeof(DT)); t->left->created = false; t->right->created = false; t->left->height = currentHeight+1; t->right->height = currentHeight+1; } void stableTree(DT* t, long f){ long i, j; if(not t->created)return; for(i=0; icount[i]==nullptr)continue; 
// (stableTree, continued) Frees every row of count[i], then this node's per-feature tables and stored data/result pointers, then recurses into the right and left children.
for(j=0; jmax[i]; j++){ free(t->count[i][j]); } free(t->count[i]); } free(t->count); for(i=0; irecord[i]==nullptr)continue; free(t->record[i]); } free(t->record); free(t->max); for(i=0; isortedData[i]==nullptr)continue; free(t->sortedData[i]); } free(t->sortedData); for(i=0; isortedResult[i]==nullptr)continue; free(t->sortedResult[i]); } free(t->dataRecord); free(t->resultRecord); free(t->sortedResult); if(t->right!=nullptr)stableTree(t->right, f); if(t->left!=nullptr)stableTree(t->left, f); } void freeTree(DT* t){ if(t->created){ freeTree(t->left); freeTree(t->right); } free(t); } DecisionTree::DecisionTree(long f, int* sparse, double rate, long maxF, long noClasses, Evaluation e){ evalue = e; long i; // Max tree height initialIR = rate; increaseRate = rate; isRF = false; // Number of features feature = f; // If each feature is sparse or dense, 0 for dense, 1 for sparse, >2 for number of category Sparse = (int*)malloc(f*sizeof(int)); for(i = 0; ifeature = -1; // The number of feature that is considered in each node if(maxF>=f){ maxF = f; }else if(maxF<=0){ maxF = (long)round(sqrt(f)); } maxFeature = maxF; forgetRate = -10.0; retain = 0; DTree->featureId = Rands(f, maxF); DTree->terminate = true; DTree->result = 0; DTree->size = 0; createNode(DTree, 0, f, noClasses); // Number of classes of this dataset Rebuild = 2147483647; roundNo = 64; classes = std::max(noClasses, (long)2); // last Acc lastAll = classes; lastT = 1; } void DecisionTree::Stablelize(){ free(Sparse); long i, j; DT* t = DTree; long f = feature; for(i=0; icount[i]==nullptr)continue; for(j=0; jmax[i]; j++){ free(t->count[i][j]); } free(t->count[i]); } free(t->count); for(i=0; irecord[i]==nullptr)continue; free(t->record[i]); } free(t->record); free(t->max); free(t->featureId); for(i=0; isortedData[i]==nullptr)continue; free(t->sortedData[i]); } free(t->sortedData); for(i=0; isortedResult[i]==nullptr)continue; free(t->sortedResult[i]); } free(t->sortedResult); 
// (Stablelize, continued) After releasing the root's own tables, stableTree() is applied to both children of the root.
if(DTree->right!=nullptr)stableTree(t->right, feature); if(DTree->left!=nullptr)stableTree(t->left, feature); } void DecisionTree::Free(){ free(DTree->dataRecord); free(DTree->resultRecord); freeTree(DTree); } minEval DecisionTree::incrementalMinGiniSparse(double** dataTotal, long* resultTotal, long sizeTotal, long sizeNew, DT* current, long col, long forgetSize, double** forgottenData, long* forgottenClass){ long i, j; long newD[sizeNew]; for(i=0; isortedData[col]; long* oldResult = current->sortedResult[col]; long tmp2 = forgetSize; long* allForget = (long*)malloc(sizeof(long)*classes); for(i=0; isortedData[col] = newSortedData; current->sortedResult[col] = newSortedResult; free(oldData); free(oldResult); minEval ret; if(evalue == Evaluation::gini){ ret = giniSparseIncremental(sizeTotal, classes, newSortedData, newSortedResult, T); }else if(evalue == Evaluation::entropy or evalue == Evaluation::logLoss){ ret = entropySparseIncremental(sizeTotal, classes, newSortedData, newSortedResult, T); } ret.values = nullptr; return ret; } minEval DecisionTree::incrementalMinGiniDense(double** data, long* result, long size, long col, long*** count, double** record, long* max, long newSize, long forgetSize, double** forgottenData, long* forgottenClass){ // newSize is before forget long low = 0; //if(isRoot) long i, j, k, tmp; long newMax = 0; long maxLocal = max[col]; long **newCount=(long**)malloc(size*sizeof(long*)); double newRecord[size]; bool find; long tmp3 = newSize-size; long tmp4 = forgetSize; // find total count for each class long T[classes]; long tmp2=0; long* allForget = new long[classes]; for(i=0;i0){ d = (long*)malloc(sizeof(long)*newMax); for(i=0;irecord[col][k]){ updateCount[i] = count[col][k]; updateRecord[i] = record[col][k]; k++; } else{ updateCount[i] = newCount[j]; updateRecord[i] = newRecord[j]; j++; } } free(count[col]); free(record[col]); count[col]=updateCount; record[col]=updateRecord; free(d); } free(newCount); //calculate gini minEval ret; 
// (incrementalMinGiniDense, continued) Score the updated dense tables with the criterion selected by evalue.
if(evalue==Evaluation::gini){ ret = giniDense(max[col], newSize, classes, count[col], d, record[col], T); }else if(evalue==Evaluation::entropy or evalue==Evaluation::logLoss){ ret = entropyDense(max[col], newSize, classes, count[col], d, record[col], T); } ret.values = nullptr; return ret; } minEval DecisionTree::findMinGiniSparse(double** data, long* result, long* totalT, long size, long col, DT* current){ long i, j; long* d = (long*)malloc(size*sizeof(long)); for(i=0; isortedData[col] != nullptr)free(current->sortedData[col]); if(current->sortedResult[col] != nullptr)free(current->sortedResult[col]); current->sortedData[col] = (double*) malloc(size*sizeof(double)); current->sortedResult[col] = (long*) malloc(size*sizeof(long)); for(i=0;isortedData[col][i] = data[d[i]][col]; current->sortedResult[col][i] = result[d[i]]; } free(d); ret.values = nullptr; return ret; } minEval DecisionTree::findMinGiniDense(double** data, long* result, long* totalT, long size, long col){ long low = 0; long i, j, k, max=0; long** count = (long**)malloc(size*sizeof(long*)); double* record = (double*)malloc(size*sizeof(double)); bool find; for(i=0;isize==0){ retain = size; maxHeight = (long)log2((double)retain); maxHeight = std::max(maxHeight, (long)1); Update(data, result, size, DTree); }else{ if(forgetRate<=0){ for(long j=0; jsize; i++){ guesses[DTree->resultRecord[i]]++; } for(i=0; isize/size; }*/ if(localSm <= guessAcc){ //if(localSm <= 1.0/classes){ lastT = localT; lastAll = localAll; retain = size; //increaseRate = 1.0-localSm; } else if(lastSm <= guessAcc){ //else if(lastSm <= 1.0/classes){ lastT = localT; lastAll = localAll; //forgetRate=-5.0; retain += size; //increaseRate -= localSm; //increaseRate = initialIR; //increaseRate -= localSm; //increaseRate /= (double)localSm-1.0/classes; } else if(lastSm == localSm){ lastT += localT; lastAll += localAll; retain+=(long)round(increaseRate*size); //increaseRate*=increaseRate; //retain = (long)((double)retain*isUp+0.25*size); } else{ 
// The block comment that opens the next line is disabled code (a Student-t comparison that references boost::math); live code resumes at the assignment to isUp.
/*double lastSd = sqrt(pow((1.0-lastSm),2)*lastT+pow(lastSm,2)*(lastAll-lastT)/(lastAll-1)); double localSd = sqrt(pow((1.0-localSm),2)*localT+pow(localSm,2)*(localAll-localT)/(localAll-1)); double v = lastAll+localAll-2; double sp = sqrt(((lastAll-1) * lastSd * lastSd + (localAll-1) * localSd * localSd) / v); double q; //double t=lastSm-localSm; if(sp==0)q=1.0; else if(lastAll+lastAll<2000){ q = abs(lastSm-localSm); } else{ double t = t/(sp*sqrt(1.0/lastAll+1.0/localAll)); boost::math::students_t dist(v); double c = cdf(dist, t); q = cdf(complement(dist, fabs(t))); }*/ isUp = ((double)localSm-guessAcc)/((double)lastSm-guessAcc); //isUp = ((double)localSm-1.0/classes)/((double)lastSm-1.0/classes); increaseRate = increaseRate/isUp; //increaseRate += increaseRate*factor; if(isUp>=1.0)isUp=pow(isUp, 2); else{ isUp=pow(isUp, 3-isUp); } retain = std::min((long)round(retain*isUp+increaseRate*size), retain+size); //double factor = ((lastSm-localSm)/localSm)*abs((lastSm-localSm)/localSm)*increaseRate; //retain += std::min((long)round(factor*retain+increaseRate*size), size); lastT = localT; lastAll = localAll; } //printf(" %f, %f, %f\n", increaseRate, localSm, lastSm); }else{ long i; retain = DTree->size+size; /*double guessAcc=0.0; long guesses[classes]; for(i=0; isize; i++){ guesses[DTree->resultRecord[i]]++; } for(i=0; isize/size; }*/ while(retain>=roundNo){ if((double)localT/localAll>guessAcc){ forgetRate+=5.0; } roundNo*=2; } if((double)localT/localAll<=guessAcc){ forgetRate=-10.0; } if(forgetRate>=0){ forgetRate=0.0; } lastT = localT; lastAll = localAll; } } //if(increaseRate>initialIR)increaseRate=initialIR; //printf("%f\n", increaseRate); if(retainsize); if(current->size+size>retain and current->height==0) { forgetSize = std::min(current->size+size - retain, current->size); } if(forgetSize==current->size){ Update(data, result, size, current); return; } double** dataNew; long* resultNew; double*** forgottenData = (double***)malloc(feature*sizeof(double**)); long* 
forgottenClass = (long*)malloc(classes*sizeof(long)); for(i=0;iheight == 0){ for(i=0; isize-forgetSize)*sizeof(double*)); resultNew = (long*)malloc((size+current->size-forgetSize)*sizeof(long)); for(i=0;isize; i++){ index[i] = i; } if(isRF)std::shuffle(index, index+current->size, g); long x = 0; for(i=0;isize;i++){ if(i>=current->size-forgetSize){ for(j=0; jresultRecord[index[i]]][forgottenClass[current->resultRecord[index[i]]]]=current->dataRecord[index[i]][j]; } forgottenClass[current->resultRecord[index[i]]]++; current->dataRecord[index[i]][feature] = DBL_MAX; }else{ dataNew[i+size] = current->dataRecord[index[i]]; resultNew[i+size] = current->resultRecord[index[i]]; } } for(i=0; isize)*sizeof(double*)); resultNew = (long*)malloc((size+current->size)*sizeof(long)); long xxx[current->size]; for(i=0;isize;i++){ if(current->dataRecord[i][feature] == DBL_MAX){ xxx[forgetSize]=i; forgetSize++; forgottenClass[current->resultRecord[i]]++; }else{ dataNew[i+size-forgetSize] = current->dataRecord[i]; resultNew[i+size-forgetSize] = current->resultRecord[i]; } } if(forgetSize==current->size){ free(forgottenData); free(forgottenClass); if(size!=0){ free(dataNew); free(resultNew); Update(data, result, size, current); }else{ // if a node have no new data and forget all old data, just keep old data return; } return; } for(i=0; iresultRecord[tmp]][k[current->resultRecord[tmp]]]=current->dataRecord[tmp][j]; } k[current->resultRecord[tmp]]++; } free(k); for(i=0; isize -= forgetSize; current->size += size; // end condition if(current->terminate or current->height==maxHeight or current->size==1){ for(i=0;iheight == 0){ for(i=0; idataRecord[index[current->size-size+i]]); } } free(index); Update(dataNew, resultNew, current->size, current); return; }else if(size==0){ for(i=0;isize, current); return; } // find min gini minEval c, cMin; long cFeature; cMin.eval = DBL_MAX; cMin.values = nullptr; long T[classes]; double HY=0; for(i=0;ifeatureId[i]; if(Sparse[col]==1){ c = 
incrementalMinGiniSparse(dataNew, resultNew, current->size, size, current, col, forgetSize, forgottenData[col], forgottenClass); } else if(Sparse[col]==0){ c = incrementalMinGiniDense(dataNew, resultNew, size, col, current->count, current->record, current->max, current->size, forgetSize, forgottenData[col], forgottenClass); }else{ //c = incrementalMinGiniCategorical(); } if(c.evalterminate = true; long t[classes]; for(i=0;iresult = std::distance(t, std::max_element(t, t+classes)); free(index); free(current->dataRecord); free(current->resultRecord); current->dataRecord = dataNew; current->resultRecord = resultNew; return; } //diverse data long ptL=0, ptR=0; double* t; long currentSize = current->size; // Same diverse point as last time if(current->dpoint==cMin.value and current->feature==cFeature){ ptL = size; ptR = size; long* resultL = (long*)malloc((ptL)*sizeof(long)); long* resultR = (long*)malloc((ptR)*sizeof(long)); double** dataL = (double**)malloc((ptL)*sizeof(double*)); double** dataR = (double**)malloc((ptR)*sizeof(double*)); ptL = 0; ptR = 0; for(i=0; ifeature]<=current->dpoint){ dataL[ptL] = dataNew[i]; resultL[ptL] = resultNew[i]; ptL++; }else{ dataR[ptR] = dataNew[i]; resultR[ptR] = resultNew[i]; ptR++; } } IncrementalUpdate(dataL, resultL, ptL, current->left); IncrementalUpdate(dataR, resultR, ptR, current->right); if(current->height == 0){ for(i=0; idataRecord[index[current->size-size+i]]); } } free(index); free(current->dataRecord); free(current->resultRecord); current->dataRecord = dataNew; current->resultRecord = resultNew; return; } // Different diverse point current->feature = cFeature; current->dpoint = cMin.value; /*for(i=0; ifeature]<=current->dpoint){ ptL++; }else{ ptR++; } }*/ long* resultL = (long*)malloc(currentSize*sizeof(long)); long* resultR = (long*)malloc(currentSize*sizeof(long)); double** dataL = (double**)malloc(currentSize*sizeof(double*)); double** dataR = (double**)malloc(currentSize*sizeof(double*)); ptL = 0; ptR = 0; for(i=0; 
ifeature]<=current->dpoint){ dataL[ptL] = dataNew[i]; resultL[ptL] = resultNew[i]; ptL++; }else{ dataR[ptR] = dataNew[i]; resultR[ptR] = resultNew[i]; ptR++; } } Update(dataL, resultL, ptL, current->left); Update(dataR, resultR, ptR, current->right); // TODO: free memeory if(current->height == 0){ for(i=0; idataRecord[index[current->size-size+i]]); } } free(index); free(current->dataRecord); free(current->resultRecord); current->dataRecord = dataNew; current->resultRecord = resultNew; } void DecisionTree::Update(double** data, long* result, long size, DT* current){ if(not current->created)createNode(current, current->height, feature, classes); long low = 0; long i, j; double HY = 0; // end condition if(current->dataRecord!=nullptr)free(current->dataRecord); current->dataRecord = data; if(current->resultRecord!=nullptr)free(current->resultRecord); current->resultRecord = result; current->size = size; if(current->height == maxHeight){ current->terminate = true; long t[classes]; for(i=0;iresult = std::distance(t, std::max_element(t, t+classes)); return; } long T[classes]; for(i=0;iterminate = true; current->result = i; return; } if(evalue == Evaluation::entropy){ if(T[i]!=0)HY -= ((double)T[i]/size)*log2((double)T[i]/size); }else{ HY += pow(((double)T[i]/size), 2); } } // find min Evaluation minEval c, cMin; long cFeature, oldMax, col, left=0; cMin.eval = DBL_MAX; cMin.values = nullptr; cFeature = -1; //TODO: categorical for(i=0;ifeatureId[i]; if(Sparse[col]==1){ c = findMinGiniSparse(data, result, T, size, col, current); } else if(Sparse[col]==0){ c = findMinGiniDense(data, result, T, size, col); if(current->count[col]!=nullptr){ for(j=0; jmax[col]; j++){ if(current->count[col][j]!=nullptr)free(current->count[col][j]); } free(current->count[col]); free(current->record[col]); } current->count[col] = c.count; current->record[col] = c.record; current->max[col] = c.max; }else{ } if(c.evalterminate = true; long max = 0; long maxs[classes]; long count = 0; for(i=1;iresult 
= max; return; } //printf(" %f\n", HY-cMin.eval); //diverse data current->terminate = false; current->feature = cFeature; current->dpoint = cMin.value; long ptL=0, ptR=0; //TODO: categorical long* resultL = new long[size]; long* resultR = new long[size]; double** dataL = new double*[size]; double** dataR = new double*[size]; for(i=low; ifeature]<=current->dpoint){ dataL[ptL] = data[i]; resultL[ptL] = result[i]; ptL++; }else{ dataR[ptR] = data[i]; resultR[ptR] = result[i]; ptR++; } } Update(dataL, resultL, ptL, current->left); Update(dataR, resultR, ptR, current->right); } long DecisionTree::Test(double* data, DT* root){ if(root->terminate or root->height == maxHeight)return root->result; if(data[root->feature]<=root->dpoint)return Test(data, root->left); return Test(data, root->right); } void DecisionTree::print(DT* root){ if(root->terminate){ printf("%ld", root->result); return; } printf("([%ld, %f]:", root->feature, root->dpoint); print(root->left); printf(", "); print(root->right); printf(")"); }