#include <InputFeatureSelector.h>
Public Member Functions | |
InputFeatureSelector () | |
virtual | ~InputFeatureSelector () |
Static Public Member Functions | |
static void | forwardSelection (bool *selection, REAL *inputs, int nFeatures, int nExamples, int *labels, REAL *targets, int nClass, int nDomain) |
static void | backwardElimination (bool *selection, REAL *inputs, int nFeatures, int nExamples, int *labels, REAL *targets, int nClass, int nDomain) |
static void | selectFeatures (bool *selection, REAL *inputs, int nFeatures, int nExamples, int *labels, REAL *targets, int nClass, int nDomain) |
Static Private Member Functions | |
static void | predictWithRidgeRegression (int *colSelect, int nCol, int totalCol, int nTarget, REAL *ATAfull, REAL *ATbfull, REAL *testFeat, REAL *output, int nPred, REAL *ATA, REAL *ATb, REAL *AbTrans) |
Definition at line 38 of file InputFeatureSelector.h.
InputFeatureSelector::InputFeatureSelector | ( | ) |
InputFeatureSelector::~InputFeatureSelector | ( | ) | [virtual] |
void InputFeatureSelector::forwardSelection | ( | bool * | selection, | |
REAL * | inputs, | |||
int | nFeatures, | |||
int | nExamples, | |||
int * | labels, | |||
REAL * | targets, | |||
int | nClass, | |||
int | nDomain | |||
) | [static] |
Forward selection - greedy search. The prediction model is regularized (ridge) linear regression.
Features are selected on a 50% trainset via greedy selection. Evaluation is done on the remaining 50% to detect the point of overfitting; overfitting is an essential issue here.
Definition at line 77 of file InputFeatureSelector.cpp.
00078 { 00079 cout<<endl<<"---------------------------------- FORWARD GREEDY FEATURE SELECTION ----------------------------------"<<endl; 00080 00081 int minFeatures = 1000, maxFeatures = 2000; 00082 int nCross = 8; 00083 float percentTrain = 0.5; // [%] train set 00084 00085 int testBegin = nExamples * percentTrain; 00086 cout<<"testBegin:"<<testBegin<<" nExamples:"<<nExamples<<endl; 00087 int trainSize = testBegin; 00088 int testSize = nExamples - testBegin; 00089 cout<<"testSize:"<<testSize<<" trainSize:"<<trainSize<<endl; 00090 int* trainSplits = new int[nCross+1]; 00091 int* testSplits = new int[nCross+1]; 00092 trainSplits[0] = 0; 00093 testSplits[0] = testBegin; 00094 trainSplits[nCross] = testBegin; 00095 testSplits[nCross] = nExamples; 00096 REAL trainSlotSize = trainSize / ( REAL ) nCross; 00097 REAL testSlotSize = testSize / ( REAL ) nCross; 00098 for ( int i=0;i<nCross;i++ ) 00099 { 00100 trainSplits[i+1] = trainSlotSize * ( i+1 ); 00101 testSplits[i+1] = testSlotSize * ( i+1 ) + testBegin; 00102 } 00103 00104 bool optimizeRMSE = true; 00105 REAL reg = 1.0e-1; 00106 cout<<"reg:"<<reg<<endl; 00107 00108 // precalculated matrices for cross-fold validation on train and testset 00109 REAL** ATATrainFull = new REAL*[nCross], **ATbTrainFull = new REAL*[nCross]; 00110 REAL** ATATestFull = new REAL*[nCross], **ATbTestFull = new REAL*[nCross]; 00111 for ( int i=0;i<nCross;i++ ) 00112 { 00113 ATATrainFull[i] = new REAL[nFeatures*nFeatures]; 00114 ATbTrainFull[i] = new REAL[nFeatures*nClass*nDomain]; 00115 ATATestFull[i] = new REAL[nFeatures*nFeatures]; 00116 ATbTestFull[i] = new REAL[nFeatures*nClass*nDomain]; 00117 } 00118 00119 // mark already used features 00120 bool* usedFeatures = new bool[nFeatures]; 00121 int* bestFeatures = new int[nFeatures]; 00122 for ( int i=0;i<nFeatures;i++ ) 00123 { 00124 usedFeatures[i] = false; 00125 bestFeatures[i] = -1; 00126 } 00127 int bestFeaturesCnt = 0; 00128 00129 // add const (last input dimension) 00130 cout<<"Add 
feature nr. "<<nFeatures-1<<" (added const. 1 as last input dimension)"<<endl; 00131 bestFeatures[0] = nFeatures-1; 00132 usedFeatures[nFeatures-1] = true; 00133 //featureIDFile<<bestFeatures[bestFeaturesCnt]<<endl; 00134 bestFeaturesCnt++; 00135 00136 cout<<"Precalculate ATA and ATb for every cross-fold (trainset): "<<flush; 00137 00138 REAL* inputsTmp = new REAL[nExamples*nFeatures]; 00139 REAL* targetsTmp = new REAL[nExamples*nClass*nDomain]; 00140 for ( int i=0;i<nCross;i++ ) 00141 { 00142 cout<<"."<<flush; 00143 00144 // ================================== TRAINSET ================================== 00145 memcpy ( inputsTmp, inputs, sizeof ( REAL ) *nExamples*nFeatures ); 00146 memcpy ( targetsTmp, targets, sizeof ( REAL ) *nExamples*nClass*nDomain ); 00147 00148 // train data: set validation elements to zero 00149 memset ( inputsTmp + nFeatures * trainSplits[i], 0, sizeof ( REAL ) *nFeatures* ( trainSplits[i+1]-trainSplits[i] ) ); 00150 memset ( targetsTmp + nClass * nDomain * trainSplits[i], 0, sizeof ( REAL ) *nClass*nDomain* ( trainSplits[i+1]-trainSplits[i] ) ); 00151 00152 // train data: set testset elements to zero 00153 memset ( inputsTmp + nFeatures * testBegin, 0, sizeof ( REAL ) *nFeatures*testSize ); 00154 memset ( targetsTmp + nClass * nDomain * testBegin, 0, sizeof ( REAL ) *nClass*nDomain*testSize ); 00155 00156 int nTrain = 0; 00157 for ( int j=0;j<nExamples;j++ ) 00158 { 00159 int cnt = 0; 00160 for ( int k=0;k<nFeatures;k++ ) 00161 if ( inputsTmp[j*nFeatures+k] == 0.0 ) 00162 cnt++; 00163 if ( cnt != nFeatures ) 00164 nTrain++; 00165 } 00166 cout<<" [nTrain:"<<nTrain<<"]"; 00167 00168 int n = nExamples, m = nFeatures, k = nClass*nDomain; 00169 00170 REAL* A = inputsTmp; 00171 // A'*A 00172 CBLAS_GEMM ( CblasRowMajor, CblasTrans, CblasNoTrans, m, m, n, 1.0, A, m, A, m, 0.0, ATATrainFull[i], m ); // AA = A'A 00173 00174 // add lambda 00175 for ( int j=0;j<m;j++ ) 00176 ATATrainFull[i][j*m + j] += ( ( REAL ) trainSize - ( REAL ) trainSize/ ( REAL 
) nCross ) * reg; 00177 00178 // A'*b 00179 REAL *b = targetsTmp; 00180 CBLAS_GEMM ( CblasRowMajor, CblasTrans, CblasNoTrans, m, k , n, 1.0, A, m, b, k, 0.0, ATbTrainFull[i], k ); // Ab = A'b 00181 } 00182 cout<<endl; 00183 00184 cout<<"Precalculate ATA and ATb for every cross-fold (testset): "<<flush; 00185 for ( int i=0;i<nCross;i++ ) 00186 { 00187 cout<<"."<<flush; 00188 00189 // ================================== TESTSET ================================== 00190 memcpy ( inputsTmp, inputs, sizeof ( REAL ) *nExamples*nFeatures ); 00191 memcpy ( targetsTmp, targets, sizeof ( REAL ) *nExamples*nClass*nDomain ); 00192 00193 // test data: set validation elements to zero 00194 memset ( inputsTmp + nFeatures * testSplits[i], 0, sizeof ( REAL ) *nFeatures* ( testSplits[i+1]-testSplits[i] ) ); 00195 memset ( targetsTmp + nClass * nDomain * testSplits[i], 0, sizeof ( REAL ) *nClass*nDomain* ( testSplits[i+1]-testSplits[i] ) ); 00196 00197 // test data: set trainset elements to 00198 memset ( inputsTmp, 0, sizeof ( REAL ) *nFeatures*trainSize ); 00199 memset ( targetsTmp, 0, sizeof ( REAL ) *nClass*nDomain*trainSize ); 00200 00201 int nTest = 0; 00202 for ( int j=0;j<nExamples;j++ ) 00203 { 00204 int cnt = 0; 00205 for ( int k=0;k<nFeatures;k++ ) 00206 if ( inputsTmp[j*nFeatures+k] == 0.0 ) 00207 cnt++; 00208 if ( cnt != nFeatures ) 00209 nTest++; 00210 } 00211 cout<<" [nTest:"<<nTest<<"]"; 00212 00213 int n = nExamples, m = nFeatures, k = nClass*nDomain; 00214 00215 REAL* A = inputsTmp; 00216 // A'*A 00217 CBLAS_GEMM ( CblasRowMajor, CblasTrans, CblasNoTrans, m, m, n, 1.0, A, m, A, m, 0.0, ATATestFull[i], m ); // AA = A'A 00218 00219 // add lambda 00220 for ( int j=0;j<m;j++ ) 00221 ATATestFull[i][j*m + j] += ( ( REAL ) testSize - ( REAL ) testSize/ ( REAL ) nCross ) * reg; 00222 00223 // A'*b 00224 REAL *b = targetsTmp; 00225 CBLAS_GEMM ( CblasRowMajor, CblasTrans, CblasNoTrans, m, k , n, 1.0, A, m, b, k, 0.0, ATbTestFull[i], k ); // Ab = A'b 00226 } 00227 cout<<endl; 
00228 00229 delete[] inputsTmp; 00230 delete[] targetsTmp; 00231 00232 00233 REAL rmseTestBest = 1e10; 00234 00235 // greedy through all features 00236 for ( int outerLoop=bestFeaturesCnt;outerLoop<nFeatures;outerLoop++ ) 00237 { 00238 time_t t0 = time ( 0 ); 00239 int progress = nFeatures/10 + 1; 00240 00241 int bestInd = -1; 00242 REAL bestTrainErr = 1e10; 00243 00244 REAL* prediction = new REAL[nExamples*nClass*nDomain]; 00245 REAL* ATA = new REAL[ ( bestFeaturesCnt+1 ) * ( bestFeaturesCnt+1 ) ]; 00246 REAL *ATb = new REAL[ ( bestFeaturesCnt+1 ) *nClass*nDomain]; 00247 REAL* AbTrans = new REAL[ ( bestFeaturesCnt+1 ) *nClass*nDomain]; 00248 00249 cout<<"nF:"<<bestFeaturesCnt<<" "<<flush; 00250 00251 // search over all features 00252 for ( int innerLoop=0;innerLoop<nFeatures;innerLoop++ ) 00253 { 00254 if ( innerLoop%progress==0 ) 00255 cout<<"."<<flush; 00256 00257 if ( usedFeatures[innerLoop] == true ) 00258 continue; 00259 00260 // select current feature 00261 bestFeatures[bestFeaturesCnt] = innerLoop; 00262 bestFeaturesCnt++; 00263 00264 // cross-fold train prediction 00265 for ( int f=0;f<nCross;f++ ) 00266 { 00267 int nPred = trainSplits[f+1] - trainSplits[f]; 00268 int nBest = bestFeaturesCnt; 00269 REAL* predPtr = prediction + trainSplits[f] * nClass * nDomain; 00270 REAL* featPtr = inputs + trainSplits[f] * nFeatures; 00271 predictWithRidgeRegression ( bestFeatures, nBest, nFeatures, nClass*nDomain, ATATrainFull[f], ATbTrainFull[f], featPtr, predPtr, nPred, ATA, ATb, AbTrans ); 00272 } 00273 00274 00275 // calc train error 00276 double rmseTrain = 0.0; 00277 int rmseTrainCnt = 0; 00278 for ( int i=0;i<testBegin;i++ ) 00279 { 00280 for ( int j=0;j<nClass*nDomain;j++ ) 00281 { 00282 double err = prediction[j + i*nClass*nDomain] - targets[j + i*nClass*nDomain]; 00283 rmseTrain += err*err; 00284 rmseTrainCnt++; 00285 } 00286 } 00287 rmseTrain = sqrt ( rmseTrain/ ( double ) rmseTrainCnt ); 00288 /*AUC auc; 00289 REAL* tmp = new REAL[trainSize*nClass*nDomain]; 
00290 int* tmpL = new int[trainSize*nDomain]; 00291 for(int i=0;i<trainSize;i++) 00292 { 00293 for(int j=0;j<nClass*nDomain;j++) 00294 tmp[i+j*trainSize] = prediction[j+i*nClass*nDomain]; 00295 for(int j=0;j<nDomain;j++) 00296 tmpL[j+i*nDomain] = labels[j+i*nDomain]; 00297 } 00298 rmseTrain = auc.getAUC(tmp, tmpL, nClass, nDomain, trainSize); 00299 delete[] tmp; 00300 delete[] tmpL; 00301 */ 00302 if ( rmseTrain < bestTrainErr ) 00303 { 00304 bestTrainErr = rmseTrain; 00305 bestInd = innerLoop; 00306 } 00307 00308 // deselect current feature 00309 bestFeaturesCnt--; 00310 00311 } 00312 00313 // add best found feature 00314 usedFeatures[bestInd] = true; 00315 bestFeatures[bestFeaturesCnt] = bestInd; 00316 bestFeaturesCnt++; 00317 00318 // cross-fold test prediction 00319 for ( int f=0;f<nCross;f++ ) 00320 { 00321 int nPred = testSplits[f+1] - testSplits[f]; 00322 int nBest = bestFeaturesCnt; 00323 REAL* predPtr = prediction + testSplits[f] * nClass * nDomain; 00324 REAL* featPtr = inputs + testSplits[f] * nFeatures; 00325 predictWithRidgeRegression ( bestFeatures, nBest, nFeatures, nClass*nDomain, ATATrainFull[f], ATbTrainFull[f], featPtr, predPtr, nPred, ATA, ATb, AbTrans ); 00326 } 00327 00328 // calc test error 00329 double rmseTest = 0.0; 00330 int rmseTestCnt = 0; 00331 for ( int i=testBegin;i<nExamples;i++ ) 00332 { 00333 for ( int j=0;j<nClass*nDomain;j++ ) 00334 { 00335 double err = prediction[j + i*nClass*nDomain] - targets[j + i*nClass*nDomain]; 00336 rmseTest += err*err; 00337 rmseTestCnt++; 00338 } 00339 } 00340 rmseTest = sqrt ( rmseTest/ ( double ) rmseTestCnt ); 00341 /*AUC auc; 00342 REAL* tmp = new REAL[testSize*nClass*nDomain]; 00343 int* tmpL = new int[testSize*nDomain]; 00344 for(int i=0;i<testSize;i++) 00345 { 00346 for(int j=0;j<nClass*nDomain;j++) 00347 tmp[i+j*testSize] = prediction[j+i*nClass*nDomain + trainSize*nClass*nDomain]; 00348 for(int j=0;j<nDomain;j++) 00349 tmpL[j+i*nDomain] = labels[j+i*nDomain+trainSize*nDomain]; 00350 } 00351 
rmseTest = auc.getAUC(tmp, tmpL, nClass, nDomain, testSize); 00352 delete[] tmp; 00353 delete[] tmpL;*/ 00354 00355 delete[] ATA; 00356 delete[] ATb; 00357 delete[] AbTrans; 00358 delete[] prediction; 00359 00360 cout<<" err:"<<bestTrainErr<<" (testErr:"<<rmseTest<<")"<<" (bestInd:"<<bestInd<<") "<<time ( 0 )-t0<<"[s]"; 00361 00362 if ( rmseTest < rmseTestBest ) 00363 { 00364 rmseTestBest = rmseTest; 00365 cout<<" [best test]"; 00366 00367 // write best features (not the first one!) 00368 // open the feature file and write best (test min.) found features 00369 fstream featureIDFile; 00370 featureIDFile.open ( FEATURE_TXT_FILE,ios::out ); 00371 for ( int i=1;i<nFeatures;i++ ) 00372 if ( bestFeatures[i] != -1 ) 00373 featureIDFile<<bestFeatures[i]<<endl; 00374 featureIDFile.close(); 00375 } 00376 else if ( bestFeaturesCnt >= minFeatures ) 00377 return; 00378 if ( bestFeaturesCnt >= maxFeatures ) 00379 return; 00380 00381 cout<<endl; 00382 } 00383 00384 00385 00386 }
void InputFeatureSelector::predictWithRidgeRegression | ( | int * | colSelect, | |
int | nCol, | |||
int | totalCol, | |||
int | nTarget, | |||
REAL * | ATAfull, | |||
REAL * | ATbfull, | |||
REAL * | testFeat, | |||
REAL * | output, | |||
int | nPred, | |||
REAL * | ATA, | |||
REAL * | ATb, | |||
REAL * | AbTrans | |||
) | [static, private] |
Solves the linear regression restricted to a selected subset of columns and predicts the targets.
Definition at line 25 of file InputFeatureSelector.cpp.
00026 { 00027 // fill ATA matrix 00028 for ( int i=0;i<nCol;i++ ) 00029 for ( int j=0;j<nCol;j++ ) 00030 ATA[i*nCol + j] = ATAfull[totalCol*colSelect[i] + colSelect[j]]; 00031 00032 // fill ATb matrix 00033 for ( int i=0;i<nCol;i++ ) 00034 for ( int j=0;j<nTarget;j++ ) 00035 ATb[i*nTarget + j] = ATbfull[colSelect[i]*nTarget + j]; 00036 00037 //int m = nCol, k = nTarget; 00038 for ( int i=0;i<nTarget;i++ ) 00039 for ( int j=0;j<nCol;j++ ) 00040 AbTrans[j+i*nCol] = ATb[i+j*nTarget]; // copy to colMajor 00041 00042 int lda = nCol, nrhs = nTarget, ldb = nCol; 00043 int info; 00044 00045 //Cholesky factorization of a symmetric (Hermitian) positive-definite matrix 00046 LAPACK_POTRF ( "U", &nCol, ATA, &lda, &info ); 00047 00048 // Solves a system of linear equations with a Cholesky-factored symmetric (Hermitian) positive-definite matrix 00049 LAPACK_POTRS ( "U", &nCol, &nrhs, ATA, &lda, AbTrans, &ldb, &info ); // Y=solution vector 00050 00051 if ( info != 0 ) 00052 cout<<"error equation solver failed to converge: "<<info<<endl; 00053 00054 // predict test set 00055 for ( int i=0;i<nTarget;i++ ) 00056 { 00057 REAL* xPtr = AbTrans + i*nCol; 00058 for ( int j=0;j<nPred;j++ ) //fold*testSize;j<(fold+1)*testSize;j++) 00059 { 00060 REAL sum = 0.0; 00061 for ( int a=0;a<nCol;a++ ) 00062 sum += xPtr[a] * testFeat[colSelect[a] + j*totalCol]; 00063 output[i+j*nTarget] = sum; 00064 } 00065 } 00066 }
void InputFeatureSelector::selectFeatures | ( | bool * | selection, | |
REAL * | inputs, | |||
int | nFeatures, | |||
int | nExamples, | |||
int * | labels, | |||
REAL * | targets, | |||
int | nClass, | |||
int | nDomain | |||
) | [static] |
Greedy search
Definition at line 397 of file InputFeatureSelector.cpp.
00398 { 00399 REAL* inputsWithConst = new REAL[ ( nFeatures+1 ) *nExamples]; 00400 for ( int i=0;i<nExamples;i++ ) 00401 { 00402 for ( int j=0;j<nFeatures;j++ ) 00403 inputsWithConst[i* ( nFeatures+1 ) +j] = inputs[i*nFeatures+j]; 00404 inputsWithConst[i* ( nFeatures+1 ) +nFeatures] = 1.0; 00405 } 00406 00407 forwardSelection ( selection, inputsWithConst, nFeatures+1, nExamples, labels, targets, nClass, nDomain ); 00408 delete[] inputsWithConst; 00409 return; 00410 }