00001 #include "InputFeatureSelector.h"
00002
00003 extern StreamOutput cout;
00004
00008 InputFeatureSelector::InputFeatureSelector()
00009 {
00010 cout<<"InputFeatureSelector"<<endl;
00011 }
00012
00016 InputFeatureSelector::~InputFeatureSelector()
00017 {
00018 cout<<"descructor InputFeatureSelector"<<endl;
00019 }
00020
00025 void InputFeatureSelector::predictWithRidgeRegression ( int* colSelect, int nCol, int totalCol, int nTarget, REAL* ATAfull, REAL* ATbfull, REAL* testFeat, REAL* output, int nPred, REAL* ATA, REAL* ATb, REAL* AbTrans )
00026 {
00027
00028 for ( int i=0;i<nCol;i++ )
00029 for ( int j=0;j<nCol;j++ )
00030 ATA[i*nCol + j] = ATAfull[totalCol*colSelect[i] + colSelect[j]];
00031
00032
00033 for ( int i=0;i<nCol;i++ )
00034 for ( int j=0;j<nTarget;j++ )
00035 ATb[i*nTarget + j] = ATbfull[colSelect[i]*nTarget + j];
00036
00037
00038 for ( int i=0;i<nTarget;i++ )
00039 for ( int j=0;j<nCol;j++ )
00040 AbTrans[j+i*nCol] = ATb[i+j*nTarget];
00041
00042 int lda = nCol, nrhs = nTarget, ldb = nCol;
00043 int info;
00044
00045
00046 LAPACK_POTRF ( "U", &nCol, ATA, &lda, &info );
00047
00048
00049 LAPACK_POTRS ( "U", &nCol, &nrhs, ATA, &lda, AbTrans, &ldb, &info );
00050
00051 if ( info != 0 )
00052 cout<<"error equation solver failed to converge: "<<info<<endl;
00053
00054
00055 for ( int i=0;i<nTarget;i++ )
00056 {
00057 REAL* xPtr = AbTrans + i*nCol;
00058 for ( int j=0;j<nPred;j++ )
00059 {
00060 REAL sum = 0.0;
00061 for ( int a=0;a<nCol;a++ )
00062 sum += xPtr[a] * testFeat[colSelect[a] + j*totalCol];
00063 output[i+j*nTarget] = sum;
00064 }
00065 }
00066 }
00067
00077 void InputFeatureSelector::forwardSelection ( bool* selection, REAL* inputs, int nFeatures, int nExamples, int* labels, REAL* targets, int nClass, int nDomain )
00078 {
00079 cout<<endl<<"---------------------------------- FORWARD GREEDY FEATURE SELECTION ----------------------------------"<<endl;
00080
00081 int minFeatures = 1000, maxFeatures = 2000;
00082 int nCross = 8;
00083 float percentTrain = 0.5;
00084
00085 int testBegin = nExamples * percentTrain;
00086 cout<<"testBegin:"<<testBegin<<" nExamples:"<<nExamples<<endl;
00087 int trainSize = testBegin;
00088 int testSize = nExamples - testBegin;
00089 cout<<"testSize:"<<testSize<<" trainSize:"<<trainSize<<endl;
00090 int* trainSplits = new int[nCross+1];
00091 int* testSplits = new int[nCross+1];
00092 trainSplits[0] = 0;
00093 testSplits[0] = testBegin;
00094 trainSplits[nCross] = testBegin;
00095 testSplits[nCross] = nExamples;
00096 REAL trainSlotSize = trainSize / ( REAL ) nCross;
00097 REAL testSlotSize = testSize / ( REAL ) nCross;
00098 for ( int i=0;i<nCross;i++ )
00099 {
00100 trainSplits[i+1] = trainSlotSize * ( i+1 );
00101 testSplits[i+1] = testSlotSize * ( i+1 ) + testBegin;
00102 }
00103
00104 bool optimizeRMSE = true;
00105 REAL reg = 1.0e-1;
00106 cout<<"reg:"<<reg<<endl;
00107
00108
00109 REAL** ATATrainFull = new REAL*[nCross], **ATbTrainFull = new REAL*[nCross];
00110 REAL** ATATestFull = new REAL*[nCross], **ATbTestFull = new REAL*[nCross];
00111 for ( int i=0;i<nCross;i++ )
00112 {
00113 ATATrainFull[i] = new REAL[nFeatures*nFeatures];
00114 ATbTrainFull[i] = new REAL[nFeatures*nClass*nDomain];
00115 ATATestFull[i] = new REAL[nFeatures*nFeatures];
00116 ATbTestFull[i] = new REAL[nFeatures*nClass*nDomain];
00117 }
00118
00119
00120 bool* usedFeatures = new bool[nFeatures];
00121 int* bestFeatures = new int[nFeatures];
00122 for ( int i=0;i<nFeatures;i++ )
00123 {
00124 usedFeatures[i] = false;
00125 bestFeatures[i] = -1;
00126 }
00127 int bestFeaturesCnt = 0;
00128
00129
00130 cout<<"Add feature nr. "<<nFeatures-1<<" (added const. 1 as last input dimension)"<<endl;
00131 bestFeatures[0] = nFeatures-1;
00132 usedFeatures[nFeatures-1] = true;
00133
00134 bestFeaturesCnt++;
00135
00136 cout<<"Precalculate ATA and ATb for every cross-fold (trainset): "<<flush;
00137
00138 REAL* inputsTmp = new REAL[nExamples*nFeatures];
00139 REAL* targetsTmp = new REAL[nExamples*nClass*nDomain];
00140 for ( int i=0;i<nCross;i++ )
00141 {
00142 cout<<"."<<flush;
00143
00144
00145 memcpy ( inputsTmp, inputs, sizeof ( REAL ) *nExamples*nFeatures );
00146 memcpy ( targetsTmp, targets, sizeof ( REAL ) *nExamples*nClass*nDomain );
00147
00148
00149 memset ( inputsTmp + nFeatures * trainSplits[i], 0, sizeof ( REAL ) *nFeatures* ( trainSplits[i+1]-trainSplits[i] ) );
00150 memset ( targetsTmp + nClass * nDomain * trainSplits[i], 0, sizeof ( REAL ) *nClass*nDomain* ( trainSplits[i+1]-trainSplits[i] ) );
00151
00152
00153 memset ( inputsTmp + nFeatures * testBegin, 0, sizeof ( REAL ) *nFeatures*testSize );
00154 memset ( targetsTmp + nClass * nDomain * testBegin, 0, sizeof ( REAL ) *nClass*nDomain*testSize );
00155
00156 int nTrain = 0;
00157 for ( int j=0;j<nExamples;j++ )
00158 {
00159 int cnt = 0;
00160 for ( int k=0;k<nFeatures;k++ )
00161 if ( inputsTmp[j*nFeatures+k] == 0.0 )
00162 cnt++;
00163 if ( cnt != nFeatures )
00164 nTrain++;
00165 }
00166 cout<<" [nTrain:"<<nTrain<<"]";
00167
00168 int n = nExamples, m = nFeatures, k = nClass*nDomain;
00169
00170 REAL* A = inputsTmp;
00171
00172 CBLAS_GEMM ( CblasRowMajor, CblasTrans, CblasNoTrans, m, m, n, 1.0, A, m, A, m, 0.0, ATATrainFull[i], m );
00173
00174
00175 for ( int j=0;j<m;j++ )
00176 ATATrainFull[i][j*m + j] += ( ( REAL ) trainSize - ( REAL ) trainSize/ ( REAL ) nCross ) * reg;
00177
00178
00179 REAL *b = targetsTmp;
00180 CBLAS_GEMM ( CblasRowMajor, CblasTrans, CblasNoTrans, m, k , n, 1.0, A, m, b, k, 0.0, ATbTrainFull[i], k );
00181 }
00182 cout<<endl;
00183
00184 cout<<"Precalculate ATA and ATb for every cross-fold (testset): "<<flush;
00185 for ( int i=0;i<nCross;i++ )
00186 {
00187 cout<<"."<<flush;
00188
00189
00190 memcpy ( inputsTmp, inputs, sizeof ( REAL ) *nExamples*nFeatures );
00191 memcpy ( targetsTmp, targets, sizeof ( REAL ) *nExamples*nClass*nDomain );
00192
00193
00194 memset ( inputsTmp + nFeatures * testSplits[i], 0, sizeof ( REAL ) *nFeatures* ( testSplits[i+1]-testSplits[i] ) );
00195 memset ( targetsTmp + nClass * nDomain * testSplits[i], 0, sizeof ( REAL ) *nClass*nDomain* ( testSplits[i+1]-testSplits[i] ) );
00196
00197
00198 memset ( inputsTmp, 0, sizeof ( REAL ) *nFeatures*trainSize );
00199 memset ( targetsTmp, 0, sizeof ( REAL ) *nClass*nDomain*trainSize );
00200
00201 int nTest = 0;
00202 for ( int j=0;j<nExamples;j++ )
00203 {
00204 int cnt = 0;
00205 for ( int k=0;k<nFeatures;k++ )
00206 if ( inputsTmp[j*nFeatures+k] == 0.0 )
00207 cnt++;
00208 if ( cnt != nFeatures )
00209 nTest++;
00210 }
00211 cout<<" [nTest:"<<nTest<<"]";
00212
00213 int n = nExamples, m = nFeatures, k = nClass*nDomain;
00214
00215 REAL* A = inputsTmp;
00216
00217 CBLAS_GEMM ( CblasRowMajor, CblasTrans, CblasNoTrans, m, m, n, 1.0, A, m, A, m, 0.0, ATATestFull[i], m );
00218
00219
00220 for ( int j=0;j<m;j++ )
00221 ATATestFull[i][j*m + j] += ( ( REAL ) testSize - ( REAL ) testSize/ ( REAL ) nCross ) * reg;
00222
00223
00224 REAL *b = targetsTmp;
00225 CBLAS_GEMM ( CblasRowMajor, CblasTrans, CblasNoTrans, m, k , n, 1.0, A, m, b, k, 0.0, ATbTestFull[i], k );
00226 }
00227 cout<<endl;
00228
00229 delete[] inputsTmp;
00230 delete[] targetsTmp;
00231
00232
00233 REAL rmseTestBest = 1e10;
00234
00235
00236 for ( int outerLoop=bestFeaturesCnt;outerLoop<nFeatures;outerLoop++ )
00237 {
00238 time_t t0 = time ( 0 );
00239 int progress = nFeatures/10 + 1;
00240
00241 int bestInd = -1;
00242 REAL bestTrainErr = 1e10;
00243
00244 REAL* prediction = new REAL[nExamples*nClass*nDomain];
00245 REAL* ATA = new REAL[ ( bestFeaturesCnt+1 ) * ( bestFeaturesCnt+1 ) ];
00246 REAL *ATb = new REAL[ ( bestFeaturesCnt+1 ) *nClass*nDomain];
00247 REAL* AbTrans = new REAL[ ( bestFeaturesCnt+1 ) *nClass*nDomain];
00248
00249 cout<<"nF:"<<bestFeaturesCnt<<" "<<flush;
00250
00251
00252 for ( int innerLoop=0;innerLoop<nFeatures;innerLoop++ )
00253 {
00254 if ( innerLoop%progress==0 )
00255 cout<<"."<<flush;
00256
00257 if ( usedFeatures[innerLoop] == true )
00258 continue;
00259
00260
00261 bestFeatures[bestFeaturesCnt] = innerLoop;
00262 bestFeaturesCnt++;
00263
00264
00265 for ( int f=0;f<nCross;f++ )
00266 {
00267 int nPred = trainSplits[f+1] - trainSplits[f];
00268 int nBest = bestFeaturesCnt;
00269 REAL* predPtr = prediction + trainSplits[f] * nClass * nDomain;
00270 REAL* featPtr = inputs + trainSplits[f] * nFeatures;
00271 predictWithRidgeRegression ( bestFeatures, nBest, nFeatures, nClass*nDomain, ATATrainFull[f], ATbTrainFull[f], featPtr, predPtr, nPred, ATA, ATb, AbTrans );
00272 }
00273
00274
00275
00276 double rmseTrain = 0.0;
00277 int rmseTrainCnt = 0;
00278 for ( int i=0;i<testBegin;i++ )
00279 {
00280 for ( int j=0;j<nClass*nDomain;j++ )
00281 {
00282 double err = prediction[j + i*nClass*nDomain] - targets[j + i*nClass*nDomain];
00283 rmseTrain += err*err;
00284 rmseTrainCnt++;
00285 }
00286 }
00287 rmseTrain = sqrt ( rmseTrain/ ( double ) rmseTrainCnt );
00288
00289
00290
00291
00292
00293
00294
00295
00296
00297
00298
00299
00300
00301
00302 if ( rmseTrain < bestTrainErr )
00303 {
00304 bestTrainErr = rmseTrain;
00305 bestInd = innerLoop;
00306 }
00307
00308
00309 bestFeaturesCnt--;
00310
00311 }
00312
00313
00314 usedFeatures[bestInd] = true;
00315 bestFeatures[bestFeaturesCnt] = bestInd;
00316 bestFeaturesCnt++;
00317
00318
00319 for ( int f=0;f<nCross;f++ )
00320 {
00321 int nPred = testSplits[f+1] - testSplits[f];
00322 int nBest = bestFeaturesCnt;
00323 REAL* predPtr = prediction + testSplits[f] * nClass * nDomain;
00324 REAL* featPtr = inputs + testSplits[f] * nFeatures;
00325 predictWithRidgeRegression ( bestFeatures, nBest, nFeatures, nClass*nDomain, ATATrainFull[f], ATbTrainFull[f], featPtr, predPtr, nPred, ATA, ATb, AbTrans );
00326 }
00327
00328
00329 double rmseTest = 0.0;
00330 int rmseTestCnt = 0;
00331 for ( int i=testBegin;i<nExamples;i++ )
00332 {
00333 for ( int j=0;j<nClass*nDomain;j++ )
00334 {
00335 double err = prediction[j + i*nClass*nDomain] - targets[j + i*nClass*nDomain];
00336 rmseTest += err*err;
00337 rmseTestCnt++;
00338 }
00339 }
00340 rmseTest = sqrt ( rmseTest/ ( double ) rmseTestCnt );
00341
00342
00343
00344
00345
00346
00347
00348
00349
00350
00351
00352
00353
00354
00355 delete[] ATA;
00356 delete[] ATb;
00357 delete[] AbTrans;
00358 delete[] prediction;
00359
00360 cout<<" err:"<<bestTrainErr<<" (testErr:"<<rmseTest<<")"<<" (bestInd:"<<bestInd<<") "<<time ( 0 )-t0<<"[s]";
00361
00362 if ( rmseTest < rmseTestBest )
00363 {
00364 rmseTestBest = rmseTest;
00365 cout<<" [best test]";
00366
00367
00368
00369 fstream featureIDFile;
00370 featureIDFile.open ( FEATURE_TXT_FILE,ios::out );
00371 for ( int i=1;i<nFeatures;i++ )
00372 if ( bestFeatures[i] != -1 )
00373 featureIDFile<<bestFeatures[i]<<endl;
00374 featureIDFile.close();
00375 }
00376 else if ( bestFeaturesCnt >= minFeatures )
00377 return;
00378 if ( bestFeaturesCnt >= maxFeatures )
00379 return;
00380
00381 cout<<endl;
00382 }
00383
00384
00385
00386 }
00387
00388 void InputFeatureSelector::backwardElimination ( bool* selection, REAL* inputs, int nFeatures, int nExamples, int* labels, REAL* targets, int nClass, int nDomain )
00389 {
00390 assert ( false );
00391 }
00392
00397 void InputFeatureSelector::selectFeatures ( bool* selection, REAL* inputs, int nFeatures, int nExamples, int* labels, REAL* targets, int nClass, int nDomain )
00398 {
00399 REAL* inputsWithConst = new REAL[ ( nFeatures+1 ) *nExamples];
00400 for ( int i=0;i<nExamples;i++ )
00401 {
00402 for ( int j=0;j<nFeatures;j++ )
00403 inputsWithConst[i* ( nFeatures+1 ) +j] = inputs[i*nFeatures+j];
00404 inputsWithConst[i* ( nFeatures+1 ) +nFeatures] = 1.0;
00405 }
00406
00407 forwardSelection ( selection, inputsWithConst, nFeatures+1, nExamples, labels, targets, nClass, nDomain );
00408 delete[] inputsWithConst;
00409 return;
00410 }