InputFeatureSelector.cpp

#include "InputFeatureSelector.h"

extern StreamOutput cout;

/**
 * Constructor.
 */
InputFeatureSelector::InputFeatureSelector()
{
    cout<<"InputFeatureSelector"<<endl;
}

/**
 * Destructor.
 */
InputFeatureSelector::~InputFeatureSelector()
{
    cout<<"destructor InputFeatureSelector"<<endl;
}

/**
 * Train a ridge regression on a subset of the feature columns and predict nPred examples.
 * The full precomputed matrices A'*A (ATAfull) and A'*b (ATbfull) are given; the selected
 * rows/columns (colSelect, nCol out of totalCol) are gathered into the scratch buffers
 * ATA, ATb and AbTrans, and the normal equations are solved via Cholesky factorization.
 */
void InputFeatureSelector::predictWithRidgeRegression ( int* colSelect, int nCol, int totalCol, int nTarget, REAL* ATAfull, REAL* ATbfull, REAL* testFeat, REAL* output, int nPred, REAL* ATA, REAL* ATb, REAL* AbTrans )
{
    // gather the selected rows/columns of the full A'*A matrix
    for ( int i=0;i<nCol;i++ )
        for ( int j=0;j<nCol;j++ )
            ATA[i*nCol + j] = ATAfull[totalCol*colSelect[i] + colSelect[j]];

    // gather the selected rows of the full A'*b matrix
    for ( int i=0;i<nCol;i++ )
        for ( int j=0;j<nTarget;j++ )
            ATb[i*nTarget + j] = ATbfull[colSelect[i]*nTarget + j];

    // copy A'*b to column-major layout, one right-hand-side column per target
    for ( int i=0;i<nTarget;i++ )
        for ( int j=0;j<nCol;j++ )
            AbTrans[j+i*nCol] = ATb[i+j*nTarget];

    int lda = nCol, nrhs = nTarget, ldb = nCol;
    int info;

    // Cholesky factorization of the symmetric positive-definite matrix A'*A
    LAPACK_POTRF ( "U", &nCol, ATA, &lda, &info );
    if ( info != 0 )
        cout<<"error: Cholesky factorization (POTRF) failed, info: "<<info<<endl;

    // solve the system with the Cholesky factor; on exit AbTrans holds the solution X
    LAPACK_POTRS ( "U", &nCol, &nrhs, ATA, &lda, AbTrans, &ldb, &info );
    if ( info != 0 )
        cout<<"error: equation solver (POTRS) failed, info: "<<info<<endl;

    // predict the test set: output = X' * testFeat, using the selected columns only
    for ( int i=0;i<nTarget;i++ )
    {
        REAL* xPtr = AbTrans + i*nCol;
        for ( int j=0;j<nPred;j++ )
        {
            REAL sum = 0.0;
            for ( int a=0;a<nCol;a++ )
                sum += xPtr[a] * testFeat[colSelect[a] + j*totalCol];
            output[i+j*nTarget] = sum;
        }
    }
}
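
/*
 * Background note: together with the lambda term that forwardSelection() adds to the
 * diagonal of A'*A, the routine above computes the ridge solution
 *
 *     X = (A'*A + lambda*I)^-1 * A'*b,
 *
 * and A'*A + lambda*I is symmetric positive definite, which is what justifies the
 * Cholesky route (POTRF + POTRS). Gathering the selected rows/columns out of the
 * precomputed full matrices makes one candidate-subset evaluation cost O(nCol^2)
 * for the gather plus O(nCol^3) for the solve, independent of the number of
 * training examples.
 */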

/**
 * Greedy forward feature selection with ridge regression as the base learner.
 * Starting from the constant feature, each round adds the feature whose inclusion
 * minimizes the cross-validated train RMSE; the subset with the best test RMSE
 * is written to FEATURE_TXT_FILE.
 */
void InputFeatureSelector::forwardSelection ( bool* selection, REAL* inputs, int nFeatures, int nExamples, int* labels, REAL* targets, int nClass, int nDomain )
{
    cout<<endl<<"---------------------------------- FORWARD GREEDY FEATURE SELECTION ----------------------------------"<<endl;

    int minFeatures = 1000, maxFeatures = 2000;
    int nCross = 8;
    float percentTrain = 0.5;  // fraction of examples used as train set

    int testBegin = nExamples * percentTrain;  // truncates
    cout<<"testBegin:"<<testBegin<<" nExamples:"<<nExamples<<endl;
    int trainSize = testBegin;
    int testSize = nExamples - testBegin;
    cout<<"testSize:"<<testSize<<" trainSize:"<<trainSize<<endl;

    // split both the train half and the test half into nCross contiguous slots
    int* trainSplits = new int[nCross+1];
    int* testSplits = new int[nCross+1];
    trainSplits[0] = 0;
    testSplits[0] = testBegin;
    trainSplits[nCross] = testBegin;
    testSplits[nCross] = nExamples;
    REAL trainSlotSize = trainSize / ( REAL ) nCross;
    REAL testSlotSize = testSize / ( REAL ) nCross;
    for ( int i=0;i<nCross;i++ )
    {
        trainSplits[i+1] = trainSlotSize * ( i+1 );
        testSplits[i+1] = testSlotSize * ( i+1 ) + testBegin;
    }
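
    // Worked example (hypothetical numbers): with nExamples=1000 and nCross=8,
    // testBegin=500 and trainSlotSize=testSlotSize=62.5, so the truncated split
    // boundaries become trainSplits = {0,62,125,187,250,312,375,437,500} and
    // testSplits = {500,562,625,687,750,812,875,937,1000}: eight contiguous
    // validation slots in each half.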

    bool optimizeRMSE = true;  // note: currently unused, the AUC alternative below is commented out
    REAL reg = 1.0e-1;         // ridge regularization strength lambda
    cout<<"reg:"<<reg<<endl;

    // precalculated matrices for cross-fold validation on the train and test set
    REAL** ATATrainFull = new REAL*[nCross], **ATbTrainFull = new REAL*[nCross];
    REAL** ATATestFull = new REAL*[nCross], **ATbTestFull = new REAL*[nCross];
    for ( int i=0;i<nCross;i++ )
    {
        ATATrainFull[i] = new REAL[nFeatures*nFeatures];
        ATbTrainFull[i] = new REAL[nFeatures*nClass*nDomain];
        ATATestFull[i] = new REAL[nFeatures*nFeatures];
        ATbTestFull[i] = new REAL[nFeatures*nClass*nDomain];
    }

    // mark already used features
    bool* usedFeatures = new bool[nFeatures];
    int* bestFeatures = new int[nFeatures];
    for ( int i=0;i<nFeatures;i++ )
    {
        usedFeatures[i] = false;
        bestFeatures[i] = -1;
    }
    int bestFeaturesCnt = 0;

    // always select the constant-1 feature (appended by selectFeatures as the last input dimension)
    cout<<"Add feature nr. "<<nFeatures-1<<" (added const. 1 as last input dimension)"<<endl;
    bestFeatures[0] = nFeatures-1;
    usedFeatures[nFeatures-1] = true;
    //featureIDFile<<bestFeatures[bestFeaturesCnt]<<endl;
    bestFeaturesCnt++;

    cout<<"Precalculate ATA and ATb for every cross-fold (trainset): "<<flush;

    REAL* inputsTmp = new REAL[nExamples*nFeatures];
    REAL* targetsTmp = new REAL[nExamples*nClass*nDomain];
    for ( int i=0;i<nCross;i++ )
    {
        cout<<"."<<flush;

        // ================================== TRAINSET ==================================
        memcpy ( inputsTmp, inputs, sizeof ( REAL ) *nExamples*nFeatures );
        memcpy ( targetsTmp, targets, sizeof ( REAL ) *nExamples*nClass*nDomain );

        // train data: set validation elements to zero
        memset ( inputsTmp + nFeatures * trainSplits[i], 0, sizeof ( REAL ) *nFeatures* ( trainSplits[i+1]-trainSplits[i] ) );
        memset ( targetsTmp + nClass * nDomain * trainSplits[i], 0, sizeof ( REAL ) *nClass*nDomain* ( trainSplits[i+1]-trainSplits[i] ) );

        // train data: set testset elements to zero
        memset ( inputsTmp + nFeatures * testBegin, 0, sizeof ( REAL ) *nFeatures*testSize );
        memset ( targetsTmp + nClass * nDomain * testBegin, 0, sizeof ( REAL ) *nClass*nDomain*testSize );

        // count the examples that remain active (rows that are not all zero)
        int nTrain = 0;
        for ( int j=0;j<nExamples;j++ )
        {
            int cnt = 0;
            for ( int k=0;k<nFeatures;k++ )
                if ( inputsTmp[j*nFeatures+k] == 0.0 )
                    cnt++;
            if ( cnt != nFeatures )
                nTrain++;
        }
        cout<<" [nTrain:"<<nTrain<<"]";

        int n = nExamples, m = nFeatures, k = nClass*nDomain;

        REAL* A = inputsTmp;
        // A'*A
        CBLAS_GEMM ( CblasRowMajor, CblasTrans, CblasNoTrans, m, m, n, 1.0, A, m, A, m, 0.0, ATATrainFull[i], m ); // AA = A'A

        // add the ridge term: lambda scaled by the number of active train examples in this fold
        for ( int j=0;j<m;j++ )
            ATATrainFull[i][j*m + j] += ( ( REAL ) trainSize - ( REAL ) trainSize/ ( REAL ) nCross ) * reg;

        // A'*b
        REAL *b = targetsTmp;
        CBLAS_GEMM ( CblasRowMajor, CblasTrans, CblasNoTrans, m, k , n, 1.0, A, m, b, k, 0.0, ATbTrainFull[i], k ); // Ab = A'b
    }
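
    // Note on the masking trick above: rows that were memset to zero contribute
    // nothing to A'*A or A'*b, so a single GEMM over the full nExamples x nFeatures
    // matrix yields exactly the Gram matrix of the active fold, at the price of
    // multiplying the zeroed rows as well. A genuinely all-zero input row would
    // consequently be dropped from the fold without warning.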
    cout<<endl;

    cout<<"Precalculate ATA and ATb for every cross-fold (testset): "<<flush;
    for ( int i=0;i<nCross;i++ )
    {
        cout<<"."<<flush;

        // ================================== TESTSET ==================================
        memcpy ( inputsTmp, inputs, sizeof ( REAL ) *nExamples*nFeatures );
        memcpy ( targetsTmp, targets, sizeof ( REAL ) *nExamples*nClass*nDomain );

        // test data: set validation elements to zero
        memset ( inputsTmp + nFeatures * testSplits[i], 0, sizeof ( REAL ) *nFeatures* ( testSplits[i+1]-testSplits[i] ) );
        memset ( targetsTmp + nClass * nDomain * testSplits[i], 0, sizeof ( REAL ) *nClass*nDomain* ( testSplits[i+1]-testSplits[i] ) );

        // test data: set trainset elements to zero
        memset ( inputsTmp, 0, sizeof ( REAL ) *nFeatures*trainSize );
        memset ( targetsTmp, 0, sizeof ( REAL ) *nClass*nDomain*trainSize );

        // count the examples that remain active (rows that are not all zero)
        int nTest = 0;
        for ( int j=0;j<nExamples;j++ )
        {
            int cnt = 0;
            for ( int k=0;k<nFeatures;k++ )
                if ( inputsTmp[j*nFeatures+k] == 0.0 )
                    cnt++;
            if ( cnt != nFeatures )
                nTest++;
        }
        cout<<" [nTest:"<<nTest<<"]";

        int n = nExamples, m = nFeatures, k = nClass*nDomain;

        REAL* A = inputsTmp;
        // A'*A
        CBLAS_GEMM ( CblasRowMajor, CblasTrans, CblasNoTrans, m, m, n, 1.0, A, m, A, m, 0.0, ATATestFull[i], m ); // AA = A'A

        // add the ridge term: lambda scaled by the number of active test examples in this fold
        for ( int j=0;j<m;j++ )
            ATATestFull[i][j*m + j] += ( ( REAL ) testSize - ( REAL ) testSize/ ( REAL ) nCross ) * reg;

        // A'*b
        REAL *b = targetsTmp;
        CBLAS_GEMM ( CblasRowMajor, CblasTrans, CblasNoTrans, m, k , n, 1.0, A, m, b, k, 0.0, ATbTestFull[i], k ); // Ab = A'b
    }
    cout<<endl;

    delete[] inputsTmp;
    delete[] targetsTmp;

    REAL rmseTestBest = 1e10;

    // greedy loop: add the single best feature per outer iteration
    for ( int outerLoop=bestFeaturesCnt;outerLoop<nFeatures;outerLoop++ )
    {
        time_t t0 = time ( 0 );
        int progress = nFeatures/10 + 1;  // roughly ten progress dots per iteration

        int bestInd = -1;
        REAL bestTrainErr = 1e10;

        // scratch buffers sized for the current subset plus one candidate feature
        REAL* prediction = new REAL[nExamples*nClass*nDomain];
        REAL* ATA = new REAL[ ( bestFeaturesCnt+1 ) * ( bestFeaturesCnt+1 ) ];
        REAL* ATb = new REAL[ ( bestFeaturesCnt+1 ) *nClass*nDomain];
        REAL* AbTrans = new REAL[ ( bestFeaturesCnt+1 ) *nClass*nDomain];

        cout<<"nF:"<<bestFeaturesCnt<<" "<<flush;

        // search over all remaining candidate features
        for ( int innerLoop=0;innerLoop<nFeatures;innerLoop++ )
        {
            if ( innerLoop%progress==0 )
                cout<<"."<<flush;

            if ( usedFeatures[innerLoop] == true )
                continue;

            // tentatively select the current feature
            bestFeatures[bestFeaturesCnt] = innerLoop;
            bestFeaturesCnt++;

            // cross-fold train prediction: each fold's model predicts its held-out train slot
            for ( int f=0;f<nCross;f++ )
            {
                int nPred = trainSplits[f+1] - trainSplits[f];
                int nBest = bestFeaturesCnt;
                REAL* predPtr = prediction + trainSplits[f] * nClass * nDomain;
                REAL* featPtr = inputs + trainSplits[f] * nFeatures;
                predictWithRidgeRegression ( bestFeatures, nBest, nFeatures, nClass*nDomain, ATATrainFull[f], ATbTrainFull[f], featPtr, predPtr, nPred, ATA, ATb, AbTrans );
            }

            // calc train error (RMSE over the train half)
            double rmseTrain = 0.0;
            int rmseTrainCnt = 0;
            for ( int i=0;i<testBegin;i++ )
            {
                for ( int j=0;j<nClass*nDomain;j++ )
                {
                    double err = prediction[j + i*nClass*nDomain] - targets[j + i*nClass*nDomain];
                    rmseTrain += err*err;
                    rmseTrainCnt++;
                }
            }
            rmseTrain = sqrt ( rmseTrain/ ( double ) rmseTrainCnt );
            /* alternative error measure (AUC), disabled:
            AUC auc;
            REAL* tmp = new REAL[trainSize*nClass*nDomain];
            int* tmpL = new int[trainSize*nDomain];
            for(int i=0;i<trainSize;i++)
            {
                for(int j=0;j<nClass*nDomain;j++)
                    tmp[i+j*trainSize] = prediction[j+i*nClass*nDomain];
                for(int j=0;j<nDomain;j++)
                    tmpL[j+i*nDomain] = labels[j+i*nDomain];
            }
            rmseTrain = auc.getAUC(tmp, tmpL, nClass, nDomain, trainSize);
            delete[] tmp;
            delete[] tmpL;
            */
            if ( rmseTrain < bestTrainErr )
            {
                bestTrainErr = rmseTrain;
                bestInd = innerLoop;
            }

            // deselect the current feature
            bestFeaturesCnt--;
        }

        // add the best found feature permanently
        usedFeatures[bestInd] = true;
        bestFeatures[bestFeaturesCnt] = bestInd;
        bestFeaturesCnt++;

        // cross-fold test prediction: the fold-f train model predicts test slot f
        // (note: ATATestFull/ATbTestFull are precomputed above but not used here)
        for ( int f=0;f<nCross;f++ )
        {
            int nPred = testSplits[f+1] - testSplits[f];
            int nBest = bestFeaturesCnt;
            REAL* predPtr = prediction + testSplits[f] * nClass * nDomain;
            REAL* featPtr = inputs + testSplits[f] * nFeatures;
            predictWithRidgeRegression ( bestFeatures, nBest, nFeatures, nClass*nDomain, ATATrainFull[f], ATbTrainFull[f], featPtr, predPtr, nPred, ATA, ATb, AbTrans );
        }

        // calc test error (RMSE over the held-out test half)
        double rmseTest = 0.0;
        int rmseTestCnt = 0;
        for ( int i=testBegin;i<nExamples;i++ )
        {
            for ( int j=0;j<nClass*nDomain;j++ )
            {
                double err = prediction[j + i*nClass*nDomain] - targets[j + i*nClass*nDomain];
                rmseTest += err*err;
                rmseTestCnt++;
            }
        }
        rmseTest = sqrt ( rmseTest/ ( double ) rmseTestCnt );
        /* alternative error measure (AUC), disabled:
        AUC auc;
        REAL* tmp = new REAL[testSize*nClass*nDomain];
        int* tmpL = new int[testSize*nDomain];
        for(int i=0;i<testSize;i++)
        {
            for(int j=0;j<nClass*nDomain;j++)
                tmp[i+j*testSize] = prediction[j+i*nClass*nDomain + trainSize*nClass*nDomain];
            for(int j=0;j<nDomain;j++)
                tmpL[j+i*nDomain] = labels[j+i*nDomain+trainSize*nDomain];
        }
        rmseTest = auc.getAUC(tmp, tmpL, nClass, nDomain, testSize);
        delete[] tmp;
        delete[] tmpL;
        */

        delete[] ATA;
        delete[] ATb;
        delete[] AbTrans;
        delete[] prediction;

        cout<<" err:"<<bestTrainErr<<" (testErr:"<<rmseTest<<")"<<" (bestInd:"<<bestInd<<") "<<time ( 0 )-t0<<"[s]";

        if ( rmseTest < rmseTestBest )
        {
            rmseTestBest = rmseTest;
            cout<<" [best test]";

            // write the best (test-minimal) feature subset, skipping index 0 (the constant feature)
            fstream featureIDFile;
            featureIDFile.open ( FEATURE_TXT_FILE,ios::out );
            for ( int i=1;i<nFeatures;i++ )
                if ( bestFeatures[i] != -1 )
                    featureIDFile<<bestFeatures[i]<<endl;
            featureIDFile.close();
        }
        else if ( bestFeaturesCnt >= minFeatures )
            break;  // enough features and no test improvement: stop
        if ( bestFeaturesCnt >= maxFeatures )
            break;  // hard upper limit on the subset size

        cout<<endl;
    }

    // free the precomputed matrices and bookkeeping arrays
    for ( int i=0;i<nCross;i++ )
    {
        delete[] ATATrainFull[i];
        delete[] ATbTrainFull[i];
        delete[] ATATestFull[i];
        delete[] ATbTestFull[i];
    }
    delete[] ATATrainFull;
    delete[] ATbTrainFull;
    delete[] ATATestFull;
    delete[] ATbTestFull;
    delete[] trainSplits;
    delete[] testSplits;
    delete[] usedFeatures;
    delete[] bestFeatures;
}
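
/*
 * Cost sketch (inferred from the loops above, not measured): one outer iteration
 * evaluates up to nFeatures candidates, and each candidate runs nCross Cholesky
 * solves of size k = bestFeaturesCnt+1, i.e. roughly O(nFeatures * nCross * k^3)
 * plus O(nFeatures * trainSize * k * nClass*nDomain) for the predictions. The
 * expensive O(nExamples * nFeatures^2) GEMMs run only once, during precomputation.
 */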

void InputFeatureSelector::backwardElimination ( bool* selection, REAL* inputs, int nFeatures, int nExamples, int* labels, REAL* targets, int nClass, int nDomain )
{
    assert ( false );  // backward elimination is not implemented
}

/**
 * Entry point for feature selection: append a constant-1 column as the last
 * input dimension, then run the greedy forward selection.
 */
void InputFeatureSelector::selectFeatures ( bool* selection, REAL* inputs, int nFeatures, int nExamples, int* labels, REAL* targets, int nClass, int nDomain )
{
    REAL* inputsWithConst = new REAL[ ( nFeatures+1 ) *nExamples];
    for ( int i=0;i<nExamples;i++ )
    {
        for ( int j=0;j<nFeatures;j++ )
            inputsWithConst[i* ( nFeatures+1 ) +j] = inputs[i*nFeatures+j];
        inputsWithConst[i* ( nFeatures+1 ) +nFeatures] = 1.0;  // bias term
    }

    forwardSelection ( selection, inputsWithConst, nFeatures+1, nExamples, labels, targets, nClass, nDomain );
    delete[] inputsWithConst;
}
