AlgorithmExploration.cpp

00001 #include "AlgorithmExploration.h"
00002 
00003 extern StreamOutput cout;
00004 
00008 AlgorithmExploration::AlgorithmExploration()
00009 {
00010     cout<<"AlgorithmExploration"<<endl;
00011 }
00012 
00016 AlgorithmExploration::~AlgorithmExploration()
00017 {
00018     cout<<"descructor AlgorithmExploration"<<endl;
00019 }
00020 
00027 void AlgorithmExploration::start()
00028 {
00029     time_t t0 = time ( 0 );
00030     cout.setOutputFile ( "out.txt" );
00031 
00032     // available algorithms for exploration
00033     vector<string> algos;
00034     algos.push_back ( "LM" );
00035     algos.push_back ( "NN" );
00036     algos.push_back ( "KNN" );
00037     //algos.push_back("PR");
00038     algos.push_back ( "KRR" );
00039 
00040     map<string,string> algoDscMap;
00041     algoDscMap["LM"] = "LinearModel_1.dsc";
00042     algoDscMap["NN"] = "NeuralNetwork_1.dsc";
00043     algoDscMap["KNN"] = "KNearestNeighbor_1.dsc";
00044     algoDscMap["PR"] = "PolynomialRegression_1.dsc";
00045     algoDscMap["KRR"] = "KernelRidgeRegression_1.dsc";
00046 
00047     map<string,string> algoPredMap;
00048     algoPredMap["LM"] = "LinearModel_1.dat";
00049     algoPredMap["NN"] = "NeuralNetwork_1.dat";
00050     algoPredMap["KNN"] = "KNearestNeighbor_1.dat";
00051     algoPredMap["PR"] = "PolynomialRegression_1.dat";
00052     algoPredMap["KRR"] = "KernelRidgeRegression_1.dat";
00053 
00054     // available datasets for exploration
00055     vector<string> datasets;
00056     /*datasets.push_back("CREDIT");
00057     datasets.push_back("BALANCE");
00058     datasets.push_back("BREAST");
00059     datasets.push_back("DIABETES");
00060     datasets.push_back("GERMAN");
00061     datasets.push_back("GLASS");*/
00062     datasets.push_back ( "HEPATITIS" );
00063     /*datasets.push_back("IONOSPHERE");
00064     datasets.push_back("IRIS");
00065     datasets.push_back("SONAR");
00066     datasets.push_back("SURVIVAL");
00067     datasets.push_back("VEHICLE");
00068     datasets.push_back("VOTES");
00069     datasets.push_back("WINE");*/
00070 
00071     //datasets.push_back("MUSHROOM"); // LARGE*/
00072     //datasets.push_back("LETTER"); // LARGE
00073     //datasets.push_back("SATIMAGE");  // LARGE
00074     //datasets.push_back("ADULT");  // LARGE
00075 
00076     // log file names
00077     string logResidualFilename = "logResidual.txt";
00078     string logResidualCascadeFilename = "logResidualCascade.txt";
00079     string logCascadeFilename = "logCascade.txt";
00080     string logStackingFilename = "logStacking.txt";
00081     /*string model = algos[0] + ".txt";
00082     string logResidualFilename = "";
00083     string logResidualCascadeFilename = "";
00084     string logCascadeFilename = model;
00085     string logStackingFilename = "";
00086     */
00087     /*
00088     // enable ensemble training methods
00089     bool enableResidual = true;
00090     bool enableResidualCascade = true;
00091     bool enableCascade = true;
00092     bool enableStacking = true;
00093     */
00094     // bagging
00095     bool enableResidual = false;
00096     bool enableResidualCascade = false;
00097     bool enableCascade = false;
00098     bool enableStacking = true;
00099 
00100 
00101     // number of test splits
00102     int nTestSplits = 100;
00103 
00104     // generate random permutations
00105     m_trainList.clear();
00106     for ( int depth=1;depth<algos.size() +1;depth++ )
00107     {
00108         vector<string> stack;
00109         randPerm ( stack, algos, depth );
00110     }
00111     cout<<endl<<"Residual/Cascade train list"<<endl;
00112     for ( int i=0;i<m_trainList.size();i++ )
00113     {
00114         for ( int j=0;j<m_trainList[i].size();j++ )
00115             cout<<m_trainList[i][j]<<" ";
00116         cout<<endl;
00117     }
00118 
00119     // generate train list for stacking (no permutations)
00120     cout<<endl<<"Stacking train list"<<endl;
00121     vector<vector<string> > stackingTrainListTmp;
00122     vector<vector<string> > stackingTrainList;
00123     for ( int i=0;i<m_trainList.size();i++ )
00124     {
00125         stackingTrainListTmp.push_back ( m_trainList[i] );
00126         sort ( stackingTrainListTmp[i].begin(), stackingTrainListTmp[i].end() );
00127 
00128         bool found = false;
00129         for ( int j=0;j<stackingTrainList.size();j++ )
00130         {
00131             if ( stackingTrainList[j] == stackingTrainListTmp[i] )
00132                 found = true;
00133         }
00134         if ( found == false )
00135         {
00136             stackingTrainList.push_back ( m_trainList[i] );
00137             int s = stackingTrainList.size();
00138             sort ( stackingTrainList[s-1].begin(), stackingTrainList[s-1].end() );
00139         }
00140     }
00141     for ( int i=0;i<stackingTrainList.size();i++ )
00142     {
00143         for ( int j=0;j<stackingTrainList[i].size();j++ )
00144             cout<<stackingTrainList[i][j]<<" ";
00145         cout<<endl;
00146     }
00147 
00148     // hide console output
00149     cout.disableAllOutputs();
00150 
00151     // through all datasets
00152     for ( int datasetCnt=0;datasetCnt<datasets.size();datasetCnt++ )
00153     {
00154         string path = datasets[datasetCnt] + "/";
00155         printf ( "\nLog: %s  (nTestsets:%d)\n", ( path + logResidualFilename ).c_str(),nTestSplits );
00156         printf ( "Log: %s  (nTestsets:%d)\n", ( path + logResidualCascadeFilename ).c_str(),nTestSplits );
00157         printf ( "Log: %s  (nTestsets:%d)\n", ( path + logCascadeFilename ).c_str(),nTestSplits );
00158         printf ( "Log: %s  (nTestsets:%d)\n", ( path + logStackingFilename ).c_str(),nTestSplits );
00159         fstream fAnalyzeResidual ( ( path + logResidualFilename ).c_str(),ios::out );
00160         fstream fAnalyzeResidualCascade ( ( path + logResidualCascadeFilename ).c_str(),ios::out );
00161         fstream fAnalyzeCascade ( ( path + logCascadeFilename ).c_str(),ios::out );
00162         fstream fAnalyzeStacking ( ( path + logStackingFilename ).c_str(),ios::out );
00163         fstream f;
00164 
00165         time_t runTime = time ( 0 );
00166 
00167         // through all test splits
00168         for ( int testCnt=0;testCnt<nTestSplits;testCnt++ )
00169         {
00170             printf ( " %d ",testCnt );
00171             fflush ( stdout );
00172 
00173             uint randomSeedSplit = time ( 0 ) + testCnt;
00174 
00175             // ========================================= RESIDUAL TRAINING ==========================================
00176             // through all possible algorithm setups
00177             for ( int run=0;run<m_trainList.size() && enableResidual;run++ )
00178             {
00179                 uint randomSeed = randomSeedSplit + run;
00180 
00181                 printf ( "r" );
00182                 fflush ( stdout );
00183 
00184                 // write Master.dsc file
00185                 string master;
00186                 vector<string> algoDscList;
00187                 for ( int i=0;i<m_trainList[run].size();i++ )
00188                     algoDscList.push_back ( algoDscMap[m_trainList[run][i]] );
00189                 bool clas = true;
00190                 bool cascade = false;
00191                 master = Scheduler::masterDscTemplateGenerator ( datasets[datasetCnt], clas, algoDscList, randomSeed, "LinearRegression", cascade );
00192                 f.open ( ( path+"Master.dsc" ).c_str(),ios::out );
00193                 f<<master;
00194                 f.close();
00195 
00196                 // write algorithm *.dsc files
00197                 string preEffect = "";
00198                 for ( int i=0;i<m_trainList[run].size();i++ )
00199                 {
00200                     string algoStr;
00201                     if ( m_trainList[run][i] == "LM" )
00202                         algoStr = LinearModel::templateGenerator ( i+1, preEffect, 1, true );
00203                     else if ( m_trainList[run][i] == "NN" )
00204                         algoStr = NeuralNetwork::templateGenerator ( i+1, preEffect, 1, true );
00205                     else if ( m_trainList[run][i] == "KNN" )
00206                         algoStr = KNearestNeighbor::templateGenerator ( i+1, preEffect, 1, true );
00207                     else if ( m_trainList[run][i] == "PR" )
00208                         algoStr = PolynomialRegression::templateGenerator ( i+1, preEffect, 1, true );
00209                     else if ( m_trainList[run][i] == "KRR" )
00210                         algoStr = KernelRidgeRegression::templateGenerator ( i+1, preEffect, 1, true );
00211                     else
00212                         assert ( false );
00213                     preEffect = algoPredMap[m_trainList[run][i]];
00214 
00215                     //cout<<"Write:"<<path+algoDscMap[m_trainList[run][i]]<<endl;
00216                     f.open ( ( path+algoDscMap[m_trainList[run][i]] ).c_str(),ios::out );
00217                     f<<algoStr;
00218                     f.close();
00219                 }
00220 
00221                 // train the ensemble
00222                 Scheduler s;
00223                 s.readMasterDscFile ( datasets[datasetCnt], "Master.dsc" );
00224 
00225                 s.train();
00226                 s.predict();
00227 
00228                 // bagging
00229                 //Framework::setAdditionalStartupParameter("10");
00230                 //s.bagging();
00231 
00232                 // boosting
00233                 //Framework::setAdditionalStartupParameter("20");
00234                 //s.boosting();
00235 
00236                 // save error
00237                 REAL rmse = s.getPredictionRMSE();
00238                 REAL classErr = s.getClassificationError();
00239                 fAnalyzeResidual<<rmse<<" "<<classErr<<" ";
00240             }
00241             fAnalyzeResidual<<endl;
00242 
00243 
00244             // ========================================= RESIDUAL+CASCADE TRAINING ==========================================
00245             // through all possible algorithm setups
00246             for ( int run=0;run<m_trainList.size() && enableResidualCascade;run++ )
00247             {
00248                 uint randomSeed = randomSeedSplit + run;
00249 
00250                 printf ( "m" );
00251                 fflush ( stdout );
00252 
00253                 // write Master.dsc file
00254                 string master;
00255                 vector<string> algoDscList;
00256                 for ( int i=0;i<m_trainList[run].size();i++ )
00257                     algoDscList.push_back ( algoDscMap[m_trainList[run][i]] );
00258                 bool clas = true;
00259                 bool cascade = true;
00260                 master = Scheduler::masterDscTemplateGenerator ( datasets[datasetCnt], clas, algoDscList, randomSeed, "LinearRegression", cascade );
00261                 f.open ( ( path+"Master.dsc" ).c_str(),ios::out );
00262                 f<<master;
00263                 f.close();
00264 
00265                 // write algorithm *.dsc files
00266                 string preEffect = "";
00267                 for ( int i=0;i<m_trainList[run].size();i++ )
00268                 {
00269                     string algoStr;
00270                     if ( m_trainList[run][i] == "LM" )
00271                         algoStr = LinearModel::templateGenerator ( i+1, preEffect, 1, true );
00272                     else if ( m_trainList[run][i] == "NN" )
00273                         algoStr = NeuralNetwork::templateGenerator ( i+1, preEffect, 1, true );
00274                     else if ( m_trainList[run][i] == "KNN" )
00275                         algoStr = KNearestNeighbor::templateGenerator ( i+1, preEffect, 1, true );
00276                     else if ( m_trainList[run][i] == "PR" )
00277                         algoStr = PolynomialRegression::templateGenerator ( i+1, preEffect, 1, true );
00278                     else if ( m_trainList[run][i] == "KRR" )
00279                         algoStr = KernelRidgeRegression::templateGenerator ( i+1, preEffect, 1, true );
00280                     else
00281                         assert ( false );
00282                     preEffect = algoPredMap[m_trainList[run][i]];
00283 
00284                     //cout<<"Write:"<<path+algoDscMap[m_trainList[run][i]]<<endl;
00285                     f.open ( ( path+algoDscMap[m_trainList[run][i]] ).c_str(),ios::out );
00286                     f<<algoStr;
00287                     f.close();
00288                 }
00289 
00290                 // train the ensemble
00291                 Scheduler s;
00292                 s.readMasterDscFile ( datasets[datasetCnt], "Master.dsc" );
00293                 s.train();
00294                 s.predict();
00295 
00296                 // save error
00297                 REAL rmse = s.getPredictionRMSE();
00298                 REAL classErr = s.getClassificationError();
00299                 fAnalyzeResidualCascade<<rmse<<" "<<classErr<<" ";
00300             }
00301             fAnalyzeResidualCascade<<endl;
00302 
00303 
00304             // ========================================= CASCADE TRAINING ==========================================
00305             // through all possible algorithm setups
00306             for ( int run=0;run<m_trainList.size() && enableCascade;run++ )
00307             {
00308                 uint randomSeed = randomSeedSplit + run;
00309 
00310                 printf ( "c" );
00311                 fflush ( stdout );
00312 
00313                 // write Master.dsc file
00314                 string master;
00315                 vector<string> algoDscList;
00316                 for ( int i=0;i<m_trainList[run].size();i++ )
00317                     algoDscList.push_back ( algoDscMap[m_trainList[run][i]] );
00318                 bool clas = true;
00319                 bool cascade = true;
00320                 master = Scheduler::masterDscTemplateGenerator ( datasets[datasetCnt], clas, algoDscList, randomSeed, "TakeLast", cascade );
00321                 f.open ( ( path+"Master.dsc" ).c_str(),ios::out );
00322                 f<<master;
00323                 f.close();
00324 
00325                 // write algorithm *.dsc files
00326                 string preEffect = "";
00327                 for ( int i=0;i<m_trainList[run].size();i++ )
00328                 {
00329                     string algoStr;
00330                     if ( m_trainList[run][i] == "LM" )
00331                         algoStr = LinearModel::templateGenerator ( i+1, preEffect, 1, false );
00332                     else if ( m_trainList[run][i] == "NN" )
00333                         algoStr = NeuralNetwork::templateGenerator ( i+1, preEffect, 1, false );
00334                     else if ( m_trainList[run][i] == "KNN" )
00335                         algoStr = KNearestNeighbor::templateGenerator ( i+1, preEffect, 1, false );
00336                     else if ( m_trainList[run][i] == "PR" )
00337                         algoStr = PolynomialRegression::templateGenerator ( i+1, preEffect, 1, false );
00338                     else if ( m_trainList[run][i] == "KRR" )
00339                         algoStr = KernelRidgeRegression::templateGenerator ( i+1, preEffect, 1, true );
00340                     else
00341                         assert ( false );
00342                     //preEffect = algoPredMap[m_trainList[run][i]];
00343 
00344                     //cout<<"Write:"<<path+algoDscMap[m_trainList[run][i]]<<endl;
00345                     f.open ( ( path+algoDscMap[m_trainList[run][i]] ).c_str(),ios::out );
00346                     f<<algoStr;
00347                     f.close();
00348                 }
00349 
00350                 // train the ensemble
00351                 Scheduler s;
00352                 s.readMasterDscFile ( datasets[datasetCnt], "Master.dsc" );
00353                 s.train();
00354                 s.predict();
00355 
00356                 // save error
00357                 REAL rmse = s.getPredictionRMSE();
00358                 REAL classErr = s.getClassificationError();
00359                 fAnalyzeCascade<<rmse<<" "<<classErr<<" ";
00360             }
00361             fAnalyzeCascade<<endl;
00362 
00363 
00364             // ========================================= STACKING TRAINING ==========================================
00365             // through all possible algorithm setups
00366             for ( int run=0;run<stackingTrainList.size() && enableStacking;run++ )
00367             {
00368                 uint randomSeed = randomSeedSplit + run;
00369 
00370                 printf ( "s" );
00371                 fflush ( stdout );
00372 
00373                 // write Master.dsc file
00374                 string master;
00375                 vector<string> algoDscList;
00376                 for ( int i=0;i<stackingTrainList[run].size();i++ )
00377                     algoDscList.push_back ( algoDscMap[stackingTrainList[run][i]] );
00378                 bool clas = true;
00379                 bool cascade = false;
00380                 master = Scheduler::masterDscTemplateGenerator ( datasets[datasetCnt], clas, algoDscList, randomSeed, "LinearRegressionNonNeg", cascade );
00381                 f.open ( ( path+"Master.dsc" ).c_str(),ios::out );
00382                 f<<master;
00383                 f.close();
00384 
00385                 // write algorithm *.dsc files
00386                 string preEffect = "";
00387                 for ( int i=0;i<stackingTrainList[run].size();i++ )
00388                 {
00389                     string algoStr;
00390                     if ( stackingTrainList[run][i] == "LM" )
00391                         algoStr = LinearModel::templateGenerator ( i+1, preEffect, 1, false );
00392                     else if ( stackingTrainList[run][i] == "NN" )
00393                         algoStr = NeuralNetwork::templateGenerator ( i+1, preEffect, 1, false );
00394                     else if ( stackingTrainList[run][i] == "KNN" )
00395                         algoStr = KNearestNeighbor::templateGenerator ( i+1, preEffect, 1, false );
00396                     else if ( stackingTrainList[run][i] == "PR" )
00397                         algoStr = PolynomialRegression::templateGenerator ( i+1, preEffect, 1, false );
00398                     else if ( stackingTrainList[run][i] == "KRR" )
00399                         algoStr = KernelRidgeRegression::templateGenerator ( i+1, preEffect, 1, false );
00400                     else
00401                         assert ( false );
00402                     //preEffect = algoPredMap[stackingTrainList[run][i]];
00403 
00404                     //cout<<"Write:"<<path+algoDscMap[stackingTrainList[run][i]]<<endl;
00405                     f.open ( ( path+algoDscMap[stackingTrainList[run][i]] ).c_str(),ios::out );
00406                     f<<algoStr;
00407                     f.close();
00408                 }
00409 
00410                 // train the ensemble
00411                 Scheduler s;
00412                 s.readMasterDscFile ( datasets[datasetCnt], "Master.dsc" );
00413 
00414                 s.train();
00415                 s.predict();
00416 
00417                 // bagging
00418                 //Framework::setAdditionalStartupParameter("50");
00419                 //s.bagging();
00420 
00421                 // boosting
00422                 //Framework::setAdditionalStartupParameter("20");
00423                 //s.boosting();
00424 
00425                 // save error
00426                 REAL rmse = s.getPredictionRMSE();
00427                 REAL classErr = s.getClassificationError();
00428                 fAnalyzeStacking<<rmse<<" "<<classErr<<" ";
00429             }
00430             fAnalyzeStacking<<endl;
00431 
00432         }
00433 
00434         printf ( " run: %d[s]\n", ( int ) ( time ( 0 )-runTime ) );
00435 
00436         fAnalyzeResidual.close();
00437         fAnalyzeCascade.close();
00438         fAnalyzeStacking.close();
00439     }
00440 
00441     printf ( "Finished in: %d[s]\n", ( int ) ( time ( 0 )-t0 ) );
00442 }
00443 
00454 void AlgorithmExploration::randPerm ( vector<string> algorithmStack, vector<string> availableAlgorithms, int maxDepth )
00455 {
00456     if ( maxDepth == 0 ) // max depth reached
00457     {
00458         m_trainList.push_back ( algorithmStack );
00459         return;
00460     }
00461 
00462     int size = availableAlgorithms.size();
00463     for ( int i=0;i<size;i++ )
00464     {
00465         vector<string> stack = algorithmStack;
00466         stack.push_back ( availableAlgorithms[i] );
00467         vector<string> algos;
00468         for ( int j=0;j<size;j++ )
00469             if ( i != j )
00470                 algos.push_back ( availableAlgorithms[j] );
00471         randPerm ( stack, algos, maxDepth-1 );
00472     }
00473 }
00474 

Generated on Tue Jan 26 09:20:58 2010 for ELF by  doxygen 1.5.8