#include <AlgorithmExploration.h>
Public Member Functions | |
AlgorithmExploration () | |
~AlgorithmExploration () | |
void | start () |
void | randPerm (vector< string > algorithmStack, vector< string > availableAlgorithms, int maxDepth) |
Private Attributes | |
vector< vector< string > > | m_trainList |
Objective:
Definition at line 24 of file AlgorithmExploration.h.
AlgorithmExploration::AlgorithmExploration | ( | ) |
AlgorithmExploration::~AlgorithmExploration | ( | ) |
void AlgorithmExploration::randPerm | ( | vector< string > | algorithmStack, | |
vector< string > | availableAlgorithms, | |||
int | maxDepth | |||
) |
Solve a combinatorial problem with recursion: draw m out of n algorithms without replacement, where m is the chain depth and n the number of available algorithms.
algorithmStack | The current stack of algorithms (training chain) | |
availableAlgorithms | List of available algorithms (not used before in the chain) |
maxDepth | Number of remaining recursion levels, i.e. algorithms still to be drawn (0 terminates the recursion)
Definition at line 454 of file AlgorithmExploration.cpp.
00455 { 00456 if ( maxDepth == 0 ) // max depth reached 00457 { 00458 m_trainList.push_back ( algorithmStack ); 00459 return; 00460 } 00461 00462 int size = availableAlgorithms.size(); 00463 for ( int i=0;i<size;i++ ) 00464 { 00465 vector<string> stack = algorithmStack; 00466 stack.push_back ( availableAlgorithms[i] ); 00467 vector<string> algos; 00468 for ( int j=0;j<size;j++ ) 00469 if ( i != j ) 00470 algos.push_back ( availableAlgorithms[j] ); 00471 randPerm ( stack, algos, maxDepth-1 ); 00472 } 00473 }
void AlgorithmExploration::start | ( | ) |
Start the exploration process, this is for evaluation of possible configurations This is used to test all possible chains of models on a given dataset Model templates and output filenames can be specified It runs stacking, cascade learning, residual training and cascade+residual
Definition at line 27 of file AlgorithmExploration.cpp.
00028 { 00029 time_t t0 = time ( 0 ); 00030 cout.setOutputFile ( "out.txt" ); 00031 00032 // available algorithms for exploration 00033 vector<string> algos; 00034 algos.push_back ( "LM" ); 00035 algos.push_back ( "NN" ); 00036 algos.push_back ( "KNN" ); 00037 //algos.push_back("PR"); 00038 algos.push_back ( "KRR" ); 00039 00040 map<string,string> algoDscMap; 00041 algoDscMap["LM"] = "LinearModel_1.dsc"; 00042 algoDscMap["NN"] = "NeuralNetwork_1.dsc"; 00043 algoDscMap["KNN"] = "KNearestNeighbor_1.dsc"; 00044 algoDscMap["PR"] = "PolynomialRegression_1.dsc"; 00045 algoDscMap["KRR"] = "KernelRidgeRegression_1.dsc"; 00046 00047 map<string,string> algoPredMap; 00048 algoPredMap["LM"] = "LinearModel_1.dat"; 00049 algoPredMap["NN"] = "NeuralNetwork_1.dat"; 00050 algoPredMap["KNN"] = "KNearestNeighbor_1.dat"; 00051 algoPredMap["PR"] = "PolynomialRegression_1.dat"; 00052 algoPredMap["KRR"] = "KernelRidgeRegression_1.dat"; 00053 00054 // available datasets for exploration 00055 vector<string> datasets; 00056 /*datasets.push_back("CREDIT"); 00057 datasets.push_back("BALANCE"); 00058 datasets.push_back("BREAST"); 00059 datasets.push_back("DIABETES"); 00060 datasets.push_back("GERMAN"); 00061 datasets.push_back("GLASS");*/ 00062 datasets.push_back ( "HEPATITIS" ); 00063 /*datasets.push_back("IONOSPHERE"); 00064 datasets.push_back("IRIS"); 00065 datasets.push_back("SONAR"); 00066 datasets.push_back("SURVIVAL"); 00067 datasets.push_back("VEHICLE"); 00068 datasets.push_back("VOTES"); 00069 datasets.push_back("WINE");*/ 00070 00071 //datasets.push_back("MUSHROOM"); // LARGE*/ 00072 //datasets.push_back("LETTER"); // LARGE 00073 //datasets.push_back("SATIMAGE"); // LARGE 00074 //datasets.push_back("ADULT"); // LARGE 00075 00076 // log file names 00077 string logResidualFilename = "logResidual.txt"; 00078 string logResidualCascadeFilename = "logResidualCascade.txt"; 00079 string logCascadeFilename = "logCascade.txt"; 00080 string logStackingFilename = "logStacking.txt"; 00081 
/*string model = algos[0] + ".txt"; 00082 string logResidualFilename = ""; 00083 string logResidualCascadeFilename = ""; 00084 string logCascadeFilename = model; 00085 string logStackingFilename = ""; 00086 */ 00087 /* 00088 // enable ensemble training methods 00089 bool enableResidual = true; 00090 bool enableResidualCascade = true; 00091 bool enableCascade = true; 00092 bool enableStacking = true; 00093 */ 00094 // bagging 00095 bool enableResidual = false; 00096 bool enableResidualCascade = false; 00097 bool enableCascade = false; 00098 bool enableStacking = true; 00099 00100 00101 // number of test splits 00102 int nTestSplits = 100; 00103 00104 // generate random permutations 00105 m_trainList.clear(); 00106 for ( int depth=1;depth<algos.size() +1;depth++ ) 00107 { 00108 vector<string> stack; 00109 randPerm ( stack, algos, depth ); 00110 } 00111 cout<<endl<<"Residual/Cascade train list"<<endl; 00112 for ( int i=0;i<m_trainList.size();i++ ) 00113 { 00114 for ( int j=0;j<m_trainList[i].size();j++ ) 00115 cout<<m_trainList[i][j]<<" "; 00116 cout<<endl; 00117 } 00118 00119 // generate train list for stacking (no permutations) 00120 cout<<endl<<"Stacking train list"<<endl; 00121 vector<vector<string> > stackingTrainListTmp; 00122 vector<vector<string> > stackingTrainList; 00123 for ( int i=0;i<m_trainList.size();i++ ) 00124 { 00125 stackingTrainListTmp.push_back ( m_trainList[i] ); 00126 sort ( stackingTrainListTmp[i].begin(), stackingTrainListTmp[i].end() ); 00127 00128 bool found = false; 00129 for ( int j=0;j<stackingTrainList.size();j++ ) 00130 { 00131 if ( stackingTrainList[j] == stackingTrainListTmp[i] ) 00132 found = true; 00133 } 00134 if ( found == false ) 00135 { 00136 stackingTrainList.push_back ( m_trainList[i] ); 00137 int s = stackingTrainList.size(); 00138 sort ( stackingTrainList[s-1].begin(), stackingTrainList[s-1].end() ); 00139 } 00140 } 00141 for ( int i=0;i<stackingTrainList.size();i++ ) 00142 { 00143 for ( int 
j=0;j<stackingTrainList[i].size();j++ ) 00144 cout<<stackingTrainList[i][j]<<" "; 00145 cout<<endl; 00146 } 00147 00148 // hide console output 00149 cout.disableAllOutputs(); 00150 00151 // through all datasets 00152 for ( int datasetCnt=0;datasetCnt<datasets.size();datasetCnt++ ) 00153 { 00154 string path = datasets[datasetCnt] + "/"; 00155 printf ( "\nLog: %s (nTestsets:%d)\n", ( path + logResidualFilename ).c_str(),nTestSplits ); 00156 printf ( "Log: %s (nTestsets:%d)\n", ( path + logResidualCascadeFilename ).c_str(),nTestSplits ); 00157 printf ( "Log: %s (nTestsets:%d)\n", ( path + logCascadeFilename ).c_str(),nTestSplits ); 00158 printf ( "Log: %s (nTestsets:%d)\n", ( path + logStackingFilename ).c_str(),nTestSplits ); 00159 fstream fAnalyzeResidual ( ( path + logResidualFilename ).c_str(),ios::out ); 00160 fstream fAnalyzeResidualCascade ( ( path + logResidualCascadeFilename ).c_str(),ios::out ); 00161 fstream fAnalyzeCascade ( ( path + logCascadeFilename ).c_str(),ios::out ); 00162 fstream fAnalyzeStacking ( ( path + logStackingFilename ).c_str(),ios::out ); 00163 fstream f; 00164 00165 time_t runTime = time ( 0 ); 00166 00167 // through all test splits 00168 for ( int testCnt=0;testCnt<nTestSplits;testCnt++ ) 00169 { 00170 printf ( " %d ",testCnt ); 00171 fflush ( stdout ); 00172 00173 uint randomSeedSplit = time ( 0 ) + testCnt; 00174 00175 // ========================================= RESIDUAL TRAINING ========================================== 00176 // through all possible algorithm setups 00177 for ( int run=0;run<m_trainList.size() && enableResidual;run++ ) 00178 { 00179 uint randomSeed = randomSeedSplit + run; 00180 00181 printf ( "r" ); 00182 fflush ( stdout ); 00183 00184 // write Master.dsc file 00185 string master; 00186 vector<string> algoDscList; 00187 for ( int i=0;i<m_trainList[run].size();i++ ) 00188 algoDscList.push_back ( algoDscMap[m_trainList[run][i]] ); 00189 bool clas = true; 00190 bool cascade = false; 00191 master = 
Scheduler::masterDscTemplateGenerator ( datasets[datasetCnt], clas, algoDscList, randomSeed, "LinearRegression", cascade ); 00192 f.open ( ( path+"Master.dsc" ).c_str(),ios::out ); 00193 f<<master; 00194 f.close(); 00195 00196 // write algorithm *.dsc files 00197 string preEffect = ""; 00198 for ( int i=0;i<m_trainList[run].size();i++ ) 00199 { 00200 string algoStr; 00201 if ( m_trainList[run][i] == "LM" ) 00202 algoStr = LinearModel::templateGenerator ( i+1, preEffect, 1, true ); 00203 else if ( m_trainList[run][i] == "NN" ) 00204 algoStr = NeuralNetwork::templateGenerator ( i+1, preEffect, 1, true ); 00205 else if ( m_trainList[run][i] == "KNN" ) 00206 algoStr = KNearestNeighbor::templateGenerator ( i+1, preEffect, 1, true ); 00207 else if ( m_trainList[run][i] == "PR" ) 00208 algoStr = PolynomialRegression::templateGenerator ( i+1, preEffect, 1, true ); 00209 else if ( m_trainList[run][i] == "KRR" ) 00210 algoStr = KernelRidgeRegression::templateGenerator ( i+1, preEffect, 1, true ); 00211 else 00212 assert ( false ); 00213 preEffect = algoPredMap[m_trainList[run][i]]; 00214 00215 //cout<<"Write:"<<path+algoDscMap[m_trainList[run][i]]<<endl; 00216 f.open ( ( path+algoDscMap[m_trainList[run][i]] ).c_str(),ios::out ); 00217 f<<algoStr; 00218 f.close(); 00219 } 00220 00221 // train the ensemble 00222 Scheduler s; 00223 s.readMasterDscFile ( datasets[datasetCnt], "Master.dsc" ); 00224 00225 s.train(); 00226 s.predict(); 00227 00228 // bagging 00229 //Framework::setAdditionalStartupParameter("10"); 00230 //s.bagging(); 00231 00232 // boosting 00233 //Framework::setAdditionalStartupParameter("20"); 00234 //s.boosting(); 00235 00236 // save error 00237 REAL rmse = s.getPredictionRMSE(); 00238 REAL classErr = s.getClassificationError(); 00239 fAnalyzeResidual<<rmse<<" "<<classErr<<" "; 00240 } 00241 fAnalyzeResidual<<endl; 00242 00243 00244 // ========================================= RESIDUAL+CASCADE TRAINING ========================================== 00245 // through 
all possible algorithm setups 00246 for ( int run=0;run<m_trainList.size() && enableResidualCascade;run++ ) 00247 { 00248 uint randomSeed = randomSeedSplit + run; 00249 00250 printf ( "m" ); 00251 fflush ( stdout ); 00252 00253 // write Master.dsc file 00254 string master; 00255 vector<string> algoDscList; 00256 for ( int i=0;i<m_trainList[run].size();i++ ) 00257 algoDscList.push_back ( algoDscMap[m_trainList[run][i]] ); 00258 bool clas = true; 00259 bool cascade = true; 00260 master = Scheduler::masterDscTemplateGenerator ( datasets[datasetCnt], clas, algoDscList, randomSeed, "LinearRegression", cascade ); 00261 f.open ( ( path+"Master.dsc" ).c_str(),ios::out ); 00262 f<<master; 00263 f.close(); 00264 00265 // write algorithm *.dsc files 00266 string preEffect = ""; 00267 for ( int i=0;i<m_trainList[run].size();i++ ) 00268 { 00269 string algoStr; 00270 if ( m_trainList[run][i] == "LM" ) 00271 algoStr = LinearModel::templateGenerator ( i+1, preEffect, 1, true ); 00272 else if ( m_trainList[run][i] == "NN" ) 00273 algoStr = NeuralNetwork::templateGenerator ( i+1, preEffect, 1, true ); 00274 else if ( m_trainList[run][i] == "KNN" ) 00275 algoStr = KNearestNeighbor::templateGenerator ( i+1, preEffect, 1, true ); 00276 else if ( m_trainList[run][i] == "PR" ) 00277 algoStr = PolynomialRegression::templateGenerator ( i+1, preEffect, 1, true ); 00278 else if ( m_trainList[run][i] == "KRR" ) 00279 algoStr = KernelRidgeRegression::templateGenerator ( i+1, preEffect, 1, true ); 00280 else 00281 assert ( false ); 00282 preEffect = algoPredMap[m_trainList[run][i]]; 00283 00284 //cout<<"Write:"<<path+algoDscMap[m_trainList[run][i]]<<endl; 00285 f.open ( ( path+algoDscMap[m_trainList[run][i]] ).c_str(),ios::out ); 00286 f<<algoStr; 00287 f.close(); 00288 } 00289 00290 // train the ensemble 00291 Scheduler s; 00292 s.readMasterDscFile ( datasets[datasetCnt], "Master.dsc" ); 00293 s.train(); 00294 s.predict(); 00295 00296 // save error 00297 REAL rmse = s.getPredictionRMSE(); 
00298 REAL classErr = s.getClassificationError(); 00299 fAnalyzeResidualCascade<<rmse<<" "<<classErr<<" "; 00300 } 00301 fAnalyzeResidualCascade<<endl; 00302 00303 00304 // ========================================= CASCADE TRAINING ========================================== 00305 // through all possible algorithm setups 00306 for ( int run=0;run<m_trainList.size() && enableCascade;run++ ) 00307 { 00308 uint randomSeed = randomSeedSplit + run; 00309 00310 printf ( "c" ); 00311 fflush ( stdout ); 00312 00313 // write Master.dsc file 00314 string master; 00315 vector<string> algoDscList; 00316 for ( int i=0;i<m_trainList[run].size();i++ ) 00317 algoDscList.push_back ( algoDscMap[m_trainList[run][i]] ); 00318 bool clas = true; 00319 bool cascade = true; 00320 master = Scheduler::masterDscTemplateGenerator ( datasets[datasetCnt], clas, algoDscList, randomSeed, "TakeLast", cascade ); 00321 f.open ( ( path+"Master.dsc" ).c_str(),ios::out ); 00322 f<<master; 00323 f.close(); 00324 00325 // write algorithm *.dsc files 00326 string preEffect = ""; 00327 for ( int i=0;i<m_trainList[run].size();i++ ) 00328 { 00329 string algoStr; 00330 if ( m_trainList[run][i] == "LM" ) 00331 algoStr = LinearModel::templateGenerator ( i+1, preEffect, 1, false ); 00332 else if ( m_trainList[run][i] == "NN" ) 00333 algoStr = NeuralNetwork::templateGenerator ( i+1, preEffect, 1, false ); 00334 else if ( m_trainList[run][i] == "KNN" ) 00335 algoStr = KNearestNeighbor::templateGenerator ( i+1, preEffect, 1, false ); 00336 else if ( m_trainList[run][i] == "PR" ) 00337 algoStr = PolynomialRegression::templateGenerator ( i+1, preEffect, 1, false ); 00338 else if ( m_trainList[run][i] == "KRR" ) 00339 algoStr = KernelRidgeRegression::templateGenerator ( i+1, preEffect, 1, true ); 00340 else 00341 assert ( false ); 00342 //preEffect = algoPredMap[m_trainList[run][i]]; 00343 00344 //cout<<"Write:"<<path+algoDscMap[m_trainList[run][i]]<<endl; 00345 f.open ( ( path+algoDscMap[m_trainList[run][i]] 
).c_str(),ios::out ); 00346 f<<algoStr; 00347 f.close(); 00348 } 00349 00350 // train the ensemble 00351 Scheduler s; 00352 s.readMasterDscFile ( datasets[datasetCnt], "Master.dsc" ); 00353 s.train(); 00354 s.predict(); 00355 00356 // save error 00357 REAL rmse = s.getPredictionRMSE(); 00358 REAL classErr = s.getClassificationError(); 00359 fAnalyzeCascade<<rmse<<" "<<classErr<<" "; 00360 } 00361 fAnalyzeCascade<<endl; 00362 00363 00364 // ========================================= STACKING TRAINING ========================================== 00365 // through all possible algorithm setups 00366 for ( int run=0;run<stackingTrainList.size() && enableStacking;run++ ) 00367 { 00368 uint randomSeed = randomSeedSplit + run; 00369 00370 printf ( "s" ); 00371 fflush ( stdout ); 00372 00373 // write Master.dsc file 00374 string master; 00375 vector<string> algoDscList; 00376 for ( int i=0;i<stackingTrainList[run].size();i++ ) 00377 algoDscList.push_back ( algoDscMap[stackingTrainList[run][i]] ); 00378 bool clas = true; 00379 bool cascade = false; 00380 master = Scheduler::masterDscTemplateGenerator ( datasets[datasetCnt], clas, algoDscList, randomSeed, "LinearRegressionNonNeg", cascade ); 00381 f.open ( ( path+"Master.dsc" ).c_str(),ios::out ); 00382 f<<master; 00383 f.close(); 00384 00385 // write algorithm *.dsc files 00386 string preEffect = ""; 00387 for ( int i=0;i<stackingTrainList[run].size();i++ ) 00388 { 00389 string algoStr; 00390 if ( stackingTrainList[run][i] == "LM" ) 00391 algoStr = LinearModel::templateGenerator ( i+1, preEffect, 1, false ); 00392 else if ( stackingTrainList[run][i] == "NN" ) 00393 algoStr = NeuralNetwork::templateGenerator ( i+1, preEffect, 1, false ); 00394 else if ( stackingTrainList[run][i] == "KNN" ) 00395 algoStr = KNearestNeighbor::templateGenerator ( i+1, preEffect, 1, false ); 00396 else if ( stackingTrainList[run][i] == "PR" ) 00397 algoStr = PolynomialRegression::templateGenerator ( i+1, preEffect, 1, false ); 00398 else if ( 
stackingTrainList[run][i] == "KRR" ) 00399 algoStr = KernelRidgeRegression::templateGenerator ( i+1, preEffect, 1, false ); 00400 else 00401 assert ( false ); 00402 //preEffect = algoPredMap[stackingTrainList[run][i]]; 00403 00404 //cout<<"Write:"<<path+algoDscMap[stackingTrainList[run][i]]<<endl; 00405 f.open ( ( path+algoDscMap[stackingTrainList[run][i]] ).c_str(),ios::out ); 00406 f<<algoStr; 00407 f.close(); 00408 } 00409 00410 // train the ensemble 00411 Scheduler s; 00412 s.readMasterDscFile ( datasets[datasetCnt], "Master.dsc" ); 00413 00414 s.train(); 00415 s.predict(); 00416 00417 // bagging 00418 //Framework::setAdditionalStartupParameter("50"); 00419 //s.bagging(); 00420 00421 // boosting 00422 //Framework::setAdditionalStartupParameter("20"); 00423 //s.boosting(); 00424 00425 // save error 00426 REAL rmse = s.getPredictionRMSE(); 00427 REAL classErr = s.getClassificationError(); 00428 fAnalyzeStacking<<rmse<<" "<<classErr<<" "; 00429 } 00430 fAnalyzeStacking<<endl; 00431 00432 } 00433 00434 printf ( " run: %d[s]\n", ( int ) ( time ( 0 )-runTime ) ); 00435 00436 fAnalyzeResidual.close(); 00437 fAnalyzeCascade.close(); 00438 fAnalyzeStacking.close(); 00439 } 00440 00441 printf ( "Finished in: %d[s]\n", ( int ) ( time ( 0 )-t0 ) ); 00442 }