#include <AlgorithmExploration.h>
Public Member Functions | |
AlgorithmExploration () | |
~AlgorithmExploration () | |
void | start () |
void | randPerm (vector< string > algorithmStack, vector< string > availableAlgorithms, int maxDepth) |
Private Attributes | |
vector< vector< string > > | m_trainList |
Objective:
Definition at line 24 of file AlgorithmExploration.h.
AlgorithmExploration::AlgorithmExploration | ( | ) |
AlgorithmExploration::~AlgorithmExploration | ( | ) |
void AlgorithmExploration::randPerm | ( | vector< string > | algorithmStack, | |
vector< string > | availableAlgorithms, | |||
int | maxDepth | |||
) |
Solve a combinatorial problem with recursion: draw m out of n algorithms without replacement, where m is the chain depth and n the number of available algorithms.
algorithmStack | The current stack of algorithms (training chain) | |
availableAlgorithms | List of available algorithms (not used before in the chain) |
maxDepth | Number of remaining recursion levels, i.e. algorithms still to be drawn (0 terminates the recursion)
Definition at line 454 of file AlgorithmExploration.cpp.
00455 { 00456 if ( maxDepth == 0 ) // max depth reached 00457 { 00458 m_trainList.push_back ( algorithmStack ); 00459 return; 00460 } 00461 00462 int size = availableAlgorithms.size(); 00463 for ( int i=0;i<size;i++ ) 00464 { 00465 vector<string> stack = algorithmStack; 00466 stack.push_back ( availableAlgorithms[i] ); 00467 vector<string> algos; 00468 for ( int j=0;j<size;j++ ) 00469 if ( i != j ) 00470 algos.push_back ( availableAlgorithms[j] ); 00471 randPerm ( stack, algos, maxDepth-1 ); 00472 } 00473 }
void AlgorithmExploration::start | ( | ) |
Start the exploration process, this is for evaluation of possible configurations This is used to test all possible chains of models on a given dataset Model templates and output filenames can be specified It runs stacking, cascade learning, residual training and cascade+residual
Definition at line 27 of file AlgorithmExploration.cpp.
00028 { 00029 time_t t0 = time ( 0 ); 00030 cout.setOutputFile ( "out.txt" ); 00031 00032 // available algorithms for exploration 00033 vector<string> algos; 00034 algos.push_back ( "LM" ); 00035 algos.push_back ( "NN" ); 00036 algos.push_back ( "KNN" ); 00037 //algos.push_back("PR"); 00038 algos.push_back ( "KRR" ); 00039 00040 map<string,string> algoDscMap; 00041 algoDscMap["LM"] = "LinearModel_1.dsc"; 00042 algoDscMap["NN"] = "NeuralNetwork_1.dsc"; 00043 algoDscMap["KNN"] = "KNearestNeighbor_1.dsc"; 00044 algoDscMap["PR"] = "PolynomialRegression_1.dsc"; 00045 algoDscMap["KRR"] = "KernelRidgeRegression_1.dsc"; 00046 00047 map<string,string> algoPredMap; 00048 algoPredMap["LM"] = "LinearModel_1.dat"; 00049 algoPredMap["NN"] = "NeuralNetwork_1.dat"; 00050 algoPredMap["KNN"] = "KNearestNeighbor_1.dat"; 00051 algoPredMap["PR"] = "PolynomialRegression_1.dat"; 00052 algoPredMap["KRR"] = "KernelRidgeRegression_1.dat"; 00053 00054 // available datasets for exploration 00055 vector<string> datasets; 00056 /*datasets.push_back("CREDIT"); 00057 datasets.push_back("BALANCE"); 00058 datasets.push_back("BREAST"); 00059 datasets.push_back("DIABETES"); 00060 datasets.push_back("GERMAN"); 00061 datasets.push_back("GLASS");*/ 00062 datasets.push_back ( "HEPATITIS" ); 00063 /*datasets.push_back("IONOSPHERE"); 00064 datasets.push_back("IRIS"); 00065 datasets.push_back("SONAR"); 00066 datasets.push_back("SURVIVAL"); 00067 datasets.push_back("VEHICLE"); 00068 datasets.push_back("VOTES"); 00069 datasets.push_back("WINE");*/ 00070 00071 //datasets.push_back("MUSHROOM"); // LARGE*/ 00072 //datasets.push_back("LETTER"); // LARGE 00073 //datasets.push_back("SATIMAGE"); // LARGE 00074 //datasets.push_back("ADULT"); // LARGE 00075 00076 // log file names 00077 string logResidualFilename = "logResidual.txt"; 00078 string logResidualCascadeFilename = "logResidualCascade.txt"; 00079 string logCascadeFilename = "logCascade.txt"; 00080 string logStackingFilename = "logStacking.txt"; 00081 
/*string model = algos[0] + ".txt"; 00082 string logResidualFilename = ""; 00083 string logResidualCascadeFilename = ""; 00084 string logCascadeFilename = model; 00085 string logStackingFilename = ""; 00086 */ 00087 /* 00088 // enable ensemble training methods 00089 bool enableResidual = true; 00090 bool enableResidualCascade = true; 00091 bool enableCascade = true; 00092 bool enableStacking = true; 00093 */ 00094 // bagging 00095 bool enableResidual = false; 00096 bool enableResidualCascade = false; 00097 bool enableCascade = false; 00098 bool enableStacking = true; 00099 00100 00101 // number of test splits 00102 int nTestSplits = 100; 00103 00104 // generate random permutations 00105 m_trainList.clear(); 00106 for ( int depth=1;depth<algos.size() +1;depth++ ) 00107 { 00108 vector<string> stack; 00109 randPerm ( stack, algos, depth ); 00110 } 00111 cout<<endl<<"Residual/Cascade train list"<<endl; 00112 for ( int i=0;i<m_trainList.size();i++ ) 00113 { 00114 for ( int j=0;j<m_trainList[i].size();j++ ) 00115 cout<<m_trainList[i][j]<<" "; 00116 cout<<endl; 00117 } 00118 00119 // generate train list for stacking (no permutations) 00120 cout<<endl<<"Stacking train list"<<endl; 00121 vector<vector<string> > stackingTrainListTmp; 00122 vector<vector<string> > stackingTrainList; 00123 for ( int i=0;i<m_trainList.size();i++ ) 00124 { 00125 stackingTrainListTmp.push_back ( m_trainList[i] ); 00126 sort ( stackingTrainListTmp[i].begin(), stackingTrainListTmp[i].end() ); 00127 00128 bool found = false; 00129 for ( int j=0;j<stackingTrainList.size();j++ ) 00130 { 00131 if ( stackingTrainList[j] == stackingTrainListTmp[i] ) 00132 found = true; 00133 } 00134 if ( found == false ) 00135 { 00136 stackingTrainList.push_back ( m_trainList[i] ); 00137 int s = stackingTrainList.size(); 00138 sort ( stackingTrainList[s-1].begin(), stackingTrainList[s-1].end() ); 00139 } 00140 } 00141 for ( int i=0;i<stackingTrainList.size();i++ ) 00142 { 00143 for ( int 
j=0;j<stackingTrainList[i].size();j++ ) 00144 cout<<stackingTrainList[i][j]<<" "; 00145 cout<<endl; 00146 } 00147 00148 // hide console output 00149 cout.disableAllOutputs(); 00150 00151 // through all datasets 00152 for ( int datasetCnt=0;datasetCnt<datasets.size();datasetCnt++ ) 00153 { 00154 string path = datasets[datasetCnt] + "/"; 00155 printf ( "\nLog: %s (nTestsets:%d)\n", ( path + logResidualFilename ).c_str(),nTestSplits ); 00156 printf ( "Log: %s (nTestsets:%d)\n", ( path + logResidualCascadeFilename ).c_str(),nTestSplits ); 00157 printf ( "Log: %s (nTestsets:%d)\n", ( path + logCascadeFilename ).c_str(),nTestSplits ); 00158 printf ( "Log: %s (nTestsets:%d)\n", ( path + logStackingFilename ).c_str(),nTestSplits ); 00159 fstream fAnalyzeResidual ( ( path + logResidualFilename ).c_str(),ios::out ); 00160 fstream fAnalyzeResidualCascade ( ( path + logResidualCascadeFilename ).c_str(),ios::out ); 00161 fstream fAnalyzeCascade ( ( path + logCascadeFilename ).c_str(),ios::out ); 00162 fstream fAnalyzeStacking ( ( path + logStackingFilename ).c_str(),ios::out ); 00163 fstream f; 00164 00165 time_t runTime = time ( 0 ); 00166 00167 // through all test splits 00168 for ( int testCnt=0;testCnt<nTestSplits;testCnt++ ) 00169 { 00170 printf ( " %d ",testCnt ); 00171 fflush ( stdout ); 00172 00173 uint randomSeedSplit = time ( 0 ) + testCnt; 00174 00175 // ========================================= RESIDUAL TRAINING ========================================== 00176 // through all possible algorithm setups 00177 for ( int run=0;run<m_trainList.size() && enableResidual;run++ ) 00178 { 00179 uint randomSeed = randomSeedSplit + run; 00180 00181 printf ( "r" ); 00182 fflush ( stdout ); 00183 00184 // write Master.dsc file 00185 string master; 00186 vector<string> algoDscList; 00187 for ( int i=0;i<m_trainList[run].size();i++ ) 00188 algoDscList.push_back ( algoDscMap[m_trainList[run][i]] ); 00189 bool clas = true; 00190 bool cascade = false; 00191 master = 
Scheduler::masterDscTemplateGenerator ( datasets[datasetCnt], clas, algoDscList, randomSeed, "LinearRegression", cascade ); 00192 f.open ( ( path+"Master.dsc" ).c_str(),ios::out ); 00193 f<<master; 00194 f.close(); 00195 00196 // write algorithm *.dsc files 00197 string preEffect = ""; 00198 for ( int i=0;i<m_trainList[run].size();i++ ) 00199 { 00200 string algoStr; 00201 if ( m_trainList[run][i] == "LM" ) 00202 algoStr = LinearModel::templateGenerator ( i+1, preEffect, 1, true ); 00203 else if ( m_trainList[run][i] == "NN" ) 00204 algoStr = NeuralNetwork::templateGenerator ( i+1, preEffect, 1, true ); 00205 else if ( m_trainList[run][i] == "KNN" ) 00206 algoStr = KNearestNeighbor::templateGenerator ( i+1, preEffect, 1, true ); 00207 else if ( m_trainList[run][i] == "PR" ) 00208 algoStr = PolynomialRegression::templateGenerator ( i+1, preEffect, 1, true ); 00209 else if ( m_trainList[run][i] == "KRR" ) 00210 algoStr = KernelRidgeRegression::templateGenerator ( i+1, preEffect, 1, true ); 00211 else 00212 assert ( false ); 00213 preEffect = algoPredMap[m_trainList[run][i]]; 00214 00215 //cout<<"Write:"<<path+algoDscMap[m_trainList[run][i]]<<endl; 00216 f.open ( ( path+algoDscMap[m_trainList[run][i]] ).c_str(),ios::out ); 00217 f<<algoStr; 00218 f.close(); 00219 } 00220 00221 // train the ensemble 00222 Scheduler s; 00223 s.readMasterDscFile ( datasets[datasetCnt], "Master.dsc" ); 00224 00225 s.train(); 00226 s.predict(); 00227 00228 // bagging 00229 //Framework::setAdditionalStartupParameter("10"); 00230 //s.bagging(); 00231 00232 // boosting 00233 //Framework::setAdditionalStartupParameter("20"); 00234 //s.boosting(); 00235 00236 // save error 00237 REAL rmse = s.getPredictionRMSE(); 00238 REAL classErr = s.getClassificationError(); 00239 fAnalyzeResidual<<rmse<<" "<<classErr<<" "; 00240 } 00241 fAnalyzeResidual<<endl; 00242 00243 00244 // ========================================= RESIDUAL+CASCADE TRAINING ========================================== 00245 // through 
all possible algorithm setups 00246 for ( int run=0;run<m_trainList.size() && enableResidualCascade;run++ ) 00247 { 00248 uint randomSeed = randomSeedSplit + run; 00249 00250 printf ( "m" ); 00251 fflush ( stdout ); 00252 00253 // write Master.dsc file 00254 string master; 00255 vector<string> algoDscList; 00256 for ( int i=0;i<m_trainList[run].size();i++ ) 00257 algoDscList.push_back ( algoDscMap[m_trainList[run][i]] ); 00258 bool clas = true; 00259 bool cascade = true; 00260 master = Scheduler::masterDscTemplateGenerator ( datasets[datasetCnt], clas, algoDscList, randomSeed, "LinearRegression", cascade ); 00261 f.open ( ( path+"Master.dsc" ).c_str(),ios::out ); 00262 f<<master; 00263 f.close(); 00264 00265 // write algorithm *.dsc files 00266 string preEffect = ""; 00267 for ( int i=0;i<m_trainList[run].size();i++ ) 00268 { 00269 string algoStr; 00270 if ( m_trainList[run][i] == "LM" ) 00271 algoStr = LinearModel::templateGenerator ( i+1, preEffect, 1, true ); 00272 else if ( m_trainList[run][i] == "NN" ) 00273 algoStr = NeuralNetwork::templateGenerator ( i+1, preEffect, 1, true ); 00274 else if ( m_trainList[run][i] == "KNN" ) 00275 algoStr = KNearestNeighbor::templateGenerator ( i+1, preEffect, 1, true ); 00276 else if ( m_trainList[run][i] == "PR" ) 00277 algoStr = PolynomialRegression::templateGenerator ( i+1, preEffect, 1, true ); 00278 else if ( m_trainList[run][i] == "KRR" ) 00279 algoStr = KernelRidgeRegression::templateGenerator ( i+1, preEffect, 1, true ); 00280 else 00281 assert ( false ); 00282 preEffect = algoPredMap[m_trainList[run][i]]; 00283 00284 //cout<<"Write:"<<path+algoDscMap[m_trainList[run][i]]<<endl; 00285 f.open ( ( path+algoDscMap[m_trainList[run][i]] ).c_str(),ios::out ); 00286 f<<algoStr; 00287 f.close(); 00288 } 00289 00290 // train the ensemble 00291 Scheduler s; 00292 s.readMasterDscFile ( datasets[datasetCnt], "Master.dsc" ); 00293 s.train(); 00294 s.predict(); 00295 00296 // save error 00297 REAL rmse = s.getPredictionRMSE(); 
00298 REAL classErr = s.getClassificationError(); 00299 fAnalyzeResidualCascade<<rmse<<" "<<classErr<<" "; 00300 } 00301 fAnalyzeResidualCascade<<endl; 00302 00303 00304 // ========================================= CASCADE TRAINING ========================================== 00305 // through all possible algorithm setups 00306 for ( int run=0;run<m_trainList.size() && enableCascade;run++ ) 00307 { 00308 uint randomSeed = randomSeedSplit + run; 00309 00310 printf ( "c" ); 00311 fflush ( stdout ); 00312 00313 // write Master.dsc file 00314 string master; 00315 vector<string> algoDscList; 00316 for ( int i=0;i<m_trainList[run].size();i++ ) 00317 algoDscList.push_back ( algoDscMap[m_trainList[run][i]] ); 00318 bool clas = true; 00319 bool cascade = true; 00320 master = Scheduler::masterDscTemplateGenerator ( datasets[datasetCnt], clas, algoDscList, randomSeed, "TakeLast", cascade ); 00321 f.open ( ( path+"Master.dsc" ).c_str(),ios::out ); 00322 f<<master; 00323 f.close(); 00324 00325 // write algorithm *.dsc files 00326 string preEffect = ""; 00327 for ( int i=0;i<m_trainList[run].size();i++ ) 00328 { 00329 string algoStr; 00330 if ( m_trainList[run][i] == "LM" ) 00331 algoStr = LinearModel::templateGenerator ( i+1, preEffect, 1, false ); 00332 else if ( m_trainList[run][i] == "NN" ) 00333 algoStr = NeuralNetwork::templateGenerator ( i+1, preEffect, 1, false ); 00334 else if ( m_trainList[run][i] == "KNN" ) 00335 algoStr = KNearestNeighbor::templateGenerator ( i+1, preEffect, 1, false ); 00336 else if ( m_trainList[run][i] == "PR" ) 00337 algoStr = PolynomialRegression::templateGenerator ( i+1, preEffect, 1, false ); 00338 else if ( m_trainList[run][i] == "KRR" ) 00339 algoStr = KernelRidgeRegression::templateGenerator ( i+1, preEffect, 1, true ); 00340 else 00341 assert ( false ); 00342 //preEffect = algoPredMap[m_trainList[run][i]]; 00343 00344 //cout<<"Write:"<<path+algoDscMap[m_trainList[run][i]]<<endl; 00345 f.open ( ( path+algoDscMap[m_trainList[run][i]] 
).c_str(),ios::out ); 00346 f<<algoStr; 00347 f.close(); 00348 } 00349 00350 // train the ensemble 00351 Scheduler s; 00352 s.readMasterDscFile ( datasets[datasetCnt], "Master.dsc" ); 00353 s.train(); 00354 s.predict(); 00355 00356 // save error 00357 REAL rmse = s.getPredictionRMSE(); 00358 REAL classErr = s.getClassificationError(); 00359 fAnalyzeCascade<<rmse<<" "<<classErr<<" "; 00360 } 00361 fAnalyzeCascade<<endl; 00362 00363 00364 // ========================================= STACKING TRAINING ========================================== 00365 // through all possible algorithm setups 00366 for ( int run=0;run<stackingTrainList.size() && enableStacking;run++ ) 00367 { 00368 uint randomSeed = randomSeedSplit + run; 00369 00370 printf ( "s" ); 00371 fflush ( stdout ); 00372 00373 // write Master.dsc file 00374 string master; 00375 vector<string> algoDscList; 00376 for ( int i=0;i<stackingTrainList[run].size();i++ ) 00377 algoDscList.push_back ( algoDscMap[stackingTrainList[run][i]] ); 00378 bool clas = true; 00379 bool cascade = false; 00380 master = Scheduler::masterDscTemplateGenerator ( datasets[datasetCnt], clas, algoDscList, randomSeed, "LinearRegressionNonNeg", cascade ); 00381 f.open ( ( path+"Master.dsc" ).c_str(),ios::out ); 00382 f<<master; 00383 f.close(); 00384 00385 // write algorithm *.dsc files 00386 string preEffect = ""; 00387 for ( int i=0;i<stackingTrainList[run].size();i++ ) 00388 { 00389 string algoStr; 00390 if ( stackingTrainList[run][i] == "LM" ) 00391 algoStr = LinearModel::templateGenerator ( i+1, preEffect, 1, false ); 00392 else if ( stackingTrainList[run][i] == "NN" ) 00393 algoStr = NeuralNetwork::templateGenerator ( i+1, preEffect, 1, false ); 00394 else if ( stackingTrainList[run][i] == "KNN" ) 00395 algoStr = KNearestNeighbor::templateGenerator ( i+1, preEffect, 1, false ); 00396 else if ( stackingTrainList[run][i] == "PR" ) 00397 algoStr = PolynomialRegression::templateGenerator ( i+1, preEffect, 1, false ); 00398 else if ( 
stackingTrainList[run][i] == "KRR" ) 00399 algoStr = KernelRidgeRegression::templateGenerator ( i+1, preEffect, 1, false ); 00400 else 00401 assert ( false ); 00402 //preEffect = algoPredMap[stackingTrainList[run][i]]; 00403 00404 //cout<<"Write:"<<path+algoDscMap[stackingTrainList[run][i]]<<endl; 00405 f.open ( ( path+algoDscMap[stackingTrainList[run][i]] ).c_str(),ios::out ); 00406 f<<algoStr; 00407 f.close(); 00408 } 00409 00410 // train the ensemble 00411 Scheduler s; 00412 s.readMasterDscFile ( datasets[datasetCnt], "Master.dsc" ); 00413 00414 s.train(); 00415 s.predict(); 00416 00417 // bagging 00418 //Framework::setAdditionalStartupParameter("50"); 00419 //s.bagging(); 00420 00421 // boosting 00422 //Framework::setAdditionalStartupParameter("20"); 00423 //s.boosting(); 00424 00425 // save error 00426 REAL rmse = s.getPredictionRMSE(); 00427 REAL classErr = s.getClassificationError(); 00428 fAnalyzeStacking<<rmse<<" "<<classErr<<" "; 00429 } 00430 fAnalyzeStacking<<endl; 00431 00432 } 00433 00434 printf ( " run: %d[s]\n", ( int ) ( time ( 0 )-runTime ) ); 00435 00436 fAnalyzeResidual.close(); 00437 fAnalyzeCascade.close(); 00438 fAnalyzeStacking.close(); 00439 } 00440 00441 printf ( "Finished in: %d[s]\n", ( int ) ( time ( 0 )-t0 ) ); 00442 }