Scheduler Class Reference

#include <Scheduler.h>

Inheritance diagram for Scheduler:

Framework AlgorithmExploration

List of all members.

Public Member Functions

 Scheduler ()
 ~Scheduler ()
void readMasterDscFile (string path, string masterName)
void train ()
void predict ()
void blend ()
void bagging ()
void boosting ()
REAL getPredictionRMSE ()
REAL getClassificationError ()

Static Public Member Functions

static string masterDscTemplateGenerator (string dataset, bool isClass, vector< string > algos, int rSeed, string blendAlgo, bool cascade)

Private Member Functions

void trainAlgorithm (string fnameTemplate, string fnameDsc)
void checkAlgorithmTemplate (string fname, string &algoName, string &id)
void setPredictionModeInAlgorithm (string fname)
int getIDFromFullPredictor (string fullPredictor)
void algorithmDispatcher (Algorithm *&algo, string name)
void getEnsemblePrediction (REAL *input, REAL *output)
void preparePredictionMode ()
int getIndexOfMax (REAL *vector, int length)
void endPredictionMode ()

Private Attributes

vector< string > m_algorithmList
vector< int > m_algorithmIDList
vector< string > m_algorithmNameList
Data * m_data
vector< Algorithm * > m_algorithmObjectList
vector< Algorithm ** > m_algorithmObjectListList
BlendStopping * m_blender
BlendingNN * m_blenderNN
int * m_labelsPredict
int * m_effectID
REAL * m_noEffect
REAL ** m_outputs
REAL ** m_effects
REAL m_predictionRMSE
REAL m_predictionClassificationError
REAL * m_outputVectorTmp
int * m_labelsTmp
bool m_baggingRun
bool m_boostingRun
uint m_randSeedBagBoost
REAL * m_probs
REAL * m_boostingTrain
REAL * m_boostingTargets
int m_boostingNTrain
int m_boostingEpoch


Detailed Description

Schedules the training and prediction. Reads the master-description *dsc file.

Possible operation modes

This class can force the ready-to-prediction mode in the ensemble. Very useful for predicting any test feature.

Definition at line 41 of file Scheduler.h.


Constructor & Destructor Documentation

Scheduler::Scheduler (  ) 

Constructor

Definition at line 8 of file Scheduler.cpp.

00009 {
00010     cout<<"Scheduler"<<endl;
00011     // init member vars
00012     m_data = 0;
00013     m_blender = 0;
00014     m_blenderNN = 0;
00015     m_labelsPredict = 0;
00016     m_effectID = 0;
00017     m_noEffect = 0;
00018     m_outputs = 0;
00019     m_effects = 0;
00020     m_predictionRMSE = 0;
00021     m_predictionClassificationError = 0;
00022 
00023     m_data = new Data();
00024     m_data->setPathes ( TMP_PATH, DSC_PATH, FULL_PREDICTOR_PATH, DATA_PATH );
00025 
00026     m_baggingRun = 0;
00027     m_boostingRun = 0;
00028     m_randSeedBagBoost = 0;
00029     m_probs = 0;
00030     m_boostingTrain = 0;
00031     m_boostingTargets = 0;
00032     m_boostingNTrain = 0;
00033 
00034 }

Scheduler::~Scheduler (  ) 

Destructor

Definition at line 39 of file Scheduler.cpp.

00040 {
00041     cout<<"descructor Scheduler"<<endl;
00042     if ( m_data )
00043         delete m_data;
00044     m_data = 0;
00045 }


Member Function Documentation

void Scheduler::algorithmDispatcher ( Algorithm *&  algo,
string  name 
) [private]

Make a new instance of an Algorithm based on the model name

Parameters:
algo Reference to the Algorithm object pointer
name Name of the model

Definition at line 1468 of file Scheduler.cpp.

01469 {
01470     if ( name == "LinearModel" )
01471         algo = new LinearModel();
01472     else if ( name == "KNearestNeighbor" )
01473         algo = new KNearestNeighbor();
01474     else if ( name == "NeuralNetwork" )
01475         algo = new NeuralNetwork();
01476     else if ( name == "PolynomialRegression" )
01477         algo = new PolynomialRegression();
01478     else if ( name == "LinearModelNonNeg" )
01479         algo = new LinearModelNonNeg();
01480     else if ( name == "KernelRidgeRegression" )
01481         algo = new KernelRidgeRegression();
01482     else if ( name == "NeuralNetworkRBMauto" )
01483         algo = new NeuralNetworkRBMauto();
01484     else if ( name == "Autoencoder" )
01485         algo = new Autoencoder();
01486     else if ( name == "GBDT" )
01487         algo = new GBDT();
01488     else if ( name == "LogisticRegression" )
01489         algo = new LogisticRegression();
01490     else
01491         assert ( false );
01492 }

void Scheduler::bagging (  ) 

Generate a bagging ensemble: produce a set of predictions with a modified train set, measuring accuracy on the test set.

Definition at line 600 of file Scheduler.cpp.

00601 {
00602     int epochs = Framework::getAdditionalStartupParameter();
00603     cout<<endl<<endl;
00604     cout<<"================================= Bagging ================================="<<endl;
00605     cout<<"epochs:"<<epochs<<endl<<endl<<endl;
00606     m_baggingRun = true;
00607 
00608     vector<string> baggingFileNames;
00609     uint testSize = 0;
00610     double rmseMean = 0.0, classErrMean = 0.0;
00611 
00612     for ( int e=0;e<epochs;e++ )
00613     {
00614         cout<<"e:"<<e<<endl;
00615 
00616         m_randSeedBagBoost = e + 1;
00617 
00618         // train and predict testset
00619         train();
00620         predict();
00621 
00622         rmseMean += getPredictionRMSE();
00623         classErrMean += getClassificationError();
00624 
00625         fstream fTest ( ( m_data->m_datasetPath+"/"+TMP_PATH+"/testPrediction.data" ).c_str(),ios::in );
00626         if ( fTest.is_open() ==false )
00627             assert ( false );
00628         char buf[512];
00629         sprintf ( buf,"%s.%d", ( m_data->m_datasetPath+"/"+TMP_PATH+"/testPrediction.data" ).c_str(),e );
00630         baggingFileNames.push_back ( buf );
00631         fstream fTmp ( buf,ios::out );
00632 
00633         // get length of file
00634         fTest.seekg ( 0, ios::end );
00635         uint length = fTest.tellg();
00636         testSize = length/sizeof ( REAL );
00637         fTest.seekg ( 0, ios::beg );
00638 
00639         // allocate memory
00640         char* buffer = new char[length];
00641 
00642         // read data as a block
00643         fTest.read ( buffer,length );
00644         fTest.close();
00645 
00646         // write
00647         fTmp.write ( buffer,length );
00648         delete[] buffer;
00649 
00650         fTmp.close();
00651     }
00652 
00653 
00654     srand ( m_data->m_randSeed );
00655     m_data->readDataset ( m_data->m_datasetName );
00656 
00657     testSize = m_data->m_nTest * m_data->m_nClass * m_data->m_nDomain;
00658 
00659     // calc bag mean
00660     REAL* testMean = new REAL[testSize];
00661     for ( int i=0;i<testSize;i++ )
00662         testMean[i] = 0.0;
00663     for ( int e=0;e<epochs;e++ )
00664     {
00665         char nameBuf[512];
00666         sprintf ( nameBuf,"%s.%d", ( m_data->m_datasetPath+"/"+TMP_PATH+"/testPrediction.data" ).c_str(),e );
00667         fstream f ( nameBuf,ios::in );
00668         float* buf = new float[testSize];
00669         f.read ( ( char* ) buf,sizeof ( float ) *testSize );
00670         f.close();
00671 
00672         // add this run to ensemble
00673         for ( int i=0;i<testSize;i++ )
00674             testMean[i] += buf[i];
00675 
00676         delete[] buf;
00677 
00678 
00679         // per epoch: calculate prediction RMSE and classification error
00680         double classErrBag = 0.0;
00681         double rmseBag = 0.0;
00682 
00683         // go through the test set
00684         for ( uint i=0;i<m_data->m_nTest;i++ )
00685         {
00686             REAL* ensembleOutput = testMean + i * m_data->m_nClass*m_data->m_nDomain;
00687             REAL* ensembleOutputNorm = new REAL[m_data->m_nClass*m_data->m_nDomain];
00688             for ( int j=0;j<m_data->m_nClass*m_data->m_nDomain;j++ )
00689                 ensembleOutputNorm[j] = ensembleOutput[j] / ( ( double ) e+1.0 );
00690 
00691             // if the dataset has classification type, count the #wrong labeled
00692             if ( Framework::getDatasetType() )
00693             {
00694                 for ( int d=0;d<m_data->m_nDomain;d++ )
00695                     if ( getIndexOfMax ( ensembleOutputNorm + d * m_data->m_nClass, m_data->m_nClass ) != m_data->m_testLabelOrig[d+i*m_data->m_nDomain] )
00696                         classErrBag += 1.0;
00697             }
00698 
00699             // rmse calculation over all targets
00700             for ( int j=0;j<m_data->m_nClass*m_data->m_nDomain;j++ )
00701             {
00702                 REAL target = m_data->m_testTargetOrig[i * m_data->m_nClass * m_data->m_nDomain + j];
00703                 REAL prediction = ensembleOutputNorm[j];
00704                 rmseBag += ( prediction - target ) * ( prediction - target );
00705             }
00706 
00707             delete[] ensembleOutputNorm;
00708         }
00709 
00710         if ( Framework::getDatasetType() )
00711             classErrBag = 100.0*classErrBag/ ( ( double ) m_data->m_nTest* ( double ) m_data->m_nDomain );
00712         rmseBag = sqrt ( rmseBag/ ( double ) ( m_data->m_nClass*m_data->m_nDomain*m_data->m_nTest ) );
00713         cout<<e<<": "<<"RMSE:"<<rmseBag<<" classErr:"<<classErrBag<<endl;
00714 
00715     }
00716 
00717     // take the mean
00718     for ( int i=0;i<testSize;i++ )
00719         testMean[i] /= ( REAL ) epochs;
00720 
00721     fstream fTest ( ( m_data->m_datasetPath+"/"+TMP_PATH+"/testPrediction.data" ).c_str(),ios::in );
00722     if ( fTest.is_open() ==false )
00723         assert ( false );
00724     fTest.write ( ( char* ) testMean,sizeof ( float ) *testSize );
00725     fTest.close();
00726 
00727 
00728     // calculate prediction RMSE and classification error
00729     double classErrBag = 0.0;
00730     double rmseBag = 0.0;
00731 
00732     // go through the test set
00733     for ( uint i=0;i<m_data->m_nTest;i++ )
00734     {
00735         REAL* ensembleOutput = testMean + i * m_data->m_nClass*m_data->m_nDomain;
00736 
00737         // if the dataset has classification type, count the #wrong labeled
00738         if ( Framework::getDatasetType() )
00739         {
00740             for ( int d=0;d<m_data->m_nDomain;d++ )
00741                 if ( getIndexOfMax ( ensembleOutput + d * m_data->m_nClass, m_data->m_nClass ) != m_data->m_testLabelOrig[d+i*m_data->m_nDomain] )
00742                     classErrBag += 1.0;
00743         }
00744 
00745         // rmse calculation over all targets
00746         for ( int j=0;j<m_data->m_nClass*m_data->m_nDomain;j++ )
00747         {
00748             REAL target = m_data->m_testTargetOrig[i * m_data->m_nClass * m_data->m_nDomain + j];
00749             REAL prediction = ensembleOutput[j];
00750             rmseBag += ( prediction - target ) * ( prediction - target );
00751         }
00752 
00753     }
00754 
00755     // calc errors
00756     if ( Framework::getDatasetType() )
00757         classErrBag = 100.0*classErrBag/ ( ( double ) m_data->m_nTest* ( double ) m_data->m_nDomain );
00758     rmseBag = sqrt ( rmseBag/ ( double ) ( m_data->m_nClass * m_data->m_nDomain * m_data->m_nTest ) );
00759 
00760     m_predictionRMSE = rmseBag;
00761     m_predictionClassificationError = classErrBag;
00762 
00763     cout<<endl;
00764     cout<<epochs<<" runs"<<endl;
00765     cout<<"Bagging runs (with boostrap sample):   rmseMean:"<<rmseMean/ ( double ) epochs<<"   classErrMean:"<<classErrMean/ ( double ) epochs<<endl;
00766     cout<<"Bagged (mean)                      :   rmse    :"<<rmseBag<<"   classErr    :"<<classErrBag<<endl<<endl;
00767 
00768     delete[] testMean;
00769 }

void Scheduler::blend (  ) 

Blend the predictions with a neural network

Definition at line 405 of file Scheduler.cpp.

00406 {
00407     Framework::setFrameworkMode ( 0 );
00408 
00409     cout<<"Start blending after training"<<endl;
00410 
00411     // set the list of already trained predictors
00412     m_data->setAlgorithmList ( vector<string> ( m_algorithmList.begin(), m_algorithmList.end() ) );
00413 
00414     // fix random seed
00415     srand ( m_data->m_randSeed );
00416 
00417     // fill the data object with the dataset
00418     cout<<"Fill data"<<endl;
00419     m_data->readDataset ( m_data->m_datasetName );
00420     srand ( m_data->m_randSeed );
00421     m_data->allocMemForCrossValidationSets();
00422     m_data->partitionDatasetToCrossValidationSets();
00423 
00424     if ( m_data->m_enablePostNNBlending )
00425     {
00426         m_data->readDscFile ( m_data->m_datasetPath + "/NeuralNetwork.dscblend" );
00427 
00428         BlendingNN nn;
00429         nn.setDataPointers ( m_data );
00430         nn.readSpecificMaps();
00431         nn.init();
00432         nn.train();
00433     }
00434     else
00435     {
00436         BlendStopping bb ( ( Algorithm* ) m_data, "" );
00437         bb.setRegularization ( m_data->m_blendingRegularization );
00438         double rmse = bb.calcBlending();
00439         cout<<"BLEND RMSE OF ACTUAL FULLPREDICTION PATH:"<<rmse<<endl;
00440         bb.saveBlendingWeights ( m_data->m_datasetPath + "/" + m_data->m_tempPath );
00441     }
00442 }

void Scheduler::boosting (  ) 

Generate a boosting ensemble: reweight sample probabilities based on train error.

Definition at line 776 of file Scheduler.cpp.

00777 {
00778     int epochs = Framework::getAdditionalStartupParameter();
00779     cout<<endl<<endl;
00780     cout<<"================================= Boosting ================================="<<endl;
00781     cout<<"epochs:"<<epochs<<endl<<endl<<endl;
00782     m_boostingRun = true;
00783 
00784     vector<string> boostingFileNames;
00785     uint testSize = 0;
00786     double rmseMean = 0.0, classErrMean = 0.0;
00787     REAL* beta = new REAL[epochs];
00788     for ( m_boostingEpoch=0;m_boostingEpoch<epochs;m_boostingEpoch++ )
00789     {
00790         cout<<"e:"<<m_boostingEpoch<<endl;
00791 
00792         m_randSeedBagBoost = m_boostingEpoch;
00793 
00794         // train and predict testset (testset must be fixed)
00795         train();
00796         predict();
00797 
00798         fstream f ( "A.txt",ios::out );
00799         for ( int i=0;i<m_boostingNTrain;i++ )
00800             f<<m_probs[i]<<endl;
00801         f.close();
00802 
00803         rmseMean += getPredictionRMSE();
00804         classErrMean += getClassificationError();
00805 
00806         fstream fTest ( ( m_data->m_datasetPath+"/"+TMP_PATH+"/testPrediction.data" ).c_str(),ios::in );
00807         if ( fTest.is_open() ==false )
00808             assert ( false );
00809         char buf[512];
00810         sprintf ( buf,"%s.%d", ( m_data->m_datasetPath+"/"+TMP_PATH+"/testPrediction.data" ).c_str(),m_boostingEpoch );
00811         boostingFileNames.push_back ( buf );
00812         fstream fTmp ( buf,ios::out );
00813 
00814         // get length of file
00815         fTest.seekg ( 0, ios::end );
00816         uint length = fTest.tellg();
00817         testSize = length/sizeof ( float );
00818         fTest.seekg ( 0, ios::beg );
00819 
00820         // allocate memory
00821         char* buffer = new char [length];
00822 
00823         // read data as a block
00824         fTest.read ( buffer,length );
00825         fTest.close();
00826 
00827         // write
00828         fTmp.write ( buffer,length );
00829         delete[] buffer;
00830 
00831         fTmp.close();
00832 
00833         // ==================== predict train set =====================
00834         double rmseBoost = 0.0, epsilon = 0.0, rmseTrain = 0.0;
00835         REAL min = m_data->m_negativeTarget, max = m_data->m_positiveTarget;
00836         Framework::setFrameworkMode ( 1 );
00837 
00838         preparePredictionMode();
00839         REAL* ensembleOutput = new REAL[m_data->m_nClass*m_data->m_nDomain];
00840         REAL* loss = new REAL[m_boostingNTrain];
00841         // go through the train set
00842         int nOut = m_data->m_nClass*m_data->m_nDomain;
00843         for ( int i=0;i<m_boostingNTrain;i++ )
00844         {
00845             // predict one example
00846             REAL* inputFeature = m_boostingTrain + i * m_data->m_nFeatures;
00847             getEnsemblePrediction ( inputFeature, ensembleOutput );
00848 
00849             // rmse calculation over all targets
00850             REAL err = 0.0, err2 = 0.0;
00851             for ( int j=0;j<m_data->m_nDomain;j++ )
00852             {
00853                 int indMax = -1;
00854                 REAL maxTarget = -1e10;
00855                 for ( int k=0;k<m_data->m_nClass;k++ )
00856                     if ( maxTarget < m_boostingTargets[i * nOut + m_data->m_nClass*j + k] )
00857                     {
00858                         maxTarget = m_boostingTargets[i * nOut + m_data->m_nClass*j + k];
00859                         indMax = k;
00860                     }
00861                 if ( indMax == -1 )
00862                     assert ( false );
00863                 for ( int k=0;k<m_data->m_nClass;k++ )
00864                 {
00865                     if ( indMax != k )
00866                     {
00867                         REAL predictionTarget = ensembleOutput[m_data->m_nClass*j + indMax];
00868                         REAL prediction = ensembleOutput[m_data->m_nClass*j + k];
00869 
00870                         err += 1.0 - ( predictionTarget-min ) / ( max-min ) + ( prediction-min ) / ( max-min );
00871                         err2 += 1.0 + ( predictionTarget-min ) / ( max-min ) - ( prediction-min ) / ( max-min );
00872                     }
00873                 }
00874 
00875                 for ( int j=0;j<m_data->m_nDomain;j++ )
00876                     for ( int k=0;k<m_data->m_nClass;k++ )
00877                     {
00878                         REAL out = ensembleOutput[m_data->m_nClass*j + k];
00879                         REAL target = m_boostingTargets[i * nOut + m_data->m_nClass*j + k];
00880                         rmseTrain += ( out-target ) * ( out-target );
00881                     }
00882 
00883             }
00884             epsilon += m_probs[i] * err / ( REAL ) ( m_data->m_nClass-1 );
00885             loss[i] = err2 / ( REAL ) ( m_data->m_nClass-1 );
00886         }
00887         rmseTrain = sqrt ( rmseTrain/ ( double ) ( m_boostingNTrain*m_data->m_nClass*m_data->m_nDomain ) );
00888         cout<<"rmseTrain(boosting):"<<rmseTrain<<endl;
00889         epsilon *= 0.5;
00890         beta[m_boostingEpoch] = epsilon / ( 1.0 - epsilon );
00891         // update example probabilities
00892         for ( int i=0;i<m_boostingNTrain;i++ )
00893             m_probs[i] *= pow ( beta[m_boostingEpoch], 0.5 * loss[i] );
00894         double sum = 0.0;
00895         for ( int i=0;i<m_boostingNTrain;i++ )
00896             sum += m_probs[i];
00897         // normalize
00898         for ( int i=0;i<m_boostingNTrain;i++ )
00899             m_probs[i] /= sum;
00900 
00901         delete[] loss;
00902         delete[] ensembleOutput;
00903 
00904         endPredictionMode();
00905     }
00906 
00907     // read test data
00908     srand ( m_data->m_randSeed );
00909     m_data->readDataset ( m_data->m_datasetName );
00910 
00911     // calc boosting mean
00912     cout<<endl<<endl<<"#test values:"<<testSize<<" (dataset size:"<<m_data->m_nTest<<")"<<endl;
00913     REAL* testMean = new REAL[testSize];
00914     for ( int i=0;i<testSize;i++ )
00915         testMean[i] = 0.0;
00916     for ( int e=0;e<epochs;e++ )
00917     {
00918         cout<<"Cascade layer "<<e<<": weight:"<<log10 ( 1.0/beta[e] ) <<"  "<<flush;
00919         fstream f ( boostingFileNames[e].c_str(),ios::in );
00920         if ( f.is_open() == false )
00921             assert ( false );
00922         float* buf = new float[testSize];
00923         f.read ( ( char* ) buf,sizeof ( float ) *testSize );
00924         f.close();
00925 
00926         // add this run to ensemble
00927         for ( int i=0;i<testSize;i++ )
00928         {
00929             REAL w = log10 ( 1.0/beta[e] );
00930             testMean[i] += w*buf[i];
00931         }
00932         delete[] buf;
00933 
00934 
00935         // Calculate per-epoch errors
00936         // go through the test set
00937         double classErrBoostingPerEpoch = 0.0;
00938         double rmseBoostingPerEpoch = 0.0;
00939         double rmseBoostingPerEpoch0 = 0.0;
00940         double rmseBoostingPerEpoch1 = 0.0;
00941         for ( int i=0;i<m_data->m_nTest;i++ )
00942         {
00943             REAL* ensembleOutput = testMean + i * m_data->m_nClass*m_data->m_nDomain;
00944             REAL* ensembleOutputNorm0 = new REAL[m_data->m_nClass*m_data->m_nDomain];
00945             REAL* ensembleOutputNorm1 = new REAL[m_data->m_nClass*m_data->m_nDomain];
00946 
00947             REAL norm0 = 0.0;
00948             for ( int j=0;j<=e;j++ )
00949                 norm0 += log10 ( 1.0/beta[e] );
00950             for ( int j=0;j<m_data->m_nClass*m_data->m_nDomain;j++ )
00951             {
00952                 ensembleOutputNorm0[j] = ensembleOutput[j]/ ( REAL ) ( e+1 );
00953                 ensembleOutputNorm1[j] = ensembleOutput[j]/norm0;
00954             }
00955 
00956             // if the dataset has classification type, count the #wrong labeled
00957             if ( Framework::getDatasetType() )
00958             {
00959                 for ( int d=0;d<m_data->m_nDomain;d++ )
00960                     if ( getIndexOfMax ( ensembleOutput + d * m_data->m_nClass, m_data->m_nClass ) != m_data->m_testLabelOrig[d+i*m_data->m_nDomain] )
00961                         classErrBoostingPerEpoch += 1.0;
00962             }
00963 
00964             // rmse calculation over all targets
00965             for ( int j=0;j<m_data->m_nClass*m_data->m_nDomain;j++ )
00966             {
00967                 REAL target = m_data->m_testTargetOrig[i * m_data->m_nClass * m_data->m_nDomain + j];
00968                 REAL prediction = ensembleOutput[j];
00969                 rmseBoostingPerEpoch += ( prediction - target ) * ( prediction - target );
00970 
00971                 prediction = ensembleOutputNorm0[j];
00972                 rmseBoostingPerEpoch0 += ( prediction - target ) * ( prediction - target );
00973 
00974                 prediction = ensembleOutputNorm1[j];
00975                 rmseBoostingPerEpoch1 += ( prediction - target ) * ( prediction - target );
00976             }
00977 
00978             delete[] ensembleOutputNorm0;
00979             delete[] ensembleOutputNorm1;
00980         }
00981         // calc errors
00982         if ( Framework::getDatasetType() )
00983             classErrBoostingPerEpoch = 100.0*classErrBoostingPerEpoch/ ( ( double ) m_data->m_nTest* ( double ) m_data->m_nDomain );
00984         rmseBoostingPerEpoch = sqrt ( rmseBoostingPerEpoch/ ( double ) ( m_data->m_nClass * m_data->m_nDomain * m_data->m_nTest ) );
00985         rmseBoostingPerEpoch0 = sqrt ( rmseBoostingPerEpoch0/ ( double ) ( m_data->m_nClass * m_data->m_nDomain * m_data->m_nTest ) );
00986         rmseBoostingPerEpoch1 = sqrt ( rmseBoostingPerEpoch1/ ( double ) ( m_data->m_nClass * m_data->m_nDomain * m_data->m_nTest ) );
00987         cout<<"Boosting:  rmse:"<<rmseBoostingPerEpoch<<"  rmse0:"<<rmseBoostingPerEpoch0<<"  rmse1:"<<rmseBoostingPerEpoch1<<"  classErr:"<<classErrBoostingPerEpoch<<"%"<<endl;
00988     }
00989 
00990     // take the mean
00991     for ( int i=0;i<testSize;i++ )
00992         testMean[i] /= ( REAL ) epochs;
00993 
00994     fstream fTest ( ( m_data->m_datasetPath+"/"+TMP_PATH+"/testPrediction.data" ).c_str(),ios::in );
00995     if ( fTest.is_open() ==false )
00996         assert ( false );
00997     fTest.write ( ( char* ) testMean,sizeof ( float ) *testSize );
00998     fTest.close();
00999 
01000 
01001     // calculate prediction RMSE and classification error
01002     double classErrBoosting = 0.0;
01003     double rmseBoosting = 0.0;
01004 
01005     // go through the test set
01006     for ( int i=0;i<m_data->m_nTest;i++ )
01007     {
01008         REAL* ensembleOutput = testMean + i * m_data->m_nClass*m_data->m_nDomain;
01009 
01010         // if the dataset has classification type, count the #wrong labeled
01011         if ( Framework::getDatasetType() )
01012         {
01013             for ( int d=0;d<m_data->m_nDomain;d++ )
01014                 if ( getIndexOfMax ( ensembleOutput + d * m_data->m_nClass, m_data->m_nClass ) != m_data->m_testLabelOrig[d+i*m_data->m_nDomain] )
01015                     classErrBoosting += 1.0;
01016         }
01017 
01018         // rmse calculation over all targets
01019         for ( int j=0;j<m_data->m_nClass*m_data->m_nDomain;j++ )
01020         {
01021             REAL target = m_data->m_testTargetOrig[i * m_data->m_nClass * m_data->m_nDomain + j];
01022             REAL prediction = ensembleOutput[j];
01023             rmseBoosting += ( prediction - target ) * ( prediction - target );
01024         }
01025 
01026     }
01027 
01028     // calc errors
01029     if ( Framework::getDatasetType() )
01030         classErrBoosting = 100.0*classErrBoosting/ ( ( double ) m_data->m_nTest* ( double ) m_data->m_nDomain );
01031     rmseBoosting = sqrt ( rmseBoosting/ ( double ) ( m_data->m_nClass * m_data->m_nDomain * m_data->m_nTest ) );
01032 
01033     m_predictionRMSE = rmseBoosting;
01034     m_predictionClassificationError = classErrBoosting;
01035 
01036     cout<<endl;
01037     cout<<epochs<<" runs"<<endl;
01038     cout<<"Boosting runs (mean boostrap sample):   rmseMean:"<<rmseMean/ ( double ) epochs<<"   classErrMean:"<<classErrMean/ ( double ) epochs<<"%"<<endl;
01039     cout<<"Boosting (mean)                     :   rmse    :"<<rmseBoosting<<"   classErr    :"<<classErrBoosting<<"%"<<endl<<endl;
01040 
01041     delete[] testMean;
01042 }

void Scheduler::checkAlgorithmTemplate ( string  fname,
string &  algoName,
string &  id 
) [private]

Check if the Algorithm dsc file has no errors

Parameters:
fname Dsc filename of Algorithm
algoName Reference to the algorithm name (KNN, NN, LinearModel, ..)
id Reference to the ID (ascending number from 0,1..)

Definition at line 1051 of file Scheduler.cpp.

01052 {
01053     // check, if the algorithm line exists
01054     fstream f ( fname.c_str(), ios::in );
01055     if ( f.is_open() == false )
01056         assert ( false );
01057     string firstLine, secondLine, thirdLine;
01058     f>>firstLine;
01059     f>>secondLine;
01060     f>>thirdLine;
01061     f.close();
01062     int pos = firstLine.find ( "=" );
01063     string name = firstLine.substr ( 0, pos );
01064     algoName = firstLine.substr ( pos+1 );
01065     if ( name != "ALGORITHM" )
01066     {
01067         cout<<"Wrong dsc file, no ALGORITHM=.. found in first line"<<endl;
01068         exit ( 0 );
01069     }
01070     pos = secondLine.find ( "=" );
01071     name = secondLine.substr ( 0, pos );
01072     id = secondLine.substr ( pos+1 );
01073     if ( name != "ID" )
01074     {
01075         cout<<"Wrong dsc file, no ID=.. found in second line"<<endl;
01076         exit ( 0 );
01077     }
01078 }

void Scheduler::endPredictionMode (  )  [private]

End of the prediction mode. Deallocation of memory.

Definition at line 1351 of file Scheduler.cpp.

01352 {
01353     cout<<"End scheduled prediction"<<endl;
01354     m_data->deleteMemory();
01355 
01356     for ( int i=0;i<m_algorithmObjectList.size();i++ )
01357         delete m_algorithmObjectList[i];
01358     m_algorithmObjectList.clear();
01359 
01360     if ( m_data->m_enablePostNNBlending )
01361         delete m_blenderNN;
01362     delete m_blender;
01363     delete[] m_effectID;
01364     int N = m_algorithmList.size();
01365     for ( int i=0;i<N+1;i++ )
01366     {
01367         delete[] m_outputs[i];
01368         delete[] m_effects[i];
01369     }
01370     delete[] m_noEffect;
01371     delete[] m_outputs;
01372     delete[] m_effects;
01373     delete[] m_labelsPredict;
01374 
01375 }

REAL Scheduler::getClassificationError (  ) 

Return classification error of last testset prediction

Definition at line 1559 of file Scheduler.cpp.

01560 {
01561     return m_predictionClassificationError;
01562 }

void Scheduler::getEnsemblePrediction ( REAL *  input,
REAL *  output 
) [private]

Predict a target vector for a given input feature, based on the trained ensemble.

Parameters:
input REAL* vector to original input feature (read)
output REAL* vector to target (write)

Definition at line 1167 of file Scheduler.cpp.

01168 {
01169     int N = m_algorithmList.size();
01170     REAL* tmp = new REAL[m_data->m_nFeatures+N];
01171     
01172     // predict all targets per algorithm
01173     // if the algorithm needs a preprocessor, the effect file is loaded
01174     for ( int i=0;i<N;i++ )
01175     {
01176         // effect = pre-processor for this algorithm
01177         int ID = m_effectID[i];
01178         REAL* effect = m_noEffect;  // constant zero
01179         REAL* outputVector = m_outputs[i+1]; // +1: jump over constant 1
01180         if ( ID != 0 )
01181         {
01182             if ( ID < 0 || ID > i )
01183                 assert ( false );
01184             effect = m_outputs[ID];  // output of another prediction as effect
01185         }
01186 
01187         // cascade learning: add predictions of previous model as input to current
01188         if ( m_data->m_enableCascadeLearning )
01189         {
01190             int nF = m_data->m_nFeatures;
01191             int nFAlgo = m_algorithmObjectList[i]->m_nFeatures;
01192 
01193             // add input feature + normalize
01194             for ( int j=0;j<nF;j++ )
01195                 tmp[j] = ( input[j] - m_data->m_mean[j] ) / m_data->m_std[j];
01196 
01197             // add predictions + normalize
01198             for ( int j=0;j<i;j++ ) // over all previous models
01199             {
01200                 REAL* previousOutputVector = m_outputs[j+1];
01201                 int nOut = m_data->m_nClass*m_data->m_nDomain;
01202                 for ( int k=0;k<nOut;k++ )
01203                     tmp[nF+j*nOut+k] = ( previousOutputVector[k] - m_data->m_mean[nF+j*nOut+k] ) / m_data->m_std[nF+j*nOut+k];
01204             }
01205         }
01206         else  // standard
01207         {
01208             for ( int j=0;j<m_data->m_nFeatures;j++ )
01209                 tmp[j] = ( input[j] - m_data->m_mean[j] ) / m_data->m_std[j];
01210         }
01211 
01212         if ( m_data->m_validationType=="Retraining" || m_data->m_validationType=="ValidationSet")
01213             m_algorithmObjectList[i]->predictMultipleOutputs ( tmp, effect, outputVector, m_labelsPredict, 1, m_data->m_nCross );
01214         else if ( m_data->m_validationType=="CrossFoldMean" || m_data->m_validationType=="Bagging" )
01215         {
01216             for ( int j=0;j<m_data->m_nClass*m_data->m_nDomain;j++ )
01217                 outputVector[j] = 0.0;
01218             for ( int j=0;j<m_data->m_nCross;j++ )
01219             {
01220                 m_algorithmObjectListList[i][j]->predictMultipleOutputs ( tmp, effect, m_outputVectorTmp, m_labelsTmp, 1, j );
01221                 for ( int k=0;k<m_data->m_nClass*m_data->m_nDomain;k++ )
01222                     outputVector[k] += m_outputVectorTmp[k];
01223             }
01224             for ( int j=0;j<m_data->m_nClass*m_data->m_nDomain;j++ )
01225                 outputVector[j] /= ( REAL ) m_data->m_nCross;
01226 
01227             // calc output labels (for classification dataset)
01228             if ( Framework::getDatasetType() )
01229             {
01230                 // in all domains
01231                 for ( int d=0;d<m_data->m_nDomain;d++ )
01232                 {
01233                     // find max. output value
01234                     int indMax = -1;
01235                     REAL max = -1e10;
01236                     for ( int j=0;j<m_data->m_nClass;j++ )
01237                     {
01238                         if ( max < outputVector[d*m_data->m_nClass+j] )
01239                         {
01240                             max = outputVector[d*m_data->m_nClass+j];
01241                             indMax = j;
01242                         }
01243                     }
01244                     m_labelsPredict[d] = indMax;
01245                 }
01246             }
01247 
01248         }
01249         else
01250             assert(false);
01251     }
01252 
01253     delete[] tmp;
01254     
01255     // calculate the ensemble output with the blender
01256     if ( m_data->m_enablePostNNBlending )
01257         m_blenderNN->predictEnsembleOutput ( m_outputs, output );
01258     else
01259         m_blender->predictEnsembleOutput ( m_outputs, output );
01260 }

int Scheduler::getIDFromFullPredictor ( string  fullPredictor  )  [private]

Returns the ID from the corresponding dsc-file of the given full-prediction file

Parameters:
fullPredictor Full-predictor file name
Returns:
The id from the dsc-file (which belongs to the full-predictor)

Definition at line 1149 of file Scheduler.cpp.

01150 {
01151     if ( fullPredictor=="" )
01152         return 0;
01153     for ( int i=0;i<m_algorithmObjectList.size();i++ )
01154         if ( m_algorithmObjectList[i]->m_stringMap["fullPrediction"] == fullPredictor )
01155             return m_algorithmObjectList[i]->m_algorithmID;
01156     cout<<"Error, this fullPredictor was not found:"<<fullPredictor<<endl;
01157     assert ( false );
01158 }

int Scheduler::getIndexOfMax ( REAL *  vector,
int  length 
) [private]

Find the largest element in a vector and return the index

Parameters:
vector Input REAL vector
length The number of elements of vector
Returns:
The index of the largest element

Definition at line 1384 of file Scheduler.cpp.

01385 {
01386     int indMax = -1;
01387     REAL max = -1e10;
01388     for ( int i=0;i<length;i++ )
01389     {
01390         if ( max < vector[i] )
01391         {
01392             max = vector[i];
01393             indMax = i;
01394         }
01395     }
01396 
01397     return indMax;
01398 }

REAL Scheduler::getPredictionRMSE (  ) 

Return rmse of last testset prediction

Definition at line 1550 of file Scheduler.cpp.

01551 {
01552     return m_predictionRMSE;
01553 }

string Scheduler::masterDscTemplateGenerator ( string  dataset,
bool  isClass,
vector< string >  algos,
int  rSeed,
string  blendAlgo,
bool  cascade 
) [static]

Generates a template of the master description file. This is an example of a Master.dsc file.

Returns:
The template string

Definition at line 1500 of file Scheduler.cpp.

01501 {
01502     stringstream s;
01503     s<<"dataset="<<dataset<<endl;
01504     s<<"isClassificationDataset="<<isClass<<endl;
01505     s<<"maxThreads=2"<<endl;
01506     s<<"maxThreadsInCross=2"<<endl;
01507     s<<"nCrossValidation=6"<<endl;
01508     s<<"validationType=Retraining"<<endl;
01509     s<<"positiveTarget=1.0"<<endl;
01510     s<<"negativeTarget=-1.0"<<endl;
01511     s<<"randomSeed="<<rSeed<<endl;
01512     s<<"nMixDataset=20"<<endl;
01513     s<<"nMixTrainList=100"<<endl;
01514     s<<"standardDeviationMin=0.01"<<endl;
01515     s<<"blendingRegularization=1e-4"<<endl;
01516     s<<"blendingEnableCrossValidation=0"<<endl;
01517     s<<"blendingAlgorithm="<<blendAlgo<<endl;
01518     s<<"enablePostNNBlending=0"<<endl;
01519     s<<"enableCascadeLearning="<<cascade<<endl;
01520     s<<"enableGlobalMeanStdEstimate=0"<<endl;
01521     s<<"enableSaveMemory=1"<<endl;
01522     s<<"addOutputNoise=0"<<endl;
01523     s<<"enablePostBlendClipping=0"<<endl;
01524     s<<"enableFeatureSelection=0"<<endl;
01525     s<<"featureSelectionWriteBinaryDataset=0"<<endl;
01526     s<<"enableGlobalBlendingWeights=1"<<endl;
01527     s<<"errorFunction=RMSE"<<endl;
01528     s<<"disableWriteDscFile=0"<<endl;
01529     s<<"enableStaticNormalization=0"<<endl;
01530     s<<"staticMeanNormalization=0.0"<<endl;
01531     s<<"staticStdNormalization=1.0"<<endl;
01532     s<<"enableProbablisticNormalization=0"<<endl;
01533     s<<"dimensionalityReduction=no"<<endl;
01534     s<<"subsampleTrainSet=1.0"<<endl;
01535     s<<"subsampleFeatures=1.0"<<endl;
01536     s<<"globalTrainingLoops=1"<<endl;
01537     s<<"addConstantInput=0"<<endl;
01538     s<<endl;
01539     s<<"[ALGORITHMS]"<<endl;
01540     for ( int i=0;i<algos.size();i++ )
01541         s<<algos[i]<<endl;
01542 
01543     return s.str();
01544 }

void Scheduler::predict (  ) 

Predict the testset and save the predictions to a binary output file.

Definition at line 450 of file Scheduler.cpp.

00451 {
    // Predict every example of the testset with the full ensemble,
    // accumulate RMSE / classification error, and stream the raw
    // predictions (single-precision float) to a binary output file.
00452     Framework::setFrameworkMode ( 1 );
00453 
    // Loads the testset, normalizations and blending weights and puts
    // every algorithm of the ensemble into prediction mode.
00454     preparePredictionMode();
00455 
    // one progress dot per ~1% of the testset
00456     int progress = m_data->m_nTest / 100 + 1;
00457     double mean = 0.0, rmse = 0.0;
00458 
00459     // output file (binary)
00460     string fname;
    // NETFLIX with a slot number >= 0: write into the fixed slot directory.
00461     if ( m_data->m_datasetName=="NETFLIX" && Framework::getAdditionalStartupParameter() >= 0 )
00462     {
00463         cout<<"Dataset:NETFLIX, slot:"<<Framework::getAdditionalStartupParameter() <<" ";
00464         char buf[512];
00465         sprintf ( buf,"p%d",Framework::getAdditionalStartupParameter() );
00466         fname = string ( NETFLIX_SLOTDATA_ROOT_DIR ) + buf + "/testPrediction.data";
00467         cout<<"pName:"<<fname<<endl;
00468     }
    // NETFLIX with a parameter < -100: file name encodes seed + algorithm list.
00469     else if ( m_data->m_datasetName=="NETFLIX" && Framework::getAdditionalStartupParameter() < -100 )
00470     {
00471         char buf[512];
00472         sprintf ( buf,"ELFprediction%d",Framework::getRandomSeed() );
00473         string algos;
00474         for ( int i=0;i<m_algorithmList.size();i++ )
00475             algos += "_" + m_algorithmList[i];
00476         fname = m_data->m_datasetPath + "/" + m_data->m_tempPath + "/" + buf + algos + ".dat";
00477         cout<<"pName:"<<fname<<endl;
00478     }
    // Default: random suffix so concurrent runs do not overwrite each other.
00479     else
00480     {
00481         char nr[512];
00482         sprintf ( nr,"%d",rand() );
00483         fname = m_data->m_datasetPath + "/" + m_data->m_tempPath + "/testPrediction" + string ( nr ) + ".data";
00484         //fname = m_data->m_datasetPath + "/" + m_data->m_tempPath + "/testPrediction.data";
00485     }
00486 
    // NOTE(review): the file receives raw binary float data via write()
    // below, but is opened without ios::binary — confirm this is intended
    // (matters on platforms with text-mode newline translation).
00487     fstream fOutput ( fname.c_str(),ios::out );
00488 
00489     // the output vector of the ensemble
00490     REAL* ensembleOutput = new REAL[m_data->m_nClass*m_data->m_nDomain];
00491 
    // per-domain counter of wrongly predicted labels (classification only)
    // NOTE(review): wrongLabelCnt is never freed in this function — leak.
00492     int* wrongLabelCnt = new int[m_data->m_nDomain];
00493     for ( int i=0;i<m_data->m_nDomain;i++ )
00494         wrongLabelCnt[i] = 0;
00495 
00496     // store the real input dimension of data
00497     int nrFeat = m_data->m_nFeatures;
00498 
00499     m_outputVectorTmp = new REAL[m_data->m_nClass*m_data->m_nDomain];
00500     m_labelsTmp = new int[m_data->m_nClass*m_data->m_nDomain];
00501 
00502     // load the autoencoder net
00503     Autoencoder* autoEnc = 0;
00504     bool enableAutoencoder = false;
00505     REAL* autoencoderOutput = 0;
00506     if ( m_data->m_dimensionalityReduction == "Autoencoder" )
00507     {
00508         autoEnc = new Autoencoder();
00509         autoEnc->setDataPointers ( m_data );
00510         autoEnc->loadWeights();
00511         autoEnc->loadNormalizations();
00512         enableAutoencoder = true;
00513         autoencoderOutput = new REAL[autoEnc->m_nClass];
        // NOTE(review): m_nFeatures is overwritten here and not restored
        // before return; nrFeat above keeps the original value for indexing.
00514         m_data->m_nFeatures = autoEnc->m_nClass;  //  modify input dimension
00515     }
00516 
00517     cout<<endl<<"predict(100 dots): "<<flush;
00518     time_t t0 = time ( 0 );
00519 
00520     // go through the test set
00521     for ( uint i=0;i<m_data->m_nTest;i++ )
00522     {
00523         if ( i % progress == 0 )
00524             cout<<"."<<flush;
00525 
00526         // predict one example
00527         REAL* inputFeature = m_data->m_testOrig + i * ( uint ) nrFeat;
00528 
        // if enabled, compress the feature first, then feed the ensemble
00529         if ( enableAutoencoder )
00530         {
00531             autoEnc->predictAllOutputs ( inputFeature, autoencoderOutput, 1, 0 );
00532             getEnsemblePrediction ( autoencoderOutput, ensembleOutput );
00533         }
00534         else
00535             getEnsemblePrediction ( inputFeature, ensembleOutput );
00536 
00537         // if the dataset has classification type, count the #wrong labeled
00538         if ( Framework::getDatasetType() )
00539         {
00540             for ( uint d=0;d<m_data->m_nDomain;d++ )
00541                 if ( getIndexOfMax ( ensembleOutput + d * m_data->m_nClass, m_data->m_nClass ) != m_data->m_testLabelOrig[d+i* ( uint ) m_data->m_nDomain] )
00542                     wrongLabelCnt[d]++;
00543         }
00544 
00545         // rmse calculation over all targets
00546         for ( uint j=0;j<m_data->m_nClass*m_data->m_nDomain;j++ )
00547         {
00548             REAL target = m_data->m_testTargetOrig[i * ( uint ) m_data->m_nClass * ( uint ) m_data->m_nDomain + j];
00549             REAL prediction = ensembleOutput[j];
00550             rmse += ( prediction - target ) * ( prediction - target );
00551             mean += prediction;
            // predictions are stored as single-precision regardless of REAL
00552             float predictionSP = prediction;
00553             fOutput.write ( ( char* ) &predictionSP, sizeof ( float ) );
00554         }
00555 
00556     }
00557 
00558     delete[] m_outputVectorTmp;
00559     delete[] m_labelsTmp;
00560 
00561     // print classification error
00562     if ( Framework::getDatasetType() )
00563     {
00564         int nWrong = 0;
00565         for ( int d=0;d<m_data->m_nDomain;d++ )
00566         {
00567             nWrong += wrongLabelCnt[d];
00568             if ( m_data->m_nDomain > 1 )
00569                 cout<<"["<< ( double ) wrongLabelCnt[d]/ ( double ) m_data->m_nTest<<"] ";
00570         }
00571         m_predictionClassificationError = 100.0* ( double ) nWrong/ ( ( double ) m_data->m_nTest* ( double ) m_data->m_nDomain );
00572         cout<<endl<<"Classification test error: "<<m_predictionClassificationError<<"%"<<endl;
00573     }
00574 
00575     // print RMSE
00576     m_predictionRMSE = sqrt ( rmse/ ( double ) ( ( uint ) m_data->m_nClass * ( uint ) m_data->m_nDomain * m_data->m_nTest ) );
00577     cout<<"RMSE test: "<<m_predictionRMSE<<endl;
00578 
00579     // print info
00580     cout<<endl<<"Predictions are written to binary output file: "<<fname<<" ("<< ( uint ) ( ( uint ) m_data->m_nTest* ( uint ) m_data->m_nClass*m_data->m_nDomain*sizeof ( float ) );
00581     cout<<" Bytes, REAL="<< ( int ) sizeof ( float ) <<"Bytes, #elements:"<< ( uint ) m_data->m_nTest* ( uint ) m_data->m_nClass*m_data->m_nDomain<<") ";
00582     cout<<"[mean:"<<mean/ ( double ) ( ( uint ) m_data->m_nTest* ( uint ) m_data->m_nClass* ( uint ) m_data->m_nDomain ) <<"] )"<<endl;
00583     cout<<"Prediction time: "<<time ( 0 )-t0<<"[s]"<<endl<<endl;
00584 
00585     fOutput.close();
00586 
    // NOTE(review): autoEnc and autoencoderOutput are never deleted — leak.
00587     if ( ensembleOutput )
00588         delete[] ensembleOutput;
00589     ensembleOutput = 0;
00590 
00591     endPredictionMode();
00592 }

void Scheduler::preparePredictionMode (  )  [private]

Prepare the trained ensemble to predict unknown input features

Definition at line 1266 of file Scheduler.cpp.

01267 {
    // Bring the trained ensemble into a ready-to-predict state:
    // load the testset and normalizations, switch every algorithm into
    // prediction mode, load the blender weights, and allocate the
    // per-sample work buffers used by getEnsemblePrediction().
01268     cout<<"Start scheduled prediction"<<endl;
01269 
01270     // fix random seed
01271     srand ( m_data->m_randSeed );
01272 
01273     // load test set
01274     m_data->readDataset ( m_data->m_datasetName );
    // re-seed so the RNG state is identical to the one used at train time
01275     srand ( m_data->m_randSeed );
01276 
    // ValidationSet mode has no cross-validation folds
01277     if(m_data->m_validationType=="ValidationSet")
01278         m_data->m_nCross = 0;
01279     
01280     // number of algorithms in the ensemble
01281     int N = m_algorithmList.size();
01282 
01283     // load normalization (mean, std)
    // cascade learning extends inputs per algorithm, so the normalization
    // of the last stage (N-1) is the one matching the final feature set
01284     if ( m_data->m_enableCascadeLearning )
01285         m_data->loadNormalization ( N-1 );
01286     else
01287         m_data->loadNormalization();
01288 
01289     // go to prediction mode in all template files
01290     m_algorithmIDList.clear();
01291     for ( int i=0;i<N;i++ )
01292     {
01293         string fAlgoTemplateName = m_data->m_datasetPath + "/" + m_algorithmList[i];
01294         setPredictionModeInAlgorithm ( fAlgoTemplateName );
01295     }
01296 
01297     // new NN blender
01298     if ( m_data->m_enablePostNNBlending )
01299     {
01300         m_blenderNN = new BlendingNN();
01301         m_blenderNN->setDataPointers ( m_data );
01302         m_blenderNN->readDscFile ( m_data->m_datasetPath + "/NeuralNetwork.dscblend" );
01303         m_blenderNN->readSpecificMaps();
01304         m_blenderNN->loadWeights();
01305     }
01306 
01307     // load blending weights
01308     m_blender = new BlendStopping ( ( Algorithm* ) m_data );
    // N+1: one weight set per algorithm plus one extra (constant input)
01309     m_blender->loadBlendingWeights ( m_data->m_datasetPath + "/" + m_data->m_tempPath, N+1 );
01310 
01311     for ( int i=0;i<N;i++ )
01312         cout<<"ALGO FROM MASTER DSC-FILE:"<<m_algorithmObjectList[i]->m_stringMap["fullPrediction"]<<endl;
01313 
01314     m_blender->printWeights();
01315 
01316     int nClass = m_data->m_nClass;
01317     int nDomain = m_data->m_nDomain;
01318 
01319     // precompute IDs from effect files per prediction
01320     m_effectID = new int[N];
01321     for ( int i=0;i<N;i++ )
01322         m_effectID[i] = getIDFromFullPredictor ( m_algorithmObjectList[i]->m_trainOnFullPredictorFile );
01323 
01324     // used in prediction mode
01325     // per sample: store prediction for every model
    // m_noEffect: all-zero effect vector used when an algorithm has no
    // effect file; m_outputs/m_effects: one row per algorithm plus an
    // extra row (index N) — initialized to 1.0, which appears to act as
    // the constant input of the blender (see init comment below).
01326     m_noEffect = new REAL[nClass*nDomain];
01327     for ( int i=0;i<nClass*nDomain;i++ )
01328         m_noEffect[i] = 0.0;
01329     m_outputs = new REAL*[N+1];
01330     m_effects = new REAL*[N+1];
01331     for ( int i=0;i<N+1;i++ )
01332     {
01333         m_outputs[i] = new REAL[nClass*nDomain];
01334         m_effects[i] = new REAL[nClass*nDomain];
01335         for ( int j=0;j<nClass*nDomain;j++ )
01336         {
01337             m_outputs[i][j] = 1.0;  // init with constant 1.0 (needed in blend)
01338             m_effects[i][j] = 0.0;
01339         }
01340     }
01341 
01342     // tmp variable for predict labels
01343     m_labelsPredict = new int[m_data->m_nDomain];
01344 
01345 }

void Scheduler::readMasterDscFile ( string  path,
string  masterName 
)

Read the master description file. The master file sets up initial training settings, the dataset name, and the training order.

Parameters:
path Directory containing the master-dsc file (string)
masterName Name of the master-dsc file (string)

Definition at line 53 of file Scheduler.cpp.

00054 {
    // Parse the Master.dsc file: every non-comment line before the
    // [ALGORITHMS] marker is a key=value training setting written into
    // m_data / Framework; every line after the marker names one
    // algorithm dsc file, collected in m_algorithmList in train order.
00055     m_data->m_datasetPath = path;
00056     cout<<"Open master .dsc file:"<<(path + "/" + masterName)<<endl;
00057     fstream fMaster ( (path + "/" + masterName).c_str(), ios::in );
00058 
00059     // check if the file exists
00060     if ( fMaster.is_open() == 0 )
00061     {
00062         cout<<"Error: no Master.dsc file found in "<<path<<endl;
        // NOTE(review): error path exits with status 0 (success) —
        // confirm whether callers/scripts rely on this.
00063         exit ( 0 );
00064     }
00065 
00066     // read all lines
    // NOTE(review): getline with a 1024-byte buffer silently truncates
    // longer lines.
00067     char buf[1024];
00068     bool readAlgorithmList = false;
00069     while ( fMaster.getline ( buf, 1024 ) ) // read all lines
00070     {
00071         // the line
00072         string line = string ( buf );
00073 
00074         // an empty line or comments
00075         if ( line=="" || line[0]=='#' )
00076             continue;
00077 
00078         // read the algorithm dsc files
00079         if ( readAlgorithmList )
00080         {
00081             m_algorithmList.push_back ( line );
00082             continue;
00083         }
00084 
00085         // list of algorithm dsc files begins
00086         if ( line=="[ALGORITHMS]" )
00087         {
00088             readAlgorithmList = true;
00089             continue;
00090         }
00091 
00092         // split into 2 strings at the '=' char
        // NOTE(review): if a line has no '=', pos is npos and both name
        // and value become the whole line; such lines match no key below.
00093         int pos = line.find ( "=" );
00094         string name = line.substr ( 0, pos );
00095         string value = line.substr ( pos+1 );
00096 
00097         // read the meta training values
00098         if ( name=="dataset" )
00099             m_data->m_datasetName = value;
00100         if ( name=="isClassificationDataset" )
00101             Framework::setDatasetType ( atoi ( value.c_str() ) );
00102         if ( name=="maxThreads" )
00103         {
00104             cout<<"Set max. threads in MKL and IPP: "<<atoi ( value.c_str() ) <<endl;
00105             mkl_set_num_threads ( atoi ( value.c_str() ) );
00106             ippSetNumThreads ( atoi ( value.c_str() ) );
00107         }
00108         if ( name=="maxThreadsInCross" )
00109         {
00110             Framework::setMaxThreads ( atoi ( value.c_str() ) );  // store max. number of threads
00111             m_data->m_maxThreadsInCross = atoi ( value.c_str() );  // #threads in cross-fold-validation
00112         }
00113         if ( name=="nCrossValidation" )
00114         {
00115             m_data->m_nCross = atoi ( value.c_str() );
00116             cout<<"Train "<<m_data->m_nCross<<"-fold cross validation"<<endl;
00117         }
00118         if ( name=="validationType" )
00119         {
00120             assert ( value=="Retraining" || value=="CrossFoldMean" || value=="Bagging" || value=="ValidationSet");
00121             m_data->m_validationType = value;
00122             cout<<"ValidationType: "<<value<<endl;
00123         }
00124         if ( name=="positiveTarget" )
00125             m_data->m_positiveTarget = atof ( value.c_str() );
00126         if ( name=="negativeTarget" )
00127             m_data->m_negativeTarget = atof ( value.c_str() );
00128         if ( name=="standardDeviationMin" )
00129             m_data->m_standardDeviationMin = atof ( value.c_str() );
00130         if ( name=="randomSeed" )
00131         {
            // special value "time(0)" requests a time-based seed
00132             if ( value=="time(0)" )
00133                 m_data->m_randSeed = time ( 0 );
00134             else
00135                 m_data->m_randSeed = atoi ( value.c_str() );
00136             cout<<"Set random seed to: "<<m_data->m_randSeed<<endl;
00137             setRandomSeed ( m_data->m_randSeed );
00138         }
00139         if ( name=="nMixDataset" )
00140             m_data->m_nMixDataset = atoi ( value.c_str() );
00141         if ( name=="nMixTrainList" )
00142             m_data->m_nMixTrainList = atoi ( value.c_str() );
00143         if ( name=="blendingRegularization" )
00144             m_data->m_blendingRegularization = atof ( value.c_str() );
00145         if ( name=="blendingAlgorithm" )
00146             m_data->m_blendingAlgorithm = value;
00147         if ( name=="blendingEnableCrossValidation" )
00148             m_data->m_blendingEnableCrossValidation = atoi ( value.c_str() );
00149         if ( name=="enablePostNNBlending" )
00150             m_data->m_enablePostNNBlending = atoi ( value.c_str() );
00151         if ( name=="enableCascadeLearning" )
00152             m_data->m_enableCascadeLearning = atoi ( value.c_str() );
00153         if ( name=="enableGlobalMeanStdEstimate" )
00154             m_data->m_enableGlobalMeanStdEstimate = atoi ( value.c_str() );
00155         if ( name=="enableSaveMemory" )
00156             m_data->m_enableSaveMemory = atoi ( value.c_str() );
00157         if ( name=="errorFunction" )
00158             m_data->m_errorFunction = value;
00159         if ( name=="enablePostBlendClipping" )
00160             m_data->m_enablePostBlendClipping = atoi ( value.c_str() );
00161         if ( name=="addOutputNoise" )
00162             m_data->m_addOutputNoise = atof ( value.c_str() );
00163         if ( name=="enableFeatureSelection" )
00164             m_data->m_enableFeatureSelection = atoi ( value.c_str() );
00165         if ( name=="featureSelectionWriteBinaryDataset" )
00166             m_data->m_featureSelectionWriteBinaryDataset = atoi ( value.c_str() );
00167         if ( name=="enableGlobalBlendingWeights" )
00168             m_data->m_enableGlobalBlendingWeights = atoi ( value.c_str() );
00169         if ( name=="disableWriteDscFile" )
00170         {
00171             m_data->m_disableWriteDscFile = atoi ( value.c_str() );
00172             if ( m_data->m_disableWriteDscFile )
00173                 cout.disableFileOutputs();
00174         }
00175         if ( name=="enableStaticNormalization" )
00176             m_data->m_enableStaticNormalization = atoi ( value.c_str() );
00177         if ( name=="staticMeanNormalization" )
00178             m_data->m_staticMeanNormalization = atof ( value.c_str() );
00179         if ( name=="staticStdNormalization" )
00180             m_data->m_staticStdNormalization = atof ( value.c_str() );
00181         if ( name=="enableProbablisticNormalization" )
00182             m_data->m_enableProbablisticNormalization = atoi ( value.c_str() );
00183         if ( name=="dimensionalityReduction" )
00184             m_data->m_dimensionalityReduction = value;
00185         if ( name=="subsampleTrainSet" )
00186             m_data->m_subsampleTrainSet = atof ( value.c_str() );
00187         if ( name=="subsampleFeatures" )
00188             m_data->m_subsampleFeatures = atof ( value.c_str() );
00189         if ( name=="globalTrainingLoops" )
00190             m_data->m_globalTrainingLoops = atoi ( value.c_str() );
00191         if ( name=="addConstantInput" )
00192             m_data->m_addConstantInput = atoi ( value.c_str() );
00193     }
00194 
00195     fMaster.close();
00196 }

void Scheduler::setPredictionModeInAlgorithm ( string  fname  )  [private]

Used in Prediction mode

Set the particular algorithm in the prediction mode

Parameters:
fname The name of the dsc-file of the Algorithm

Definition at line 1407 of file Scheduler.cpp.

01408 {
    // Instantiate the algorithm described by the given dsc file and
    // switch it into prediction mode. Retraining/ValidationSet use a
    // single model; CrossFoldMean/Bagging need one model per fold.
    // Also registers the algorithm's ID and name, rejecting duplicates.
01409     cout<<"Prediction mode in algorithm:"<<fname<<endl;
01410 
01411     // check the dsc file
01412     string algoName, id;
01413     checkAlgorithmTemplate ( fname, algoName, id );
01414 
01415     // read dsc file
01416     m_data->readDscFile ( fname );
01417 
01418     // make an instance of the algorithm and give him the data
01419     if ( m_data->m_validationType=="Retraining" || m_data->m_validationType=="ValidationSet")
01420     {
01421         Algorithm* algo = 0;
01422         algorithmDispatcher ( algo, algoName );
01423         algo->setDataPointers ( m_data );
01424         algo->setPredictionMode ( m_data->m_nCross );
01425 
01426         // add the algorithm to internal object list of algorithms
01427         m_algorithmObjectList.push_back ( algo );
01428     }
01429     else if ( m_data->m_validationType=="CrossFoldMean" || m_data->m_validationType=="Bagging" )
01430     {
01431         cout<<"Make "<<m_data->m_nCross<<" models ready to predict"<<endl;
01432         Algorithm** algoList = new Algorithm*[m_data->m_nCross];
01433         for ( int i=0;i<m_data->m_nCross;i++ )
01434         {
01435             Algorithm* algo = 0;
01436             algorithmDispatcher ( algo, algoName );
01437             algo->setDataPointers ( m_data );
01438             algo->setPredictionMode ( i );
01439             algoList[i] = algo;
01440         }
01441         m_algorithmObjectListList.push_back ( algoList );
        // first fold model is also registered in the flat list so that
        // per-algorithm metadata (ID, fullPrediction) can be read uniformly
01442         m_algorithmObjectList.push_back ( algoList[0] );
01443     }
01444     else
01445         assert(false);
01446 
01447     // check, if id already exist
01448     for ( int i=0;i<m_algorithmIDList.size();i++ )
01449         if ( m_algorithmIDList[i] == atoi ( id.c_str() ) )
01450         {
01451             cout<<"ID:"<<id<<" in "<<algoName<<" already exists"<<endl;
01452             assert ( false );
01453         }
01454 
01455     m_algorithmIDList.push_back ( atoi ( id.c_str() ) );
01456 
01457     m_algorithmNameList.push_back ( algoName );
01458 
01459     cout<<endl;
01460 }

void Scheduler::train (  ) 

Train the stack of Algorithms (each described in its own dsc file)

Definition at line 202 of file Scheduler.cpp.

00203 {
    // Train the whole ensemble: optionally pre-train an autoencoder for
    // dimensionality reduction, then train every algorithm dsc file in
    // order (repeated m_globalTrainingLoops times to re-tune each member
    // against the rest), and finally train the optional NN blender.
00204     Framework::setFrameworkMode ( 0 );
00205 
00206     cout<<"Start scheduled training"<<endl;
00207 
00208     // fill the data object with the dataset
00209     cout<<"Fill data"<<endl;
00210 
00211     // autoencoder file objects
    // these streams are opened only to test whether a previously trained
    // autoencoder already exists on disk (existence check below)
00212     fstream fA0 ( ( m_data->m_datasetPath + "/" + m_data->m_tempPath + "/AutoencoderDataMean.dat" ).c_str(), ios::in );
00213     fstream fA1 ( ( m_data->m_datasetPath + "/" + m_data->m_tempPath + "/AutoencoderDataStd.dat" ).c_str(), ios::in );
00214     fstream fA2 ( ( m_data->m_datasetPath + "/" + m_data->m_tempPath + "/AutoencoderDataTest.dat" ).c_str(), ios::in );
00215     fstream fA3 ( ( m_data->m_datasetPath + "/" + m_data->m_tempPath + "/AutoencoderDataTestTarget.dat" ).c_str(), ios::in );
00216     fstream fA4 ( ( m_data->m_datasetPath + "/" + m_data->m_tempPath + "/AutoencoderDataTrain.dat" ).c_str(), ios::in );
00217     fstream fA5 ( ( m_data->m_datasetPath + "/" + m_data->m_tempPath + "/AutoencoderDataTrainTarget.dat" ).c_str(), ios::in );
00218     fstream fA6 ( ( m_data->m_datasetPath + "/" + m_data->m_tempPath + "/AutoencoderWeights.dat" ).c_str(), ios::in );
00219 
00220     bool autoencoderFilesOK = fA0.is_open() && fA1.is_open() && fA2.is_open() && fA3.is_open() && fA4.is_open() && fA5.is_open() && fA6.is_open();
00221 
00222     // perform: reduce the dimensionalty of data
    // train the autoencoder only if its output files are not all present
00223     if ( m_data->m_dimensionalityReduction == "Autoencoder" && autoencoderFilesOK == false )
00224     {
00225         cout<<"Autoencoder: start training"<<endl;
00226         
00227         // fix random seed
00228         srand ( m_data->m_randSeed );
00229 
00230         // read dataset
00231         m_data->readDataset ( m_data->m_datasetName );
00232         m_data->mergeTrainAndTest();
00233         m_data->mixDataset();
00234 
00235         // prepare cross-fold validation
00236         m_data->allocMemForCrossValidationSets();
00237         m_data->normalizeZeroOne();
00238 
00239         // train algorithm
00240         trainAlgorithm ( m_data->m_datasetPath + "/Autoencoder.dsc", m_data->m_datasetPath + "/" + string ( DSC_PATH ) + "/Autoencoder.dsc" );
00241 
00242         // clear mem
00243         m_data->deleteMemory();
00244     }
00245 
00246     // optimize sequentially the whole ensemble
00247     // 1st run: build initial ensemble
00248     // 2nd...endRun: optimize each algorithm's metaparameters
00249     time_t totalTime = time(0);
00250     cout<<"globalTrainingLoops:"<<m_data->m_globalTrainingLoops<<endl;
00251     for ( int globalLoop=0;globalLoop<m_data->m_globalTrainingLoops;globalLoop++ )
00252     {
00253         // train all template files
00254         for ( int i=0;i<m_algorithmList.size();i++ )
00255         {
00256             //m_data->m_randSeed+=i;
00257 
00258             // fix random seed
00259             srand ( m_data->m_randSeed );
00260 
00261             // read dataset
            // with autoencoder reduction the dataset is read through the
            // autoencoder (compressed features); otherwise read directly
00262             if ( m_data->m_dimensionalityReduction == "Autoencoder" )
00263             {
00264                 Autoencoder a;
00265                 a.setDataPointers ( m_data );
00266 
00267                 // fix random seed
00268                 srand ( m_data->m_randSeed );
00269 
00270                 a.readDataset ( m_data, m_data->m_datasetName );
00271             }
00272             else
00273                 m_data->readDataset ( m_data->m_datasetName );
00274 
00275             // bagging: modify the trainset in retraining
00276             m_data->enableBagging ( m_baggingRun );
00277             m_data->baggingRandomSeed ( m_randSeedBagBoost );
00278 
00279             // copy train data for later evaluation
            // boosting keeps a pristine copy of the trainset and, from the
            // second epoch on, resamples it with bootstrap probabilities
00280             if ( m_boostingRun )
00281             {
00282                 if ( m_probs == 0 )
00283                 {
00284                     cout<<"Init bootstrap probabilities to 1/N"<<endl;
00285                     m_probs = new REAL[m_data->m_nTrain];
00286                     for ( int j=0;j<m_data->m_nTrain;j++ )
00287                         m_probs[j] = 1.0 / ( ( REAL ) m_data->m_nTrain );
00288                 }
00289                 if ( m_boostingTrain==0 )
00290                 {
00291                     cout<<"Copy train set (features + targets) to boosting trainset"<<endl;
00292                     m_boostingNTrain = m_data->m_nTrain;
00293                     m_boostingTrain = new REAL[m_data->m_nTrain*m_data->m_nFeatures];
00294                     m_boostingTargets = new REAL[m_data->m_nTrain*m_data->m_nClass*m_data->m_nDomain];
00295                     memcpy ( m_boostingTrain, m_data->m_trainOrig, sizeof ( REAL ) *m_data->m_nTrain*m_data->m_nFeatures );
00296                     memcpy ( m_boostingTargets, m_data->m_trainTargetOrig, sizeof ( REAL ) *m_data->m_nTrain*m_data->m_nClass*m_data->m_nDomain );
00297                 }
00298 
00299                 if ( m_boostingEpoch > 0 )
00300                     m_data->doBootstrapSampling ( m_probs,m_data->m_trainOrig,m_data->m_trainTargetOrig,m_data->m_trainTargetOrigEffect,m_data->m_trainTargetOrigResidual,m_data->m_trainLabelOrig );
00301             }
00302 
00303             srand ( m_data->m_randSeed );
00304 
00305             // set the list of already trained predictors
            // first loop: predecessors only; later loops: all except the
            // algorithm currently being re-optimized
00306             if ( globalLoop == 0 )
00307                 m_data->setAlgorithmList ( vector<string> ( m_algorithmList.begin(), m_algorithmList.begin() +i ) );
00308             else
00309             {
00310                 vector<string> tmp;
00311                 for ( int j=0;j<m_algorithmList.size();j++ )
00312                     if ( j != i )
00313                         tmp.push_back ( m_algorithmList[j] );
00314                 m_data->setAlgorithmList ( tmp );
00315             }
00316 
00317             time_t beginTime = time ( 0 );
00318 
00319             // extend input features with previous predictions
00320             if ( m_data->m_enableCascadeLearning )
00321             {
00322                 if(m_data->m_validationType=="ValidationSet")
00323                     assert(false);
00324                 m_data->fillCascadeLearningInputs();
00325                 m_data->extendTrainDataWithCascadeInputs();
00326             }
00327 
00328             m_data->allocMemForCrossValidationSets();
00329 
00330             // algorithm dsc file (template)
00331             string fAlgoTemplateName = m_data->m_datasetPath + "/" + m_algorithmList[i];
00332             string fAlgoName = m_data->m_datasetPath + "/" + string ( DSC_PATH ) + "/" + m_algorithmList[i];
            // the commented block below is dead code: the template copy
            // and cout redirection now live inside trainAlgorithm()
00333             /*fstream fAlgoTemplate(fAlgoTemplateName.c_str(), ios::in);
00334 
00335             // open the dsc file
00336             fstream fAlgo(fAlgoName.c_str(), ios::out);
00337 
00338             cout<<"AlgoTemplate:"<<fAlgoTemplateName<<"  Algo:"<<fAlgoName<<endl;
00339 
00340             // copy the content from the template to the dsc file
00341             char buf[1024];
00342             while(fAlgoTemplate.getline(buf, 1024))  // read all lines
00343             {
00344                 string line = string(buf);
00345                 fAlgo<<line<<endl;
00346             }
00347 
00348             fAlgoTemplate.close();
00349             fAlgo.close();
00350 
00351             // redirect cout to filename
00352             cout.setOutputFile(fAlgoName);
00353 
00354             cout<<"Floating point precision: "<<(int)sizeof(REAL)<<" Bytes"<<endl;
00355             */
00356             // =========================== train the algorithm ===========================
00357 
00358             if ( globalLoop > 0 )
00359                 m_data->m_loadWeightsBeforeTraining = true;
00360 
00361             trainAlgorithm ( fAlgoTemplateName, fAlgoName );
00362             cout<<"Finished in "<<time ( 0 )-beginTime<<"[s]"<<endl;
00363 
00364             // clear file redirection of cout<<
00365             cout.setOutputFile ( "" );
00366 
00367             // clear mem
00368             m_data->deleteMemory();
00369         }
00370     }
00371 
00372     cout<<"Total training time:"<<time(0)-totalTime<<"[s]"<<endl;
00373     
    // optional second-level blender: a neural network trained on the
    // full-predictions of all ensemble members
00374     if ( m_data->m_enablePostNNBlending )
00375     {
00376         // set the list of already trained predictors
00377         m_data->setAlgorithmList ( vector<string> ( m_algorithmList.begin(), m_algorithmList.end() ) );
00378 
00379         // fix random seed
00380         srand ( m_data->m_randSeed );
00381 
00382         // read dataset
00383         m_data->readDataset ( m_data->m_datasetName );
00384         srand ( m_data->m_randSeed );
00385         m_data->allocMemForCrossValidationSets();
00386         m_data->partitionDatasetToCrossValidationSets();
00387 
00388         m_data->readDscFile ( m_data->m_datasetPath + "/NeuralNetwork.dscblend" );
00389 
00390         BlendingNN nn;
00391         nn.setDataPointers ( m_data );
00392         nn.readSpecificMaps();
00393         nn.init();
00394         nn.train();
00395 
00396         // clear mem
00397         m_data->deleteMemory();
00398     }
00399 }

void Scheduler::trainAlgorithm ( string  fnameTemplate,
string  fnameDsc 
) [private]

Starts the training of one particular algorithm.

Parameters:
fnameTemplate Template dsc file describing the algorithm (input)
fnameDsc Dsc file written for this training run (output; cout is redirected to it during training)

Definition at line 1085 of file Scheduler.cpp.

01086 {
01087     cout<<"Train algorithm:"<<fnameTemplate<<endl;
01088 
01089     string algoName, id;
01090     checkAlgorithmTemplate ( fnameTemplate, algoName, id );
01091 
01092     // read dsc file
01093     m_data->readDscFile ( fnameTemplate );
01094     if ( m_data->m_disableTraining )
01095     {
01096         cout<<"Training disabled."<<endl;
01097         return;
01098     }
01099 
01100     // copy the content of the template to the dsc file
01101     fstream fAlgoTemplate ( fnameTemplate.c_str(), ios::in );
01102     fstream fAlgo ( fnameDsc.c_str(), ios::out );
01103     cout<<"AlgoTemplate:"<<fnameTemplate<<"  Algo:"<<fnameDsc<<endl;
01104     char buf[1024];
01105     while ( fAlgoTemplate.getline ( buf, 1024 ) ) // read all lines
01106     {
01107         string line = string ( buf );
01108         fAlgo<<line<<endl;
01109     }
01110     fAlgoTemplate.close();
01111     fAlgo.close();
01112 
01113     // redirect cout to filename
01114     cout.setOutputFile ( fnameDsc );
01115 
01116     cout<<"Floating point precision: "<< ( int ) sizeof ( REAL ) <<" Bytes"<<endl;
01117 
01118     m_data->partitionDatasetToCrossValidationSets();
01119 
01120     // start the algorithm
01121     Algorithm* algo = 0;
01122     algorithmDispatcher ( algo, algoName );
01123     algo->setDataPointers ( m_data );
01124 
01125     if ( m_data->m_enableFeatureSelection )
01126     {
01127         algo->doFeatureSelection();
01128         exit ( 0 );
01129     }
01130     else
01131         algo->train();
01132 
01133     if ( algo )
01134     {
01135         cout<<"delete algo"<<endl;
01136         delete algo;
01137     }
01138     algo = 0;
01139     cout<<"Finished train algorithm:"<<fnameTemplate<<endl;
01140 
01141 }


The documentation for this class was generated from the following files:

Generated on Tue Jan 26 09:21:16 2010 for ELF by  doxygen 1.5.8