// Scheduler.cpp

00001 #include "Scheduler.h"
00002 
00003 extern StreamOutput cout;
00004 
00008 Scheduler::Scheduler()
00009 {
00010     cout<<"Scheduler"<<endl;
00011     // init member vars
00012     m_data = 0;
00013     m_blender = 0;
00014     m_blenderNN = 0;
00015     m_labelsPredict = 0;
00016     m_effectID = 0;
00017     m_noEffect = 0;
00018     m_outputs = 0;
00019     m_effects = 0;
00020     m_predictionRMSE = 0;
00021     m_predictionClassificationError = 0;
00022 
00023     m_data = new Data();
00024     m_data->setPathes ( TMP_PATH, DSC_PATH, FULL_PREDICTOR_PATH, DATA_PATH );
00025 
00026     m_baggingRun = 0;
00027     m_boostingRun = 0;
00028     m_randSeedBagBoost = 0;
00029     m_probs = 0;
00030     m_boostingTrain = 0;
00031     m_boostingTargets = 0;
00032     m_boostingNTrain = 0;
00033 
00034 }
00035 
00039 Scheduler::~Scheduler()
00040 {
00041     cout<<"descructor Scheduler"<<endl;
00042     if ( m_data )
00043         delete m_data;
00044     m_data = 0;
00045 }
00046 
00053 void Scheduler::readMasterDscFile ( string path, string masterName )
00054 {
00055     m_data->m_datasetPath = path;
00056     cout<<"Open master .dsc file:"<<(path + "/" + masterName)<<endl;
00057     fstream fMaster ( (path + "/" + masterName).c_str(), ios::in );
00058 
00059     // check is file exists
00060     if ( fMaster.is_open() == 0 )
00061     {
00062         cout<<"Error: no Master.dsc file found in "<<path<<endl;
00063         exit ( 0 );
00064     }
00065 
00066     // read all lines
00067     char buf[1024];
00068     bool readAlgorithmList = false;
00069     while ( fMaster.getline ( buf, 1024 ) ) // read all lines
00070     {
00071         // the line
00072         string line = string ( buf );
00073 
00074         // an empty line or comments
00075         if ( line=="" || line[0]=='#' )
00076             continue;
00077 
00078         // read the algorithm dsc files
00079         if ( readAlgorithmList )
00080         {
00081             m_algorithmList.push_back ( line );
00082             continue;
00083         }
00084 
00085         // list of algorithm dsc files begins
00086         if ( line=="[ALGORITHMS]" )
00087         {
00088             readAlgorithmList = true;
00089             continue;
00090         }
00091 
00092         // split into 2 strings at the '=' char
00093         int pos = line.find ( "=" );
00094         string name = line.substr ( 0, pos );
00095         string value = line.substr ( pos+1 );
00096 
00097         // read the meta training values
00098         if ( name=="dataset" )
00099             m_data->m_datasetName = value;
00100         if ( name=="isClassificationDataset" )
00101             Framework::setDatasetType ( atoi ( value.c_str() ) );
00102         if ( name=="maxThreads" )
00103         {
00104             cout<<"Set max. threads in MKL and IPP: "<<atoi ( value.c_str() ) <<endl;
00105             mkl_set_num_threads ( atoi ( value.c_str() ) );
00106             ippSetNumThreads ( atoi ( value.c_str() ) );
00107         }
00108         if ( name=="maxThreadsInCross" )
00109         {
00110             Framework::setMaxThreads ( atoi ( value.c_str() ) );  // store max. number of threads
00111             m_data->m_maxThreadsInCross = atoi ( value.c_str() );  // #threads in cross-fold-validation
00112         }
00113         if ( name=="nCrossValidation" )
00114         {
00115             m_data->m_nCross = atoi ( value.c_str() );
00116             cout<<"Train "<<m_data->m_nCross<<"-fold cross validation"<<endl;
00117         }
00118         if ( name=="validationType" )
00119         {
00120             assert ( value=="Retraining" || value=="CrossFoldMean" || value=="Bagging" || value=="ValidationSet");
00121             m_data->m_validationType = value;
00122             cout<<"ValidationType: "<<value<<endl;
00123         }
00124         if ( name=="positiveTarget" )
00125             m_data->m_positiveTarget = atof ( value.c_str() );
00126         if ( name=="negativeTarget" )
00127             m_data->m_negativeTarget = atof ( value.c_str() );
00128         if ( name=="standardDeviationMin" )
00129             m_data->m_standardDeviationMin = atof ( value.c_str() );
00130         if ( name=="randomSeed" )
00131         {
00132             if ( value=="time(0)" )
00133                 m_data->m_randSeed = time ( 0 );
00134             else
00135                 m_data->m_randSeed = atoi ( value.c_str() );
00136             cout<<"Set random seed to: "<<m_data->m_randSeed<<endl;
00137             setRandomSeed ( m_data->m_randSeed );
00138         }
00139         if ( name=="nMixDataset" )
00140             m_data->m_nMixDataset = atoi ( value.c_str() );
00141         if ( name=="nMixTrainList" )
00142             m_data->m_nMixTrainList = atoi ( value.c_str() );
00143         if ( name=="blendingRegularization" )
00144             m_data->m_blendingRegularization = atof ( value.c_str() );
00145         if ( name=="blendingAlgorithm" )
00146             m_data->m_blendingAlgorithm = value;
00147         if ( name=="blendingEnableCrossValidation" )
00148             m_data->m_blendingEnableCrossValidation = atoi ( value.c_str() );
00149         if ( name=="enablePostNNBlending" )
00150             m_data->m_enablePostNNBlending = atoi ( value.c_str() );
00151         if ( name=="enableCascadeLearning" )
00152             m_data->m_enableCascadeLearning = atoi ( value.c_str() );
00153         if ( name=="enableGlobalMeanStdEstimate" )
00154             m_data->m_enableGlobalMeanStdEstimate = atoi ( value.c_str() );
00155         if ( name=="enableSaveMemory" )
00156             m_data->m_enableSaveMemory = atoi ( value.c_str() );
00157         if ( name=="errorFunction" )
00158             m_data->m_errorFunction = value;
00159         if ( name=="enablePostBlendClipping" )
00160             m_data->m_enablePostBlendClipping = atoi ( value.c_str() );
00161         if ( name=="addOutputNoise" )
00162             m_data->m_addOutputNoise = atof ( value.c_str() );
00163         if ( name=="enableFeatureSelection" )
00164             m_data->m_enableFeatureSelection = atoi ( value.c_str() );
00165         if ( name=="featureSelectionWriteBinaryDataset" )
00166             m_data->m_featureSelectionWriteBinaryDataset = atoi ( value.c_str() );
00167         if ( name=="enableGlobalBlendingWeights" )
00168             m_data->m_enableGlobalBlendingWeights = atoi ( value.c_str() );
00169         if ( name=="disableWriteDscFile" )
00170         {
00171             m_data->m_disableWriteDscFile = atoi ( value.c_str() );
00172             if ( m_data->m_disableWriteDscFile )
00173                 cout.disableFileOutputs();
00174         }
00175         if ( name=="enableStaticNormalization" )
00176             m_data->m_enableStaticNormalization = atoi ( value.c_str() );
00177         if ( name=="staticMeanNormalization" )
00178             m_data->m_staticMeanNormalization = atof ( value.c_str() );
00179         if ( name=="staticStdNormalization" )
00180             m_data->m_staticStdNormalization = atof ( value.c_str() );
00181         if ( name=="enableProbablisticNormalization" )
00182             m_data->m_enableProbablisticNormalization = atoi ( value.c_str() );
00183         if ( name=="dimensionalityReduction" )
00184             m_data->m_dimensionalityReduction = value;
00185         if ( name=="subsampleTrainSet" )
00186             m_data->m_subsampleTrainSet = atof ( value.c_str() );
00187         if ( name=="subsampleFeatures" )
00188             m_data->m_subsampleFeatures = atof ( value.c_str() );
00189         if ( name=="globalTrainingLoops" )
00190             m_data->m_globalTrainingLoops = atoi ( value.c_str() );
00191         if ( name=="addConstantInput" )
00192             m_data->m_addConstantInput = atoi ( value.c_str() );
00193     }
00194 
00195     fMaster.close();
00196 }
00197 
/**
 * Train the whole ensemble as configured by the master dsc file.
 *
 * Flow:
 *  1. If dimensionalityReduction=="Autoencoder" and no cached autoencoder
 *     files exist in the temp path, train the autoencoder first.
 *  2. Run m_globalTrainingLoops passes over m_algorithmList:
 *     loop 0 builds the initial ensemble, later loops retrain each
 *     algorithm with the others fixed (meta-parameter refinement).
 *  3. Optionally train a post-blending neural network.
 *
 * Bagging/boosting hooks: m_baggingRun, m_randSeedBagBoost, m_boostingRun
 * and m_boostingEpoch are set by bagging()/boosting() before calling this.
 */
void Scheduler::train()
{
    Framework::setFrameworkMode ( 0 );

    cout<<"Start scheduled training"<<endl;

    // fill the data object with the dataset
    cout<<"Fill data"<<endl;

    // autoencoder file objects: opened only to test for existence
    fstream fA0 ( ( m_data->m_datasetPath + "/" + m_data->m_tempPath + "/AutoencoderDataMean.dat" ).c_str(), ios::in );
    fstream fA1 ( ( m_data->m_datasetPath + "/" + m_data->m_tempPath + "/AutoencoderDataStd.dat" ).c_str(), ios::in );
    fstream fA2 ( ( m_data->m_datasetPath + "/" + m_data->m_tempPath + "/AutoencoderDataTest.dat" ).c_str(), ios::in );
    fstream fA3 ( ( m_data->m_datasetPath + "/" + m_data->m_tempPath + "/AutoencoderDataTestTarget.dat" ).c_str(), ios::in );
    fstream fA4 ( ( m_data->m_datasetPath + "/" + m_data->m_tempPath + "/AutoencoderDataTrain.dat" ).c_str(), ios::in );
    fstream fA5 ( ( m_data->m_datasetPath + "/" + m_data->m_tempPath + "/AutoencoderDataTrainTarget.dat" ).c_str(), ios::in );
    fstream fA6 ( ( m_data->m_datasetPath + "/" + m_data->m_tempPath + "/AutoencoderWeights.dat" ).c_str(), ios::in );

    // all seven cache files must exist for the autoencoder to be reusable
    bool autoencoderFilesOK = fA0.is_open() && fA1.is_open() && fA2.is_open() && fA3.is_open() && fA4.is_open() && fA5.is_open() && fA6.is_open();

    // perform: reduce the dimensionalty of data
    // (train the autoencoder from scratch when its cache files are missing)
    if ( m_data->m_dimensionalityReduction == "Autoencoder" && autoencoderFilesOK == false )
    {
        cout<<"Autoencoder: start training"<<endl;
        
        // fix random seed
        srand ( m_data->m_randSeed );

        // read dataset; train+test are merged because the autoencoder is unsupervised
        m_data->readDataset ( m_data->m_datasetName );
        m_data->mergeTrainAndTest();
        m_data->mixDataset();

        // prepare cross-fold validation
        m_data->allocMemForCrossValidationSets();
        m_data->normalizeZeroOne();

        // train algorithm
        trainAlgorithm ( m_data->m_datasetPath + "/Autoencoder.dsc", m_data->m_datasetPath + "/" + string ( DSC_PATH ) + "/Autoencoder.dsc" );

        // clear mem
        m_data->deleteMemory();
    }

    // optimize sequentially the whole ensemble
    // 1st run: build initial ensemble
    // 2nd...endRun: optimize each algorithm's metaparameters
    time_t totalTime = time(0);
    cout<<"globalTrainingLoops:"<<m_data->m_globalTrainingLoops<<endl;
    for ( int globalLoop=0;globalLoop<m_data->m_globalTrainingLoops;globalLoop++ )
    {
        // train all template files
        for ( int i=0;i<m_algorithmList.size();i++ )
        {
            //m_data->m_randSeed+=i;

            // fix random seed
            srand ( m_data->m_randSeed );

            // read dataset (through the autoencoder when dimensionality reduction is active)
            if ( m_data->m_dimensionalityReduction == "Autoencoder" )
            {
                Autoencoder a;
                a.setDataPointers ( m_data );

                // fix random seed
                srand ( m_data->m_randSeed );

                a.readDataset ( m_data, m_data->m_datasetName );
            }
            else
                m_data->readDataset ( m_data->m_datasetName );

            // bagging: modify the trainset in retraining
            m_data->enableBagging ( m_baggingRun );
            m_data->baggingRandomSeed ( m_randSeedBagBoost );

            // copy train data for later evaluation
            // NOTE(review): m_probs/m_boostingTrain are allocated once on the
            // first boosting epoch and freed elsewhere — confirm ownership.
            if ( m_boostingRun )
            {
                if ( m_probs == 0 )
                {
                    cout<<"Init bootstrap probabilities to 1/N"<<endl;
                    m_probs = new REAL[m_data->m_nTrain];
                    for ( int j=0;j<m_data->m_nTrain;j++ )
                        m_probs[j] = 1.0 / ( ( REAL ) m_data->m_nTrain );
                }
                if ( m_boostingTrain==0 )
                {
                    cout<<"Copy train set (features + targets) to boosting trainset"<<endl;
                    m_boostingNTrain = m_data->m_nTrain;
                    m_boostingTrain = new REAL[m_data->m_nTrain*m_data->m_nFeatures];
                    m_boostingTargets = new REAL[m_data->m_nTrain*m_data->m_nClass*m_data->m_nDomain];
                    memcpy ( m_boostingTrain, m_data->m_trainOrig, sizeof ( REAL ) *m_data->m_nTrain*m_data->m_nFeatures );
                    memcpy ( m_boostingTargets, m_data->m_trainTargetOrig, sizeof ( REAL ) *m_data->m_nTrain*m_data->m_nClass*m_data->m_nDomain );
                }

                // epoch 0 trains on the unmodified set; later epochs resample
                if ( m_boostingEpoch > 0 )
                    m_data->doBootstrapSampling ( m_probs,m_data->m_trainOrig,m_data->m_trainTargetOrig,m_data->m_trainTargetOrigEffect,m_data->m_trainTargetOrigResidual,m_data->m_trainLabelOrig );
            }

            srand ( m_data->m_randSeed );

            // set the list of already trained predictors:
            // loop 0 sees only the predictors trained so far, later loops
            // see all predictors except the one being retrained
            if ( globalLoop == 0 )
                m_data->setAlgorithmList ( vector<string> ( m_algorithmList.begin(), m_algorithmList.begin() +i ) );
            else
            {
                vector<string> tmp;
                for ( int j=0;j<m_algorithmList.size();j++ )
                    if ( j != i )
                        tmp.push_back ( m_algorithmList[j] );
                m_data->setAlgorithmList ( tmp );
            }

            time_t beginTime = time ( 0 );

            // extend input features with previous predictions
            if ( m_data->m_enableCascadeLearning )
            {
                // cascade learning is not supported with a dedicated validation set
                if(m_data->m_validationType=="ValidationSet")
                    assert(false);
                m_data->fillCascadeLearningInputs();
                m_data->extendTrainDataWithCascadeInputs();
            }

            m_data->allocMemForCrossValidationSets();

            // algorithm dsc file (template)
            string fAlgoTemplateName = m_data->m_datasetPath + "/" + m_algorithmList[i];
            string fAlgoName = m_data->m_datasetPath + "/" + string ( DSC_PATH ) + "/" + m_algorithmList[i];
            /*fstream fAlgoTemplate(fAlgoTemplateName.c_str(), ios::in);

            // open the dsc file
            fstream fAlgo(fAlgoName.c_str(), ios::out);

            cout<<"AlgoTemplate:"<<fAlgoTemplateName<<"  Algo:"<<fAlgoName<<endl;

            // copy the content from the template to the dsc file
            char buf[1024];
            while(fAlgoTemplate.getline(buf, 1024))  // read all lines
            {
                string line = string(buf);
                fAlgo<<line<<endl;
            }

            fAlgoTemplate.close();
            fAlgo.close();

            // redirect cout to filename
            cout.setOutputFile(fAlgoName);

            cout<<"Floating point precision: "<<(int)sizeof(REAL)<<" Bytes"<<endl;
            */
            // =========================== train the algorithm ===========================

            // from the 2nd global loop on, continue from the stored weights
            if ( globalLoop > 0 )
                m_data->m_loadWeightsBeforeTraining = true;

            trainAlgorithm ( fAlgoTemplateName, fAlgoName );
            cout<<"Finished in "<<time ( 0 )-beginTime<<"[s]"<<endl;

            // clear file redirection of cout<<
            cout.setOutputFile ( "" );

            // clear mem
            m_data->deleteMemory();
        }
    }

    cout<<"Total training time:"<<time(0)-totalTime<<"[s]"<<endl;
    
    // optional: train a neural network that blends all predictors
    if ( m_data->m_enablePostNNBlending )
    {
        // set the list of already trained predictors
        m_data->setAlgorithmList ( vector<string> ( m_algorithmList.begin(), m_algorithmList.end() ) );

        // fix random seed
        srand ( m_data->m_randSeed );

        // read dataset
        m_data->readDataset ( m_data->m_datasetName );
        srand ( m_data->m_randSeed );
        m_data->allocMemForCrossValidationSets();
        m_data->partitionDatasetToCrossValidationSets();

        m_data->readDscFile ( m_data->m_datasetPath + "/NeuralNetwork.dscblend" );

        BlendingNN nn;
        nn.setDataPointers ( m_data );
        nn.readSpecificMaps();
        nn.init();
        nn.train();

        // clear mem
        m_data->deleteMemory();
    }
}
00400 
00405 void Scheduler::blend()
00406 {
00407     Framework::setFrameworkMode ( 0 );
00408 
00409     cout<<"Start blending after training"<<endl;
00410 
00411     // set the list of already trained predictors
00412     m_data->setAlgorithmList ( vector<string> ( m_algorithmList.begin(), m_algorithmList.end() ) );
00413 
00414     // fix random seed
00415     srand ( m_data->m_randSeed );
00416 
00417     // fill the data object with the dataset
00418     cout<<"Fill data"<<endl;
00419     m_data->readDataset ( m_data->m_datasetName );
00420     srand ( m_data->m_randSeed );
00421     m_data->allocMemForCrossValidationSets();
00422     m_data->partitionDatasetToCrossValidationSets();
00423 
00424     if ( m_data->m_enablePostNNBlending )
00425     {
00426         m_data->readDscFile ( m_data->m_datasetPath + "/NeuralNetwork.dscblend" );
00427 
00428         BlendingNN nn;
00429         nn.setDataPointers ( m_data );
00430         nn.readSpecificMaps();
00431         nn.init();
00432         nn.train();
00433     }
00434     else
00435     {
00436         BlendStopping bb ( ( Algorithm* ) m_data, "" );
00437         bb.setRegularization ( m_data->m_blendingRegularization );
00438         double rmse = bb.calcBlending();
00439         cout<<"BLEND RMSE OF ACTUAL FULLPREDICTION PATH:"<<rmse<<endl;
00440         bb.saveBlendingWeights ( m_data->m_datasetPath + "/" + m_data->m_tempPath );
00441     }
00442 }
00443 
00444 
00450 void Scheduler::predict()
00451 {
00452     Framework::setFrameworkMode ( 1 );
00453 
00454     preparePredictionMode();
00455 
00456     int progress = m_data->m_nTest / 100 + 1;
00457     double mean = 0.0, rmse = 0.0;
00458 
00459     // output file (binary)
00460     string fname;
00461     if ( m_data->m_datasetName=="NETFLIX" && Framework::getAdditionalStartupParameter() >= 0 )
00462     {
00463         cout<<"Dataset:NETFLIX, slot:"<<Framework::getAdditionalStartupParameter() <<" ";
00464         char buf[512];
00465         sprintf ( buf,"p%d",Framework::getAdditionalStartupParameter() );
00466         fname = string ( NETFLIX_SLOTDATA_ROOT_DIR ) + buf + "/testPrediction.data";
00467         cout<<"pName:"<<fname<<endl;
00468     }
00469     else if ( m_data->m_datasetName=="NETFLIX" && Framework::getAdditionalStartupParameter() < -100 )
00470     {
00471         char buf[512];
00472         sprintf ( buf,"ELFprediction%d",Framework::getRandomSeed() );
00473         string algos;
00474         for ( int i=0;i<m_algorithmList.size();i++ )
00475             algos += "_" + m_algorithmList[i];
00476         fname = m_data->m_datasetPath + "/" + m_data->m_tempPath + "/" + buf + algos + ".dat";
00477         cout<<"pName:"<<fname<<endl;
00478     }
00479     else
00480     {
00481         char nr[512];
00482         sprintf ( nr,"%d",rand() );
00483         fname = m_data->m_datasetPath + "/" + m_data->m_tempPath + "/testPrediction" + string ( nr ) + ".data";
00484         //fname = m_data->m_datasetPath + "/" + m_data->m_tempPath + "/testPrediction.data";
00485     }
00486 
00487     fstream fOutput ( fname.c_str(),ios::out );
00488 
00489     // the output vector of the ensemble
00490     REAL* ensembleOutput = new REAL[m_data->m_nClass*m_data->m_nDomain];
00491 
00492     int* wrongLabelCnt = new int[m_data->m_nDomain];
00493     for ( int i=0;i<m_data->m_nDomain;i++ )
00494         wrongLabelCnt[i] = 0;
00495 
00496     // store the real input dimension of data
00497     int nrFeat = m_data->m_nFeatures;
00498 
00499     m_outputVectorTmp = new REAL[m_data->m_nClass*m_data->m_nDomain];
00500     m_labelsTmp = new int[m_data->m_nClass*m_data->m_nDomain];
00501 
00502     // load the autoencoder net
00503     Autoencoder* autoEnc = 0;
00504     bool enableAutoencoder = false;
00505     REAL* autoencoderOutput = 0;
00506     if ( m_data->m_dimensionalityReduction == "Autoencoder" )
00507     {
00508         autoEnc = new Autoencoder();
00509         autoEnc->setDataPointers ( m_data );
00510         autoEnc->loadWeights();
00511         autoEnc->loadNormalizations();
00512         enableAutoencoder = true;
00513         autoencoderOutput = new REAL[autoEnc->m_nClass];
00514         m_data->m_nFeatures = autoEnc->m_nClass;  //  modify input dimension
00515     }
00516 
00517     cout<<endl<<"predict(100 dots): "<<flush;
00518     time_t t0 = time ( 0 );
00519 
00520     // go through the test set
00521     for ( uint i=0;i<m_data->m_nTest;i++ )
00522     {
00523         if ( i % progress == 0 )
00524             cout<<"."<<flush;
00525 
00526         // predict one example
00527         REAL* inputFeature = m_data->m_testOrig + i * ( uint ) nrFeat;
00528 
00529         if ( enableAutoencoder )
00530         {
00531             autoEnc->predictAllOutputs ( inputFeature, autoencoderOutput, 1, 0 );
00532             getEnsemblePrediction ( autoencoderOutput, ensembleOutput );
00533         }
00534         else
00535             getEnsemblePrediction ( inputFeature, ensembleOutput );
00536 
00537         // if the dataset has classification type, count the #wrong labeled
00538         if ( Framework::getDatasetType() )
00539         {
00540             for ( uint d=0;d<m_data->m_nDomain;d++ )
00541                 if ( getIndexOfMax ( ensembleOutput + d * m_data->m_nClass, m_data->m_nClass ) != m_data->m_testLabelOrig[d+i* ( uint ) m_data->m_nDomain] )
00542                     wrongLabelCnt[d]++;
00543         }
00544 
00545         // rmse calculation over all targets
00546         for ( uint j=0;j<m_data->m_nClass*m_data->m_nDomain;j++ )
00547         {
00548             REAL target = m_data->m_testTargetOrig[i * ( uint ) m_data->m_nClass * ( uint ) m_data->m_nDomain + j];
00549             REAL prediction = ensembleOutput[j];
00550             rmse += ( prediction - target ) * ( prediction - target );
00551             mean += prediction;
00552             float predictionSP = prediction;
00553             fOutput.write ( ( char* ) &predictionSP, sizeof ( float ) );
00554         }
00555 
00556     }
00557 
00558     delete[] m_outputVectorTmp;
00559     delete[] m_labelsTmp;
00560 
00561     // print classification error
00562     if ( Framework::getDatasetType() )
00563     {
00564         int nWrong = 0;
00565         for ( int d=0;d<m_data->m_nDomain;d++ )
00566         {
00567             nWrong += wrongLabelCnt[d];
00568             if ( m_data->m_nDomain > 1 )
00569                 cout<<"["<< ( double ) wrongLabelCnt[d]/ ( double ) m_data->m_nTest<<"] ";
00570         }
00571         m_predictionClassificationError = 100.0* ( double ) nWrong/ ( ( double ) m_data->m_nTest* ( double ) m_data->m_nDomain );
00572         cout<<endl<<"Classification test error: "<<m_predictionClassificationError<<"%"<<endl;
00573     }
00574 
00575     // print RMSE
00576     m_predictionRMSE = sqrt ( rmse/ ( double ) ( ( uint ) m_data->m_nClass * ( uint ) m_data->m_nDomain * m_data->m_nTest ) );
00577     cout<<"RMSE test: "<<m_predictionRMSE<<endl;
00578 
00579     // print info
00580     cout<<endl<<"Predictions are written to binary output file: "<<fname<<" ("<< ( uint ) ( ( uint ) m_data->m_nTest* ( uint ) m_data->m_nClass*m_data->m_nDomain*sizeof ( float ) );
00581     cout<<" Bytes, REAL="<< ( int ) sizeof ( float ) <<"Bytes, #elements:"<< ( uint ) m_data->m_nTest* ( uint ) m_data->m_nClass*m_data->m_nDomain<<") ";
00582     cout<<"[mean:"<<mean/ ( double ) ( ( uint ) m_data->m_nTest* ( uint ) m_data->m_nClass* ( uint ) m_data->m_nDomain ) <<"] )"<<endl;
00583     cout<<"Prediction time: "<<time ( 0 )-t0<<"[s]"<<endl<<endl;
00584 
00585     fOutput.close();
00586 
00587     if ( ensembleOutput )
00588         delete[] ensembleOutput;
00589     ensembleOutput = 0;
00590 
00591     endPredictionMode();
00592 }
00593 
00600 void Scheduler::bagging()
00601 {
00602     int epochs = Framework::getAdditionalStartupParameter();
00603     cout<<endl<<endl;
00604     cout<<"================================= Bagging ================================="<<endl;
00605     cout<<"epochs:"<<epochs<<endl<<endl<<endl;
00606     m_baggingRun = true;
00607 
00608     vector<string> baggingFileNames;
00609     uint testSize = 0;
00610     double rmseMean = 0.0, classErrMean = 0.0;
00611 
00612     for ( int e=0;e<epochs;e++ )
00613     {
00614         cout<<"e:"<<e<<endl;
00615 
00616         m_randSeedBagBoost = e + 1;
00617 
00618         // train and predict testset
00619         train();
00620         predict();
00621 
00622         rmseMean += getPredictionRMSE();
00623         classErrMean += getClassificationError();
00624 
00625         fstream fTest ( ( m_data->m_datasetPath+"/"+TMP_PATH+"/testPrediction.data" ).c_str(),ios::in );
00626         if ( fTest.is_open() ==false )
00627             assert ( false );
00628         char buf[512];
00629         sprintf ( buf,"%s.%d", ( m_data->m_datasetPath+"/"+TMP_PATH+"/testPrediction.data" ).c_str(),e );
00630         baggingFileNames.push_back ( buf );
00631         fstream fTmp ( buf,ios::out );
00632 
00633         // get length of file
00634         fTest.seekg ( 0, ios::end );
00635         uint length = fTest.tellg();
00636         testSize = length/sizeof ( REAL );
00637         fTest.seekg ( 0, ios::beg );
00638 
00639         // allocate memory
00640         char* buffer = new char[length];
00641 
00642         // read data as a block
00643         fTest.read ( buffer,length );
00644         fTest.close();
00645 
00646         // write
00647         fTmp.write ( buffer,length );
00648         delete[] buffer;
00649 
00650         fTmp.close();
00651     }
00652 
00653 
00654     srand ( m_data->m_randSeed );
00655     m_data->readDataset ( m_data->m_datasetName );
00656 
00657     testSize = m_data->m_nTest * m_data->m_nClass * m_data->m_nDomain;
00658 
00659     // calc bag mean
00660     REAL* testMean = new REAL[testSize];
00661     for ( int i=0;i<testSize;i++ )
00662         testMean[i] = 0.0;
00663     for ( int e=0;e<epochs;e++ )
00664     {
00665         char nameBuf[512];
00666         sprintf ( nameBuf,"%s.%d", ( m_data->m_datasetPath+"/"+TMP_PATH+"/testPrediction.data" ).c_str(),e );
00667         fstream f ( nameBuf,ios::in );
00668         float* buf = new float[testSize];
00669         f.read ( ( char* ) buf,sizeof ( float ) *testSize );
00670         f.close();
00671 
00672         // add this run to ensemble
00673         for ( int i=0;i<testSize;i++ )
00674             testMean[i] += buf[i];
00675 
00676         delete[] buf;
00677 
00678 
00679         // per epoch: calculate prediction RMSE and classification error
00680         double classErrBag = 0.0;
00681         double rmseBag = 0.0;
00682 
00683         // go through the test set
00684         for ( uint i=0;i<m_data->m_nTest;i++ )
00685         {
00686             REAL* ensembleOutput = testMean + i * m_data->m_nClass*m_data->m_nDomain;
00687             REAL* ensembleOutputNorm = new REAL[m_data->m_nClass*m_data->m_nDomain];
00688             for ( int j=0;j<m_data->m_nClass*m_data->m_nDomain;j++ )
00689                 ensembleOutputNorm[j] = ensembleOutput[j] / ( ( double ) e+1.0 );
00690 
00691             // if the dataset has classification type, count the #wrong labeled
00692             if ( Framework::getDatasetType() )
00693             {
00694                 for ( int d=0;d<m_data->m_nDomain;d++ )
00695                     if ( getIndexOfMax ( ensembleOutputNorm + d * m_data->m_nClass, m_data->m_nClass ) != m_data->m_testLabelOrig[d+i*m_data->m_nDomain] )
00696                         classErrBag += 1.0;
00697             }
00698 
00699             // rmse calculation over all targets
00700             for ( int j=0;j<m_data->m_nClass*m_data->m_nDomain;j++ )
00701             {
00702                 REAL target = m_data->m_testTargetOrig[i * m_data->m_nClass * m_data->m_nDomain + j];
00703                 REAL prediction = ensembleOutputNorm[j];
00704                 rmseBag += ( prediction - target ) * ( prediction - target );
00705             }
00706 
00707             delete[] ensembleOutputNorm;
00708         }
00709 
00710         if ( Framework::getDatasetType() )
00711             classErrBag = 100.0*classErrBag/ ( ( double ) m_data->m_nTest* ( double ) m_data->m_nDomain );
00712         rmseBag = sqrt ( rmseBag/ ( double ) ( m_data->m_nClass*m_data->m_nDomain*m_data->m_nTest ) );
00713         cout<<e<<": "<<"RMSE:"<<rmseBag<<" classErr:"<<classErrBag<<endl;
00714 
00715     }
00716 
00717     // take the mean
00718     for ( int i=0;i<testSize;i++ )
00719         testMean[i] /= ( REAL ) epochs;
00720 
00721     fstream fTest ( ( m_data->m_datasetPath+"/"+TMP_PATH+"/testPrediction.data" ).c_str(),ios::in );
00722     if ( fTest.is_open() ==false )
00723         assert ( false );
00724     fTest.write ( ( char* ) testMean,sizeof ( float ) *testSize );
00725     fTest.close();
00726 
00727 
00728     // calculate prediction RMSE and classification error
00729     double classErrBag = 0.0;
00730     double rmseBag = 0.0;
00731 
00732     // go through the test set
00733     for ( uint i=0;i<m_data->m_nTest;i++ )
00734     {
00735         REAL* ensembleOutput = testMean + i * m_data->m_nClass*m_data->m_nDomain;
00736 
00737         // if the dataset has classification type, count the #wrong labeled
00738         if ( Framework::getDatasetType() )
00739         {
00740             for ( int d=0;d<m_data->m_nDomain;d++ )
00741                 if ( getIndexOfMax ( ensembleOutput + d * m_data->m_nClass, m_data->m_nClass ) != m_data->m_testLabelOrig[d+i*m_data->m_nDomain] )
00742                     classErrBag += 1.0;
00743         }
00744 
00745         // rmse calculation over all targets
00746         for ( int j=0;j<m_data->m_nClass*m_data->m_nDomain;j++ )
00747         {
00748             REAL target = m_data->m_testTargetOrig[i * m_data->m_nClass * m_data->m_nDomain + j];
00749             REAL prediction = ensembleOutput[j];
00750             rmseBag += ( prediction - target ) * ( prediction - target );
00751         }
00752 
00753     }
00754 
00755     // calc errors
00756     if ( Framework::getDatasetType() )
00757         classErrBag = 100.0*classErrBag/ ( ( double ) m_data->m_nTest* ( double ) m_data->m_nDomain );
00758     rmseBag = sqrt ( rmseBag/ ( double ) ( m_data->m_nClass * m_data->m_nDomain * m_data->m_nTest ) );
00759 
00760     m_predictionRMSE = rmseBag;
00761     m_predictionClassificationError = classErrBag;
00762 
00763     cout<<endl;
00764     cout<<epochs<<" runs"<<endl;
00765     cout<<"Bagging runs (with boostrap sample):   rmseMean:"<<rmseMean/ ( double ) epochs<<"   classErrMean:"<<classErrMean/ ( double ) epochs<<endl;
00766     cout<<"Bagged (mean)                      :   rmse    :"<<rmseBag<<"   classErr    :"<<classErrBag<<endl<<endl;
00767 
00768     delete[] testMean;
00769 }
00770 
00776 void Scheduler::boosting()
00777 {
00778     int epochs = Framework::getAdditionalStartupParameter();
00779     cout<<endl<<endl;
00780     cout<<"================================= Boosting ================================="<<endl;
00781     cout<<"epochs:"<<epochs<<endl<<endl<<endl;
00782     m_boostingRun = true;
00783 
00784     vector<string> boostingFileNames;
00785     uint testSize = 0;
00786     double rmseMean = 0.0, classErrMean = 0.0;
00787     REAL* beta = new REAL[epochs];
00788     for ( m_boostingEpoch=0;m_boostingEpoch<epochs;m_boostingEpoch++ )
00789     {
00790         cout<<"e:"<<m_boostingEpoch<<endl;
00791 
00792         m_randSeedBagBoost = m_boostingEpoch;
00793 
00794         // train and predict testset (testset must be fixed)
00795         train();
00796         predict();
00797 
00798         fstream f ( "A.txt",ios::out );
00799         for ( int i=0;i<m_boostingNTrain;i++ )
00800             f<<m_probs[i]<<endl;
00801         f.close();
00802 
00803         rmseMean += getPredictionRMSE();
00804         classErrMean += getClassificationError();
00805 
00806         fstream fTest ( ( m_data->m_datasetPath+"/"+TMP_PATH+"/testPrediction.data" ).c_str(),ios::in );
00807         if ( fTest.is_open() ==false )
00808             assert ( false );
00809         char buf[512];
00810         sprintf ( buf,"%s.%d", ( m_data->m_datasetPath+"/"+TMP_PATH+"/testPrediction.data" ).c_str(),m_boostingEpoch );
00811         boostingFileNames.push_back ( buf );
00812         fstream fTmp ( buf,ios::out );
00813 
00814         // get length of file
00815         fTest.seekg ( 0, ios::end );
00816         uint length = fTest.tellg();
00817         testSize = length/sizeof ( float );
00818         fTest.seekg ( 0, ios::beg );
00819 
00820         // allocate memory
00821         char* buffer = new char [length];
00822 
00823         // read data as a block
00824         fTest.read ( buffer,length );
00825         fTest.close();
00826 
00827         // write
00828         fTmp.write ( buffer,length );
00829         delete[] buffer;
00830 
00831         fTmp.close();
00832 
00833         // ==================== predict train set =====================
00834         double rmseBoost = 0.0, epsilon = 0.0, rmseTrain = 0.0;
00835         REAL min = m_data->m_negativeTarget, max = m_data->m_positiveTarget;
00836         Framework::setFrameworkMode ( 1 );
00837 
00838         preparePredictionMode();
00839         REAL* ensembleOutput = new REAL[m_data->m_nClass*m_data->m_nDomain];
00840         REAL* loss = new REAL[m_boostingNTrain];
00841         // go through the train set
00842         int nOut = m_data->m_nClass*m_data->m_nDomain;
00843         for ( int i=0;i<m_boostingNTrain;i++ )
00844         {
00845             // predict one example
00846             REAL* inputFeature = m_boostingTrain + i * m_data->m_nFeatures;
00847             getEnsemblePrediction ( inputFeature, ensembleOutput );
00848 
00849             // rmse calculation over all targets
00850             REAL err = 0.0, err2 = 0.0;
00851             for ( int j=0;j<m_data->m_nDomain;j++ )
00852             {
00853                 int indMax = -1;
00854                 REAL maxTarget = -1e10;
00855                 for ( int k=0;k<m_data->m_nClass;k++ )
00856                     if ( maxTarget < m_boostingTargets[i * nOut + m_data->m_nClass*j + k] )
00857                     {
00858                         maxTarget = m_boostingTargets[i * nOut + m_data->m_nClass*j + k];
00859                         indMax = k;
00860                     }
00861                 if ( indMax == -1 )
00862                     assert ( false );
00863                 for ( int k=0;k<m_data->m_nClass;k++ )
00864                 {
00865                     if ( indMax != k )
00866                     {
00867                         REAL predictionTarget = ensembleOutput[m_data->m_nClass*j + indMax];
00868                         REAL prediction = ensembleOutput[m_data->m_nClass*j + k];
00869 
00870                         err += 1.0 - ( predictionTarget-min ) / ( max-min ) + ( prediction-min ) / ( max-min );
00871                         err2 += 1.0 + ( predictionTarget-min ) / ( max-min ) - ( prediction-min ) / ( max-min );
00872                     }
00873                 }
00874 
00875                 for ( int j=0;j<m_data->m_nDomain;j++ )
00876                     for ( int k=0;k<m_data->m_nClass;k++ )
00877                     {
00878                         REAL out = ensembleOutput[m_data->m_nClass*j + k];
00879                         REAL target = m_boostingTargets[i * nOut + m_data->m_nClass*j + k];
00880                         rmseTrain += ( out-target ) * ( out-target );
00881                     }
00882 
00883             }
00884             epsilon += m_probs[i] * err / ( REAL ) ( m_data->m_nClass-1 );
00885             loss[i] = err2 / ( REAL ) ( m_data->m_nClass-1 );
00886         }
00887         rmseTrain = sqrt ( rmseTrain/ ( double ) ( m_boostingNTrain*m_data->m_nClass*m_data->m_nDomain ) );
00888         cout<<"rmseTrain(boosting):"<<rmseTrain<<endl;
00889         epsilon *= 0.5;
00890         beta[m_boostingEpoch] = epsilon / ( 1.0 - epsilon );
00891         // update example probabilities
00892         for ( int i=0;i<m_boostingNTrain;i++ )
00893             m_probs[i] *= pow ( beta[m_boostingEpoch], 0.5 * loss[i] );
00894         double sum = 0.0;
00895         for ( int i=0;i<m_boostingNTrain;i++ )
00896             sum += m_probs[i];
00897         // normalize
00898         for ( int i=0;i<m_boostingNTrain;i++ )
00899             m_probs[i] /= sum;
00900 
00901         delete[] loss;
00902         delete[] ensembleOutput;
00903 
00904         endPredictionMode();
00905     }
00906 
00907     // read test data
00908     srand ( m_data->m_randSeed );
00909     m_data->readDataset ( m_data->m_datasetName );
00910 
00911     // calc boosting mean
00912     cout<<endl<<endl<<"#test values:"<<testSize<<" (dataset size:"<<m_data->m_nTest<<")"<<endl;
00913     REAL* testMean = new REAL[testSize];
00914     for ( int i=0;i<testSize;i++ )
00915         testMean[i] = 0.0;
00916     for ( int e=0;e<epochs;e++ )
00917     {
00918         cout<<"Cascade layer "<<e<<": weight:"<<log10 ( 1.0/beta[e] ) <<"  "<<flush;
00919         fstream f ( boostingFileNames[e].c_str(),ios::in );
00920         if ( f.is_open() == false )
00921             assert ( false );
00922         float* buf = new float[testSize];
00923         f.read ( ( char* ) buf,sizeof ( float ) *testSize );
00924         f.close();
00925 
00926         // add this run to ensemble
00927         for ( int i=0;i<testSize;i++ )
00928         {
00929             REAL w = log10 ( 1.0/beta[e] );
00930             testMean[i] += w*buf[i];
00931         }
00932         delete[] buf;
00933 
00934 
00935         // Calculate per-epoch errors
00936         // go through the test set
00937         double classErrBoostingPerEpoch = 0.0;
00938         double rmseBoostingPerEpoch = 0.0;
00939         double rmseBoostingPerEpoch0 = 0.0;
00940         double rmseBoostingPerEpoch1 = 0.0;
00941         for ( int i=0;i<m_data->m_nTest;i++ )
00942         {
00943             REAL* ensembleOutput = testMean + i * m_data->m_nClass*m_data->m_nDomain;
00944             REAL* ensembleOutputNorm0 = new REAL[m_data->m_nClass*m_data->m_nDomain];
00945             REAL* ensembleOutputNorm1 = new REAL[m_data->m_nClass*m_data->m_nDomain];
00946 
00947             REAL norm0 = 0.0;
00948             for ( int j=0;j<=e;j++ )
00949                 norm0 += log10 ( 1.0/beta[e] );
00950             for ( int j=0;j<m_data->m_nClass*m_data->m_nDomain;j++ )
00951             {
00952                 ensembleOutputNorm0[j] = ensembleOutput[j]/ ( REAL ) ( e+1 );
00953                 ensembleOutputNorm1[j] = ensembleOutput[j]/norm0;
00954             }
00955 
00956             // if the dataset has classification type, count the #wrong labeled
00957             if ( Framework::getDatasetType() )
00958             {
00959                 for ( int d=0;d<m_data->m_nDomain;d++ )
00960                     if ( getIndexOfMax ( ensembleOutput + d * m_data->m_nClass, m_data->m_nClass ) != m_data->m_testLabelOrig[d+i*m_data->m_nDomain] )
00961                         classErrBoostingPerEpoch += 1.0;
00962             }
00963 
00964             // rmse calculation over all targets
00965             for ( int j=0;j<m_data->m_nClass*m_data->m_nDomain;j++ )
00966             {
00967                 REAL target = m_data->m_testTargetOrig[i * m_data->m_nClass * m_data->m_nDomain + j];
00968                 REAL prediction = ensembleOutput[j];
00969                 rmseBoostingPerEpoch += ( prediction - target ) * ( prediction - target );
00970 
00971                 prediction = ensembleOutputNorm0[j];
00972                 rmseBoostingPerEpoch0 += ( prediction - target ) * ( prediction - target );
00973 
00974                 prediction = ensembleOutputNorm1[j];
00975                 rmseBoostingPerEpoch1 += ( prediction - target ) * ( prediction - target );
00976             }
00977 
00978             delete[] ensembleOutputNorm0;
00979             delete[] ensembleOutputNorm1;
00980         }
00981         // calc errors
00982         if ( Framework::getDatasetType() )
00983             classErrBoostingPerEpoch = 100.0*classErrBoostingPerEpoch/ ( ( double ) m_data->m_nTest* ( double ) m_data->m_nDomain );
00984         rmseBoostingPerEpoch = sqrt ( rmseBoostingPerEpoch/ ( double ) ( m_data->m_nClass * m_data->m_nDomain * m_data->m_nTest ) );
00985         rmseBoostingPerEpoch0 = sqrt ( rmseBoostingPerEpoch0/ ( double ) ( m_data->m_nClass * m_data->m_nDomain * m_data->m_nTest ) );
00986         rmseBoostingPerEpoch1 = sqrt ( rmseBoostingPerEpoch1/ ( double ) ( m_data->m_nClass * m_data->m_nDomain * m_data->m_nTest ) );
00987         cout<<"Boosting:  rmse:"<<rmseBoostingPerEpoch<<"  rmse0:"<<rmseBoostingPerEpoch0<<"  rmse1:"<<rmseBoostingPerEpoch1<<"  classErr:"<<classErrBoostingPerEpoch<<"%"<<endl;
00988     }
00989 
00990     // take the mean
00991     for ( int i=0;i<testSize;i++ )
00992         testMean[i] /= ( REAL ) epochs;
00993 
00994     fstream fTest ( ( m_data->m_datasetPath+"/"+TMP_PATH+"/testPrediction.data" ).c_str(),ios::in );
00995     if ( fTest.is_open() ==false )
00996         assert ( false );
00997     fTest.write ( ( char* ) testMean,sizeof ( float ) *testSize );
00998     fTest.close();
00999 
01000 
01001     // calculate prediction RMSE and classification error
01002     double classErrBoosting = 0.0;
01003     double rmseBoosting = 0.0;
01004 
01005     // go through the test set
01006     for ( int i=0;i<m_data->m_nTest;i++ )
01007     {
01008         REAL* ensembleOutput = testMean + i * m_data->m_nClass*m_data->m_nDomain;
01009 
01010         // if the dataset has classification type, count the #wrong labeled
01011         if ( Framework::getDatasetType() )
01012         {
01013             for ( int d=0;d<m_data->m_nDomain;d++ )
01014                 if ( getIndexOfMax ( ensembleOutput + d * m_data->m_nClass, m_data->m_nClass ) != m_data->m_testLabelOrig[d+i*m_data->m_nDomain] )
01015                     classErrBoosting += 1.0;
01016         }
01017 
01018         // rmse calculation over all targets
01019         for ( int j=0;j<m_data->m_nClass*m_data->m_nDomain;j++ )
01020         {
01021             REAL target = m_data->m_testTargetOrig[i * m_data->m_nClass * m_data->m_nDomain + j];
01022             REAL prediction = ensembleOutput[j];
01023             rmseBoosting += ( prediction - target ) * ( prediction - target );
01024         }
01025 
01026     }
01027 
01028     // calc errors
01029     if ( Framework::getDatasetType() )
01030         classErrBoosting = 100.0*classErrBoosting/ ( ( double ) m_data->m_nTest* ( double ) m_data->m_nDomain );
01031     rmseBoosting = sqrt ( rmseBoosting/ ( double ) ( m_data->m_nClass * m_data->m_nDomain * m_data->m_nTest ) );
01032 
01033     m_predictionRMSE = rmseBoosting;
01034     m_predictionClassificationError = classErrBoosting;
01035 
01036     cout<<endl;
01037     cout<<epochs<<" runs"<<endl;
01038     cout<<"Boosting runs (mean boostrap sample):   rmseMean:"<<rmseMean/ ( double ) epochs<<"   classErrMean:"<<classErrMean/ ( double ) epochs<<"%"<<endl;
01039     cout<<"Boosting (mean)                     :   rmse    :"<<rmseBoosting<<"   classErr    :"<<classErrBoosting<<"%"<<endl<<endl;
01040 
01041     delete[] testMean;
01042 }
01043 
01051 void Scheduler::checkAlgorithmTemplate ( string fname, string &algoName, string &id )
01052 {
01053     // check, if the algorithm line exists
01054     fstream f ( fname.c_str(), ios::in );
01055     if ( f.is_open() == false )
01056         assert ( false );
01057     string firstLine, secondLine, thirdLine;
01058     f>>firstLine;
01059     f>>secondLine;
01060     f>>thirdLine;
01061     f.close();
01062     int pos = firstLine.find ( "=" );
01063     string name = firstLine.substr ( 0, pos );
01064     algoName = firstLine.substr ( pos+1 );
01065     if ( name != "ALGORITHM" )
01066     {
01067         cout<<"Wrong dsc file, no ALGORITHM=.. found in first line"<<endl;
01068         exit ( 0 );
01069     }
01070     pos = secondLine.find ( "=" );
01071     name = secondLine.substr ( 0, pos );
01072     id = secondLine.substr ( pos+1 );
01073     if ( name != "ID" )
01074     {
01075         cout<<"Wrong dsc file, no ID=.. found in second line"<<endl;
01076         exit ( 0 );
01077     }
01078 }
01079 
01085 void Scheduler::trainAlgorithm ( string fnameTemplate, string fnameDsc ) //string fname)
01086 {
01087     cout<<"Train algorithm:"<<fnameTemplate<<endl;
01088 
01089     string algoName, id;
01090     checkAlgorithmTemplate ( fnameTemplate, algoName, id );
01091 
01092     // read dsc file
01093     m_data->readDscFile ( fnameTemplate );
01094     if ( m_data->m_disableTraining )
01095     {
01096         cout<<"Training disabled."<<endl;
01097         return;
01098     }
01099 
01100     // copy the content of the template to the dsc file
01101     fstream fAlgoTemplate ( fnameTemplate.c_str(), ios::in );
01102     fstream fAlgo ( fnameDsc.c_str(), ios::out );
01103     cout<<"AlgoTemplate:"<<fnameTemplate<<"  Algo:"<<fnameDsc<<endl;
01104     char buf[1024];
01105     while ( fAlgoTemplate.getline ( buf, 1024 ) ) // read all lines
01106     {
01107         string line = string ( buf );
01108         fAlgo<<line<<endl;
01109     }
01110     fAlgoTemplate.close();
01111     fAlgo.close();
01112 
01113     // redirect cout to filename
01114     cout.setOutputFile ( fnameDsc );
01115 
01116     cout<<"Floating point precision: "<< ( int ) sizeof ( REAL ) <<" Bytes"<<endl;
01117 
01118     m_data->partitionDatasetToCrossValidationSets();
01119 
01120     // start the algorithm
01121     Algorithm* algo = 0;
01122     algorithmDispatcher ( algo, algoName );
01123     algo->setDataPointers ( m_data );
01124 
01125     if ( m_data->m_enableFeatureSelection )
01126     {
01127         algo->doFeatureSelection();
01128         exit ( 0 );
01129     }
01130     else
01131         algo->train();
01132 
01133     if ( algo )
01134     {
01135         cout<<"delete algo"<<endl;
01136         delete algo;
01137     }
01138     algo = 0;
01139     cout<<"Finished train algorithm:"<<fnameTemplate<<endl;
01140 
01141 }
01142 
01149 int Scheduler::getIDFromFullPredictor ( string fullPredictor )
01150 {
01151     if ( fullPredictor=="" )
01152         return 0;
01153     for ( int i=0;i<m_algorithmObjectList.size();i++ )
01154         if ( m_algorithmObjectList[i]->m_stringMap["fullPrediction"] == fullPredictor )
01155             return m_algorithmObjectList[i]->m_algorithmID;
01156     cout<<"Error, this fullPredictor was not found:"<<fullPredictor<<endl;
01157     assert ( false );
01158 }
01159 
01167 void Scheduler::getEnsemblePrediction ( REAL* input, REAL* output )
01168 {
01169     int N = m_algorithmList.size();
01170     REAL* tmp = new REAL[m_data->m_nFeatures+N];
01171     
01172     // predict all targets per algorithm
01173     // if the algorithm needs a preprocessor, the effect file is loaded
01174     for ( int i=0;i<N;i++ )
01175     {
01176         // effect = pre-processor for this algorithm
01177         int ID = m_effectID[i];
01178         REAL* effect = m_noEffect;  // constant zero
01179         REAL* outputVector = m_outputs[i+1]; // +1: jump over constant 1
01180         if ( ID != 0 )
01181         {
01182             if ( ID < 0 || ID > i )
01183                 assert ( false );
01184             effect = m_outputs[ID];  // output of another prediction as effect
01185         }
01186 
01187         // cascade learning: add predictions of previous model as input to current
01188         if ( m_data->m_enableCascadeLearning )
01189         {
01190             int nF = m_data->m_nFeatures;
01191             int nFAlgo = m_algorithmObjectList[i]->m_nFeatures;
01192 
01193             // add input feature + normalize
01194             for ( int j=0;j<nF;j++ )
01195                 tmp[j] = ( input[j] - m_data->m_mean[j] ) / m_data->m_std[j];
01196 
01197             // add predictions + normalize
01198             for ( int j=0;j<i;j++ ) // over all previous models
01199             {
01200                 REAL* previousOutputVector = m_outputs[j+1];
01201                 int nOut = m_data->m_nClass*m_data->m_nDomain;
01202                 for ( int k=0;k<nOut;k++ )
01203                     tmp[nF+j*nOut+k] = ( previousOutputVector[k] - m_data->m_mean[nF+j*nOut+k] ) / m_data->m_std[nF+j*nOut+k];
01204             }
01205         }
01206         else  // standard
01207         {
01208             for ( int j=0;j<m_data->m_nFeatures;j++ )
01209                 tmp[j] = ( input[j] - m_data->m_mean[j] ) / m_data->m_std[j];
01210         }
01211 
01212         if ( m_data->m_validationType=="Retraining" || m_data->m_validationType=="ValidationSet")
01213             m_algorithmObjectList[i]->predictMultipleOutputs ( tmp, effect, outputVector, m_labelsPredict, 1, m_data->m_nCross );
01214         else if ( m_data->m_validationType=="CrossFoldMean" || m_data->m_validationType=="Bagging" )
01215         {
01216             for ( int j=0;j<m_data->m_nClass*m_data->m_nDomain;j++ )
01217                 outputVector[j] = 0.0;
01218             for ( int j=0;j<m_data->m_nCross;j++ )
01219             {
01220                 m_algorithmObjectListList[i][j]->predictMultipleOutputs ( tmp, effect, m_outputVectorTmp, m_labelsTmp, 1, j );
01221                 for ( int k=0;k<m_data->m_nClass*m_data->m_nDomain;k++ )
01222                     outputVector[k] += m_outputVectorTmp[k];
01223             }
01224             for ( int j=0;j<m_data->m_nClass*m_data->m_nDomain;j++ )
01225                 outputVector[j] /= ( REAL ) m_data->m_nCross;
01226 
01227             // calc output labels (for classification dataset)
01228             if ( Framework::getDatasetType() )
01229             {
01230                 // in all domains
01231                 for ( int d=0;d<m_data->m_nDomain;d++ )
01232                 {
01233                     // find max. output value
01234                     int indMax = -1;
01235                     REAL max = -1e10;
01236                     for ( int j=0;j<m_data->m_nClass;j++ )
01237                     {
01238                         if ( max < outputVector[d*m_data->m_nClass+j] )
01239                         {
01240                             max = outputVector[d*m_data->m_nClass+j];
01241                             indMax = j;
01242                         }
01243                     }
01244                     m_labelsPredict[d] = indMax;
01245                 }
01246             }
01247 
01248         }
01249         else
01250             assert(false);
01251     }
01252 
01253     delete[] tmp;
01254     
01255     // calculate the ensemble output with the blender
01256     if ( m_data->m_enablePostNNBlending )
01257         m_blenderNN->predictEnsembleOutput ( m_outputs, output );
01258     else
01259         m_blender->predictEnsembleOutput ( m_outputs, output );
01260 }
01261 
/**
 * Set up the whole ensemble for prediction.
 *
 * Reloads the dataset, loads the normalization constants and blending
 * weights, switches every algorithm of the master dsc list into prediction
 * mode and allocates the per-model output/effect buffers that
 * getEnsemblePrediction() writes into. Must be paired with a later call to
 * endPredictionMode(), which frees everything allocated here.
 */
void Scheduler::preparePredictionMode()
{
    cout<<"Start scheduled prediction"<<endl;

    // fix random seed
    srand ( m_data->m_randSeed );

    // load test set
    m_data->readDataset ( m_data->m_datasetName );
    srand ( m_data->m_randSeed );

    // with a fixed validation set there are no cross-validation folds
    if(m_data->m_validationType=="ValidationSet")
        m_data->m_nCross = 0;
    
    // number of algorithms in the ensemble
    int N = m_algorithmList.size();

    // load normalization (mean, std)
    // cascade learning extends the feature vector by the previous models'
    // outputs, hence the different normalization file
    if ( m_data->m_enableCascadeLearning )
        m_data->loadNormalization ( N-1 );
    else
        m_data->loadNormalization();

    // go to prediction mode in all template files
    m_algorithmIDList.clear();
    for ( int i=0;i<N;i++ )
    {
        string fAlgoTemplateName = m_data->m_datasetPath + "/" + m_algorithmList[i];
        setPredictionModeInAlgorithm ( fAlgoTemplateName );
    }

    // new NN blender (only when post-blending with a neural net is enabled)
    if ( m_data->m_enablePostNNBlending )
    {
        m_blenderNN = new BlendingNN();
        m_blenderNN->setDataPointers ( m_data );
        m_blenderNN->readDscFile ( m_data->m_datasetPath + "/NeuralNetwork.dscblend" );
        m_blenderNN->readSpecificMaps();
        m_blenderNN->loadWeights();
    }

    // load blending weights (N+1: the models plus the constant-1 column)
    m_blender = new BlendStopping ( ( Algorithm* ) m_data );
    m_blender->loadBlendingWeights ( m_data->m_datasetPath + "/" + m_data->m_tempPath, N+1 );

    for ( int i=0;i<N;i++ )
        cout<<"ALGO FROM MASTER DSC-FILE:"<<m_algorithmObjectList[i]->m_stringMap["fullPrediction"]<<endl;

    m_blender->printWeights();

    int nClass = m_data->m_nClass;
    int nDomain = m_data->m_nDomain;

    // precompute IDs from effect files per prediction
    m_effectID = new int[N];
    for ( int i=0;i<N;i++ )
        m_effectID[i] = getIDFromFullPredictor ( m_algorithmObjectList[i]->m_trainOnFullPredictorFile );

    // used in prediction mode
    // per sample: store prediction for every model
    m_noEffect = new REAL[nClass*nDomain];  // all-zero "no effect" vector
    for ( int i=0;i<nClass*nDomain;i++ )
        m_noEffect[i] = 0.0;
    m_outputs = new REAL*[N+1];  // slot 0 holds the constant-1 blend input
    m_effects = new REAL*[N+1];
    for ( int i=0;i<N+1;i++ )
    {
        m_outputs[i] = new REAL[nClass*nDomain];
        m_effects[i] = new REAL[nClass*nDomain];
        for ( int j=0;j<nClass*nDomain;j++ )
        {
            m_outputs[i][j] = 1.0;  // init with constant 1.0 (needed in blend)
            m_effects[i][j] = 0.0;
        }
    }

    // tmp variable for predict labels
    m_labelsPredict = new int[m_data->m_nDomain];

}
01346 
01351 void Scheduler::endPredictionMode()
01352 {
01353     cout<<"End scheduled prediction"<<endl;
01354     m_data->deleteMemory();
01355 
01356     for ( int i=0;i<m_algorithmObjectList.size();i++ )
01357         delete m_algorithmObjectList[i];
01358     m_algorithmObjectList.clear();
01359 
01360     if ( m_data->m_enablePostNNBlending )
01361         delete m_blenderNN;
01362     delete m_blender;
01363     delete[] m_effectID;
01364     int N = m_algorithmList.size();
01365     for ( int i=0;i<N+1;i++ )
01366     {
01367         delete[] m_outputs[i];
01368         delete[] m_effects[i];
01369     }
01370     delete[] m_noEffect;
01371     delete[] m_outputs;
01372     delete[] m_effects;
01373     delete[] m_labelsPredict;
01374 
01375 }
01376 
01384 int Scheduler::getIndexOfMax ( REAL* vector, int length )
01385 {
01386     int indMax = -1;
01387     REAL max = -1e10;
01388     for ( int i=0;i<length;i++ )
01389     {
01390         if ( max < vector[i] )
01391         {
01392             max = vector[i];
01393             indMax = i;
01394         }
01395     }
01396 
01397     return indMax;
01398 }
01399 
01407 void Scheduler::setPredictionModeInAlgorithm ( string fname )
01408 {
01409     cout<<"Prediction mode in algorithm:"<<fname<<endl;
01410 
01411     // check the dsc file
01412     string algoName, id;
01413     checkAlgorithmTemplate ( fname, algoName, id );
01414 
01415     // read dsc file
01416     m_data->readDscFile ( fname );
01417 
01418     // make an instance of the algorithm and give him the data
01419     if ( m_data->m_validationType=="Retraining" || m_data->m_validationType=="ValidationSet")
01420     {
01421         Algorithm* algo = 0;
01422         algorithmDispatcher ( algo, algoName );
01423         algo->setDataPointers ( m_data );
01424         algo->setPredictionMode ( m_data->m_nCross );
01425 
01426         // add the algorithm to internal object list of algorithms
01427         m_algorithmObjectList.push_back ( algo );
01428     }
01429     else if ( m_data->m_validationType=="CrossFoldMean" || m_data->m_validationType=="Bagging" )
01430     {
01431         cout<<"Make "<<m_data->m_nCross<<" models ready to predict"<<endl;
01432         Algorithm** algoList = new Algorithm*[m_data->m_nCross];
01433         for ( int i=0;i<m_data->m_nCross;i++ )
01434         {
01435             Algorithm* algo = 0;
01436             algorithmDispatcher ( algo, algoName );
01437             algo->setDataPointers ( m_data );
01438             algo->setPredictionMode ( i );
01439             algoList[i] = algo;
01440         }
01441         m_algorithmObjectListList.push_back ( algoList );
01442         m_algorithmObjectList.push_back ( algoList[0] );
01443     }
01444     else
01445         assert(false);
01446 
01447     // check, if id already exist
01448     for ( int i=0;i<m_algorithmIDList.size();i++ )
01449         if ( m_algorithmIDList[i] == atoi ( id.c_str() ) )
01450         {
01451             cout<<"ID:"<<id<<" in "<<algoName<<" already exists"<<endl;
01452             assert ( false );
01453         }
01454 
01455     m_algorithmIDList.push_back ( atoi ( id.c_str() ) );
01456 
01457     m_algorithmNameList.push_back ( algoName );
01458 
01459     cout<<endl;
01460 }
01461 
01468 void Scheduler::algorithmDispatcher ( Algorithm* &algo, string name )
01469 {
01470     if ( name == "LinearModel" )
01471         algo = new LinearModel();
01472     else if ( name == "KNearestNeighbor" )
01473         algo = new KNearestNeighbor();
01474     else if ( name == "NeuralNetwork" )
01475         algo = new NeuralNetwork();
01476     else if ( name == "PolynomialRegression" )
01477         algo = new PolynomialRegression();
01478     else if ( name == "LinearModelNonNeg" )
01479         algo = new LinearModelNonNeg();
01480     else if ( name == "KernelRidgeRegression" )
01481         algo = new KernelRidgeRegression();
01482     else if ( name == "NeuralNetworkRBMauto" )
01483         algo = new NeuralNetworkRBMauto();
01484     else if ( name == "Autoencoder" )
01485         algo = new Autoencoder();
01486     else if ( name == "GBDT" )
01487         algo = new GBDT();
01488     else if ( name == "LogisticRegression" )
01489         algo = new LogisticRegression();
01490     else
01491         assert ( false );
01492 }
01493 
/**
 * Generate the text of a master dsc file with default settings.
 *
 * The output format is consumed by readMasterDscFile(): a block of
 * key=value lines followed by an [ALGORITHMS] section listing one
 * algorithm dsc filename per line. The exact key spellings are part of
 * the file format — do not change them.
 *
 * @param dataset   Dataset name written to the "dataset" key
 * @param isClass   True for a classification dataset
 * @param algos     Algorithm dsc filenames for the [ALGORITHMS] section
 * @param rSeed     Random seed written to "randomSeed"
 * @param blendAlgo Value for the "blendingAlgorithm" key
 * @param cascade   Value for the "enableCascadeLearning" key
 * @return The complete master dsc file content as one string
 */
string Scheduler::masterDscTemplateGenerator ( string dataset, bool isClass, vector<string> algos, int rSeed, string blendAlgo, bool cascade )
{
    stringstream s;
    // general dataset / framework settings
    s<<"dataset="<<dataset<<endl;
    s<<"isClassificationDataset="<<isClass<<endl;
    s<<"maxThreads=2"<<endl;
    s<<"maxThreadsInCross=2"<<endl;
    s<<"nCrossValidation=6"<<endl;
    s<<"validationType=Retraining"<<endl;
    s<<"positiveTarget=1.0"<<endl;
    s<<"negativeTarget=-1.0"<<endl;
    s<<"randomSeed="<<rSeed<<endl;
    s<<"nMixDataset=20"<<endl;
    s<<"nMixTrainList=100"<<endl;
    s<<"standardDeviationMin=0.01"<<endl;
    // blending settings
    s<<"blendingRegularization=1e-4"<<endl;
    s<<"blendingEnableCrossValidation=0"<<endl;
    s<<"blendingAlgorithm="<<blendAlgo<<endl;
    s<<"enablePostNNBlending=0"<<endl;
    s<<"enableCascadeLearning="<<cascade<<endl;
    s<<"enableGlobalMeanStdEstimate=0"<<endl;
    s<<"enableSaveMemory=1"<<endl;
    s<<"addOutputNoise=0"<<endl;
    s<<"enablePostBlendClipping=0"<<endl;
    // feature selection / normalization settings
    s<<"enableFeatureSelection=0"<<endl;
    s<<"featureSelectionWriteBinaryDataset=0"<<endl;
    s<<"enableGlobalBlendingWeights=1"<<endl;
    s<<"errorFunction=RMSE"<<endl;
    s<<"disableWriteDscFile=0"<<endl;
    s<<"enableStaticNormalization=0"<<endl;
    s<<"staticMeanNormalization=0.0"<<endl;
    s<<"staticStdNormalization=1.0"<<endl;
    s<<"enableProbablisticNormalization=0"<<endl;
    s<<"dimensionalityReduction=no"<<endl;
    s<<"subsampleTrainSet=1.0"<<endl;
    s<<"subsampleFeatures=1.0"<<endl;
    s<<"globalTrainingLoops=1"<<endl;
    s<<"addConstantInput=0"<<endl;
    s<<endl;
    // algorithm list section
    s<<"[ALGORITHMS]"<<endl;
    for ( int i=0;i<algos.size();i++ )
        s<<algos[i]<<endl;

    return s.str();
}
01545 
/**
 * @return The RMSE of the last ensemble prediction on the test set
 *         (written by bagging()/boosting(); 0 before any run).
 */
REAL Scheduler::getPredictionRMSE()
{
    return m_predictionRMSE;
}
01554 
/**
 * @return The classification error [%] of the last ensemble prediction on the
 *         test set (written by bagging()/boosting(); only meaningful for
 *         classification datasets).
 */
REAL Scheduler::getClassificationError()
{
    return m_predictionClassificationError;
}

Generated on Tue Jan 26 09:20:59 2010 for ELF by  doxygen 1.5.8