#include "Scheduler.h"

extern StreamOutput cout;

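/**
 * Constructor
 *
 * Zero-initializes all member pointers and counters, then allocates the
 * central Data object and sets its working paths.
 */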
Scheduler::Scheduler()
{
    cout<<"Scheduler"<<endl;

    m_data = 0;
    m_blender = 0;
    m_blenderNN = 0;
    m_labelsPredict = 0;
    m_effectID = 0;
    m_noEffect = 0;
    m_outputs = 0;
    m_effects = 0;
    m_predictionRMSE = 0;
    m_predictionClassificationError = 0;

    m_data = new Data();
    m_data->setPathes ( TMP_PATH, DSC_PATH, FULL_PREDICTOR_PATH, DATA_PATH );

    m_baggingRun = 0;
    m_boostingRun = 0;
    m_randSeedBagBoost = 0;
    m_probs = 0;
    m_boostingTrain = 0;
    m_boostingTargets = 0;
    m_boostingNTrain = 0;
}

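/**
 * Destructor
 *
 * Frees the central Data object.
 */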
Scheduler::~Scheduler()
{
    cout<<"destructor Scheduler"<<endl;
    if ( m_data )
        delete m_data;
    m_data = 0;
}

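/**
 * Read the master description file (Master.dsc)
 *
 * Parses the global key=value settings (dataset name, threading, cross
 * validation, blending, normalization, ...) and the [ALGORITHMS] list of
 * per-algorithm template files.
 *
 * @param path Path to the dataset directory
 * @param masterName File name of the master .dsc file
 */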
void Scheduler::readMasterDscFile ( string path, string masterName )
{
    m_data->m_datasetPath = path;
    cout<<"Open master .dsc file:"<< ( path + "/" + masterName ) <<endl;
    fstream fMaster ( ( path + "/" + masterName ).c_str(), ios::in );

    if ( fMaster.is_open() == 0 )
    {
        cout<<"Error: no Master.dsc file found in "<<path<<endl;
        exit ( 0 );
    }

    char buf[1024];
    bool readAlgorithmList = false;
    while ( fMaster.getline ( buf, 1024 ) )
    {
        string line = string ( buf );

        // skip empty lines and comments
        if ( line=="" || line[0]=='#' )
            continue;

        // everything after the [ALGORITHMS] marker is an algorithm template file
        if ( readAlgorithmList )
        {
            m_algorithmList.push_back ( line );
            continue;
        }

        if ( line=="[ALGORITHMS]" )
        {
            readAlgorithmList = true;
            continue;
        }

        // parse a key=value pair
        int pos = line.find ( "=" );
        string name = line.substr ( 0, pos );
        string value = line.substr ( pos+1 );

        if ( name=="dataset" )
            m_data->m_datasetName = value;
        if ( name=="isClassificationDataset" )
            Framework::setDatasetType ( atoi ( value.c_str() ) );
        if ( name=="maxThreads" )
        {
            cout<<"Set max. threads in MKL and IPP: "<<atoi ( value.c_str() ) <<endl;
            mkl_set_num_threads ( atoi ( value.c_str() ) );
            ippSetNumThreads ( atoi ( value.c_str() ) );
        }
        if ( name=="maxThreadsInCross" )
        {
            Framework::setMaxThreads ( atoi ( value.c_str() ) );
            m_data->m_maxThreadsInCross = atoi ( value.c_str() );
        }
        if ( name=="nCrossValidation" )
        {
            m_data->m_nCross = atoi ( value.c_str() );
            cout<<"Train "<<m_data->m_nCross<<"-fold cross validation"<<endl;
        }
        if ( name=="validationType" )
        {
            assert ( value=="Retraining" || value=="CrossFoldMean" || value=="Bagging" || value=="ValidationSet" );
            m_data->m_validationType = value;
            cout<<"ValidationType: "<<value<<endl;
        }
        if ( name=="positiveTarget" )
            m_data->m_positiveTarget = atof ( value.c_str() );
        if ( name=="negativeTarget" )
            m_data->m_negativeTarget = atof ( value.c_str() );
        if ( name=="standardDeviationMin" )
            m_data->m_standardDeviationMin = atof ( value.c_str() );
        if ( name=="randomSeed" )
        {
            if ( value=="time(0)" )
                m_data->m_randSeed = time ( 0 );
            else
                m_data->m_randSeed = atoi ( value.c_str() );
            cout<<"Set random seed to: "<<m_data->m_randSeed<<endl;
            setRandomSeed ( m_data->m_randSeed );
        }
        if ( name=="nMixDataset" )
            m_data->m_nMixDataset = atoi ( value.c_str() );
        if ( name=="nMixTrainList" )
            m_data->m_nMixTrainList = atoi ( value.c_str() );
        if ( name=="blendingRegularization" )
            m_data->m_blendingRegularization = atof ( value.c_str() );
        if ( name=="blendingAlgorithm" )
            m_data->m_blendingAlgorithm = value;
        if ( name=="blendingEnableCrossValidation" )
            m_data->m_blendingEnableCrossValidation = atoi ( value.c_str() );
        if ( name=="enablePostNNBlending" )
            m_data->m_enablePostNNBlending = atoi ( value.c_str() );
        if ( name=="enableCascadeLearning" )
            m_data->m_enableCascadeLearning = atoi ( value.c_str() );
        if ( name=="enableGlobalMeanStdEstimate" )
            m_data->m_enableGlobalMeanStdEstimate = atoi ( value.c_str() );
        if ( name=="enableSaveMemory" )
            m_data->m_enableSaveMemory = atoi ( value.c_str() );
        if ( name=="errorFunction" )
            m_data->m_errorFunction = value;
        if ( name=="enablePostBlendClipping" )
            m_data->m_enablePostBlendClipping = atoi ( value.c_str() );
        if ( name=="addOutputNoise" )
            m_data->m_addOutputNoise = atof ( value.c_str() );
        if ( name=="enableFeatureSelection" )
            m_data->m_enableFeatureSelection = atoi ( value.c_str() );
        if ( name=="featureSelectionWriteBinaryDataset" )
            m_data->m_featureSelectionWriteBinaryDataset = atoi ( value.c_str() );
        if ( name=="enableGlobalBlendingWeights" )
            m_data->m_enableGlobalBlendingWeights = atoi ( value.c_str() );
        if ( name=="disableWriteDscFile" )
        {
            m_data->m_disableWriteDscFile = atoi ( value.c_str() );
            if ( m_data->m_disableWriteDscFile )
                cout.disableFileOutputs();
        }
        if ( name=="enableStaticNormalization" )
            m_data->m_enableStaticNormalization = atoi ( value.c_str() );
        if ( name=="staticMeanNormalization" )
            m_data->m_staticMeanNormalization = atof ( value.c_str() );
        if ( name=="staticStdNormalization" )
            m_data->m_staticStdNormalization = atof ( value.c_str() );
        if ( name=="enableProbablisticNormalization" )
            m_data->m_enableProbablisticNormalization = atoi ( value.c_str() );
        if ( name=="dimensionalityReduction" )
            m_data->m_dimensionalityReduction = value;
        if ( name=="subsampleTrainSet" )
            m_data->m_subsampleTrainSet = atof ( value.c_str() );
        if ( name=="subsampleFeatures" )
            m_data->m_subsampleFeatures = atof ( value.c_str() );
        if ( name=="globalTrainingLoops" )
            m_data->m_globalTrainingLoops = atoi ( value.c_str() );
        if ( name=="addConstantInput" )
            m_data->m_addConstantInput = atoi ( value.c_str() );
    }

    fMaster.close();
}

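/**
 * Train all algorithms listed in the master .dsc file
 *
 * Optionally trains an Autoencoder for dimensionality reduction first, then
 * trains every algorithm template (possibly over several global training
 * loops, with bagging/boosting sampling if enabled) and finally an optional
 * neural-network blender on top of the full predictions.
 */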
void Scheduler::train()
{
    Framework::setFrameworkMode ( 0 );

    cout<<"Start scheduled training"<<endl;

    cout<<"Fill data"<<endl;

    // check whether all files of a previously trained Autoencoder are present
    fstream fA0 ( ( m_data->m_datasetPath + "/" + m_data->m_tempPath + "/AutoencoderDataMean.dat" ).c_str(), ios::in );
    fstream fA1 ( ( m_data->m_datasetPath + "/" + m_data->m_tempPath + "/AutoencoderDataStd.dat" ).c_str(), ios::in );
    fstream fA2 ( ( m_data->m_datasetPath + "/" + m_data->m_tempPath + "/AutoencoderDataTest.dat" ).c_str(), ios::in );
    fstream fA3 ( ( m_data->m_datasetPath + "/" + m_data->m_tempPath + "/AutoencoderDataTestTarget.dat" ).c_str(), ios::in );
    fstream fA4 ( ( m_data->m_datasetPath + "/" + m_data->m_tempPath + "/AutoencoderDataTrain.dat" ).c_str(), ios::in );
    fstream fA5 ( ( m_data->m_datasetPath + "/" + m_data->m_tempPath + "/AutoencoderDataTrainTarget.dat" ).c_str(), ios::in );
    fstream fA6 ( ( m_data->m_datasetPath + "/" + m_data->m_tempPath + "/AutoencoderWeights.dat" ).c_str(), ios::in );

    bool autoencoderFilesOK = fA0.is_open() && fA1.is_open() && fA2.is_open() && fA3.is_open() && fA4.is_open() && fA5.is_open() && fA6.is_open();

    // train the Autoencoder first if dimensionality reduction is requested and no trained model exists yet
    if ( m_data->m_dimensionalityReduction == "Autoencoder" && autoencoderFilesOK == false )
    {
        cout<<"Autoencoder: start training"<<endl;

        srand ( m_data->m_randSeed );

        m_data->readDataset ( m_data->m_datasetName );
        m_data->mergeTrainAndTest();
        m_data->mixDataset();

        m_data->allocMemForCrossValidationSets();
        m_data->normalizeZeroOne();

        trainAlgorithm ( m_data->m_datasetPath + "/Autoencoder.dsc", m_data->m_datasetPath + "/" + string ( DSC_PATH ) + "/Autoencoder.dsc" );

        m_data->deleteMemory();
    }

    time_t totalTime = time ( 0 );
    cout<<"globalTrainingLoops:"<<m_data->m_globalTrainingLoops<<endl;
    for ( int globalLoop=0;globalLoop<m_data->m_globalTrainingLoops;globalLoop++ )
    {
        for ( int i=0;i<m_algorithmList.size();i++ )
        {
            srand ( m_data->m_randSeed );

            // read the dataset (through the Autoencoder if dimensionality reduction is enabled)
            if ( m_data->m_dimensionalityReduction == "Autoencoder" )
            {
                Autoencoder a;
                a.setDataPointers ( m_data );

                srand ( m_data->m_randSeed );

                a.readDataset ( m_data, m_data->m_datasetName );
            }
            else
                m_data->readDataset ( m_data->m_datasetName );

            m_data->enableBagging ( m_baggingRun );
            m_data->baggingRandomSeed ( m_randSeedBagBoost );

            // boosting: keep a copy of the original train set and resample it according to m_probs
            if ( m_boostingRun )
            {
                if ( m_probs == 0 )
                {
                    cout<<"Init bootstrap probabilities to 1/N"<<endl;
                    m_probs = new REAL[m_data->m_nTrain];
                    for ( int j=0;j<m_data->m_nTrain;j++ )
                        m_probs[j] = 1.0 / ( ( REAL ) m_data->m_nTrain );
                }
                if ( m_boostingTrain==0 )
                {
                    cout<<"Copy train set (features + targets) to boosting trainset"<<endl;
                    m_boostingNTrain = m_data->m_nTrain;
                    m_boostingTrain = new REAL[m_data->m_nTrain*m_data->m_nFeatures];
                    m_boostingTargets = new REAL[m_data->m_nTrain*m_data->m_nClass*m_data->m_nDomain];
                    memcpy ( m_boostingTrain, m_data->m_trainOrig, sizeof ( REAL ) *m_data->m_nTrain*m_data->m_nFeatures );
                    memcpy ( m_boostingTargets, m_data->m_trainTargetOrig, sizeof ( REAL ) *m_data->m_nTrain*m_data->m_nClass*m_data->m_nDomain );
                }

                if ( m_boostingEpoch > 0 )
                    m_data->doBootstrapSampling ( m_probs,m_data->m_trainOrig,m_data->m_trainTargetOrig,m_data->m_trainTargetOrigEffect,m_data->m_trainTargetOrigResidual,m_data->m_trainLabelOrig );
            }

            srand ( m_data->m_randSeed );

            // in the first global loop pass the preceding algorithms, afterwards all algorithms except the current one
            if ( globalLoop == 0 )
                m_data->setAlgorithmList ( vector<string> ( m_algorithmList.begin(), m_algorithmList.begin() +i ) );
            else
            {
                vector<string> tmp;
                for ( int j=0;j<m_algorithmList.size();j++ )
                    if ( j != i )
                        tmp.push_back ( m_algorithmList[j] );
                m_data->setAlgorithmList ( tmp );
            }

            time_t beginTime = time ( 0 );

            if ( m_data->m_enableCascadeLearning )
            {
                if ( m_data->m_validationType=="ValidationSet" )
                    assert ( false );
                m_data->fillCascadeLearningInputs();
                m_data->extendTrainDataWithCascadeInputs();
            }

            m_data->allocMemForCrossValidationSets();

            string fAlgoTemplateName = m_data->m_datasetPath + "/" + m_algorithmList[i];
            string fAlgoName = m_data->m_datasetPath + "/" + string ( DSC_PATH ) + "/" + m_algorithmList[i];

            if ( globalLoop > 0 )
                m_data->m_loadWeightsBeforeTraining = true;

            trainAlgorithm ( fAlgoTemplateName, fAlgoName );
            cout<<"Finished in "<<time ( 0 )-beginTime<<"[s]"<<endl;

            cout.setOutputFile ( "" );

            m_data->deleteMemory();
        }
    }

    cout<<"Total training time:"<<time ( 0 )-totalTime<<"[s]"<<endl;

    // optional: train a neural network blender on top of the full predictions
    if ( m_data->m_enablePostNNBlending )
    {
        m_data->setAlgorithmList ( vector<string> ( m_algorithmList.begin(), m_algorithmList.end() ) );

        srand ( m_data->m_randSeed );

        m_data->readDataset ( m_data->m_datasetName );
        srand ( m_data->m_randSeed );
        m_data->allocMemForCrossValidationSets();
        m_data->partitionDatasetToCrossValidationSets();

        m_data->readDscFile ( m_data->m_datasetPath + "/NeuralNetwork.dscblend" );

        BlendingNN nn;
        nn.setDataPointers ( m_data );
        nn.readSpecificMaps();
        nn.init();
        nn.train();

        m_data->deleteMemory();
    }
}

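/**
 * Blend the full predictions of all trained algorithms
 *
 * Either trains the neural-network blender (enablePostNNBlending=1) or
 * computes linear blending weights with BlendStopping and stores them.
 */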
void Scheduler::blend()
{
    Framework::setFrameworkMode ( 0 );

    cout<<"Start blending after training"<<endl;

    m_data->setAlgorithmList ( vector<string> ( m_algorithmList.begin(), m_algorithmList.end() ) );

    srand ( m_data->m_randSeed );

    cout<<"Fill data"<<endl;
    m_data->readDataset ( m_data->m_datasetName );
    srand ( m_data->m_randSeed );
    m_data->allocMemForCrossValidationSets();
    m_data->partitionDatasetToCrossValidationSets();

    if ( m_data->m_enablePostNNBlending )
    {
        m_data->readDscFile ( m_data->m_datasetPath + "/NeuralNetwork.dscblend" );

        BlendingNN nn;
        nn.setDataPointers ( m_data );
        nn.readSpecificMaps();
        nn.init();
        nn.train();
    }
    else
    {
        BlendStopping bb ( ( Algorithm* ) m_data, "" );
        bb.setRegularization ( m_data->m_blendingRegularization );
        double rmse = bb.calcBlending();
        cout<<"BLEND RMSE OF ACTUAL FULLPREDICTION PATH:"<<rmse<<endl;
        bb.saveBlendingWeights ( m_data->m_datasetPath + "/" + m_data->m_tempPath );
    }
}

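/**
 * Predict the test set with the trained ensemble
 *
 * Loads all trained models, runs the ensemble over every test sample, writes
 * the predictions as single-precision floats to a binary output file and
 * reports RMSE (and classification error for classification datasets).
 */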
void Scheduler::predict()
{
    Framework::setFrameworkMode ( 1 );

    preparePredictionMode();

    int progress = m_data->m_nTest / 100 + 1;
    double mean = 0.0, rmse = 0.0;

    // determine the output file name
    string fname;
    if ( m_data->m_datasetName=="NETFLIX" && Framework::getAdditionalStartupParameter() >= 0 )
    {
        cout<<"Dataset:NETFLIX, slot:"<<Framework::getAdditionalStartupParameter() <<" ";
        char buf[512];
        sprintf ( buf,"p%d",Framework::getAdditionalStartupParameter() );
        fname = string ( NETFLIX_SLOTDATA_ROOT_DIR ) + buf + "/testPrediction.data";
        cout<<"pName:"<<fname<<endl;
    }
    else if ( m_data->m_datasetName=="NETFLIX" && Framework::getAdditionalStartupParameter() < -100 )
    {
        char buf[512];
        sprintf ( buf,"ELFprediction%d",Framework::getRandomSeed() );
        string algos;
        for ( int i=0;i<m_algorithmList.size();i++ )
            algos += "_" + m_algorithmList[i];
        fname = m_data->m_datasetPath + "/" + m_data->m_tempPath + "/" + buf + algos + ".dat";
        cout<<"pName:"<<fname<<endl;
    }
    else
    {
        char nr[512];
        sprintf ( nr,"%d",rand() );
        fname = m_data->m_datasetPath + "/" + m_data->m_tempPath + "/testPrediction" + string ( nr ) + ".data";
    }

    fstream fOutput ( fname.c_str(),ios::out );

    REAL* ensembleOutput = new REAL[m_data->m_nClass*m_data->m_nDomain];

    int* wrongLabelCnt = new int[m_data->m_nDomain];
    for ( int i=0;i<m_data->m_nDomain;i++ )
        wrongLabelCnt[i] = 0;

    int nrFeat = m_data->m_nFeatures;

    m_outputVectorTmp = new REAL[m_data->m_nClass*m_data->m_nDomain];
    m_labelsTmp = new int[m_data->m_nClass*m_data->m_nDomain];

    // optional dimensionality reduction with a trained Autoencoder
    Autoencoder* autoEnc = 0;
    bool enableAutoencoder = false;
    REAL* autoencoderOutput = 0;
    if ( m_data->m_dimensionalityReduction == "Autoencoder" )
    {
        autoEnc = new Autoencoder();
        autoEnc->setDataPointers ( m_data );
        autoEnc->loadWeights();
        autoEnc->loadNormalizations();
        enableAutoencoder = true;
        autoencoderOutput = new REAL[autoEnc->m_nClass];
        m_data->m_nFeatures = autoEnc->m_nClass;
    }

    cout<<endl<<"predict(100 dots): "<<flush;
    time_t t0 = time ( 0 );

    for ( uint i=0;i<m_data->m_nTest;i++ )
    {
        if ( i % progress == 0 )
            cout<<"."<<flush;

        REAL* inputFeature = m_data->m_testOrig + i * ( uint ) nrFeat;

        if ( enableAutoencoder )
        {
            autoEnc->predictAllOutputs ( inputFeature, autoencoderOutput, 1, 0 );
            getEnsemblePrediction ( autoencoderOutput, ensembleOutput );
        }
        else
            getEnsemblePrediction ( inputFeature, ensembleOutput );

        // count wrong labels (classification datasets)
        if ( Framework::getDatasetType() )
        {
            for ( uint d=0;d<m_data->m_nDomain;d++ )
                if ( getIndexOfMax ( ensembleOutput + d * m_data->m_nClass, m_data->m_nClass ) != m_data->m_testLabelOrig[d+i* ( uint ) m_data->m_nDomain] )
                    wrongLabelCnt[d]++;
        }

        // accumulate RMSE and write the prediction as single-precision float
        for ( uint j=0;j<m_data->m_nClass*m_data->m_nDomain;j++ )
        {
            REAL target = m_data->m_testTargetOrig[i * ( uint ) m_data->m_nClass * ( uint ) m_data->m_nDomain + j];
            REAL prediction = ensembleOutput[j];
            rmse += ( prediction - target ) * ( prediction - target );
            mean += prediction;
            float predictionSP = prediction;
            fOutput.write ( ( char* ) &predictionSP, sizeof ( float ) );
        }
    }

    delete[] m_outputVectorTmp;
    delete[] m_labelsTmp;

    if ( Framework::getDatasetType() )
    {
        int nWrong = 0;
        for ( int d=0;d<m_data->m_nDomain;d++ )
        {
            nWrong += wrongLabelCnt[d];
            if ( m_data->m_nDomain > 1 )
                cout<<"["<< ( double ) wrongLabelCnt[d]/ ( double ) m_data->m_nTest<<"] ";
        }
        m_predictionClassificationError = 100.0* ( double ) nWrong/ ( ( double ) m_data->m_nTest* ( double ) m_data->m_nDomain );
        cout<<endl<<"Classification test error: "<<m_predictionClassificationError<<"%"<<endl;
    }

    m_predictionRMSE = sqrt ( rmse/ ( double ) ( ( uint ) m_data->m_nClass * ( uint ) m_data->m_nDomain * m_data->m_nTest ) );
    cout<<"RMSE test: "<<m_predictionRMSE<<endl;

    cout<<endl<<"Predictions are written to binary output file: "<<fname<<" ("<< ( uint ) ( ( uint ) m_data->m_nTest* ( uint ) m_data->m_nClass*m_data->m_nDomain*sizeof ( float ) );
    cout<<" Bytes, REAL="<< ( int ) sizeof ( float ) <<"Bytes, #elements:"<< ( uint ) m_data->m_nTest* ( uint ) m_data->m_nClass*m_data->m_nDomain<<") ";
    cout<<"[mean:"<<mean/ ( double ) ( ( uint ) m_data->m_nTest* ( uint ) m_data->m_nClass* ( uint ) m_data->m_nDomain ) <<"] )"<<endl;
    cout<<"Prediction time: "<<time ( 0 )-t0<<"[s]"<<endl<<endl;

    fOutput.close();

    delete[] wrongLabelCnt;
    if ( autoEnc )
        delete autoEnc;
    if ( autoencoderOutput )
        delete[] autoencoderOutput;

    if ( ensembleOutput )
        delete[] ensembleOutput;
    ensembleOutput = 0;

    endPredictionMode();
}

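/**
 * Bagging: train and predict several times on bootstrap samples
 *
 * The number of bagging epochs is taken from the additional startup
 * parameter. Every run stores its test prediction; the final prediction is
 * the mean over all runs, and RMSE / classification error are reported per
 * epoch and for the bagged mean.
 */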
void Scheduler::bagging()
{
    int epochs = Framework::getAdditionalStartupParameter();
    cout<<endl<<endl;
    cout<<"================================= Bagging ================================="<<endl;
    cout<<"epochs:"<<epochs<<endl<<endl<<endl;
    m_baggingRun = true;

    vector<string> baggingFileNames;
    uint testSize = 0;
    double rmseMean = 0.0, classErrMean = 0.0;

    for ( int e=0;e<epochs;e++ )
    {
        cout<<"e:"<<e<<endl;

        m_randSeedBagBoost = e + 1;

        // train and predict on a bootstrap sample
        train();
        predict();

        rmseMean += getPredictionRMSE();
        classErrMean += getClassificationError();

        // copy testPrediction.data to a per-epoch file testPrediction.data.<e>
        fstream fTest ( ( m_data->m_datasetPath+"/"+TMP_PATH+"/testPrediction.data" ).c_str(),ios::in );
        if ( fTest.is_open() ==false )
            assert ( false );
        char buf[512];
        sprintf ( buf,"%s.%d", ( m_data->m_datasetPath+"/"+TMP_PATH+"/testPrediction.data" ).c_str(),e );
        baggingFileNames.push_back ( buf );
        fstream fTmp ( buf,ios::out );

        fTest.seekg ( 0, ios::end );
        uint length = fTest.tellg();
        testSize = length/sizeof ( float );  // the prediction file stores single-precision floats
        fTest.seekg ( 0, ios::beg );

        char* buffer = new char[length];

        fTest.read ( buffer,length );
        fTest.close();

        fTmp.write ( buffer,length );
        delete[] buffer;

        fTmp.close();
    }

    // reload the dataset to get the test targets for evaluation
    srand ( m_data->m_randSeed );
    m_data->readDataset ( m_data->m_datasetName );

    testSize = m_data->m_nTest * m_data->m_nClass * m_data->m_nDomain;

    // accumulate the per-epoch predictions and report the bagged error after each epoch
    REAL* testMean = new REAL[testSize];
    for ( int i=0;i<testSize;i++ )
        testMean[i] = 0.0;
    for ( int e=0;e<epochs;e++ )
    {
        char nameBuf[512];
        sprintf ( nameBuf,"%s.%d", ( m_data->m_datasetPath+"/"+TMP_PATH+"/testPrediction.data" ).c_str(),e );
        fstream f ( nameBuf,ios::in );
        float* buf = new float[testSize];
        f.read ( ( char* ) buf,sizeof ( float ) *testSize );
        f.close();

        for ( int i=0;i<testSize;i++ )
            testMean[i] += buf[i];

        delete[] buf;

        double classErrBag = 0.0;
        double rmseBag = 0.0;

        for ( uint i=0;i<m_data->m_nTest;i++ )
        {
            REAL* ensembleOutput = testMean + i * m_data->m_nClass*m_data->m_nDomain;
            REAL* ensembleOutputNorm = new REAL[m_data->m_nClass*m_data->m_nDomain];
            for ( int j=0;j<m_data->m_nClass*m_data->m_nDomain;j++ )
                ensembleOutputNorm[j] = ensembleOutput[j] / ( ( double ) e+1.0 );

            if ( Framework::getDatasetType() )
            {
                for ( int d=0;d<m_data->m_nDomain;d++ )
                    if ( getIndexOfMax ( ensembleOutputNorm + d * m_data->m_nClass, m_data->m_nClass ) != m_data->m_testLabelOrig[d+i*m_data->m_nDomain] )
                        classErrBag += 1.0;
            }

            for ( int j=0;j<m_data->m_nClass*m_data->m_nDomain;j++ )
            {
                REAL target = m_data->m_testTargetOrig[i * m_data->m_nClass * m_data->m_nDomain + j];
                REAL prediction = ensembleOutputNorm[j];
                rmseBag += ( prediction - target ) * ( prediction - target );
            }

            delete[] ensembleOutputNorm;
        }

        if ( Framework::getDatasetType() )
            classErrBag = 100.0*classErrBag/ ( ( double ) m_data->m_nTest* ( double ) m_data->m_nDomain );
        rmseBag = sqrt ( rmseBag/ ( double ) ( m_data->m_nClass*m_data->m_nDomain*m_data->m_nTest ) );
        cout<<e<<": "<<"RMSE:"<<rmseBag<<" classErr:"<<classErrBag<<endl;
    }

    // final bagged prediction = mean over all epochs
    for ( int i=0;i<testSize;i++ )
        testMean[i] /= ( REAL ) epochs;

    // write the bagged prediction back as single-precision floats (same format as predict())
    fstream fTest ( ( m_data->m_datasetPath+"/"+TMP_PATH+"/testPrediction.data" ).c_str(),ios::out );
    if ( fTest.is_open() ==false )
        assert ( false );
    float* testMeanSP = new float[testSize];
    for ( int i=0;i<testSize;i++ )
        testMeanSP[i] = testMean[i];
    fTest.write ( ( char* ) testMeanSP,sizeof ( float ) *testSize );
    delete[] testMeanSP;
    fTest.close();

    // evaluate the bagged prediction
    double classErrBag = 0.0;
    double rmseBag = 0.0;

    for ( uint i=0;i<m_data->m_nTest;i++ )
    {
        REAL* ensembleOutput = testMean + i * m_data->m_nClass*m_data->m_nDomain;

        if ( Framework::getDatasetType() )
        {
            for ( int d=0;d<m_data->m_nDomain;d++ )
                if ( getIndexOfMax ( ensembleOutput + d * m_data->m_nClass, m_data->m_nClass ) != m_data->m_testLabelOrig[d+i*m_data->m_nDomain] )
                    classErrBag += 1.0;
        }

        for ( int j=0;j<m_data->m_nClass*m_data->m_nDomain;j++ )
        {
            REAL target = m_data->m_testTargetOrig[i * m_data->m_nClass * m_data->m_nDomain + j];
            REAL prediction = ensembleOutput[j];
            rmseBag += ( prediction - target ) * ( prediction - target );
        }
    }

    if ( Framework::getDatasetType() )
        classErrBag = 100.0*classErrBag/ ( ( double ) m_data->m_nTest* ( double ) m_data->m_nDomain );
    rmseBag = sqrt ( rmseBag/ ( double ) ( m_data->m_nClass * m_data->m_nDomain * m_data->m_nTest ) );

    m_predictionRMSE = rmseBag;
    m_predictionClassificationError = classErrBag;

    cout<<endl;
    cout<<epochs<<" runs"<<endl;
    cout<<"Bagging runs (with bootstrap sample): rmseMean:"<<rmseMean/ ( double ) epochs<<" classErrMean:"<<classErrMean/ ( double ) epochs<<endl;
    cout<<"Bagged (mean) : rmse :"<<rmseBag<<" classErr :"<<classErrBag<<endl<<endl;

    delete[] testMean;
}

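/**
 * Boosting: train a chain of ensembles on reweighted bootstrap samples
 *
 * After every epoch the per-sample pseudo-loss on the boosting train set is
 * used to update the bootstrap probabilities m_probs and the epoch weight
 * beta (AdaBoost-style update). The final test prediction is the weighted
 * combination of the per-epoch predictions, written back to
 * testPrediction.data.
 */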
void Scheduler::boosting()
{
    int epochs = Framework::getAdditionalStartupParameter();
    cout<<endl<<endl;
    cout<<"================================= Boosting ================================="<<endl;
    cout<<"epochs:"<<epochs<<endl<<endl<<endl;
    m_boostingRun = true;

    vector<string> boostingFileNames;
    uint testSize = 0;
    double rmseMean = 0.0, classErrMean = 0.0;
    REAL* beta = new REAL[epochs];
    for ( m_boostingEpoch=0;m_boostingEpoch<epochs;m_boostingEpoch++ )
    {
        cout<<"e:"<<m_boostingEpoch<<endl;

        m_randSeedBagBoost = m_boostingEpoch;

        // train and predict with the current sample weights
        train();
        predict();

        // dump the current bootstrap probabilities (debug output)
        fstream f ( "A.txt",ios::out );
        for ( int i=0;i<m_boostingNTrain;i++ )
            f<<m_probs[i]<<endl;
        f.close();

        rmseMean += getPredictionRMSE();
        classErrMean += getClassificationError();

        // copy testPrediction.data to a per-epoch file testPrediction.data.<epoch>
        fstream fTest ( ( m_data->m_datasetPath+"/"+TMP_PATH+"/testPrediction.data" ).c_str(),ios::in );
        if ( fTest.is_open() ==false )
            assert ( false );
        char buf[512];
        sprintf ( buf,"%s.%d", ( m_data->m_datasetPath+"/"+TMP_PATH+"/testPrediction.data" ).c_str(),m_boostingEpoch );
        boostingFileNames.push_back ( buf );
        fstream fTmp ( buf,ios::out );

        fTest.seekg ( 0, ios::end );
        uint length = fTest.tellg();
        testSize = length/sizeof ( float );
        fTest.seekg ( 0, ios::beg );

        char* buffer = new char [length];

        fTest.read ( buffer,length );
        fTest.close();

        fTmp.write ( buffer,length );
        delete[] buffer;

        fTmp.close();

        // evaluate the current ensemble on the boosting train set to update the sample weights
        double rmseBoost = 0.0, epsilon = 0.0, rmseTrain = 0.0;
        REAL min = m_data->m_negativeTarget, max = m_data->m_positiveTarget;
        Framework::setFrameworkMode ( 1 );

        preparePredictionMode();
        REAL* ensembleOutput = new REAL[m_data->m_nClass*m_data->m_nDomain];
        REAL* loss = new REAL[m_boostingNTrain];

        int nOut = m_data->m_nClass*m_data->m_nDomain;
        for ( int i=0;i<m_boostingNTrain;i++ )
        {
            REAL* inputFeature = m_boostingTrain + i * m_data->m_nFeatures;
            getEnsemblePrediction ( inputFeature, ensembleOutput );

            // pseudo-loss per domain: compare the prediction of the true class against all other classes
            REAL err = 0.0, err2 = 0.0;
            for ( int d=0;d<m_data->m_nDomain;d++ )
            {
                int indMax = -1;
                REAL maxTarget = -1e10;
                for ( int k=0;k<m_data->m_nClass;k++ )
                    if ( maxTarget < m_boostingTargets[i * nOut + m_data->m_nClass*d + k] )
                    {
                        maxTarget = m_boostingTargets[i * nOut + m_data->m_nClass*d + k];
                        indMax = k;
                    }
                if ( indMax == -1 )
                    assert ( false );
                for ( int k=0;k<m_data->m_nClass;k++ )
                {
                    if ( indMax != k )
                    {
                        REAL predictionTarget = ensembleOutput[m_data->m_nClass*d + indMax];
                        REAL prediction = ensembleOutput[m_data->m_nClass*d + k];

                        err += 1.0 - ( predictionTarget-min ) / ( max-min ) + ( prediction-min ) / ( max-min );
                        err2 += 1.0 + ( predictionTarget-min ) / ( max-min ) - ( prediction-min ) / ( max-min );
                    }
                }
            }

            // accumulate the train RMSE once per sample
            for ( int d=0;d<m_data->m_nDomain;d++ )
                for ( int k=0;k<m_data->m_nClass;k++ )
                {
                    REAL out = ensembleOutput[m_data->m_nClass*d + k];
                    REAL target = m_boostingTargets[i * nOut + m_data->m_nClass*d + k];
                    rmseTrain += ( out-target ) * ( out-target );
                }

            epsilon += m_probs[i] * err / ( REAL ) ( m_data->m_nClass-1 );
            loss[i] = err2 / ( REAL ) ( m_data->m_nClass-1 );
        }
        rmseTrain = sqrt ( rmseTrain/ ( double ) ( m_boostingNTrain*m_data->m_nClass*m_data->m_nDomain ) );
        cout<<"rmseTrain(boosting):"<<rmseTrain<<endl;
        epsilon *= 0.5;
        beta[m_boostingEpoch] = epsilon / ( 1.0 - epsilon );

        // reweight and renormalize the bootstrap probabilities
        for ( int i=0;i<m_boostingNTrain;i++ )
            m_probs[i] *= pow ( beta[m_boostingEpoch], 0.5 * loss[i] );
        double sum = 0.0;
        for ( int i=0;i<m_boostingNTrain;i++ )
            sum += m_probs[i];

        for ( int i=0;i<m_boostingNTrain;i++ )
            m_probs[i] /= sum;

        delete[] loss;
        delete[] ensembleOutput;

        endPredictionMode();
    }

    // combine the per-epoch predictions, weighted by log10(1/beta[e])
    srand ( m_data->m_randSeed );
    m_data->readDataset ( m_data->m_datasetName );

    cout<<endl<<endl<<"#test values:"<<testSize<<" (dataset size:"<<m_data->m_nTest<<")"<<endl;
    REAL* testMean = new REAL[testSize];
    for ( int i=0;i<testSize;i++ )
        testMean[i] = 0.0;
    for ( int e=0;e<epochs;e++ )
    {
        cout<<"Cascade layer "<<e<<": weight:"<<log10 ( 1.0/beta[e] ) <<" "<<flush;
        fstream f ( boostingFileNames[e].c_str(),ios::in );
        if ( f.is_open() == false )
            assert ( false );
        float* buf = new float[testSize];
        f.read ( ( char* ) buf,sizeof ( float ) *testSize );
        f.close();

        for ( int i=0;i<testSize;i++ )
        {
            REAL w = log10 ( 1.0/beta[e] );
            testMean[i] += w*buf[i];
        }
        delete[] buf;

        // report the error of the combination up to epoch e:
        // rmse : unnormalized weighted sum, rmse0: normalized by the number of
        // epochs, rmse1: normalized by the sum of the epoch weights
        double classErrBoostingPerEpoch = 0.0;
        double rmseBoostingPerEpoch = 0.0;
        double rmseBoostingPerEpoch0 = 0.0;
        double rmseBoostingPerEpoch1 = 0.0;
        for ( int i=0;i<m_data->m_nTest;i++ )
        {
            REAL* ensembleOutput = testMean + i * m_data->m_nClass*m_data->m_nDomain;
            REAL* ensembleOutputNorm0 = new REAL[m_data->m_nClass*m_data->m_nDomain];
            REAL* ensembleOutputNorm1 = new REAL[m_data->m_nClass*m_data->m_nDomain];

            REAL norm0 = 0.0;
            for ( int j=0;j<=e;j++ )
                norm0 += log10 ( 1.0/beta[j] );  // sum of the epoch weights up to e
            for ( int j=0;j<m_data->m_nClass*m_data->m_nDomain;j++ )
            {
                ensembleOutputNorm0[j] = ensembleOutput[j]/ ( REAL ) ( e+1 );
                ensembleOutputNorm1[j] = ensembleOutput[j]/norm0;
            }

            if ( Framework::getDatasetType() )
            {
                for ( int d=0;d<m_data->m_nDomain;d++ )
                    if ( getIndexOfMax ( ensembleOutput + d * m_data->m_nClass, m_data->m_nClass ) != m_data->m_testLabelOrig[d+i*m_data->m_nDomain] )
                        classErrBoostingPerEpoch += 1.0;
            }

            for ( int j=0;j<m_data->m_nClass*m_data->m_nDomain;j++ )
            {
                REAL target = m_data->m_testTargetOrig[i * m_data->m_nClass * m_data->m_nDomain + j];
                REAL prediction = ensembleOutput[j];
                rmseBoostingPerEpoch += ( prediction - target ) * ( prediction - target );

                prediction = ensembleOutputNorm0[j];
                rmseBoostingPerEpoch0 += ( prediction - target ) * ( prediction - target );

                prediction = ensembleOutputNorm1[j];
                rmseBoostingPerEpoch1 += ( prediction - target ) * ( prediction - target );
            }

            delete[] ensembleOutputNorm0;
            delete[] ensembleOutputNorm1;
        }

        if ( Framework::getDatasetType() )
            classErrBoostingPerEpoch = 100.0*classErrBoostingPerEpoch/ ( ( double ) m_data->m_nTest* ( double ) m_data->m_nDomain );
        rmseBoostingPerEpoch = sqrt ( rmseBoostingPerEpoch/ ( double ) ( m_data->m_nClass * m_data->m_nDomain * m_data->m_nTest ) );
        rmseBoostingPerEpoch0 = sqrt ( rmseBoostingPerEpoch0/ ( double ) ( m_data->m_nClass * m_data->m_nDomain * m_data->m_nTest ) );
        rmseBoostingPerEpoch1 = sqrt ( rmseBoostingPerEpoch1/ ( double ) ( m_data->m_nClass * m_data->m_nDomain * m_data->m_nTest ) );
        cout<<"Boosting: rmse:"<<rmseBoostingPerEpoch<<" rmse0:"<<rmseBoostingPerEpoch0<<" rmse1:"<<rmseBoostingPerEpoch1<<" classErr:"<<classErrBoostingPerEpoch<<"%"<<endl;
    }

    // final boosted prediction: mean of the weighted per-epoch sums
    for ( int i=0;i<testSize;i++ )
        testMean[i] /= ( REAL ) epochs;

    // write the final prediction back as single-precision floats (same format as predict())
    fstream fTest ( ( m_data->m_datasetPath+"/"+TMP_PATH+"/testPrediction.data" ).c_str(),ios::out );
    if ( fTest.is_open() ==false )
        assert ( false );
    float* testMeanSP = new float[testSize];
    for ( int i=0;i<testSize;i++ )
        testMeanSP[i] = testMean[i];
    fTest.write ( ( char* ) testMeanSP,sizeof ( float ) *testSize );
    delete[] testMeanSP;
    fTest.close();

    // evaluate the final boosted prediction
    double classErrBoosting = 0.0;
    double rmseBoosting = 0.0;

    for ( int i=0;i<m_data->m_nTest;i++ )
    {
        REAL* ensembleOutput = testMean + i * m_data->m_nClass*m_data->m_nDomain;

        if ( Framework::getDatasetType() )
        {
            for ( int d=0;d<m_data->m_nDomain;d++ )
                if ( getIndexOfMax ( ensembleOutput + d * m_data->m_nClass, m_data->m_nClass ) != m_data->m_testLabelOrig[d+i*m_data->m_nDomain] )
                    classErrBoosting += 1.0;
        }

        for ( int j=0;j<m_data->m_nClass*m_data->m_nDomain;j++ )
        {
            REAL target = m_data->m_testTargetOrig[i * m_data->m_nClass * m_data->m_nDomain + j];
            REAL prediction = ensembleOutput[j];
            rmseBoosting += ( prediction - target ) * ( prediction - target );
        }
    }

    if ( Framework::getDatasetType() )
        classErrBoosting = 100.0*classErrBoosting/ ( ( double ) m_data->m_nTest* ( double ) m_data->m_nDomain );
    rmseBoosting = sqrt ( rmseBoosting/ ( double ) ( m_data->m_nClass * m_data->m_nDomain * m_data->m_nTest ) );

    m_predictionRMSE = rmseBoosting;
    m_predictionClassificationError = classErrBoosting;

    cout<<endl;
    cout<<epochs<<" runs"<<endl;
    cout<<"Boosting runs (mean bootstrap sample): rmseMean:"<<rmseMean/ ( double ) epochs<<" classErrMean:"<<classErrMean/ ( double ) epochs<<"%"<<endl;
    cout<<"Boosting (mean) : rmse :"<<rmseBoosting<<" classErr :"<<classErrBoosting<<"%"<<endl<<endl;

    delete[] beta;
    delete[] testMean;
}

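/**
 * Check an algorithm template (.dsc) file
 *
 * The first line must be ALGORITHM=<name> and the second line ID=<number>;
 * both values are returned through the reference parameters.
 *
 * @param fname Template file name
 * @param algoName Returns the algorithm name
 * @param id Returns the algorithm ID
 */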
void Scheduler::checkAlgorithmTemplate ( string fname, string &algoName, string &id )
{
    fstream f ( fname.c_str(), ios::in );
    if ( f.is_open() == false )
        assert ( false );
    string firstLine, secondLine, thirdLine;
    f>>firstLine;
    f>>secondLine;
    f>>thirdLine;
    f.close();
    int pos = firstLine.find ( "=" );
    string name = firstLine.substr ( 0, pos );
    algoName = firstLine.substr ( pos+1 );
    if ( name != "ALGORITHM" )
    {
        cout<<"Wrong dsc file, no ALGORITHM=.. found in first line"<<endl;
        exit ( 0 );
    }
    pos = secondLine.find ( "=" );
    name = secondLine.substr ( 0, pos );
    id = secondLine.substr ( pos+1 );
    if ( name != "ID" )
    {
        cout<<"Wrong dsc file, no ID=.. found in second line"<<endl;
        exit ( 0 );
    }
}

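/**
 * Train one algorithm from its template .dsc file
 *
 * Copies the template to the trained .dsc file, redirects the log output,
 * partitions the data into cross-validation sets and trains (or runs feature
 * selection for) the algorithm created by the dispatcher.
 *
 * @param fnameTemplate Template .dsc file
 * @param fnameDsc Output .dsc file of the trained algorithm
 */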
void Scheduler::trainAlgorithm ( string fnameTemplate, string fnameDsc )
{
    cout<<"Train algorithm:"<<fnameTemplate<<endl;

    string algoName, id;
    checkAlgorithmTemplate ( fnameTemplate, algoName, id );

    m_data->readDscFile ( fnameTemplate );
    if ( m_data->m_disableTraining )
    {
        cout<<"Training disabled."<<endl;
        return;
    }

    // copy the template .dsc file to the output .dsc file
    fstream fAlgoTemplate ( fnameTemplate.c_str(), ios::in );
    fstream fAlgo ( fnameDsc.c_str(), ios::out );
    cout<<"AlgoTemplate:"<<fnameTemplate<<" Algo:"<<fnameDsc<<endl;
    char buf[1024];
    while ( fAlgoTemplate.getline ( buf, 1024 ) )
    {
        string line = string ( buf );
        fAlgo<<line<<endl;
    }
    fAlgoTemplate.close();
    fAlgo.close();

    // append all further log output to the .dsc file
    cout.setOutputFile ( fnameDsc );

    cout<<"Floating point precision: "<< ( int ) sizeof ( REAL ) <<" Bytes"<<endl;

    m_data->partitionDatasetToCrossValidationSets();

    Algorithm* algo = 0;
    algorithmDispatcher ( algo, algoName );
    algo->setDataPointers ( m_data );

    if ( m_data->m_enableFeatureSelection )
    {
        algo->doFeatureSelection();
        exit ( 0 );
    }
    else
        algo->train();

    if ( algo )
    {
        cout<<"delete algo"<<endl;
        delete algo;
    }
    algo = 0;
    cout<<"Finished train algorithm:"<<fnameTemplate<<endl;
}

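/**
 * Look up the algorithm ID that produced a given full-prediction file
 *
 * @param fullPredictor Full-prediction file name ("" returns 0)
 * @return The ID of the algorithm that wrote this full-prediction file
 */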
int Scheduler::getIDFromFullPredictor ( string fullPredictor )
{
    if ( fullPredictor=="" )
        return 0;
    for ( int i=0;i<m_algorithmObjectList.size();i++ )
        if ( m_algorithmObjectList[i]->m_stringMap["fullPrediction"] == fullPredictor )
            return m_algorithmObjectList[i]->m_algorithmID;
    cout<<"Error, this fullPredictor was not found:"<<fullPredictor<<endl;
    assert ( false );
    return -1;  // not reached
}

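/**
 * Compute the ensemble prediction for one input feature vector
 *
 * Every algorithm predicts on the normalized input (extended with the
 * predictions of its predecessors when cascade learning is enabled); the
 * per-algorithm outputs are then combined by the linear blender or the
 * neural-network blender.
 *
 * @param input Pointer to the raw input features
 * @param output Pointer to the ensemble output (nClass*nDomain values)
 */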
void Scheduler::getEnsemblePrediction ( REAL* input, REAL* output )
{
    int N = m_algorithmList.size();

    // scratch buffer for the normalized input; with cascade learning it also holds the
    // normalized nClass*nDomain outputs of every predecessor algorithm
    REAL* tmp = new REAL[m_data->m_nFeatures + N*m_data->m_nClass*m_data->m_nDomain];

    for ( int i=0;i<N;i++ )
    {
        // resolve the effect (residual) input of this algorithm
        int ID = m_effectID[i];
        REAL* effect = m_noEffect;
        REAL* outputVector = m_outputs[i+1];
        if ( ID != 0 )
        {
            if ( ID < 0 || ID > i )
                assert ( false );
            effect = m_outputs[ID];
        }

        // normalize the input (and append predecessor outputs for cascade learning)
        if ( m_data->m_enableCascadeLearning )
        {
            int nF = m_data->m_nFeatures;
            int nFAlgo = m_algorithmObjectList[i]->m_nFeatures;

            for ( int j=0;j<nF;j++ )
                tmp[j] = ( input[j] - m_data->m_mean[j] ) / m_data->m_std[j];

            for ( int j=0;j<i;j++ )
            {
                REAL* previousOutputVector = m_outputs[j+1];
                int nOut = m_data->m_nClass*m_data->m_nDomain;
                for ( int k=0;k<nOut;k++ )
                    tmp[nF+j*nOut+k] = ( previousOutputVector[k] - m_data->m_mean[nF+j*nOut+k] ) / m_data->m_std[nF+j*nOut+k];
            }
        }
        else
        {
            for ( int j=0;j<m_data->m_nFeatures;j++ )
                tmp[j] = ( input[j] - m_data->m_mean[j] ) / m_data->m_std[j];
        }

        if ( m_data->m_validationType=="Retraining" || m_data->m_validationType=="ValidationSet" )
            m_algorithmObjectList[i]->predictMultipleOutputs ( tmp, effect, outputVector, m_labelsPredict, 1, m_data->m_nCross );
        else if ( m_data->m_validationType=="CrossFoldMean" || m_data->m_validationType=="Bagging" )
        {
            // average the predictions of all cross-validation models
            for ( int j=0;j<m_data->m_nClass*m_data->m_nDomain;j++ )
                outputVector[j] = 0.0;
            for ( int j=0;j<m_data->m_nCross;j++ )
            {
                m_algorithmObjectListList[i][j]->predictMultipleOutputs ( tmp, effect, m_outputVectorTmp, m_labelsTmp, 1, j );
                for ( int k=0;k<m_data->m_nClass*m_data->m_nDomain;k++ )
                    outputVector[k] += m_outputVectorTmp[k];
            }
            for ( int j=0;j<m_data->m_nClass*m_data->m_nDomain;j++ )
                outputVector[j] /= ( REAL ) m_data->m_nCross;

            if ( Framework::getDatasetType() )
            {
                // predicted label = argmax over the averaged class outputs
                for ( int d=0;d<m_data->m_nDomain;d++ )
                {
                    int indMax = -1;
                    REAL max = -1e10;
                    for ( int j=0;j<m_data->m_nClass;j++ )
                    {
                        if ( max < outputVector[d*m_data->m_nClass+j] )
                        {
                            max = outputVector[d*m_data->m_nClass+j];
                            indMax = j;
                        }
                    }
                    m_labelsPredict[d] = indMax;
                }
            }
        }
        else
            assert ( false );
    }

    delete[] tmp;

    // combine all algorithm outputs with the blender
    if ( m_data->m_enablePostNNBlending )
        m_blenderNN->predictEnsembleOutput ( m_outputs, output );
    else
        m_blender->predictEnsembleOutput ( m_outputs, output );
}

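/**
 * Load everything needed to run the ensemble in prediction mode
 *
 * Reads the dataset and normalization, puts every algorithm of the master
 * .dsc file into prediction mode, loads the blending weights (linear or
 * neural network) and allocates the per-algorithm output buffers.
 */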
void Scheduler::preparePredictionMode()
{
    cout<<"Start scheduled prediction"<<endl;

    srand ( m_data->m_randSeed );

    m_data->readDataset ( m_data->m_datasetName );
    srand ( m_data->m_randSeed );

    if ( m_data->m_validationType=="ValidationSet" )
        m_data->m_nCross = 0;

    int N = m_algorithmList.size();

    if ( m_data->m_enableCascadeLearning )
        m_data->loadNormalization ( N-1 );
    else
        m_data->loadNormalization();

    // put every algorithm from the master .dsc file into prediction mode
    m_algorithmIDList.clear();
    for ( int i=0;i<N;i++ )
    {
        string fAlgoTemplateName = m_data->m_datasetPath + "/" + m_algorithmList[i];
        setPredictionModeInAlgorithm ( fAlgoTemplateName );
    }

    // load the neural-network blender if enabled
    if ( m_data->m_enablePostNNBlending )
    {
        m_blenderNN = new BlendingNN();
        m_blenderNN->setDataPointers ( m_data );
        m_blenderNN->readDscFile ( m_data->m_datasetPath + "/NeuralNetwork.dscblend" );
        m_blenderNN->readSpecificMaps();
        m_blenderNN->loadWeights();
    }

    // load the linear blending weights
    m_blender = new BlendStopping ( ( Algorithm* ) m_data );
    m_blender->loadBlendingWeights ( m_data->m_datasetPath + "/" + m_data->m_tempPath, N+1 );

    for ( int i=0;i<N;i++ )
        cout<<"ALGO FROM MASTER DSC-FILE:"<<m_algorithmObjectList[i]->m_stringMap["fullPrediction"]<<endl;

    m_blender->printWeights();

    int nClass = m_data->m_nClass;
    int nDomain = m_data->m_nDomain;

    // resolve the effect (residual) IDs of all algorithms
    m_effectID = new int[N];
    for ( int i=0;i<N;i++ )
        m_effectID[i] = getIDFromFullPredictor ( m_algorithmObjectList[i]->m_trainOnFullPredictorFile );

    // allocate per-algorithm output and effect buffers
    m_noEffect = new REAL[nClass*nDomain];
    for ( int i=0;i<nClass*nDomain;i++ )
        m_noEffect[i] = 0.0;
    m_outputs = new REAL*[N+1];
    m_effects = new REAL*[N+1];
    for ( int i=0;i<N+1;i++ )
    {
        m_outputs[i] = new REAL[nClass*nDomain];
        m_effects[i] = new REAL[nClass*nDomain];
        for ( int j=0;j<nClass*nDomain;j++ )
        {
            m_outputs[i][j] = 1.0;
            m_effects[i][j] = 0.0;
        }
    }

    m_labelsPredict = new int[m_data->m_nDomain];
}

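/**
 * Free all memory allocated in preparePredictionMode()
 */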
void Scheduler::endPredictionMode()
{
    cout<<"End scheduled prediction"<<endl;
    m_data->deleteMemory();

    for ( int i=0;i<m_algorithmObjectList.size();i++ )
        delete m_algorithmObjectList[i];
    m_algorithmObjectList.clear();

    if ( m_data->m_enablePostNNBlending )
        delete m_blenderNN;
    delete m_blender;
    delete[] m_effectID;
    int N = m_algorithmList.size();
    for ( int i=0;i<N+1;i++ )
    {
        delete[] m_outputs[i];
        delete[] m_effects[i];
    }
    delete[] m_noEffect;
    delete[] m_outputs;
    delete[] m_effects;
    delete[] m_labelsPredict;
}

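/**
 * Return the index of the largest element in a vector
 *
 * @param vector Pointer to the values
 * @param length Number of values
 * @return Index of the maximum
 */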
int Scheduler::getIndexOfMax ( REAL* vector, int length )
{
    int indMax = -1;
    REAL max = -1e10;
    for ( int i=0;i<length;i++ )
    {
        if ( max < vector[i] )
        {
            max = vector[i];
            indMax = i;
        }
    }

    return indMax;
}

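/**
 * Put one algorithm (described by its template .dsc file) into prediction mode
 *
 * For Retraining/ValidationSet one model per algorithm is loaded, for
 * CrossFoldMean/Bagging all nCross cross-validation models are loaded. The
 * algorithm ID must be unique.
 *
 * @param fname Template .dsc file name
 */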
void Scheduler::setPredictionModeInAlgorithm ( string fname )
{
    cout<<"Prediction mode in algorithm:"<<fname<<endl;

    string algoName, id;
    checkAlgorithmTemplate ( fname, algoName, id );

    m_data->readDscFile ( fname );

    if ( m_data->m_validationType=="Retraining" || m_data->m_validationType=="ValidationSet" )
    {
        Algorithm* algo = 0;
        algorithmDispatcher ( algo, algoName );
        algo->setDataPointers ( m_data );
        algo->setPredictionMode ( m_data->m_nCross );

        m_algorithmObjectList.push_back ( algo );
    }
    else if ( m_data->m_validationType=="CrossFoldMean" || m_data->m_validationType=="Bagging" )
    {
        cout<<"Make "<<m_data->m_nCross<<" models ready to predict"<<endl;
        Algorithm** algoList = new Algorithm*[m_data->m_nCross];
        for ( int i=0;i<m_data->m_nCross;i++ )
        {
            Algorithm* algo = 0;
            algorithmDispatcher ( algo, algoName );
            algo->setDataPointers ( m_data );
            algo->setPredictionMode ( i );
            algoList[i] = algo;
        }
        m_algorithmObjectListList.push_back ( algoList );
        m_algorithmObjectList.push_back ( algoList[0] );
    }
    else
        assert ( false );

    // algorithm IDs must be unique
    for ( int i=0;i<m_algorithmIDList.size();i++ )
        if ( m_algorithmIDList[i] == atoi ( id.c_str() ) )
        {
            cout<<"ID:"<<id<<" in "<<algoName<<" already exists"<<endl;
            assert ( false );
        }

    m_algorithmIDList.push_back ( atoi ( id.c_str() ) );

    m_algorithmNameList.push_back ( algoName );

    cout<<endl;
}

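/**
 * Create an algorithm object from its name
 *
 * @param algo Returns the new algorithm object (caller takes ownership)
 * @param name Algorithm name as given in the template .dsc file
 */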
void Scheduler::algorithmDispatcher ( Algorithm* &algo, string name )
{
    if ( name == "LinearModel" )
        algo = new LinearModel();
    else if ( name == "KNearestNeighbor" )
        algo = new KNearestNeighbor();
    else if ( name == "NeuralNetwork" )
        algo = new NeuralNetwork();
    else if ( name == "PolynomialRegression" )
        algo = new PolynomialRegression();
    else if ( name == "LinearModelNonNeg" )
        algo = new LinearModelNonNeg();
    else if ( name == "KernelRidgeRegression" )
        algo = new KernelRidgeRegression();
    else if ( name == "NeuralNetworkRBMauto" )
        algo = new NeuralNetworkRBMauto();
    else if ( name == "Autoencoder" )
        algo = new Autoencoder();
    else if ( name == "GBDT" )
        algo = new GBDT();
    else if ( name == "LogisticRegression" )
        algo = new LogisticRegression();
    else
        assert ( false );
}

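/**
 * Generate the content of a master .dsc file with default settings
 *
 * @param dataset Dataset name
 * @param isClass Classification (1) or regression (0) dataset
 * @param algos List of algorithm template file names
 * @param rSeed Random seed
 * @param blendAlgo Name of the blending algorithm
 * @param cascade Enable cascade learning
 * @return The master .dsc file content as a string
 */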
string Scheduler::masterDscTemplateGenerator ( string dataset, bool isClass, vector<string> algos, int rSeed, string blendAlgo, bool cascade )
{
    stringstream s;
    s<<"dataset="<<dataset<<endl;
    s<<"isClassificationDataset="<<isClass<<endl;
    s<<"maxThreads=2"<<endl;
    s<<"maxThreadsInCross=2"<<endl;
    s<<"nCrossValidation=6"<<endl;
    s<<"validationType=Retraining"<<endl;
    s<<"positiveTarget=1.0"<<endl;
    s<<"negativeTarget=-1.0"<<endl;
    s<<"randomSeed="<<rSeed<<endl;
    s<<"nMixDataset=20"<<endl;
    s<<"nMixTrainList=100"<<endl;
    s<<"standardDeviationMin=0.01"<<endl;
    s<<"blendingRegularization=1e-4"<<endl;
    s<<"blendingEnableCrossValidation=0"<<endl;
    s<<"blendingAlgorithm="<<blendAlgo<<endl;
    s<<"enablePostNNBlending=0"<<endl;
    s<<"enableCascadeLearning="<<cascade<<endl;
    s<<"enableGlobalMeanStdEstimate=0"<<endl;
    s<<"enableSaveMemory=1"<<endl;
    s<<"addOutputNoise=0"<<endl;
    s<<"enablePostBlendClipping=0"<<endl;
    s<<"enableFeatureSelection=0"<<endl;
    s<<"featureSelectionWriteBinaryDataset=0"<<endl;
    s<<"enableGlobalBlendingWeights=1"<<endl;
    s<<"errorFunction=RMSE"<<endl;
    s<<"disableWriteDscFile=0"<<endl;
    s<<"enableStaticNormalization=0"<<endl;
    s<<"staticMeanNormalization=0.0"<<endl;
    s<<"staticStdNormalization=1.0"<<endl;
    s<<"enableProbablisticNormalization=0"<<endl;
    s<<"dimensionalityReduction=no"<<endl;
    s<<"subsampleTrainSet=1.0"<<endl;
    s<<"subsampleFeatures=1.0"<<endl;
    s<<"globalTrainingLoops=1"<<endl;
    s<<"addConstantInput=0"<<endl;
    s<<endl;
    s<<"[ALGORITHMS]"<<endl;
    for ( int i=0;i<algos.size();i++ )
        s<<algos[i]<<endl;

    return s.str();
}

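/**
 * @return The test RMSE of the last call to predict(), bagging() or boosting()
 */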
REAL Scheduler::getPredictionRMSE()
{
    return m_predictionRMSE;
}

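/**
 * @return The test classification error [%] of the last call to predict(), bagging() or boosting()
 */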
REAL Scheduler::getClassificationError()
{
    return m_predictionClassificationError;
}