#include <Scheduler.h>
Public Member Functions | |
Scheduler () | |
~Scheduler () | |
void | readMasterDscFile (string path, string masterName) |
void | train () |
void | predict () |
void | blend () |
void | bagging () |
void | boosting () |
REAL | getPredictionRMSE () |
REAL | getClassificationError () |
Static Public Member Functions | |
static string | masterDscTemplateGenerator (string dataset, bool isClass, vector< string > algos, int rSeed, string blendAlgo, bool cascade) |
Private Member Functions | |
void | trainAlgorithm (string fnameTemplate, string fnameDsc) |
void | checkAlgorithmTemplate (string fname, string &algoName, string &id) |
void | setPredictionModeInAlgorithm (string fname) |
int | getIDFromFullPredictor (string fullPredictor) |
void | algorithmDispatcher (Algorithm *&algo, string name) |
void | getEnsemblePrediction (REAL *input, REAL *output) |
void | preparePredictionMode () |
int | getIndexOfMax (REAL *vector, int length) |
void | endPredictionMode () |
Private Attributes | |
vector< string > | m_algorithmList |
vector< int > | m_algorithmIDList |
vector< string > | m_algorithmNameList |
Data * | m_data |
vector< Algorithm * > | m_algorithmObjectList |
vector< Algorithm ** > | m_algorithmObjectListList |
BlendStopping * | m_blender |
BlendingNN * | m_blenderNN |
int * | m_labelsPredict |
int * | m_effectID |
REAL * | m_noEffect |
REAL ** | m_outputs |
REAL ** | m_effects |
REAL | m_predictionRMSE |
REAL | m_predictionClassificationError |
REAL * | m_outputVectorTmp |
int * | m_labelsTmp |
bool | m_baggingRun |
bool | m_boostingRun |
uint | m_randSeedBagBoost |
REAL * | m_probs |
REAL * | m_boostingTrain |
REAL * | m_boostingTargets |
int | m_boostingNTrain |
int | m_boostingEpoch |
Possible operation modes
This class can force the ready-to-prediction mode in the ensemble. Very useful for predicting any test feature.
Definition at line 41 of file Scheduler.h.
Scheduler::Scheduler | ( | ) |
Constructor
Definition at line 8 of file Scheduler.cpp.
00009 { 00010 cout<<"Scheduler"<<endl; 00011 // init member vars 00012 m_data = 0; 00013 m_blender = 0; 00014 m_blenderNN = 0; 00015 m_labelsPredict = 0; 00016 m_effectID = 0; 00017 m_noEffect = 0; 00018 m_outputs = 0; 00019 m_effects = 0; 00020 m_predictionRMSE = 0; 00021 m_predictionClassificationError = 0; 00022 00023 m_data = new Data(); 00024 m_data->setPathes ( TMP_PATH, DSC_PATH, FULL_PREDICTOR_PATH, DATA_PATH ); 00025 00026 m_baggingRun = 0; 00027 m_boostingRun = 0; 00028 m_randSeedBagBoost = 0; 00029 m_probs = 0; 00030 m_boostingTrain = 0; 00031 m_boostingTargets = 0; 00032 m_boostingNTrain = 0; 00033 00034 }
Scheduler::~Scheduler | ( | ) |
Destructor
Definition at line 39 of file Scheduler.cpp.
00040 { 00041 cout<<"descructor Scheduler"<<endl; 00042 if ( m_data ) 00043 delete m_data; 00044 m_data = 0; 00045 }
void Scheduler::algorithmDispatcher | ( | Algorithm *& | algo, | |
string | name | |||
) | [private] |
Make a new instance of an Algorithm based on the model name
algo | Reference to the Algorithm object pointer | |
name | Name of the model |
Definition at line 1468 of file Scheduler.cpp.
01469 { 01470 if ( name == "LinearModel" ) 01471 algo = new LinearModel(); 01472 else if ( name == "KNearestNeighbor" ) 01473 algo = new KNearestNeighbor(); 01474 else if ( name == "NeuralNetwork" ) 01475 algo = new NeuralNetwork(); 01476 else if ( name == "PolynomialRegression" ) 01477 algo = new PolynomialRegression(); 01478 else if ( name == "LinearModelNonNeg" ) 01479 algo = new LinearModelNonNeg(); 01480 else if ( name == "KernelRidgeRegression" ) 01481 algo = new KernelRidgeRegression(); 01482 else if ( name == "NeuralNetworkRBMauto" ) 01483 algo = new NeuralNetworkRBMauto(); 01484 else if ( name == "Autoencoder" ) 01485 algo = new Autoencoder(); 01486 else if ( name == "GBDT" ) 01487 algo = new GBDT(); 01488 else if ( name == "LogisticRegression" ) 01489 algo = new LogisticRegression(); 01490 else 01491 assert ( false ); 01492 }
void Scheduler::bagging | ( | ) |
Generate a bagging ensemble: produce a set of predictions with a modified train set, measuring accuracy on the test set
Definition at line 600 of file Scheduler.cpp.
00601 { 00602 int epochs = Framework::getAdditionalStartupParameter(); 00603 cout<<endl<<endl; 00604 cout<<"================================= Bagging ================================="<<endl; 00605 cout<<"epochs:"<<epochs<<endl<<endl<<endl; 00606 m_baggingRun = true; 00607 00608 vector<string> baggingFileNames; 00609 uint testSize = 0; 00610 double rmseMean = 0.0, classErrMean = 0.0; 00611 00612 for ( int e=0;e<epochs;e++ ) 00613 { 00614 cout<<"e:"<<e<<endl; 00615 00616 m_randSeedBagBoost = e + 1; 00617 00618 // train and predict testset 00619 train(); 00620 predict(); 00621 00622 rmseMean += getPredictionRMSE(); 00623 classErrMean += getClassificationError(); 00624 00625 fstream fTest ( ( m_data->m_datasetPath+"/"+TMP_PATH+"/testPrediction.data" ).c_str(),ios::in ); 00626 if ( fTest.is_open() ==false ) 00627 assert ( false ); 00628 char buf[512]; 00629 sprintf ( buf,"%s.%d", ( m_data->m_datasetPath+"/"+TMP_PATH+"/testPrediction.data" ).c_str(),e ); 00630 baggingFileNames.push_back ( buf ); 00631 fstream fTmp ( buf,ios::out ); 00632 00633 // get length of file 00634 fTest.seekg ( 0, ios::end ); 00635 uint length = fTest.tellg(); 00636 testSize = length/sizeof ( REAL ); 00637 fTest.seekg ( 0, ios::beg ); 00638 00639 // allocate memory 00640 char* buffer = new char[length]; 00641 00642 // read data as a block 00643 fTest.read ( buffer,length ); 00644 fTest.close(); 00645 00646 // write 00647 fTmp.write ( buffer,length ); 00648 delete[] buffer; 00649 00650 fTmp.close(); 00651 } 00652 00653 00654 srand ( m_data->m_randSeed ); 00655 m_data->readDataset ( m_data->m_datasetName ); 00656 00657 testSize = m_data->m_nTest * m_data->m_nClass * m_data->m_nDomain; 00658 00659 // calc bag mean 00660 REAL* testMean = new REAL[testSize]; 00661 for ( int i=0;i<testSize;i++ ) 00662 testMean[i] = 0.0; 00663 for ( int e=0;e<epochs;e++ ) 00664 { 00665 char nameBuf[512]; 00666 sprintf ( nameBuf,"%s.%d", ( m_data->m_datasetPath+"/"+TMP_PATH+"/testPrediction.data" ).c_str(),e ); 00667 
fstream f ( nameBuf,ios::in ); 00668 float* buf = new float[testSize]; 00669 f.read ( ( char* ) buf,sizeof ( float ) *testSize ); 00670 f.close(); 00671 00672 // add this run to ensemble 00673 for ( int i=0;i<testSize;i++ ) 00674 testMean[i] += buf[i]; 00675 00676 delete[] buf; 00677 00678 00679 // per epoch: calculate prediction RMSE and classification error 00680 double classErrBag = 0.0; 00681 double rmseBag = 0.0; 00682 00683 // go through the test set 00684 for ( uint i=0;i<m_data->m_nTest;i++ ) 00685 { 00686 REAL* ensembleOutput = testMean + i * m_data->m_nClass*m_data->m_nDomain; 00687 REAL* ensembleOutputNorm = new REAL[m_data->m_nClass*m_data->m_nDomain]; 00688 for ( int j=0;j<m_data->m_nClass*m_data->m_nDomain;j++ ) 00689 ensembleOutputNorm[j] = ensembleOutput[j] / ( ( double ) e+1.0 ); 00690 00691 // if the dataset has classification type, count the #wrong labeled 00692 if ( Framework::getDatasetType() ) 00693 { 00694 for ( int d=0;d<m_data->m_nDomain;d++ ) 00695 if ( getIndexOfMax ( ensembleOutputNorm + d * m_data->m_nClass, m_data->m_nClass ) != m_data->m_testLabelOrig[d+i*m_data->m_nDomain] ) 00696 classErrBag += 1.0; 00697 } 00698 00699 // rmse calculation over all targets 00700 for ( int j=0;j<m_data->m_nClass*m_data->m_nDomain;j++ ) 00701 { 00702 REAL target = m_data->m_testTargetOrig[i * m_data->m_nClass * m_data->m_nDomain + j]; 00703 REAL prediction = ensembleOutputNorm[j]; 00704 rmseBag += ( prediction - target ) * ( prediction - target ); 00705 } 00706 00707 delete[] ensembleOutputNorm; 00708 } 00709 00710 if ( Framework::getDatasetType() ) 00711 classErrBag = 100.0*classErrBag/ ( ( double ) m_data->m_nTest* ( double ) m_data->m_nDomain ); 00712 rmseBag = sqrt ( rmseBag/ ( double ) ( m_data->m_nClass*m_data->m_nDomain*m_data->m_nTest ) ); 00713 cout<<e<<": "<<"RMSE:"<<rmseBag<<" classErr:"<<classErrBag<<endl; 00714 00715 } 00716 00717 // take the mean 00718 for ( int i=0;i<testSize;i++ ) 00719 testMean[i] /= ( REAL ) epochs; 00720 00721 
fstream fTest ( ( m_data->m_datasetPath+"/"+TMP_PATH+"/testPrediction.data" ).c_str(),ios::in ); 00722 if ( fTest.is_open() ==false ) 00723 assert ( false ); 00724 fTest.write ( ( char* ) testMean,sizeof ( float ) *testSize ); 00725 fTest.close(); 00726 00727 00728 // calculate prediction RMSE and classification error 00729 double classErrBag = 0.0; 00730 double rmseBag = 0.0; 00731 00732 // go through the test set 00733 for ( uint i=0;i<m_data->m_nTest;i++ ) 00734 { 00735 REAL* ensembleOutput = testMean + i * m_data->m_nClass*m_data->m_nDomain; 00736 00737 // if the dataset has classification type, count the #wrong labeled 00738 if ( Framework::getDatasetType() ) 00739 { 00740 for ( int d=0;d<m_data->m_nDomain;d++ ) 00741 if ( getIndexOfMax ( ensembleOutput + d * m_data->m_nClass, m_data->m_nClass ) != m_data->m_testLabelOrig[d+i*m_data->m_nDomain] ) 00742 classErrBag += 1.0; 00743 } 00744 00745 // rmse calculation over all targets 00746 for ( int j=0;j<m_data->m_nClass*m_data->m_nDomain;j++ ) 00747 { 00748 REAL target = m_data->m_testTargetOrig[i * m_data->m_nClass * m_data->m_nDomain + j]; 00749 REAL prediction = ensembleOutput[j]; 00750 rmseBag += ( prediction - target ) * ( prediction - target ); 00751 } 00752 00753 } 00754 00755 // calc errors 00756 if ( Framework::getDatasetType() ) 00757 classErrBag = 100.0*classErrBag/ ( ( double ) m_data->m_nTest* ( double ) m_data->m_nDomain ); 00758 rmseBag = sqrt ( rmseBag/ ( double ) ( m_data->m_nClass * m_data->m_nDomain * m_data->m_nTest ) ); 00759 00760 m_predictionRMSE = rmseBag; 00761 m_predictionClassificationError = classErrBag; 00762 00763 cout<<endl; 00764 cout<<epochs<<" runs"<<endl; 00765 cout<<"Bagging runs (with boostrap sample): rmseMean:"<<rmseMean/ ( double ) epochs<<" classErrMean:"<<classErrMean/ ( double ) epochs<<endl; 00766 cout<<"Bagged (mean) : rmse :"<<rmseBag<<" classErr :"<<classErrBag<<endl<<endl; 00767 00768 delete[] testMean; 00769 }
void Scheduler::blend | ( | ) |
Blend the predictions with a neural network
Definition at line 405 of file Scheduler.cpp.
00406 { 00407 Framework::setFrameworkMode ( 0 ); 00408 00409 cout<<"Start blending after training"<<endl; 00410 00411 // set the list of already trained predictors 00412 m_data->setAlgorithmList ( vector<string> ( m_algorithmList.begin(), m_algorithmList.end() ) ); 00413 00414 // fix random seed 00415 srand ( m_data->m_randSeed ); 00416 00417 // fill the data object with the dataset 00418 cout<<"Fill data"<<endl; 00419 m_data->readDataset ( m_data->m_datasetName ); 00420 srand ( m_data->m_randSeed ); 00421 m_data->allocMemForCrossValidationSets(); 00422 m_data->partitionDatasetToCrossValidationSets(); 00423 00424 if ( m_data->m_enablePostNNBlending ) 00425 { 00426 m_data->readDscFile ( m_data->m_datasetPath + "/NeuralNetwork.dscblend" ); 00427 00428 BlendingNN nn; 00429 nn.setDataPointers ( m_data ); 00430 nn.readSpecificMaps(); 00431 nn.init(); 00432 nn.train(); 00433 } 00434 else 00435 { 00436 BlendStopping bb ( ( Algorithm* ) m_data, "" ); 00437 bb.setRegularization ( m_data->m_blendingRegularization ); 00438 double rmse = bb.calcBlending(); 00439 cout<<"BLEND RMSE OF ACTUAL FULLPREDICTION PATH:"<<rmse<<endl; 00440 bb.saveBlendingWeights ( m_data->m_datasetPath + "/" + m_data->m_tempPath ); 00441 } 00442 }
void Scheduler::boosting | ( | ) |
Generate a boosting ensemble: reweight sample probabilities based on train error
Definition at line 776 of file Scheduler.cpp.
00777 { 00778 int epochs = Framework::getAdditionalStartupParameter(); 00779 cout<<endl<<endl; 00780 cout<<"================================= Boosting ================================="<<endl; 00781 cout<<"epochs:"<<epochs<<endl<<endl<<endl; 00782 m_boostingRun = true; 00783 00784 vector<string> boostingFileNames; 00785 uint testSize = 0; 00786 double rmseMean = 0.0, classErrMean = 0.0; 00787 REAL* beta = new REAL[epochs]; 00788 for ( m_boostingEpoch=0;m_boostingEpoch<epochs;m_boostingEpoch++ ) 00789 { 00790 cout<<"e:"<<m_boostingEpoch<<endl; 00791 00792 m_randSeedBagBoost = m_boostingEpoch; 00793 00794 // train and predict testset (testset must be fixed) 00795 train(); 00796 predict(); 00797 00798 fstream f ( "A.txt",ios::out ); 00799 for ( int i=0;i<m_boostingNTrain;i++ ) 00800 f<<m_probs[i]<<endl; 00801 f.close(); 00802 00803 rmseMean += getPredictionRMSE(); 00804 classErrMean += getClassificationError(); 00805 00806 fstream fTest ( ( m_data->m_datasetPath+"/"+TMP_PATH+"/testPrediction.data" ).c_str(),ios::in ); 00807 if ( fTest.is_open() ==false ) 00808 assert ( false ); 00809 char buf[512]; 00810 sprintf ( buf,"%s.%d", ( m_data->m_datasetPath+"/"+TMP_PATH+"/testPrediction.data" ).c_str(),m_boostingEpoch ); 00811 boostingFileNames.push_back ( buf ); 00812 fstream fTmp ( buf,ios::out ); 00813 00814 // get length of file 00815 fTest.seekg ( 0, ios::end ); 00816 uint length = fTest.tellg(); 00817 testSize = length/sizeof ( float ); 00818 fTest.seekg ( 0, ios::beg ); 00819 00820 // allocate memory 00821 char* buffer = new char [length]; 00822 00823 // read data as a block 00824 fTest.read ( buffer,length ); 00825 fTest.close(); 00826 00827 // write 00828 fTmp.write ( buffer,length ); 00829 delete[] buffer; 00830 00831 fTmp.close(); 00832 00833 // ==================== predict train set ===================== 00834 double rmseBoost = 0.0, epsilon = 0.0, rmseTrain = 0.0; 00835 REAL min = m_data->m_negativeTarget, max = m_data->m_positiveTarget; 00836 
Framework::setFrameworkMode ( 1 ); 00837 00838 preparePredictionMode(); 00839 REAL* ensembleOutput = new REAL[m_data->m_nClass*m_data->m_nDomain]; 00840 REAL* loss = new REAL[m_boostingNTrain]; 00841 // go through the train set 00842 int nOut = m_data->m_nClass*m_data->m_nDomain; 00843 for ( int i=0;i<m_boostingNTrain;i++ ) 00844 { 00845 // predict one example 00846 REAL* inputFeature = m_boostingTrain + i * m_data->m_nFeatures; 00847 getEnsemblePrediction ( inputFeature, ensembleOutput ); 00848 00849 // rmse calculation over all targets 00850 REAL err = 0.0, err2 = 0.0; 00851 for ( int j=0;j<m_data->m_nDomain;j++ ) 00852 { 00853 int indMax = -1; 00854 REAL maxTarget = -1e10; 00855 for ( int k=0;k<m_data->m_nClass;k++ ) 00856 if ( maxTarget < m_boostingTargets[i * nOut + m_data->m_nClass*j + k] ) 00857 { 00858 maxTarget = m_boostingTargets[i * nOut + m_data->m_nClass*j + k]; 00859 indMax = k; 00860 } 00861 if ( indMax == -1 ) 00862 assert ( false ); 00863 for ( int k=0;k<m_data->m_nClass;k++ ) 00864 { 00865 if ( indMax != k ) 00866 { 00867 REAL predictionTarget = ensembleOutput[m_data->m_nClass*j + indMax]; 00868 REAL prediction = ensembleOutput[m_data->m_nClass*j + k]; 00869 00870 err += 1.0 - ( predictionTarget-min ) / ( max-min ) + ( prediction-min ) / ( max-min ); 00871 err2 += 1.0 + ( predictionTarget-min ) / ( max-min ) - ( prediction-min ) / ( max-min ); 00872 } 00873 } 00874 00875 for ( int j=0;j<m_data->m_nDomain;j++ ) 00876 for ( int k=0;k<m_data->m_nClass;k++ ) 00877 { 00878 REAL out = ensembleOutput[m_data->m_nClass*j + k]; 00879 REAL target = m_boostingTargets[i * nOut + m_data->m_nClass*j + k]; 00880 rmseTrain += ( out-target ) * ( out-target ); 00881 } 00882 00883 } 00884 epsilon += m_probs[i] * err / ( REAL ) ( m_data->m_nClass-1 ); 00885 loss[i] = err2 / ( REAL ) ( m_data->m_nClass-1 ); 00886 } 00887 rmseTrain = sqrt ( rmseTrain/ ( double ) ( m_boostingNTrain*m_data->m_nClass*m_data->m_nDomain ) ); 00888 
cout<<"rmseTrain(boosting):"<<rmseTrain<<endl; 00889 epsilon *= 0.5; 00890 beta[m_boostingEpoch] = epsilon / ( 1.0 - epsilon ); 00891 // update example probabilities 00892 for ( int i=0;i<m_boostingNTrain;i++ ) 00893 m_probs[i] *= pow ( beta[m_boostingEpoch], 0.5 * loss[i] ); 00894 double sum = 0.0; 00895 for ( int i=0;i<m_boostingNTrain;i++ ) 00896 sum += m_probs[i]; 00897 // normalize 00898 for ( int i=0;i<m_boostingNTrain;i++ ) 00899 m_probs[i] /= sum; 00900 00901 delete[] loss; 00902 delete[] ensembleOutput; 00903 00904 endPredictionMode(); 00905 } 00906 00907 // read test data 00908 srand ( m_data->m_randSeed ); 00909 m_data->readDataset ( m_data->m_datasetName ); 00910 00911 // calc boosting mean 00912 cout<<endl<<endl<<"#test values:"<<testSize<<" (dataset size:"<<m_data->m_nTest<<")"<<endl; 00913 REAL* testMean = new REAL[testSize]; 00914 for ( int i=0;i<testSize;i++ ) 00915 testMean[i] = 0.0; 00916 for ( int e=0;e<epochs;e++ ) 00917 { 00918 cout<<"Cascade layer "<<e<<": weight:"<<log10 ( 1.0/beta[e] ) <<" "<<flush; 00919 fstream f ( boostingFileNames[e].c_str(),ios::in ); 00920 if ( f.is_open() == false ) 00921 assert ( false ); 00922 float* buf = new float[testSize]; 00923 f.read ( ( char* ) buf,sizeof ( float ) *testSize ); 00924 f.close(); 00925 00926 // add this run to ensemble 00927 for ( int i=0;i<testSize;i++ ) 00928 { 00929 REAL w = log10 ( 1.0/beta[e] ); 00930 testMean[i] += w*buf[i]; 00931 } 00932 delete[] buf; 00933 00934 00935 // Calculate per-epoch errors 00936 // go through the test set 00937 double classErrBoostingPerEpoch = 0.0; 00938 double rmseBoostingPerEpoch = 0.0; 00939 double rmseBoostingPerEpoch0 = 0.0; 00940 double rmseBoostingPerEpoch1 = 0.0; 00941 for ( int i=0;i<m_data->m_nTest;i++ ) 00942 { 00943 REAL* ensembleOutput = testMean + i * m_data->m_nClass*m_data->m_nDomain; 00944 REAL* ensembleOutputNorm0 = new REAL[m_data->m_nClass*m_data->m_nDomain]; 00945 REAL* ensembleOutputNorm1 = new REAL[m_data->m_nClass*m_data->m_nDomain]; 
00946 00947 REAL norm0 = 0.0; 00948 for ( int j=0;j<=e;j++ ) 00949 norm0 += log10 ( 1.0/beta[e] ); 00950 for ( int j=0;j<m_data->m_nClass*m_data->m_nDomain;j++ ) 00951 { 00952 ensembleOutputNorm0[j] = ensembleOutput[j]/ ( REAL ) ( e+1 ); 00953 ensembleOutputNorm1[j] = ensembleOutput[j]/norm0; 00954 } 00955 00956 // if the dataset has classification type, count the #wrong labeled 00957 if ( Framework::getDatasetType() ) 00958 { 00959 for ( int d=0;d<m_data->m_nDomain;d++ ) 00960 if ( getIndexOfMax ( ensembleOutput + d * m_data->m_nClass, m_data->m_nClass ) != m_data->m_testLabelOrig[d+i*m_data->m_nDomain] ) 00961 classErrBoostingPerEpoch += 1.0; 00962 } 00963 00964 // rmse calculation over all targets 00965 for ( int j=0;j<m_data->m_nClass*m_data->m_nDomain;j++ ) 00966 { 00967 REAL target = m_data->m_testTargetOrig[i * m_data->m_nClass * m_data->m_nDomain + j]; 00968 REAL prediction = ensembleOutput[j]; 00969 rmseBoostingPerEpoch += ( prediction - target ) * ( prediction - target ); 00970 00971 prediction = ensembleOutputNorm0[j]; 00972 rmseBoostingPerEpoch0 += ( prediction - target ) * ( prediction - target ); 00973 00974 prediction = ensembleOutputNorm1[j]; 00975 rmseBoostingPerEpoch1 += ( prediction - target ) * ( prediction - target ); 00976 } 00977 00978 delete[] ensembleOutputNorm0; 00979 delete[] ensembleOutputNorm1; 00980 } 00981 // calc errors 00982 if ( Framework::getDatasetType() ) 00983 classErrBoostingPerEpoch = 100.0*classErrBoostingPerEpoch/ ( ( double ) m_data->m_nTest* ( double ) m_data->m_nDomain ); 00984 rmseBoostingPerEpoch = sqrt ( rmseBoostingPerEpoch/ ( double ) ( m_data->m_nClass * m_data->m_nDomain * m_data->m_nTest ) ); 00985 rmseBoostingPerEpoch0 = sqrt ( rmseBoostingPerEpoch0/ ( double ) ( m_data->m_nClass * m_data->m_nDomain * m_data->m_nTest ) ); 00986 rmseBoostingPerEpoch1 = sqrt ( rmseBoostingPerEpoch1/ ( double ) ( m_data->m_nClass * m_data->m_nDomain * m_data->m_nTest ) ); 00987 cout<<"Boosting: rmse:"<<rmseBoostingPerEpoch<<" 
rmse0:"<<rmseBoostingPerEpoch0<<" rmse1:"<<rmseBoostingPerEpoch1<<" classErr:"<<classErrBoostingPerEpoch<<"%"<<endl; 00988 } 00989 00990 // take the mean 00991 for ( int i=0;i<testSize;i++ ) 00992 testMean[i] /= ( REAL ) epochs; 00993 00994 fstream fTest ( ( m_data->m_datasetPath+"/"+TMP_PATH+"/testPrediction.data" ).c_str(),ios::in ); 00995 if ( fTest.is_open() ==false ) 00996 assert ( false ); 00997 fTest.write ( ( char* ) testMean,sizeof ( float ) *testSize ); 00998 fTest.close(); 00999 01000 01001 // calculate prediction RMSE and classification error 01002 double classErrBoosting = 0.0; 01003 double rmseBoosting = 0.0; 01004 01005 // go through the test set 01006 for ( int i=0;i<m_data->m_nTest;i++ ) 01007 { 01008 REAL* ensembleOutput = testMean + i * m_data->m_nClass*m_data->m_nDomain; 01009 01010 // if the dataset has classification type, count the #wrong labeled 01011 if ( Framework::getDatasetType() ) 01012 { 01013 for ( int d=0;d<m_data->m_nDomain;d++ ) 01014 if ( getIndexOfMax ( ensembleOutput + d * m_data->m_nClass, m_data->m_nClass ) != m_data->m_testLabelOrig[d+i*m_data->m_nDomain] ) 01015 classErrBoosting += 1.0; 01016 } 01017 01018 // rmse calculation over all targets 01019 for ( int j=0;j<m_data->m_nClass*m_data->m_nDomain;j++ ) 01020 { 01021 REAL target = m_data->m_testTargetOrig[i * m_data->m_nClass * m_data->m_nDomain + j]; 01022 REAL prediction = ensembleOutput[j]; 01023 rmseBoosting += ( prediction - target ) * ( prediction - target ); 01024 } 01025 01026 } 01027 01028 // calc errors 01029 if ( Framework::getDatasetType() ) 01030 classErrBoosting = 100.0*classErrBoosting/ ( ( double ) m_data->m_nTest* ( double ) m_data->m_nDomain ); 01031 rmseBoosting = sqrt ( rmseBoosting/ ( double ) ( m_data->m_nClass * m_data->m_nDomain * m_data->m_nTest ) ); 01032 01033 m_predictionRMSE = rmseBoosting; 01034 m_predictionClassificationError = classErrBoosting; 01035 01036 cout<<endl; 01037 cout<<epochs<<" runs"<<endl; 01038 cout<<"Boosting runs (mean 
boostrap sample): rmseMean:"<<rmseMean/ ( double ) epochs<<" classErrMean:"<<classErrMean/ ( double ) epochs<<"%"<<endl; 01039 cout<<"Boosting (mean) : rmse :"<<rmseBoosting<<" classErr :"<<classErrBoosting<<"%"<<endl<<endl; 01040 01041 delete[] testMean; 01042 }
void Scheduler::checkAlgorithmTemplate | ( | string | fname, | |
string & | algoName, | |||
string & | id | |||
) | [private] |
Check if the Algorithm dsc file has no errors
fname | Dsc filename of Algorithm | |
algoName | Reference to the algorithm name (KNN, NN, LinearModel, ..) |
id | Reference to the ID (ascending number from 0,1..) |
Definition at line 1051 of file Scheduler.cpp.
01052 { 01053 // check, if the algorithm line exists 01054 fstream f ( fname.c_str(), ios::in ); 01055 if ( f.is_open() == false ) 01056 assert ( false ); 01057 string firstLine, secondLine, thirdLine; 01058 f>>firstLine; 01059 f>>secondLine; 01060 f>>thirdLine; 01061 f.close(); 01062 int pos = firstLine.find ( "=" ); 01063 string name = firstLine.substr ( 0, pos ); 01064 algoName = firstLine.substr ( pos+1 ); 01065 if ( name != "ALGORITHM" ) 01066 { 01067 cout<<"Wrong dsc file, no ALGORITHM=.. found in first line"<<endl; 01068 exit ( 0 ); 01069 } 01070 pos = secondLine.find ( "=" ); 01071 name = secondLine.substr ( 0, pos ); 01072 id = secondLine.substr ( pos+1 ); 01073 if ( name != "ID" ) 01074 { 01075 cout<<"Wrong dsc file, no ID=.. found in second line"<<endl; 01076 exit ( 0 ); 01077 } 01078 }
void Scheduler::endPredictionMode | ( | ) | [private] |
End of the prediction mode. Deallocation of memory.
Definition at line 1351 of file Scheduler.cpp.
01352 { 01353 cout<<"End scheduled prediction"<<endl; 01354 m_data->deleteMemory(); 01355 01356 for ( int i=0;i<m_algorithmObjectList.size();i++ ) 01357 delete m_algorithmObjectList[i]; 01358 m_algorithmObjectList.clear(); 01359 01360 if ( m_data->m_enablePostNNBlending ) 01361 delete m_blenderNN; 01362 delete m_blender; 01363 delete[] m_effectID; 01364 int N = m_algorithmList.size(); 01365 for ( int i=0;i<N+1;i++ ) 01366 { 01367 delete[] m_outputs[i]; 01368 delete[] m_effects[i]; 01369 } 01370 delete[] m_noEffect; 01371 delete[] m_outputs; 01372 delete[] m_effects; 01373 delete[] m_labelsPredict; 01374 01375 }
REAL Scheduler::getClassificationError | ( | ) |
Return classification error of last testset prediction
Definition at line 1559 of file Scheduler.cpp.
void Scheduler::getEnsemblePrediction | ( | REAL * | input, | |
REAL * | output | |||
) | [private] |
Predict a target vector with a given input feature, based on the trained ensemble
input | REAL* vector to original input feature (read) | |
output | REAL* vector to target (write) |
Definition at line 1167 of file Scheduler.cpp.
01168 { 01169 int N = m_algorithmList.size(); 01170 REAL* tmp = new REAL[m_data->m_nFeatures+N]; 01171 01172 // predict all targets per algorithm 01173 // if the algorithm needs a preprocessor, the effect file is loaded 01174 for ( int i=0;i<N;i++ ) 01175 { 01176 // effect = pre-processor for this algorithm 01177 int ID = m_effectID[i]; 01178 REAL* effect = m_noEffect; // constant zero 01179 REAL* outputVector = m_outputs[i+1]; // +1: jump over constant 1 01180 if ( ID != 0 ) 01181 { 01182 if ( ID < 0 || ID > i ) 01183 assert ( false ); 01184 effect = m_outputs[ID]; // output of another prediction as effect 01185 } 01186 01187 // cascade learning: add predictions of previous model as input to current 01188 if ( m_data->m_enableCascadeLearning ) 01189 { 01190 int nF = m_data->m_nFeatures; 01191 int nFAlgo = m_algorithmObjectList[i]->m_nFeatures; 01192 01193 // add input feature + normalize 01194 for ( int j=0;j<nF;j++ ) 01195 tmp[j] = ( input[j] - m_data->m_mean[j] ) / m_data->m_std[j]; 01196 01197 // add predictions + normalize 01198 for ( int j=0;j<i;j++ ) // over all previous models 01199 { 01200 REAL* previousOutputVector = m_outputs[j+1]; 01201 int nOut = m_data->m_nClass*m_data->m_nDomain; 01202 for ( int k=0;k<nOut;k++ ) 01203 tmp[nF+j*nOut+k] = ( previousOutputVector[k] - m_data->m_mean[nF+j*nOut+k] ) / m_data->m_std[nF+j*nOut+k]; 01204 } 01205 } 01206 else // standard 01207 { 01208 for ( int j=0;j<m_data->m_nFeatures;j++ ) 01209 tmp[j] = ( input[j] - m_data->m_mean[j] ) / m_data->m_std[j]; 01210 } 01211 01212 if ( m_data->m_validationType=="Retraining" || m_data->m_validationType=="ValidationSet") 01213 m_algorithmObjectList[i]->predictMultipleOutputs ( tmp, effect, outputVector, m_labelsPredict, 1, m_data->m_nCross ); 01214 else if ( m_data->m_validationType=="CrossFoldMean" || m_data->m_validationType=="Bagging" ) 01215 { 01216 for ( int j=0;j<m_data->m_nClass*m_data->m_nDomain;j++ ) 01217 outputVector[j] = 0.0; 01218 for ( int j=0;j<m_data->m_nCross;j++ 
) 01219 { 01220 m_algorithmObjectListList[i][j]->predictMultipleOutputs ( tmp, effect, m_outputVectorTmp, m_labelsTmp, 1, j ); 01221 for ( int k=0;k<m_data->m_nClass*m_data->m_nDomain;k++ ) 01222 outputVector[k] += m_outputVectorTmp[k]; 01223 } 01224 for ( int j=0;j<m_data->m_nClass*m_data->m_nDomain;j++ ) 01225 outputVector[j] /= ( REAL ) m_data->m_nCross; 01226 01227 // calc output labels (for classification dataset) 01228 if ( Framework::getDatasetType() ) 01229 { 01230 // in all domains 01231 for ( int d=0;d<m_data->m_nDomain;d++ ) 01232 { 01233 // find max. output value 01234 int indMax = -1; 01235 REAL max = -1e10; 01236 for ( int j=0;j<m_data->m_nClass;j++ ) 01237 { 01238 if ( max < outputVector[d*m_data->m_nClass+j] ) 01239 { 01240 max = outputVector[d*m_data->m_nClass+j]; 01241 indMax = j; 01242 } 01243 } 01244 m_labelsPredict[d] = indMax; 01245 } 01246 } 01247 01248 } 01249 else 01250 assert(false); 01251 } 01252 01253 delete[] tmp; 01254 01255 // calculate the ensemble output with the blender 01256 if ( m_data->m_enablePostNNBlending ) 01257 m_blenderNN->predictEnsembleOutput ( m_outputs, output ); 01258 else 01259 m_blender->predictEnsembleOutput ( m_outputs, output ); 01260 }
int Scheduler::getIDFromFullPredictor | ( | string | fullPredictor | ) | [private] |
Returns the ID from the corresponding dsc-file of the given full-prediction file
fullPredictor | Full-predictor file name |
Definition at line 1149 of file Scheduler.cpp.
01150 { 01151 if ( fullPredictor=="" ) 01152 return 0; 01153 for ( int i=0;i<m_algorithmObjectList.size();i++ ) 01154 if ( m_algorithmObjectList[i]->m_stringMap["fullPrediction"] == fullPredictor ) 01155 return m_algorithmObjectList[i]->m_algorithmID; 01156 cout<<"Error, this fullPredictor was not found:"<<fullPredictor<<endl; 01157 assert ( false ); 01158 }
int Scheduler::getIndexOfMax | ( | REAL * | vector, | |
int | length | |||
) | [private] |
Find the largest element in a vector and return the index
vector | Input REAL vector | |
length | The number of elements of vector |
Definition at line 1384 of file Scheduler.cpp.
01385 { 01386 int indMax = -1; 01387 REAL max = -1e10; 01388 for ( int i=0;i<length;i++ ) 01389 { 01390 if ( max < vector[i] ) 01391 { 01392 max = vector[i]; 01393 indMax = i; 01394 } 01395 } 01396 01397 return indMax; 01398 }
REAL Scheduler::getPredictionRMSE | ( | ) |
string Scheduler::masterDscTemplateGenerator | ( | string | dataset, | |
bool | isClass, | |||
vector< string > | algos, | |||
int | rSeed, | |||
string | blendAlgo, | |||
bool | cascade | |||
) | [static] |
Generates a template of the master description file This is an example of a Master.dsc file
Definition at line 1500 of file Scheduler.cpp.
01501 { 01502 stringstream s; 01503 s<<"dataset="<<dataset<<endl; 01504 s<<"isClassificationDataset="<<isClass<<endl; 01505 s<<"maxThreads=2"<<endl; 01506 s<<"maxThreadsInCross=2"<<endl; 01507 s<<"nCrossValidation=6"<<endl; 01508 s<<"validationType=Retraining"<<endl; 01509 s<<"positiveTarget=1.0"<<endl; 01510 s<<"negativeTarget=-1.0"<<endl; 01511 s<<"randomSeed="<<rSeed<<endl; 01512 s<<"nMixDataset=20"<<endl; 01513 s<<"nMixTrainList=100"<<endl; 01514 s<<"standardDeviationMin=0.01"<<endl; 01515 s<<"blendingRegularization=1e-4"<<endl; 01516 s<<"blendingEnableCrossValidation=0"<<endl; 01517 s<<"blendingAlgorithm="<<blendAlgo<<endl; 01518 s<<"enablePostNNBlending=0"<<endl; 01519 s<<"enableCascadeLearning="<<cascade<<endl; 01520 s<<"enableGlobalMeanStdEstimate=0"<<endl; 01521 s<<"enableSaveMemory=1"<<endl; 01522 s<<"addOutputNoise=0"<<endl; 01523 s<<"enablePostBlendClipping=0"<<endl; 01524 s<<"enableFeatureSelection=0"<<endl; 01525 s<<"featureSelectionWriteBinaryDataset=0"<<endl; 01526 s<<"enableGlobalBlendingWeights=1"<<endl; 01527 s<<"errorFunction=RMSE"<<endl; 01528 s<<"disableWriteDscFile=0"<<endl; 01529 s<<"enableStaticNormalization=0"<<endl; 01530 s<<"staticMeanNormalization=0.0"<<endl; 01531 s<<"staticStdNormalization=1.0"<<endl; 01532 s<<"enableProbablisticNormalization=0"<<endl; 01533 s<<"dimensionalityReduction=no"<<endl; 01534 s<<"subsampleTrainSet=1.0"<<endl; 01535 s<<"subsampleFeatures=1.0"<<endl; 01536 s<<"globalTrainingLoops=1"<<endl; 01537 s<<"addConstantInput=0"<<endl; 01538 s<<endl; 01539 s<<"[ALGORITHMS]"<<endl; 01540 for ( int i=0;i<algos.size();i++ ) 01541 s<<algos[i]<<endl; 01542 01543 return s.str(); 01544 }
void Scheduler::predict | ( | ) |
Predict the testset and save the predictions to a binary file (testPrediction*.data)
Definition at line 450 of file Scheduler.cpp.
00451 { 00452 Framework::setFrameworkMode ( 1 ); 00453 00454 preparePredictionMode(); 00455 00456 int progress = m_data->m_nTest / 100 + 1; 00457 double mean = 0.0, rmse = 0.0; 00458 00459 // output file (binary) 00460 string fname; 00461 if ( m_data->m_datasetName=="NETFLIX" && Framework::getAdditionalStartupParameter() >= 0 ) 00462 { 00463 cout<<"Dataset:NETFLIX, slot:"<<Framework::getAdditionalStartupParameter() <<" "; 00464 char buf[512]; 00465 sprintf ( buf,"p%d",Framework::getAdditionalStartupParameter() ); 00466 fname = string ( NETFLIX_SLOTDATA_ROOT_DIR ) + buf + "/testPrediction.data"; 00467 cout<<"pName:"<<fname<<endl; 00468 } 00469 else if ( m_data->m_datasetName=="NETFLIX" && Framework::getAdditionalStartupParameter() < -100 ) 00470 { 00471 char buf[512]; 00472 sprintf ( buf,"ELFprediction%d",Framework::getRandomSeed() ); 00473 string algos; 00474 for ( int i=0;i<m_algorithmList.size();i++ ) 00475 algos += "_" + m_algorithmList[i]; 00476 fname = m_data->m_datasetPath + "/" + m_data->m_tempPath + "/" + buf + algos + ".dat"; 00477 cout<<"pName:"<<fname<<endl; 00478 } 00479 else 00480 { 00481 char nr[512]; 00482 sprintf ( nr,"%d",rand() ); 00483 fname = m_data->m_datasetPath + "/" + m_data->m_tempPath + "/testPrediction" + string ( nr ) + ".data"; 00484 //fname = m_data->m_datasetPath + "/" + m_data->m_tempPath + "/testPrediction.data"; 00485 } 00486 00487 fstream fOutput ( fname.c_str(),ios::out ); 00488 00489 // the output vector of the ensemble 00490 REAL* ensembleOutput = new REAL[m_data->m_nClass*m_data->m_nDomain]; 00491 00492 int* wrongLabelCnt = new int[m_data->m_nDomain]; 00493 for ( int i=0;i<m_data->m_nDomain;i++ ) 00494 wrongLabelCnt[i] = 0; 00495 00496 // store the real input dimension of data 00497 int nrFeat = m_data->m_nFeatures; 00498 00499 m_outputVectorTmp = new REAL[m_data->m_nClass*m_data->m_nDomain]; 00500 m_labelsTmp = new int[m_data->m_nClass*m_data->m_nDomain]; 00501 00502 // load the autoencoder net 00503 Autoencoder* autoEnc = 0; 
00504 bool enableAutoencoder = false; 00505 REAL* autoencoderOutput = 0; 00506 if ( m_data->m_dimensionalityReduction == "Autoencoder" ) 00507 { 00508 autoEnc = new Autoencoder(); 00509 autoEnc->setDataPointers ( m_data ); 00510 autoEnc->loadWeights(); 00511 autoEnc->loadNormalizations(); 00512 enableAutoencoder = true; 00513 autoencoderOutput = new REAL[autoEnc->m_nClass]; 00514 m_data->m_nFeatures = autoEnc->m_nClass; // modify input dimension 00515 } 00516 00517 cout<<endl<<"predict(100 dots): "<<flush; 00518 time_t t0 = time ( 0 ); 00519 00520 // go through the test set 00521 for ( uint i=0;i<m_data->m_nTest;i++ ) 00522 { 00523 if ( i % progress == 0 ) 00524 cout<<"."<<flush; 00525 00526 // predict one example 00527 REAL* inputFeature = m_data->m_testOrig + i * ( uint ) nrFeat; 00528 00529 if ( enableAutoencoder ) 00530 { 00531 autoEnc->predictAllOutputs ( inputFeature, autoencoderOutput, 1, 0 ); 00532 getEnsemblePrediction ( autoencoderOutput, ensembleOutput ); 00533 } 00534 else 00535 getEnsemblePrediction ( inputFeature, ensembleOutput ); 00536 00537 // if the dataset has classification type, count the #wrong labeled 00538 if ( Framework::getDatasetType() ) 00539 { 00540 for ( uint d=0;d<m_data->m_nDomain;d++ ) 00541 if ( getIndexOfMax ( ensembleOutput + d * m_data->m_nClass, m_data->m_nClass ) != m_data->m_testLabelOrig[d+i* ( uint ) m_data->m_nDomain] ) 00542 wrongLabelCnt[d]++; 00543 } 00544 00545 // rmse calculation over all targets 00546 for ( uint j=0;j<m_data->m_nClass*m_data->m_nDomain;j++ ) 00547 { 00548 REAL target = m_data->m_testTargetOrig[i * ( uint ) m_data->m_nClass * ( uint ) m_data->m_nDomain + j]; 00549 REAL prediction = ensembleOutput[j]; 00550 rmse += ( prediction - target ) * ( prediction - target ); 00551 mean += prediction; 00552 float predictionSP = prediction; 00553 fOutput.write ( ( char* ) &predictionSP, sizeof ( float ) ); 00554 } 00555 00556 } 00557 00558 delete[] m_outputVectorTmp; 00559 delete[] m_labelsTmp; 00560 00561 // 
print classification error 00562 if ( Framework::getDatasetType() ) 00563 { 00564 int nWrong = 0; 00565 for ( int d=0;d<m_data->m_nDomain;d++ ) 00566 { 00567 nWrong += wrongLabelCnt[d]; 00568 if ( m_data->m_nDomain > 1 ) 00569 cout<<"["<< ( double ) wrongLabelCnt[d]/ ( double ) m_data->m_nTest<<"] "; 00570 } 00571 m_predictionClassificationError = 100.0* ( double ) nWrong/ ( ( double ) m_data->m_nTest* ( double ) m_data->m_nDomain ); 00572 cout<<endl<<"Classification test error: "<<m_predictionClassificationError<<"%"<<endl; 00573 } 00574 00575 // print RMSE 00576 m_predictionRMSE = sqrt ( rmse/ ( double ) ( ( uint ) m_data->m_nClass * ( uint ) m_data->m_nDomain * m_data->m_nTest ) ); 00577 cout<<"RMSE test: "<<m_predictionRMSE<<endl; 00578 00579 // print info 00580 cout<<endl<<"Predictions are written to binary output file: "<<fname<<" ("<< ( uint ) ( ( uint ) m_data->m_nTest* ( uint ) m_data->m_nClass*m_data->m_nDomain*sizeof ( float ) ); 00581 cout<<" Bytes, REAL="<< ( int ) sizeof ( float ) <<"Bytes, #elements:"<< ( uint ) m_data->m_nTest* ( uint ) m_data->m_nClass*m_data->m_nDomain<<") "; 00582 cout<<"[mean:"<<mean/ ( double ) ( ( uint ) m_data->m_nTest* ( uint ) m_data->m_nClass* ( uint ) m_data->m_nDomain ) <<"] )"<<endl; 00583 cout<<"Prediction time: "<<time ( 0 )-t0<<"[s]"<<endl<<endl; 00584 00585 fOutput.close(); 00586 00587 if ( ensembleOutput ) 00588 delete[] ensembleOutput; 00589 ensembleOutput = 0; 00590 00591 endPredictionMode(); 00592 }
void Scheduler::preparePredictionMode | ( | ) | [private] |
Prepare the trained ensemble for predicting unknown input features.
Definition at line 1266 of file Scheduler.cpp.
01267 { 01268 cout<<"Start scheduled prediction"<<endl; 01269 01270 // fix random seed 01271 srand ( m_data->m_randSeed ); 01272 01273 // load test set 01274 m_data->readDataset ( m_data->m_datasetName ); 01275 srand ( m_data->m_randSeed ); 01276 01277 if(m_data->m_validationType=="ValidationSet") 01278 m_data->m_nCross = 0; 01279 01280 // number of algorithms in the ensemble 01281 int N = m_algorithmList.size(); 01282 01283 // load normalization (mean, std) 01284 if ( m_data->m_enableCascadeLearning ) 01285 m_data->loadNormalization ( N-1 ); 01286 else 01287 m_data->loadNormalization(); 01288 01289 // go to prediction mode in all template files 01290 m_algorithmIDList.clear(); 01291 for ( int i=0;i<N;i++ ) 01292 { 01293 string fAlgoTemplateName = m_data->m_datasetPath + "/" + m_algorithmList[i]; 01294 setPredictionModeInAlgorithm ( fAlgoTemplateName ); 01295 } 01296 01297 // new NN blender 01298 if ( m_data->m_enablePostNNBlending ) 01299 { 01300 m_blenderNN = new BlendingNN(); 01301 m_blenderNN->setDataPointers ( m_data ); 01302 m_blenderNN->readDscFile ( m_data->m_datasetPath + "/NeuralNetwork.dscblend" ); 01303 m_blenderNN->readSpecificMaps(); 01304 m_blenderNN->loadWeights(); 01305 } 01306 01307 // load blending weights 01308 m_blender = new BlendStopping ( ( Algorithm* ) m_data ); 01309 m_blender->loadBlendingWeights ( m_data->m_datasetPath + "/" + m_data->m_tempPath, N+1 ); 01310 01311 for ( int i=0;i<N;i++ ) 01312 cout<<"ALGO FROM MASTER DSC-FILE:"<<m_algorithmObjectList[i]->m_stringMap["fullPrediction"]<<endl; 01313 01314 m_blender->printWeights(); 01315 01316 int nClass = m_data->m_nClass; 01317 int nDomain = m_data->m_nDomain; 01318 01319 // precompute IDs from effect files per prediction 01320 m_effectID = new int[N]; 01321 for ( int i=0;i<N;i++ ) 01322 m_effectID[i] = getIDFromFullPredictor ( m_algorithmObjectList[i]->m_trainOnFullPredictorFile ); 01323 01324 // used in prediction mode 01325 // per sample: store prediction for every model 01326 
m_noEffect = new REAL[nClass*nDomain]; 01327 for ( int i=0;i<nClass*nDomain;i++ ) 01328 m_noEffect[i] = 0.0; 01329 m_outputs = new REAL*[N+1]; 01330 m_effects = new REAL*[N+1]; 01331 for ( int i=0;i<N+1;i++ ) 01332 { 01333 m_outputs[i] = new REAL[nClass*nDomain]; 01334 m_effects[i] = new REAL[nClass*nDomain]; 01335 for ( int j=0;j<nClass*nDomain;j++ ) 01336 { 01337 m_outputs[i][j] = 1.0; // init with constant 1.0 (needed in blend) 01338 m_effects[i][j] = 0.0; 01339 } 01340 } 01341 01342 // tmp variable for predict labels 01343 m_labelsPredict = new int[m_data->m_nDomain]; 01344 01345 }
void Scheduler::readMasterDscFile | ( | string | path, | |
string | masterName | |||
) |
Read the master description file. The master file sets up the initial training settings, the dataset name and the training order of the algorithms.
path | Directory of the dataset containing the master-dsc file (string) | masterName | Name of the master-dsc file (string) |
Definition at line 53 of file Scheduler.cpp.
00054 { 00055 m_data->m_datasetPath = path; 00056 cout<<"Open master .dsc file:"<<(path + "/" + masterName)<<endl; 00057 fstream fMaster ( (path + "/" + masterName).c_str(), ios::in ); 00058 00059 // check is file exists 00060 if ( fMaster.is_open() == 0 ) 00061 { 00062 cout<<"Error: no Master.dsc file found in "<<path<<endl; 00063 exit ( 0 ); 00064 } 00065 00066 // read all lines 00067 char buf[1024]; 00068 bool readAlgorithmList = false; 00069 while ( fMaster.getline ( buf, 1024 ) ) // read all lines 00070 { 00071 // the line 00072 string line = string ( buf ); 00073 00074 // an empty line or comments 00075 if ( line=="" || line[0]=='#' ) 00076 continue; 00077 00078 // read the algorithm dsc files 00079 if ( readAlgorithmList ) 00080 { 00081 m_algorithmList.push_back ( line ); 00082 continue; 00083 } 00084 00085 // list of algorithm dsc files begins 00086 if ( line=="[ALGORITHMS]" ) 00087 { 00088 readAlgorithmList = true; 00089 continue; 00090 } 00091 00092 // split into 2 strings at the '=' char 00093 int pos = line.find ( "=" ); 00094 string name = line.substr ( 0, pos ); 00095 string value = line.substr ( pos+1 ); 00096 00097 // read the meta training values 00098 if ( name=="dataset" ) 00099 m_data->m_datasetName = value; 00100 if ( name=="isClassificationDataset" ) 00101 Framework::setDatasetType ( atoi ( value.c_str() ) ); 00102 if ( name=="maxThreads" ) 00103 { 00104 cout<<"Set max. threads in MKL and IPP: "<<atoi ( value.c_str() ) <<endl; 00105 mkl_set_num_threads ( atoi ( value.c_str() ) ); 00106 ippSetNumThreads ( atoi ( value.c_str() ) ); 00107 } 00108 if ( name=="maxThreadsInCross" ) 00109 { 00110 Framework::setMaxThreads ( atoi ( value.c_str() ) ); // store max. 
number of threads 00111 m_data->m_maxThreadsInCross = atoi ( value.c_str() ); // #threads in cross-fold-validation 00112 } 00113 if ( name=="nCrossValidation" ) 00114 { 00115 m_data->m_nCross = atoi ( value.c_str() ); 00116 cout<<"Train "<<m_data->m_nCross<<"-fold cross validation"<<endl; 00117 } 00118 if ( name=="validationType" ) 00119 { 00120 assert ( value=="Retraining" || value=="CrossFoldMean" || value=="Bagging" || value=="ValidationSet"); 00121 m_data->m_validationType = value; 00122 cout<<"ValidationType: "<<value<<endl; 00123 } 00124 if ( name=="positiveTarget" ) 00125 m_data->m_positiveTarget = atof ( value.c_str() ); 00126 if ( name=="negativeTarget" ) 00127 m_data->m_negativeTarget = atof ( value.c_str() ); 00128 if ( name=="standardDeviationMin" ) 00129 m_data->m_standardDeviationMin = atof ( value.c_str() ); 00130 if ( name=="randomSeed" ) 00131 { 00132 if ( value=="time(0)" ) 00133 m_data->m_randSeed = time ( 0 ); 00134 else 00135 m_data->m_randSeed = atoi ( value.c_str() ); 00136 cout<<"Set random seed to: "<<m_data->m_randSeed<<endl; 00137 setRandomSeed ( m_data->m_randSeed ); 00138 } 00139 if ( name=="nMixDataset" ) 00140 m_data->m_nMixDataset = atoi ( value.c_str() ); 00141 if ( name=="nMixTrainList" ) 00142 m_data->m_nMixTrainList = atoi ( value.c_str() ); 00143 if ( name=="blendingRegularization" ) 00144 m_data->m_blendingRegularization = atof ( value.c_str() ); 00145 if ( name=="blendingAlgorithm" ) 00146 m_data->m_blendingAlgorithm = value; 00147 if ( name=="blendingEnableCrossValidation" ) 00148 m_data->m_blendingEnableCrossValidation = atoi ( value.c_str() ); 00149 if ( name=="enablePostNNBlending" ) 00150 m_data->m_enablePostNNBlending = atoi ( value.c_str() ); 00151 if ( name=="enableCascadeLearning" ) 00152 m_data->m_enableCascadeLearning = atoi ( value.c_str() ); 00153 if ( name=="enableGlobalMeanStdEstimate" ) 00154 m_data->m_enableGlobalMeanStdEstimate = atoi ( value.c_str() ); 00155 if ( name=="enableSaveMemory" ) 00156 
m_data->m_enableSaveMemory = atoi ( value.c_str() ); 00157 if ( name=="errorFunction" ) 00158 m_data->m_errorFunction = value; 00159 if ( name=="enablePostBlendClipping" ) 00160 m_data->m_enablePostBlendClipping = atoi ( value.c_str() ); 00161 if ( name=="addOutputNoise" ) 00162 m_data->m_addOutputNoise = atof ( value.c_str() ); 00163 if ( name=="enableFeatureSelection" ) 00164 m_data->m_enableFeatureSelection = atoi ( value.c_str() ); 00165 if ( name=="featureSelectionWriteBinaryDataset" ) 00166 m_data->m_featureSelectionWriteBinaryDataset = atoi ( value.c_str() ); 00167 if ( name=="enableGlobalBlendingWeights" ) 00168 m_data->m_enableGlobalBlendingWeights = atoi ( value.c_str() ); 00169 if ( name=="disableWriteDscFile" ) 00170 { 00171 m_data->m_disableWriteDscFile = atoi ( value.c_str() ); 00172 if ( m_data->m_disableWriteDscFile ) 00173 cout.disableFileOutputs(); 00174 } 00175 if ( name=="enableStaticNormalization" ) 00176 m_data->m_enableStaticNormalization = atoi ( value.c_str() ); 00177 if ( name=="staticMeanNormalization" ) 00178 m_data->m_staticMeanNormalization = atof ( value.c_str() ); 00179 if ( name=="staticStdNormalization" ) 00180 m_data->m_staticStdNormalization = atof ( value.c_str() ); 00181 if ( name=="enableProbablisticNormalization" ) 00182 m_data->m_enableProbablisticNormalization = atoi ( value.c_str() ); 00183 if ( name=="dimensionalityReduction" ) 00184 m_data->m_dimensionalityReduction = value; 00185 if ( name=="subsampleTrainSet" ) 00186 m_data->m_subsampleTrainSet = atof ( value.c_str() ); 00187 if ( name=="subsampleFeatures" ) 00188 m_data->m_subsampleFeatures = atof ( value.c_str() ); 00189 if ( name=="globalTrainingLoops" ) 00190 m_data->m_globalTrainingLoops = atoi ( value.c_str() ); 00191 if ( name=="addConstantInput" ) 00192 m_data->m_addConstantInput = atoi ( value.c_str() ); 00193 } 00194 00195 fMaster.close(); 00196 }
void Scheduler::setPredictionModeInAlgorithm | ( | string | fname | ) | [private] |
Used in Prediction mode
Set the particular algorithm in the prediction mode
fname | The name of the dsc-file of the Algorithm |
Definition at line 1407 of file Scheduler.cpp.
01408 { 01409 cout<<"Prediction mode in algorithm:"<<fname<<endl; 01410 01411 // check the dsc file 01412 string algoName, id; 01413 checkAlgorithmTemplate ( fname, algoName, id ); 01414 01415 // read dsc file 01416 m_data->readDscFile ( fname ); 01417 01418 // make an instance of the algorithm and give him the data 01419 if ( m_data->m_validationType=="Retraining" || m_data->m_validationType=="ValidationSet") 01420 { 01421 Algorithm* algo = 0; 01422 algorithmDispatcher ( algo, algoName ); 01423 algo->setDataPointers ( m_data ); 01424 algo->setPredictionMode ( m_data->m_nCross ); 01425 01426 // add the algorithm to internal object list of algorithms 01427 m_algorithmObjectList.push_back ( algo ); 01428 } 01429 else if ( m_data->m_validationType=="CrossFoldMean" || m_data->m_validationType=="Bagging" ) 01430 { 01431 cout<<"Make "<<m_data->m_nCross<<" models ready to predict"<<endl; 01432 Algorithm** algoList = new Algorithm*[m_data->m_nCross]; 01433 for ( int i=0;i<m_data->m_nCross;i++ ) 01434 { 01435 Algorithm* algo = 0; 01436 algorithmDispatcher ( algo, algoName ); 01437 algo->setDataPointers ( m_data ); 01438 algo->setPredictionMode ( i ); 01439 algoList[i] = algo; 01440 } 01441 m_algorithmObjectListList.push_back ( algoList ); 01442 m_algorithmObjectList.push_back ( algoList[0] ); 01443 } 01444 else 01445 assert(false); 01446 01447 // check, if id already exist 01448 for ( int i=0;i<m_algorithmIDList.size();i++ ) 01449 if ( m_algorithmIDList[i] == atoi ( id.c_str() ) ) 01450 { 01451 cout<<"ID:"<<id<<" in "<<algoName<<" already exists"<<endl; 01452 assert ( false ); 01453 } 01454 01455 m_algorithmIDList.push_back ( atoi ( id.c_str() ) ); 01456 01457 m_algorithmNameList.push_back ( algoName ); 01458 01459 cout<<endl; 01460 }
void Scheduler::train | ( | ) |
Train the stack of Algorithms (each described in its own dsc file)
Definition at line 202 of file Scheduler.cpp.
00203 { 00204 Framework::setFrameworkMode ( 0 ); 00205 00206 cout<<"Start scheduled training"<<endl; 00207 00208 // fill the data object with the dataset 00209 cout<<"Fill data"<<endl; 00210 00211 // autoencoder file objects 00212 fstream fA0 ( ( m_data->m_datasetPath + "/" + m_data->m_tempPath + "/AutoencoderDataMean.dat" ).c_str(), ios::in ); 00213 fstream fA1 ( ( m_data->m_datasetPath + "/" + m_data->m_tempPath + "/AutoencoderDataStd.dat" ).c_str(), ios::in ); 00214 fstream fA2 ( ( m_data->m_datasetPath + "/" + m_data->m_tempPath + "/AutoencoderDataTest.dat" ).c_str(), ios::in ); 00215 fstream fA3 ( ( m_data->m_datasetPath + "/" + m_data->m_tempPath + "/AutoencoderDataTestTarget.dat" ).c_str(), ios::in ); 00216 fstream fA4 ( ( m_data->m_datasetPath + "/" + m_data->m_tempPath + "/AutoencoderDataTrain.dat" ).c_str(), ios::in ); 00217 fstream fA5 ( ( m_data->m_datasetPath + "/" + m_data->m_tempPath + "/AutoencoderDataTrainTarget.dat" ).c_str(), ios::in ); 00218 fstream fA6 ( ( m_data->m_datasetPath + "/" + m_data->m_tempPath + "/AutoencoderWeights.dat" ).c_str(), ios::in ); 00219 00220 bool autoencoderFilesOK = fA0.is_open() && fA1.is_open() && fA2.is_open() && fA3.is_open() && fA4.is_open() && fA5.is_open() && fA6.is_open(); 00221 00222 // perform: reduce the dimensionalty of data 00223 if ( m_data->m_dimensionalityReduction == "Autoencoder" && autoencoderFilesOK == false ) 00224 { 00225 cout<<"Autoencoder: start training"<<endl; 00226 00227 // fix random seed 00228 srand ( m_data->m_randSeed ); 00229 00230 // read dataset 00231 m_data->readDataset ( m_data->m_datasetName ); 00232 m_data->mergeTrainAndTest(); 00233 m_data->mixDataset(); 00234 00235 // prepare cross-fold validation 00236 m_data->allocMemForCrossValidationSets(); 00237 m_data->normalizeZeroOne(); 00238 00239 // train algorithm 00240 trainAlgorithm ( m_data->m_datasetPath + "/Autoencoder.dsc", m_data->m_datasetPath + "/" + string ( DSC_PATH ) + "/Autoencoder.dsc" ); 00241 00242 // clear mem 00243 
m_data->deleteMemory(); 00244 } 00245 00246 // optimize sequentially the whole ensemble 00247 // 1st run: build initial ensemble 00248 // 2nd...endRun: optimize each algorithm's metaparameters 00249 time_t totalTime = time(0); 00250 cout<<"globalTrainingLoops:"<<m_data->m_globalTrainingLoops<<endl; 00251 for ( int globalLoop=0;globalLoop<m_data->m_globalTrainingLoops;globalLoop++ ) 00252 { 00253 // train all template files 00254 for ( int i=0;i<m_algorithmList.size();i++ ) 00255 { 00256 //m_data->m_randSeed+=i; 00257 00258 // fix random seed 00259 srand ( m_data->m_randSeed ); 00260 00261 // read dataset 00262 if ( m_data->m_dimensionalityReduction == "Autoencoder" ) 00263 { 00264 Autoencoder a; 00265 a.setDataPointers ( m_data ); 00266 00267 // fix random seed 00268 srand ( m_data->m_randSeed ); 00269 00270 a.readDataset ( m_data, m_data->m_datasetName ); 00271 } 00272 else 00273 m_data->readDataset ( m_data->m_datasetName ); 00274 00275 // bagging: modify the trainset in retraining 00276 m_data->enableBagging ( m_baggingRun ); 00277 m_data->baggingRandomSeed ( m_randSeedBagBoost ); 00278 00279 // copy train data for later evaluation 00280 if ( m_boostingRun ) 00281 { 00282 if ( m_probs == 0 ) 00283 { 00284 cout<<"Init bootstrap probabilities to 1/N"<<endl; 00285 m_probs = new REAL[m_data->m_nTrain]; 00286 for ( int j=0;j<m_data->m_nTrain;j++ ) 00287 m_probs[j] = 1.0 / ( ( REAL ) m_data->m_nTrain ); 00288 } 00289 if ( m_boostingTrain==0 ) 00290 { 00291 cout<<"Copy train set (features + targets) to boosting trainset"<<endl; 00292 m_boostingNTrain = m_data->m_nTrain; 00293 m_boostingTrain = new REAL[m_data->m_nTrain*m_data->m_nFeatures]; 00294 m_boostingTargets = new REAL[m_data->m_nTrain*m_data->m_nClass*m_data->m_nDomain]; 00295 memcpy ( m_boostingTrain, m_data->m_trainOrig, sizeof ( REAL ) *m_data->m_nTrain*m_data->m_nFeatures ); 00296 memcpy ( m_boostingTargets, m_data->m_trainTargetOrig, sizeof ( REAL ) *m_data->m_nTrain*m_data->m_nClass*m_data->m_nDomain ); 
00297 } 00298 00299 if ( m_boostingEpoch > 0 ) 00300 m_data->doBootstrapSampling ( m_probs,m_data->m_trainOrig,m_data->m_trainTargetOrig,m_data->m_trainTargetOrigEffect,m_data->m_trainTargetOrigResidual,m_data->m_trainLabelOrig ); 00301 } 00302 00303 srand ( m_data->m_randSeed ); 00304 00305 // set the list of already trained predictors 00306 if ( globalLoop == 0 ) 00307 m_data->setAlgorithmList ( vector<string> ( m_algorithmList.begin(), m_algorithmList.begin() +i ) ); 00308 else 00309 { 00310 vector<string> tmp; 00311 for ( int j=0;j<m_algorithmList.size();j++ ) 00312 if ( j != i ) 00313 tmp.push_back ( m_algorithmList[j] ); 00314 m_data->setAlgorithmList ( tmp ); 00315 } 00316 00317 time_t beginTime = time ( 0 ); 00318 00319 // extend input features with previous predictions 00320 if ( m_data->m_enableCascadeLearning ) 00321 { 00322 if(m_data->m_validationType=="ValidationSet") 00323 assert(false); 00324 m_data->fillCascadeLearningInputs(); 00325 m_data->extendTrainDataWithCascadeInputs(); 00326 } 00327 00328 m_data->allocMemForCrossValidationSets(); 00329 00330 // algorithm dsc file (template) 00331 string fAlgoTemplateName = m_data->m_datasetPath + "/" + m_algorithmList[i]; 00332 string fAlgoName = m_data->m_datasetPath + "/" + string ( DSC_PATH ) + "/" + m_algorithmList[i]; 00333 /*fstream fAlgoTemplate(fAlgoTemplateName.c_str(), ios::in); 00334 00335 // open the dsc file 00336 fstream fAlgo(fAlgoName.c_str(), ios::out); 00337 00338 cout<<"AlgoTemplate:"<<fAlgoTemplateName<<" Algo:"<<fAlgoName<<endl; 00339 00340 // copy the content from the template to the dsc file 00341 char buf[1024]; 00342 while(fAlgoTemplate.getline(buf, 1024)) // read all lines 00343 { 00344 string line = string(buf); 00345 fAlgo<<line<<endl; 00346 } 00347 00348 fAlgoTemplate.close(); 00349 fAlgo.close(); 00350 00351 // redirect cout to filename 00352 cout.setOutputFile(fAlgoName); 00353 00354 cout<<"Floating point precision: "<<(int)sizeof(REAL)<<" Bytes"<<endl; 00355 */ 00356 // 
=========================== train the algorithm =========================== 00357 00358 if ( globalLoop > 0 ) 00359 m_data->m_loadWeightsBeforeTraining = true; 00360 00361 trainAlgorithm ( fAlgoTemplateName, fAlgoName ); 00362 cout<<"Finished in "<<time ( 0 )-beginTime<<"[s]"<<endl; 00363 00364 // clear file redirection of cout<< 00365 cout.setOutputFile ( "" ); 00366 00367 // clear mem 00368 m_data->deleteMemory(); 00369 } 00370 } 00371 00372 cout<<"Total training time:"<<time(0)-totalTime<<"[s]"<<endl; 00373 00374 if ( m_data->m_enablePostNNBlending ) 00375 { 00376 // set the list of already trained predictors 00377 m_data->setAlgorithmList ( vector<string> ( m_algorithmList.begin(), m_algorithmList.end() ) ); 00378 00379 // fix random seed 00380 srand ( m_data->m_randSeed ); 00381 00382 // read dataset 00383 m_data->readDataset ( m_data->m_datasetName ); 00384 srand ( m_data->m_randSeed ); 00385 m_data->allocMemForCrossValidationSets(); 00386 m_data->partitionDatasetToCrossValidationSets(); 00387 00388 m_data->readDscFile ( m_data->m_datasetPath + "/NeuralNetwork.dscblend" ); 00389 00390 BlendingNN nn; 00391 nn.setDataPointers ( m_data ); 00392 nn.readSpecificMaps(); 00393 nn.init(); 00394 nn.train(); 00395 00396 // clear mem 00397 m_data->deleteMemory(); 00398 } 00399 }
void Scheduler::trainAlgorithm | ( | string | fnameTemplate, | |
string | fnameDsc | |||
) | [private] |
Start the training of the particular Algorithm
fnameTemplate | Template dsc file name of the algorithm | fnameDsc | Dsc file name written (and logged to) during training |
Definition at line 1085 of file Scheduler.cpp.
01086 { 01087 cout<<"Train algorithm:"<<fnameTemplate<<endl; 01088 01089 string algoName, id; 01090 checkAlgorithmTemplate ( fnameTemplate, algoName, id ); 01091 01092 // read dsc file 01093 m_data->readDscFile ( fnameTemplate ); 01094 if ( m_data->m_disableTraining ) 01095 { 01096 cout<<"Training disabled."<<endl; 01097 return; 01098 } 01099 01100 // copy the content of the template to the dsc file 01101 fstream fAlgoTemplate ( fnameTemplate.c_str(), ios::in ); 01102 fstream fAlgo ( fnameDsc.c_str(), ios::out ); 01103 cout<<"AlgoTemplate:"<<fnameTemplate<<" Algo:"<<fnameDsc<<endl; 01104 char buf[1024]; 01105 while ( fAlgoTemplate.getline ( buf, 1024 ) ) // read all lines 01106 { 01107 string line = string ( buf ); 01108 fAlgo<<line<<endl; 01109 } 01110 fAlgoTemplate.close(); 01111 fAlgo.close(); 01112 01113 // redirect cout to filename 01114 cout.setOutputFile ( fnameDsc ); 01115 01116 cout<<"Floating point precision: "<< ( int ) sizeof ( REAL ) <<" Bytes"<<endl; 01117 01118 m_data->partitionDatasetToCrossValidationSets(); 01119 01120 // start the algorithm 01121 Algorithm* algo = 0; 01122 algorithmDispatcher ( algo, algoName ); 01123 algo->setDataPointers ( m_data ); 01124 01125 if ( m_data->m_enableFeatureSelection ) 01126 { 01127 algo->doFeatureSelection(); 01128 exit ( 0 ); 01129 } 01130 else 01131 algo->train(); 01132 01133 if ( algo ) 01134 { 01135 cout<<"delete algo"<<endl; 01136 delete algo; 01137 } 01138 algo = 0; 01139 cout<<"Finished train algorithm:"<<fnameTemplate<<endl; 01140 01141 }