Data.cpp

00001 #include "Data.h"
00002 
00003 extern StreamOutput cout;
00004 
/**
 * Constructor. Logs creation and zero-initializes every member, so that
 * deleteMemory() is safe to call even before any buffer was allocated
 * (all owned pointers start as 0).
 */
Data::Data()
{
    cout<<"Constructor Data"<<endl;

    // init member vars
    // --- run configuration ---
    m_algorithmID = 0;
    m_randSeed = 0;
    m_nMixDataset = 0;
    m_nMixTrainList = 0;
    m_nCross = 0;
    m_validationType = "Retraining";
    m_maxThreadsInCross = 0;
    m_enableGlobalMeanStdEstimate = 0;
    m_positiveTarget = 0;
    m_negativeTarget = 0;
    // --- blending / cascade options ---
    m_blendingRegularization = 0;
    m_enableGlobalBlendingWeights = 0;
    m_blendingEnableCrossValidation = 0;
    m_enablePostNNBlending = 0;
    m_enableCascadeLearning = 0;
    m_nCascadeInputs = 0;
    m_cascadeInputs = 0;
    // --- dataset dimensions ---
    m_nFeatures = 0;
    m_nClass = 0;
    m_nDomain = 0;
    // --- shuffle / cross-validation index buffers (owned pointers) ---
    m_mixDatasetIndices = 0;
    m_mixList = 0;
    m_crossIndex = 0;
    // --- original (full) train set buffers (owned pointers) ---
    m_nTrain = 0;
    m_trainOrig = 0;
    m_trainTargetOrig = 0;
    m_trainTargetOrigEffect = 0;
    m_trainTargetOrigResidual = 0;
    m_trainLabelOrig = 0;
    m_trainBaggingIndex = 0;
    // --- original test set buffers (owned pointers) ---
    m_nTest = 0;
    m_testOrig = 0;
    m_testTargetOrig = 0;
    m_testLabelOrig = 0;
    m_slotBoundaries = 0;
    // --- per-fold train/probe/validation buffers (owned pointer arrays) ---
    m_trainSize = 0;
    m_train = 0;
    m_trainTarget = 0;
    m_trainTargetEffect = 0;
    m_trainTargetResidual = 0;
    m_trainLabel = 0;
    m_probeSize = 0;
    m_probe = 0;
    m_probeTarget = 0;
    m_probeTargetEffect = 0;
    m_probeTargetResidual = 0;
    m_probeLabel = 0;
    m_probeIndex = 0;
    m_validSize = 0;
    m_valid = 0;
    m_validTarget = 0;
    m_validLabel = 0;
    // --- normalization statistics ---
    m_mean = 0;
    m_std = 0;
    m_standardDeviationMin = 0;
    m_targetMean = 0;
    // --- misc switches and defaults ---
    m_enableSaveMemory = 0;
    m_support = 0;
    m_enablePostBlendClipping = 0;
    m_addOutputNoise = 0;
    m_enableFeatureSelection = 0;
    m_featureSelectionWriteBinaryDataset = 0;
    m_enableBagging = 0;
    m_randomSeedBagging = 0;
    m_enableStaticNormalization = 0;
    m_staticMeanNormalization = 0.0;
    m_staticStdNormalization = 1.0;  // std of 1.0: identity normalization by default
    m_enableProbablisticNormalization = 0;
    m_dimensionalityReduction = "";
    m_subsampleTrainSet = 1.0;       // 1.0 = use the full training set
    m_subsampleFeatures = 1.0;       // 1.0 = use all features
    m_disableTraining = false;
    m_globalTrainingLoops = 1;
    m_addConstantInput = 0;
    m_loadWeightsBeforeTraining = false;
}
00089 
/**
 * Destructor. Only logs destruction.
 * NOTE(review): it does not call deleteMemory(); owners are expected to free
 * the buffers explicitly before the object dies — confirm against callers.
 */
Data::~Data()
{
    cout<<"destructor Data"<<endl;

}
00098 
00104 void Data::deleteMemory()
00105 {
00106     cout<<"Delete internal memory"<<endl;
00107 
00108     // memory from dataset
00109     if ( m_trainOrig )
00110         delete[] m_trainOrig;
00111     m_trainOrig = 0;
00112     if ( m_trainTargetOrig )
00113         delete[] m_trainTargetOrig;
00114     m_trainTargetOrig = 0;
00115     if ( m_trainLabelOrig )
00116         delete[] m_trainLabelOrig;
00117     m_trainLabelOrig = 0;
00118     if ( m_testOrig )
00119         delete[] m_testOrig;
00120     m_testOrig = 0;
00121     if ( m_testTargetOrig )
00122         delete[] m_testTargetOrig;
00123     m_testTargetOrig = 0;
00124     if ( m_testLabelOrig )
00125         delete[] m_testLabelOrig;
00126     m_testLabelOrig = 0;
00127 
00128     // memory from cross validation
00129     if ( m_mean )
00130         delete[] m_mean;
00131     m_mean = 0;
00132     if ( m_std )
00133         delete[] m_std;
00134     m_std = 0;
00135     if ( m_trainTargetOrigEffect )
00136         delete[] m_trainTargetOrigEffect;
00137     m_trainTargetOrigEffect = 0;
00138     if ( m_trainTargetOrigResidual )
00139         delete[] m_trainTargetOrigResidual;
00140     m_trainTargetOrigResidual = 0;
00141 
00142     for ( int i=0;i<m_nCross+1;i++ )
00143     {
00144         if ( m_train )
00145         {
00146             if ( m_train[i] )
00147                 delete[] m_train[i];
00148             m_train[i] = 0;
00149         }
00150         if ( m_trainTarget )
00151         {
00152             if ( m_trainTarget[i] )
00153                 delete[] m_trainTarget[i];
00154             m_trainTarget[i] = 0;
00155         }
00156         if ( m_trainTargetEffect )
00157         {
00158             if ( m_trainTargetEffect[i] )
00159                 delete[] m_trainTargetEffect[i];
00160             m_trainTargetEffect[i] = 0;
00161         }
00162         if ( m_trainTargetResidual )
00163         {
00164             if ( m_trainTargetResidual[i] )
00165                 delete[] m_trainTargetResidual[i];
00166             m_trainTargetResidual[i] = 0;
00167         }
00168         if ( m_trainLabel )
00169         {
00170             if ( m_trainLabel[i] )
00171                 delete[] m_trainLabel[i];
00172             m_trainLabel[i] = 0;
00173         }
00174         if ( m_validationType == "Bagging" )
00175         {
00176             if( m_trainBaggingIndex )
00177             {
00178                 if ( m_trainBaggingIndex[i] )
00179                     delete[] m_trainBaggingIndex[i];
00180                 m_trainBaggingIndex[i] = 0;
00181             }
00182         }
00183         if ( m_probe )
00184         {
00185             if ( m_probe[i] )
00186                 delete[] m_probe[i];
00187             m_probe[i] = 0;
00188         }
00189         if ( m_probeTarget )
00190         {
00191             if ( m_probeTarget[i] )
00192                 delete[] m_probeTarget[i];
00193             m_probeTarget[i] = 0;
00194         }
00195         if ( m_probeTargetEffect )
00196         {
00197             if ( m_probeTargetEffect[i] )
00198                 delete[] m_probeTargetEffect[i];
00199             m_probeTargetEffect[i] = 0;
00200         }
00201         if ( m_probeTargetResidual )
00202         {
00203             if ( m_probeTargetResidual[i] )
00204                 delete[] m_probeTargetResidual[i];
00205             m_probeTargetResidual[i] = 0;
00206         }
00207         if ( m_probeLabel )
00208         {
00209             if ( m_probeLabel[i] )
00210                 delete[] m_probeLabel[i];
00211             m_probeLabel[i] = 0;
00212         }
00213         if ( m_probeIndex )
00214         {
00215             if ( m_probeIndex[i] )
00216                 delete[] m_probeIndex[i];
00217             m_probeIndex[i] = 0;
00218         }
00219     }
00220     if ( m_train )
00221         delete[] m_train;
00222     m_train = 0;
00223     if ( m_trainTarget )
00224         delete[] m_trainTarget;
00225     m_trainTarget = 0;
00226     if ( m_trainTargetEffect )
00227         delete[] m_trainTargetEffect;
00228     m_trainTargetEffect = 0;
00229     if ( m_trainTargetResidual )
00230         delete[] m_trainTargetResidual;
00231     m_trainTargetResidual = 0;
00232     if ( m_trainLabel )
00233         delete[] m_trainLabel;
00234     m_trainLabel = 0;
00235     if(m_validationType == "Bagging")
00236     {
00237         if(m_trainBaggingIndex)
00238             delete[] m_trainBaggingIndex;
00239         m_trainBaggingIndex = 0;
00240     }
00241     if ( m_probe )
00242         delete[] m_probe;
00243     m_probe = 0;
00244     if ( m_probeTarget )
00245         delete[] m_probeTarget;
00246     m_probeTarget = 0;
00247     if ( m_probeTargetEffect )
00248         delete[] m_probeTargetEffect;
00249     m_probeTargetEffect = 0;
00250     if ( m_probeTargetResidual )
00251         delete[] m_probeTargetResidual;
00252     m_probeTargetResidual = 0;
00253     if ( m_probeLabel )
00254         delete[] m_probeLabel;
00255     m_probeLabel = 0;
00256     if ( m_probeIndex )
00257         delete[] m_probeIndex;
00258     m_probeIndex = 0;
00259 
00260     if ( m_trainSize )
00261         delete[] m_trainSize;
00262     m_trainSize = 0;
00263     if ( m_probeSize )
00264         delete[] m_probeSize;
00265     m_probeSize = 0;
00266 
00267     if ( m_mixDatasetIndices )
00268         delete[] m_mixDatasetIndices;
00269     m_mixDatasetIndices = 0;
00270     if ( m_mixList )
00271         delete[] m_mixList;
00272     m_mixList = 0;
00273     if ( m_slotBoundaries )
00274         delete[] m_slotBoundaries;
00275     m_slotBoundaries = 0;
00276     if ( m_crossIndex )
00277         delete[] m_crossIndex;
00278     m_crossIndex = 0;
00279 
00280     if ( m_cascadeInputs )
00281         delete[] m_cascadeInputs;
00282     m_cascadeInputs = 0;
00283     
00284     if ( m_targetMean )
00285         delete[] m_targetMean;
00286     m_targetMean = 0;
00287 
00288 }
00289 
/**
 * Load the dataset selected by its identifier string into the original
 * train/test buffers (m_trainOrig, m_testOrig, targets and labels).
 *
 * Every branch constructs a DatasetReader and calls its dataset-specific
 * reader; all output arguments are passed by reference and the memory is
 * allocated inside the DatasetReader. The reader also fills the dataset
 * bounds (m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures) and the
 * positive/negative target values.
 *
 * After reading, the method optionally appends a constant input feature,
 * subsamples rows/features, optionally writes a feature-selected binary
 * dataset (and exits), and finally shuffles the training set.
 *
 * @param name  Dataset identifier (e.g. "MNIST", "CSV", "ARFF", ...).
 *              Unknown names abort the program.
 */
void Data::readDataset ( string name )
{
    // read MNIST
    if ( name == "MNIST" )
    {
        DatasetReader r;
        r.readMNIST ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget );
    }
    else if ( name == "NETFLIX" ) // read Netflix
    {
        DatasetReader r;
        r.readNETFLIX ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget );
    }
    else if ( name == "AusDM2009" ) // read AusDM2009
    {
        DatasetReader r;
        r.readAusDM2009 ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget );
    }
    else if ( name == "KDDCup09Large" ) // read large KDDCup09large dataset
    {
        DatasetReader r;
        r.readKDDCup09Large ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget );
    }
    else if ( name == "KDDCup09Small" ) // read large KDDCup09small dataset
    {
        DatasetReader r;
        r.readKDDCup09Small ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget );
    }
    else if ( name == "BINARY" ) // read binary format dataset
    {
        DatasetReader r;
        r.readBINARY ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget );
    }
    else if ( name == "CSV" ) // read csv format dataset
    {
        DatasetReader r;
        r.readCSV ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget );
    }
    else if ( name == "ARFF" ) // read arff format dataset
    {
        DatasetReader r;
        r.readARFF ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget );
    }
    else if ( name == "PRUDSYS_DMC2009" ) // read PRUDSYS_DMC2009 dataset
    {
        DatasetReader r;
        r.readPRUDSYS ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget );
    }
    else if ( name == "ADULT" ) // read adult dataset
    {
        DatasetReader r;
        r.readADULT ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget );
    }
    else if ( name == "AUSTRALIAN" ) // read australian dataset
    {
        DatasetReader r;
        r.readAUSTRALIAN ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget );
    }
    else if ( name == "BALANCE" ) // read balance dataset
    {
        DatasetReader r;
        r.readBALANCE ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget );
    }
    else if ( name == "CYLINDER-BANDS" ) // read cylinder-bands dataset
    {
        DatasetReader r;
        r.readCYLINDERBANDS ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget );
    }
    else if ( name == "BREAST" ) // read breast-cancer dataset
    {
        DatasetReader r;
        r.readBREASTCANCERWISCONSIN ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget );
    }
    else if ( name == "CREDIT" ) // read australian-credit dataset
    {
        DatasetReader r;
        r.readAUSTRALIANCREDIT ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget );
    }
    else if ( name == "DIABETES" ) // read diabetes dataset
    {
        DatasetReader r;
        r.readDIABETES ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget );
    }
    else if ( name == "GERMAN" ) // read german dataset
    {
        DatasetReader r;
        r.readGERMAN ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget );
    }
    else if ( name == "GLASS" ) // read glass dataset
    {
        DatasetReader r;
        r.readGLASS ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget );
    }
    else if ( name == "HEART-SPECTF" ) // read heart dataset
    {
        DatasetReader r;
        r.readHEART ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget );
    }
    else if ( name == "HEPATITIS" ) // read hepatitis dataset
    {
        DatasetReader r;
        r.readHEPATITIS ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget );
    }
    else if ( name == "IONOSPHERE" ) // read ionophsere dataset
    {
        DatasetReader r;
        r.readIONOSPHERE ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget );
    }
    else if ( name == "IRIS" ) // read iris dataset
    {
        DatasetReader r;
        r.readIRIS ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget );
    }
    else if ( name == "LETTER" ) // read letter dataset
    {
        DatasetReader r;
        r.readLETTER ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget );
    }
    else if ( name == "MONKS-1" ) // read monks1 dataset
    {
        DatasetReader r;
        r.readMONKS1 ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget );
    }
    else if ( name == "MONKS-2" ) // read monks2 dataset
    {
        DatasetReader r;
        r.readMONKS2 ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget );
    }
    else if ( name == "MONKS-3" ) // read monks3 dataset
    {
        DatasetReader r;
        r.readMONKS3 ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget );
    }
    else if ( name == "MUSHROOM" ) // read mushroom dataset
    {
        DatasetReader r;
        r.readMUSHROOM ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget );
    }
    else if ( name == "SATIMAGE" ) // read satimage dataset
    {
        DatasetReader r;
        r.readSATIMAGE ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget );
    }
    else if ( name == "SEGMENTATION" ) // read segmentation dataset
    {
        DatasetReader r;
        r.readSEGMENTATION ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget );
    }
    else if ( name == "SONAR" ) // read sonar dataset
    {
        DatasetReader r;
        r.readSONAR ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget );
    }
    else if ( name == "VEHICLE" ) // read vehicle dataset
    {
        DatasetReader r;
        r.readVEHICLE ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget );
    }
    else if ( name == "VOTES" ) // read votes dataset
    {
        DatasetReader r;
        r.readVOTES ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget );
    }
    else if ( name == "WINE" ) // read wine dataset
    {
        DatasetReader r;
        r.readWINE ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget );
    }
    else if ( name == "POKER" ) // read poker dataset
    {
        DatasetReader r;
        r.readPOKER ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget );
    }
    else if ( name == "YEAST" ) // read yeast dataset
    {
        DatasetReader r;
        r.readYEAST ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget );
    }
    else if ( name == "SURVIVAL" ) // read survival dataset
    {
        DatasetReader r;
        r.readSURVIVAL ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget );
    }
    else if ( name == "SPIDER" ) // read (generated by)spider dataset
    {
        DatasetReader r;
        r.readSPIDER ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget );
    }
    else
    {
        cout<<"Dataset not found:"<<name<<endl;
        exit ( 0 );
    }

    // optionally append a constant-valued input feature
    if(m_addConstantInput)
        addConstantInput();
    
    // reduce the size of the training set (1.0 = keep everything)
    reduceTrainingSetSize ( m_subsampleTrainSet );

    // reduce the number of features; the test set uses the original feature
    // count (nFeatOrig) because reduceFeatureSize updates m_nFeatures in place
    int nFeatOrig = m_nFeatures;
    reduceFeatureSize ( m_trainOrig, m_nTrain, m_nFeatures, m_subsampleFeatures, Framework::getFrameworkMode() );
    reduceFeatureSize ( m_testOrig, m_nTest, nFeatOrig, m_subsampleFeatures, true );

    // feature selection, based on a linear model: write the reduced binary
    // dataset and terminate the program
    if ( m_featureSelectionWriteBinaryDataset )
    {
        makeBinaryDataset();
        exit ( 0 );
    }

    // mix (shuffle) train features and labels
    mixDataset();
}
00548 
/**
 * Draw a bootstrap sample (sampling with replacement) of the training set, or
 * a simple random subset when 0 < nTrainNew < m_nTrain. The sampled rows are
 * written to freshly allocated buffers, and the reference parameters are
 * re-pointed to them on return.
 *
 * @param probs      Optional per-row sampling weights (0 = uniform sampling).
 * @param train      [in/out] feature matrix (m_nTrain x m_nFeatures).
 * @param target     [in/out] target matrix (m_nTrain x m_nClass*m_nDomain).
 * @param targetEff  [in/out] target "effect" matrix, may be 0.
 * @param targetRes  [in/out] target residual matrix, may be 0.
 * @param label      [in/out] class labels (m_nTrain x m_nDomain).
 * @param nTrainNew  0 or >= m_nTrain: full bootstrap sample; otherwise the
 *                   size of a random subset without replacement.
 *
 * NOTE(review): the buffers the references pointed to on entry are NOT freed
 * before reassignment — this leaks unless the caller keeps and frees the old
 * pointers. TODO confirm against callers.
 * NOTE(review): label is reassigned unconditionally, but labelNew is only
 * allocated when Framework::getDatasetType()==true — for other dataset types
 * label becomes 0 on return; verify callers expect this.
 */
void Data::doBootstrapSampling ( REAL* probs, REAL* &train, REAL* &target, REAL* &targetEff, REAL* &targetRes, int* &label, int nTrainNew )
{
    cout<<endl<<"Do boostrap sampling of the dataset (size:"<<m_nTrain<<")"<<endl;
    cout<<"Random seed:"<<m_randomSeedBagging<<endl;
    srand ( m_randomSeedBagging );

    if ( nTrainNew > 0 && nTrainNew < m_nTrain )
        cout<<"Draw not a boostrap sample, make a simple random subset ("<<100.0* ( double ) nTrainNew/ ( double ) m_nTrain<<"%)"<<endl;

    // allocate output buffers; sized for the full training set even when only
    // a subset is drawn (sampleCnt rows are actually filled)
    REAL* trainNew = 0, *ptr0, *ptr1;
    if ( train )
        trainNew = new REAL[m_nFeatures*m_nTrain];
    REAL* targetNew = 0;
    if ( target )
        targetNew = new REAL[m_nClass*m_nDomain*m_nTrain];
    REAL* targetEffNew = 0;
    if ( targetEff )
        targetEffNew = new REAL[m_nClass*m_nDomain*m_nTrain];
    REAL* targetResNew = 0;
    if ( targetRes )
        targetResNew = new REAL[m_nClass*m_nDomain*m_nTrain];
    int* labelNew = 0;
    if ( Framework::getDatasetType() ==true )
        labelNew = new int[m_nDomain*m_nTrain];
    // replicateCnt[i] = how often row i was drawn (used for stats and to
    // enforce uniqueness in the subset case)
    int* replicateCnt = new int[m_nTrain];
    for ( int i=0;i<m_nTrain;i++ )
        replicateCnt[i] = 0;

    int sampleCnt = 0;
    while ( ( sampleCnt < m_nTrain && nTrainNew == 0 ) || ( sampleCnt < nTrainNew && nTrainNew > 0 && nTrainNew < m_nTrain ) )
        //for(int i=0;i<m_nTrain;i++)
    {
        // random index
        int ind;
        if ( nTrainNew == 0 || nTrainNew >= m_nTrain ) // boostrap sample
        {
            if ( probs == 0 )
                ind = rand() %m_nTrain;
            else
                ind = vectorSampling ( probs, m_nTrain );  // weighted draw
        }
        else  // random subset: redraw until an unused row is found
        {
            ind = rand() %m_nTrain;
            while ( replicateCnt[ind] )
                ind = rand() %m_nTrain;
        }
        replicateCnt[ind]++;

        // train features: copy row ind to output row sampleCnt
        if ( train )
        {
            ptr0 = train + ind * m_nFeatures;
            ptr1 = trainNew + sampleCnt * m_nFeatures;
            for ( int j=0;j<m_nFeatures;j++ )
                ptr1[j] = ptr0[j];
        }

        // targets
        if ( target )
        {
            ptr0 = target + ind * m_nClass*m_nDomain;
            ptr1 = targetNew + sampleCnt * m_nClass*m_nDomain;
            for ( int j=0;j<m_nClass*m_nDomain;j++ )
                ptr1[j] = ptr0[j];
        }

        // effects
        if ( targetEff )
        {
            ptr0 = targetEff + ind * m_nClass*m_nDomain;
            ptr1 = targetEffNew + sampleCnt * m_nClass*m_nDomain;
            for ( int j=0;j<m_nClass*m_nDomain;j++ )
                ptr1[j] = ptr0[j];
        }

        // residual
        if ( targetRes )
        {
            ptr0 = targetRes + ind * m_nClass*m_nDomain;
            ptr1 = targetResNew + sampleCnt * m_nClass*m_nDomain;
            for ( int j=0;j<m_nClass*m_nDomain;j++ )
                ptr1[j] = ptr0[j];
        }

        // train label (classification-type datasets only)
        if ( Framework::getDatasetType() ==true )
            for ( int d=0;d<m_nDomain;d++ )
                labelNew[d+sampleCnt*m_nDomain] = label[d+ind*m_nDomain];

        sampleCnt++;
    }

    // report how many rows were unused / drawn once / drawn multiple times
    int nonReplicates = 0, notUsed = 0, replicates = 0;
    for ( int i=0;i<m_nTrain;i++ )
    {
        if ( replicateCnt[i] == 0 )
            notUsed++;
        if ( replicateCnt[i] == 1 )
            nonReplicates++;
        if ( replicateCnt[i] > 1 )
            replicates++;
    }
    cout<<"notUsed:"<<notUsed<<" nonReplicates:"<<nonReplicates<<" replicates:"<<replicates;
    cout<<" ("<<100.0* ( REAL ) ( nonReplicates+replicates ) / ( REAL ) m_nTrain<<"%)"<<endl<<endl;

    delete[] replicateCnt;

    // set new data (see NOTE(review) in the header about the old buffers)
    train = trainNew;
    target = targetNew;
    targetEff = targetEffNew;
    targetRes = targetResNew;
    label = labelNew;
}
00672 
00680 int Data::vectorSampling ( REAL* probs, int length )
00681 {
00682     double sum = 0.0;
00683     for ( int i=0;i<length;i++ )
00684         sum += probs[i];
00685 
00686     double value = sum * ( ( double ) rand() / ( double ) RAND_MAX );
00687 
00688     sum = 0.0;
00689     for ( int i=0;i<length;i++ )
00690     {
00691         sum += probs[i];
00692         if ( sum >= value )
00693             return i;
00694     }
00695     cout<<"value:"<<value<<endl<<"length:"<<length<<endl<<"sum:"<<sum<<endl;
00696     for ( int i=0;i<length;i++ )
00697         cout<<probs[i]<<" "<<flush;
00698     assert ( false );
00699     return -1;
00700 }
00701 
00707 void Data::makeBinaryDataset()
00708 {
00709     cout<<endl;
00710     cout<<"Make binary dataset from selected features"<<endl;
00711     cout<<"Open features:"<<FEATURE_TXT_FILE<<endl;
00712 
00713     // read features from txt file
00714     fstream f;
00715     vector<int> features;
00716     f.open ( FEATURE_TXT_FILE,ios::in );
00717     if ( f.is_open() ==false )
00718         assert ( false );
00719     int value, nValidFeatures = 0;
00720     while ( f>>value )
00721         features.push_back ( value );
00722     f.close();
00723 
00724     // check featureIDs
00725     for ( int j=0;j<features.size();j++ )
00726         if ( features[j] >= m_nFeatures || features[j] == -1 )
00727             assert ( false );
00728         else
00729             nValidFeatures++;
00730 
00731     cout<<"nValidFeatures:"<<nValidFeatures<<endl;
00732     REAL* feat;
00733     int* label, N;
00734 
00735     if ( Framework::getFrameworkMode() == 1 )
00736     {
00737         cout<<"Write: binary.test"<<endl;
00738         f.open ( "binary.test", ios::out );
00739         feat = m_testOrig;
00740         label = m_testLabelOrig;
00741         N = m_nTest;
00742     }
00743     else
00744     {
00745         cout<<"Write: binary.train"<<endl;
00746         f.open ( "binary.train", ios::out );
00747         feat = m_trainOrig;
00748         label = m_trainLabelOrig;
00749         N = m_nTrain;
00750     }
00751 
00752     cout<<"#lines:"<<N<<endl;
00753 
00754     // dataset bounds
00755     f.write ( ( char* ) &N, sizeof ( int ) );
00756     f.write ( ( char* ) &m_nClass, sizeof ( int ) );
00757     f.write ( ( char* ) &m_nDomain, sizeof ( int ) );
00758     f.write ( ( char* ) &nValidFeatures, sizeof ( int ) );
00759 
00760     // write features
00761     for ( int i=0;i<N;i++ )
00762         for ( int j=0;j<features.size();j++ )
00763             f.write ( ( char* ) & ( feat[i*m_nFeatures + features[j]] ), sizeof ( REAL ) );
00764 
00765     // write labels
00766     f.write ( ( char* ) label, sizeof ( int ) *N*m_nDomain );
00767     f.close();
00768 
00769 }
00770 
00775 void Data::mixDataset()
00776 {
00777     if ( m_nTrain )
00778     {
00779         m_mixDatasetIndices = new int[m_nTrain];
00780         for ( int i=0;i<m_nTrain;i++ )
00781             m_mixDatasetIndices[i] = i;
00782     }
00783     else
00784     {
00785         cout<<"Do no mix the dataset."<<endl;
00786         m_mixDatasetIndices = 0;
00787         return;
00788     }
00789     cout<<"Randomize the dataset: "<<m_nMixDataset*m_nTrain<<" line swaps [";
00790 
00791     int progress = m_nTrain*m_nMixDataset/10 + 1;
00792     REAL* tmp0 = new REAL[m_nFeatures];
00793     REAL* tmp1 = new REAL[m_nClass*m_nDomain];
00794     for ( int i=0;i<m_nTrain*m_nMixDataset;i++ )
00795     {
00796         if ( i%progress==0 )
00797             cout<<"."<<flush;
00798 
00799         // random index swaps
00800         int ind0 = rand() %m_nTrain;
00801         int ind1 = rand() %m_nTrain;
00802 
00803         // train features (REAL*)
00804         REAL* ptr0 = m_trainOrig + ind0 * m_nFeatures;
00805         REAL* ptr1 = m_trainOrig + ind1 * m_nFeatures;
00806         for ( int j=0;j<m_nFeatures;j++ )
00807         {
00808             tmp0[j] = ptr0[j];
00809             ptr0[j] = ptr1[j];
00810             ptr1[j] = tmp0[j];
00811         }
00812 
00813         // train targets (REAL*)
00814         ptr0 = m_trainTargetOrig + ind0 * m_nClass * m_nDomain;
00815         ptr1 = m_trainTargetOrig + ind1 * m_nClass * m_nDomain;
00816         for ( int j=0;j<m_nClass*m_nDomain;j++ )
00817         {
00818             tmp1[j] = ptr0[j];
00819             ptr0[j] = ptr1[j];
00820             ptr1[j] = tmp1[j];
00821         }
00822 
00823         // train label
00824         if ( Framework::getDatasetType() ==true )
00825         {
00826             for ( int d=0;d<m_nDomain;d++ )
00827             {
00828                 int tmp = m_trainLabelOrig[d+ind0*m_nDomain];
00829                 m_trainLabelOrig[d+ind0*m_nDomain] = m_trainLabelOrig[d+ind1*m_nDomain];
00830                 m_trainLabelOrig[d+ind1*m_nDomain] = tmp;
00831             }
00832         }
00833 
00834         // index
00835         int tmp = m_mixDatasetIndices[ind0];
00836         m_mixDatasetIndices[ind0] = m_mixDatasetIndices[ind1];
00837         m_mixDatasetIndices[ind1] = tmp;
00838     }
00839     if ( tmp0 )
00840         delete[] tmp0;
00841     tmp0 = 0;
00842     if ( tmp1 )
00843         delete[] tmp1;
00844     tmp1 = 0;
00845 
00846     cout<<"] "<<"mixInd[0]:"<<m_mixDatasetIndices[0]<<"  mixInd["<<m_nTrain-1<<"]:"<<m_mixDatasetIndices[m_nTrain-1]<<endl;
00847 }
00848 
00853 void Data::loadNormalization ( int nCascade )
00854 {
00855     // load normalization
00856     char buf[1024];
00857     sprintf ( buf,"%s/%s/normalization.dat.add%d",m_datasetPath.c_str(), m_tempPath.c_str(), nCascade );
00858     cout<<"Load mean and std: "<<buf<<endl;
00859     fstream f ( buf, ios::in );
00860     if ( f.is_open() == false )
00861         assert ( false );
00862     int n;
00863     f.read ( ( char* ) &n, sizeof ( int ) );
00864     if ( m_mean == 0 )
00865         m_mean = new REAL[n];
00866     if ( m_std == 0 )
00867         m_std = new REAL[n];
00868     f.read ( ( char* ) m_mean, sizeof ( REAL ) *n );
00869     f.read ( ( char* ) m_std, sizeof ( REAL ) *n );
00870     REAL min = 1e10, max = -1e10;
00871     for ( int i=0;i<n;i++ )
00872     {
00873         if ( min > m_mean[i] )
00874             min = m_mean[i];
00875         if ( max < m_mean[i] )
00876             max = m_mean[i];
00877     }
00878     cout<<"Mean:  min|max:"<<min<<"|"<<max<<endl;
00879     min = 1e10;
00880     max = -1e10;
00881     for ( int i=0;i<n;i++ )
00882     {
00883         if ( min > m_std[i] )
00884             min = m_std[i];
00885         if ( max < m_std[i] )
00886             max = m_std[i];
00887     }
00888     cout<<"Std:  min|max:"<<min<<"|"<<max<<endl;
00889     f.close();
00890 }
00891 
00901 void Data::allocMemForCrossValidationSets()
00902 {
00903     cout<<"Alloc mem for cross validation data sets"<<endl;
00904     m_mean = new REAL[m_nFeatures];
00905     m_std = new REAL[m_nFeatures];
00906 
00907     if(m_validationType == "ValidationSet")
00908         m_nCross = 0;
00909     else
00910     {
00911         // partitioning to nCross-validation sets
00912         if ( m_nCross > m_nTrain )
00913         {
00914             cout<<"Limit: nCross=nTrain"<<endl;
00915             m_nCross = m_nTrain;
00916         }
00917         cout<<"Cross-validation settings: "<<m_nCross<<" sets"<<endl;
00918     }
00919     
00920     // calc global mean and standard deviation over whole dataset
00921     cout<<"Calculating mean and std per input"<<endl;
00922     double minStd = 1e10, maxStd = -1e10, minMean = 1e10, maxMean = -1e10, minValue = 1e10, maxValue = -1e10;
00923     for ( int i=0;i<m_nFeatures;i++ )
00924     {
00925         // calc mean
00926         double mean = 0.0;
00927         for ( int j=0;j<m_nTrain;j++ )
00928         {
00929             REAL v = m_trainOrig[j*m_nFeatures + i];
00930             mean += v;
00931             if ( minValue > v )
00932                 minValue = v;
00933             if ( maxValue < v )
00934                 maxValue = v;
00935         }
00936         mean /= ( double ) m_nTrain;
00937 
00938         // calc standard deviation
00939         double std = 0.0;
00940         for ( int j=0;j<m_nTrain;j++ )
00941             std += ( mean - m_trainOrig[j*m_nFeatures + i] ) * ( mean - m_trainOrig[j*m_nFeatures + i] );
00942         std = sqrt ( std/ ( double ) ( m_nTrain-1 ) );
00943 
00944         if ( m_datasetName=="KDDCup09Large" || m_datasetName=="KDDCup09Small" ) // || m_datasetName=="BINARY")
00945         {
00946             double max = -1e10;
00947             for ( int j=0;j<m_nTrain;j++ )
00948                 if ( max < fabs ( m_trainOrig[j*m_nFeatures + i]-mean ) )
00949                     max = fabs ( m_trainOrig[j*m_nFeatures + i]-mean );
00950             std = max;
00951         }
00952 
00953         if ( fabs ( std ) < 1e-9 && mean == 0.0 ) // constant zero input
00954         {
00955             //cout<<"Feature nr:"<<i<<" is constant zero (mean:"<<mean<<"), set std=1e10"<<endl;
00956             cout<<"f:"<<i<<"=0 "<<flush;
00957             std = 1e10;
00958         }
00959         if ( fabs ( std ) < 1e-9 && mean != 0.0 ) // constant input
00960         {
00961             //cout<<"Feature nr:"<<i<<" is constant (mean:"<<mean<<"), set std="<<mean<<" and mean=0"<<endl;
00962             cout<<"f:"<<i<<"=c "<<flush;
00963             std = mean;
00964             mean = 0.0;
00965         }
00966         if ( mean==1.0 ) // constant one input
00967         {
00968             //cout<<"Feature nr:"<<i<<" mean=1, set std=1 and mean=0"<<endl;
00969             cout<<"f:"<<i<<"=1 "<<flush;
00970             std = 1.0;
00971             mean = 0.0;
00972         }
00973         if ( std < m_standardDeviationMin ) // limit to a small positive value
00974         {
00975             //cout<<"Feature nr:"<<i<<" "<<"("<<std<<") is limited in std="<<m_standardDeviationMin<<endl;
00976             cout<<"f:"<<i<<"lim "<<flush;
00977             std = m_standardDeviationMin;
00978         }
00979 
00980         minStd = minStd > std? std : minStd;
00981         maxStd = maxStd < std? std : maxStd;
00982         minMean = minMean > mean? mean : minMean;
00983         maxMean = maxMean < mean? mean : maxMean;
00984 
00985         // save them
00986         m_mean[i] = mean;
00987         m_std[i] = std;
00988     }
00989     if ( m_enableStaticNormalization )
00990     {
00991         cout<<"Static mean:"<<m_staticMeanNormalization<<" and std:"<<m_staticStdNormalization<<endl;
00992         for ( int i=0;i<m_nFeatures;i++ )
00993         {
00994             m_mean[i] = m_staticMeanNormalization;
00995             m_std[i] = m_staticStdNormalization;
00996         }
00997         minMean = m_staticMeanNormalization;
00998         maxMean = m_staticMeanNormalization;
00999         minStd = m_staticStdNormalization;
01000         maxStd = m_staticStdNormalization;
01001     }
01002     if ( m_enableGlobalMeanStdEstimate )
01003     {
01004         cout<<"Calc average of mean and std"<<endl;
01005         double mean = 0.0;
01006         for ( int i=0;i<m_nFeatures;i++ )
01007             mean += m_mean[i];
01008         mean /= ( double ) m_nFeatures;
01009         for ( int i=0;i<m_nFeatures;i++ )
01010             m_mean[i] = mean;
01011         minMean = maxMean = mean;
01012 
01013         double std = 0.0;
01014         int stdCnt = 0;
01015         for ( int i=0;i<m_nFeatures;i++ )
01016         {
01017             if ( m_std[i] != 1e10 )
01018             {
01019                 std += m_std[i];
01020                 stdCnt++;
01021             }
01022         }
01023         if ( stdCnt == 0 )
01024             assert ( false );
01025         std /= ( double ) stdCnt;
01026         for ( int i=0;i<m_nFeatures;i++ )
01027             m_std[i] = std;
01028         minStd = maxStd = std;
01029     }
01030     if ( m_enableProbablisticNormalization )
01031     {
01032         cout<<"Calc probablistic normalization"<<endl;
01033         minStd = 1e10;
01034         maxStd = -1e10;
01035         minMean = 1e10;
01036         maxMean = -1e10;
01037         for ( int i=0;i<m_nFeatures;i++ )
01038         {
01039             REAL min = 1e10, max = -1e10;
01040             for ( int j=0;j<m_nTrain;j++ )
01041             {
01042                 REAL v = m_trainOrig[i + j*m_nFeatures];
01043                 if ( min > v )
01044                     min = v;
01045                 if ( max < v )
01046                     max = v;
01047             }
01048             REAL diff = max - min;
01049             m_mean[i] = min;
01050             m_std[i] = diff;
01051             if ( m_std[i] < 1e-6 )
01052                 m_std[i] = 1.0;
01053 
01054             minStd = minStd > m_std[i]? m_std[i] : minStd;
01055             maxStd = maxStd < m_std[i]? m_std[i] : maxStd;
01056             minMean = minMean > m_mean[i]? m_mean[i] : minMean;
01057             maxMean = maxMean < m_mean[i]? m_mean[i] : maxMean;
01058         }
01059         cout<<"mean|std:"<<endl;
01060         for ( int i=0;i<m_nFeatures;i++ )
01061             cout<<m_mean[i]<<"|"<<m_std[i]<<" ";
01062         cout<<endl;
01063     }
01064     cout<<"Min|Max mean: "<<minMean<<"|"<<maxMean<<"   Min|Max std: "<<minStd<<"|"<<maxStd<<"   Min|Max value: "<<minValue<<"|"<<maxValue<<endl;
01065 
01066     // target means
01067     cout<<"Target means: "<<flush;
01068     for ( int i=0;i<m_nClass*m_nDomain;i++ )
01069     {
01070         double mean = 0.0;
01071         REAL* ptr = m_trainTargetOrig + i * m_nClass * m_nDomain;
01072         for ( int j=0;j<m_nTrain;j++ )
01073             mean += ptr[j];
01074         cout<<i<<":"<<mean/ ( double ) ( m_nTrain ) <<" ";
01075     }
01076     cout<<endl;
01077 
01078     // save normalization
01079     char buf[1024];
01080     sprintf ( buf,"%s/%s/normalization.dat.add%d",m_datasetPath.c_str(), m_tempPath.c_str(), m_nCascadeInputs );
01081     cout<<"Save mean and std: "<<buf<<endl;
01082     fstream f ( buf, ios::out );
01083     f.write ( ( char* ) &m_nFeatures, sizeof ( int ) );
01084     f.write ( ( char* ) m_mean, sizeof ( REAL ) *m_nFeatures );
01085     f.write ( ( char* ) m_std, sizeof ( REAL ) *m_nFeatures );
01086     f.close();
01087 
01088     m_mixList = new int[m_nTrain];
01089 
01090     // mixing list
01091     for ( int i=0;i<m_nTrain;i++ )
01092         m_mixList[i] = i;
01093 
01094     // fix the randomness
01095     cout<<"Random seed:"<<m_randSeed<<endl;
01096     srand ( m_randSeed );
01097 
01098     cout<<"nFeatures:"<<m_nFeatures<<endl;
01099     cout<<"nClass:"<<m_nClass<<endl;
01100     cout<<"nDomain:"<<m_nDomain<<endl;
01101 
01102     if ( m_validationType == "ValidationSet" )
01103     {
01104         // no cross validation set
01105         m_trainSize = new int[1];
01106         m_trainSize[0] = m_nTrain;
01107         return;
01108     }
01109     
01110     
01111     m_trainTargetOrigEffect = new REAL[m_nClass*m_nDomain*m_nTrain];
01112     m_trainTargetOrigResidual = new REAL[m_nClass*m_nDomain*m_nTrain];
01113 
01114     // allocate mem for cross validation sets
01115     m_trainSize = new int[m_nCross+1];
01116     m_train = new REAL*[m_nCross+1];
01117     m_trainTarget = new REAL*[m_nCross+1];
01118     m_trainTargetEffect = new REAL*[m_nCross+1];
01119     m_trainTargetResidual = new REAL*[m_nCross+1];
01120     m_trainLabel = new int*[m_nCross+1];
01121     if(m_validationType == "Bagging")
01122         m_trainBaggingIndex = new int*[m_nCross+1];
01123 
01124     m_probeSize = new int[m_nCross+1];
01125     m_probe = new REAL*[m_nCross+1];
01126     m_probeTarget = new REAL*[m_nCross+1];
01127     m_probeTargetEffect = new REAL*[m_nCross+1];
01128     m_probeTargetResidual = new REAL*[m_nCross+1];
01129     m_probeLabel = new int*[m_nCross+1];
01130     m_probeIndex = new int*[m_nCross+1];
01131 
01132     
01133     // make a randomized index list (by random index swaps)
01134     int index0, index1, tmp;
01135     cout<<"Make "<<m_nTrain*m_nMixTrainList<<" index swaps (randomize sample index list)"<<endl;
01136     for ( int i=0;i<m_nTrain*m_nMixTrainList;i++ )
01137     {
01138         index0 = rand() % m_nTrain;
01139         index1 = rand() % m_nTrain;
01140 
01141         // swap
01142         tmp = m_mixList[index0];
01143         m_mixList[index0] = m_mixList[index1];
01144         m_mixList[index1] = tmp;
01145     }
01146 
01147     if( m_validationType == "Retraining" || m_validationType == "CrossFoldMean" )
01148     {
01149         m_slotBoundaries = new int[m_nCross+2];
01150     
01151         double partitionSize = ( double ) m_nTrain / ( double ) m_nCross;
01152         double accumulatedSize = partitionSize;
01153         int cnt = 0, currentSize = -1;
01154         m_slotBoundaries[0] = 0;
01155         m_slotBoundaries[m_nCross+1] = m_nTrain;
01156         cout<<"partition size: "<<partitionSize<<endl;
01157     
01158         // calculate train + probe size
01159         for ( int i=0;i<=m_nTrain;i++ )
01160         {
01161             currentSize++;
01162             if ( cnt < m_nCross )
01163             {
01164                 if ( i == ( int ) round ( accumulatedSize ) || i==m_nTrain )
01165                 {
01166                     m_slotBoundaries[cnt+1] = i;
01167                     m_probeSize[cnt] = currentSize;
01168                     m_trainSize[cnt] = m_nTrain - currentSize;
01169                     currentSize = 0;
01170                     accumulatedSize += partitionSize;
01171                     cnt++;
01172                 }
01173             }
01174         }
01175         m_trainSize[m_nCross] = m_nTrain;  // retraining set
01176         m_probeSize[m_nCross] = 0;
01177         
01178         // print splits
01179         int sum = 0;
01180         cout<<"slot: TRAIN | PROBE"<<endl<<"==================="<<endl;
01181         for ( int i=0;i<m_nCross+1;i++ )
01182         {
01183             cout<<i<<": "<<m_trainSize[i]<<" | "<<m_probeSize[i]<<endl;
01184             sum += m_probeSize[i];
01185         }
01186         cout<<"probe sum:"<<sum<<endl;
01187     }
01188     else if ( m_validationType == "Bagging" )
01189     {
01190         bool* bagSamples = new bool[m_nTrain];
01191         cout<<"Bagging sizes: TRAIN | PROBE"<<endl<<"============================"<<endl;
01192         for(int i=0;i<m_nCross;i++)
01193         {
01194             m_trainBaggingIndex[i] = new int[m_nTrain];
01195             
01196             // simulate boostrap sampling: sampling with replacenent
01197             srand(Framework::getRandomSeed() + i);
01198             int cnt = 0;
01199             for(int j=0;j<m_nTrain;j++)
01200                 bagSamples[j] = 0;
01201             for(int j=0;j<m_nTrain;j++)
01202             {
01203                 int ind = rand() % m_nTrain;
01204                 bagSamples[ind] = 1;
01205                 m_trainBaggingIndex[i][j] = ind;
01206             }
01207             for(int j=0;j<m_nTrain;j++)
01208                 cnt += bagSamples[j];
01209             m_trainSize[i] = m_nTrain;
01210             m_probeSize[i] = m_nTrain - cnt;
01211             
01212             m_probeIndex[i] = new int[m_probeSize[i]];
01213             cnt = 0;
01214             for(int j=0;j<m_nTrain;j++)
01215             {
01216                 if(bagSamples[j] == false)
01217                 {
01218                     m_probeIndex[i][cnt] = j;
01219                     cnt++;
01220                 }
01221             }
01222             cout<<i<<": "<<m_trainSize[i]<<" | "<<m_probeSize[i]<<"  ("<<100.0*(double)m_probeSize[i]/(double)m_nTrain<<"% in probe)"<<endl;
01223         }
01224         m_trainSize[m_nCross] = 0;
01225         m_probeSize[m_nCross] = 0;
01226         m_probeIndex[m_nCross] = 0;
01227         m_trainBaggingIndex[m_nCross] = 0;
01228         delete[] bagSamples;
01229         
01230         // make a summary (#zeros, mean coverage)
01231         int* bagCnt = new int[m_nTrain];
01232         for(int i=0;i<m_nTrain;i++)
01233             bagCnt[i] = 0;
01234         for(int i=0;i<m_nCross;i++)
01235             for(int j=0;j<m_nTrain;j++)
01236                 bagCnt[m_trainBaggingIndex[i][j]]++;
01237         cout<<"Bagging summary: #averaged: and  #cnt"<<endl;
01238         for(int nr=0;nr<2*m_nCross;nr++)
01239         {
01240             int cnt = 0;
01241             for(int i=0;i<m_nTrain;i++)
01242                 if(bagCnt[i] == nr)
01243                     cnt++;
01244             cout<<"n:"<<nr<<"|#"<<cnt<<" ";
01245         }
01246         cout<<endl;
01247         delete[] bagCnt;
01248     }
01249     else
01250         assert(false);
01251     
01252     // allocate mem + copy data to cross-validation slots
01253     for ( int i=0;i<m_nCross+1;i++ )
01254     {
01255         // allocate train mem
01256         int nTrain = m_trainSize[i];
01257         if ( m_enableSaveMemory == false )
01258             m_train[i] = new REAL[nTrain * m_nFeatures];
01259         else
01260             m_train[i] = 0;
01261         m_trainTarget[i] = new REAL[nTrain * m_nClass * m_nDomain];
01262         m_trainTargetEffect[i] = new REAL[nTrain * m_nClass * m_nDomain];
01263         m_trainTargetResidual[i] = new REAL[nTrain * m_nClass * m_nDomain];
01264         m_trainLabel[i] = new int[nTrain*m_nDomain];
01265 
01266         // allocate probe mem
01267         int nProbe = m_probeSize[i];
01268         if ( nProbe )
01269         {
01270             if ( m_enableSaveMemory == false )
01271                 m_probe[i] = new REAL[nProbe * m_nFeatures];
01272             else
01273                 m_probe[i] = 0;
01274             m_probeTarget[i] = new REAL[nProbe * m_nClass * m_nDomain];
01275             m_probeTargetEffect[i] = new REAL[nProbe * m_nClass * m_nDomain];
01276             m_probeTargetResidual[i] = new REAL[nProbe * m_nClass * m_nDomain];
01277             m_probeLabel[i] = new int[nProbe*m_nDomain];
01278             if ( m_validationType != "Bagging" )
01279                 m_probeIndex[i] = new int[nProbe];
01280         }
01281         else
01282         {
01283             m_probe[i] = 0;
01284             m_probeTarget[i] = 0;
01285             m_probeTargetEffect[i] = 0;
01286             m_probeTargetResidual[i] = 0;
01287             m_probeLabel[i] = 0;
01288             m_probeIndex[i] = 0;
01289         }
01290     }
01291 
01292     // alloc index list
01293     m_crossIndex = new int[m_nTrain];
01294     for ( int i=0;i<m_nTrain;i++ )
01295         m_crossIndex[i] = -1;
01296     
01297 }
01298 
/**
 * Reads the "effect" (a previously trained full predictor's outputs on the
 * train set) from <datasetPath>/<fullPredPath>/<trainOnFullPredictorFile>
 * into m_trainTargetOrigEffect, then initializes the residual targets
 * m_trainTargetOrigResidual = target - effect.
 * If no file is configured or it cannot be opened, the effect stays zero,
 * so the residuals equal the original targets (plain training).
 * Does nothing when the validation type is "ValidationSet".
 */
void Data::readEffectFile()
{
    if(m_validationType == "ValidationSet")
        return;
    
    // default effect: all zero
    for ( int i=0;i<m_nClass*m_nDomain*m_nTrain;i++ )
        m_trainTargetOrigEffect[i] = 0.0;

    string name = m_datasetPath + "/" + m_fullPredPath + "/" + m_trainOnFullPredictorFile;
    fstream f ( name.c_str(), ios::in );
    if ( f.is_open() && m_trainOnFullPredictorFile!="" )
    {
        cout<<"Read fullPredictor:"<<name<<"  ";
        f.read ( ( char* ) m_trainTargetOrigEffect, sizeof ( REAL ) *m_nClass*m_nDomain*m_nTrain );

        // RMSE of the stored predictions against the training targets
        // NOTE(review): rmse1 is declared but never accumulated, so the
        // "(retrain:...)" figure below always prints 0 — confirm intent
        double rmse0 = 0.0, rmse1 = 0.0, err;
        for ( int i=0;i<m_nClass*m_nDomain;i++ )
        {
            for ( int j=0;j<m_nTrain;j++ )
            {
                err = m_trainTargetOrigEffect[j*m_nClass*m_nDomain + i] - m_trainTargetOrig[j*m_nClass*m_nDomain + i];
                rmse0 += err * err;
            }
        }
        cout<<"RMSE:"<<sqrt ( rmse0/ ( double ) ( m_nClass*m_nDomain*m_nTrain ) ) <<"(retrain:"<<sqrt ( rmse1/ ( double ) ( m_nClass*m_nDomain*m_nTrain ) ) <<")"<<endl;

        f.close();
    }
    else
        cout<<"Can not open effect file:"<<name<<endl;

    // residual training: res = target - effect
    cout<<"Init residuals"<<endl;
    for ( int i=0;i<m_nClass*m_nDomain*m_nTrain;i++ )
        m_trainTargetOrigResidual[i] = m_trainTargetOrig[i] - m_trainTargetOrigEffect[i];
}
01342 
01348 void Data::fillNCrossValidationSet ( int n )
01349 {
01350     // alloc new memory
01351     if ( m_train[n] )
01352         delete[] m_train[n];
01353     m_train[n] = 0;
01354     m_train[n] = new REAL[m_trainSize[n]*m_nFeatures];
01355     for ( int i=0;i<m_trainSize[n]*m_nFeatures;i++ )
01356         m_train[n][i] = 0.0;
01357     if ( m_probe[n] )
01358         delete[] m_probe[n];
01359     m_probe[n] = 0;
01360     if ( m_probeSize[n] )
01361         m_probe[n] = new REAL[m_probeSize[n]*m_nFeatures];
01362     for ( int i=0;i<m_probeSize[n]*m_nFeatures;i++ )
01363         m_probe[n][i] = 0.0;
01364 
01365     if(m_validationType == "Bagging")
01366     {
01367         bool* bagSamples = new bool[m_nTrain];
01368         for(int i=0;i<m_nTrain;i++)
01369             bagSamples[i] = 0;
01370         for(int i=0;i<m_nTrain;i++)
01371         {
01372             int ind = m_trainBaggingIndex[n][i];
01373             bagSamples[ind] = 1;
01374             for(int j=0;j<m_nFeatures;j++)
01375                 m_train[n][i*m_nFeatures+j] = m_trainOrig[ind*m_nFeatures + j];
01376         }
01377         int cnt = 0;
01378         for(int i=0;i<m_nTrain;i++)
01379         {
01380             if(bagSamples[i] == false)
01381             {
01382                 for(int j=0;j<m_nFeatures;j++)
01383                     m_probe[n][cnt*m_nFeatures+j] = m_trainOrig[i*m_nFeatures + j];
01384                 cnt++;
01385             }
01386         }
01387         if(cnt != m_probeSize[n])
01388         {
01389             cout<<"cnt:"<<cnt<<" probeSize"<<m_probeSize[n]<<endl;
01390             assert(false);
01391         }
01392         delete[] bagSamples;
01393     }
01394     else
01395     {
01396         // slot of probeset
01397         int begin = m_slotBoundaries[n];
01398         int end = m_slotBoundaries[n+1];
01399     
01400         int probeCnt = 0, trainCnt = 0;
01401     
01402         // go through whole trainOrig set
01403         for ( int j=0;j<m_nTrain;j++ )
01404         {
01405             int index = m_mixList[j];
01406     
01407             // probe set
01408             if ( j>=begin && j <end )
01409             {
01410                 for ( int k=0;k<m_nFeatures;k++ )
01411                     m_probe[n][probeCnt*m_nFeatures + k] = m_trainOrig[index*m_nFeatures + k];
01412                 probeCnt++;
01413             }
01414             else  // train set
01415             {
01416                 for ( int k=0;k<m_nFeatures;k++ )
01417                     m_train[n][trainCnt*m_nFeatures + k] = m_trainOrig[index*m_nFeatures + k];
01418                 trainCnt++;
01419             }
01420         }
01421     
01422         if ( probeCnt != m_probeSize[n] || trainCnt != m_trainSize[n] ) // safety check
01423             assert ( false );
01424     }
01425 }
01426 
01432 void Data::freeNCrossValidationSet ( int n )
01433 {
01434     if ( m_train[n] )
01435         delete[] m_train[n];
01436     m_train[n] = 0;
01437     if ( m_probe[n] )
01438         delete[] m_probe[n];
01439     m_probe[n] = 0;
01440 }
01441 
/**
 * Runs input feature selection over the original training data.
 * NOTE(review): the selection mask is allocated, filled by
 * InputFeatureSelector::selectFeatures() and then discarded — presumably
 * the selector reports or persists its result itself; verify before
 * relying on selectedFeatures being used here.
 */
void Data::doFeatureSelection()
{
    // per-feature keep/drop mask, filled by the selector
    bool* selectedFeatures = new bool[m_nFeatures];
    InputFeatureSelector::selectFeatures ( selectedFeatures, m_trainOrig, m_nFeatures, m_nTrain, m_trainLabelOrig, m_trainTargetOrigResidual, m_nClass, m_nDomain );

    delete[] selectedFeatures;
}
01452 
01458 void Data::partitionDatasetToCrossValidationSets()
01459 {
01460     cout<<"Partition dataset to cross validation sets"<<endl;
01461 
01462     // read the effect file
01463     readEffectFile();
01464 
01465     // write the first lines to a file
01466     if(m_trainOrig)
01467     { fstream f("Atrain.txt",ios::out); for ( int i=0;i<m_nTrain && i < 1000;i++ ){for ( int j=0;j<m_nFeatures;j++ ) f<<m_trainOrig[i*m_nFeatures + j]<<" ";f<<endl;}f.close();}
01468     if(m_testOrig)
01469     { fstream f("Atest.txt",ios::out); for ( int i=0;i<m_nTest && i < 1000;i++ ){for ( int j=0;j<m_nFeatures;j++ ) f<<m_testOrig[i*m_nFeatures + j]<<" ";f<<endl;}f.close();}
01470     if(m_valid)
01471     { fstream f("Avalid.txt",ios::out); for ( int i=0;i<m_validSize && i < 1000;i++ ){for ( int j=0;j<m_nFeatures;j++ ) f<<m_valid[i*m_nFeatures + j]<<" ";f<<endl;}f.close();}
01472     
01473     // apply mean and std to input features
01474     cout<<"Apply mean and std correction to train input features"<<endl;
01475     for ( int i=0;i<m_nTrain;i++ )
01476         for ( int j=0;j<m_nFeatures;j++ )
01477             m_trainOrig[i*m_nFeatures + j] = ( m_trainOrig[i*m_nFeatures + j] - m_mean[j] ) / m_std[j];
01478 
01479     // print min and max values in features
01480     REAL min = 1e10, max = -1e10;
01481     for ( int i=0;i<m_nTrain;i++ )
01482         for ( int j=0;j<m_nFeatures;j++ )
01483         {
01484             if ( min > m_trainOrig[i*m_nFeatures + j] )
01485                 min = m_trainOrig[i*m_nFeatures + j];
01486             if ( max < m_trainOrig[i*m_nFeatures + j] )
01487                 max = m_trainOrig[i*m_nFeatures + j];
01488         }
01489     cout<<"Min/Max after apply mean/std: "<<min<<"/"<<max<<endl;
01490 
01491     // print min and max values in targets
01492     min = 1e10;
01493     max = -1e10;
01494     m_targetMean = new REAL[m_nClass*m_nDomain];
01495     double* targetMean = new double[m_nClass*m_nDomain];
01496     for(int i=0;i<m_nClass*m_nDomain;i++)
01497         targetMean[i] = 0.0;
01498     for ( int i=0;i<m_nTrain;i++ )
01499         for ( int j=0;j<m_nClass*m_nDomain;j++ )
01500         {
01501             targetMean[j] += m_trainTargetOrig[i*m_nClass*m_nDomain + j];
01502             if ( min > m_trainTargetOrig[i*m_nClass*m_nDomain + j] )
01503                 min = m_trainTargetOrig[i*m_nClass*m_nDomain + j];
01504             if ( max < m_trainTargetOrig[i*m_nClass*m_nDomain + j] )
01505                 max = m_trainTargetOrig[i*m_nClass*m_nDomain + j];
01506         }
01507     for(int i=0;i<m_nClass*m_nDomain;i++)
01508         m_targetMean[i] = targetMean[i]/(double)m_nTrain;
01509     delete[] targetMean;
01510     
01511     cout<<"Min/Max target: "<<min<<"/"<<max<<endl<<"Mean target: ";
01512     for(int i=0;i<m_nClass*m_nDomain;i++)
01513         cout<<m_targetMean[i]<<" ";
01514     cout<<endl<<endl;
01515 
01516     if(m_validationType == "Retraining" || m_validationType == "CrossFoldMean")
01517     {
01518         int* labels = new int[m_nDomain];
01519     
01520         // copy data to cross-validation slots
01521         for ( int i=0;i<m_nCross+1;i++ )
01522         {
01523             // slot of probeset
01524             int begin = m_slotBoundaries[i];
01525             int end = m_slotBoundaries[i+1];
01526     
01527             int probeCnt = 0, trainCnt = 0;
01528     
01529             // go through whole trainOrig set
01530             for ( int j=0;j<m_nTrain;j++ )
01531             {
01532                 int index = m_mixList[j];
01533                 if ( Framework::getDatasetType() )
01534                 {
01535                     for ( int d=0;d<m_nDomain;d++ )
01536                         labels[d] = m_trainLabelOrig[d+index*m_nDomain];
01537                 }
01538     
01539                 // probe set
01540                 if ( j>=begin && j <end )
01541                 {
01542                     m_probeIndex[i][probeCnt] = index;
01543                     for ( int d=0;d<m_nDomain;d++ )
01544                         m_probeLabel[i][d+probeCnt*m_nDomain] = labels[d];
01545                     for ( int k=0;k<m_nFeatures;k++ )
01546                         if ( m_enableSaveMemory == false )
01547                             m_probe[i][probeCnt*m_nFeatures + k] = m_trainOrig[index*m_nFeatures + k];
01548                     for ( int k=0;k<m_nClass*m_nDomain;k++ )
01549                     {
01550                         m_probeTarget[i][probeCnt*m_nClass*m_nDomain + k] = m_trainTargetOrig[index*m_nClass*m_nDomain + k];
01551                         m_probeTargetEffect[i][probeCnt*m_nClass*m_nDomain + k] = m_trainTargetOrigEffect[index*m_nClass*m_nDomain + k];
01552                         m_probeTargetResidual[i][probeCnt*m_nClass*m_nDomain + k] = m_trainTargetOrigResidual[index*m_nClass*m_nDomain + k];
01553                     }
01554                     probeCnt++;
01555                     m_crossIndex[j] = i;
01556                 }
01557                 else  // train set
01558                 {
01559                     for ( int d=0;d<m_nDomain;d++ )
01560                         m_trainLabel[i][d+trainCnt*m_nDomain] = labels[d];
01561                     for ( int k=0;k<m_nFeatures;k++ )
01562                         if ( m_enableSaveMemory == false )
01563                             m_train[i][trainCnt*m_nFeatures + k] = m_trainOrig[index*m_nFeatures + k];
01564                     for ( int k=0;k<m_nClass*m_nDomain;k++ )
01565                     {
01566                         m_trainTarget[i][trainCnt*m_nClass*m_nDomain + k] = m_trainTargetOrig[index*m_nClass*m_nDomain + k];
01567                         m_trainTargetEffect[i][trainCnt*m_nClass*m_nDomain + k] = m_trainTargetOrigEffect[index*m_nClass*m_nDomain + k];
01568                         m_trainTargetResidual[i][trainCnt*m_nClass*m_nDomain + k] = m_trainTargetOrigResidual[index*m_nClass*m_nDomain + k];
01569                     }
01570                     trainCnt++;
01571                 }
01572             }
01573             if ( probeCnt != m_probeSize[i] || trainCnt != m_trainSize[i] ) // safety check
01574                 assert ( false );
01575         }
01576     
01577         if ( labels )
01578             delete[] labels;
01579     
01580         for ( int i=0;i<m_nTrain;i++ )
01581             if ( m_crossIndex[i] == -1 )
01582                 assert ( false );
01583     }
01584     else if(m_validationType == "Bagging")
01585     {
01586         bool* bagSamples = new bool[m_nTrain];
01587         for ( int i=0;i<m_nCross;i++ )
01588         {
01589             // train sets
01590             for(int j=0;j<m_nTrain;j++)
01591                 bagSamples[j] = 0;
01592             for(int j=0;j<m_nTrain;j++)
01593             {
01594                 uint ind = m_trainBaggingIndex[i][j];
01595                 bagSamples[ind] = 1;  // mark
01596                 
01597                 if ( Framework::getDatasetType() )
01598                     for ( int d=0;d<m_nDomain;d++ )
01599                         m_trainLabel[i][d+j*m_nDomain] = m_trainLabelOrig[d+ind*m_nDomain];
01600                 for ( int k=0;k<m_nFeatures;k++ )
01601                     if ( m_enableSaveMemory == false )
01602                         m_train[i][j*m_nFeatures + k] = m_trainOrig[ind*m_nFeatures + k];
01603                 for ( int k=0;k<m_nClass*m_nDomain;k++ )
01604                 {
01605                     m_trainTarget[i][j*m_nClass*m_nDomain + k] = m_trainTargetOrig[ind*m_nClass*m_nDomain + k];
01606                     m_trainTargetEffect[i][j*m_nClass*m_nDomain + k] = m_trainTargetOrigEffect[ind*m_nClass*m_nDomain + k];
01607                     m_trainTargetResidual[i][j*m_nClass*m_nDomain + k] = m_trainTargetOrigResidual[ind*m_nClass*m_nDomain + k];
01608                 }
01609             }
01610             
01611             // probe sets
01612             int cnt = 0;
01613             for(int j=0;j<m_nTrain;j++)
01614                 cnt += bagSamples[j];
01615             if(m_nTrain - cnt != m_probeSize[i])
01616                 assert(false);
01617             cnt = 0;
01618             for(int j=0;j<m_nTrain;j++)
01619             {
01620                 if(bagSamples[j] == false)
01621                 {
01622                     if ( Framework::getDatasetType() )
01623                         for ( int d=0;d<m_nDomain;d++ )
01624                             m_probeLabel[i][d+cnt*m_nDomain] = m_trainLabelOrig[d+j*m_nDomain];
01625                     for ( int k=0;k<m_nFeatures;k++ )
01626                         if ( m_enableSaveMemory == false )
01627                             m_probe[i][cnt*m_nFeatures + k] = m_trainOrig[j*m_nFeatures + k];
01628                     for ( int k=0;k<m_nClass*m_nDomain;k++ )
01629                     {
01630                         m_probeTarget[i][cnt*m_nClass*m_nDomain + k] = m_trainTargetOrig[j*m_nClass*m_nDomain + k];
01631                         m_probeTargetEffect[i][cnt*m_nClass*m_nDomain + k] = m_trainTargetOrigEffect[j*m_nClass*m_nDomain + k];
01632                         m_probeTargetResidual[i][cnt*m_nClass*m_nDomain + k] = m_trainTargetOrigResidual[j*m_nClass*m_nDomain + k];
01633                     }
01634                     cnt++;
01635                 }
01636             }
01637             if(cnt != m_probeSize[i])
01638                 assert(false);
01639         }
01640         delete[] bagSamples;
01641     }
01642     else if(m_validationType == "ValidationSet")
01643     {
01644         ;
01645     }
01646     else
01647         assert(false);
01648 }
01649 
01656 void Data::fillCascadeLearningInputs()
01657 {
01658     cout<<endl<<"Add effects (predictions of previous algorithms) as inputs to dataset"<<endl;
01659 
01660     // load the fullPredictors
01661     vector<string> files = m_algorithmNameList; //Data::getDirectoryFileList(m_datasetPath + "/" + m_fullPredPath + "/");
01662     vector<string> m_usedFiles;
01663 
01664     for ( int i=0;i<files.size();i++ )
01665         if ( files[i].at ( files[i].size()-1 ) != '.' && files[i].find ( ".dat" ) == files[i].length()-4 )
01666             m_usedFiles.push_back ( files[i] );
01667     int size = m_usedFiles.size();
01668 
01669     // alloc mem
01670     m_cascadeInputs = new REAL[size*m_nClass*m_nDomain*m_nTrain];
01671     for ( int i=0;i<size*m_nClass*m_nDomain*m_nTrain;i++ )
01672         m_cascadeInputs[i] = 1e10;
01673 
01674     // fill cascadeInputs
01675     for ( int i=0;i<size;i++ )
01676     {
01677         fstream f ( m_usedFiles[i].c_str(), ios::in );
01678         if ( f.is_open() == false )
01679             assert ( false );
01680         REAL* cache = new REAL[m_nTrain*m_nClass*m_nDomain];
01681         f.read ( ( char* ) cache, sizeof ( REAL ) *m_nTrain*m_nClass*m_nDomain );
01682         f.close();
01683 
01684         for ( int j=0;j<m_nTrain;j++ )
01685             for ( int k=0;k<m_nClass*m_nDomain;k++ )
01686                 m_cascadeInputs[j*m_nClass*m_nDomain*size + i*m_nClass*m_nDomain + k] = cache[j*m_nClass*m_nDomain + k];
01687 
01688         if ( cache )
01689             delete[] cache;
01690         cache = 0;
01691     }
01692     for ( int i=0;i<size;i++ )
01693     {
01694         double rmse = 0.0, err;
01695         for ( int j=0;j<m_nTrain;j++ )
01696             for ( int k=0;k<m_nClass*m_nDomain;k++ )
01697             {
01698                 err = m_cascadeInputs[j*m_nClass*m_nDomain*size + i*m_nClass*m_nDomain + k] - m_trainTargetOrig[k + j*m_nClass*m_nDomain];
01699                 rmse += err*err;
01700             }
01701         cout<<"File:"<<m_usedFiles[i]<<"  RMSE:"<<sqrt ( rmse/ ( double ) ( m_nClass*m_nTrain*m_nDomain ) ) <<endl;
01702     }
01703     if ( size == 0 )
01704         cout<<"Nothing to do here"<<endl;
01705     cout<<endl;
01706 
01707     m_nCascadeInputs = size;
01708     cout<<"nCascadeInputs:"<<m_nCascadeInputs<<endl;
01709 }
01710 
01716 void Data::extendTrainDataWithCascadeInputs()
01717 {
01718     if ( m_nCascadeInputs == 0 )
01719         return;
01720 
01721     cout<<"Extend the train data with cascade inputs"<<endl;
01722 
01723     if ( m_trainOrig )
01724     {
01725         REAL* m_trainOrigNew = new REAL[m_nTrain* ( m_nFeatures+m_nCascadeInputs*m_nClass*m_nDomain ) ];
01726         for ( int i=0;i<m_nTrain;i++ )
01727         {
01728             REAL* ptr0 = m_trainOrigNew + i* ( m_nFeatures+m_nCascadeInputs*m_nClass*m_nDomain );
01729             REAL* ptr1 = m_trainOrig + i*m_nFeatures;
01730             for ( int j=0;j<m_nFeatures;j++ )
01731                 ptr0[j] = ptr1[j];
01732             ptr0 = m_trainOrigNew + i* ( m_nFeatures+m_nCascadeInputs*m_nClass*m_nDomain ) + m_nFeatures;
01733             ptr1 = m_cascadeInputs + i*m_nCascadeInputs*m_nClass*m_nDomain;
01734             for ( int j=0;j<m_nCascadeInputs*m_nClass*m_nDomain;j++ )
01735                 ptr0[j] = ptr1[j];
01736         }
01737         if ( m_trainOrig )
01738             delete[] m_trainOrig;
01739         m_trainOrig = m_trainOrigNew;
01740     }
01741 
01742     if ( m_testOrig )
01743     {
01744         REAL* m_testOrigNew = new REAL[m_nTest* ( m_nFeatures+m_nCascadeInputs*m_nClass*m_nDomain ) ];
01745         for ( int i=0;i<m_nTest;i++ )
01746         {
01747             REAL* ptr0 = m_testOrigNew + i* ( m_nFeatures+m_nCascadeInputs*m_nClass*m_nDomain );
01748             REAL* ptr1 = m_testOrig + i*m_nFeatures;
01749             for ( int j=0;j<m_nFeatures;j++ )
01750                 ptr0[j] = ptr1[j];
01751             ptr0 = m_testOrigNew + i* ( m_nFeatures+m_nCascadeInputs*m_nClass*m_nDomain ) + m_nFeatures;
01752             ptr1 = m_cascadeInputs + i*m_nCascadeInputs*m_nClass*m_nDomain;
01753             for ( int j=0;j<m_nCascadeInputs*m_nClass*m_nDomain;j++ )
01754                 ptr0[j] = ptr1[j];
01755         }
01756         if ( m_testOrig )
01757             delete[] m_testOrig;
01758         m_testOrig = m_testOrigNew;
01759     }
01760 
01761     int nFeaturesBefore = m_nFeatures;
01762     m_nFeatures += m_nCascadeInputs*m_nClass*m_nDomain;
01763     cout<<"nFeatures: "<<m_nFeatures<<" (before: "<<nFeaturesBefore<<")"<<endl;
01764 }
01765 
01766 
01775 void Data::setPathes ( string temp, string dsc, string fullPred, string data )
01776 {
01777     m_tempPath = temp;
01778     m_dscPath = dsc;
01779     m_fullPredPath = fullPred;
01780     m_dataPath = data;
01781 }
01782 
01789 void Data::readParameter ( string line, int mode )
01790 {
01791     // split into 2 strings at the '=' char
01792     int pos = line.find ( "=" );
01793     string name = line.substr ( 0, pos );
01794     string value = line.substr ( pos+1 );
01795 
01796     if ( mode==-1 ) // meta info block (algorithm independent)
01797     {
01798         if ( name=="ALGORITHM" )
01799             m_algorithmName = value;
01800         if ( name=="ID" )
01801             m_algorithmID = atoi ( value.c_str() );
01802         if ( name=="TRAIN_ON_FULLPREDICTOR" )
01803         {
01804             if(m_validationType == "ValidationSet")
01805                 assert(false);
01806             m_trainOnFullPredictorFile = value;
01807         }
01808         if ( name=="DISABLE" )
01809             m_disableTraining = atoi ( value.c_str() );
01810         cout<<"[META] ";
01811     }
01812 
01813     if ( mode==0 ) // [int]
01814         m_intMap[name] = atoi ( value.c_str() );
01815 
01816     if ( mode==1 ) // [double]
01817         m_doubleMap[name] = atof ( value.c_str() );
01818 
01819     if ( mode==2 ) // [string]
01820         m_stringMap[name] = value;
01821 
01822     if ( mode==3 ) // [bool]
01823         m_boolMap[name] = atoi ( value.c_str() );
01824 
01825     cout<<name<<": "<<value<<endl;
01826 }
01827 
01833 void Data::readDscFile ( string name )
01834 {
01835     cout<<"Load descriptor file: "<<name<<endl;
01836     fstream f ( name.c_str(), ios::in );
01837 
01838     if ( f.is_open() ==false )
01839     {
01840         cout<<"Can not open file:"<<name<<endl;
01841         assert ( false );
01842     }
01843 
01844     int mode = -1;  // -1:meta info  0:int  1:double  2:string  3:bool
01845 
01846     char buf[256];
01847     while ( f.getline ( buf, 256 ) ) // read all lines
01848     {
01849         string line ( buf );
01850         if ( line[0]=='#' ) // a comment
01851             continue;
01852         if ( line.find ( "[int]" ) != string::npos )
01853             mode = 0;
01854         if ( line.find ( "[double]" ) != string::npos )
01855             mode = 1;
01856         if ( line.find ( "[string]" ) != string::npos )
01857             mode = 2;
01858         if ( line.find ( "[bool]" ) != string::npos )
01859             mode = 3;
01860 
01861         // only lines which consists of a '='
01862         if ( line.find ( "=" ) != string::npos )
01863             readParameter ( line, mode );
01864     }
01865 
01866     f.close();
01867 }
01868 
01873 vector<string> Data::getDirectoryFileList ( string path )
01874 {
01875     vector<string> v;
01876     DIR *dp;
01877     struct dirent *dirp;
01878     if ( ( dp = opendir ( path.c_str() ) ) == NULL )
01879     {
01880         cout << "Error opening " << path << endl;
01881         return v;
01882     }
01883     while ( ( dirp = readdir ( dp ) ) != NULL )
01884         v.push_back ( path + string ( dirp->d_name ) );
01885     closedir ( dp );
01886     return v;
01887 }
01888 
01897 int* Data::splitStringToIntegerList ( string str, char delimiter )
01898 {
01899     vector<int> v;
01900     int number;
01901     char *begin = ( char* ) str.c_str(), *end = ( char* ) str.c_str(), tmp;
01902     for ( int i=0;i<str.length();i++ )
01903     {
01904         end++;
01905         if ( *end==delimiter || *end==0 )
01906         {
01907             tmp = *end;
01908             *end = 0;
01909             sscanf ( begin, "%d", &number );
01910             begin = end + 1;
01911             *end = tmp;
01912             v.push_back ( number );
01913         }
01914     }
01915     int* returnList = new int[v.size() ];
01916     for ( int i=0;i<v.size();i++ )
01917         returnList[i] = v[i];
01918     return returnList;
01919 }
01920 
01929 vector<string> Data::splitStringToStringList ( string str, char delimiter )
01930 {
01931     vector<string> v;
01932     int number;
01933     char *begin = ( char* ) str.c_str(), *end = ( char* ) str.c_str(), tmp;
01934     for ( int i=0;i<str.length();i++ )
01935     {
01936         end++;
01937         if ( *end==delimiter || *end==0 )
01938         {
01939             tmp = *end;
01940             *end = 0;
01941             v.push_back ( begin );
01942             begin = end + 1;
01943             *end = tmp;
01944         }
01945     }
01946     return v;
01947 }
01948 
/**
 * Make this object share the state of another Data instance.
 *
 * Maps, strings and scalars are copied by value; every array member is a
 * SHALLOW copy (the raw pointer is assigned, the buffer is not duplicated).
 * The source object therefore remains the owner of all buffers — this
 * object must not delete them. Used to hand one loaded dataset to several
 * algorithm instances.
 *
 * @param data  fully initialized source instance (remains owner of all arrays)
 */
void Data::setDataPointers ( Data* data )
{
    cout<<"Set data pointers"<<endl;

    // copy maps (by value)
    m_intMap = data->m_intMap;
    m_doubleMap = data->m_doubleMap;
    m_boolMap = data->m_boolMap;
    m_stringMap = data->m_stringMap;

    // algorithm meta information
    m_algorithmName = data->m_algorithmName;
    m_algorithmID = data->m_algorithmID;
    m_trainOnFullPredictorFile = data->m_trainOnFullPredictorFile;
    m_disableTraining = data->m_disableTraining;

    m_randSeed = data->m_randSeed;
    m_positiveTarget = data->m_positiveTarget;
    m_negativeTarget = data->m_negativeTarget;

    m_mixList = data->m_mixList;

    // dataset paths
    m_datasetPath = data->m_datasetPath;
    m_datasetName = data->m_datasetName;
    m_tempPath = data->m_tempPath;
    m_dscPath = data->m_dscPath;
    m_fullPredPath = data->m_fullPredPath;
    m_dataPath = data->m_dataPath;

    // dataset organization (input/output dimensionality)
    m_nFeatures = data->m_nFeatures;
    m_nClass = data->m_nClass;
    m_nDomain = data->m_nDomain;
    m_nMixTrainList = data->m_nMixTrainList;

    // cross-validation settings
    m_nCross = data->m_nCross;
    m_validationType = data->m_validationType;

    // global mean and standard deviation over whole dataset (shared arrays)
    m_mean = data->m_mean;
    m_std = data->m_std;
    m_standardDeviationMin = data->m_standardDeviationMin;
    m_targetMean = data->m_targetMean;

    // full training set (shared arrays)
    m_nTrain = data->m_nTrain;
    m_trainOrig = data->m_trainOrig;
    m_trainTargetOrig = data->m_trainTargetOrig;
    m_trainTargetOrigEffect = data->m_trainTargetOrigEffect;
    m_trainTargetOrigResidual = data->m_trainTargetOrigResidual;
    m_trainLabelOrig = data->m_trainLabelOrig;
    m_trainBaggingIndex = data->m_trainBaggingIndex;

    // the validation set (shared arrays)
    m_validSize = data->m_validSize;
    m_valid = data->m_valid;
    m_validTarget = data->m_validTarget;
    m_validLabel = data->m_validLabel;
    
    // the testset (shared arrays)
    m_nTest = data->m_nTest;
    m_testOrig = data->m_testOrig;
    m_testTargetOrig = data->m_testTargetOrig;
    m_testLabelOrig = data->m_testLabelOrig;

    // probe split indices
    m_slotBoundaries = data->m_slotBoundaries;

    // trainsets per cross-validation division (shared arrays)
    m_trainSize = data->m_trainSize;
    m_train = data->m_train;
    m_trainTarget = data->m_trainTarget;
    m_trainTargetEffect = data->m_trainTargetEffect;
    m_trainTargetResidual = data->m_trainTargetResidual;
    m_trainLabel = data->m_trainLabel;

    // probesets per cross-validation division (shared arrays)
    m_probeSize = data->m_probeSize;
    m_probe = data->m_probe;
    m_probeTarget = data->m_probeTarget;
    m_probeTargetEffect = data->m_probeTargetEffect;
    m_probeTargetResidual = data->m_probeTargetResidual;
    m_probeLabel = data->m_probeLabel;
    m_probeIndex = data->m_probeIndex;

    m_crossIndex = data->m_crossIndex;

    // blend stopping
    m_blendingRegularization = data->m_blendingRegularization;
    m_enableGlobalBlendingWeights = data->m_enableGlobalBlendingWeights;
    m_blendingEnableCrossValidation = data->m_blendingEnableCrossValidation;
    m_enablePostNNBlending = data->m_enablePostNNBlending;
    m_blendingAlgorithm = data->m_blendingAlgorithm;

    // cascade learning
    m_enableCascadeLearning = data->m_enableCascadeLearning;
    m_nCascadeInputs = data->m_nCascadeInputs;
    m_cascadeInputs = data->m_cascadeInputs;

    // average over mean and std as new mean and std
    m_enableGlobalMeanStdEstimate = data->m_enableGlobalMeanStdEstimate;

    // parallelization of k-fold cross validation
    m_maxThreadsInCross = data->m_maxThreadsInCross;

    // memory save option
    m_enableSaveMemory = data->m_enableSaveMemory;

    // error function "AUC" or "RMSE"
    m_errorFunction = data->m_errorFunction;

    // reverse mix table
    m_mixDatasetIndices = data->m_mixDatasetIndices;

    // already trained algo list
    m_algorithmNameList = data->m_algorithmNameList;

    // clip after blend
    m_enablePostBlendClipping = data->m_enablePostBlendClipping;

    // add output noise
    m_addOutputNoise = data->m_addOutputNoise;

    // feature selection
    m_enableFeatureSelection = data->m_enableFeatureSelection;
    m_featureSelectionWriteBinaryDataset = data->m_featureSelectionWriteBinaryDataset;

    // bagging
    m_enableBagging = data->m_enableBagging;
    m_randomSeedBagging = data->m_randomSeedBagging;

    // write dsc files in training
    m_disableWriteDscFile = data->m_disableWriteDscFile;

    // static mean and std normalization
    m_enableStaticNormalization = data->m_enableStaticNormalization;
    m_staticMeanNormalization = data->m_staticMeanNormalization;
    m_staticStdNormalization = data->m_staticStdNormalization;
    m_enableProbablisticNormalization = data->m_enableProbablisticNormalization;

    // dimensionality reduction
    m_dimensionalityReduction = data->m_dimensionalityReduction;

    // if this is set, the algorithm should load saved weights before start to training
    m_loadWeightsBeforeTraining = data->m_loadWeightsBeforeTraining;

    m_subsampleTrainSet = data->m_subsampleTrainSet;
    m_subsampleFeatures = data->m_subsampleFeatures;
    m_globalTrainingLoops = data->m_globalTrainingLoops;
    m_addConstantInput = data->m_addConstantInput;
}
02106 
02113 void Data::setAlgorithmList ( vector<string> algorithmNameList )
02114 {
02115     cout<<"Set algorithm list (nTrained:"<< ( int ) algorithmNameList.size() <<")"<<endl;
02116     m_algorithmNameList = algorithmNameList;
02117     for ( int i=0;i<m_algorithmNameList.size();i++ )
02118     {
02119         int pos = m_algorithmNameList[i].find_first_of ( ".",0 );
02120         if ( pos == 0 )
02121             assert ( false );
02122         m_algorithmNameList[i] = m_datasetPath + "/" + m_fullPredPath + "/" + m_algorithmNameList[i].substr ( 0,pos ) + ".dat";
02123         cout<<"m_algorithmNameList["<<i<<"]:"<<m_algorithmNameList[i]<<endl;
02124     }
02125 }
02126 
02132 void Data::enableBagging ( bool en )
02133 {
02134     cout<<"Enable bagging:"<<en<<endl;
02135     m_enableBagging = en;
02136 }
02137 
/**
 * Set the random seed used to draw the bootstrap samples when bagging.
 *
 * @param seed  RNG seed for bagging sample selection
 */
void Data::baggingRandomSeed ( uint seed )
{
    m_randomSeedBagging = seed;
}
02147 
02155 void Data::mergeTrainAndTest()
02156 {
02157     cout<<"trainSet = {trainSet(#"<<m_nTrain<<") + testSet(#"<<m_nTest<<")}"<<endl;
02158     if ( m_nTest == 0 )
02159         return;
02160 
02161     REAL* train = new REAL[ ( m_nTrain + m_nTest ) *m_nFeatures];
02162     REAL* trainTarget = new REAL[ ( m_nTrain + m_nTest ) *m_nClass*m_nDomain];
02163     int* trainLabel = new int[ ( m_nTrain + m_nTest ) *m_nDomain];
02164 
02165     memcpy ( train, m_trainOrig, sizeof ( REAL ) *m_nTrain*m_nFeatures );
02166     memcpy ( train + m_nTrain*m_nFeatures, m_testOrig, sizeof ( REAL ) *m_nTest*m_nFeatures );
02167 
02168     memcpy ( trainTarget, m_trainTargetOrig, sizeof ( REAL ) *m_nTrain*m_nClass*m_nDomain );
02169     memcpy ( trainTarget + m_nTrain*m_nClass*m_nDomain, m_testTargetOrig, sizeof ( REAL ) *m_nTest*m_nClass*m_nDomain );
02170 
02171     memcpy ( trainLabel, m_trainLabelOrig, sizeof ( REAL ) *m_nTrain*m_nDomain );
02172     memcpy ( trainLabel + m_nTrain*m_nDomain, m_testLabelOrig, sizeof ( REAL ) *m_nTest*m_nDomain );
02173 
02174     delete[] m_trainOrig;
02175     delete[] m_trainTargetOrig;
02176     delete[] m_trainLabelOrig;
02177 
02178     m_trainOrig = train;
02179     m_trainTargetOrig = trainTarget;
02180     m_trainLabelOrig = trainLabel;
02181 
02182     m_nTrain = m_nTrain + m_nTest;
02183 }
02184 
02188 void Data::normalizeZeroOne()
02189 {
02190     cout<<"Autoencoder: Normalize train between 0 and 1"<<endl;
02191     // (m_trainOrig[i*m_nFeatures + j] - m_mean[j]) / m_std[j]
02192     REAL* mean = new REAL[m_nFeatures];
02193     REAL* std = new REAL[m_nFeatures];
02194 
02195     for ( int i=0;i<m_nFeatures;i++ )
02196     {
02197         double mu = 0.0, min = 1e10, max = -1e10;
02198         for ( int j=0;j<m_nTrain;j++ )
02199         {
02200             REAL v = m_trainOrig[i+j*m_nFeatures];
02201             mu += v;
02202             if ( min > v )
02203                 min = v;
02204             if ( max < v )
02205                 max = v;
02206         }
02207         mean[i] = min;
02208         std[i] = max - min;
02209         if ( std[i] <= 1e-2 )
02210             std[i] = 1.0;
02211         m_mean[i] = 0.0;
02212         m_std[i] = 1.0;
02213 
02214         if ( m_enableStaticNormalization ) // something special, allow to modify the auto normalizations
02215         {
02216             mean[i] += m_staticMeanNormalization;
02217             std[i] *= m_staticStdNormalization;
02218         }
02219     }
02220     for ( int i=0;i<m_nTrain;i++ )
02221         for ( int j=0;j<m_nFeatures;j++ )
02222         {
02223             m_trainOrig[j+i*m_nFeatures] = ( m_trainOrig[j+i*m_nFeatures] - mean[j] ) / std[j];
02224             REAL v = m_trainOrig[j+i*m_nFeatures];
02225             if ( v > 1.0 || v < 0.0 )
02226             {
02227                 cout<<"v:"<<v<<endl;
02228                 assert ( false );
02229             }
02230         }
02231 
02232     // print mean/std
02233     for ( int j=0;j<m_nFeatures;j++ )
02234         cout<<mean[j]<<"|"<<std[j]<<" ";
02235     cout<<endl;
02236 
02237     // save the normalizations
02238     cout<<"save the 0..1 normalizations"<<endl;
02239     string meanName = m_datasetPath + "/" + m_tempPath + "/AutoencoderDataMean.dat";
02240     string stdName = m_datasetPath + "/" + m_tempPath + "/AutoencoderDataStd.dat";
02241     cout<<"meanName:"<<meanName<<endl<<"stdName:"<<stdName<<endl;
02242     fstream fMean ( meanName.c_str(),ios::out );
02243     fstream fStd ( stdName.c_str(),ios::out );
02244     fMean.write ( ( char* ) mean, sizeof ( REAL ) *m_nFeatures );
02245     fStd.write ( ( char* ) std, sizeof ( REAL ) *m_nFeatures );
02246     fMean.close();
02247     fStd.close();
02248 
02249     delete[] mean;
02250     delete[] std;
02251 }
02252 
02259 void Data::reduceTrainingSetSize ( REAL percent )
02260 {
02261     cout<<"reduce training set (current size:"<<m_nTrain<<") to "<<percent*100.0<<"% of its original size"<<flush;
02262     if ( percent <= 0.0 || percent >= 1.0 )
02263     {
02264         cout<<"  [nothing to do]"<<endl;
02265         return;
02266     }
02267     cout<<endl;
02268     
02269     srand ( Framework::getRandomSeed() );
02270     int cnt = 0;
02271     for ( int i=0;i<m_nTrain;i++ )
02272         if ( ( double ) rand() / ( double ) RAND_MAX < percent )
02273             cnt++;
02274 
02275     cout<<"allocate new training set, size:"<<cnt<<endl;
02276 
02277     REAL* train = new REAL[cnt*m_nFeatures];
02278     REAL* trainTarget = new REAL[cnt*m_nClass*m_nDomain];
02279 
02280     int* trainLabel = 0;
02281     if ( m_trainLabelOrig )
02282         trainLabel = new int[cnt*m_nDomain];
02283 
02284     srand ( Framework::getRandomSeed() );
02285     cnt = 0;
02286     for ( int i=0;i<m_nTrain;i++ )
02287     {
02288         if ( ( double ) rand() / ( double ) RAND_MAX < percent )
02289         {
02290             for ( int j=0;j<m_nFeatures;j++ )
02291                 train[j+cnt*m_nFeatures] = m_trainOrig[j+i*m_nFeatures];
02292             for ( int j=0;j<m_nClass*m_nDomain;j++ )
02293                 trainTarget[j+cnt*m_nClass*m_nDomain] = m_trainTargetOrig[j+i*m_nClass*m_nDomain];
02294             if ( m_trainLabelOrig )
02295             {
02296                 for ( int j=0;j<m_nDomain;j++ )
02297                     trainLabel[j+cnt*m_nDomain] = m_trainLabelOrig[j+i*m_nDomain];
02298             }
02299             cnt++;
02300         }
02301     }
02302 
02303     delete[] m_trainOrig;
02304     delete[] m_trainTargetOrig;
02305     if ( m_trainLabelOrig )
02306         delete[] m_trainLabelOrig;
02307 
02308     m_trainOrig = train;
02309     m_trainTargetOrig = trainTarget;
02310     if ( m_trainLabelOrig )
02311         m_trainLabelOrig = trainLabel;
02312 
02313     m_nTrain = cnt;
02314 }
02315 
/**
 * Randomly subsample the columns of a feature table in place, keeping
 * ~percent of them; columns that are constant 1 (bias columns) are always
 * kept. The selected column mask is persisted to subspace.txt so train and
 * test tables can be reduced identically: with loadColumnSet=false the
 * freshly drawn mask is written, with loadColumnSet=true the mask is read
 * back and replaces the random draw. On return, table points to a new
 * (smaller) buffer and tableCols holds the new column count; the old
 * buffer is freed. No-op unless 0 < percent < 1.
 *
 * @param table          in/out: row-major feature matrix, replaced in place
 * @param tableRows      number of rows in table
 * @param tableCols      in/out: number of columns; updated to the kept count
 * @param percent        fraction of columns to keep, in (0,1)
 * @param loadColumnSet  true: reuse mask from subspace.txt; false: draw and save it
 */
void Data::reduceFeatureSize ( REAL* &table, int tableRows, int &tableCols, REAL percent, bool loadColumnSet )
{
    cout<<"subsample the columns (current:"<<tableCols<<") to "<<percent*100.0<<"% of columns (skip constant 1 features)"<<flush;
    if ( percent <= 0.0 || percent >= 1.0 )
    {
        cout<<"  [nothing to do]"<<endl;
        return;
    }
    cout<<endl;
    
    // determine constant 1 features (these are always retained below)
    bool* isConstantOne = new bool[tableCols];
    bool* selectedCols = new bool[tableCols];
    for ( int i=0;i<tableCols;i++ )
    {
        isConstantOne[i] = true;
        selectedCols[i] = false;
    }
    for ( int i=0;i<tableRows;i++ )
        for ( int j=0;j<tableCols;j++ )
            isConstantOne[j] &= table[j+i*tableCols]==1.0;

    // draw the random column mask; cnt = number of kept columns
    srand ( Framework::getRandomSeed() );
    int cnt = 0;
    for ( int i=0;i<tableCols;i++ )
        if ( ( double ) rand() / ( double ) RAND_MAX < percent || isConstantOne[i] )
        {
            selectedCols[i] = true;
            cnt++;
        }
    delete[] isConstantOne;

    if ( loadColumnSet )
    {
        // replace the random mask with the one saved by a previous call
        // (keeps train/test column selection consistent); cnt is recomputed
        string fname = m_datasetPath + "/" + string ( DATA_PATH ) + "/subspace.txt";
        cout<<"load subspace file:"<<fname<<endl;
        fstream f ( fname.c_str(),ios::in );
        cnt = 0;
        for ( int i=0;i<tableCols;i++ )
        {
            f>>selectedCols[i];
            cnt += selectedCols[i];
        }
        f.close();
    }
    else
    {
        // persist the freshly drawn mask for later calls
        string fname = m_datasetPath + "/" + string ( DATA_PATH ) + "/subspace.txt";
        cout<<"write subspace file:"<<fname<<endl;
        fstream f ( fname.c_str(),ios::out );
        for ( int i=0;i<tableCols;i++ )
            f<<selectedCols[i]<<endl;
        f.close();
    }

    cout<<"allocate new table set, column size:"<<cnt<<endl;
    REAL* newTable = new REAL[cnt*tableRows];

    // copy the selected columns row by row into the compacted table
    // (this srand re-seeds the RNG; no further rand() calls follow in this function)
    srand ( Framework::getRandomSeed() );
    for ( int i=0;i<tableRows;i++ )
    {
        int c = 0;
        for ( int j=0;j<tableCols;j++ )
        {
            if ( selectedCols[j] )
            {
                newTable[c+i*cnt] = table[j+i*tableCols];
                c++;
            }
        }
    }

    // swap the new buffer in and release the old one
    delete[] table;
    delete[] selectedCols;
    table = newTable;
    tableCols = cnt;
}
02399 
02403 void Data::addConstantInput()
02404 {
02405     if(m_trainOrig)
02406     {
02407         cout<<"Add a constant 1 column to the train feature matrix"<<endl;
02408         REAL* trainTmp = new REAL[m_nTrain*(m_nFeatures+1)];
02409         for(int i=0;i<m_nTrain;i++)
02410         {
02411             for(int j=0;j<m_nFeatures;j++)
02412                 trainTmp[i*(m_nFeatures+1)+j] = m_trainOrig[i*m_nFeatures+j];
02413             trainTmp[i*(m_nFeatures+1)+m_nFeatures] = 1.0;
02414         }
02415         delete[] m_trainOrig;
02416         m_trainOrig = trainTmp;
02417     }
02418     if(m_testOrig)
02419     {
02420         cout<<"Add a constant 1 column to the test feature matrix"<<endl;
02421         REAL* testTmp = new REAL[m_nTest*(m_nFeatures+1)];
02422         for(int i=0;i<m_nTest;i++)
02423         {
02424             for(int j=0;j<m_nFeatures;j++)
02425                 testTmp[i*(m_nFeatures+1)+j] = m_testOrig[i*m_nFeatures+j];
02426             testTmp[i*(m_nFeatures+1)+m_nFeatures] = 1.0;
02427         }
02428         delete[] m_testOrig;
02429         m_testOrig = testTmp;
02430     }
02431     m_nFeatures++;
02432 }

Generated on Tue Jan 26 09:20:58 2010 for ELF by  doxygen 1.5.8