Data Class Reference

#include <Data.h>

Inheritance diagram for Data:

Framework Algorithm BlendingNN Autoencoder StandardAlgorithm GBDT KernelRidgeRegression KNearestNeighbor LinearModel LinearModelNonNeg LogisticRegression NeuralNetwork NeuralNetworkRBMauto PolynomialRegression

List of all members.

Public Member Functions

 Data ()
virtual ~Data ()
void readParameter (string line, int mode)
void readDscFile (string name)
void setPathes (string temp, string dsc, string fullPred, string data)
void readDataset (string name)
void allocMemForCrossValidationSets ()
void partitionDatasetToCrossValidationSets ()
void fillCascadeLearningInputs ()
void extendTrainDataWithCascadeInputs ()
void fillNCrossValidationSet (int n)
void freeNCrossValidationSet (int n)
void readEffectFile ()
void setDataPointers (Data *data)
void mixDataset ()
void deleteMemory ()
void loadNormalization (int nCascade=0)
void setAlgorithmList (vector< string > m_algorithmNameList)
void loadFeatureSelectionFile ()
void saveFeatureSelectionFile ()
void doFeatureSelection ()
void makeBinaryDataset ()
void enableBagging (bool en)
void doBootstrapSampling (REAL *probs, REAL *&train, REAL *&target, REAL *&targetEff, REAL *&targetRes, int *&label, int nTrainNew=0)
void baggingRandomSeed (uint seed)
int vectorSampling (REAL *probs, int length)
void mergeTrainAndTest ()
void normalizeZeroOne ()
void reduceTrainingSetSize (REAL percent)
void reduceFeatureSize (REAL *&table, int tableRows, int &tableCols, REAL percent, bool loadColumnSet)
void addConstantInput ()

Static Public Member Functions

static vector< string > getDirectoryFileList (string path)
static int * splitStringToIntegerList (string str, char delimiter)
static vector< string > splitStringToStringList (string str, char delimiter)

Protected Attributes

string m_datasetPath
string m_datasetName
string m_algorithmName
int m_algorithmID
string m_trainOnFullPredictorFile
bool m_disableTraining
int m_randSeed
int m_nMixDataset
int m_nMixTrainList
int m_nCross
string m_validationType
int m_maxThreadsInCross
bool m_enableGlobalMeanStdEstimate
REAL m_positiveTarget
REAL m_negativeTarget
double m_blendingRegularization
bool m_enableGlobalBlendingWeights
bool m_blendingEnableCrossValidation
bool m_enablePostNNBlending
string m_blendingAlgorithm
bool m_enableCascadeLearning
int m_nCascadeInputs
REAL * m_cascadeInputs
map< string, int > m_intMap
map< string, double > m_doubleMap
map< string, bool > m_boolMap
map< string, string > m_stringMap
string m_tempPath
string m_dscPath
string m_fullPredPath
string m_dataPath
int m_nFeatures
int m_nClass
int m_nDomain
int * m_mixDatasetIndices
int * m_mixList
int * m_crossIndex
uint m_nTrain
REAL * m_trainOrig
REAL * m_trainTargetOrig
REAL * m_trainTargetOrigEffect
REAL * m_trainTargetOrigResidual
int * m_trainLabelOrig
uint m_nTest
REAL * m_testOrig
REAL * m_testTargetOrig
int * m_testLabelOrig
int * m_slotBoundaries
int * m_trainSize
REAL ** m_train
REAL ** m_trainTarget
REAL ** m_trainTargetEffect
REAL ** m_trainTargetResidual
int ** m_trainLabel
int ** m_trainBaggingIndex
int * m_probeSize
REAL ** m_probe
REAL ** m_probeTarget
REAL ** m_probeTargetEffect
REAL ** m_probeTargetResidual
int ** m_probeLabel
int ** m_probeIndex
int m_validSize
REAL * m_valid
REAL * m_validTarget
int * m_validLabel
REAL * m_mean
REAL * m_std
REAL m_standardDeviationMin
REAL * m_targetMean
bool m_enableSaveMemory
string m_errorFunction
REAL * m_support
vector< string > m_algorithmNameList
bool m_enablePostBlendClipping
REAL m_addOutputNoise
bool m_enableFeatureSelection
bool m_featureSelectionWriteBinaryDataset
bool m_enableBagging
uint m_randomSeedBagging
bool m_disableWriteDscFile
bool m_enableStaticNormalization
REAL m_staticMeanNormalization
REAL m_staticStdNormalization
bool m_enableProbablisticNormalization
string m_dimensionalityReduction
REAL m_subsampleTrainSet
REAL m_subsampleFeatures
int m_globalTrainingLoops
bool m_addConstantInput
bool m_loadWeightsBeforeTraining

Friends

class Scheduler
class Algorithm
class Autoencoder


Detailed Description

This is the basic data class. This class holds the dataset and also the n-fold cross-validation dataset.

In general, an algorithm is derived from the Algorithm class (which is itself a child of Data). The information from the master.dsc files is stored here.

Mean and Standard deviation for inputs are stored here.

Definition at line 34 of file Data.h.


Constructor & Destructor Documentation

Data::Data (  ) 

Constructor

Definition at line 8 of file Data.cpp.

00009 {
00010     cout<<"Constructor Data"<<endl;
00011 
00012     // init member vars
00013     m_algorithmID = 0;
00014     m_randSeed = 0;
00015     m_nMixDataset = 0;
00016     m_nMixTrainList = 0;
00017     m_nCross = 0;
00018     m_validationType = "Retraining";
00019     m_maxThreadsInCross = 0;
00020     m_enableGlobalMeanStdEstimate = 0;
00021     m_positiveTarget = 0;
00022     m_negativeTarget = 0;
00023     m_blendingRegularization = 0;
00024     m_enableGlobalBlendingWeights = 0;
00025     m_blendingEnableCrossValidation = 0;
00026     m_enablePostNNBlending = 0;
00027     m_enableCascadeLearning = 0;
00028     m_nCascadeInputs = 0;
00029     m_cascadeInputs = 0;
00030     m_nFeatures = 0;
00031     m_nClass = 0;
00032     m_nDomain = 0;
00033     m_mixDatasetIndices = 0;
00034     m_mixList = 0;
00035     m_crossIndex = 0;
00036     m_nTrain = 0;
00037     m_trainOrig = 0;
00038     m_trainTargetOrig = 0;
00039     m_trainTargetOrigEffect = 0;
00040     m_trainTargetOrigResidual = 0;
00041     m_trainLabelOrig = 0;
00042     m_trainBaggingIndex = 0;
00043     m_nTest = 0;
00044     m_testOrig = 0;
00045     m_testTargetOrig = 0;
00046     m_testLabelOrig = 0;
00047     m_slotBoundaries = 0;
00048     m_trainSize = 0;
00049     m_train = 0;
00050     m_trainTarget = 0;
00051     m_trainTargetEffect = 0;
00052     m_trainTargetResidual = 0;
00053     m_trainLabel = 0;
00054     m_probeSize = 0;
00055     m_probe = 0;
00056     m_probeTarget = 0;
00057     m_probeTargetEffect = 0;
00058     m_probeTargetResidual = 0;
00059     m_probeLabel = 0;
00060     m_probeIndex = 0;
00061     m_validSize = 0;
00062     m_valid = 0;
00063     m_validTarget = 0;
00064     m_validLabel = 0;
00065     m_mean = 0;
00066     m_std = 0;
00067     m_standardDeviationMin = 0;
00068     m_targetMean = 0;
00069     m_enableSaveMemory = 0;
00070     m_support = 0;
00071     m_enablePostBlendClipping = 0;
00072     m_addOutputNoise = 0;
00073     m_enableFeatureSelection = 0;
00074     m_featureSelectionWriteBinaryDataset = 0;
00075     m_enableBagging = 0;
00076     m_randomSeedBagging = 0;
00077     m_enableStaticNormalization = 0;
00078     m_staticMeanNormalization = 0.0;
00079     m_staticStdNormalization = 1.0;
00080     m_enableProbablisticNormalization = 0;
00081     m_dimensionalityReduction = "";
00082     m_subsampleTrainSet = 1.0;
00083     m_subsampleFeatures = 1.0;
00084     m_disableTraining = false;
00085     m_globalTrainingLoops = 1;
00086     m_addConstantInput = 0;
00087     m_loadWeightsBeforeTraining = false;
00088 }

Data::~Data (  )  [virtual]

Destructor

Definition at line 93 of file Data.cpp.

00094 {
00095     cout<<"destructor Data"<<endl;
00096 
00097 }


Member Function Documentation

void Data::addConstantInput (  ) 

Add a constant 1 column to the feature matrices

Definition at line 2403 of file Data.cpp.

02404 {
02405     if(m_trainOrig)
02406     {
02407         cout<<"Add a constant 1 column to the train feature matrix"<<endl;
02408         REAL* trainTmp = new REAL[m_nTrain*(m_nFeatures+1)];
02409         for(int i=0;i<m_nTrain;i++)
02410         {
02411             for(int j=0;j<m_nFeatures;j++)
02412                 trainTmp[i*(m_nFeatures+1)+j] = m_trainOrig[i*m_nFeatures+j];
02413             trainTmp[i*(m_nFeatures+1)+m_nFeatures] = 1.0;
02414         }
02415         delete[] m_trainOrig;
02416         m_trainOrig = trainTmp;
02417     }
02418     if(m_testOrig)
02419     {
02420         cout<<"Add a constant 1 column to the test feature matrix"<<endl;
02421         REAL* testTmp = new REAL[m_nTest*(m_nFeatures+1)];
02422         for(int i=0;i<m_nTest;i++)
02423         {
02424             for(int j=0;j<m_nFeatures;j++)
02425                 testTmp[i*(m_nFeatures+1)+j] = m_testOrig[i*m_nFeatures+j];
02426             testTmp[i*(m_nFeatures+1)+m_nFeatures] = 1.0;
02427         }
02428         delete[] m_testOrig;
02429         m_testOrig = testTmp;
02430     }
02431     m_nFeatures++;
02432 }

void Data::allocMemForCrossValidationSets (  ) 

Allocate memory for the cross-validation dataset splits.

The target values are related by: residual = original - effect, where residual is the model error, original is the original target value from the data file, and effect is the prediction from an Algorithm (e.g. a preprocessing step).

Definition at line 901 of file Data.cpp.

00902 {
00903     cout<<"Alloc mem for cross validation data sets"<<endl;
00904     m_mean = new REAL[m_nFeatures];
00905     m_std = new REAL[m_nFeatures];
00906 
00907     if(m_validationType == "ValidationSet")
00908         m_nCross = 0;
00909     else
00910     {
00911         // partitioning to nCross-validation sets
00912         if ( m_nCross > m_nTrain )
00913         {
00914             cout<<"Limit: nCross=nTrain"<<endl;
00915             m_nCross = m_nTrain;
00916         }
00917         cout<<"Cross-validation settings: "<<m_nCross<<" sets"<<endl;
00918     }
00919     
00920     // calc global mean and standard deviation over whole dataset
00921     cout<<"Calculating mean and std per input"<<endl;
00922     double minStd = 1e10, maxStd = -1e10, minMean = 1e10, maxMean = -1e10, minValue = 1e10, maxValue = -1e10;
00923     for ( int i=0;i<m_nFeatures;i++ )
00924     {
00925         // calc mean
00926         double mean = 0.0;
00927         for ( int j=0;j<m_nTrain;j++ )
00928         {
00929             REAL v = m_trainOrig[j*m_nFeatures + i];
00930             mean += v;
00931             if ( minValue > v )
00932                 minValue = v;
00933             if ( maxValue < v )
00934                 maxValue = v;
00935         }
00936         mean /= ( double ) m_nTrain;
00937 
00938         // calc standard deviation
00939         double std = 0.0;
00940         for ( int j=0;j<m_nTrain;j++ )
00941             std += ( mean - m_trainOrig[j*m_nFeatures + i] ) * ( mean - m_trainOrig[j*m_nFeatures + i] );
00942         std = sqrt ( std/ ( double ) ( m_nTrain-1 ) );
00943 
00944         if ( m_datasetName=="KDDCup09Large" || m_datasetName=="KDDCup09Small" ) // || m_datasetName=="BINARY")
00945         {
00946             double max = -1e10;
00947             for ( int j=0;j<m_nTrain;j++ )
00948                 if ( max < fabs ( m_trainOrig[j*m_nFeatures + i]-mean ) )
00949                     max = fabs ( m_trainOrig[j*m_nFeatures + i]-mean );
00950             std = max;
00951         }
00952 
00953         if ( fabs ( std ) < 1e-9 && mean == 0.0 ) // constant zero input
00954         {
00955             //cout<<"Feature nr:"<<i<<" is constant zero (mean:"<<mean<<"), set std=1e10"<<endl;
00956             cout<<"f:"<<i<<"=0 "<<flush;
00957             std = 1e10;
00958         }
00959         if ( fabs ( std ) < 1e-9 && mean != 0.0 ) // constant input
00960         {
00961             //cout<<"Feature nr:"<<i<<" is constant (mean:"<<mean<<"), set std="<<mean<<" and mean=0"<<endl;
00962             cout<<"f:"<<i<<"=c "<<flush;
00963             std = mean;
00964             mean = 0.0;
00965         }
00966         if ( mean==1.0 ) // constant one input
00967         {
00968             //cout<<"Feature nr:"<<i<<" mean=1, set std=1 and mean=0"<<endl;
00969             cout<<"f:"<<i<<"=1 "<<flush;
00970             std = 1.0;
00971             mean = 0.0;
00972         }
00973         if ( std < m_standardDeviationMin ) // limit to a small positive value
00974         {
00975             //cout<<"Feature nr:"<<i<<" "<<"("<<std<<") is limited in std="<<m_standardDeviationMin<<endl;
00976             cout<<"f:"<<i<<"lim "<<flush;
00977             std = m_standardDeviationMin;
00978         }
00979 
00980         minStd = minStd > std? std : minStd;
00981         maxStd = maxStd < std? std : maxStd;
00982         minMean = minMean > mean? mean : minMean;
00983         maxMean = maxMean < mean? mean : maxMean;
00984 
00985         // save them
00986         m_mean[i] = mean;
00987         m_std[i] = std;
00988     }
00989     if ( m_enableStaticNormalization )
00990     {
00991         cout<<"Static mean:"<<m_staticMeanNormalization<<" and std:"<<m_staticStdNormalization<<endl;
00992         for ( int i=0;i<m_nFeatures;i++ )
00993         {
00994             m_mean[i] = m_staticMeanNormalization;
00995             m_std[i] = m_staticStdNormalization;
00996         }
00997         minMean = m_staticMeanNormalization;
00998         maxMean = m_staticMeanNormalization;
00999         minStd = m_staticStdNormalization;
01000         maxStd = m_staticStdNormalization;
01001     }
01002     if ( m_enableGlobalMeanStdEstimate )
01003     {
01004         cout<<"Calc average of mean and std"<<endl;
01005         double mean = 0.0;
01006         for ( int i=0;i<m_nFeatures;i++ )
01007             mean += m_mean[i];
01008         mean /= ( double ) m_nFeatures;
01009         for ( int i=0;i<m_nFeatures;i++ )
01010             m_mean[i] = mean;
01011         minMean = maxMean = mean;
01012 
01013         double std = 0.0;
01014         int stdCnt = 0;
01015         for ( int i=0;i<m_nFeatures;i++ )
01016         {
01017             if ( m_std[i] != 1e10 )
01018             {
01019                 std += m_std[i];
01020                 stdCnt++;
01021             }
01022         }
01023         if ( stdCnt == 0 )
01024             assert ( false );
01025         std /= ( double ) stdCnt;
01026         for ( int i=0;i<m_nFeatures;i++ )
01027             m_std[i] = std;
01028         minStd = maxStd = std;
01029     }
01030     if ( m_enableProbablisticNormalization )
01031     {
01032         cout<<"Calc probablistic normalization"<<endl;
01033         minStd = 1e10;
01034         maxStd = -1e10;
01035         minMean = 1e10;
01036         maxMean = -1e10;
01037         for ( int i=0;i<m_nFeatures;i++ )
01038         {
01039             REAL min = 1e10, max = -1e10;
01040             for ( int j=0;j<m_nTrain;j++ )
01041             {
01042                 REAL v = m_trainOrig[i + j*m_nFeatures];
01043                 if ( min > v )
01044                     min = v;
01045                 if ( max < v )
01046                     max = v;
01047             }
01048             REAL diff = max - min;
01049             m_mean[i] = min;
01050             m_std[i] = diff;
01051             if ( m_std[i] < 1e-6 )
01052                 m_std[i] = 1.0;
01053 
01054             minStd = minStd > m_std[i]? m_std[i] : minStd;
01055             maxStd = maxStd < m_std[i]? m_std[i] : maxStd;
01056             minMean = minMean > m_mean[i]? m_mean[i] : minMean;
01057             maxMean = maxMean < m_mean[i]? m_mean[i] : maxMean;
01058         }
01059         cout<<"mean|std:"<<endl;
01060         for ( int i=0;i<m_nFeatures;i++ )
01061             cout<<m_mean[i]<<"|"<<m_std[i]<<" ";
01062         cout<<endl;
01063     }
01064     cout<<"Min|Max mean: "<<minMean<<"|"<<maxMean<<"   Min|Max std: "<<minStd<<"|"<<maxStd<<"   Min|Max value: "<<minValue<<"|"<<maxValue<<endl;
01065 
01066     // target means
01067     cout<<"Target means: "<<flush;
01068     for ( int i=0;i<m_nClass*m_nDomain;i++ )
01069     {
01070         double mean = 0.0;
01071         REAL* ptr = m_trainTargetOrig + i * m_nClass * m_nDomain;
01072         for ( int j=0;j<m_nTrain;j++ )
01073             mean += ptr[j];
01074         cout<<i<<":"<<mean/ ( double ) ( m_nTrain ) <<" ";
01075     }
01076     cout<<endl;
01077 
01078     // save normalization
01079     char buf[1024];
01080     sprintf ( buf,"%s/%s/normalization.dat.add%d",m_datasetPath.c_str(), m_tempPath.c_str(), m_nCascadeInputs );
01081     cout<<"Save mean and std: "<<buf<<endl;
01082     fstream f ( buf, ios::out );
01083     f.write ( ( char* ) &m_nFeatures, sizeof ( int ) );
01084     f.write ( ( char* ) m_mean, sizeof ( REAL ) *m_nFeatures );
01085     f.write ( ( char* ) m_std, sizeof ( REAL ) *m_nFeatures );
01086     f.close();
01087 
01088     m_mixList = new int[m_nTrain];
01089 
01090     // mixing list
01091     for ( int i=0;i<m_nTrain;i++ )
01092         m_mixList[i] = i;
01093 
01094     // fix the randomness
01095     cout<<"Random seed:"<<m_randSeed<<endl;
01096     srand ( m_randSeed );
01097 
01098     cout<<"nFeatures:"<<m_nFeatures<<endl;
01099     cout<<"nClass:"<<m_nClass<<endl;
01100     cout<<"nDomain:"<<m_nDomain<<endl;
01101 
01102     if ( m_validationType == "ValidationSet" )
01103     {
01104         // no cross validation set
01105         m_trainSize = new int[1];
01106         m_trainSize[0] = m_nTrain;
01107         return;
01108     }
01109     
01110     
01111     m_trainTargetOrigEffect = new REAL[m_nClass*m_nDomain*m_nTrain];
01112     m_trainTargetOrigResidual = new REAL[m_nClass*m_nDomain*m_nTrain];
01113 
01114     // allocate mem for cross validation sets
01115     m_trainSize = new int[m_nCross+1];
01116     m_train = new REAL*[m_nCross+1];
01117     m_trainTarget = new REAL*[m_nCross+1];
01118     m_trainTargetEffect = new REAL*[m_nCross+1];
01119     m_trainTargetResidual = new REAL*[m_nCross+1];
01120     m_trainLabel = new int*[m_nCross+1];
01121     if(m_validationType == "Bagging")
01122         m_trainBaggingIndex = new int*[m_nCross+1];
01123 
01124     m_probeSize = new int[m_nCross+1];
01125     m_probe = new REAL*[m_nCross+1];
01126     m_probeTarget = new REAL*[m_nCross+1];
01127     m_probeTargetEffect = new REAL*[m_nCross+1];
01128     m_probeTargetResidual = new REAL*[m_nCross+1];
01129     m_probeLabel = new int*[m_nCross+1];
01130     m_probeIndex = new int*[m_nCross+1];
01131 
01132     
01133     // make a randomized index list (by random index swaps)
01134     int index0, index1, tmp;
01135     cout<<"Make "<<m_nTrain*m_nMixTrainList<<" index swaps (randomize sample index list)"<<endl;
01136     for ( int i=0;i<m_nTrain*m_nMixTrainList;i++ )
01137     {
01138         index0 = rand() % m_nTrain;
01139         index1 = rand() % m_nTrain;
01140 
01141         // swap
01142         tmp = m_mixList[index0];
01143         m_mixList[index0] = m_mixList[index1];
01144         m_mixList[index1] = tmp;
01145     }
01146 
01147     if( m_validationType == "Retraining" || m_validationType == "CrossFoldMean" )
01148     {
01149         m_slotBoundaries = new int[m_nCross+2];
01150     
01151         double partitionSize = ( double ) m_nTrain / ( double ) m_nCross;
01152         double accumulatedSize = partitionSize;
01153         int cnt = 0, currentSize = -1;
01154         m_slotBoundaries[0] = 0;
01155         m_slotBoundaries[m_nCross+1] = m_nTrain;
01156         cout<<"partition size: "<<partitionSize<<endl;
01157     
01158         // calculate train + probe size
01159         for ( int i=0;i<=m_nTrain;i++ )
01160         {
01161             currentSize++;
01162             if ( cnt < m_nCross )
01163             {
01164                 if ( i == ( int ) round ( accumulatedSize ) || i==m_nTrain )
01165                 {
01166                     m_slotBoundaries[cnt+1] = i;
01167                     m_probeSize[cnt] = currentSize;
01168                     m_trainSize[cnt] = m_nTrain - currentSize;
01169                     currentSize = 0;
01170                     accumulatedSize += partitionSize;
01171                     cnt++;
01172                 }
01173             }
01174         }
01175         m_trainSize[m_nCross] = m_nTrain;  // retraining set
01176         m_probeSize[m_nCross] = 0;
01177         
01178         // print splits
01179         int sum = 0;
01180         cout<<"slot: TRAIN | PROBE"<<endl<<"==================="<<endl;
01181         for ( int i=0;i<m_nCross+1;i++ )
01182         {
01183             cout<<i<<": "<<m_trainSize[i]<<" | "<<m_probeSize[i]<<endl;
01184             sum += m_probeSize[i];
01185         }
01186         cout<<"probe sum:"<<sum<<endl;
01187     }
01188     else if ( m_validationType == "Bagging" )
01189     {
01190         bool* bagSamples = new bool[m_nTrain];
01191         cout<<"Bagging sizes: TRAIN | PROBE"<<endl<<"============================"<<endl;
01192         for(int i=0;i<m_nCross;i++)
01193         {
01194             m_trainBaggingIndex[i] = new int[m_nTrain];
01195             
01196             // simulate boostrap sampling: sampling with replacenent
01197             srand(Framework::getRandomSeed() + i);
01198             int cnt = 0;
01199             for(int j=0;j<m_nTrain;j++)
01200                 bagSamples[j] = 0;
01201             for(int j=0;j<m_nTrain;j++)
01202             {
01203                 int ind = rand() % m_nTrain;
01204                 bagSamples[ind] = 1;
01205                 m_trainBaggingIndex[i][j] = ind;
01206             }
01207             for(int j=0;j<m_nTrain;j++)
01208                 cnt += bagSamples[j];
01209             m_trainSize[i] = m_nTrain;
01210             m_probeSize[i] = m_nTrain - cnt;
01211             
01212             m_probeIndex[i] = new int[m_probeSize[i]];
01213             cnt = 0;
01214             for(int j=0;j<m_nTrain;j++)
01215             {
01216                 if(bagSamples[j] == false)
01217                 {
01218                     m_probeIndex[i][cnt] = j;
01219                     cnt++;
01220                 }
01221             }
01222             cout<<i<<": "<<m_trainSize[i]<<" | "<<m_probeSize[i]<<"  ("<<100.0*(double)m_probeSize[i]/(double)m_nTrain<<"% in probe)"<<endl;
01223         }
01224         m_trainSize[m_nCross] = 0;
01225         m_probeSize[m_nCross] = 0;
01226         m_probeIndex[m_nCross] = 0;
01227         m_trainBaggingIndex[m_nCross] = 0;
01228         delete[] bagSamples;
01229         
01230         // make a summary (#zeros, mean coverage)
01231         int* bagCnt = new int[m_nTrain];
01232         for(int i=0;i<m_nTrain;i++)
01233             bagCnt[i] = 0;
01234         for(int i=0;i<m_nCross;i++)
01235             for(int j=0;j<m_nTrain;j++)
01236                 bagCnt[m_trainBaggingIndex[i][j]]++;
01237         cout<<"Bagging summary: #averaged: and  #cnt"<<endl;
01238         for(int nr=0;nr<2*m_nCross;nr++)
01239         {
01240             int cnt = 0;
01241             for(int i=0;i<m_nTrain;i++)
01242                 if(bagCnt[i] == nr)
01243                     cnt++;
01244             cout<<"n:"<<nr<<"|#"<<cnt<<" ";
01245         }
01246         cout<<endl;
01247         delete[] bagCnt;
01248     }
01249     else
01250         assert(false);
01251     
01252     // allocate mem + copy data to cross-validation slots
01253     for ( int i=0;i<m_nCross+1;i++ )
01254     {
01255         // allocate train mem
01256         int nTrain = m_trainSize[i];
01257         if ( m_enableSaveMemory == false )
01258             m_train[i] = new REAL[nTrain * m_nFeatures];
01259         else
01260             m_train[i] = 0;
01261         m_trainTarget[i] = new REAL[nTrain * m_nClass * m_nDomain];
01262         m_trainTargetEffect[i] = new REAL[nTrain * m_nClass * m_nDomain];
01263         m_trainTargetResidual[i] = new REAL[nTrain * m_nClass * m_nDomain];
01264         m_trainLabel[i] = new int[nTrain*m_nDomain];
01265 
01266         // allocate probe mem
01267         int nProbe = m_probeSize[i];
01268         if ( nProbe )
01269         {
01270             if ( m_enableSaveMemory == false )
01271                 m_probe[i] = new REAL[nProbe * m_nFeatures];
01272             else
01273                 m_probe[i] = 0;
01274             m_probeTarget[i] = new REAL[nProbe * m_nClass * m_nDomain];
01275             m_probeTargetEffect[i] = new REAL[nProbe * m_nClass * m_nDomain];
01276             m_probeTargetResidual[i] = new REAL[nProbe * m_nClass * m_nDomain];
01277             m_probeLabel[i] = new int[nProbe*m_nDomain];
01278             if ( m_validationType != "Bagging" )
01279                 m_probeIndex[i] = new int[nProbe];
01280         }
01281         else
01282         {
01283             m_probe[i] = 0;
01284             m_probeTarget[i] = 0;
01285             m_probeTargetEffect[i] = 0;
01286             m_probeTargetResidual[i] = 0;
01287             m_probeLabel[i] = 0;
01288             m_probeIndex[i] = 0;
01289         }
01290     }
01291 
01292     // alloc index list
01293     m_crossIndex = new int[m_nTrain];
01294     for ( int i=0;i<m_nTrain;i++ )
01295         m_crossIndex[i] = -1;
01296     
01297 }

void Data::baggingRandomSeed ( uint  seed  ) 

Set the random seed in bagging

Parameters:
seed The seed

Definition at line 2143 of file Data.cpp.

02144 {
02145     m_randomSeedBagging = seed;
02146 }

void Data::deleteMemory (  ) 

Deletes internal memory, in order to re-read a dataset and start the training again

Definition at line 104 of file Data.cpp.

00105 {
00106     cout<<"Delete internal memory"<<endl;
00107 
00108     // memory from dataset
00109     if ( m_trainOrig )
00110         delete[] m_trainOrig;
00111     m_trainOrig = 0;
00112     if ( m_trainTargetOrig )
00113         delete[] m_trainTargetOrig;
00114     m_trainTargetOrig = 0;
00115     if ( m_trainLabelOrig )
00116         delete[] m_trainLabelOrig;
00117     m_trainLabelOrig = 0;
00118     if ( m_testOrig )
00119         delete[] m_testOrig;
00120     m_testOrig = 0;
00121     if ( m_testTargetOrig )
00122         delete[] m_testTargetOrig;
00123     m_testTargetOrig = 0;
00124     if ( m_testLabelOrig )
00125         delete[] m_testLabelOrig;
00126     m_testLabelOrig = 0;
00127 
00128     // memory from cross validation
00129     if ( m_mean )
00130         delete[] m_mean;
00131     m_mean = 0;
00132     if ( m_std )
00133         delete[] m_std;
00134     m_std = 0;
00135     if ( m_trainTargetOrigEffect )
00136         delete[] m_trainTargetOrigEffect;
00137     m_trainTargetOrigEffect = 0;
00138     if ( m_trainTargetOrigResidual )
00139         delete[] m_trainTargetOrigResidual;
00140     m_trainTargetOrigResidual = 0;
00141 
00142     for ( int i=0;i<m_nCross+1;i++ )
00143     {
00144         if ( m_train )
00145         {
00146             if ( m_train[i] )
00147                 delete[] m_train[i];
00148             m_train[i] = 0;
00149         }
00150         if ( m_trainTarget )
00151         {
00152             if ( m_trainTarget[i] )
00153                 delete[] m_trainTarget[i];
00154             m_trainTarget[i] = 0;
00155         }
00156         if ( m_trainTargetEffect )
00157         {
00158             if ( m_trainTargetEffect[i] )
00159                 delete[] m_trainTargetEffect[i];
00160             m_trainTargetEffect[i] = 0;
00161         }
00162         if ( m_trainTargetResidual )
00163         {
00164             if ( m_trainTargetResidual[i] )
00165                 delete[] m_trainTargetResidual[i];
00166             m_trainTargetResidual[i] = 0;
00167         }
00168         if ( m_trainLabel )
00169         {
00170             if ( m_trainLabel[i] )
00171                 delete[] m_trainLabel[i];
00172             m_trainLabel[i] = 0;
00173         }
00174         if ( m_validationType == "Bagging" )
00175         {
00176             if( m_trainBaggingIndex )
00177             {
00178                 if ( m_trainBaggingIndex[i] )
00179                     delete[] m_trainBaggingIndex[i];
00180                 m_trainBaggingIndex[i] = 0;
00181             }
00182         }
00183         if ( m_probe )
00184         {
00185             if ( m_probe[i] )
00186                 delete[] m_probe[i];
00187             m_probe[i] = 0;
00188         }
00189         if ( m_probeTarget )
00190         {
00191             if ( m_probeTarget[i] )
00192                 delete[] m_probeTarget[i];
00193             m_probeTarget[i] = 0;
00194         }
00195         if ( m_probeTargetEffect )
00196         {
00197             if ( m_probeTargetEffect[i] )
00198                 delete[] m_probeTargetEffect[i];
00199             m_probeTargetEffect[i] = 0;
00200         }
00201         if ( m_probeTargetResidual )
00202         {
00203             if ( m_probeTargetResidual[i] )
00204                 delete[] m_probeTargetResidual[i];
00205             m_probeTargetResidual[i] = 0;
00206         }
00207         if ( m_probeLabel )
00208         {
00209             if ( m_probeLabel[i] )
00210                 delete[] m_probeLabel[i];
00211             m_probeLabel[i] = 0;
00212         }
00213         if ( m_probeIndex )
00214         {
00215             if ( m_probeIndex[i] )
00216                 delete[] m_probeIndex[i];
00217             m_probeIndex[i] = 0;
00218         }
00219     }
00220     if ( m_train )
00221         delete[] m_train;
00222     m_train = 0;
00223     if ( m_trainTarget )
00224         delete[] m_trainTarget;
00225     m_trainTarget = 0;
00226     if ( m_trainTargetEffect )
00227         delete[] m_trainTargetEffect;
00228     m_trainTargetEffect = 0;
00229     if ( m_trainTargetResidual )
00230         delete[] m_trainTargetResidual;
00231     m_trainTargetResidual = 0;
00232     if ( m_trainLabel )
00233         delete[] m_trainLabel;
00234     m_trainLabel = 0;
00235     if(m_validationType == "Bagging")
00236     {
00237         if(m_trainBaggingIndex)
00238             delete[] m_trainBaggingIndex;
00239         m_trainBaggingIndex = 0;
00240     }
00241     if ( m_probe )
00242         delete[] m_probe;
00243     m_probe = 0;
00244     if ( m_probeTarget )
00245         delete[] m_probeTarget;
00246     m_probeTarget = 0;
00247     if ( m_probeTargetEffect )
00248         delete[] m_probeTargetEffect;
00249     m_probeTargetEffect = 0;
00250     if ( m_probeTargetResidual )
00251         delete[] m_probeTargetResidual;
00252     m_probeTargetResidual = 0;
00253     if ( m_probeLabel )
00254         delete[] m_probeLabel;
00255     m_probeLabel = 0;
00256     if ( m_probeIndex )
00257         delete[] m_probeIndex;
00258     m_probeIndex = 0;
00259 
00260     if ( m_trainSize )
00261         delete[] m_trainSize;
00262     m_trainSize = 0;
00263     if ( m_probeSize )
00264         delete[] m_probeSize;
00265     m_probeSize = 0;
00266 
00267     if ( m_mixDatasetIndices )
00268         delete[] m_mixDatasetIndices;
00269     m_mixDatasetIndices = 0;
00270     if ( m_mixList )
00271         delete[] m_mixList;
00272     m_mixList = 0;
00273     if ( m_slotBoundaries )
00274         delete[] m_slotBoundaries;
00275     m_slotBoundaries = 0;
00276     if ( m_crossIndex )
00277         delete[] m_crossIndex;
00278     m_crossIndex = 0;
00279 
00280     if ( m_cascadeInputs )
00281         delete[] m_cascadeInputs;
00282     m_cascadeInputs = 0;
00283     
00284     if ( m_targetMean )
00285         delete[] m_targetMean;
00286     m_targetMean = 0;
00287 
00288 }

void Data::doBootstrapSampling ( REAL *  probs,
REAL *&  train,
REAL *&  target,
REAL *&  targetEff,
REAL *&  targetRes,
int *&  label,
int  nTrainNew = 0 
)

This is an obsolete method!! Please use directly the option: validationType=Bagging in the Master.dsc file instead

Make a modified train dataset using boostrap sampling -> Sampling with replacement On average 63% of original data are in the new trainset (with duplicates)

Definition at line 557 of file Data.cpp.

00558 {
00559     cout<<endl<<"Do boostrap sampling of the dataset (size:"<<m_nTrain<<")"<<endl;
00560     cout<<"Random seed:"<<m_randomSeedBagging<<endl;
00561     srand ( m_randomSeedBagging );
00562 
00563     if ( nTrainNew > 0 && nTrainNew < m_nTrain )
00564         cout<<"Draw not a boostrap sample, make a simple random subset ("<<100.0* ( double ) nTrainNew/ ( double ) m_nTrain<<"%)"<<endl;
00565 
00566     REAL* trainNew = 0, *ptr0, *ptr1;
00567     if ( train )
00568         trainNew = new REAL[m_nFeatures*m_nTrain];
00569     REAL* targetNew = 0;
00570     if ( target )
00571         targetNew = new REAL[m_nClass*m_nDomain*m_nTrain];
00572     REAL* targetEffNew = 0;
00573     if ( targetEff )
00574         targetEffNew = new REAL[m_nClass*m_nDomain*m_nTrain];
00575     REAL* targetResNew = 0;
00576     if ( targetRes )
00577         targetResNew = new REAL[m_nClass*m_nDomain*m_nTrain];
00578     int* labelNew = 0;
00579     if ( Framework::getDatasetType() ==true )
00580         labelNew = new int[m_nDomain*m_nTrain];
00581     int* replicateCnt = new int[m_nTrain];
00582     for ( int i=0;i<m_nTrain;i++ )
00583         replicateCnt[i] = 0;
00584 
00585     int sampleCnt = 0;
00586     while ( ( sampleCnt < m_nTrain && nTrainNew == 0 ) || ( sampleCnt < nTrainNew && nTrainNew > 0 && nTrainNew < m_nTrain ) )
00587         //for(int i=0;i<m_nTrain;i++)
00588     {
00589         // random index
00590         int ind;
00591         if ( nTrainNew == 0 || nTrainNew >= m_nTrain ) // boostrap sample
00592         {
00593             if ( probs == 0 )
00594                 ind = rand() %m_nTrain;
00595             else
00596                 ind = vectorSampling ( probs, m_nTrain );
00597         }
00598         else  // random subset
00599         {
00600             ind = rand() %m_nTrain;
00601             while ( replicateCnt[ind] )
00602                 ind = rand() %m_nTrain;
00603         }
00604         replicateCnt[ind]++;
00605 
00606         // train features
00607         if ( train )
00608         {
00609             ptr0 = train + ind * m_nFeatures;
00610             ptr1 = trainNew + sampleCnt * m_nFeatures;
00611             for ( int j=0;j<m_nFeatures;j++ )
00612                 ptr1[j] = ptr0[j];
00613         }
00614 
00615         // targets
00616         if ( target )
00617         {
00618             ptr0 = target + ind * m_nClass*m_nDomain;
00619             ptr1 = targetNew + sampleCnt * m_nClass*m_nDomain;
00620             for ( int j=0;j<m_nClass*m_nDomain;j++ )
00621                 ptr1[j] = ptr0[j];
00622         }
00623 
00624         // effects
00625         if ( targetEff )
00626         {
00627             ptr0 = targetEff + ind * m_nClass*m_nDomain;
00628             ptr1 = targetEffNew + sampleCnt * m_nClass*m_nDomain;
00629             for ( int j=0;j<m_nClass*m_nDomain;j++ )
00630                 ptr1[j] = ptr0[j];
00631         }
00632 
00633         // residual
00634         if ( targetRes )
00635         {
00636             ptr0 = targetRes + ind * m_nClass*m_nDomain;
00637             ptr1 = targetResNew + sampleCnt * m_nClass*m_nDomain;
00638             for ( int j=0;j<m_nClass*m_nDomain;j++ )
00639                 ptr1[j] = ptr0[j];
00640         }
00641 
00642         // train label
00643         if ( Framework::getDatasetType() ==true )
00644             for ( int d=0;d<m_nDomain;d++ )
00645                 labelNew[d+sampleCnt*m_nDomain] = label[d+ind*m_nDomain];
00646 
00647         sampleCnt++;
00648     }
00649 
00650     int nonReplicates = 0, notUsed = 0, replicates = 0;
00651     for ( int i=0;i<m_nTrain;i++ )
00652     {
00653         if ( replicateCnt[i] == 0 )
00654             notUsed++;
00655         if ( replicateCnt[i] == 1 )
00656             nonReplicates++;
00657         if ( replicateCnt[i] > 1 )
00658             replicates++;
00659     }
00660     cout<<"notUsed:"<<notUsed<<" nonReplicates:"<<nonReplicates<<" replicates:"<<replicates;
00661     cout<<" ("<<100.0* ( REAL ) ( nonReplicates+replicates ) / ( REAL ) m_nTrain<<"%)"<<endl<<endl;
00662 
00663     delete[] replicateCnt;
00664 
00665     // set new data
00666     train = trainNew;
00667     target = targetNew;
00668     targetEff = targetEffNew;
00669     targetRes = targetResNew;
00670     label = labelNew;
00671 }

void Data::doFeatureSelection (  ) 

Start the feature selection process

Definition at line 1445 of file Data.cpp.

01446 {
01447     bool* selectedFeatures = new bool[m_nFeatures];
01448     InputFeatureSelector::selectFeatures ( selectedFeatures, m_trainOrig, m_nFeatures, m_nTrain, m_trainLabelOrig, m_trainTargetOrigResidual, m_nClass, m_nDomain );
01449 
01450     delete[] selectedFeatures;
01451 }

void Data::enableBagging ( bool  en  ) 

enable bagging: done by resampling of the trainingset in retraining

Parameters:
en enable

Definition at line 2132 of file Data.cpp.

02133 {
02134     cout<<"Enable bagging:"<<en<<endl;
02135     m_enableBagging = en;
02136 }

void Data::extendTrainDataWithCascadeInputs (  ) 

Extend the input features with predictions of previous algorithms nInputsNew = nInputs + nCascadeInputs

Definition at line 1716 of file Data.cpp.

01717 {
01718     if ( m_nCascadeInputs == 0 )
01719         return;
01720 
01721     cout<<"Extend the train data with cascade inputs"<<endl;
01722 
01723     if ( m_trainOrig )
01724     {
01725         REAL* m_trainOrigNew = new REAL[m_nTrain* ( m_nFeatures+m_nCascadeInputs*m_nClass*m_nDomain ) ];
01726         for ( int i=0;i<m_nTrain;i++ )
01727         {
01728             REAL* ptr0 = m_trainOrigNew + i* ( m_nFeatures+m_nCascadeInputs*m_nClass*m_nDomain );
01729             REAL* ptr1 = m_trainOrig + i*m_nFeatures;
01730             for ( int j=0;j<m_nFeatures;j++ )
01731                 ptr0[j] = ptr1[j];
01732             ptr0 = m_trainOrigNew + i* ( m_nFeatures+m_nCascadeInputs*m_nClass*m_nDomain ) + m_nFeatures;
01733             ptr1 = m_cascadeInputs + i*m_nCascadeInputs*m_nClass*m_nDomain;
01734             for ( int j=0;j<m_nCascadeInputs*m_nClass*m_nDomain;j++ )
01735                 ptr0[j] = ptr1[j];
01736         }
01737         if ( m_trainOrig )
01738             delete[] m_trainOrig;
01739         m_trainOrig = m_trainOrigNew;
01740     }
01741 
01742     if ( m_testOrig )
01743     {
01744         REAL* m_testOrigNew = new REAL[m_nTest* ( m_nFeatures+m_nCascadeInputs*m_nClass*m_nDomain ) ];
01745         for ( int i=0;i<m_nTest;i++ )
01746         {
01747             REAL* ptr0 = m_testOrigNew + i* ( m_nFeatures+m_nCascadeInputs*m_nClass*m_nDomain );
01748             REAL* ptr1 = m_testOrig + i*m_nFeatures;
01749             for ( int j=0;j<m_nFeatures;j++ )
01750                 ptr0[j] = ptr1[j];
01751             ptr0 = m_testOrigNew + i* ( m_nFeatures+m_nCascadeInputs*m_nClass*m_nDomain ) + m_nFeatures;
01752             ptr1 = m_cascadeInputs + i*m_nCascadeInputs*m_nClass*m_nDomain;
01753             for ( int j=0;j<m_nCascadeInputs*m_nClass*m_nDomain;j++ )
01754                 ptr0[j] = ptr1[j];
01755         }
01756         if ( m_testOrig )
01757             delete[] m_testOrig;
01758         m_testOrig = m_testOrigNew;
01759     }
01760 
01761     int nFeaturesBefore = m_nFeatures;
01762     m_nFeatures += m_nCascadeInputs*m_nClass*m_nDomain;
01763     cout<<"nFeatures: "<<m_nFeatures<<" (before: "<<nFeaturesBefore<<")"<<endl;
01764 }

void Data::fillCascadeLearningInputs (  ) 

If this algorithm is based on an other algorithm Add the predictions of previous algorithms as input features This means add all predictions from the fullPredictionPath

Definition at line 1656 of file Data.cpp.

01657 {
01658     cout<<endl<<"Add effects (predictions of previous algorithms) as inputs to dataset"<<endl;
01659 
01660     // load the fullPredictors
01661     vector<string> files = m_algorithmNameList; //Data::getDirectoryFileList(m_datasetPath + "/" + m_fullPredPath + "/");
01662     vector<string> m_usedFiles;
01663 
01664     for ( int i=0;i<files.size();i++ )
01665         if ( files[i].at ( files[i].size()-1 ) != '.' && files[i].find ( ".dat" ) == files[i].length()-4 )
01666             m_usedFiles.push_back ( files[i] );
01667     int size = m_usedFiles.size();
01668 
01669     // alloc mem
01670     m_cascadeInputs = new REAL[size*m_nClass*m_nDomain*m_nTrain];
01671     for ( int i=0;i<size*m_nClass*m_nDomain*m_nTrain;i++ )
01672         m_cascadeInputs[i] = 1e10;
01673 
01674     // fill cascadeInputs
01675     for ( int i=0;i<size;i++ )
01676     {
01677         fstream f ( m_usedFiles[i].c_str(), ios::in );
01678         if ( f.is_open() == false )
01679             assert ( false );
01680         REAL* cache = new REAL[m_nTrain*m_nClass*m_nDomain];
01681         f.read ( ( char* ) cache, sizeof ( REAL ) *m_nTrain*m_nClass*m_nDomain );
01682         f.close();
01683 
01684         for ( int j=0;j<m_nTrain;j++ )
01685             for ( int k=0;k<m_nClass*m_nDomain;k++ )
01686                 m_cascadeInputs[j*m_nClass*m_nDomain*size + i*m_nClass*m_nDomain + k] = cache[j*m_nClass*m_nDomain + k];
01687 
01688         if ( cache )
01689             delete[] cache;
01690         cache = 0;
01691     }
01692     for ( int i=0;i<size;i++ )
01693     {
01694         double rmse = 0.0, err;
01695         for ( int j=0;j<m_nTrain;j++ )
01696             for ( int k=0;k<m_nClass*m_nDomain;k++ )
01697             {
01698                 err = m_cascadeInputs[j*m_nClass*m_nDomain*size + i*m_nClass*m_nDomain + k] - m_trainTargetOrig[k + j*m_nClass*m_nDomain];
01699                 rmse += err*err;
01700             }
01701         cout<<"File:"<<m_usedFiles[i]<<"  RMSE:"<<sqrt ( rmse/ ( double ) ( m_nClass*m_nTrain*m_nDomain ) ) <<endl;
01702     }
01703     if ( size == 0 )
01704         cout<<"Nothing to do here"<<endl;
01705     cout<<endl;
01706 
01707     m_nCascadeInputs = size;
01708     cout<<"nCascadeInputs:"<<m_nCascadeInputs<<endl;
01709 }

void Data::fillNCrossValidationSet ( int  n  ) 

Fill one split of the cross-fold validation set

Parameters:
n The n-th set (0..nCross-1)

Definition at line 1348 of file Data.cpp.

01349 {
// (Re)allocate and zero the train matrix of fold n.
01350     // alloc new memory
01351     if ( m_train[n] )
01352         delete[] m_train[n];
01353     m_train[n] = 0;
01354     m_train[n] = new REAL[m_trainSize[n]*m_nFeatures];
01355     for ( int i=0;i<m_trainSize[n]*m_nFeatures;i++ )
01356         m_train[n][i] = 0.0;
// (Re)allocate the probe matrix; if m_probeSize[n] is 0 the pointer stays 0
// and the zeroing loop below runs zero iterations.
01357     if ( m_probe[n] )
01358         delete[] m_probe[n];
01359     m_probe[n] = 0;
01360     if ( m_probeSize[n] )
01361         m_probe[n] = new REAL[m_probeSize[n]*m_nFeatures];
01362     for ( int i=0;i<m_probeSize[n]*m_nFeatures;i++ )
01363         m_probe[n][i] = 0.0;
01364 
01365     if(m_validationType == "Bagging")
01366     {
// Bagging: the train split is the bootstrap sample given by
// m_trainBaggingIndex[n]; rows never drawn (out-of-bag) become the probe set.
// NOTE(review): this writes m_nTrain train rows — assumes
// m_trainSize[n] == m_nTrain for bagging folds; confirm in the allocator.
01367         bool* bagSamples = new bool[m_nTrain];
01368         for(int i=0;i<m_nTrain;i++)
01369             bagSamples[i] = 0;
01370         for(int i=0;i<m_nTrain;i++)
01371         {
01372             int ind = m_trainBaggingIndex[n][i];
01373             bagSamples[ind] = 1;
01374             for(int j=0;j<m_nFeatures;j++)
01375                 m_train[n][i*m_nFeatures+j] = m_trainOrig[ind*m_nFeatures + j];
01376         }
// Copy the out-of-bag rows into the probe matrix.
01377         int cnt = 0;
01378         for(int i=0;i<m_nTrain;i++)
01379         {
01380             if(bagSamples[i] == false)
01381             {
01382                 for(int j=0;j<m_nFeatures;j++)
01383                     m_probe[n][cnt*m_nFeatures+j] = m_trainOrig[i*m_nFeatures + j];
01384                 cnt++;
01385             }
01386         }
// Safety check: the out-of-bag count must match the precomputed probe size.
01387         if(cnt != m_probeSize[n])
01388         {
01389             cout<<"cnt:"<<cnt<<" probeSize"<<m_probeSize[n]<<endl;
01390             assert(false);
01391         }
01392         delete[] bagSamples;
01393     }
01394     else
01395     {
// Cross validation: rows m_mixList[begin..end) (the n-th slot) form the
// probe set; all remaining rows form the train set.
01396         // slot of probeset
01397         int begin = m_slotBoundaries[n];
01398         int end = m_slotBoundaries[n+1];
01399     
01400         int probeCnt = 0, trainCnt = 0;
01401     
01402         // go through whole trainOrig set
01403         for ( int j=0;j<m_nTrain;j++ )
01404         {
01405             int index = m_mixList[j];
01406     
01407             // probe set
01408             if ( j>=begin && j <end )
01409             {
01410                 for ( int k=0;k<m_nFeatures;k++ )
01411                     m_probe[n][probeCnt*m_nFeatures + k] = m_trainOrig[index*m_nFeatures + k];
01412                 probeCnt++;
01413             }
01414             else  // train set
01415             {
01416                 for ( int k=0;k<m_nFeatures;k++ )
01417                     m_train[n][trainCnt*m_nFeatures + k] = m_trainOrig[index*m_nFeatures + k];
01418                 trainCnt++;
01419             }
01420         }
01421     
01422         if ( probeCnt != m_probeSize[n] || trainCnt != m_trainSize[n] ) // safety check
01423             assert ( false );
01424     }
01425 }

void Data::freeNCrossValidationSet ( int  n  ) 

Free memory of one split of the cross-fold validation set

Parameters:
n The n-th set (0..nCross-1)

Definition at line 1432 of file Data.cpp.

01433 {
01434     if ( m_train[n] )
01435         delete[] m_train[n];
01436     m_train[n] = 0;
01437     if ( m_probe[n] )
01438         delete[] m_probe[n];
01439     m_probe[n] = 0;
01440 }

/**
 * List the entries of a directory.
 *
 * @param path The path to the directory which should be listed
 * @return A list of directory entries (including "." and ".."), each
 *         prefixed with path; an empty list if the directory cannot be
 *         opened. A '/' separator is inserted between path and entry name
 *         when missing (the original code concatenated them directly, which
 *         produced broken paths unless the caller appended a trailing '/').
 */
vector< string > Data::getDirectoryFileList ( string path )
{
    vector<string> fileList;

    DIR* dp = opendir ( path.c_str() );
    if ( dp == NULL )
    {
        cout << "Error opening " << path << endl;
        return fileList;
    }

    // ensure exactly one separator between the directory and the entry name
    string prefix = path;
    if ( prefix.empty() == false && prefix[prefix.size()-1] != '/' )
        prefix += "/";

    struct dirent* entry;
    while ( ( entry = readdir ( dp ) ) != NULL )
        fileList.push_back ( prefix + string ( entry->d_name ) );
    closedir ( dp );

    return fileList;
}

void Data::loadNormalization ( int  nCascade = 0  ) 

Load the normalization.dat in the temp folder

Definition at line 853 of file Data.cpp.

00854 {
00855     // load normalization
00856     char buf[1024];
00857     sprintf ( buf,"%s/%s/normalization.dat.add%d",m_datasetPath.c_str(), m_tempPath.c_str(), nCascade );
00858     cout<<"Load mean and std: "<<buf<<endl;
00859     fstream f ( buf, ios::in );
00860     if ( f.is_open() == false )
00861         assert ( false );
00862     int n;
00863     f.read ( ( char* ) &n, sizeof ( int ) );
00864     if ( m_mean == 0 )
00865         m_mean = new REAL[n];
00866     if ( m_std == 0 )
00867         m_std = new REAL[n];
00868     f.read ( ( char* ) m_mean, sizeof ( REAL ) *n );
00869     f.read ( ( char* ) m_std, sizeof ( REAL ) *n );
00870     REAL min = 1e10, max = -1e10;
00871     for ( int i=0;i<n;i++ )
00872     {
00873         if ( min > m_mean[i] )
00874             min = m_mean[i];
00875         if ( max < m_mean[i] )
00876             max = m_mean[i];
00877     }
00878     cout<<"Mean:  min|max:"<<min<<"|"<<max<<endl;
00879     min = 1e10;
00880     max = -1e10;
00881     for ( int i=0;i<n;i++ )
00882     {
00883         if ( min > m_std[i] )
00884             min = m_std[i];
00885         if ( max < m_std[i] )
00886             max = m_std[i];
00887     }
00888     cout<<"Std:  min|max:"<<min<<"|"<<max<<endl;
00889     f.close();
00890 }

void Data::makeBinaryDataset (  ) 

Writes the dataset in binary form

  • binary.train
  • binary.test

Definition at line 707 of file Data.cpp.

00708 {
00709     cout<<endl;
00710     cout<<"Make binary dataset from selected features"<<endl;
00711     cout<<"Open features:"<<FEATURE_TXT_FILE<<endl;
00712 
00713     // read features from txt file
00714     fstream f;
00715     vector<int> features;
00716     f.open ( FEATURE_TXT_FILE,ios::in );
00717     if ( f.is_open() ==false )
00718         assert ( false );
00719     int value, nValidFeatures = 0;
00720     while ( f>>value )
00721         features.push_back ( value );
00722     f.close();
00723 
00724     // check featureIDs
00725     for ( int j=0;j<features.size();j++ )
00726         if ( features[j] >= m_nFeatures || features[j] == -1 )
00727             assert ( false );
00728         else
00729             nValidFeatures++;
00730 
00731     cout<<"nValidFeatures:"<<nValidFeatures<<endl;
00732     REAL* feat;
00733     int* label, N;
00734 
00735     if ( Framework::getFrameworkMode() == 1 )
00736     {
00737         cout<<"Write: binary.test"<<endl;
00738         f.open ( "binary.test", ios::out );
00739         feat = m_testOrig;
00740         label = m_testLabelOrig;
00741         N = m_nTest;
00742     }
00743     else
00744     {
00745         cout<<"Write: binary.train"<<endl;
00746         f.open ( "binary.train", ios::out );
00747         feat = m_trainOrig;
00748         label = m_trainLabelOrig;
00749         N = m_nTrain;
00750     }
00751 
00752     cout<<"#lines:"<<N<<endl;
00753 
00754     // dataset bounds
00755     f.write ( ( char* ) &N, sizeof ( int ) );
00756     f.write ( ( char* ) &m_nClass, sizeof ( int ) );
00757     f.write ( ( char* ) &m_nDomain, sizeof ( int ) );
00758     f.write ( ( char* ) &nValidFeatures, sizeof ( int ) );
00759 
00760     // write features
00761     for ( int i=0;i<N;i++ )
00762         for ( int j=0;j<features.size();j++ )
00763             f.write ( ( char* ) & ( feat[i*m_nFeatures + features[j]] ), sizeof ( REAL ) );
00764 
00765     // write labels
00766     f.write ( ( char* ) label, sizeof ( int ) *N*m_nDomain );
00767     f.close();
00768 
00769 }

void Data::mergeTrainAndTest (  ) 

Merge the train and test set into the train set

This is used in the dimensionality reduction, where the training is unsupervised, which means to train only on features and without targets

Definition at line 2155 of file Data.cpp.

02156 {
02157     cout<<"trainSet = {trainSet(#"<<m_nTrain<<") + testSet(#"<<m_nTest<<")}"<<endl;
02158     if ( m_nTest == 0 )
02159         return;
02160 
02161     REAL* train = new REAL[ ( m_nTrain + m_nTest ) *m_nFeatures];
02162     REAL* trainTarget = new REAL[ ( m_nTrain + m_nTest ) *m_nClass*m_nDomain];
02163     int* trainLabel = new int[ ( m_nTrain + m_nTest ) *m_nDomain];
02164 
02165     memcpy ( train, m_trainOrig, sizeof ( REAL ) *m_nTrain*m_nFeatures );
02166     memcpy ( train + m_nTrain*m_nFeatures, m_testOrig, sizeof ( REAL ) *m_nTest*m_nFeatures );
02167 
02168     memcpy ( trainTarget, m_trainTargetOrig, sizeof ( REAL ) *m_nTrain*m_nClass*m_nDomain );
02169     memcpy ( trainTarget + m_nTrain*m_nClass*m_nDomain, m_testTargetOrig, sizeof ( REAL ) *m_nTest*m_nClass*m_nDomain );
02170 
02171     memcpy ( trainLabel, m_trainLabelOrig, sizeof ( REAL ) *m_nTrain*m_nDomain );
02172     memcpy ( trainLabel + m_nTrain*m_nDomain, m_testLabelOrig, sizeof ( REAL ) *m_nTest*m_nDomain );
02173 
02174     delete[] m_trainOrig;
02175     delete[] m_trainTargetOrig;
02176     delete[] m_trainLabelOrig;
02177 
02178     m_trainOrig = train;
02179     m_trainTargetOrig = trainTarget;
02180     m_trainLabelOrig = trainLabel;
02181 
02182     m_nTrain = m_nTrain + m_nTest;
02183 }

void Data::mixDataset (  ) 

Mix the dataset Do m_nTrain*m_nMixDataset random sample swaps

Definition at line 775 of file Data.cpp.

00776 {
00777     if ( m_nTrain )
00778     {
00779         m_mixDatasetIndices = new int[m_nTrain];
00780         for ( int i=0;i<m_nTrain;i++ )
00781             m_mixDatasetIndices[i] = i;
00782     }
00783     else
00784     {
00785         cout<<"Do no mix the dataset."<<endl;
00786         m_mixDatasetIndices = 0;
00787         return;
00788     }
00789     cout<<"Randomize the dataset: "<<m_nMixDataset*m_nTrain<<" line swaps [";
00790 
00791     int progress = m_nTrain*m_nMixDataset/10 + 1;
00792     REAL* tmp0 = new REAL[m_nFeatures];
00793     REAL* tmp1 = new REAL[m_nClass*m_nDomain];
00794     for ( int i=0;i<m_nTrain*m_nMixDataset;i++ )
00795     {
00796         if ( i%progress==0 )
00797             cout<<"."<<flush;
00798 
00799         // random index swaps
00800         int ind0 = rand() %m_nTrain;
00801         int ind1 = rand() %m_nTrain;
00802 
00803         // train features (REAL*)
00804         REAL* ptr0 = m_trainOrig + ind0 * m_nFeatures;
00805         REAL* ptr1 = m_trainOrig + ind1 * m_nFeatures;
00806         for ( int j=0;j<m_nFeatures;j++ )
00807         {
00808             tmp0[j] = ptr0[j];
00809             ptr0[j] = ptr1[j];
00810             ptr1[j] = tmp0[j];
00811         }
00812 
00813         // train targets (REAL*)
00814         ptr0 = m_trainTargetOrig + ind0 * m_nClass * m_nDomain;
00815         ptr1 = m_trainTargetOrig + ind1 * m_nClass * m_nDomain;
00816         for ( int j=0;j<m_nClass*m_nDomain;j++ )
00817         {
00818             tmp1[j] = ptr0[j];
00819             ptr0[j] = ptr1[j];
00820             ptr1[j] = tmp1[j];
00821         }
00822 
00823         // train label
00824         if ( Framework::getDatasetType() ==true )
00825         {
00826             for ( int d=0;d<m_nDomain;d++ )
00827             {
00828                 int tmp = m_trainLabelOrig[d+ind0*m_nDomain];
00829                 m_trainLabelOrig[d+ind0*m_nDomain] = m_trainLabelOrig[d+ind1*m_nDomain];
00830                 m_trainLabelOrig[d+ind1*m_nDomain] = tmp;
00831             }
00832         }
00833 
00834         // index
00835         int tmp = m_mixDatasetIndices[ind0];
00836         m_mixDatasetIndices[ind0] = m_mixDatasetIndices[ind1];
00837         m_mixDatasetIndices[ind1] = tmp;
00838     }
00839     if ( tmp0 )
00840         delete[] tmp0;
00841     tmp0 = 0;
00842     if ( tmp1 )
00843         delete[] tmp1;
00844     tmp1 = 0;
00845 
00846     cout<<"] "<<"mixInd[0]:"<<m_mixDatasetIndices[0]<<"  mixInd["<<m_nTrain-1<<"]:"<<m_mixDatasetIndices[m_nTrain-1]<<endl;
00847 }

void Data::normalizeZeroOne (  ) 

Normalize train between 0 and 1

Definition at line 2188 of file Data.cpp.

02189 {
02190     cout<<"Autoencoder: Normalize train between 0 and 1"<<endl;
02191     // (m_trainOrig[i*m_nFeatures + j] - m_mean[j]) / m_std[j]
02192     REAL* mean = new REAL[m_nFeatures];
02193     REAL* std = new REAL[m_nFeatures];
02194 
02195     for ( int i=0;i<m_nFeatures;i++ )
02196     {
02197         double mu = 0.0, min = 1e10, max = -1e10;
02198         for ( int j=0;j<m_nTrain;j++ )
02199         {
02200             REAL v = m_trainOrig[i+j*m_nFeatures];
02201             mu += v;
02202             if ( min > v )
02203                 min = v;
02204             if ( max < v )
02205                 max = v;
02206         }
02207         mean[i] = min;
02208         std[i] = max - min;
02209         if ( std[i] <= 1e-2 )
02210             std[i] = 1.0;
02211         m_mean[i] = 0.0;
02212         m_std[i] = 1.0;
02213 
02214         if ( m_enableStaticNormalization ) // something special, allow to modify the auto normalizations
02215         {
02216             mean[i] += m_staticMeanNormalization;
02217             std[i] *= m_staticStdNormalization;
02218         }
02219     }
02220     for ( int i=0;i<m_nTrain;i++ )
02221         for ( int j=0;j<m_nFeatures;j++ )
02222         {
02223             m_trainOrig[j+i*m_nFeatures] = ( m_trainOrig[j+i*m_nFeatures] - mean[j] ) / std[j];
02224             REAL v = m_trainOrig[j+i*m_nFeatures];
02225             if ( v > 1.0 || v < 0.0 )
02226             {
02227                 cout<<"v:"<<v<<endl;
02228                 assert ( false );
02229             }
02230         }
02231 
02232     // print mean/std
02233     for ( int j=0;j<m_nFeatures;j++ )
02234         cout<<mean[j]<<"|"<<std[j]<<" ";
02235     cout<<endl;
02236 
02237     // save the normalizations
02238     cout<<"save the 0..1 normalizations"<<endl;
02239     string meanName = m_datasetPath + "/" + m_tempPath + "/AutoencoderDataMean.dat";
02240     string stdName = m_datasetPath + "/" + m_tempPath + "/AutoencoderDataStd.dat";
02241     cout<<"meanName:"<<meanName<<endl<<"stdName:"<<stdName<<endl;
02242     fstream fMean ( meanName.c_str(),ios::out );
02243     fstream fStd ( stdName.c_str(),ios::out );
02244     fMean.write ( ( char* ) mean, sizeof ( REAL ) *m_nFeatures );
02245     fStd.write ( ( char* ) std, sizeof ( REAL ) *m_nFeatures );
02246     fMean.close();
02247     fStd.close();
02248 
02249     delete[] mean;
02250     delete[] std;
02251 }

void Data::partitionDatasetToCrossValidationSets (  ) 

Split the data in n-cross validation sets And store it in member vars

Definition at line 1458 of file Data.cpp.

01459 {
01460     cout<<"Partition dataset to cross validation sets"<<endl;
01461 
01462     // read the effect file
01463     readEffectFile();
01464 
// Debug aid: dump up to the first 1000 rows of each raw matrix to text files.
01465     // write the first lines to a file
01466     if(m_trainOrig)
01467     { fstream f("Atrain.txt",ios::out); for ( int i=0;i<m_nTrain && i < 1000;i++ ){for ( int j=0;j<m_nFeatures;j++ ) f<<m_trainOrig[i*m_nFeatures + j]<<" ";f<<endl;}f.close();}
01468     if(m_testOrig)
01469     { fstream f("Atest.txt",ios::out); for ( int i=0;i<m_nTest && i < 1000;i++ ){for ( int j=0;j<m_nFeatures;j++ ) f<<m_testOrig[i*m_nFeatures + j]<<" ";f<<endl;}f.close();}
01470     if(m_valid)
01471     { fstream f("Avalid.txt",ios::out); for ( int i=0;i<m_validSize && i < 1000;i++ ){for ( int j=0;j<m_nFeatures;j++ ) f<<m_valid[i*m_nFeatures + j]<<" ";f<<endl;}f.close();}
01472     
// Normalize the train features in place with the precomputed per-feature
// mean/std (m_mean/m_std must be loaded/computed before this call).
01473     // apply mean and std to input features
01474     cout<<"Apply mean and std correction to train input features"<<endl;
01475     for ( int i=0;i<m_nTrain;i++ )
01476         for ( int j=0;j<m_nFeatures;j++ )
01477             m_trainOrig[i*m_nFeatures + j] = ( m_trainOrig[i*m_nFeatures + j] - m_mean[j] ) / m_std[j];
01478 
01479     // print min and max values in features
01480     REAL min = 1e10, max = -1e10;
01481     for ( int i=0;i<m_nTrain;i++ )
01482         for ( int j=0;j<m_nFeatures;j++ )
01483         {
01484             if ( min > m_trainOrig[i*m_nFeatures + j] )
01485                 min = m_trainOrig[i*m_nFeatures + j];
01486             if ( max < m_trainOrig[i*m_nFeatures + j] )
01487                 max = m_trainOrig[i*m_nFeatures + j];
01488         }
01489     cout<<"Min/Max after apply mean/std: "<<min<<"/"<<max<<endl;
01490 
// Compute the per-output target means (stored in m_targetMean) and the
// global target min/max; accumulation is done in double for precision.
01491     // print min and max values in targets
01492     min = 1e10;
01493     max = -1e10;
01494     m_targetMean = new REAL[m_nClass*m_nDomain];
01495     double* targetMean = new double[m_nClass*m_nDomain];
01496     for(int i=0;i<m_nClass*m_nDomain;i++)
01497         targetMean[i] = 0.0;
01498     for ( int i=0;i<m_nTrain;i++ )
01499         for ( int j=0;j<m_nClass*m_nDomain;j++ )
01500         {
01501             targetMean[j] += m_trainTargetOrig[i*m_nClass*m_nDomain + j];
01502             if ( min > m_trainTargetOrig[i*m_nClass*m_nDomain + j] )
01503                 min = m_trainTargetOrig[i*m_nClass*m_nDomain + j];
01504             if ( max < m_trainTargetOrig[i*m_nClass*m_nDomain + j] )
01505                 max = m_trainTargetOrig[i*m_nClass*m_nDomain + j];
01506         }
01507     for(int i=0;i<m_nClass*m_nDomain;i++)
01508         m_targetMean[i] = targetMean[i]/(double)m_nTrain;
01509     delete[] targetMean;
01510     
01511     cout<<"Min/Max target: "<<min<<"/"<<max<<endl<<"Mean target: ";
01512     for(int i=0;i<m_nClass*m_nDomain;i++)
01513         cout<<m_targetMean[i]<<" ";
01514     cout<<endl<<endl;
01515 
// Retraining/CrossFoldMean: classic slot-based partitioning via m_mixList and
// m_slotBoundaries. Note the loop runs m_nCross+1 times — the extra slot
// appears to be the full retraining set (TODO confirm against the allocator).
01516     if(m_validationType == "Retraining" || m_validationType == "CrossFoldMean")
01517     {
01518         int* labels = new int[m_nDomain];
01519     
01520         // copy data to cross-validation slots
01521         for ( int i=0;i<m_nCross+1;i++ )
01522         {
01523             // slot of probeset
01524             int begin = m_slotBoundaries[i];
01525             int end = m_slotBoundaries[i+1];
01526     
01527             int probeCnt = 0, trainCnt = 0;
01528     
01529             // go through whole trainOrig set
01530             for ( int j=0;j<m_nTrain;j++ )
01531             {
01532                 int index = m_mixList[j];
01533                 if ( Framework::getDatasetType() )
01534                 {
01535                     for ( int d=0;d<m_nDomain;d++ )
01536                         labels[d] = m_trainLabelOrig[d+index*m_nDomain];
01537                 }
01538     
// Rows inside the slot go to the probe (validation) split; feature rows are
// only materialized when save-memory mode is off (otherwise they are
// rebuilt on demand by fillNCrossValidationSet).
01539                 // probe set
01540                 if ( j>=begin && j <end )
01541                 {
01542                     m_probeIndex[i][probeCnt] = index;
01543                     for ( int d=0;d<m_nDomain;d++ )
01544                         m_probeLabel[i][d+probeCnt*m_nDomain] = labels[d];
01545                     for ( int k=0;k<m_nFeatures;k++ )
01546                         if ( m_enableSaveMemory == false )
01547                             m_probe[i][probeCnt*m_nFeatures + k] = m_trainOrig[index*m_nFeatures + k];
01548                     for ( int k=0;k<m_nClass*m_nDomain;k++ )
01549                     {
01550                         m_probeTarget[i][probeCnt*m_nClass*m_nDomain + k] = m_trainTargetOrig[index*m_nClass*m_nDomain + k];
01551                         m_probeTargetEffect[i][probeCnt*m_nClass*m_nDomain + k] = m_trainTargetOrigEffect[index*m_nClass*m_nDomain + k];
01552                         m_probeTargetResidual[i][probeCnt*m_nClass*m_nDomain + k] = m_trainTargetOrigResidual[index*m_nClass*m_nDomain + k];
01553                     }
01554                     probeCnt++;
01555                     m_crossIndex[j] = i;
01556                 }
01557                 else  // train set
01558                 {
01559                     for ( int d=0;d<m_nDomain;d++ )
01560                         m_trainLabel[i][d+trainCnt*m_nDomain] = labels[d];
01561                     for ( int k=0;k<m_nFeatures;k++ )
01562                         if ( m_enableSaveMemory == false )
01563                             m_train[i][trainCnt*m_nFeatures + k] = m_trainOrig[index*m_nFeatures + k];
01564                     for ( int k=0;k<m_nClass*m_nDomain;k++ )
01565                     {
01566                         m_trainTarget[i][trainCnt*m_nClass*m_nDomain + k] = m_trainTargetOrig[index*m_nClass*m_nDomain + k];
01567                         m_trainTargetEffect[i][trainCnt*m_nClass*m_nDomain + k] = m_trainTargetOrigEffect[index*m_nClass*m_nDomain + k];
01568                         m_trainTargetResidual[i][trainCnt*m_nClass*m_nDomain + k] = m_trainTargetOrigResidual[index*m_nClass*m_nDomain + k];
01569                     }
01570                     trainCnt++;
01571                 }
01572             }
// Safety check: counts must match the sizes precomputed by the allocator.
01573             if ( probeCnt != m_probeSize[i] || trainCnt != m_trainSize[i] ) // safety check
01574                 assert ( false );
01575         }
01576     
01577         if ( labels )
01578             delete[] labels;
01579     
// Every train row must have been assigned to exactly one probe slot.
01580         for ( int i=0;i<m_nTrain;i++ )
01581             if ( m_crossIndex[i] == -1 )
01582                 assert ( false );
01583     }
// Bagging: fold i trains on the bootstrap sample m_trainBaggingIndex[i];
// the rows never drawn (out-of-bag) become the probe set of that fold.
01584     else if(m_validationType == "Bagging")
01585     {
01586         bool* bagSamples = new bool[m_nTrain];
01587         for ( int i=0;i<m_nCross;i++ )
01588         {
01589             // train sets
01590             for(int j=0;j<m_nTrain;j++)
01591                 bagSamples[j] = 0;
01592             for(int j=0;j<m_nTrain;j++)
01593             {
01594                 uint ind = m_trainBaggingIndex[i][j];
01595                 bagSamples[ind] = 1;  // mark
01596                 
01597                 if ( Framework::getDatasetType() )
01598                     for ( int d=0;d<m_nDomain;d++ )
01599                         m_trainLabel[i][d+j*m_nDomain] = m_trainLabelOrig[d+ind*m_nDomain];
01600                 for ( int k=0;k<m_nFeatures;k++ )
01601                     if ( m_enableSaveMemory == false )
01602                         m_train[i][j*m_nFeatures + k] = m_trainOrig[ind*m_nFeatures + k];
01603                 for ( int k=0;k<m_nClass*m_nDomain;k++ )
01604                 {
01605                     m_trainTarget[i][j*m_nClass*m_nDomain + k] = m_trainTargetOrig[ind*m_nClass*m_nDomain + k];
01606                     m_trainTargetEffect[i][j*m_nClass*m_nDomain + k] = m_trainTargetOrigEffect[ind*m_nClass*m_nDomain + k];
01607                     m_trainTargetResidual[i][j*m_nClass*m_nDomain + k] = m_trainTargetOrigResidual[ind*m_nClass*m_nDomain + k];
01608                 }
01609             }
01610             
// Out-of-bag rows form the probe set; its size must match the precomputed
// m_probeSize[i], checked twice (before and after the copy).
01611             // probe sets
01612             int cnt = 0;
01613             for(int j=0;j<m_nTrain;j++)
01614                 cnt += bagSamples[j];
01615             if(m_nTrain - cnt != m_probeSize[i])
01616                 assert(false);
01617             cnt = 0;
01618             for(int j=0;j<m_nTrain;j++)
01619             {
01620                 if(bagSamples[j] == false)
01621                 {
01622                     if ( Framework::getDatasetType() )
01623                         for ( int d=0;d<m_nDomain;d++ )
01624                             m_probeLabel[i][d+cnt*m_nDomain] = m_trainLabelOrig[d+j*m_nDomain];
01625                     for ( int k=0;k<m_nFeatures;k++ )
01626                         if ( m_enableSaveMemory == false )
01627                             m_probe[i][cnt*m_nFeatures + k] = m_trainOrig[j*m_nFeatures + k];
01628                     for ( int k=0;k<m_nClass*m_nDomain;k++ )
01629                     {
01630                         m_probeTarget[i][cnt*m_nClass*m_nDomain + k] = m_trainTargetOrig[j*m_nClass*m_nDomain + k];
01631                         m_probeTargetEffect[i][cnt*m_nClass*m_nDomain + k] = m_trainTargetOrigEffect[j*m_nClass*m_nDomain + k];
01632                         m_probeTargetResidual[i][cnt*m_nClass*m_nDomain + k] = m_trainTargetOrigResidual[j*m_nClass*m_nDomain + k];
01633                     }
01634                     cnt++;
01635                 }
01636             }
01637             if(cnt != m_probeSize[i])
01638                 assert(false);
01639         }
01640         delete[] bagSamples;
01641     }
// ValidationSet: an external validation set is used, nothing to partition.
01642     else if(m_validationType == "ValidationSet")
01643     {
01644         ;
01645     }
01646     else
01647         assert(false);
01648 }

void Data::readDataset ( string  name  ) 

Read a dataset. The name maps directly to a read-in method of the DatasetReader.

Parameters:
name The name of the dataset

Definition at line 296 of file Data.cpp.

00297 {
00298     // read MNIST
00299     if ( name == "MNIST" )
00300     {
00301         DatasetReader r;
00302         // call by reference, memory is allcated in the DatasetReader
00303         r.readMNIST ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget );
00304     }
00305     else if ( name == "NETFLIX" ) // read Netflix
00306     {
00307         DatasetReader r;
00308         // call by reference, memory is allcated in the DatasetReader
00309         r.readNETFLIX ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget );
00310     }
00311     else if ( name == "AusDM2009" ) // read AusDM2009
00312     {
00313         DatasetReader r;
00314         // call by reference, memory is allcated in the DatasetReader
00315         r.readAusDM2009 ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget );
00316     }
00317     else if ( name == "KDDCup09Large" ) // read large KDDCup09large dataset
00318     {
00319         DatasetReader r;
00320         // call by reference, memory is allcated in the DatasetReader
00321         r.readKDDCup09Large ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget );
00322     }
00323     else if ( name == "KDDCup09Small" ) // read large KDDCup09small dataset
00324     {
00325         DatasetReader r;
00326         // call by reference, memory is allcated in the DatasetReader
00327         r.readKDDCup09Small ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget );
00328     }
00329     else if ( name == "BINARY" ) // read binary format dataset
00330     {
00331         DatasetReader r;
00332         // call by reference, memory is allcated in the DatasetReader
00333         r.readBINARY ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget );
00334     }
00335     else if ( name == "CSV" ) // read csv format dataset
00336     {
00337         DatasetReader r;
00338         // call by reference, memory is allcated in the DatasetReader
00339         r.readCSV ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget );
00340     }
00341     else if ( name == "ARFF" ) // read arff format dataset
00342     {
00343         DatasetReader r;
00344         // call by reference, memory is allcated in the DatasetReader
00345         r.readARFF ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget );
00346     }
00347     else if ( name == "PRUDSYS_DMC2009" ) // read PRUDSYS_DMC2009 dataset
00348     {
00349         DatasetReader r;
00350         // call by reference, memory is allcated in the DatasetReader
00351         r.readPRUDSYS ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget );
00352     }
00353     else if ( name == "ADULT" ) // read adult dataset
00354     {
00355         DatasetReader r;
00356         // call by reference, memory is allcated in the DatasetReader
00357         r.readADULT ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget );
00358     }
00359     else if ( name == "AUSTRALIAN" ) // read australian dataset
00360     {
00361         DatasetReader r;
00362         // call by reference, memory is allcated in the DatasetReader
00363         r.readAUSTRALIAN ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget );
00364     }
00365     else if ( name == "BALANCE" ) // read balance dataset
00366     {
00367         DatasetReader r;
00368         // call by reference, memory is allcated in the DatasetReader
00369         r.readBALANCE ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget );
00370     }
00371     else if ( name == "CYLINDER-BANDS" ) // read cylinder-bands dataset
00372     {
00373         DatasetReader r;
00374         // call by reference, memory is allcated in the DatasetReader
00375         r.readCYLINDERBANDS ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget );
00376     }
00377     else if ( name == "BREAST" ) // read breast-cancer dataset
00378     {
00379         DatasetReader r;
00380         // call by reference, memory is allcated in the DatasetReader
00381         r.readBREASTCANCERWISCONSIN ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget );
00382     }
00383     else if ( name == "CREDIT" ) // read australian-credit dataset
00384     {
00385         DatasetReader r;
00386         // call by reference, memory is allcated in the DatasetReader
00387         r.readAUSTRALIANCREDIT ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget );
00388     }
00389     else if ( name == "DIABETES" ) // read diabetes dataset
00390     {
00391         DatasetReader r;
00392         // call by reference, memory is allcated in the DatasetReader
00393         r.readDIABETES ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget );
00394     }
00395     else if ( name == "GERMAN" ) // read german dataset
00396     {
00397         DatasetReader r;
00398         // call by reference, memory is allcated in the DatasetReader
00399         r.readGERMAN ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget );
00400     }
00401     else if ( name == "GLASS" ) // read glass dataset
00402     {
00403         DatasetReader r;
00404         // call by reference, memory is allcated in the DatasetReader
00405         r.readGLASS ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget );
00406     }
00407     else if ( name == "HEART-SPECTF" ) // read heart dataset
00408     {
00409         DatasetReader r;
00410         // call by reference, memory is allcated in the DatasetReader
00411         r.readHEART ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget );
00412     }
00413     else if ( name == "HEPATITIS" ) // read hepatitis dataset
00414     {
00415         DatasetReader r;
00416         // call by reference, memory is allcated in the DatasetReader
00417         r.readHEPATITIS ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget );
00418     }
00419     else if ( name == "IONOSPHERE" ) // read ionophsere dataset
00420     {
00421         DatasetReader r;
00422         // call by reference, memory is allcated in the DatasetReader
00423         r.readIONOSPHERE ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget );
00424     }
00425     else if ( name == "IRIS" ) // read iris dataset
00426     {
00427         DatasetReader r;
00428         // call by reference, memory is allcated in the DatasetReader
00429         r.readIRIS ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget );
00430     }
00431     else if ( name == "LETTER" ) // read letter dataset
00432     {
00433         DatasetReader r;
00434         // call by reference, memory is allcated in the DatasetReader
00435         r.readLETTER ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget );
00436     }
00437     else if ( name == "MONKS-1" ) // read monks1 dataset
00438     {
00439         DatasetReader r;
00440         // call by reference, memory is allcated in the DatasetReader
00441         r.readMONKS1 ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget );
00442     }
00443     else if ( name == "MONKS-2" ) // read monks2 dataset
00444     {
00445         DatasetReader r;
00446         // call by reference, memory is allcated in the DatasetReader
00447         r.readMONKS2 ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget );
00448     }
00449     else if ( name == "MONKS-3" ) // read monks3 dataset
00450     {
00451         DatasetReader r;
00452         // call by reference, memory is allcated in the DatasetReader
00453         r.readMONKS3 ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget );
00454     }
00455     else if ( name == "MUSHROOM" ) // read mushroom dataset
00456     {
00457         DatasetReader r;
00458         // call by reference, memory is allcated in the DatasetReader
00459         r.readMUSHROOM ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget );
00460     }
00461     else if ( name == "SATIMAGE" ) // read satimage dataset
00462     {
00463         DatasetReader r;
00464         // call by reference, memory is allcated in the DatasetReader
00465         r.readSATIMAGE ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget );
00466     }
00467     else if ( name == "SEGMENTATION" ) // read segmentation dataset
00468     {
00469         DatasetReader r;
00470         // call by reference, memory is allcated in the DatasetReader
00471         r.readSEGMENTATION ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget );
00472     }
00473     else if ( name == "SONAR" ) // read sonar dataset
00474     {
00475         DatasetReader r;
00476         // call by reference, memory is allcated in the DatasetReader
00477         r.readSONAR ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget );
00478     }
00479     else if ( name == "VEHICLE" ) // read vehicle dataset
00480     {
00481         DatasetReader r;
00482         // call by reference, memory is allcated in the DatasetReader
00483         r.readVEHICLE ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget );
00484     }
00485     else if ( name == "VOTES" ) // read votes dataset
00486     {
00487         DatasetReader r;
00488         // call by reference, memory is allcated in the DatasetReader
00489         r.readVOTES ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget );
00490     }
00491     else if ( name == "WINE" ) // read wine dataset
00492     {
00493         DatasetReader r;
00494         // call by reference, memory is allcated in the DatasetReader
00495         r.readWINE ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget );
00496     }
00497     else if ( name == "POKER" ) // read poker dataset
00498     {
00499         DatasetReader r;
00500         // call by reference, memory is allcated in the DatasetReader
00501         r.readPOKER ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget );
00502     }
00503     else if ( name == "YEAST" ) // read yeast dataset
00504     {
00505         DatasetReader r;
00506         // call by reference, memory is allcated in the DatasetReader
00507         r.readYEAST ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget );
00508     }
00509     else if ( name == "SURVIVAL" ) // read survival dataset
00510     {
00511         DatasetReader r;
00512         // call by reference, memory is allcated in the DatasetReader
00513         r.readSURVIVAL ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget );
00514     }
00515     else if ( name == "SPIDER" ) // read (generated by)spider dataset
00516     {
00517         DatasetReader r;
00518         // call by reference, memory is allcated in the DatasetReader
00519         r.readSPIDER ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget );
00520     }
00521     else
00522     {
00523         cout<<"Dataset not found:"<<name<<endl;
00524         exit ( 0 );
00525     }
00526 
00527     if(m_addConstantInput)
00528         addConstantInput();
00529     
00530     // reduce the size of the training set
00531     reduceTrainingSetSize ( m_subsampleTrainSet );
00532 
00533     // reduce the size of the features in the training set
00534     int nFeatOrig = m_nFeatures;
00535     reduceFeatureSize ( m_trainOrig, m_nTrain, m_nFeatures, m_subsampleFeatures, Framework::getFrameworkMode() );
00536     reduceFeatureSize ( m_testOrig, m_nTest, nFeatOrig, m_subsampleFeatures, true );
00537 
00538     // feature selection, based on a linear model
00539     if ( m_featureSelectionWriteBinaryDataset )
00540     {
00541         makeBinaryDataset();
00542         exit ( 0 );
00543     }
00544 
00545     // mix train features and labels
00546     mixDataset();
00547 }

void Data::readDscFile ( string  name  ) 

Read the description file

Parameters:
name The description file name (string)

Definition at line 1833 of file Data.cpp.

01834 {
01835     cout<<"Load descriptor file: "<<name<<endl;
01836     fstream f ( name.c_str(), ios::in );
01837 
01838     if ( f.is_open() ==false )
01839     {
01840         cout<<"Can not open file:"<<name<<endl;
01841         assert ( false );
01842     }
01843 
01844     int mode = -1;  // -1:meta info  0:int  1:double  2:string  3:bool
01845 
01846     char buf[256];
01847     while ( f.getline ( buf, 256 ) ) // read all lines
01848     {
01849         string line ( buf );
01850         if ( line[0]=='#' ) // a comment
01851             continue;
01852         if ( line.find ( "[int]" ) != string::npos )
01853             mode = 0;
01854         if ( line.find ( "[double]" ) != string::npos )
01855             mode = 1;
01856         if ( line.find ( "[string]" ) != string::npos )
01857             mode = 2;
01858         if ( line.find ( "[bool]" ) != string::npos )
01859             mode = 3;
01860 
01861         // only lines which consists of a '='
01862         if ( line.find ( "=" ) != string::npos )
01863             readParameter ( line, mode );
01864     }
01865 
01866     f.close();
01867 }

void Data::readEffectFile (  ) 

Read the effect file. This is the prediction of the whole training set from another Algorithm, and can be used as preprocessing for another Algorithm. The effect file name is: m_trainOnFullPredictorFile

Definition at line 1306 of file Data.cpp.

01307 {
01308     if(m_validationType == "ValidationSet")
01309         return;
01310     
01311     for ( int i=0;i<m_nClass*m_nDomain*m_nTrain;i++ )
01312         m_trainTargetOrigEffect[i] = 0.0;
01313 
01314     string name = m_datasetPath + "/" + m_fullPredPath + "/" + m_trainOnFullPredictorFile;
01315     fstream f ( name.c_str(), ios::in );
01316     if ( f.is_open() && m_trainOnFullPredictorFile!="" )
01317     {
01318         cout<<"Read fullPredictor:"<<name<<"  ";
01319         f.read ( ( char* ) m_trainTargetOrigEffect, sizeof ( REAL ) *m_nClass*m_nDomain*m_nTrain );
01320 
01321         double rmse0 = 0.0, rmse1 = 0.0, err;
01322         for ( int i=0;i<m_nClass*m_nDomain;i++ )
01323         {
01324             for ( int j=0;j<m_nTrain;j++ )
01325             {
01326                 err = m_trainTargetOrigEffect[j*m_nClass*m_nDomain + i] - m_trainTargetOrig[j*m_nClass*m_nDomain + i];
01327                 rmse0 += err * err;
01328             }
01329         }
01330         cout<<"RMSE:"<<sqrt ( rmse0/ ( double ) ( m_nClass*m_nDomain*m_nTrain ) ) <<"(retrain:"<<sqrt ( rmse1/ ( double ) ( m_nClass*m_nDomain*m_nTrain ) ) <<")"<<endl;
01331 
01332         f.close();
01333     }
01334     else
01335         cout<<"Can not open effect file:"<<name<<endl;
01336 
01337     // residual training: res = target - effect
01338     cout<<"Init residuals"<<endl;
01339     for ( int i=0;i<m_nClass*m_nDomain*m_nTrain;i++ )
01340         m_trainTargetOrigResidual[i] = m_trainTargetOrig[i] - m_trainTargetOrigEffect[i];
01341 }

void Data::readParameter ( string  line,
int  mode 
)

Read a parameter in the description file

Parameters:
line One line in the file (string)
mode -1: metaparameters, 0: integer, 1: double, 2: string, 3: bool

Definition at line 1789 of file Data.cpp.

01790 {
01791     // split into 2 strings at the '=' char
01792     int pos = line.find ( "=" );
01793     string name = line.substr ( 0, pos );
01794     string value = line.substr ( pos+1 );
01795 
01796     if ( mode==-1 ) // meta info block (algorithm independent)
01797     {
01798         if ( name=="ALGORITHM" )
01799             m_algorithmName = value;
01800         if ( name=="ID" )
01801             m_algorithmID = atoi ( value.c_str() );
01802         if ( name=="TRAIN_ON_FULLPREDICTOR" )
01803         {
01804             if(m_validationType == "ValidationSet")
01805                 assert(false);
01806             m_trainOnFullPredictorFile = value;
01807         }
01808         if ( name=="DISABLE" )
01809             m_disableTraining = atoi ( value.c_str() );
01810         cout<<"[META] ";
01811     }
01812 
01813     if ( mode==0 ) // [int]
01814         m_intMap[name] = atoi ( value.c_str() );
01815 
01816     if ( mode==1 ) // [double]
01817         m_doubleMap[name] = atof ( value.c_str() );
01818 
01819     if ( mode==2 ) // [string]
01820         m_stringMap[name] = value;
01821 
01822     if ( mode==3 ) // [bool]
01823         m_boolMap[name] = atoi ( value.c_str() );
01824 
01825     cout<<name<<": "<<value<<endl;
01826 }

void Data::reduceFeatureSize ( REAL *&  table,
int  tableRows,
int &  tableCols,
REAL  percent,
bool  loadColumnSet 
)

This method reduces the feature size. The idea stems from Random Forests.

Parameters:
percent The normalized size of features (0...1)

Definition at line 2322 of file Data.cpp.

02323 {
02324     cout<<"subsample the columns (current:"<<tableCols<<") to "<<percent*100.0<<"% of columns (skip constant 1 features)"<<flush;
02325     if ( percent <= 0.0 || percent >= 1.0 )
02326     {
02327         cout<<"  [nothing to do]"<<endl;
02328         return;
02329     }
02330     cout<<endl;
02331     
02332     // determine constant 1 features
02333     bool* isConstantOne = new bool[tableCols];
02334     bool* selectedCols = new bool[tableCols];
02335     for ( int i=0;i<tableCols;i++ )
02336     {
02337         isConstantOne[i] = true;
02338         selectedCols[i] = false;
02339     }
02340     for ( int i=0;i<tableRows;i++ )
02341         for ( int j=0;j<tableCols;j++ )
02342             isConstantOne[j] &= table[j+i*tableCols]==1.0;
02343 
02344     srand ( Framework::getRandomSeed() );
02345     int cnt = 0;
02346     for ( int i=0;i<tableCols;i++ )
02347         if ( ( double ) rand() / ( double ) RAND_MAX < percent || isConstantOne[i] )
02348         {
02349             selectedCols[i] = true;
02350             cnt++;
02351         }
02352     delete[] isConstantOne;
02353 
02354     if ( loadColumnSet )
02355     {
02356         string fname = m_datasetPath + "/" + string ( DATA_PATH ) + "/subspace.txt";
02357         cout<<"load subspace file:"<<fname<<endl;
02358         fstream f ( fname.c_str(),ios::in );
02359         cnt = 0;
02360         for ( int i=0;i<tableCols;i++ )
02361         {
02362             f>>selectedCols[i];
02363             cnt += selectedCols[i];
02364         }
02365         f.close();
02366     }
02367     else
02368     {
02369         string fname = m_datasetPath + "/" + string ( DATA_PATH ) + "/subspace.txt";
02370         cout<<"write subspace file:"<<fname<<endl;
02371         fstream f ( fname.c_str(),ios::out );
02372         for ( int i=0;i<tableCols;i++ )
02373             f<<selectedCols[i]<<endl;
02374         f.close();
02375     }
02376 
02377     cout<<"allocate new table set, column size:"<<cnt<<endl;
02378     REAL* newTable = new REAL[cnt*tableRows];
02379 
02380     srand ( Framework::getRandomSeed() );
02381     for ( int i=0;i<tableRows;i++ )
02382     {
02383         int c = 0;
02384         for ( int j=0;j<tableCols;j++ )
02385         {
02386             if ( selectedCols[j] )
02387             {
02388                 newTable[c+i*cnt] = table[j+i*tableCols];
02389                 c++;
02390             }
02391         }
02392     }
02393 
02394     delete[] table;
02395     delete[] selectedCols;
02396     table = newTable;
02397     tableCols = cnt;
02398 }

void Data::reduceTrainingSetSize ( REAL  percent  ) 

This method reduces the training sample size. Useful for applying complex models to large datasets.

Parameters:
percent The size of the new training set (0...1)

Definition at line 2259 of file Data.cpp.

02260 {
02261     cout<<"reduce training set (current size:"<<m_nTrain<<") to "<<percent*100.0<<"% of its original size"<<flush;
02262     if ( percent <= 0.0 || percent >= 1.0 )
02263     {
02264         cout<<"  [nothing to do]"<<endl;
02265         return;
02266     }
02267     cout<<endl;
02268     
02269     srand ( Framework::getRandomSeed() );
02270     int cnt = 0;
02271     for ( int i=0;i<m_nTrain;i++ )
02272         if ( ( double ) rand() / ( double ) RAND_MAX < percent )
02273             cnt++;
02274 
02275     cout<<"allocate new training set, size:"<<cnt<<endl;
02276 
02277     REAL* train = new REAL[cnt*m_nFeatures];
02278     REAL* trainTarget = new REAL[cnt*m_nClass*m_nDomain];
02279 
02280     int* trainLabel = 0;
02281     if ( m_trainLabelOrig )
02282         trainLabel = new int[cnt*m_nDomain];
02283 
02284     srand ( Framework::getRandomSeed() );
02285     cnt = 0;
02286     for ( int i=0;i<m_nTrain;i++ )
02287     {
02288         if ( ( double ) rand() / ( double ) RAND_MAX < percent )
02289         {
02290             for ( int j=0;j<m_nFeatures;j++ )
02291                 train[j+cnt*m_nFeatures] = m_trainOrig[j+i*m_nFeatures];
02292             for ( int j=0;j<m_nClass*m_nDomain;j++ )
02293                 trainTarget[j+cnt*m_nClass*m_nDomain] = m_trainTargetOrig[j+i*m_nClass*m_nDomain];
02294             if ( m_trainLabelOrig )
02295             {
02296                 for ( int j=0;j<m_nDomain;j++ )
02297                     trainLabel[j+cnt*m_nDomain] = m_trainLabelOrig[j+i*m_nDomain];
02298             }
02299             cnt++;
02300         }
02301     }
02302 
02303     delete[] m_trainOrig;
02304     delete[] m_trainTargetOrig;
02305     if ( m_trainLabelOrig )
02306         delete[] m_trainLabelOrig;
02307 
02308     m_trainOrig = train;
02309     m_trainTargetOrig = trainTarget;
02310     if ( m_trainLabelOrig )
02311         m_trainLabelOrig = trainLabel;
02312 
02313     m_nTrain = cnt;
02314 }

void Data::setAlgorithmList ( vector< string >  algorithmNameList  ) 

Copy an external vector of *dsc files to the member list

Parameters:
algorithmNameList List of filenames (*dsc)
nAlgorithmsTrained How many of them have finished training

Definition at line 2113 of file Data.cpp.

02114 {
02115     cout<<"Set algorithm list (nTrained:"<< ( int ) algorithmNameList.size() <<")"<<endl;
02116     m_algorithmNameList = algorithmNameList;
02117     for ( int i=0;i<m_algorithmNameList.size();i++ )
02118     {
02119         int pos = m_algorithmNameList[i].find_first_of ( ".",0 );
02120         if ( pos == 0 )
02121             assert ( false );
02122         m_algorithmNameList[i] = m_datasetPath + "/" + m_fullPredPath + "/" + m_algorithmNameList[i].substr ( 0,pos ) + ".dat";
02123         cout<<"m_algorithmNameList["<<i<<"]:"<<m_algorithmNameList[i]<<endl;
02124     }
02125 }

void Data::setDataPointers ( Data data  ) 

Fills the pointers from the base class Data

Parameters:
data The pointer to the data object, where a valid dataset is loaded

Definition at line 1954 of file Data.cpp.

// Shallow-copies all configuration values and dataset pointers from another
// Data object, so that per-cross-validation worker objects share one loaded
// dataset instead of re-reading it.
// NOTE(review): the large train/test/probe buffers are shared by pointer
// (no deep copy), so ownership appears to stay with `data`; this object
// must not free them -- confirm against deleteMemory().
01955 {
01956     cout<<"Set data pointers"<<endl;
01957 
01958     // copy maps
01959     m_intMap = data->m_intMap;
01960     m_doubleMap = data->m_doubleMap;
01961     m_boolMap = data->m_boolMap;
01962     m_stringMap = data->m_stringMap;
01963 
01964     m_algorithmName = data->m_algorithmName;
01965     m_algorithmID = data->m_algorithmID;
01966     m_trainOnFullPredictorFile = data->m_trainOnFullPredictorFile;
01967     m_disableTraining = data->m_disableTraining;
01968 
01969     m_randSeed = data->m_randSeed;
01970     m_positiveTarget = data->m_positiveTarget;
01971     m_negativeTarget = data->m_negativeTarget;
01972 
01973     m_mixList = data->m_mixList;
01974 
01975     // dataset paths
01976     m_datasetPath = data->m_datasetPath;
01977     m_datasetName = data->m_datasetName;
01978     m_tempPath = data->m_tempPath;
01979     m_dscPath = data->m_dscPath;
01980     m_fullPredPath = data->m_fullPredPath;
01981     m_dataPath = data->m_dataPath;
01982 
01983     // dataset organization (input/output dimensionality)
01984     m_nFeatures = data->m_nFeatures;
01985     m_nClass = data->m_nClass;
01986     m_nDomain = data->m_nDomain;
01987     m_nMixTrainList = data->m_nMixTrainList;
01988 
01989     // cross-validation settings
01990     m_nCross = data->m_nCross;
01991     m_validationType = data->m_validationType;
01992 
01993     // global mean and standard deviation over whole dataset
01994     m_mean = data->m_mean;
01995     m_std = data->m_std;
01996     m_standardDeviationMin = data->m_standardDeviationMin;
01997     m_targetMean = data->m_targetMean;
01998 
01999     // full training set (pointers shared, not copied)
02000     m_nTrain = data->m_nTrain;
02001     m_trainOrig = data->m_trainOrig;
02002     m_trainTargetOrig = data->m_trainTargetOrig;
02003     m_trainTargetOrigEffect = data->m_trainTargetOrigEffect;
02004     m_trainTargetOrigResidual = data->m_trainTargetOrigResidual;
02005     m_trainLabelOrig = data->m_trainLabelOrig;
02006     m_trainBaggingIndex = data->m_trainBaggingIndex;
02007 
02008     // the validation set
02009     m_validSize = data->m_validSize;
02010     m_valid = data->m_valid;
02011     m_validTarget = data->m_validTarget;
02012     m_validLabel = data->m_validLabel;
02013     
02014     // the testset
02015     m_nTest = data->m_nTest;
02016     m_testOrig = data->m_testOrig;
02017     m_testTargetOrig = data->m_testTargetOrig;
02018     m_testLabelOrig = data->m_testLabelOrig;
02019 
02020     // probe split indices
02021     m_slotBoundaries = data->m_slotBoundaries;
02022 
02023     // trainsets per cross-validation division
02024     m_trainSize = data->m_trainSize;
02025     m_train = data->m_train;
02026     m_trainTarget = data->m_trainTarget;
02027     m_trainTargetEffect = data->m_trainTargetEffect;
02028     m_trainTargetResidual = data->m_trainTargetResidual;
02029     m_trainLabel = data->m_trainLabel;
02030 
02031     // probesets per cross-validation division
02032     m_probeSize = data->m_probeSize;
02033     m_probe = data->m_probe;
02034     m_probeTarget = data->m_probeTarget;
02035     m_probeTargetEffect = data->m_probeTargetEffect;
02036     m_probeTargetResidual = data->m_probeTargetResidual;
02037     m_probeLabel = data->m_probeLabel;
02038     m_probeIndex = data->m_probeIndex;
02039 
02040     m_crossIndex = data->m_crossIndex;
02041 
02042     // blend stopping
02043     m_blendingRegularization = data->m_blendingRegularization;
02044     m_enableGlobalBlendingWeights = data->m_enableGlobalBlendingWeights;
02045     m_blendingEnableCrossValidation = data->m_blendingEnableCrossValidation;
02046     m_enablePostNNBlending = data->m_enablePostNNBlending;
02047     m_blendingAlgorithm = data->m_blendingAlgorithm;
02048 
02049     // cascade learning
02050     m_enableCascadeLearning = data->m_enableCascadeLearning;
02051     m_nCascadeInputs = data->m_nCascadeInputs;
02052     m_cascadeInputs = data->m_cascadeInputs;
02053 
02054     // average over mean and std as new mean and std
02055     m_enableGlobalMeanStdEstimate = data->m_enableGlobalMeanStdEstimate;
02056 
02057     // parallelization of k-fold cross validation
02058     m_maxThreadsInCross = data->m_maxThreadsInCross;
02059 
02060     // memory save option
02061     m_enableSaveMemory = data->m_enableSaveMemory;
02062 
02063     // error function "AUC" or "RMSE"
02064     m_errorFunction = data->m_errorFunction;
02065 
02066     // reverse mix table
02067     m_mixDatasetIndices = data->m_mixDatasetIndices;
02068 
02069     // already trained algo list
02070     m_algorithmNameList = data->m_algorithmNameList;
02071 
02072     // clip after blend
02073     m_enablePostBlendClipping = data->m_enablePostBlendClipping;
02074 
02075     // add output noise
02076     m_addOutputNoise = data->m_addOutputNoise;
02077 
02078     // feature selection
02079     m_enableFeatureSelection = data->m_enableFeatureSelection;
02080     m_featureSelectionWriteBinaryDataset = data->m_featureSelectionWriteBinaryDataset;
02081 
02082     // bagging
02083     m_enableBagging = data->m_enableBagging;
02084     m_randomSeedBagging = data->m_randomSeedBagging;
02085 
02086     // write dsc files in training
02087     m_disableWriteDscFile = data->m_disableWriteDscFile;
02088 
02089     // static mean and std normalization
02090     m_enableStaticNormalization = data->m_enableStaticNormalization;
02091     m_staticMeanNormalization = data->m_staticMeanNormalization;
02092     m_staticStdNormalization = data->m_staticStdNormalization;
02093     m_enableProbablisticNormalization = data->m_enableProbablisticNormalization;
02094 
02095     // dimensionality reduction
02096     m_dimensionalityReduction = data->m_dimensionalityReduction;
02097 
02098     // if this is set, the algorithm should load saved weights before start to training
02099     m_loadWeightsBeforeTraining = data->m_loadWeightsBeforeTraining;
02100 
02101     m_subsampleTrainSet = data->m_subsampleTrainSet;
02102     m_subsampleFeatures = data->m_subsampleFeatures;
02103     m_globalTrainingLoops = data->m_globalTrainingLoops;
02104     m_addConstantInput = data->m_addConstantInput;
02105 }

void Data::setPathes ( string  temp,
string  dsc,
string  fullPred,
string  data 
)

Set important pathes for running the Framework

Parameters:
temp The temp directors, used for weights files of Algorithms
dsc The description file dir, the cout<<.. per Algorithm are collected here
fullPred The full-prediction dir, files which predicts the trainset with cross validation
data The dataset directory, where the dataset files are

Definition at line 1775 of file Data.cpp.

01776 {
01777     m_tempPath = temp;
01778     m_dscPath = dsc;
01779     m_fullPredPath = fullPred;
01780     m_dataPath = data;
01781 }

int * Data::splitStringToIntegerList ( string  str,
char  delimiter 
) [static]

Split a list of integers in a string E.g.: str="10,10,100,50"

Parameters:
str The string (input)
delimiter The delimiter sign (char)
Returns:
The int* list (allocated memory)

Definition at line 1897 of file Data.cpp.

01898 {
01899     vector<int> v;
01900     int number;
01901     char *begin = ( char* ) str.c_str(), *end = ( char* ) str.c_str(), tmp;
01902     for ( int i=0;i<str.length();i++ )
01903     {
01904         end++;
01905         if ( *end==delimiter || *end==0 )
01906         {
01907             tmp = *end;
01908             *end = 0;
01909             sscanf ( begin, "%d", &number );
01910             begin = end + 1;
01911             *end = tmp;
01912             v.push_back ( number );
01913         }
01914     }
01915     int* returnList = new int[v.size() ];
01916     for ( int i=0;i<v.size();i++ )
01917         returnList[i] = v[i];
01918     return returnList;
01919 }

/**
 * Split a string to substrings, e.g. str="10,10,100,50" and delimiter=','.
 *
 * Fixes vs. the previous version: the buffer returned by str.c_str() is no
 * longer written to (undefined behavior), the first character of the string
 * is now also checked as a delimiter, and an unused local variable was
 * removed.
 *
 * @param str The string (input)
 * @param delimiter The delimiter sign (char)
 * @return The vector of substrings; one entry per delimiter-separated token
 *         (a trailing delimiter yields a trailing empty token), empty input
 *         gives an empty vector
 */
vector< string > Data::splitStringToStringList ( string str, char delimiter )
{
    vector<string> v;
    if ( str.length() == 0 )
        return v;
    string::size_type begin = 0;
    while ( true )
    {
        string::size_type end = str.find ( delimiter, begin );
        if ( end == string::npos )
        {
            // last token: everything up to the end of the string
            v.push_back ( str.substr ( begin ) );
            break;
        }
        v.push_back ( str.substr ( begin, end - begin ) );
        begin = end + 1;
    }
    return v;
}

int Data::vectorSampling ( REAL *  probs,
int  length 
)

Returns the number of samples when having probabilites for each vector sample

Parameters:
probs per-sample probability
length the number of samples
Returns:
new samples

Definition at line 680 of file Data.cpp.

00681 {
00682     double sum = 0.0;
00683     for ( int i=0;i<length;i++ )
00684         sum += probs[i];
00685 
00686     double value = sum * ( ( double ) rand() / ( double ) RAND_MAX );
00687 
00688     sum = 0.0;
00689     for ( int i=0;i<length;i++ )
00690     {
00691         sum += probs[i];
00692         if ( sum >= value )
00693             return i;
00694     }
00695     cout<<"value:"<<value<<endl<<"length:"<<length<<endl<<"sum:"<<sum<<endl;
00696     for ( int i=0;i<length;i++ )
00697         cout<<probs[i]<<" "<<flush;
00698     assert ( false );
00699     return -1;
00700 }


The documentation for this class was generated from the following files:

Generated on Tue Jan 26 09:21:06 2010 for ELF by  doxygen 1.5.8