#include <Data.h>
Public Member Functions | |
Data () | |
virtual | ~Data () |
void | readParameter (string line, int mode) |
void | readDscFile (string name) |
void | setPathes (string temp, string dsc, string fullPred, string data) |
void | readDataset (string name) |
void | allocMemForCrossValidationSets () |
void | partitionDatasetToCrossValidationSets () |
void | fillCascadeLearningInputs () |
void | extendTrainDataWithCascadeInputs () |
void | fillNCrossValidationSet (int n) |
void | freeNCrossValidationSet (int n) |
void | readEffectFile () |
void | setDataPointers (Data *data) |
void | mixDataset () |
void | deleteMemory () |
void | loadNormalization (int nCascade=0) |
void | setAlgorithmList (vector< string > m_algorithmNameList) |
void | loadFeatureSelectionFile () |
void | saveFeatureSelectionFile () |
void | doFeatureSelection () |
void | makeBinaryDataset () |
void | enableBagging (bool en) |
void | doBootstrapSampling (REAL *probs, REAL *&train, REAL *&target, REAL *&targetEff, REAL *&targetRes, int *&label, int nTrainNew=0) |
void | baggingRandomSeed (uint seed) |
int | vectorSampling (REAL *probs, int length) |
void | mergeTrainAndTest () |
void | normalizeZeroOne () |
void | reduceTrainingSetSize (REAL percent) |
void | reduceFeatureSize (REAL *&table, int tableRows, int &tableCols, REAL percent, bool loadColumnSet) |
void | addConstantInput () |
Static Public Member Functions | |
static vector< string > | getDirectoryFileList (string path) |
static int * | splitStringToIntegerList (string str, char delimiter) |
static vector< string > | splitStringToStringList (string str, char delimiter) |
Protected Attributes | |
string | m_datasetPath |
string | m_datasetName |
string | m_algorithmName |
int | m_algorithmID |
string | m_trainOnFullPredictorFile |
bool | m_disableTraining |
int | m_randSeed |
int | m_nMixDataset |
int | m_nMixTrainList |
int | m_nCross |
string | m_validationType |
int | m_maxThreadsInCross |
bool | m_enableGlobalMeanStdEstimate |
REAL | m_positiveTarget |
REAL | m_negativeTarget |
double | m_blendingRegularization |
bool | m_enableGlobalBlendingWeights |
bool | m_blendingEnableCrossValidation |
bool | m_enablePostNNBlending |
string | m_blendingAlgorithm |
bool | m_enableCascadeLearning |
int | m_nCascadeInputs |
REAL * | m_cascadeInputs |
map< string, int > | m_intMap |
map< string, double > | m_doubleMap |
map< string, bool > | m_boolMap |
map< string, string > | m_stringMap |
string | m_tempPath |
string | m_dscPath |
string | m_fullPredPath |
string | m_dataPath |
int | m_nFeatures |
int | m_nClass |
int | m_nDomain |
int * | m_mixDatasetIndices |
int * | m_mixList |
int * | m_crossIndex |
uint | m_nTrain |
REAL * | m_trainOrig |
REAL * | m_trainTargetOrig |
REAL * | m_trainTargetOrigEffect |
REAL * | m_trainTargetOrigResidual |
int * | m_trainLabelOrig |
uint | m_nTest |
REAL * | m_testOrig |
REAL * | m_testTargetOrig |
int * | m_testLabelOrig |
int * | m_slotBoundaries |
int * | m_trainSize |
REAL ** | m_train |
REAL ** | m_trainTarget |
REAL ** | m_trainTargetEffect |
REAL ** | m_trainTargetResidual |
int ** | m_trainLabel |
int ** | m_trainBaggingIndex |
int * | m_probeSize |
REAL ** | m_probe |
REAL ** | m_probeTarget |
REAL ** | m_probeTargetEffect |
REAL ** | m_probeTargetResidual |
int ** | m_probeLabel |
int ** | m_probeIndex |
int | m_validSize |
REAL * | m_valid |
REAL * | m_validTarget |
int * | m_validLabel |
REAL * | m_mean |
REAL * | m_std |
REAL | m_standardDeviationMin |
REAL * | m_targetMean |
bool | m_enableSaveMemory |
string | m_errorFunction |
REAL * | m_support |
vector< string > | m_algorithmNameList |
bool | m_enablePostBlendClipping |
REAL | m_addOutputNoise |
bool | m_enableFeatureSelection |
bool | m_featureSelectionWriteBinaryDataset |
bool | m_enableBagging |
uint | m_randomSeedBagging |
bool | m_disableWriteDscFile |
bool | m_enableStaticNormalization |
REAL | m_staticMeanNormalization |
REAL | m_staticStdNormalization |
bool | m_enableProbablisticNormalization |
string | m_dimensionalityReduction |
REAL | m_subsampleTrainSet |
REAL | m_subsampleFeatures |
int | m_globalTrainingLoops |
bool | m_addConstantInput |
bool | m_loadWeightsBeforeTraining |
Friends | |
class | Scheduler |
class | Algorithm |
class | Autoencoder |
An Algorithm is in general derived from the Data class (Algorithm is a child of Data). The information from the master.dsc file is stored here.
Mean and Standard deviation for inputs are stored here.
Definition at line 34 of file Data.h.
Data::Data | ( | ) |
Constructor
Definition at line 8 of file Data.cpp.
00009 { 00010 cout<<"Constructor Data"<<endl; 00011 00012 // init member vars 00013 m_algorithmID = 0; 00014 m_randSeed = 0; 00015 m_nMixDataset = 0; 00016 m_nMixTrainList = 0; 00017 m_nCross = 0; 00018 m_validationType = "Retraining"; 00019 m_maxThreadsInCross = 0; 00020 m_enableGlobalMeanStdEstimate = 0; 00021 m_positiveTarget = 0; 00022 m_negativeTarget = 0; 00023 m_blendingRegularization = 0; 00024 m_enableGlobalBlendingWeights = 0; 00025 m_blendingEnableCrossValidation = 0; 00026 m_enablePostNNBlending = 0; 00027 m_enableCascadeLearning = 0; 00028 m_nCascadeInputs = 0; 00029 m_cascadeInputs = 0; 00030 m_nFeatures = 0; 00031 m_nClass = 0; 00032 m_nDomain = 0; 00033 m_mixDatasetIndices = 0; 00034 m_mixList = 0; 00035 m_crossIndex = 0; 00036 m_nTrain = 0; 00037 m_trainOrig = 0; 00038 m_trainTargetOrig = 0; 00039 m_trainTargetOrigEffect = 0; 00040 m_trainTargetOrigResidual = 0; 00041 m_trainLabelOrig = 0; 00042 m_trainBaggingIndex = 0; 00043 m_nTest = 0; 00044 m_testOrig = 0; 00045 m_testTargetOrig = 0; 00046 m_testLabelOrig = 0; 00047 m_slotBoundaries = 0; 00048 m_trainSize = 0; 00049 m_train = 0; 00050 m_trainTarget = 0; 00051 m_trainTargetEffect = 0; 00052 m_trainTargetResidual = 0; 00053 m_trainLabel = 0; 00054 m_probeSize = 0; 00055 m_probe = 0; 00056 m_probeTarget = 0; 00057 m_probeTargetEffect = 0; 00058 m_probeTargetResidual = 0; 00059 m_probeLabel = 0; 00060 m_probeIndex = 0; 00061 m_validSize = 0; 00062 m_valid = 0; 00063 m_validTarget = 0; 00064 m_validLabel = 0; 00065 m_mean = 0; 00066 m_std = 0; 00067 m_standardDeviationMin = 0; 00068 m_targetMean = 0; 00069 m_enableSaveMemory = 0; 00070 m_support = 0; 00071 m_enablePostBlendClipping = 0; 00072 m_addOutputNoise = 0; 00073 m_enableFeatureSelection = 0; 00074 m_featureSelectionWriteBinaryDataset = 0; 00075 m_enableBagging = 0; 00076 m_randomSeedBagging = 0; 00077 m_enableStaticNormalization = 0; 00078 m_staticMeanNormalization = 0.0; 00079 m_staticStdNormalization = 1.0; 00080 
m_enableProbablisticNormalization = 0; 00081 m_dimensionalityReduction = ""; 00082 m_subsampleTrainSet = 1.0; 00083 m_subsampleFeatures = 1.0; 00084 m_disableTraining = false; 00085 m_globalTrainingLoops = 1; 00086 m_addConstantInput = 0; 00087 m_loadWeightsBeforeTraining = false; 00088 }
Data::~Data | ( | ) | [virtual] |
void Data::addConstantInput | ( | ) |
Add a constant 1 column to the feature matrices
Definition at line 2403 of file Data.cpp.
02404 { 02405 if(m_trainOrig) 02406 { 02407 cout<<"Add a constant 1 column to the train feature matrix"<<endl; 02408 REAL* trainTmp = new REAL[m_nTrain*(m_nFeatures+1)]; 02409 for(int i=0;i<m_nTrain;i++) 02410 { 02411 for(int j=0;j<m_nFeatures;j++) 02412 trainTmp[i*(m_nFeatures+1)+j] = m_trainOrig[i*m_nFeatures+j]; 02413 trainTmp[i*(m_nFeatures+1)+m_nFeatures] = 1.0; 02414 } 02415 delete[] m_trainOrig; 02416 m_trainOrig = trainTmp; 02417 } 02418 if(m_testOrig) 02419 { 02420 cout<<"Add a constant 1 column to the test feature matrix"<<endl; 02421 REAL* testTmp = new REAL[m_nTest*(m_nFeatures+1)]; 02422 for(int i=0;i<m_nTest;i++) 02423 { 02424 for(int j=0;j<m_nFeatures;j++) 02425 testTmp[i*(m_nFeatures+1)+j] = m_testOrig[i*m_nFeatures+j]; 02426 testTmp[i*(m_nFeatures+1)+m_nFeatures] = 1.0; 02427 } 02428 delete[] m_testOrig; 02429 m_testOrig = testTmp; 02430 } 02431 m_nFeatures++; 02432 }
void Data::allocMemForCrossValidationSets | ( | ) |
Allocate memory for the cross-validation dataset splits. The target decomposition used throughout is: residual = original - effect,
where residual = the model error, original = the original target value from the data file, and effect = the prediction of an Algorithm (e.g. a preprocessing step).
Definition at line 901 of file Data.cpp.
// NOTE(review): collapsed Doxygen dump of Data::allocMemForCrossValidationSets (Data.cpp:901).
// The routine: computes per-feature mean/std (with special handling for constant and
// KDDCup09 features, plus optional static/global/probabilistic normalization modes),
// prints target means, persists normalization.dat, builds the randomized mix list, and
// allocates the m_nCross+1 train/probe slots (slot m_nCross = retraining/full set).
// Review findings — confirm against the real Data.cpp before changing:
//  - "Target means" loop uses ptr = m_trainTargetOrig + i*m_nClass*m_nDomain and reads ptr[j];
//    elsewhere targets are indexed as [j*m_nClass*m_nDomain + k], so the stride here looks
//    wrong and the printed means likely scan the wrong elements (diagnostic output only).
//  - normalization.dat is opened with ios::out but written with binary f.write of REALs;
//    presumably ios::binary is also needed for portability — TODO confirm.
//  - sprintf into a fixed char buf[1024]; prefer snprintf to guard against long paths.
//  - int loop counters compare against the uint m_nTrain (signed/unsigned mix) — benign here.
00902 { 00903 cout<<"Alloc mem for cross validation data sets"<<endl; 00904 m_mean = new REAL[m_nFeatures]; 00905 m_std = new REAL[m_nFeatures]; 00906 00907 if(m_validationType == "ValidationSet") 00908 m_nCross = 0; 00909 else 00910 { 00911 // partitioning to nCross-validation sets 00912 if ( m_nCross > m_nTrain ) 00913 { 00914 cout<<"Limit: nCross=nTrain"<<endl; 00915 m_nCross = m_nTrain; 00916 } 00917 cout<<"Cross-validation settings: "<<m_nCross<<" sets"<<endl; 00918 } 00919 00920 // calc global mean and standard deviation over whole dataset 00921 cout<<"Calculating mean and std per input"<<endl; 00922 double minStd = 1e10, maxStd = -1e10, minMean = 1e10, maxMean = -1e10, minValue = 1e10, maxValue = -1e10; 00923 for ( int i=0;i<m_nFeatures;i++ ) 00924 { 00925 // calc mean 00926 double mean = 0.0; 00927 for ( int j=0;j<m_nTrain;j++ ) 00928 { 00929 REAL v = m_trainOrig[j*m_nFeatures + i]; 00930 mean += v; 00931 if ( minValue > v ) 00932 minValue = v; 00933 if ( maxValue < v ) 00934 maxValue = v; 00935 } 00936 mean /= ( double ) m_nTrain; 00937 00938 // calc standard deviation 00939 double std = 0.0; 00940 for ( int j=0;j<m_nTrain;j++ ) 00941 std += ( mean - m_trainOrig[j*m_nFeatures + i] ) * ( mean - m_trainOrig[j*m_nFeatures + i] ); 00942 std = sqrt ( std/ ( double ) ( m_nTrain-1 ) ); 00943 00944 if ( m_datasetName=="KDDCup09Large" || m_datasetName=="KDDCup09Small" ) // || m_datasetName=="BINARY") 00945 { 00946 double max = -1e10; 00947 for ( int j=0;j<m_nTrain;j++ ) 00948 if ( max < fabs ( m_trainOrig[j*m_nFeatures + i]-mean ) ) 00949 max = fabs ( m_trainOrig[j*m_nFeatures + i]-mean ); 00950 std = max; 00951 } 00952 00953 if ( fabs ( std ) < 1e-9 && mean == 0.0 ) // constant zero input 00954 { 00955 //cout<<"Feature nr:"<<i<<" is constant zero (mean:"<<mean<<"), set std=1e10"<<endl; 00956 cout<<"f:"<<i<<"=0 "<<flush; 00957 std = 1e10; 00958 } 00959 if ( fabs ( std ) < 1e-9 && mean != 0.0 ) // constant input 00960 { 00961 //cout<<"Feature nr:"<<i<<" is constant 
(mean:"<<mean<<"), set std="<<mean<<" and mean=0"<<endl; 00962 cout<<"f:"<<i<<"=c "<<flush; 00963 std = mean; 00964 mean = 0.0; 00965 } 00966 if ( mean==1.0 ) // constant one input 00967 { 00968 //cout<<"Feature nr:"<<i<<" mean=1, set std=1 and mean=0"<<endl; 00969 cout<<"f:"<<i<<"=1 "<<flush; 00970 std = 1.0; 00971 mean = 0.0; 00972 } 00973 if ( std < m_standardDeviationMin ) // limit to a small positive value 00974 { 00975 //cout<<"Feature nr:"<<i<<" "<<"("<<std<<") is limited in std="<<m_standardDeviationMin<<endl; 00976 cout<<"f:"<<i<<"lim "<<flush; 00977 std = m_standardDeviationMin; 00978 } 00979 00980 minStd = minStd > std? std : minStd; 00981 maxStd = maxStd < std? std : maxStd; 00982 minMean = minMean > mean? mean : minMean; 00983 maxMean = maxMean < mean? mean : maxMean; 00984 00985 // save them 00986 m_mean[i] = mean; 00987 m_std[i] = std; 00988 } 00989 if ( m_enableStaticNormalization ) 00990 { 00991 cout<<"Static mean:"<<m_staticMeanNormalization<<" and std:"<<m_staticStdNormalization<<endl; 00992 for ( int i=0;i<m_nFeatures;i++ ) 00993 { 00994 m_mean[i] = m_staticMeanNormalization; 00995 m_std[i] = m_staticStdNormalization; 00996 } 00997 minMean = m_staticMeanNormalization; 00998 maxMean = m_staticMeanNormalization; 00999 minStd = m_staticStdNormalization; 01000 maxStd = m_staticStdNormalization; 01001 } 01002 if ( m_enableGlobalMeanStdEstimate ) 01003 { 01004 cout<<"Calc average of mean and std"<<endl; 01005 double mean = 0.0; 01006 for ( int i=0;i<m_nFeatures;i++ ) 01007 mean += m_mean[i]; 01008 mean /= ( double ) m_nFeatures; 01009 for ( int i=0;i<m_nFeatures;i++ ) 01010 m_mean[i] = mean; 01011 minMean = maxMean = mean; 01012 01013 double std = 0.0; 01014 int stdCnt = 0; 01015 for ( int i=0;i<m_nFeatures;i++ ) 01016 { 01017 if ( m_std[i] != 1e10 ) 01018 { 01019 std += m_std[i]; 01020 stdCnt++; 01021 } 01022 } 01023 if ( stdCnt == 0 ) 01024 assert ( false ); 01025 std /= ( double ) stdCnt; 01026 for ( int i=0;i<m_nFeatures;i++ ) 01027 m_std[i] = 
std; 01028 minStd = maxStd = std; 01029 } 01030 if ( m_enableProbablisticNormalization ) 01031 { 01032 cout<<"Calc probablistic normalization"<<endl; 01033 minStd = 1e10; 01034 maxStd = -1e10; 01035 minMean = 1e10; 01036 maxMean = -1e10; 01037 for ( int i=0;i<m_nFeatures;i++ ) 01038 { 01039 REAL min = 1e10, max = -1e10; 01040 for ( int j=0;j<m_nTrain;j++ ) 01041 { 01042 REAL v = m_trainOrig[i + j*m_nFeatures]; 01043 if ( min > v ) 01044 min = v; 01045 if ( max < v ) 01046 max = v; 01047 } 01048 REAL diff = max - min; 01049 m_mean[i] = min; 01050 m_std[i] = diff; 01051 if ( m_std[i] < 1e-6 ) 01052 m_std[i] = 1.0; 01053 01054 minStd = minStd > m_std[i]? m_std[i] : minStd; 01055 maxStd = maxStd < m_std[i]? m_std[i] : maxStd; 01056 minMean = minMean > m_mean[i]? m_mean[i] : minMean; 01057 maxMean = maxMean < m_mean[i]? m_mean[i] : maxMean; 01058 } 01059 cout<<"mean|std:"<<endl; 01060 for ( int i=0;i<m_nFeatures;i++ ) 01061 cout<<m_mean[i]<<"|"<<m_std[i]<<" "; 01062 cout<<endl; 01063 } 01064 cout<<"Min|Max mean: "<<minMean<<"|"<<maxMean<<" Min|Max std: "<<minStd<<"|"<<maxStd<<" Min|Max value: "<<minValue<<"|"<<maxValue<<endl; 01065 01066 // target means 01067 cout<<"Target means: "<<flush; 01068 for ( int i=0;i<m_nClass*m_nDomain;i++ ) 01069 { 01070 double mean = 0.0; 01071 REAL* ptr = m_trainTargetOrig + i * m_nClass * m_nDomain; 01072 for ( int j=0;j<m_nTrain;j++ ) 01073 mean += ptr[j]; 01074 cout<<i<<":"<<mean/ ( double ) ( m_nTrain ) <<" "; 01075 } 01076 cout<<endl; 01077 01078 // save normalization 01079 char buf[1024]; 01080 sprintf ( buf,"%s/%s/normalization.dat.add%d",m_datasetPath.c_str(), m_tempPath.c_str(), m_nCascadeInputs ); 01081 cout<<"Save mean and std: "<<buf<<endl; 01082 fstream f ( buf, ios::out ); 01083 f.write ( ( char* ) &m_nFeatures, sizeof ( int ) ); 01084 f.write ( ( char* ) m_mean, sizeof ( REAL ) *m_nFeatures ); 01085 f.write ( ( char* ) m_std, sizeof ( REAL ) *m_nFeatures ); 01086 f.close(); 01087 01088 m_mixList = new int[m_nTrain]; 01089 
01090 // mixing list 01091 for ( int i=0;i<m_nTrain;i++ ) 01092 m_mixList[i] = i; 01093 01094 // fix the randomness 01095 cout<<"Random seed:"<<m_randSeed<<endl; 01096 srand ( m_randSeed ); 01097 01098 cout<<"nFeatures:"<<m_nFeatures<<endl; 01099 cout<<"nClass:"<<m_nClass<<endl; 01100 cout<<"nDomain:"<<m_nDomain<<endl; 01101 01102 if ( m_validationType == "ValidationSet" ) 01103 { 01104 // no cross validation set 01105 m_trainSize = new int[1]; 01106 m_trainSize[0] = m_nTrain; 01107 return; 01108 } 01109 01110 01111 m_trainTargetOrigEffect = new REAL[m_nClass*m_nDomain*m_nTrain]; 01112 m_trainTargetOrigResidual = new REAL[m_nClass*m_nDomain*m_nTrain]; 01113 01114 // allocate mem for cross validation sets 01115 m_trainSize = new int[m_nCross+1]; 01116 m_train = new REAL*[m_nCross+1]; 01117 m_trainTarget = new REAL*[m_nCross+1]; 01118 m_trainTargetEffect = new REAL*[m_nCross+1]; 01119 m_trainTargetResidual = new REAL*[m_nCross+1]; 01120 m_trainLabel = new int*[m_nCross+1]; 01121 if(m_validationType == "Bagging") 01122 m_trainBaggingIndex = new int*[m_nCross+1]; 01123 01124 m_probeSize = new int[m_nCross+1]; 01125 m_probe = new REAL*[m_nCross+1]; 01126 m_probeTarget = new REAL*[m_nCross+1]; 01127 m_probeTargetEffect = new REAL*[m_nCross+1]; 01128 m_probeTargetResidual = new REAL*[m_nCross+1]; 01129 m_probeLabel = new int*[m_nCross+1]; 01130 m_probeIndex = new int*[m_nCross+1]; 01131 01132 01133 // make a randomized index list (by random index swaps) 01134 int index0, index1, tmp; 01135 cout<<"Make "<<m_nTrain*m_nMixTrainList<<" index swaps (randomize sample index list)"<<endl; 01136 for ( int i=0;i<m_nTrain*m_nMixTrainList;i++ ) 01137 { 01138 index0 = rand() % m_nTrain; 01139 index1 = rand() % m_nTrain; 01140 01141 // swap 01142 tmp = m_mixList[index0]; 01143 m_mixList[index0] = m_mixList[index1]; 01144 m_mixList[index1] = tmp; 01145 } 01146 01147 if( m_validationType == "Retraining" || m_validationType == "CrossFoldMean" ) 01148 { 01149 m_slotBoundaries = new 
int[m_nCross+2]; 01150 01151 double partitionSize = ( double ) m_nTrain / ( double ) m_nCross; 01152 double accumulatedSize = partitionSize; 01153 int cnt = 0, currentSize = -1; 01154 m_slotBoundaries[0] = 0; 01155 m_slotBoundaries[m_nCross+1] = m_nTrain; 01156 cout<<"partition size: "<<partitionSize<<endl; 01157 01158 // calculate train + probe size 01159 for ( int i=0;i<=m_nTrain;i++ ) 01160 { 01161 currentSize++; 01162 if ( cnt < m_nCross ) 01163 { 01164 if ( i == ( int ) round ( accumulatedSize ) || i==m_nTrain ) 01165 { 01166 m_slotBoundaries[cnt+1] = i; 01167 m_probeSize[cnt] = currentSize; 01168 m_trainSize[cnt] = m_nTrain - currentSize; 01169 currentSize = 0; 01170 accumulatedSize += partitionSize; 01171 cnt++; 01172 } 01173 } 01174 } 01175 m_trainSize[m_nCross] = m_nTrain; // retraining set 01176 m_probeSize[m_nCross] = 0; 01177 01178 // print splits 01179 int sum = 0; 01180 cout<<"slot: TRAIN | PROBE"<<endl<<"==================="<<endl; 01181 for ( int i=0;i<m_nCross+1;i++ ) 01182 { 01183 cout<<i<<": "<<m_trainSize[i]<<" | "<<m_probeSize[i]<<endl; 01184 sum += m_probeSize[i]; 01185 } 01186 cout<<"probe sum:"<<sum<<endl; 01187 } 01188 else if ( m_validationType == "Bagging" ) 01189 { 01190 bool* bagSamples = new bool[m_nTrain]; 01191 cout<<"Bagging sizes: TRAIN | PROBE"<<endl<<"============================"<<endl; 01192 for(int i=0;i<m_nCross;i++) 01193 { 01194 m_trainBaggingIndex[i] = new int[m_nTrain]; 01195 01196 // simulate boostrap sampling: sampling with replacenent 01197 srand(Framework::getRandomSeed() + i); 01198 int cnt = 0; 01199 for(int j=0;j<m_nTrain;j++) 01200 bagSamples[j] = 0; 01201 for(int j=0;j<m_nTrain;j++) 01202 { 01203 int ind = rand() % m_nTrain; 01204 bagSamples[ind] = 1; 01205 m_trainBaggingIndex[i][j] = ind; 01206 } 01207 for(int j=0;j<m_nTrain;j++) 01208 cnt += bagSamples[j]; 01209 m_trainSize[i] = m_nTrain; 01210 m_probeSize[i] = m_nTrain - cnt; 01211 01212 m_probeIndex[i] = new int[m_probeSize[i]]; 01213 cnt = 0; 01214 for(int 
j=0;j<m_nTrain;j++) 01215 { 01216 if(bagSamples[j] == false) 01217 { 01218 m_probeIndex[i][cnt] = j; 01219 cnt++; 01220 } 01221 } 01222 cout<<i<<": "<<m_trainSize[i]<<" | "<<m_probeSize[i]<<" ("<<100.0*(double)m_probeSize[i]/(double)m_nTrain<<"% in probe)"<<endl; 01223 } 01224 m_trainSize[m_nCross] = 0; 01225 m_probeSize[m_nCross] = 0; 01226 m_probeIndex[m_nCross] = 0; 01227 m_trainBaggingIndex[m_nCross] = 0; 01228 delete[] bagSamples; 01229 01230 // make a summary (#zeros, mean coverage) 01231 int* bagCnt = new int[m_nTrain]; 01232 for(int i=0;i<m_nTrain;i++) 01233 bagCnt[i] = 0; 01234 for(int i=0;i<m_nCross;i++) 01235 for(int j=0;j<m_nTrain;j++) 01236 bagCnt[m_trainBaggingIndex[i][j]]++; 01237 cout<<"Bagging summary: #averaged: and #cnt"<<endl; 01238 for(int nr=0;nr<2*m_nCross;nr++) 01239 { 01240 int cnt = 0; 01241 for(int i=0;i<m_nTrain;i++) 01242 if(bagCnt[i] == nr) 01243 cnt++; 01244 cout<<"n:"<<nr<<"|#"<<cnt<<" "; 01245 } 01246 cout<<endl; 01247 delete[] bagCnt; 01248 } 01249 else 01250 assert(false); 01251 01252 // allocate mem + copy data to cross-validation slots 01253 for ( int i=0;i<m_nCross+1;i++ ) 01254 { 01255 // allocate train mem 01256 int nTrain = m_trainSize[i]; 01257 if ( m_enableSaveMemory == false ) 01258 m_train[i] = new REAL[nTrain * m_nFeatures]; 01259 else 01260 m_train[i] = 0; 01261 m_trainTarget[i] = new REAL[nTrain * m_nClass * m_nDomain]; 01262 m_trainTargetEffect[i] = new REAL[nTrain * m_nClass * m_nDomain]; 01263 m_trainTargetResidual[i] = new REAL[nTrain * m_nClass * m_nDomain]; 01264 m_trainLabel[i] = new int[nTrain*m_nDomain]; 01265 01266 // allocate probe mem 01267 int nProbe = m_probeSize[i]; 01268 if ( nProbe ) 01269 { 01270 if ( m_enableSaveMemory == false ) 01271 m_probe[i] = new REAL[nProbe * m_nFeatures]; 01272 else 01273 m_probe[i] = 0; 01274 m_probeTarget[i] = new REAL[nProbe * m_nClass * m_nDomain]; 01275 m_probeTargetEffect[i] = new REAL[nProbe * m_nClass * m_nDomain]; 01276 m_probeTargetResidual[i] = new REAL[nProbe * 
m_nClass * m_nDomain]; 01277 m_probeLabel[i] = new int[nProbe*m_nDomain]; 01278 if ( m_validationType != "Bagging" ) 01279 m_probeIndex[i] = new int[nProbe]; 01280 } 01281 else 01282 { 01283 m_probe[i] = 0; 01284 m_probeTarget[i] = 0; 01285 m_probeTargetEffect[i] = 0; 01286 m_probeTargetResidual[i] = 0; 01287 m_probeLabel[i] = 0; 01288 m_probeIndex[i] = 0; 01289 } 01290 } 01291 01292 // alloc index list 01293 m_crossIndex = new int[m_nTrain]; 01294 for ( int i=0;i<m_nTrain;i++ ) 01295 m_crossIndex[i] = -1; 01296 01297 }
void Data::baggingRandomSeed | ( | uint | seed | ) |
void Data::deleteMemory | ( | ) |
Deletes internal memory, in order to re-read a dataset and start the training again
Definition at line 104 of file Data.cpp.
// NOTE(review): collapsed Doxygen dump of Data::deleteMemory (Data.cpp:104).
// Frees every owned array and immediately nulls the pointer, so the method is
// idempotent and safe to call before re-reading a dataset. The per-slot loop
// covers m_nCross+1 entries (the extra slot is the retraining/full set), and
// m_trainBaggingIndex is only released when m_validationType=="Bagging",
// matching its conditional allocation.
// Review findings — confirm against the real Data.cpp:
//  - m_valid / m_validTarget / m_validLabel and m_support are NOT freed here;
//    verify they are owned elsewhere, otherwise this leaks the validation set.
//  - each `if (p) delete[] p;` guard is redundant (delete[] on null is a no-op)
//    but harmless; kept as-is.
00105 { 00106 cout<<"Delete internal memory"<<endl; 00107 00108 // memory from dataset 00109 if ( m_trainOrig ) 00110 delete[] m_trainOrig; 00111 m_trainOrig = 0; 00112 if ( m_trainTargetOrig ) 00113 delete[] m_trainTargetOrig; 00114 m_trainTargetOrig = 0; 00115 if ( m_trainLabelOrig ) 00116 delete[] m_trainLabelOrig; 00117 m_trainLabelOrig = 0; 00118 if ( m_testOrig ) 00119 delete[] m_testOrig; 00120 m_testOrig = 0; 00121 if ( m_testTargetOrig ) 00122 delete[] m_testTargetOrig; 00123 m_testTargetOrig = 0; 00124 if ( m_testLabelOrig ) 00125 delete[] m_testLabelOrig; 00126 m_testLabelOrig = 0; 00127 00128 // memory from cross validation 00129 if ( m_mean ) 00130 delete[] m_mean; 00131 m_mean = 0; 00132 if ( m_std ) 00133 delete[] m_std; 00134 m_std = 0; 00135 if ( m_trainTargetOrigEffect ) 00136 delete[] m_trainTargetOrigEffect; 00137 m_trainTargetOrigEffect = 0; 00138 if ( m_trainTargetOrigResidual ) 00139 delete[] m_trainTargetOrigResidual; 00140 m_trainTargetOrigResidual = 0; 00141 00142 for ( int i=0;i<m_nCross+1;i++ ) 00143 { 00144 if ( m_train ) 00145 { 00146 if ( m_train[i] ) 00147 delete[] m_train[i]; 00148 m_train[i] = 0; 00149 } 00150 if ( m_trainTarget ) 00151 { 00152 if ( m_trainTarget[i] ) 00153 delete[] m_trainTarget[i]; 00154 m_trainTarget[i] = 0; 00155 } 00156 if ( m_trainTargetEffect ) 00157 { 00158 if ( m_trainTargetEffect[i] ) 00159 delete[] m_trainTargetEffect[i]; 00160 m_trainTargetEffect[i] = 0; 00161 } 00162 if ( m_trainTargetResidual ) 00163 { 00164 if ( m_trainTargetResidual[i] ) 00165 delete[] m_trainTargetResidual[i]; 00166 m_trainTargetResidual[i] = 0; 00167 } 00168 if ( m_trainLabel ) 00169 { 00170 if ( m_trainLabel[i] ) 00171 delete[] m_trainLabel[i]; 00172 m_trainLabel[i] = 0; 00173 } 00174 if ( m_validationType == "Bagging" ) 00175 { 00176 if( m_trainBaggingIndex ) 00177 { 00178 if ( m_trainBaggingIndex[i] ) 00179 delete[] m_trainBaggingIndex[i]; 00180 m_trainBaggingIndex[i] = 0; 00181 } 00182 } 00183 if ( m_probe ) 00184 { 00185 if ( 
m_probe[i] ) 00186 delete[] m_probe[i]; 00187 m_probe[i] = 0; 00188 } 00189 if ( m_probeTarget ) 00190 { 00191 if ( m_probeTarget[i] ) 00192 delete[] m_probeTarget[i]; 00193 m_probeTarget[i] = 0; 00194 } 00195 if ( m_probeTargetEffect ) 00196 { 00197 if ( m_probeTargetEffect[i] ) 00198 delete[] m_probeTargetEffect[i]; 00199 m_probeTargetEffect[i] = 0; 00200 } 00201 if ( m_probeTargetResidual ) 00202 { 00203 if ( m_probeTargetResidual[i] ) 00204 delete[] m_probeTargetResidual[i]; 00205 m_probeTargetResidual[i] = 0; 00206 } 00207 if ( m_probeLabel ) 00208 { 00209 if ( m_probeLabel[i] ) 00210 delete[] m_probeLabel[i]; 00211 m_probeLabel[i] = 0; 00212 } 00213 if ( m_probeIndex ) 00214 { 00215 if ( m_probeIndex[i] ) 00216 delete[] m_probeIndex[i]; 00217 m_probeIndex[i] = 0; 00218 } 00219 } 00220 if ( m_train ) 00221 delete[] m_train; 00222 m_train = 0; 00223 if ( m_trainTarget ) 00224 delete[] m_trainTarget; 00225 m_trainTarget = 0; 00226 if ( m_trainTargetEffect ) 00227 delete[] m_trainTargetEffect; 00228 m_trainTargetEffect = 0; 00229 if ( m_trainTargetResidual ) 00230 delete[] m_trainTargetResidual; 00231 m_trainTargetResidual = 0; 00232 if ( m_trainLabel ) 00233 delete[] m_trainLabel; 00234 m_trainLabel = 0; 00235 if(m_validationType == "Bagging") 00236 { 00237 if(m_trainBaggingIndex) 00238 delete[] m_trainBaggingIndex; 00239 m_trainBaggingIndex = 0; 00240 } 00241 if ( m_probe ) 00242 delete[] m_probe; 00243 m_probe = 0; 00244 if ( m_probeTarget ) 00245 delete[] m_probeTarget; 00246 m_probeTarget = 0; 00247 if ( m_probeTargetEffect ) 00248 delete[] m_probeTargetEffect; 00249 m_probeTargetEffect = 0; 00250 if ( m_probeTargetResidual ) 00251 delete[] m_probeTargetResidual; 00252 m_probeTargetResidual = 0; 00253 if ( m_probeLabel ) 00254 delete[] m_probeLabel; 00255 m_probeLabel = 0; 00256 if ( m_probeIndex ) 00257 delete[] m_probeIndex; 00258 m_probeIndex = 0; 00259 00260 if ( m_trainSize ) 00261 delete[] m_trainSize; 00262 m_trainSize = 0; 00263 if ( m_probeSize ) 
00264 delete[] m_probeSize; 00265 m_probeSize = 0; 00266 00267 if ( m_mixDatasetIndices ) 00268 delete[] m_mixDatasetIndices; 00269 m_mixDatasetIndices = 0; 00270 if ( m_mixList ) 00271 delete[] m_mixList; 00272 m_mixList = 0; 00273 if ( m_slotBoundaries ) 00274 delete[] m_slotBoundaries; 00275 m_slotBoundaries = 0; 00276 if ( m_crossIndex ) 00277 delete[] m_crossIndex; 00278 m_crossIndex = 0; 00279 00280 if ( m_cascadeInputs ) 00281 delete[] m_cascadeInputs; 00282 m_cascadeInputs = 0; 00283 00284 if ( m_targetMean ) 00285 delete[] m_targetMean; 00286 m_targetMean = 0; 00287 00288 }
void Data::doBootstrapSampling | ( | REAL * | probs, | |
REAL *& | train, | |||
REAL *& | target, | |||
REAL *& | targetEff, | |||
REAL *& | targetRes, | |||
int *& | label, | |||
int | nTrainNew = 0 | |||
) |
This is an obsolete method!! Please use directly the option: validationType=Bagging in the Master.dsc file instead
Make a modified train dataset using bootstrap sampling (sampling with replacement). On average, 63% of the original data end up in the new train set (with duplicates).
Definition at line 557 of file Data.cpp.
// NOTE(review): collapsed Doxygen dump of Data::doBootstrapSampling (Data.cpp:557).
// Marked obsolete upstream — prefer validationType=Bagging in the Master.dsc file.
// Draws m_nTrain bootstrap samples (or, when 0 < nTrainNew < m_nTrain, a random
// subset without replacement) and rebinds the by-reference pointers to the new arrays.
// Review findings — confirm against the real Data.cpp:
//  - the previous train/target/targetEff/targetRes/label buffers are NOT freed before
//    being replaced at the end (train = trainNew; ...); the caller must release the
//    old buffers or they leak.
//  - when Framework::getDatasetType()==false, labelNew stays 0 yet still overwrites
//    the caller's label pointer — presumably intentional for regression datasets,
//    but verify callers do not rely on the old pointer.
//  - "boostrap" typos appear in runtime cout strings; left untouched (changing them
//    would alter program output).
00558 { 00559 cout<<endl<<"Do boostrap sampling of the dataset (size:"<<m_nTrain<<")"<<endl; 00560 cout<<"Random seed:"<<m_randomSeedBagging<<endl; 00561 srand ( m_randomSeedBagging ); 00562 00563 if ( nTrainNew > 0 && nTrainNew < m_nTrain ) 00564 cout<<"Draw not a boostrap sample, make a simple random subset ("<<100.0* ( double ) nTrainNew/ ( double ) m_nTrain<<"%)"<<endl; 00565 00566 REAL* trainNew = 0, *ptr0, *ptr1; 00567 if ( train ) 00568 trainNew = new REAL[m_nFeatures*m_nTrain]; 00569 REAL* targetNew = 0; 00570 if ( target ) 00571 targetNew = new REAL[m_nClass*m_nDomain*m_nTrain]; 00572 REAL* targetEffNew = 0; 00573 if ( targetEff ) 00574 targetEffNew = new REAL[m_nClass*m_nDomain*m_nTrain]; 00575 REAL* targetResNew = 0; 00576 if ( targetRes ) 00577 targetResNew = new REAL[m_nClass*m_nDomain*m_nTrain]; 00578 int* labelNew = 0; 00579 if ( Framework::getDatasetType() ==true ) 00580 labelNew = new int[m_nDomain*m_nTrain]; 00581 int* replicateCnt = new int[m_nTrain]; 00582 for ( int i=0;i<m_nTrain;i++ ) 00583 replicateCnt[i] = 0; 00584 00585 int sampleCnt = 0; 00586 while ( ( sampleCnt < m_nTrain && nTrainNew == 0 ) || ( sampleCnt < nTrainNew && nTrainNew > 0 && nTrainNew < m_nTrain ) ) 00587 //for(int i=0;i<m_nTrain;i++) 00588 { 00589 // random index 00590 int ind; 00591 if ( nTrainNew == 0 || nTrainNew >= m_nTrain ) // boostrap sample 00592 { 00593 if ( probs == 0 ) 00594 ind = rand() %m_nTrain; 00595 else 00596 ind = vectorSampling ( probs, m_nTrain ); 00597 } 00598 else // random subset 00599 { 00600 ind = rand() %m_nTrain; 00601 while ( replicateCnt[ind] ) 00602 ind = rand() %m_nTrain; 00603 } 00604 replicateCnt[ind]++; 00605 00606 // train features 00607 if ( train ) 00608 { 00609 ptr0 = train + ind * m_nFeatures; 00610 ptr1 = trainNew + sampleCnt * m_nFeatures; 00611 for ( int j=0;j<m_nFeatures;j++ ) 00612 ptr1[j] = ptr0[j]; 00613 } 00614 00615 // targets 00616 if ( target ) 00617 { 00618 ptr0 = target + ind * m_nClass*m_nDomain; 00619 ptr1 = targetNew + 
sampleCnt * m_nClass*m_nDomain; 00620 for ( int j=0;j<m_nClass*m_nDomain;j++ ) 00621 ptr1[j] = ptr0[j]; 00622 } 00623 00624 // effects 00625 if ( targetEff ) 00626 { 00627 ptr0 = targetEff + ind * m_nClass*m_nDomain; 00628 ptr1 = targetEffNew + sampleCnt * m_nClass*m_nDomain; 00629 for ( int j=0;j<m_nClass*m_nDomain;j++ ) 00630 ptr1[j] = ptr0[j]; 00631 } 00632 00633 // residual 00634 if ( targetRes ) 00635 { 00636 ptr0 = targetRes + ind * m_nClass*m_nDomain; 00637 ptr1 = targetResNew + sampleCnt * m_nClass*m_nDomain; 00638 for ( int j=0;j<m_nClass*m_nDomain;j++ ) 00639 ptr1[j] = ptr0[j]; 00640 } 00641 00642 // train label 00643 if ( Framework::getDatasetType() ==true ) 00644 for ( int d=0;d<m_nDomain;d++ ) 00645 labelNew[d+sampleCnt*m_nDomain] = label[d+ind*m_nDomain]; 00646 00647 sampleCnt++; 00648 } 00649 00650 int nonReplicates = 0, notUsed = 0, replicates = 0; 00651 for ( int i=0;i<m_nTrain;i++ ) 00652 { 00653 if ( replicateCnt[i] == 0 ) 00654 notUsed++; 00655 if ( replicateCnt[i] == 1 ) 00656 nonReplicates++; 00657 if ( replicateCnt[i] > 1 ) 00658 replicates++; 00659 } 00660 cout<<"notUsed:"<<notUsed<<" nonReplicates:"<<nonReplicates<<" replicates:"<<replicates; 00661 cout<<" ("<<100.0* ( REAL ) ( nonReplicates+replicates ) / ( REAL ) m_nTrain<<"%)"<<endl<<endl; 00662 00663 delete[] replicateCnt; 00664 00665 // set new data 00666 train = trainNew; 00667 target = targetNew; 00668 targetEff = targetEffNew; 00669 targetRes = targetResNew; 00670 label = labelNew; 00671 }
void Data::doFeatureSelection | ( | ) |
Start the feature selection process
Definition at line 1445 of file Data.cpp.
01446 { 01447 bool* selectedFeatures = new bool[m_nFeatures]; 01448 InputFeatureSelector::selectFeatures ( selectedFeatures, m_trainOrig, m_nFeatures, m_nTrain, m_trainLabelOrig, m_trainTargetOrigResidual, m_nClass, m_nDomain ); 01449 01450 delete[] selectedFeatures; 01451 }
void Data::enableBagging | ( | bool | en | ) |
void Data::extendTrainDataWithCascadeInputs | ( | ) |
Extend the input features with the predictions of previous algorithms: nInputsNew = nInputs + nCascadeInputs.
Definition at line 1716 of file Data.cpp.
01717 { 01718 if ( m_nCascadeInputs == 0 ) 01719 return; 01720 01721 cout<<"Extend the train data with cascade inputs"<<endl; 01722 01723 if ( m_trainOrig ) 01724 { 01725 REAL* m_trainOrigNew = new REAL[m_nTrain* ( m_nFeatures+m_nCascadeInputs*m_nClass*m_nDomain ) ]; 01726 for ( int i=0;i<m_nTrain;i++ ) 01727 { 01728 REAL* ptr0 = m_trainOrigNew + i* ( m_nFeatures+m_nCascadeInputs*m_nClass*m_nDomain ); 01729 REAL* ptr1 = m_trainOrig + i*m_nFeatures; 01730 for ( int j=0;j<m_nFeatures;j++ ) 01731 ptr0[j] = ptr1[j]; 01732 ptr0 = m_trainOrigNew + i* ( m_nFeatures+m_nCascadeInputs*m_nClass*m_nDomain ) + m_nFeatures; 01733 ptr1 = m_cascadeInputs + i*m_nCascadeInputs*m_nClass*m_nDomain; 01734 for ( int j=0;j<m_nCascadeInputs*m_nClass*m_nDomain;j++ ) 01735 ptr0[j] = ptr1[j]; 01736 } 01737 if ( m_trainOrig ) 01738 delete[] m_trainOrig; 01739 m_trainOrig = m_trainOrigNew; 01740 } 01741 01742 if ( m_testOrig ) 01743 { 01744 REAL* m_testOrigNew = new REAL[m_nTest* ( m_nFeatures+m_nCascadeInputs*m_nClass*m_nDomain ) ]; 01745 for ( int i=0;i<m_nTest;i++ ) 01746 { 01747 REAL* ptr0 = m_testOrigNew + i* ( m_nFeatures+m_nCascadeInputs*m_nClass*m_nDomain ); 01748 REAL* ptr1 = m_testOrig + i*m_nFeatures; 01749 for ( int j=0;j<m_nFeatures;j++ ) 01750 ptr0[j] = ptr1[j]; 01751 ptr0 = m_testOrigNew + i* ( m_nFeatures+m_nCascadeInputs*m_nClass*m_nDomain ) + m_nFeatures; 01752 ptr1 = m_cascadeInputs + i*m_nCascadeInputs*m_nClass*m_nDomain; 01753 for ( int j=0;j<m_nCascadeInputs*m_nClass*m_nDomain;j++ ) 01754 ptr0[j] = ptr1[j]; 01755 } 01756 if ( m_testOrig ) 01757 delete[] m_testOrig; 01758 m_testOrig = m_testOrigNew; 01759 } 01760 01761 int nFeaturesBefore = m_nFeatures; 01762 m_nFeatures += m_nCascadeInputs*m_nClass*m_nDomain; 01763 cout<<"nFeatures: "<<m_nFeatures<<" (before: "<<nFeaturesBefore<<")"<<endl; 01764 }
void Data::fillCascadeLearningInputs | ( | ) |
If this algorithm is based on another algorithm, add the predictions of the previous algorithms as input features. This means adding all predictions found in the fullPredictionPath.
Definition at line 1656 of file Data.cpp.
01657 { 01658 cout<<endl<<"Add effects (predictions of previous algorithms) as inputs to dataset"<<endl; 01659 01660 // load the fullPredictors 01661 vector<string> files = m_algorithmNameList; //Data::getDirectoryFileList(m_datasetPath + "/" + m_fullPredPath + "/"); 01662 vector<string> m_usedFiles; 01663 01664 for ( int i=0;i<files.size();i++ ) 01665 if ( files[i].at ( files[i].size()-1 ) != '.' && files[i].find ( ".dat" ) == files[i].length()-4 ) 01666 m_usedFiles.push_back ( files[i] ); 01667 int size = m_usedFiles.size(); 01668 01669 // alloc mem 01670 m_cascadeInputs = new REAL[size*m_nClass*m_nDomain*m_nTrain]; 01671 for ( int i=0;i<size*m_nClass*m_nDomain*m_nTrain;i++ ) 01672 m_cascadeInputs[i] = 1e10; 01673 01674 // fill cascadeInputs 01675 for ( int i=0;i<size;i++ ) 01676 { 01677 fstream f ( m_usedFiles[i].c_str(), ios::in ); 01678 if ( f.is_open() == false ) 01679 assert ( false ); 01680 REAL* cache = new REAL[m_nTrain*m_nClass*m_nDomain]; 01681 f.read ( ( char* ) cache, sizeof ( REAL ) *m_nTrain*m_nClass*m_nDomain ); 01682 f.close(); 01683 01684 for ( int j=0;j<m_nTrain;j++ ) 01685 for ( int k=0;k<m_nClass*m_nDomain;k++ ) 01686 m_cascadeInputs[j*m_nClass*m_nDomain*size + i*m_nClass*m_nDomain + k] = cache[j*m_nClass*m_nDomain + k]; 01687 01688 if ( cache ) 01689 delete[] cache; 01690 cache = 0; 01691 } 01692 for ( int i=0;i<size;i++ ) 01693 { 01694 double rmse = 0.0, err; 01695 for ( int j=0;j<m_nTrain;j++ ) 01696 for ( int k=0;k<m_nClass*m_nDomain;k++ ) 01697 { 01698 err = m_cascadeInputs[j*m_nClass*m_nDomain*size + i*m_nClass*m_nDomain + k] - m_trainTargetOrig[k + j*m_nClass*m_nDomain]; 01699 rmse += err*err; 01700 } 01701 cout<<"File:"<<m_usedFiles[i]<<" RMSE:"<<sqrt ( rmse/ ( double ) ( m_nClass*m_nTrain*m_nDomain ) ) <<endl; 01702 } 01703 if ( size == 0 ) 01704 cout<<"Nothing to do here"<<endl; 01705 cout<<endl; 01706 01707 m_nCascadeInputs = size; 01708 cout<<"nCascadeInputs:"<<m_nCascadeInputs<<endl; 01709 }
void Data::fillNCrossValidationSet | ( | int | n | ) |
Fill one split of the cross-fold validation set
n | The n-th set (0..nCross-1) |
Definition at line 1348 of file Data.cpp.
/**
 * Fill one split of the cross-fold validation set.
 * Allocates m_train[n]/m_probe[n] and copies the corresponding rows of
 * m_trainOrig into them, either via bootstrap indices ("Bagging") or via
 * the mixed slot boundaries (all other validation types).
 * @param n  The n-th set (0..nCross-1)
 */
void Data::fillNCrossValidationSet ( int n )
{
    // alloc new memory (zero-initialized); probe is only allocated when the
    // split actually has probe rows
    if ( m_train[n] )
        delete[] m_train[n];
    m_train[n] = 0;
    m_train[n] = new REAL[m_trainSize[n]*m_nFeatures];
    for ( int i=0;i<m_trainSize[n]*m_nFeatures;i++ )
        m_train[n][i] = 0.0;
    if ( m_probe[n] )
        delete[] m_probe[n];
    m_probe[n] = 0;
    if ( m_probeSize[n] )
        m_probe[n] = new REAL[m_probeSize[n]*m_nFeatures];
    for ( int i=0;i<m_probeSize[n]*m_nFeatures;i++ )
        m_probe[n][i] = 0.0;

    if(m_validationType == "Bagging")
    {
        // bootstrap split: rows drawn by m_trainBaggingIndex[n] form the
        // train set; the out-of-bag rows form the probe set
        bool* bagSamples = new bool[m_nTrain];
        for(int i=0;i<m_nTrain;i++)
            bagSamples[i] = 0;
        for(int i=0;i<m_nTrain;i++)
        {
            int ind = m_trainBaggingIndex[n][i];
            bagSamples[ind] = 1;  // mark as in-bag
            for(int j=0;j<m_nFeatures;j++)
                m_train[n][i*m_nFeatures+j] = m_trainOrig[ind*m_nFeatures + j];
        }
        // copy the unmarked (out-of-bag) rows into the probe set
        int cnt = 0;
        for(int i=0;i<m_nTrain;i++)
        {
            if(bagSamples[i] == false)
            {
                for(int j=0;j<m_nFeatures;j++)
                    m_probe[n][cnt*m_nFeatures+j] = m_trainOrig[i*m_nFeatures + j];
                cnt++;
            }
        }
        if(cnt != m_probeSize[n])  // safety check: OOB count must match the precomputed size
        {
            cout<<"cnt:"<<cnt<<" probeSize"<<m_probeSize[n]<<endl;
            assert(false);
        }
        delete[] bagSamples;
    }
    else
    {
        // slot of probeset: rows [begin,end) of the shuffled order are probe
        int begin = m_slotBoundaries[n];
        int end = m_slotBoundaries[n+1];

        int probeCnt = 0, trainCnt = 0;

        // go through whole trainOrig set in the mixed (shuffled) order
        for ( int j=0;j<m_nTrain;j++ )
        {
            int index = m_mixList[j];

            // probe set
            if ( j>=begin && j <end )
            {
                for ( int k=0;k<m_nFeatures;k++ )
                    m_probe[n][probeCnt*m_nFeatures + k] = m_trainOrig[index*m_nFeatures + k];
                probeCnt++;
            }
            else // train set
            {
                for ( int k=0;k<m_nFeatures;k++ )
                    m_train[n][trainCnt*m_nFeatures + k] = m_trainOrig[index*m_nFeatures + k];
                trainCnt++;
            }
        }

        if ( probeCnt != m_probeSize[n] || trainCnt != m_trainSize[n] ) // safety check
            assert ( false );
    }
}
void Data::freeNCrossValidationSet | ( | int | n | ) |
Free memory of one split of the cross-fold validation set
n | The n-th set (0..nCross-1) |
Definition at line 1432 of file Data.cpp.
01433 { 01434 if ( m_train[n] ) 01435 delete[] m_train[n]; 01436 m_train[n] = 0; 01437 if ( m_probe[n] ) 01438 delete[] m_probe[n]; 01439 m_probe[n] = 0; 01440 }
vector< string > Data::getDirectoryFileList | ( | string | path | ) | [static] |
path | The path to the directory, which should be listed |
Definition at line 1873 of file Data.cpp.
01874 { 01875 vector<string> v; 01876 DIR *dp; 01877 struct dirent *dirp; 01878 if ( ( dp = opendir ( path.c_str() ) ) == NULL ) 01879 { 01880 cout << "Error opening " << path << endl; 01881 return v; 01882 } 01883 while ( ( dirp = readdir ( dp ) ) != NULL ) 01884 v.push_back ( path + string ( dirp->d_name ) ); 01885 closedir ( dp ); 01886 return v; 01887 }
void Data::loadNormalization | ( | int | nCascade = 0 |
) |
Load the normalization.dat in the temp folder
Definition at line 853 of file Data.cpp.
00854 { 00855 // load normalization 00856 char buf[1024]; 00857 sprintf ( buf,"%s/%s/normalization.dat.add%d",m_datasetPath.c_str(), m_tempPath.c_str(), nCascade ); 00858 cout<<"Load mean and std: "<<buf<<endl; 00859 fstream f ( buf, ios::in ); 00860 if ( f.is_open() == false ) 00861 assert ( false ); 00862 int n; 00863 f.read ( ( char* ) &n, sizeof ( int ) ); 00864 if ( m_mean == 0 ) 00865 m_mean = new REAL[n]; 00866 if ( m_std == 0 ) 00867 m_std = new REAL[n]; 00868 f.read ( ( char* ) m_mean, sizeof ( REAL ) *n ); 00869 f.read ( ( char* ) m_std, sizeof ( REAL ) *n ); 00870 REAL min = 1e10, max = -1e10; 00871 for ( int i=0;i<n;i++ ) 00872 { 00873 if ( min > m_mean[i] ) 00874 min = m_mean[i]; 00875 if ( max < m_mean[i] ) 00876 max = m_mean[i]; 00877 } 00878 cout<<"Mean: min|max:"<<min<<"|"<<max<<endl; 00879 min = 1e10; 00880 max = -1e10; 00881 for ( int i=0;i<n;i++ ) 00882 { 00883 if ( min > m_std[i] ) 00884 min = m_std[i]; 00885 if ( max < m_std[i] ) 00886 max = m_std[i]; 00887 } 00888 cout<<"Std: min|max:"<<min<<"|"<<max<<endl; 00889 f.close(); 00890 }
void Data::makeBinaryDataset | ( | ) |
Writes the dataset in binary form
Definition at line 707 of file Data.cpp.
00708 { 00709 cout<<endl; 00710 cout<<"Make binary dataset from selected features"<<endl; 00711 cout<<"Open features:"<<FEATURE_TXT_FILE<<endl; 00712 00713 // read features from txt file 00714 fstream f; 00715 vector<int> features; 00716 f.open ( FEATURE_TXT_FILE,ios::in ); 00717 if ( f.is_open() ==false ) 00718 assert ( false ); 00719 int value, nValidFeatures = 0; 00720 while ( f>>value ) 00721 features.push_back ( value ); 00722 f.close(); 00723 00724 // check featureIDs 00725 for ( int j=0;j<features.size();j++ ) 00726 if ( features[j] >= m_nFeatures || features[j] == -1 ) 00727 assert ( false ); 00728 else 00729 nValidFeatures++; 00730 00731 cout<<"nValidFeatures:"<<nValidFeatures<<endl; 00732 REAL* feat; 00733 int* label, N; 00734 00735 if ( Framework::getFrameworkMode() == 1 ) 00736 { 00737 cout<<"Write: binary.test"<<endl; 00738 f.open ( "binary.test", ios::out ); 00739 feat = m_testOrig; 00740 label = m_testLabelOrig; 00741 N = m_nTest; 00742 } 00743 else 00744 { 00745 cout<<"Write: binary.train"<<endl; 00746 f.open ( "binary.train", ios::out ); 00747 feat = m_trainOrig; 00748 label = m_trainLabelOrig; 00749 N = m_nTrain; 00750 } 00751 00752 cout<<"#lines:"<<N<<endl; 00753 00754 // dataset bounds 00755 f.write ( ( char* ) &N, sizeof ( int ) ); 00756 f.write ( ( char* ) &m_nClass, sizeof ( int ) ); 00757 f.write ( ( char* ) &m_nDomain, sizeof ( int ) ); 00758 f.write ( ( char* ) &nValidFeatures, sizeof ( int ) ); 00759 00760 // write features 00761 for ( int i=0;i<N;i++ ) 00762 for ( int j=0;j<features.size();j++ ) 00763 f.write ( ( char* ) & ( feat[i*m_nFeatures + features[j]] ), sizeof ( REAL ) ); 00764 00765 // write labels 00766 f.write ( ( char* ) label, sizeof ( int ) *N*m_nDomain ); 00767 f.close(); 00768 00769 }
void Data::mergeTrainAndTest | ( | ) |
Merge the train and test set into the train set
This is used in the dimensionality reduction, where the training is unsupervised, which means to train only on features and without targets
Definition at line 2155 of file Data.cpp.
02156 { 02157 cout<<"trainSet = {trainSet(#"<<m_nTrain<<") + testSet(#"<<m_nTest<<")}"<<endl; 02158 if ( m_nTest == 0 ) 02159 return; 02160 02161 REAL* train = new REAL[ ( m_nTrain + m_nTest ) *m_nFeatures]; 02162 REAL* trainTarget = new REAL[ ( m_nTrain + m_nTest ) *m_nClass*m_nDomain]; 02163 int* trainLabel = new int[ ( m_nTrain + m_nTest ) *m_nDomain]; 02164 02165 memcpy ( train, m_trainOrig, sizeof ( REAL ) *m_nTrain*m_nFeatures ); 02166 memcpy ( train + m_nTrain*m_nFeatures, m_testOrig, sizeof ( REAL ) *m_nTest*m_nFeatures ); 02167 02168 memcpy ( trainTarget, m_trainTargetOrig, sizeof ( REAL ) *m_nTrain*m_nClass*m_nDomain ); 02169 memcpy ( trainTarget + m_nTrain*m_nClass*m_nDomain, m_testTargetOrig, sizeof ( REAL ) *m_nTest*m_nClass*m_nDomain ); 02170 02171 memcpy ( trainLabel, m_trainLabelOrig, sizeof ( REAL ) *m_nTrain*m_nDomain ); 02172 memcpy ( trainLabel + m_nTrain*m_nDomain, m_testLabelOrig, sizeof ( REAL ) *m_nTest*m_nDomain ); 02173 02174 delete[] m_trainOrig; 02175 delete[] m_trainTargetOrig; 02176 delete[] m_trainLabelOrig; 02177 02178 m_trainOrig = train; 02179 m_trainTargetOrig = trainTarget; 02180 m_trainLabelOrig = trainLabel; 02181 02182 m_nTrain = m_nTrain + m_nTest; 02183 }
void Data::mixDataset | ( | ) |
Mix the dataset: perform m_nTrain*m_nMixDataset random sample swaps.
Definition at line 775 of file Data.cpp.
/**
 * Mix (shuffle) the training set in place by performing
 * m_nTrain*m_nMixDataset random pairwise row swaps with rand().
 * Features, targets and (for classification datasets) labels are swapped
 * consistently; m_mixDatasetIndices records where each original row ended
 * up. No-op (with m_mixDatasetIndices = 0) when the train set is empty.
 * NOTE: depends on the global rand() state, i.e. on the seed set elsewhere.
 */
void Data::mixDataset()
{
    if ( m_nTrain )
    {
        // identity permutation as starting point
        m_mixDatasetIndices = new int[m_nTrain];
        for ( int i=0;i<m_nTrain;i++ )
            m_mixDatasetIndices[i] = i;
    }
    else
    {
        cout<<"Do no mix the dataset."<<endl;
        m_mixDatasetIndices = 0;
        return;
    }
    cout<<"Randomize the dataset: "<<m_nMixDataset*m_nTrain<<" line swaps [";

    int progress = m_nTrain*m_nMixDataset/10 + 1;  // print ~10 progress dots
    REAL* tmp0 = new REAL[m_nFeatures];            // swap buffer: feature row
    REAL* tmp1 = new REAL[m_nClass*m_nDomain];     // swap buffer: target row
    for ( int i=0;i<m_nTrain*m_nMixDataset;i++ )
    {
        if ( i%progress==0 )
            cout<<"."<<flush;

        // random index swaps
        int ind0 = rand() %m_nTrain;
        int ind1 = rand() %m_nTrain;

        // train features (REAL*)
        REAL* ptr0 = m_trainOrig + ind0 * m_nFeatures;
        REAL* ptr1 = m_trainOrig + ind1 * m_nFeatures;
        for ( int j=0;j<m_nFeatures;j++ )
        {
            tmp0[j] = ptr0[j];
            ptr0[j] = ptr1[j];
            ptr1[j] = tmp0[j];
        }

        // train targets (REAL*)
        ptr0 = m_trainTargetOrig + ind0 * m_nClass * m_nDomain;
        ptr1 = m_trainTargetOrig + ind1 * m_nClass * m_nDomain;
        for ( int j=0;j<m_nClass*m_nDomain;j++ )
        {
            tmp1[j] = ptr0[j];
            ptr0[j] = ptr1[j];
            ptr1[j] = tmp1[j];
        }

        // train labels — only classification datasets have labels
        if ( Framework::getDatasetType() ==true )
        {
            for ( int d=0;d<m_nDomain;d++ )
            {
                int tmp = m_trainLabelOrig[d+ind0*m_nDomain];
                m_trainLabelOrig[d+ind0*m_nDomain] = m_trainLabelOrig[d+ind1*m_nDomain];
                m_trainLabelOrig[d+ind1*m_nDomain] = tmp;
            }
        }

        // keep the permutation record consistent with the data swaps
        int tmp = m_mixDatasetIndices[ind0];
        m_mixDatasetIndices[ind0] = m_mixDatasetIndices[ind1];
        m_mixDatasetIndices[ind1] = tmp;
    }
    if ( tmp0 )
        delete[] tmp0;
    tmp0 = 0;
    if ( tmp1 )
        delete[] tmp1;
    tmp1 = 0;

    cout<<"] "<<"mixInd[0]:"<<m_mixDatasetIndices[0]<<" mixInd["<<m_nTrain-1<<"]:"<<m_mixDatasetIndices[m_nTrain-1]<<endl;
}
void Data::normalizeZeroOne | ( | ) |
Normalize train between 0 and 1
Definition at line 2188 of file Data.cpp.
02189 { 02190 cout<<"Autoencoder: Normalize train between 0 and 1"<<endl; 02191 // (m_trainOrig[i*m_nFeatures + j] - m_mean[j]) / m_std[j] 02192 REAL* mean = new REAL[m_nFeatures]; 02193 REAL* std = new REAL[m_nFeatures]; 02194 02195 for ( int i=0;i<m_nFeatures;i++ ) 02196 { 02197 double mu = 0.0, min = 1e10, max = -1e10; 02198 for ( int j=0;j<m_nTrain;j++ ) 02199 { 02200 REAL v = m_trainOrig[i+j*m_nFeatures]; 02201 mu += v; 02202 if ( min > v ) 02203 min = v; 02204 if ( max < v ) 02205 max = v; 02206 } 02207 mean[i] = min; 02208 std[i] = max - min; 02209 if ( std[i] <= 1e-2 ) 02210 std[i] = 1.0; 02211 m_mean[i] = 0.0; 02212 m_std[i] = 1.0; 02213 02214 if ( m_enableStaticNormalization ) // something special, allow to modify the auto normalizations 02215 { 02216 mean[i] += m_staticMeanNormalization; 02217 std[i] *= m_staticStdNormalization; 02218 } 02219 } 02220 for ( int i=0;i<m_nTrain;i++ ) 02221 for ( int j=0;j<m_nFeatures;j++ ) 02222 { 02223 m_trainOrig[j+i*m_nFeatures] = ( m_trainOrig[j+i*m_nFeatures] - mean[j] ) / std[j]; 02224 REAL v = m_trainOrig[j+i*m_nFeatures]; 02225 if ( v > 1.0 || v < 0.0 ) 02226 { 02227 cout<<"v:"<<v<<endl; 02228 assert ( false ); 02229 } 02230 } 02231 02232 // print mean/std 02233 for ( int j=0;j<m_nFeatures;j++ ) 02234 cout<<mean[j]<<"|"<<std[j]<<" "; 02235 cout<<endl; 02236 02237 // save the normalizations 02238 cout<<"save the 0..1 normalizations"<<endl; 02239 string meanName = m_datasetPath + "/" + m_tempPath + "/AutoencoderDataMean.dat"; 02240 string stdName = m_datasetPath + "/" + m_tempPath + "/AutoencoderDataStd.dat"; 02241 cout<<"meanName:"<<meanName<<endl<<"stdName:"<<stdName<<endl; 02242 fstream fMean ( meanName.c_str(),ios::out ); 02243 fstream fStd ( stdName.c_str(),ios::out ); 02244 fMean.write ( ( char* ) mean, sizeof ( REAL ) *m_nFeatures ); 02245 fStd.write ( ( char* ) std, sizeof ( REAL ) *m_nFeatures ); 02246 fMean.close(); 02247 fStd.close(); 02248 02249 delete[] mean; 02250 delete[] std; 02251 }
void Data::partitionDatasetToCrossValidationSets | ( | ) |
Split the data into the n cross-validation sets and store them in member variables.
Definition at line 1458 of file Data.cpp.
/**
 * Split the data into the n-cross validation sets and store them in the
 * member buffers (m_train/m_probe plus label/target/effect/residual arrays).
 * Steps: read the effect file, dump the first 1000 rows of each set for
 * debugging, apply the mean/std normalization to the train features,
 * compute per-target means, then distribute rows according to
 * m_validationType ("Retraining"/"CrossFoldMean": mixed slot boundaries,
 * "Bagging": bootstrap indices, "ValidationSet": nothing to do).
 */
void Data::partitionDatasetToCrossValidationSets()
{
    cout<<"Partition dataset to cross validation sets"<<endl;

    // read the effect file
    readEffectFile();

    // debug aid: write the first (up to 1000) rows of each set to a file
    if(m_trainOrig)
    { fstream f("Atrain.txt",ios::out); for ( int i=0;i<m_nTrain && i < 1000;i++ ){for ( int j=0;j<m_nFeatures;j++ ) f<<m_trainOrig[i*m_nFeatures + j]<<" ";f<<endl;}f.close();}
    if(m_testOrig)
    { fstream f("Atest.txt",ios::out); for ( int i=0;i<m_nTest && i < 1000;i++ ){for ( int j=0;j<m_nFeatures;j++ ) f<<m_testOrig[i*m_nFeatures + j]<<" ";f<<endl;}f.close();}
    if(m_valid)
    { fstream f("Avalid.txt",ios::out); for ( int i=0;i<m_validSize && i < 1000;i++ ){for ( int j=0;j<m_nFeatures;j++ ) f<<m_valid[i*m_nFeatures + j]<<" ";f<<endl;}f.close();}

    // apply mean and std to input features (in place on m_trainOrig)
    cout<<"Apply mean and std correction to train input features"<<endl;
    for ( int i=0;i<m_nTrain;i++ )
        for ( int j=0;j<m_nFeatures;j++ )
            m_trainOrig[i*m_nFeatures + j] = ( m_trainOrig[i*m_nFeatures + j] - m_mean[j] ) / m_std[j];

    // print min and max values in features
    REAL min = 1e10, max = -1e10;
    for ( int i=0;i<m_nTrain;i++ )
        for ( int j=0;j<m_nFeatures;j++ )
        {
            if ( min > m_trainOrig[i*m_nFeatures + j] )
                min = m_trainOrig[i*m_nFeatures + j];
            if ( max < m_trainOrig[i*m_nFeatures + j] )
                max = m_trainOrig[i*m_nFeatures + j];
        }
    cout<<"Min/Max after apply mean/std: "<<min<<"/"<<max<<endl;

    // print min and max values in targets, and compute per-target means
    // (accumulated in double for accuracy, stored in m_targetMean)
    min = 1e10;
    max = -1e10;
    m_targetMean = new REAL[m_nClass*m_nDomain];
    double* targetMean = new double[m_nClass*m_nDomain];
    for(int i=0;i<m_nClass*m_nDomain;i++)
        targetMean[i] = 0.0;
    for ( int i=0;i<m_nTrain;i++ )
        for ( int j=0;j<m_nClass*m_nDomain;j++ )
        {
            targetMean[j] += m_trainTargetOrig[i*m_nClass*m_nDomain + j];
            if ( min > m_trainTargetOrig[i*m_nClass*m_nDomain + j] )
                min = m_trainTargetOrig[i*m_nClass*m_nDomain + j];
            if ( max < m_trainTargetOrig[i*m_nClass*m_nDomain + j] )
                max = m_trainTargetOrig[i*m_nClass*m_nDomain + j];
        }
    for(int i=0;i<m_nClass*m_nDomain;i++)
        m_targetMean[i] = targetMean[i]/(double)m_nTrain;
    delete[] targetMean;

    cout<<"Min/Max target: "<<min<<"/"<<max<<endl<<"Mean target: ";
    for(int i=0;i<m_nClass*m_nDomain;i++)
        cout<<m_targetMean[i]<<" ";
    cout<<endl<<endl;

    if(m_validationType == "Retraining" || m_validationType == "CrossFoldMean")
    {
        // NOTE(review): 'labels' is only filled when
        // Framework::getDatasetType() is true, but it is read below for both
        // probe and train rows — for regression datasets uninitialized
        // values are copied into m_probeLabel/m_trainLabel. Confirm the
        // label arrays are unused in that mode.
        int* labels = new int[m_nDomain];

        // copy data to cross-validation slots (slot m_nCross is the
        // retraining slot, hence m_nCross+1 iterations)
        for ( int i=0;i<m_nCross+1;i++ )
        {
            // slot of probeset: shuffled rows [begin,end) are probe rows
            int begin = m_slotBoundaries[i];
            int end = m_slotBoundaries[i+1];

            int probeCnt = 0, trainCnt = 0;

            // go through whole trainOrig set in the mixed order
            for ( int j=0;j<m_nTrain;j++ )
            {
                int index = m_mixList[j];
                if ( Framework::getDatasetType() )
                {
                    for ( int d=0;d<m_nDomain;d++ )
                        labels[d] = m_trainLabelOrig[d+index*m_nDomain];
                }

                // probe set
                if ( j>=begin && j <end )
                {
                    m_probeIndex[i][probeCnt] = index;
                    for ( int d=0;d<m_nDomain;d++ )
                        m_probeLabel[i][d+probeCnt*m_nDomain] = labels[d];
                    for ( int k=0;k<m_nFeatures;k++ )
                        if ( m_enableSaveMemory == false )  // features are copied lazily in save-memory mode
                            m_probe[i][probeCnt*m_nFeatures + k] = m_trainOrig[index*m_nFeatures + k];
                    for ( int k=0;k<m_nClass*m_nDomain;k++ )
                    {
                        m_probeTarget[i][probeCnt*m_nClass*m_nDomain + k] = m_trainTargetOrig[index*m_nClass*m_nDomain + k];
                        m_probeTargetEffect[i][probeCnt*m_nClass*m_nDomain + k] = m_trainTargetOrigEffect[index*m_nClass*m_nDomain + k];
                        m_probeTargetResidual[i][probeCnt*m_nClass*m_nDomain + k] = m_trainTargetOrigResidual[index*m_nClass*m_nDomain + k];
                    }
                    probeCnt++;
                    m_crossIndex[j] = i;  // remember which fold row j validates in
                }
                else // train set
                {
                    for ( int d=0;d<m_nDomain;d++ )
                        m_trainLabel[i][d+trainCnt*m_nDomain] = labels[d];
                    for ( int k=0;k<m_nFeatures;k++ )
                        if ( m_enableSaveMemory == false )
                            m_train[i][trainCnt*m_nFeatures + k] = m_trainOrig[index*m_nFeatures + k];
                    for ( int k=0;k<m_nClass*m_nDomain;k++ )
                    {
                        m_trainTarget[i][trainCnt*m_nClass*m_nDomain + k] = m_trainTargetOrig[index*m_nClass*m_nDomain + k];
                        m_trainTargetEffect[i][trainCnt*m_nClass*m_nDomain + k] = m_trainTargetOrigEffect[index*m_nClass*m_nDomain + k];
                        m_trainTargetResidual[i][trainCnt*m_nClass*m_nDomain + k] = m_trainTargetOrigResidual[index*m_nClass*m_nDomain + k];
                    }
                    trainCnt++;
                }
            }
            if ( probeCnt != m_probeSize[i] || trainCnt != m_trainSize[i] ) // safety check
                assert ( false );
        }

        if ( labels )
            delete[] labels;

        // every train row must have been assigned to some fold
        for ( int i=0;i<m_nTrain;i++ )
            if ( m_crossIndex[i] == -1 )
                assert ( false );
    }
    else if(m_validationType == "Bagging")
    {
        // bootstrap folds: in-bag rows -> train set, out-of-bag -> probe set
        bool* bagSamples = new bool[m_nTrain];
        for ( int i=0;i<m_nCross;i++ )
        {
            // train sets
            for(int j=0;j<m_nTrain;j++)
                bagSamples[j] = 0;
            for(int j=0;j<m_nTrain;j++)
            {
                uint ind = m_trainBaggingIndex[i][j];
                bagSamples[ind] = 1; // mark as in-bag
                if ( Framework::getDatasetType() )
                    for ( int d=0;d<m_nDomain;d++ )
                        m_trainLabel[i][d+j*m_nDomain] = m_trainLabelOrig[d+ind*m_nDomain];
                for ( int k=0;k<m_nFeatures;k++ )
                    if ( m_enableSaveMemory == false )
                        m_train[i][j*m_nFeatures + k] = m_trainOrig[ind*m_nFeatures + k];
                for ( int k=0;k<m_nClass*m_nDomain;k++ )
                {
                    m_trainTarget[i][j*m_nClass*m_nDomain + k] = m_trainTargetOrig[ind*m_nClass*m_nDomain + k];
                    m_trainTargetEffect[i][j*m_nClass*m_nDomain + k] = m_trainTargetOrigEffect[ind*m_nClass*m_nDomain + k];
                    m_trainTargetResidual[i][j*m_nClass*m_nDomain + k] = m_trainTargetOrigResidual[ind*m_nClass*m_nDomain + k];
                }
            }

            // probe sets: the out-of-bag rows; count must match m_probeSize
            int cnt = 0;
            for(int j=0;j<m_nTrain;j++)
                cnt += bagSamples[j];
            if(m_nTrain - cnt != m_probeSize[i])
                assert(false);
            cnt = 0;
            for(int j=0;j<m_nTrain;j++)
            {
                if(bagSamples[j] == false)
                {
                    if ( Framework::getDatasetType() )
                        for ( int d=0;d<m_nDomain;d++ )
                            m_probeLabel[i][d+cnt*m_nDomain] = m_trainLabelOrig[d+j*m_nDomain];
                    for ( int k=0;k<m_nFeatures;k++ )
                        if ( m_enableSaveMemory == false )
                            m_probe[i][cnt*m_nFeatures + k] = m_trainOrig[j*m_nFeatures + k];
                    for ( int k=0;k<m_nClass*m_nDomain;k++ )
                    {
                        m_probeTarget[i][cnt*m_nClass*m_nDomain + k] = m_trainTargetOrig[j*m_nClass*m_nDomain + k];
                        m_probeTargetEffect[i][cnt*m_nClass*m_nDomain + k] = m_trainTargetOrigEffect[j*m_nClass*m_nDomain + k];
                        m_probeTargetResidual[i][cnt*m_nClass*m_nDomain + k] = m_trainTargetOrigResidual[j*m_nClass*m_nDomain + k];
                    }
                    cnt++;
                }
            }
            if(cnt != m_probeSize[i])
                assert(false);
        }
        delete[] bagSamples;
    }
    else if(m_validationType == "ValidationSet")
    {
        ;  // explicit validation set: nothing to partition here
    }
    else
        assert(false);  // unknown validation type
}
void Data::readDataset | ( | string | name | ) |
Read a dataset. The name maps directly to a read-in method.
name | The name of the dataset |
Definition at line 296 of file Data.cpp.
/**
 * Read a dataset. The name maps directly to one DatasetReader method.
 * All reader arguments are passed by reference and the buffers are
 * allocated inside the DatasetReader. Afterwards the optional constant
 * input is added, train-set/feature subsampling is applied, optionally a
 * binary feature-selection dataset is written (then the program exits),
 * and finally the train set is shuffled.
 * @param name  The name of the dataset (exits the program when unknown)
 */
void Data::readDataset ( string name )
{
    if ( name == "MNIST" ) // read MNIST
    { DatasetReader r; r.readMNIST ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget ); }
    else if ( name == "NETFLIX" ) // read Netflix
    { DatasetReader r; r.readNETFLIX ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget ); }
    else if ( name == "AusDM2009" ) // read AusDM2009
    { DatasetReader r; r.readAusDM2009 ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget ); }
    else if ( name == "KDDCup09Large" ) // read large KDDCup09large dataset
    { DatasetReader r; r.readKDDCup09Large ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget ); }
    else if ( name == "KDDCup09Small" ) // read small KDDCup09small dataset
    { DatasetReader r; r.readKDDCup09Small ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget ); }
    else if ( name == "BINARY" ) // read binary format dataset
    { DatasetReader r; r.readBINARY ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget ); }
    else if ( name == "CSV" ) // read csv format dataset
    { DatasetReader r; r.readCSV ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget ); }
    else if ( name == "ARFF" ) // read arff format dataset
    { DatasetReader r; r.readARFF ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget ); }
    else if ( name == "PRUDSYS_DMC2009" ) // read PRUDSYS_DMC2009 dataset
    { DatasetReader r; r.readPRUDSYS ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget ); }
    else if ( name == "ADULT" ) // read adult dataset
    { DatasetReader r; r.readADULT ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget ); }
    else if ( name == "AUSTRALIAN" ) // read australian dataset
    { DatasetReader r; r.readAUSTRALIAN ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget ); }
    else if ( name == "BALANCE" ) // read balance dataset
    { DatasetReader r; r.readBALANCE ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget ); }
    else if ( name == "CYLINDER-BANDS" ) // read cylinder-bands dataset
    { DatasetReader r; r.readCYLINDERBANDS ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget ); }
    else if ( name == "BREAST" ) // read breast-cancer dataset
    { DatasetReader r; r.readBREASTCANCERWISCONSIN ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget ); }
    else if ( name == "CREDIT" ) // read australian-credit dataset
    { DatasetReader r; r.readAUSTRALIANCREDIT ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget ); }
    else if ( name == "DIABETES" ) // read diabetes dataset
    { DatasetReader r; r.readDIABETES ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget ); }
    else if ( name == "GERMAN" ) // read german dataset
    { DatasetReader r; r.readGERMAN ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget ); }
    else if ( name == "GLASS" ) // read glass dataset
    { DatasetReader r; r.readGLASS ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget ); }
    else if ( name == "HEART-SPECTF" ) // read heart dataset
    { DatasetReader r; r.readHEART ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget ); }
    else if ( name == "HEPATITIS" ) // read hepatitis dataset
    { DatasetReader r; r.readHEPATITIS ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget ); }
    else if ( name == "IONOSPHERE" ) // read ionosphere dataset
    { DatasetReader r; r.readIONOSPHERE ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget ); }
    else if ( name == "IRIS" ) // read iris dataset
    { DatasetReader r; r.readIRIS ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget ); }
    else if ( name == "LETTER" ) // read letter dataset
    { DatasetReader r; r.readLETTER ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget ); }
    else if ( name == "MONKS-1" ) // read monks1 dataset
    { DatasetReader r; r.readMONKS1 ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget ); }
    else if ( name == "MONKS-2" ) // read monks2 dataset
    { DatasetReader r; r.readMONKS2 ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget ); }
    else if ( name == "MONKS-3" ) // read monks3 dataset
    { DatasetReader r; r.readMONKS3 ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget ); }
    else if ( name == "MUSHROOM" ) // read mushroom dataset
    { DatasetReader r; r.readMUSHROOM ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget ); }
    else if ( name == "SATIMAGE" ) // read satimage dataset
    { DatasetReader r; r.readSATIMAGE ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget ); }
    else if ( name == "SEGMENTATION" ) // read segmentation dataset
    { DatasetReader r; r.readSEGMENTATION ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget ); }
    else if ( name == "SONAR" ) // read sonar dataset
    { DatasetReader r; r.readSONAR ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget ); }
    else if ( name == "VEHICLE" ) // read vehicle dataset
    { DatasetReader r; r.readVEHICLE ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget ); }
    else if ( name == "VOTES" ) // read votes dataset
    { DatasetReader r; r.readVOTES ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget ); }
    else if ( name == "WINE" ) // read wine dataset
    { DatasetReader r; r.readWINE ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget ); }
    else if ( name == "POKER" ) // read poker dataset
    { DatasetReader r; r.readPOKER ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget ); }
    else if ( name == "YEAST" ) // read yeast dataset
    { DatasetReader r; r.readYEAST ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget ); }
    else if ( name == "SURVIVAL" ) // read survival dataset
    { DatasetReader r; r.readSURVIVAL ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget ); }
    else if ( name == "SPIDER" ) // read (generated by) spider dataset
    { DatasetReader r; r.readSPIDER ( m_datasetPath+"/"+m_dataPath, m_trainOrig, m_trainTargetOrig, m_trainLabelOrig, m_testOrig, m_testTargetOrig, m_testLabelOrig, m_nTrain, m_nTest, m_nClass, m_nDomain, m_nFeatures, m_positiveTarget, m_negativeTarget ); }
    else
    {
        cout<<"Dataset not found:"<<name<<endl;
        exit ( 0 );
    }

    // optionally append a constant 1-input feature
    if(m_addConstantInput)
        addConstantInput();

    // reduce the size of the training set
    reduceTrainingSetSize ( m_subsampleTrainSet );

    // reduce the size of the features in the training set (the test set is
    // reduced with the column set chosen for the train set)
    int nFeatOrig = m_nFeatures;
    reduceFeatureSize ( m_trainOrig, m_nTrain, m_nFeatures, m_subsampleFeatures, Framework::getFrameworkMode() );
    reduceFeatureSize ( m_testOrig, m_nTest, nFeatOrig, m_subsampleFeatures, true );

    // feature selection, based on a linear model; writes the binary dataset
    // and terminates the program
    if ( m_featureSelectionWriteBinaryDataset )
    {
        makeBinaryDataset();
        exit ( 0 );
    }

    // mix train features and labels
    mixDataset();
}
void Data::readDscFile | ( | string | name | ) |
Read the description file
name | The description file name (string) |
Definition at line 1833 of file Data.cpp.
01834 { 01835 cout<<"Load descriptor file: "<<name<<endl; 01836 fstream f ( name.c_str(), ios::in ); 01837 01838 if ( f.is_open() ==false ) 01839 { 01840 cout<<"Can not open file:"<<name<<endl; 01841 assert ( false ); 01842 } 01843 01844 int mode = -1; // -1:meta info 0:int 1:double 2:string 3:bool 01845 01846 char buf[256]; 01847 while ( f.getline ( buf, 256 ) ) // read all lines 01848 { 01849 string line ( buf ); 01850 if ( line[0]=='#' ) // a comment 01851 continue; 01852 if ( line.find ( "[int]" ) != string::npos ) 01853 mode = 0; 01854 if ( line.find ( "[double]" ) != string::npos ) 01855 mode = 1; 01856 if ( line.find ( "[string]" ) != string::npos ) 01857 mode = 2; 01858 if ( line.find ( "[bool]" ) != string::npos ) 01859 mode = 3; 01860 01861 // only lines which consists of a '=' 01862 if ( line.find ( "=" ) != string::npos ) 01863 readParameter ( line, mode ); 01864 } 01865 01866 f.close(); 01867 }
void Data::readEffectFile | ( | ) |
Read the effect file. This is the prediction of the whole training set from another Algorithm, and can be used as a preprocessing step for another Algorithm. The effect file name is: m_trainOnFullPredictorFile
Definition at line 1306 of file Data.cpp.
01307 { 01308 if(m_validationType == "ValidationSet") 01309 return; 01310 01311 for ( int i=0;i<m_nClass*m_nDomain*m_nTrain;i++ ) 01312 m_trainTargetOrigEffect[i] = 0.0; 01313 01314 string name = m_datasetPath + "/" + m_fullPredPath + "/" + m_trainOnFullPredictorFile; 01315 fstream f ( name.c_str(), ios::in ); 01316 if ( f.is_open() && m_trainOnFullPredictorFile!="" ) 01317 { 01318 cout<<"Read fullPredictor:"<<name<<" "; 01319 f.read ( ( char* ) m_trainTargetOrigEffect, sizeof ( REAL ) *m_nClass*m_nDomain*m_nTrain ); 01320 01321 double rmse0 = 0.0, rmse1 = 0.0, err; 01322 for ( int i=0;i<m_nClass*m_nDomain;i++ ) 01323 { 01324 for ( int j=0;j<m_nTrain;j++ ) 01325 { 01326 err = m_trainTargetOrigEffect[j*m_nClass*m_nDomain + i] - m_trainTargetOrig[j*m_nClass*m_nDomain + i]; 01327 rmse0 += err * err; 01328 } 01329 } 01330 cout<<"RMSE:"<<sqrt ( rmse0/ ( double ) ( m_nClass*m_nDomain*m_nTrain ) ) <<"(retrain:"<<sqrt ( rmse1/ ( double ) ( m_nClass*m_nDomain*m_nTrain ) ) <<")"<<endl; 01331 01332 f.close(); 01333 } 01334 else 01335 cout<<"Can not open effect file:"<<name<<endl; 01336 01337 // residual training: res = target - effect 01338 cout<<"Init residuals"<<endl; 01339 for ( int i=0;i<m_nClass*m_nDomain*m_nTrain;i++ ) 01340 m_trainTargetOrigResidual[i] = m_trainTargetOrig[i] - m_trainTargetOrigEffect[i]; 01341 }
void Data::readParameter | ( | string | line, | |
int | mode | |||
) |
Read a parameter in the description file
line | One line in the file (string) | |
mode | -1: metaparameters, 0: integer, 1: double, 2: string, 3: bool |
Definition at line 1789 of file Data.cpp.
01790 { 01791 // split into 2 strings at the '=' char 01792 int pos = line.find ( "=" ); 01793 string name = line.substr ( 0, pos ); 01794 string value = line.substr ( pos+1 ); 01795 01796 if ( mode==-1 ) // meta info block (algorithm independent) 01797 { 01798 if ( name=="ALGORITHM" ) 01799 m_algorithmName = value; 01800 if ( name=="ID" ) 01801 m_algorithmID = atoi ( value.c_str() ); 01802 if ( name=="TRAIN_ON_FULLPREDICTOR" ) 01803 { 01804 if(m_validationType == "ValidationSet") 01805 assert(false); 01806 m_trainOnFullPredictorFile = value; 01807 } 01808 if ( name=="DISABLE" ) 01809 m_disableTraining = atoi ( value.c_str() ); 01810 cout<<"[META] "; 01811 } 01812 01813 if ( mode==0 ) // [int] 01814 m_intMap[name] = atoi ( value.c_str() ); 01815 01816 if ( mode==1 ) // [double] 01817 m_doubleMap[name] = atof ( value.c_str() ); 01818 01819 if ( mode==2 ) // [string] 01820 m_stringMap[name] = value; 01821 01822 if ( mode==3 ) // [bool] 01823 m_boolMap[name] = atoi ( value.c_str() ); 01824 01825 cout<<name<<": "<<value<<endl; 01826 }
void Data::reduceFeatureSize | ( | REAL *& | table, | |
int | tableRows, | |||
int & | tableCols, | |||
REAL | percent, | |||
bool | loadColumnSet | |||
) |
This method reduces the feature size. The idea stems from Random Forests.
percent | The normalized size of features (0...1) |
Definition at line 2322 of file Data.cpp.
02323 { 02324 cout<<"subsample the columns (current:"<<tableCols<<") to "<<percent*100.0<<"% of columns (skip constant 1 features)"<<flush; 02325 if ( percent <= 0.0 || percent >= 1.0 ) 02326 { 02327 cout<<" [nothing to do]"<<endl; 02328 return; 02329 } 02330 cout<<endl; 02331 02332 // determine constant 1 features 02333 bool* isConstantOne = new bool[tableCols]; 02334 bool* selectedCols = new bool[tableCols]; 02335 for ( int i=0;i<tableCols;i++ ) 02336 { 02337 isConstantOne[i] = true; 02338 selectedCols[i] = false; 02339 } 02340 for ( int i=0;i<tableRows;i++ ) 02341 for ( int j=0;j<tableCols;j++ ) 02342 isConstantOne[j] &= table[j+i*tableCols]==1.0; 02343 02344 srand ( Framework::getRandomSeed() ); 02345 int cnt = 0; 02346 for ( int i=0;i<tableCols;i++ ) 02347 if ( ( double ) rand() / ( double ) RAND_MAX < percent || isConstantOne[i] ) 02348 { 02349 selectedCols[i] = true; 02350 cnt++; 02351 } 02352 delete[] isConstantOne; 02353 02354 if ( loadColumnSet ) 02355 { 02356 string fname = m_datasetPath + "/" + string ( DATA_PATH ) + "/subspace.txt"; 02357 cout<<"load subspace file:"<<fname<<endl; 02358 fstream f ( fname.c_str(),ios::in ); 02359 cnt = 0; 02360 for ( int i=0;i<tableCols;i++ ) 02361 { 02362 f>>selectedCols[i]; 02363 cnt += selectedCols[i]; 02364 } 02365 f.close(); 02366 } 02367 else 02368 { 02369 string fname = m_datasetPath + "/" + string ( DATA_PATH ) + "/subspace.txt"; 02370 cout<<"write subspace file:"<<fname<<endl; 02371 fstream f ( fname.c_str(),ios::out ); 02372 for ( int i=0;i<tableCols;i++ ) 02373 f<<selectedCols[i]<<endl; 02374 f.close(); 02375 } 02376 02377 cout<<"allocate new table set, column size:"<<cnt<<endl; 02378 REAL* newTable = new REAL[cnt*tableRows]; 02379 02380 srand ( Framework::getRandomSeed() ); 02381 for ( int i=0;i<tableRows;i++ ) 02382 { 02383 int c = 0; 02384 for ( int j=0;j<tableCols;j++ ) 02385 { 02386 if ( selectedCols[j] ) 02387 { 02388 newTable[c+i*cnt] = table[j+i*tableCols]; 02389 c++; 02390 } 02391 } 02392 } 02393 
02394 delete[] table; 02395 delete[] selectedCols; 02396 table = newTable; 02397 tableCols = cnt; 02398 }
void Data::reduceTrainingSetSize | ( | REAL | percent | ) |
This method reduces the training sample size. Useful for applying complex models to large datasets.
percent | The size of the new training set (0...1) |
Definition at line 2259 of file Data.cpp.
02260 { 02261 cout<<"reduce training set (current size:"<<m_nTrain<<") to "<<percent*100.0<<"% of its original size"<<flush; 02262 if ( percent <= 0.0 || percent >= 1.0 ) 02263 { 02264 cout<<" [nothing to do]"<<endl; 02265 return; 02266 } 02267 cout<<endl; 02268 02269 srand ( Framework::getRandomSeed() ); 02270 int cnt = 0; 02271 for ( int i=0;i<m_nTrain;i++ ) 02272 if ( ( double ) rand() / ( double ) RAND_MAX < percent ) 02273 cnt++; 02274 02275 cout<<"allocate new training set, size:"<<cnt<<endl; 02276 02277 REAL* train = new REAL[cnt*m_nFeatures]; 02278 REAL* trainTarget = new REAL[cnt*m_nClass*m_nDomain]; 02279 02280 int* trainLabel = 0; 02281 if ( m_trainLabelOrig ) 02282 trainLabel = new int[cnt*m_nDomain]; 02283 02284 srand ( Framework::getRandomSeed() ); 02285 cnt = 0; 02286 for ( int i=0;i<m_nTrain;i++ ) 02287 { 02288 if ( ( double ) rand() / ( double ) RAND_MAX < percent ) 02289 { 02290 for ( int j=0;j<m_nFeatures;j++ ) 02291 train[j+cnt*m_nFeatures] = m_trainOrig[j+i*m_nFeatures]; 02292 for ( int j=0;j<m_nClass*m_nDomain;j++ ) 02293 trainTarget[j+cnt*m_nClass*m_nDomain] = m_trainTargetOrig[j+i*m_nClass*m_nDomain]; 02294 if ( m_trainLabelOrig ) 02295 { 02296 for ( int j=0;j<m_nDomain;j++ ) 02297 trainLabel[j+cnt*m_nDomain] = m_trainLabelOrig[j+i*m_nDomain]; 02298 } 02299 cnt++; 02300 } 02301 } 02302 02303 delete[] m_trainOrig; 02304 delete[] m_trainTargetOrig; 02305 if ( m_trainLabelOrig ) 02306 delete[] m_trainLabelOrig; 02307 02308 m_trainOrig = train; 02309 m_trainTargetOrig = trainTarget; 02310 if ( m_trainLabelOrig ) 02311 m_trainLabelOrig = trainLabel; 02312 02313 m_nTrain = cnt; 02314 }
void Data::setAlgorithmList | ( | vector< string > | algorithmNameList | ) |
Copy an external vector of *dsc files to the member list
m_algorithmNameList | List of filenames (*dsc) | |
nAlgorithmsTrained | How many of them have finished training |
Definition at line 2113 of file Data.cpp.
02114 { 02115 cout<<"Set algorithm list (nTrained:"<< ( int ) algorithmNameList.size() <<")"<<endl; 02116 m_algorithmNameList = algorithmNameList; 02117 for ( int i=0;i<m_algorithmNameList.size();i++ ) 02118 { 02119 int pos = m_algorithmNameList[i].find_first_of ( ".",0 ); 02120 if ( pos == 0 ) 02121 assert ( false ); 02122 m_algorithmNameList[i] = m_datasetPath + "/" + m_fullPredPath + "/" + m_algorithmNameList[i].substr ( 0,pos ) + ".dat"; 02123 cout<<"m_algorithmNameList["<<i<<"]:"<<m_algorithmNameList[i]<<endl; 02124 } 02125 }
void Data::setDataPointers | ( | Data * | data | ) |
Fills the pointers from the base class Data
data | The pointer to the data object, where a valid dataset is loaded |
Definition at line 1954 of file Data.cpp.
/**
 * Copy all configuration values and dataset pointers from another Data
 * object into this one.  No deep copies are made: array members afterwards
 * alias the arrays owned by 'data', so this object must not outlive or
 * free them.
 *
 * @param data The pointer to the data object where a valid dataset is loaded
 */
void Data::setDataPointers ( Data* data )
{
    cout<<"Set data pointers"<<endl;

    // copy parameter maps (parsed from the dsc file)
    m_intMap = data->m_intMap;
    m_doubleMap = data->m_doubleMap;
    m_boolMap = data->m_boolMap;
    m_stringMap = data->m_stringMap;

    // meta information of the algorithm
    m_algorithmName = data->m_algorithmName;
    m_algorithmID = data->m_algorithmID;
    m_trainOnFullPredictorFile = data->m_trainOnFullPredictorFile;
    m_disableTraining = data->m_disableTraining;

    m_randSeed = data->m_randSeed;
    m_positiveTarget = data->m_positiveTarget;
    m_negativeTarget = data->m_negativeTarget;

    m_mixList = data->m_mixList;

    // dataset paths
    m_datasetPath = data->m_datasetPath;
    m_datasetName = data->m_datasetName;
    m_tempPath = data->m_tempPath;
    m_dscPath = data->m_dscPath;
    m_fullPredPath = data->m_fullPredPath;
    m_dataPath = data->m_dataPath;

    // dataset organization (input/output dimensionality)
    m_nFeatures = data->m_nFeatures;
    m_nClass = data->m_nClass;
    m_nDomain = data->m_nDomain;
    m_nMixTrainList = data->m_nMixTrainList;

    // cross-validation settings
    m_nCross = data->m_nCross;
    m_validationType = data->m_validationType;

    // global mean and standard deviation over the whole dataset
    m_mean = data->m_mean;
    m_std = data->m_std;
    m_standardDeviationMin = data->m_standardDeviationMin;
    m_targetMean = data->m_targetMean;

    // full training set (shared pointers, not copies)
    m_nTrain = data->m_nTrain;
    m_trainOrig = data->m_trainOrig;
    m_trainTargetOrig = data->m_trainTargetOrig;
    m_trainTargetOrigEffect = data->m_trainTargetOrigEffect;
    m_trainTargetOrigResidual = data->m_trainTargetOrigResidual;
    m_trainLabelOrig = data->m_trainLabelOrig;
    m_trainBaggingIndex = data->m_trainBaggingIndex;

    // the validation set
    m_validSize = data->m_validSize;
    m_valid = data->m_valid;
    m_validTarget = data->m_validTarget;
    m_validLabel = data->m_validLabel;

    // the testset
    m_nTest = data->m_nTest;
    m_testOrig = data->m_testOrig;
    m_testTargetOrig = data->m_testTargetOrig;
    m_testLabelOrig = data->m_testLabelOrig;

    // probe split indices
    m_slotBoundaries = data->m_slotBoundaries;

    // trainsets per cross-validation division
    m_trainSize = data->m_trainSize;
    m_train = data->m_train;
    m_trainTarget = data->m_trainTarget;
    m_trainTargetEffect = data->m_trainTargetEffect;
    m_trainTargetResidual = data->m_trainTargetResidual;
    m_trainLabel = data->m_trainLabel;

    // probesets per cross-validation division
    m_probeSize = data->m_probeSize;
    m_probe = data->m_probe;
    m_probeTarget = data->m_probeTarget;
    m_probeTargetEffect = data->m_probeTargetEffect;
    m_probeTargetResidual = data->m_probeTargetResidual;
    m_probeLabel = data->m_probeLabel;
    m_probeIndex = data->m_probeIndex;

    m_crossIndex = data->m_crossIndex;

    // blend stopping
    m_blendingRegularization = data->m_blendingRegularization;
    m_enableGlobalBlendingWeights = data->m_enableGlobalBlendingWeights;
    m_blendingEnableCrossValidation = data->m_blendingEnableCrossValidation;
    m_enablePostNNBlending = data->m_enablePostNNBlending;
    m_blendingAlgorithm = data->m_blendingAlgorithm;

    // cascade learning
    m_enableCascadeLearning = data->m_enableCascadeLearning;
    m_nCascadeInputs = data->m_nCascadeInputs;
    m_cascadeInputs = data->m_cascadeInputs;

    // average over mean and std as new mean and std
    m_enableGlobalMeanStdEstimate = data->m_enableGlobalMeanStdEstimate;

    // parallelization of k-fold cross validation
    m_maxThreadsInCross = data->m_maxThreadsInCross;

    // memory save option
    m_enableSaveMemory = data->m_enableSaveMemory;

    // error function "AUC" or "RMSE"
    m_errorFunction = data->m_errorFunction;

    // reverse mix table
    m_mixDatasetIndices = data->m_mixDatasetIndices;

    // already trained algo list
    m_algorithmNameList = data->m_algorithmNameList;

    // clip after blend
    m_enablePostBlendClipping = data->m_enablePostBlendClipping;

    // add output noise
    m_addOutputNoise = data->m_addOutputNoise;

    // feature selection
    m_enableFeatureSelection = data->m_enableFeatureSelection;
    m_featureSelectionWriteBinaryDataset = data->m_featureSelectionWriteBinaryDataset;

    // bagging
    m_enableBagging = data->m_enableBagging;
    m_randomSeedBagging = data->m_randomSeedBagging;

    // write dsc files in training
    m_disableWriteDscFile = data->m_disableWriteDscFile;

    // static mean and std normalization
    m_enableStaticNormalization = data->m_enableStaticNormalization;
    m_staticMeanNormalization = data->m_staticMeanNormalization;
    m_staticStdNormalization = data->m_staticStdNormalization;
    m_enableProbablisticNormalization = data->m_enableProbablisticNormalization;

    // dimensionality reduction
    m_dimensionalityReduction = data->m_dimensionalityReduction;

    // if this is set, the algorithm should load saved weights before starting to train
    m_loadWeightsBeforeTraining = data->m_loadWeightsBeforeTraining;

    m_subsampleTrainSet = data->m_subsampleTrainSet;
    m_subsampleFeatures = data->m_subsampleFeatures;
    m_globalTrainingLoops = data->m_globalTrainingLoops;
    m_addConstantInput = data->m_addConstantInput;
}
void Data::setPathes | ( | string | temp, | |
string | dsc, | |||
string | fullPred, | |||
string | data | |||
) |
Set important paths for running the Framework
temp | The temp directory, used for weight files of Algorithms |
dsc | The description file dir, the cout<<.. per Algorithm are collected here | |
fullPred | The full-prediction dir, files which predicts the trainset with cross validation | |
data | The dataset directory, where the dataset files are |
Definition at line 1775 of file Data.cpp.
01776 { 01777 m_tempPath = temp; 01778 m_dscPath = dsc; 01779 m_fullPredPath = fullPred; 01780 m_dataPath = data; 01781 }
int * Data::splitStringToIntegerList | ( | string | str, | |
char | delimiter | |||
) | [static] |
Split a list of integers contained in a string. E.g.: str="10,10,100,50"
str | The string (input) | |
delimiter | The delimiter sign (char) |
Definition at line 1897 of file Data.cpp.
01898 { 01899 vector<int> v; 01900 int number; 01901 char *begin = ( char* ) str.c_str(), *end = ( char* ) str.c_str(), tmp; 01902 for ( int i=0;i<str.length();i++ ) 01903 { 01904 end++; 01905 if ( *end==delimiter || *end==0 ) 01906 { 01907 tmp = *end; 01908 *end = 0; 01909 sscanf ( begin, "%d", &number ); 01910 begin = end + 1; 01911 *end = tmp; 01912 v.push_back ( number ); 01913 } 01914 } 01915 int* returnList = new int[v.size() ]; 01916 for ( int i=0;i<v.size();i++ ) 01917 returnList[i] = v[i]; 01918 return returnList; 01919 }
vector< string > Data::splitStringToStringList | ( | string | str, | |
char | delimiter | |||
) | [static] |
Split a string into substrings. E.g.: str="10,10,100,50" and delimiter=','
str | The string (input) | |
delimiter | The delimiter sign (char) |
Definition at line 1929 of file Data.cpp.
01930 { 01931 vector<string> v; 01932 int number; 01933 char *begin = ( char* ) str.c_str(), *end = ( char* ) str.c_str(), tmp; 01934 for ( int i=0;i<str.length();i++ ) 01935 { 01936 end++; 01937 if ( *end==delimiter || *end==0 ) 01938 { 01939 tmp = *end; 01940 *end = 0; 01941 v.push_back ( begin ); 01942 begin = end + 1; 01943 *end = tmp; 01944 } 01945 } 01946 return v; 01947 }
int Data::vectorSampling | ( | REAL * | probs, | |
int | length | |||
) |
Returns a randomly drawn sample index, where each index is chosen with probability proportional to its entry in the per-sample probabilities
probs | per-sample probability | |
length | the number of samples |
Definition at line 680 of file Data.cpp.
00681 { 00682 double sum = 0.0; 00683 for ( int i=0;i<length;i++ ) 00684 sum += probs[i]; 00685 00686 double value = sum * ( ( double ) rand() / ( double ) RAND_MAX ); 00687 00688 sum = 0.0; 00689 for ( int i=0;i<length;i++ ) 00690 { 00691 sum += probs[i]; 00692 if ( sum >= value ) 00693 return i; 00694 } 00695 cout<<"value:"<<value<<endl<<"length:"<<length<<endl<<"sum:"<<sum<<endl; 00696 for ( int i=0;i<length;i++ ) 00697 cout<<probs[i]<<" "<<flush; 00698 assert ( false ); 00699 return -1; 00700 }