Data.h
00001 #ifndef _DATA_H__
00002 #define _DATA_H__
00003
00004 #include <assert.h>
00005 #include <stdio.h>
00006 #include <stdlib.h>
00007
00008 #include <fstream>
00009 #include <string>
00010 #include <math.h>
00011 #include <vector>
00012 #include <map>
00013 #include <dirent.h>
00014 #include <algorithm>
00015
00016 using namespace std;
00017
00018 #include "StreamOutput.h"
00019 #include "DatasetReader.h"
00020 #include "Framework.h"
00021 #include "InputFeatureSelector.h"
00022
// Data
//
// Declaration of the Data class. Only declarations are visible in this header
// (no method bodies), so the comments below describe signatures and types;
// anything about runtime behavior is inferred from names and is marked as such.
//
// - Derives publicly from Framework.
// - Scheduler, Algorithm and Autoencoder are friends and can therefore access
//   the protected members directly.
// - REAL and uint are not declared in this header; presumably they come from
//   one of the included headers (REAL is probably a float/double typedef) -
//   TODO confirm.
//
// NOTE(review): the class holds many raw pointers and declares a destructor
// but no copy constructor / copy assignment operator. Whether copying a Data
// object is safe depends on ownership rules in the implementation - confirm.
00034 class Data : public Framework
00035 {
00036
// Friend classes: granted access to the protected state declared below.
00037 friend class Scheduler;
00038 friend class Algorithm;
00039 friend class Autoencoder;
00040
00041 public:
// Default constructor and virtual destructor (definitions are not in this header).
00042 Data();
00043 virtual ~Data();
00044
// Configuration input. All string arguments are taken by value.
// readParameter: receives one text line plus an integer mode; presumably parses
//   a single configuration line - the meaning of `mode` is not visible here.
// readDscFile: receives a name; presumably the description file to read - confirm.
// setPathes: receives four path strings; presumably stored in m_tempPath,
//   m_dscPath, m_fullPredPath and m_dataPath - confirm against the implementation.
00045 void readParameter ( string line, int mode );
00046 void readDscFile ( string name );
00047 void setPathes ( string temp, string dsc, string fullPred, string data );
00048
// Dataset loading and cross-validation buffer allocation (behavior inferred
// from the names only).
00049 void readDataset ( string name );
00050 void allocMemForCrossValidationSets();
00051
// Cross-validation / cascade-learning helpers. The two *NCrossValidationSet
// methods take an int n, presumably the index of the set to fill or free -
// TODO confirm valid range (likely related to m_nCross).
00052 void partitionDatasetToCrossValidationSets();
00053 void fillCascadeLearningInputs();
00054 void extendTrainDataWithCascadeInputs();
00055 void fillNCrossValidationSet ( int n );
00056 void freeNCrossValidationSet ( int n );
00057 void readEffectFile();
00058
// Static utilities; callable without a Data instance.
// NOTE(review): splitStringToIntegerList returns a raw int*. The header does not
// say who owns the returned array or how its length is communicated - confirm.
00059 static vector<string> getDirectoryFileList ( string path );
00060 static int* splitStringToIntegerList ( string str, char delimiter );
00061 static vector<string> splitStringToStringList ( string str, char delimiter );
00062
// setDataPointers takes a pointer to another Data object; presumably copies
// that object's pointers into this one (shared, not duplicated) - confirm.
00063 void setDataPointers ( Data* data );
00064 void mixDataset();
00065
// loadNormalization: nCascade defaults to 0 when the argument is omitted.
00066 void deleteMemory();
00067 void loadNormalization ( int nCascade = 0 );
00068
// NOTE(review): the parameter is named m_algorithmNameList, the same name as the
// member declared further below, so inside the method the parameter hides the
// member unless it is accessed via this->.
00069 void setAlgorithmList ( vector<string> m_algorithmNameList );
00070
// Feature-selection file I/O (inferred from the names).
00071 void loadFeatureSelectionFile();
00072 void saveFeatureSelectionFile();
00073
00074 void doFeatureSelection();
00075 void makeBinaryDataset();
00076
// Bagging / bootstrap sampling.
// doBootstrapSampling: train, target, targetEff, targetRes and label are
//   references to pointers, so the callee is able to reassign the caller's
//   pointers; probs is a plain pointer. nTrainNew defaults to 0.
//   NOTE(review): ownership of any buffers handed back through these
//   references is not visible here - confirm who frees them.
// baggingRandomSeed: takes a uint, which is not a standard C++ type name.
00077 void enableBagging ( bool en );
00078 void doBootstrapSampling ( REAL* probs, REAL* &train, REAL* &target, REAL* &targetEff, REAL* &targetRes, int* &label, int nTrainNew = 0 );
00079 void baggingRandomSeed ( uint seed );
00080
// vectorSampling: takes a REAL array and its length and returns an int;
// presumably an index drawn according to probs - TODO confirm.
00081 int vectorSampling ( REAL* probs, int length );
00082
00083 void mergeTrainAndTest();
00084 void normalizeZeroOne();
00085
// Subsampling helpers.
// reduceFeatureSize: table and tableCols are passed by reference, so the callee
//   is able to replace the table pointer and update the column count;
//   tableRows is passed by value.
//   NOTE(review): the scale of `percent` (0..1 vs 0..100) is not visible here.
00086 void reduceTrainingSetSize ( REAL percent );
00087 void reduceFeatureSize ( REAL* &table, int tableRows, int &tableCols, REAL percent, bool loadColumnSet );
00088
00089 void addConstantInput();
00090
00091 protected:
00092
// Dataset identification and general run configuration. Roles are inferred
// from the member names; where these values are set is not visible here.
00093 string m_datasetPath;
00094 string m_datasetName;
00095 string m_algorithmName;
00096 int m_algorithmID;
00097 string m_trainOnFullPredictorFile;
00098 bool m_disableTraining;
00099 int m_randSeed;
00100 int m_nMixDataset;
00101 int m_nMixTrainList;
00102 int m_nCross;
00103 string m_validationType;
00104 int m_maxThreadsInCross;
00105 bool m_enableGlobalMeanStdEstimate;
00106 REAL m_positiveTarget;
00107 REAL m_negativeTarget;
00108 double m_blendingRegularization;
00109 bool m_enableGlobalBlendingWeights;
00110 bool m_blendingEnableCrossValidation;
00111 bool m_enablePostNNBlending;
00112 string m_blendingAlgorithm;
00113
00114
00115
// Cascade-learning state (names suggest an on/off flag, an input count and an
// input buffer - layout of m_cascadeInputs is not visible here).
00116 bool m_enableCascadeLearning;
00117 int m_nCascadeInputs;
00118 REAL* m_cascadeInputs;
00119
00120
// String-keyed maps, one per value type (int, double, bool, string);
// presumably generic named parameters - confirm against readParameter.
00121 map<string,int> m_intMap;
00122 map<string,double> m_doubleMap;
00123 map<string,bool> m_boolMap;
00124 map<string,string> m_stringMap;
00125
00126
// Path strings; presumably the targets of setPathes() - confirm.
00127 string m_tempPath;
00128 string m_dscPath;
00129 string m_fullPredPath;
00130 string m_dataPath;
00131
00132
// Dataset dimensions (feature, class and domain counts, judging by the names).
00133 int m_nFeatures;
00134 int m_nClass;
00135 int m_nDomain;
00136
00137
// Integer index arrays; lengths and ownership are not visible in this header.
00138 int* m_mixDatasetIndices;
00139 int* m_mixList;
00140 int* m_crossIndex;
00141
00142
// "Orig" training data: a uint count plus raw arrays for inputs, targets,
// effect/residual targets and labels. Array shapes are not visible here -
// presumably sized from m_nTrain, m_nFeatures, m_nClass and m_nDomain; TODO confirm.
00143 uint m_nTrain;
00144 REAL* m_trainOrig;
00145 REAL* m_trainTargetOrig;
00146 REAL* m_trainTargetOrigEffect;
00147 REAL* m_trainTargetOrigResidual;
00148 int* m_trainLabelOrig;
00149
00150
// "Orig" test data: a uint count plus raw arrays for inputs, targets and labels.
00151 uint m_nTest;
00152 REAL* m_testOrig;
00153 REAL* m_testTargetOrig;
00154 int* m_testLabelOrig;
00155
00156
00157 int* m_slotBoundaries;
00158
00159
// Train-side arrays. The REAL** / int** members are pointers to pointers,
// presumably one array per cross-validation set, with m_trainSize holding the
// per-set sizes - TODO confirm against allocMemForCrossValidationSets().
00160 int* m_trainSize;
00161 REAL** m_train;
00162 REAL** m_trainTarget;
00163 REAL** m_trainTargetEffect;
00164 REAL** m_trainTargetResidual;
00165 int** m_trainLabel;
00166 int** m_trainBaggingIndex;
00167
00168
// Probe-side arrays, declared in parallel to the train-side arrays above;
// presumably the held-out part of each cross-validation set - confirm.
00169 int* m_probeSize;
00170 REAL** m_probe;
00171 REAL** m_probeTarget;
00172 REAL** m_probeTargetEffect;
00173 REAL** m_probeTargetResidual;
00174 int** m_probeLabel;
00175 int** m_probeIndex;
00176
00177
// Validation data: a single int size and single-level arrays (not per set).
00178 int m_validSize;
00179 REAL* m_valid;
00180 REAL* m_validTarget;
00181 int* m_validLabel;
00182
00183
// Normalization statistics (names suggest mean and standard deviation arrays,
// a lower bound for the standard deviation, and a target mean array).
00184 REAL* m_mean;
00185 REAL* m_std;
00186 REAL m_standardDeviationMin;
00187 REAL* m_targetMean;
00188
00189
00190 bool m_enableSaveMemory;
00191
00192
// Error function selected by name; the accepted strings are not visible here.
00193 string m_errorFunction;
00194
00195 REAL* m_support;
00196
00197
// List of algorithm names; presumably set through setAlgorithmList() - confirm.
00198 vector<string> m_algorithmNameList;
00199
00200
00201 bool m_enablePostBlendClipping;
00202
00203
00204 REAL m_addOutputNoise;
00205
00206
// Feature-selection switches.
00207 bool m_enableFeatureSelection;
00208 bool m_featureSelectionWriteBinaryDataset;
00209
00210
// Bagging switch and seed; presumably set via enableBagging() and
// baggingRandomSeed() - confirm.
00211 bool m_enableBagging;
00212 uint m_randomSeedBagging;
00213
00214
00215 bool m_disableWriteDscFile;
00216
00217
// Normalization mode switches and static mean/std values.
00218 bool m_enableStaticNormalization;
00219 REAL m_staticMeanNormalization;
00220 REAL m_staticStdNormalization;
00221 bool m_enableProbablisticNormalization;
00222
00223
00224 string m_dimensionalityReduction;
00225
00226
// Subsampling amounts for the training set and the features; presumably the
// values passed as `percent` to reduceTrainingSetSize() / reduceFeatureSize().
00227 REAL m_subsampleTrainSet;
00228
00229
00230 REAL m_subsampleFeatures;
00231
00232
00233 int m_globalTrainingLoops;
00234
00235
00236 bool m_addConstantInput;
00237
00238
00239 bool m_loadWeightsBeforeTraining;
00240 };
00241
00242
00243 #endif