Data.h

00001 #ifndef _DATA_H__
00002 #define _DATA_H__
00003 
00004 #include <assert.h>
00005 #include <stdio.h>
00006 #include <stdlib.h>
00007 //#include <iostream>
00008 #include <fstream>
00009 #include <string>
00010 #include <math.h>
00011 #include <vector>
00012 #include <map>
00013 #include <dirent.h>
00014 #include <algorithm>
00015 
00016 using namespace std;
00017 
00018 #include "StreamOutput.h"
00019 #include "DatasetReader.h"
00020 #include "Framework.h"
00021 #include "InputFeatureSelector.h"
00022 
00034 class Data : public Framework  // dataset container: configuration, original train/test arrays, cross-validation splits and normalization statistics (see member groups below)
00035 {
00036     // These three classes have full access to all members
00037     friend class Scheduler;
00038     friend class Algorithm;
00039     friend class Autoencoder;
00040 
00041 public:
00042     Data();
00043     virtual ~Data();
00044 
00045     void readParameter ( string line, int mode );  // NOTE(review): the meaning of 'mode' is not visible in this header - confirm in Data.cpp
00046     void readDscFile ( string name );
00047     void setPathes ( string temp, string dsc, string fullPred, string data );  // presumably fills m_tempPath, m_dscPath, m_fullPredPath, m_dataPath - confirm
00048 
00049     void readDataset ( string name );
00050     void allocMemForCrossValidationSets();
00051 
00052     void partitionDatasetToCrossValidationSets();
00053     void fillCascadeLearningInputs();
00054     void extendTrainDataWithCascadeInputs();
00055     void fillNCrossValidationSet ( int n );
00056     void freeNCrossValidationSet ( int n );
00057     void readEffectFile();
00058 
00059     static vector<string> getDirectoryFileList ( string path );  // static: callable without a Data instance
00060     static int* splitStringToIntegerList ( string str, char delimiter );  // returns a raw int*; NOTE(review): ownership and length of the returned array are not visible here - confirm in Data.cpp
00061     static vector<string> splitStringToStringList ( string str, char delimiter );
00062 
00063     void setDataPointers ( Data* data );
00064     void mixDataset();
00065 
00066     void deleteMemory();
00067     void loadNormalization ( int nCascade = 0 );  // nCascade defaults to 0 when omitted
00068 
00069     void setAlgorithmList ( vector<string> m_algorithmNameList );  // NOTE(review): parameter is named like the member m_algorithmNameList; if the definition keeps this name it shadows the member - confirm
00070 
00071     void loadFeatureSelectionFile();
00072     void saveFeatureSelectionFile();
00073 
00074     void doFeatureSelection();
00075     void makeBinaryDataset();
00076 
00077     void enableBagging ( bool en );
00078     void doBootstrapSampling ( REAL* probs, REAL* &train, REAL* &target, REAL* &targetEff, REAL* &targetRes, int* &label, int nTrainNew = 0 );  // all pointers after probs are taken by reference, so the callee can rebind the caller's pointers; nTrainNew defaults to 0
00079     void baggingRandomSeed ( uint seed );  // NOTE(review): 'uint' is not declared in this header; presumably provided by a system or project header - confirm
00080 
00081     int vectorSampling ( REAL* probs, int length );
00082 
00083     void mergeTrainAndTest();
00084     void normalizeZeroOne();
00085 
00086     void reduceTrainingSetSize ( REAL percent );
00087     void reduceFeatureSize ( REAL* &table, int tableRows, int &tableCols, REAL percent, bool loadColumnSet );  // table and tableCols are taken by reference, so the callee can modify both
00088 
00089     void addConstantInput();
00090     
00091 protected:
00092     // master metadata
00093     string m_datasetPath;
00094     string m_datasetName;
00095     string m_algorithmName;
00096     int m_algorithmID;
00097     string m_trainOnFullPredictorFile;
00098     bool m_disableTraining;
00099     int m_randSeed;
00100     int m_nMixDataset;
00101     int m_nMixTrainList;
00102     int m_nCross;
00103     string m_validationType;
00104     int m_maxThreadsInCross;
00105     bool m_enableGlobalMeanStdEstimate;
00106     REAL m_positiveTarget;  // NOTE(review): REAL is not defined in this header; presumably a floating-point typedef from a project header - confirm
00107     REAL m_negativeTarget;
00108     double m_blendingRegularization;
00109     bool m_enableGlobalBlendingWeights;
00110     bool m_blendingEnableCrossValidation;
00111     bool m_enablePostNNBlending;
00112     string m_blendingAlgorithm;
00113 
00114     // enable for cascade learning (where the outcome[targets] of the
00115     // layer1-algo is added to the input features of the layer2-algo)
00116     bool m_enableCascadeLearning;
00117     int m_nCascadeInputs;
00118     REAL* m_cascadeInputs;
00119 
00120     // read strings from the algorithm's description file
00121     map<string,int> m_intMap;
00122     map<string,double> m_doubleMap;
00123     map<string,bool> m_boolMap;
00124     map<string,string> m_stringMap;
00125 
00126     // string paths
00127     string m_tempPath;
00128     string m_dscPath;
00129     string m_fullPredPath;
00130     string m_dataPath;
00131 
00132     // dataset organization (input/output dimensions)
00133     int m_nFeatures;
00134     int m_nClass;
00135     int m_nDomain;  // number of target classes, 1 is usual
00136 
00137     // randomized train sample list
00138     int* m_mixDatasetIndices;
00139     int* m_mixList;
00140     int* m_crossIndex;
00141 
00142     // full training set
00143     uint m_nTrain;
00144     REAL* m_trainOrig;
00145     REAL* m_trainTargetOrig;
00146     REAL* m_trainTargetOrigEffect;
00147     REAL* m_trainTargetOrigResidual;
00148     int* m_trainLabelOrig;
00149 
00150     // the testset
00151     uint m_nTest;
00152     REAL* m_testOrig;
00153     REAL* m_testTargetOrig;
00154     int* m_testLabelOrig;
00155 
00156     // probe split bounds
00157     int* m_slotBoundaries;
00158 
00159     // trainsets per cross-validation division (randomized)
00160     int* m_trainSize;
00161     REAL** m_train;
00162     REAL** m_trainTarget;
00163     REAL** m_trainTargetEffect;
00164     REAL** m_trainTargetResidual;
00165     int** m_trainLabel;
00166     int** m_trainBaggingIndex;
00167 
00168     // probesets per cross-validation division (randomized)
00169     int* m_probeSize;
00170     REAL** m_probe;
00171     REAL** m_probeTarget;
00172     REAL** m_probeTargetEffect;
00173     REAL** m_probeTargetResidual;
00174     int** m_probeLabel;
00175     int** m_probeIndex;
00176 
00177     // validation set (fixed random subset of train)
00178     int m_validSize;
00179     REAL* m_valid;
00180     REAL* m_validTarget;
00181     int* m_validLabel;
00182     
00183     // global mean and standard deviation over whole dataset
00184     REAL* m_mean;
00185     REAL* m_std;
00186     REAL m_standardDeviationMin;
00187     REAL* m_targetMean;
00188 
00189     // allocate memory (input features) only for active train threads
00190     bool m_enableSaveMemory;
00191 
00192     // error function
00193     string m_errorFunction;
00194 
00195     REAL* m_support;
00196 
00197     // algorithm list from scheduler
00198     vector<string> m_algorithmNameList;
00199 
00200     // clip at the end
00201     bool m_enablePostBlendClipping;
00202 
00203     // add output noise
00204     REAL m_addOutputNoise;
00205 
00206     // enable feature selection heuristics
00207     bool m_enableFeatureSelection;
00208     bool m_featureSelectionWriteBinaryDataset;
00209 
00210     // enable bagging: done by resampling of the trainingset in retraining
00211     bool m_enableBagging;
00212     uint m_randomSeedBagging;
00213 
00214     // write dsc files in training
00215     bool m_disableWriteDscFile;
00216 
00217     // static mean and std normalization of input features
00218     bool m_enableStaticNormalization;
00219     REAL m_staticMeanNormalization;
00220     REAL m_staticStdNormalization;
00221     bool m_enableProbablisticNormalization;
00222 
00223     // dimensionality reduction
00224     string m_dimensionalityReduction;
00225 
00226     // reduce the size of the training set
00227     REAL m_subsampleTrainSet;
00228 
00229     // reduce the size of the features
00230     REAL m_subsampleFeatures;
00231 
00232     // global train loops: optimize all algos in the ensemble n times (default value=1)
00233     int m_globalTrainingLoops;
00234 
00235     // add a constant 1 column to the feature matrix
00236     bool m_addConstantInput;
00237     
00238     // if this is set, the algorithm should load saved weights before starting to train
00239     bool m_loadWeightsBeforeTraining;
00240 };
00241 
00242 
00243 #endif

Generated on Tue Jan 26 09:20:58 2010 for ELF by  doxygen 1.5.8