DatasetReader.h

00001 #ifndef __DATASETREADER_
00002 #define __DATASETREADER_
00003 
00004 #include "Data.h"
00005 #include "Framework.h"
00006 
00007 #include <math.h>
00008 #include <stdio.h>
00009 #include <stdlib.h>
00010 #include <sstream>
00011 #include <vector>
00012 #include <set>
00013 #include <algorithm>
00014 #include <string.h>
00015 #include <cstring>
00016 
00017 // NETFLIX data: directory macros (relative "./" paths); neither macro is referenced elsewhere in this header
00018 #define NETFLIX_DATA_DIR "./NETFLIX/DataFiles/tmp/"
00019 #define NETFLIX_SLOTDATA_ROOT_DIR "./NETFLIX/DataFiles/tmp2/"
00020 
00021 using namespace std; // NOTE(review): header-scope using-directive exposes std names to every includer; the declarations below rely on it for unqualified 'string'
00022 
// DatasetReader: declares one read<NAME>() loader per supported dataset, plus shared
// helpers for parsing delimiter-separated files and for building train/test splits.
// Only declarations are visible in this header; the notes below come from the signatures
// and the original comments and should be confirmed against the definitions.
//
// Every read<NAME>() method has the same parameter list:
//   path                                  - dataset location (exact meaning per reader: TODO confirm)
//   train, trainTarget, trainLabel        - non-const pointer references for the training set,
//                                           presumably set by the reader
//   test, testTarget, testLabel           - the same for the test set
//   nTrain, nTest, nClass, nDomain, nFeat - non-const references, presumably filled with the
//                                           resulting sizes
//   positiveTarget, negativeTarget        - target values, defaulting to 1.0 and -1.0
// NOTE(review): who releases the buffers handed back through the pointer references is
// not visible here - confirm before changing any caller.
00030 class DatasetReader : public Framework
00031 {
00032 public:
00033     DatasetReader();
00034     ~DatasetReader(); // NOTE(review): not declared virtual here; whether Framework provides a virtual destructor is not visible in this header
00035 
00036     // Reads a dataset stored in matrix form whose columns are separated by a delimiter.
          // fillData, data, labels, addConstantOne and skipFirstLine are optional (defaults: false, 0, 0, true, false).
          // Presumably a call with fillData = false only determines nFeat/nClass/nLines - TODO confirm.
00037     void getDataBounds ( const char** filenames, string delimiter, int& nFeat, int& nClass, uint& nLines, char* columnType, char* enabledCol, int targetColumn, int filenameID, bool fillData = false, REAL* data = 0, int* labels = 0, bool addConstantOne = true, bool skipFirstLine = false );
00038 
00039     // Splits data/labels into a random train set and test set.
          // percentTest presumably gives the share of samples that go to the test set (scale: TODO confirm);
          // noRandom defaults to false.
00040     void splitRandomTestset ( REAL percentTest, REAL* data, int* labels, int nData, int nFeat, int nClass, REAL* &train, int* &trainLabel, REAL* &trainTarget, REAL* &test, int* &testLabel, REAL* &testTarget, uint& nTrain, uint& nTest, REAL positiveTarget, REAL negativeTarget, bool noRandom = false );
00041 
00042     // Makes the numeric train and test target vectors (trainTarget/testTarget) from the
          // integer labels; presumably encoded with positiveTarget/negativeTarget - see the definition.
00043     void makeNumericTrainAndTestTargets ( int nClass, int nTrain, int nTest, REAL positiveTarget, REAL negativeTarget, int* trainLabel, int* testLabel, REAL* &trainTarget, REAL* &testTarget );
00044 
00045     // MNIST: handwritten digits
00046     void readMNIST ( string path, REAL* &train, REAL* &trainTarget, int* &trainLabel, REAL* &test, REAL* &testTarget, int* &testLabel, uint& nTrain, uint& nTest, int& nClass, int& nDomain, int& nFeat, REAL positiveTarget = 1.0, REAL negativeTarget = -1.0 );
00047 
00048     // NETFLIX: blending of predictions
00049     void readNETFLIX ( string path, REAL* &train, REAL* &trainTarget, int* &trainLabel, REAL* &test, REAL* &testTarget, int* &testLabel, uint& nTrain, uint& nTest, int& nClass, int& nDomain, int& nFeat, REAL positiveTarget = 1.0, REAL negativeTarget = -1.0 );
00050 
00051     // KDDCup09: customer relationship management (large, large-binary and small variants)
00052     void readKDDCup09Large ( string path, REAL* &train, REAL* &trainTarget, int* &trainLabel, REAL* &test, REAL* &testTarget, int* &testLabel, uint& nTrain, uint& nTest, int& nClass, int& nDomain, int& nFeat, REAL positiveTarget = 1.0, REAL negativeTarget = -1.0 );
00053     void readKDDCup09LargeBin ( string path, REAL* &train, REAL* &trainTarget, int* &trainLabel, REAL* &test, REAL* &testTarget, int* &testLabel, uint& nTrain, uint& nTest, int& nClass, int& nDomain, int& nFeat, REAL positiveTarget = 1.0, REAL negativeTarget = -1.0 );
00054     void readKDDCup09Small ( string path, REAL* &train, REAL* &trainTarget, int* &trainLabel, REAL* &test, REAL* &testTarget, int* &testLabel, uint& nTrain, uint& nTest, int& nClass, int& nDomain, int& nFeat, REAL positiveTarget = 1.0, REAL negativeTarget = -1.0 );
00055 
00056     // AusDM2009: blending of predictions (subset of NETFLIX)
00057     void readAusDM2009 ( string path, REAL* &train, REAL* &trainTarget, int* &trainLabel, REAL* &test, REAL* &testTarget, int* &testLabel, uint& nTrain, uint& nTest, int& nClass, int& nDomain, int& nFeat, REAL positiveTarget = 1.0, REAL negativeTarget = -1.0 );
00058 
00059     // BINARY: results from feature selection
00060     void readBINARY ( string path, REAL* &train, REAL* &trainTarget, int* &trainLabel, REAL* &test, REAL* &testTarget, int* &testLabel, uint& nTrain, uint& nTest, int& nClass, int& nDomain, int& nFeat, REAL positiveTarget = 1.0, REAL negativeTarget = -1.0 );
00061 
00062     // CSV: comma-separated text files
00063     void readCSV ( string path, REAL* &train, REAL* &trainTarget, int* &trainLabel, REAL* &test, REAL* &testTarget, int* &testLabel, uint& nTrain, uint& nTest, int& nClass, int& nDomain, int& nFeat, REAL positiveTarget = 1.0, REAL negativeTarget = -1.0 );
00064 
00065     // ARFF: Weka file format
00066     void readARFF ( string path, REAL* &train, REAL* &trainTarget, int* &trainLabel, REAL* &test, REAL* &testTarget, int* &testLabel, uint& nTrain, uint& nTest, int& nClass, int& nDomain, int& nFeat, REAL positiveTarget = 1.0, REAL negativeTarget = -1.0 );
00067 
00068     // PRUDSYS_DMC 2009: Data Mining Cup of Prudsys AG
00069     void readPRUDSYS ( string path, REAL* &train, REAL* &trainTarget, int* &trainLabel, REAL* &test, REAL* &testTarget, int* &testLabel, uint& nTrain, uint& nTest, int& nClass, int& nDomain, int& nFeat, REAL positiveTarget = 1.0, REAL negativeTarget = -1.0 );
00070 
00071     // Readers for the individual UCI datasets (one method per dataset, identical signatures)
00072 
00073     void readADULT ( string path, REAL* &train, REAL* &trainTarget, int* &trainLabel, REAL* &test, REAL* &testTarget, int* &testLabel, uint& nTrain, uint& nTest, int& nClass, int& nDomain, int& nFeat, REAL positiveTarget = 1.0, REAL negativeTarget = -1.0 );
00074 
00075     void readAUSTRALIAN ( string path, REAL* &train, REAL* &trainTarget, int* &trainLabel, REAL* &test, REAL* &testTarget, int* &testLabel, uint& nTrain, uint& nTest, int& nClass, int& nDomain, int& nFeat, REAL positiveTarget = 1.0, REAL negativeTarget = -1.0 );
00076 
00077     void readBALANCE ( string path, REAL* &train, REAL* &trainTarget, int* &trainLabel, REAL* &test, REAL* &testTarget, int* &testLabel, uint& nTrain, uint& nTest, int& nClass, int& nDomain, int& nFeat, REAL positiveTarget = 1.0, REAL negativeTarget = -1.0 );
00078 
00079     void readCYLINDERBANDS ( string path, REAL* &train, REAL* &trainTarget, int* &trainLabel, REAL* &test, REAL* &testTarget, int* &testLabel, uint& nTrain, uint& nTest, int& nClass, int& nDomain, int& nFeat, REAL positiveTarget = 1.0, REAL negativeTarget = -1.0 );
00080 
00081     void readBREASTCANCERWISCONSIN ( string path, REAL* &train, REAL* &trainTarget, int* &trainLabel, REAL* &test, REAL* &testTarget, int* &testLabel, uint& nTrain, uint& nTest, int& nClass, int& nDomain, int& nFeat, REAL positiveTarget = 1.0, REAL negativeTarget = -1.0 );
00082 
00083     void readAUSTRALIANCREDIT ( string path, REAL* &train, REAL* &trainTarget, int* &trainLabel, REAL* &test, REAL* &testTarget, int* &testLabel, uint& nTrain, uint& nTest, int& nClass, int& nDomain, int& nFeat, REAL positiveTarget = 1.0, REAL negativeTarget = -1.0 );
00084 
00085     void readDIABETES ( string path, REAL* &train, REAL* &trainTarget, int* &trainLabel, REAL* &test, REAL* &testTarget, int* &testLabel, uint& nTrain, uint& nTest, int& nClass, int& nDomain, int& nFeat, REAL positiveTarget = 1.0, REAL negativeTarget = -1.0 );
00086 
00087     void readGERMAN ( string path, REAL* &train, REAL* &trainTarget, int* &trainLabel, REAL* &test, REAL* &testTarget, int* &testLabel, uint& nTrain, uint& nTest, int& nClass, int& nDomain, int& nFeat, REAL positiveTarget = 1.0, REAL negativeTarget = -1.0 );
00088 
00089     void readGLASS ( string path, REAL* &train, REAL* &trainTarget, int* &trainLabel, REAL* &test, REAL* &testTarget, int* &testLabel, uint& nTrain, uint& nTest, int& nClass, int& nDomain, int& nFeat, REAL positiveTarget = 1.0, REAL negativeTarget = -1.0 );
00090 
00091     void readHEART ( string path, REAL* &train, REAL* &trainTarget, int* &trainLabel, REAL* &test, REAL* &testTarget, int* &testLabel, uint& nTrain, uint& nTest, int& nClass, int& nDomain, int& nFeat, REAL positiveTarget = 1.0, REAL negativeTarget = -1.0 );
00092 
00093     void readHEPATITIS ( string path, REAL* &train, REAL* &trainTarget, int* &trainLabel, REAL* &test, REAL* &testTarget, int* &testLabel, uint& nTrain, uint& nTest, int& nClass, int& nDomain, int& nFeat, REAL positiveTarget = 1.0, REAL negativeTarget = -1.0 );
00094 
00095     void readIONOSPHERE ( string path, REAL* &train, REAL* &trainTarget, int* &trainLabel, REAL* &test, REAL* &testTarget, int* &testLabel, uint& nTrain, uint& nTest, int& nClass, int& nDomain, int& nFeat, REAL positiveTarget = 1.0, REAL negativeTarget = -1.0 );
00096 
00097     void readIRIS ( string path, REAL* &train, REAL* &trainTarget, int* &trainLabel, REAL* &test, REAL* &testTarget, int* &testLabel, uint& nTrain, uint& nTest, int& nClass, int& nDomain, int& nFeat, REAL positiveTarget = 1.0, REAL negativeTarget = -1.0 );
00098 
00099     void readLETTER ( string path, REAL* &train, REAL* &trainTarget, int* &trainLabel, REAL* &test, REAL* &testTarget, int* &testLabel, uint& nTrain, uint& nTest, int& nClass, int& nDomain, int& nFeat, REAL positiveTarget = 1.0, REAL negativeTarget = -1.0 );
00100 
00101     void readMONKS1 ( string path, REAL* &train, REAL* &trainTarget, int* &trainLabel, REAL* &test, REAL* &testTarget, int* &testLabel, uint& nTrain, uint& nTest, int& nClass, int& nDomain, int& nFeat, REAL positiveTarget = 1.0, REAL negativeTarget = -1.0 );
00102 
00103     void readMONKS2 ( string path, REAL* &train, REAL* &trainTarget, int* &trainLabel, REAL* &test, REAL* &testTarget, int* &testLabel, uint& nTrain, uint& nTest, int& nClass, int& nDomain, int& nFeat, REAL positiveTarget = 1.0, REAL negativeTarget = -1.0 );
00104 
00105     void readMONKS3 ( string path, REAL* &train, REAL* &trainTarget, int* &trainLabel, REAL* &test, REAL* &testTarget, int* &testLabel, uint& nTrain, uint& nTest, int& nClass, int& nDomain, int& nFeat, REAL positiveTarget = 1.0, REAL negativeTarget = -1.0 );
00106 
00107     void readMUSHROOM ( string path, REAL* &train, REAL* &trainTarget, int* &trainLabel, REAL* &test, REAL* &testTarget, int* &testLabel, uint& nTrain, uint& nTest, int& nClass, int& nDomain, int& nFeat, REAL positiveTarget = 1.0, REAL negativeTarget = -1.0 );
00108 
00109     void readSATIMAGE ( string path, REAL* &train, REAL* &trainTarget, int* &trainLabel, REAL* &test, REAL* &testTarget, int* &testLabel, uint& nTrain, uint& nTest, int& nClass, int& nDomain, int& nFeat, REAL positiveTarget = 1.0, REAL negativeTarget = -1.0 );
00110 
00111     void readSEGMENTATION ( string path, REAL* &train, REAL* &trainTarget, int* &trainLabel, REAL* &test, REAL* &testTarget, int* &testLabel, uint& nTrain, uint& nTest, int& nClass, int& nDomain, int& nFeat, REAL positiveTarget = 1.0, REAL negativeTarget = -1.0 );
00112 
00113     void readSONAR ( string path, REAL* &train, REAL* &trainTarget, int* &trainLabel, REAL* &test, REAL* &testTarget, int* &testLabel, uint& nTrain, uint& nTest, int& nClass, int& nDomain, int& nFeat, REAL positiveTarget = 1.0, REAL negativeTarget = -1.0 );
00114 
00115     void readVEHICLE ( string path, REAL* &train, REAL* &trainTarget, int* &trainLabel, REAL* &test, REAL* &testTarget, int* &testLabel, uint& nTrain, uint& nTest, int& nClass, int& nDomain, int& nFeat, REAL positiveTarget = 1.0, REAL negativeTarget = -1.0 );
00116 
00117     void readVOTES ( string path, REAL* &train, REAL* &trainTarget, int* &trainLabel, REAL* &test, REAL* &testTarget, int* &testLabel, uint& nTrain, uint& nTest, int& nClass, int& nDomain, int& nFeat, REAL positiveTarget = 1.0, REAL negativeTarget = -1.0 );
00118 
00119     void readWINE ( string path, REAL* &train, REAL* &trainTarget, int* &trainLabel, REAL* &test, REAL* &testTarget, int* &testLabel, uint& nTrain, uint& nTest, int& nClass, int& nDomain, int& nFeat, REAL positiveTarget = 1.0, REAL negativeTarget = -1.0 );
00120 
00121     void readPOKER ( string path, REAL* &train, REAL* &trainTarget, int* &trainLabel, REAL* &test, REAL* &testTarget, int* &testLabel, uint& nTrain, uint& nTest, int& nClass, int& nDomain, int& nFeat, REAL positiveTarget = 1.0, REAL negativeTarget = -1.0 );
00122 
00123     void readYEAST ( string path, REAL* &train, REAL* &trainTarget, int* &trainLabel, REAL* &test, REAL* &testTarget, int* &testLabel, uint& nTrain, uint& nTest, int& nClass, int& nDomain, int& nFeat, REAL positiveTarget = 1.0, REAL negativeTarget = -1.0 );
00124 
00125     void readSURVIVAL ( string path, REAL* &train, REAL* &trainTarget, int* &trainLabel, REAL* &test, REAL* &testTarget, int* &testLabel, uint& nTrain, uint& nTest, int& nClass, int& nDomain, int& nFeat, REAL positiveTarget = 1.0, REAL negativeTarget = -1.0 );
00126 
00127     // SPIDER: artificial dataset generated by spider
00128     void readSPIDER ( string path, REAL* &train, REAL* &trainTarget, int* &trainLabel, REAL* &test, REAL* &testTarget, int* &testLabel, uint& nTrain, uint& nTest, int& nClass, int& nDomain, int& nFeat, REAL positiveTarget = 1.0, REAL negativeTarget = -1.0 );
00129 };
00130 
00131 
00132 #endif

Generated on Tue Jan 26 09:20:58 2010 for ELF by  doxygen 1.5.8