DatasetReader Class Reference

#include <DatasetReader.h>

Inheritance diagram for DatasetReader:

Framework

List of all members.

Public Member Functions

 DatasetReader ()
 ~DatasetReader ()
void getDataBounds (const char **filenames, string delimiter, int &nFeat, int &nClass, uint &nLines, char *columnType, char *enabledCol, int targetColumn, int filenameID, bool fillData=false, REAL *data=0, int *labels=0, bool addConstantOne=true, bool skipFirstLine=false)
void splitRandomTestset (REAL percentTest, REAL *data, int *labels, int nData, int nFeat, int nClass, REAL *&train, int *&trainLabel, REAL *&trainTarget, REAL *&test, int *&testLabel, REAL *&testTarget, uint &nTrain, uint &nTest, REAL positiveTarget, REAL negativeTarget, bool noRandom=false)
void makeNumericTrainAndTestTargets (int nClass, int nTrain, int nTest, REAL positiveTarget, REAL negativeTarget, int *trainLabel, int *testLabel, REAL *&trainTarget, REAL *&testTarget)
void readMNIST (string path, REAL *&train, REAL *&trainTarget, int *&trainLabel, REAL *&test, REAL *&testTarget, int *&testLabel, uint &nTrain, uint &nTest, int &nClass, int &nDomain, int &nFeat, REAL positiveTarget=1.0, REAL negativeTarget=-1.0)
void readNETFLIX (string path, REAL *&train, REAL *&trainTarget, int *&trainLabel, REAL *&test, REAL *&testTarget, int *&testLabel, uint &nTrain, uint &nTest, int &nClass, int &nDomain, int &nFeat, REAL positiveTarget=1.0, REAL negativeTarget=-1.0)
void readKDDCup09Large (string path, REAL *&train, REAL *&trainTarget, int *&trainLabel, REAL *&test, REAL *&testTarget, int *&testLabel, uint &nTrain, uint &nTest, int &nClass, int &nDomain, int &nFeat, REAL positiveTarget=1.0, REAL negativeTarget=-1.0)
void readKDDCup09LargeBin (string path, REAL *&train, REAL *&trainTarget, int *&trainLabel, REAL *&test, REAL *&testTarget, int *&testLabel, uint &nTrain, uint &nTest, int &nClass, int &nDomain, int &nFeat, REAL positiveTarget=1.0, REAL negativeTarget=-1.0)
void readKDDCup09Small (string path, REAL *&train, REAL *&trainTarget, int *&trainLabel, REAL *&test, REAL *&testTarget, int *&testLabel, uint &nTrain, uint &nTest, int &nClass, int &nDomain, int &nFeat, REAL positiveTarget=1.0, REAL negativeTarget=-1.0)
void readAusDM2009 (string path, REAL *&train, REAL *&trainTarget, int *&trainLabel, REAL *&test, REAL *&testTarget, int *&testLabel, uint &nTrain, uint &nTest, int &nClass, int &nDomain, int &nFeat, REAL positiveTarget=1.0, REAL negativeTarget=-1.0)
void readBINARY (string path, REAL *&train, REAL *&trainTarget, int *&trainLabel, REAL *&test, REAL *&testTarget, int *&testLabel, uint &nTrain, uint &nTest, int &nClass, int &nDomain, int &nFeat, REAL positiveTarget=1.0, REAL negativeTarget=-1.0)
void readCSV (string path, REAL *&train, REAL *&trainTarget, int *&trainLabel, REAL *&test, REAL *&testTarget, int *&testLabel, uint &nTrain, uint &nTest, int &nClass, int &nDomain, int &nFeat, REAL positiveTarget=1.0, REAL negativeTarget=-1.0)
void readARFF (string path, REAL *&train, REAL *&trainTarget, int *&trainLabel, REAL *&test, REAL *&testTarget, int *&testLabel, uint &nTrain, uint &nTest, int &nClass, int &nDomain, int &nFeat, REAL positiveTarget=1.0, REAL negativeTarget=-1.0)
void readPRUDSYS (string path, REAL *&train, REAL *&trainTarget, int *&trainLabel, REAL *&test, REAL *&testTarget, int *&testLabel, uint &nTrain, uint &nTest, int &nClass, int &nDomain, int &nFeat, REAL positiveTarget=1.0, REAL negativeTarget=-1.0)
void readADULT (string path, REAL *&train, REAL *&trainTarget, int *&trainLabel, REAL *&test, REAL *&testTarget, int *&testLabel, uint &nTrain, uint &nTest, int &nClass, int &nDomain, int &nFeat, REAL positiveTarget=1.0, REAL negativeTarget=-1.0)
void readAUSTRALIAN (string path, REAL *&train, REAL *&trainTarget, int *&trainLabel, REAL *&test, REAL *&testTarget, int *&testLabel, uint &nTrain, uint &nTest, int &nClass, int &nDomain, int &nFeat, REAL positiveTarget=1.0, REAL negativeTarget=-1.0)
void readBALANCE (string path, REAL *&train, REAL *&trainTarget, int *&trainLabel, REAL *&test, REAL *&testTarget, int *&testLabel, uint &nTrain, uint &nTest, int &nClass, int &nDomain, int &nFeat, REAL positiveTarget=1.0, REAL negativeTarget=-1.0)
void readCYLINDERBANDS (string path, REAL *&train, REAL *&trainTarget, int *&trainLabel, REAL *&test, REAL *&testTarget, int *&testLabel, uint &nTrain, uint &nTest, int &nClass, int &nDomain, int &nFeat, REAL positiveTarget=1.0, REAL negativeTarget=-1.0)
void readBREASTCANCERWISCONSIN (string path, REAL *&train, REAL *&trainTarget, int *&trainLabel, REAL *&test, REAL *&testTarget, int *&testLabel, uint &nTrain, uint &nTest, int &nClass, int &nDomain, int &nFeat, REAL positiveTarget=1.0, REAL negativeTarget=-1.0)
void readAUSTRALIANCREDIT (string path, REAL *&train, REAL *&trainTarget, int *&trainLabel, REAL *&test, REAL *&testTarget, int *&testLabel, uint &nTrain, uint &nTest, int &nClass, int &nDomain, int &nFeat, REAL positiveTarget=1.0, REAL negativeTarget=-1.0)
void readDIABETES (string path, REAL *&train, REAL *&trainTarget, int *&trainLabel, REAL *&test, REAL *&testTarget, int *&testLabel, uint &nTrain, uint &nTest, int &nClass, int &nDomain, int &nFeat, REAL positiveTarget=1.0, REAL negativeTarget=-1.0)
void readGERMAN (string path, REAL *&train, REAL *&trainTarget, int *&trainLabel, REAL *&test, REAL *&testTarget, int *&testLabel, uint &nTrain, uint &nTest, int &nClass, int &nDomain, int &nFeat, REAL positiveTarget=1.0, REAL negativeTarget=-1.0)
void readGLASS (string path, REAL *&train, REAL *&trainTarget, int *&trainLabel, REAL *&test, REAL *&testTarget, int *&testLabel, uint &nTrain, uint &nTest, int &nClass, int &nDomain, int &nFeat, REAL positiveTarget=1.0, REAL negativeTarget=-1.0)
void readHEART (string path, REAL *&train, REAL *&trainTarget, int *&trainLabel, REAL *&test, REAL *&testTarget, int *&testLabel, uint &nTrain, uint &nTest, int &nClass, int &nDomain, int &nFeat, REAL positiveTarget=1.0, REAL negativeTarget=-1.0)
void readHEPATITIS (string path, REAL *&train, REAL *&trainTarget, int *&trainLabel, REAL *&test, REAL *&testTarget, int *&testLabel, uint &nTrain, uint &nTest, int &nClass, int &nDomain, int &nFeat, REAL positiveTarget=1.0, REAL negativeTarget=-1.0)
void readIONOSPHERE (string path, REAL *&train, REAL *&trainTarget, int *&trainLabel, REAL *&test, REAL *&testTarget, int *&testLabel, uint &nTrain, uint &nTest, int &nClass, int &nDomain, int &nFeat, REAL positiveTarget=1.0, REAL negativeTarget=-1.0)
void readIRIS (string path, REAL *&train, REAL *&trainTarget, int *&trainLabel, REAL *&test, REAL *&testTarget, int *&testLabel, uint &nTrain, uint &nTest, int &nClass, int &nDomain, int &nFeat, REAL positiveTarget=1.0, REAL negativeTarget=-1.0)
void readLETTER (string path, REAL *&train, REAL *&trainTarget, int *&trainLabel, REAL *&test, REAL *&testTarget, int *&testLabel, uint &nTrain, uint &nTest, int &nClass, int &nDomain, int &nFeat, REAL positiveTarget=1.0, REAL negativeTarget=-1.0)
void readMONKS1 (string path, REAL *&train, REAL *&trainTarget, int *&trainLabel, REAL *&test, REAL *&testTarget, int *&testLabel, uint &nTrain, uint &nTest, int &nClass, int &nDomain, int &nFeat, REAL positiveTarget=1.0, REAL negativeTarget=-1.0)
void readMONKS2 (string path, REAL *&train, REAL *&trainTarget, int *&trainLabel, REAL *&test, REAL *&testTarget, int *&testLabel, uint &nTrain, uint &nTest, int &nClass, int &nDomain, int &nFeat, REAL positiveTarget=1.0, REAL negativeTarget=-1.0)
void readMONKS3 (string path, REAL *&train, REAL *&trainTarget, int *&trainLabel, REAL *&test, REAL *&testTarget, int *&testLabel, uint &nTrain, uint &nTest, int &nClass, int &nDomain, int &nFeat, REAL positiveTarget=1.0, REAL negativeTarget=-1.0)
void readMUSHROOM (string path, REAL *&train, REAL *&trainTarget, int *&trainLabel, REAL *&test, REAL *&testTarget, int *&testLabel, uint &nTrain, uint &nTest, int &nClass, int &nDomain, int &nFeat, REAL positiveTarget=1.0, REAL negativeTarget=-1.0)
void readSATIMAGE (string path, REAL *&train, REAL *&trainTarget, int *&trainLabel, REAL *&test, REAL *&testTarget, int *&testLabel, uint &nTrain, uint &nTest, int &nClass, int &nDomain, int &nFeat, REAL positiveTarget=1.0, REAL negativeTarget=-1.0)
void readSEGMENTATION (string path, REAL *&train, REAL *&trainTarget, int *&trainLabel, REAL *&test, REAL *&testTarget, int *&testLabel, uint &nTrain, uint &nTest, int &nClass, int &nDomain, int &nFeat, REAL positiveTarget=1.0, REAL negativeTarget=-1.0)
void readSONAR (string path, REAL *&train, REAL *&trainTarget, int *&trainLabel, REAL *&test, REAL *&testTarget, int *&testLabel, uint &nTrain, uint &nTest, int &nClass, int &nDomain, int &nFeat, REAL positiveTarget=1.0, REAL negativeTarget=-1.0)
void readVEHICLE (string path, REAL *&train, REAL *&trainTarget, int *&trainLabel, REAL *&test, REAL *&testTarget, int *&testLabel, uint &nTrain, uint &nTest, int &nClass, int &nDomain, int &nFeat, REAL positiveTarget=1.0, REAL negativeTarget=-1.0)
void readVOTES (string path, REAL *&train, REAL *&trainTarget, int *&trainLabel, REAL *&test, REAL *&testTarget, int *&testLabel, uint &nTrain, uint &nTest, int &nClass, int &nDomain, int &nFeat, REAL positiveTarget=1.0, REAL negativeTarget=-1.0)
void readWINE (string path, REAL *&train, REAL *&trainTarget, int *&trainLabel, REAL *&test, REAL *&testTarget, int *&testLabel, uint &nTrain, uint &nTest, int &nClass, int &nDomain, int &nFeat, REAL positiveTarget=1.0, REAL negativeTarget=-1.0)
void readPOKER (string path, REAL *&train, REAL *&trainTarget, int *&trainLabel, REAL *&test, REAL *&testTarget, int *&testLabel, uint &nTrain, uint &nTest, int &nClass, int &nDomain, int &nFeat, REAL positiveTarget=1.0, REAL negativeTarget=-1.0)
void readYEAST (string path, REAL *&train, REAL *&trainTarget, int *&trainLabel, REAL *&test, REAL *&testTarget, int *&testLabel, uint &nTrain, uint &nTest, int &nClass, int &nDomain, int &nFeat, REAL positiveTarget=1.0, REAL negativeTarget=-1.0)
void readSURVIVAL (string path, REAL *&train, REAL *&trainTarget, int *&trainLabel, REAL *&test, REAL *&testTarget, int *&testLabel, uint &nTrain, uint &nTest, int &nClass, int &nDomain, int &nFeat, REAL positiveTarget=1.0, REAL negativeTarget=-1.0)
void readSPIDER (string path, REAL *&train, REAL *&trainTarget, int *&trainLabel, REAL *&test, REAL *&testTarget, int *&testLabel, uint &nTrain, uint &nTest, int &nClass, int &nDomain, int &nFeat, REAL positiveTarget=1.0, REAL negativeTarget=-1.0)


Detailed Description

Reads predefined datasets. Each dataset must have a train set and a test set. Both sets need input features and targets (plus labels for classification).

Definition at line 30 of file DatasetReader.h.


Constructor & Destructor Documentation

DatasetReader::DatasetReader (  ) 

Constructor

Definition at line 8 of file DatasetReader.cpp.

00009 {
          // Constructor body: the only statement shown is a log line on stdout.
00010     cout<<"DatasetReader"<<endl;
00011 }

DatasetReader::~DatasetReader (  ) 

Destructor

Definition at line 16 of file DatasetReader.cpp.

00017 {
          // Destructor body: logs to stdout and releases nothing in this listing.
          // NOTE(review): the message is spelled "descructor"; it is a runtime
          // string, so it is left as emitted.
00018     cout<<"descructor DatasetReader"<<endl;
00019 }


Member Function Documentation

void DatasetReader::getDataBounds ( const char **  filenames,
string  delimiter,
int &  nFeat,
int &  nClass,
uint &  nLines,
char *  columnType,
char *  enabledCol,
int  targetColumn,
int  filenameID,
bool  fillData = false,
REAL *  data = 0,
int *  labels = 0,
bool  addConstantOne = true,
bool  skipFirstLine = false 
)

Reads from a standard data matrix. If a numerical value is undefined, the mean value of that column is assigned instead.

e.g.: (last column is target class) 19910108,X126,NO,LINE,YES,Motter94,1911,55,46,0.2,17,78,0.75,20,13.1,1700,50.5,36.4,0,0,2.5,1,34,40,105,100,band 19910109,X266,NO,LINE,YES,Motter94,?,55,46,0.3,15,80,0.75,20,6.6,1900,54.9,38.5,0,0,2.5,0.7,34,40,105,100,noband 19910104,B7,NO,LINE,YES,WoodHoe70,?,62,40,0.433,16,80,?,30,6.5,1850,53.8,39.8,0,0,2.8,0.9,40,40,103.87,100,noband 19910104,T133,NO,LINE,YES,WoodHoe70,1910,52,40,0.3,16,75,0.3125,30,5.6,1467,55.6,38.8,0,0,2.5,1.3,40,40,108.06,100,noband 19910111,J34,NO,LINE,YES,WoodHoe70,1910,50,46,0.3,17,80,0.75,30,0,2100,57.5,42.5,5,0,2.3,0.6,35,40,106.67,100,noband

Parameters:
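filenames The dataset file names (array terminated by a null pointer)
delimiter The delimiter string, e.g.: ", " or ","
nFeat Reference to the number of features (output value)
nClass Reference to the number of classes, i.e. the number of distinct values found in the target column (output value)
nLines Reference to the number of lines with content in the selected file (output value)
columnType A char* that selects the data type of each column: 'd' for discrete string value, 'n' for numeric value
enabledCol A char* that holds '0' or '1' per column, to reject or select that data column
targetColumn The number of the column that holds the target classes (1-based: the code compares the column index + 1 against it)
filenameID Selects the filename in filenames
fillData If true: fill REAL* data and int* labels with data
data Pointer to the data buffer, written only if fillData is true (nLines*nFeat values; the function does not allocate it)
labels Pointer to the label buffer, written only if fillData is true (nLines values; the function does not allocate it)
addConstantOne If true: one extra feature with the constant value 1.0 is appended to every line
skipFirstLine If true: the first line of each file is read and discarded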

Definition at line 4510 of file DatasetReader.cpp.

04511 {
          // Overview of this body:
          //  1. Count the entries of filenames (null-terminated) and check that
          //     every file can be opened.
          //  2. First pass over ALL files: collect the distinct strings of each
          //     enabled discrete ('d') column. For the file selected by
          //     filenameID only, also sum the numeric ('n') columns and count
          //     the lines that have content (nLines).
          //  3. Derive nFeat (one slot per discrete value, one per numeric
          //     column, plus one if addConstantOne) and nClass (distinct values
          //     of the target column).
          //  4. If fillData: re-read the selected file and write data / labels.
          //
          // NOTE(review): discreteValues, numericMean and numericMeanCnt are
          // allocated with new[] below and no delete[] appears in this body.
04512     int bufSize = 1024*1024;
04513     int nFiles = 0;
04514     while ( filenames[nFiles] )
04515         nFiles++;
04516     cout<<"nFiles:"<<nFiles<<endl;
04517 
04518     fstream f;
04519 
          // Open/close every file once so that a missing file is reported up front.
04520     for ( int i=0;i<nFiles;i++ )
04521     {
04522         f.open ( filenames[i], ios::in );
04523         if ( f.is_open() == false )
04524         {
04525             cout<<"Can not open "<<filenames[i]<<endl;
                  // NOTE(review): terminates the process with status 0 on this error path.
04526             exit ( 0 );
04527         }
04528         f.close();
04529     }
04530 
          // Number of columns = length of the columnType string.
04531     int columnTypeSize = 0;
04532     while ( columnType[columnTypeSize] )
04533         columnTypeSize++;
04534     cout<<"columnTypeSize:"<<columnTypeSize<<endl;
          // buf0 holds the current line, buf1 the current cell.
          // NOTE(review): both are automatic arrays of bufSize (1024*1024) chars
          // sized by a non-const int.
04535     char buf0[bufSize], buf1[bufSize];
04536     int delimiterLength = delimiter.length();
04537     const char* delimiterCharPtr = delimiter.c_str();
          // Per column: distinct strings seen (discrete), running sum and count (numeric).
04538     vector<string>* discreteValues = new vector<string>[columnTypeSize];
04539     double* numericMean = new double[columnTypeSize];
04540     int* numericMeanCnt = new int[columnTypeSize];
04541     for ( int i=0;i<columnTypeSize;i++ )
04542     {
04543         numericMean[i] = 0.0;
04544         numericMeanCnt[i] = 0;
04545     }
          // ---- First pass: scan all files ----
04546     for ( int fileCnt=0;fileCnt<nFiles;fileCnt++ )
04547     {
04548         f.open ( filenames[fileCnt], ios::in );
04549         if ( fileCnt == filenameID )
04550             nLines = 0;
04551 
04552         if ( skipFirstLine )
04553             f.getline ( buf0, bufSize );
04554 
04555         while ( f.getline ( buf0, bufSize ) ) // read all lines
04556         {
                  // cnt0: start of the current cell, cnt1: scan position, cellCnt: column index
04557             int cnt0 = 0, cnt1 = 0, cellCnt = 0;
04558             while ( buf0[cnt1] != 0 && cnt1 < bufSize ) // read all chars per line
04559             {
                      // matchCnt == delimiterLength means the delimiter starts at cnt1
04560                 int matchCnt = 0;
04561                 for ( int i=0;i<delimiterLength;i++ )
04562                     matchCnt += delimiterCharPtr[i] == buf0[cnt1+i];
04563 
                      // A cell ends here when the delimiter matches or the next char is the
                      // terminator / '\r'. It never ends at position 0, and never when the
                      // char at cnt1+delimiterLength is a space.
04564                 if ( buf0[cnt1+delimiterLength]!=' ' && cnt1 > 0 && ( matchCnt == delimiterLength || buf0[cnt1+1] == 0 || buf0[cnt1+1] == '\r' ) ) // a delimiter match is found, or end of line
04565                 {
04566                     if ( cellCnt >= columnTypeSize )
04567                         break;
04568 
                          // At end of line the char at cnt1 still belongs to the cell.
04569                     int addOne = 0;
04570                     if ( buf0[cnt1+1] == 0 || buf0[cnt1+1] == '\r' )
04571                         addOne = 1;
                          // Copy the cell text into buf1 and terminate it explicitly.
04572                     strncpy ( buf1, buf0 + cnt0, cnt1 - cnt0 + addOne );
04573                     buf1[cnt1 - cnt0 + addOne] = 0;
                          // Next cell starts after the delimiter; move cnt1 onto the
                          // delimiter's last char so the cnt1++ below steps past it.
04574                     cnt0 = cnt1 + delimiterLength;
04575                     if ( cnt1 < cnt0 - 1 )
04576                         cnt1 = cnt0 - 1;
04577                     if ( enabledCol[cellCnt] == '1' )
04578                     {
04579                         if ( columnType[cellCnt] == 'd' )
04580                         {
                                  // Discrete values are collected from every file, not only
                                  // the one selected by filenameID.
04581                             // search for existing
04582                             bool exists = false;
04583                             for ( int i=0;i<discreteValues[cellCnt].size();i++ )
04584                                 if ( discreteValues[cellCnt][i] == string ( buf1 ) )
04585                                     exists = true;
04586                             if ( exists == false )
04587                                 discreteValues[cellCnt].push_back ( string ( buf1 ) );
04588                         }
04589                         else if ( columnType[cellCnt] == 'n' )
04590                         {
                                  // Only cells starting with a digit, '.' or '-' count as
                                  // numbers; anything else is treated as unknown.
04591                             if ( ( buf1[0] >= '0' && buf1[0] <= '9' ) || buf1[0] == '.' || buf1[0] == '-' ) // is a numeric value
04592                             {
04593                                 float num;
04594                                 sscanf ( buf1,"%f",&num );
                                      // The mean is accumulated for the selected file only.
04595                                 if ( fileCnt == filenameID )
04596                                 {
04597                                     numericMean[cellCnt] += num;
04598                                     numericMeanCnt[cellCnt]++;
04599                                 }
04600                             }
04601                             else  // is an unknown numeric value
04602                             {
04603                                 ;
04604                             }
04605                         }
04606                         else
04607                             assert ( false );
04608                     }
04609                     //cout<<cellCnt<<":"<<string(buf1)<<"|";
04610                     cellCnt++;
04611                     if ( buf0[cnt1+1] == 0 )
04612                         break;
04613                 }
                      // At position 0, a delimiter match or a one-char line moves the
                      // cell start forward by one.
04614                 else if ( cnt1 == 0 && ( matchCnt == delimiterLength || buf0[cnt1+1] == 0 ) )
04615                     cnt0++;
04616                 cnt1++;
04617             }
04618             //cout<<endl;
04619 
04620             // if the line has content
04621             if ( cnt1 > 1 )
04622             {
                      // A line with more than one cell must have exactly columnTypeSize cells.
04623                 if ( cellCnt != columnTypeSize && cellCnt > 1 )
04624                 {
04625                     cout<<"cellCnt:"<<cellCnt<<" columnTypeSize:"<<columnTypeSize<<endl;
04626                     assert ( false );
04627                 }
04628                 if ( fileCnt == filenameID )
04629                     nLines++;
04630             }
                  // Zero the whole line buffer so look-ahead reads (cnt1+1, cnt1+i) see 0.
04631             memset ( buf0, 0, bufSize );
04632         }
04633         f.close();
04634 
04635     }
04636 
04637     // calculate the total number of features
04638     nFeat = 0;
04639     cout<<"ValuesPerDiscreteInput:"<<endl;
04640     for ( int i=0;i<columnTypeSize;i++ )
04641     {
              // targetColumn is compared against i+1, i.e. it is 1-based.
04642         if ( i+1 != targetColumn )
04643         {
04644             if ( enabledCol[i] == '1' )
04645             {
04646                 if ( columnType[i] == 'd' )
04647                 {
04648                     cout<<i<<": #"<< ( int ) discreteValues[i].size() <<" {";
04649                     for ( int j=0;j<discreteValues[i].size();j++ )
04650                         cout<<discreteValues[i][j]<<",";
04651                     cout<<"}"<<endl;
                          // one feature slot per distinct discrete value
04652                     nFeat += discreteValues[i].size();
04653                 }
04654                 else if ( columnType[i] == 'n' )
04655                     nFeat++;
04656                 else
04657                     assert ( false );
04658             }
04659         }
04660     }
04661     if ( addConstantOne )
04662         nFeat++;
04663     cout<<endl;
04664 
          // Number of classes = distinct values collected for the target column.
          // NOTE(review): targetColumn-1 is not range-checked, and the list is
          // only populated when that column is enabled and of type 'd'.
04665     nClass = discreteValues[targetColumn-1].size();
04666     cout<<"#Targets:"<< ( int ) nClass<<" {";
04667     for ( int j=0;j<nClass;j++ )
04668     {
04669         string value = discreteValues[targetColumn-1][j];
04670         cout<<value<<","<<flush;
04671     }
04672     cout<<"}"<<endl;
04673 
04674     cout<<endl;
04675     cout<<"nFeat:"<<nFeat<<endl;
04676     cout<<"nLines:"<<nLines<<endl;
04677 
          // ---- Second pass (optional): fill data and labels from the selected file ----
04678     if ( fillData )
04679     {
              // data and labels are written directly; this body never allocates them.
04680         // clear data
04681         for ( int i=0;i<nLines*nFeat;i++ )
04682             data[i] = 0.0;
04683         if ( addConstantOne )
04684         {
                  // the constant 1.0 occupies the last slot of every row
04685             for ( int i=0;i<nLines;i++ )
04686                 data[i*nFeat + nFeat-1] = 1.0;
04687         }
04688         for ( int i=0;i<nLines;i++ )
04689             labels[i] = 0;
04690 
04691         f.open ( filenames[filenameID], ios::in );
              // nLines is recounted while filling and doubles as the current row index.
04692         nLines = 0;
04693 
04694         if ( skipFirstLine )
04695             f.getline ( buf0, bufSize );
04696 
04697         while ( f.getline ( buf0, bufSize ) ) // read all lines
04698         {
                  // pos: next free feature slot within the current row
04699             int cnt0 = 0, cnt1 = 0, cellCnt = 0, pos = 0;
04700             while ( buf0[cnt1] != 0 && cnt1 < bufSize ) // read all chars per line
04701             {
04702                 int matchCnt = 0;
04703                 for ( int i=0;i<delimiterLength;i++ )
04704                     matchCnt += delimiterCharPtr[i] == buf0[cnt1+i];
04705 
                      // Same cell-boundary test as in the first pass.
04706                 if ( buf0[cnt1+delimiterLength]!=' ' && cnt1 > 0 && ( matchCnt == delimiterLength || buf0[cnt1+1] == 0 || buf0[cnt1+1] == '\r' ) ) // a delimiter match is found, or end of line
04707                 {
04708                     if ( cellCnt >= columnTypeSize )
04709                         break;
04710 
04711                     int addOne = 0;
04712                     if ( buf0[cnt1+1] == 0 || buf0[cnt1+1] == '\r' )
04713                         addOne = 1;
04714                     strncpy ( buf1, buf0 + cnt0, cnt1 - cnt0 + addOne );
04715                     buf1[cnt1 - cnt0 + addOne] = 0;
04716                     cnt0 = cnt1 + delimiterLength;
04717                     if ( cnt1 < cnt0 - 1 )
04718                         cnt1 = cnt0 - 1;
04719                     if ( enabledCol[cellCnt] == '1' )
04720                     {
04721                         if ( columnType[cellCnt] == 'd' ) // discete value: {"Hugo","Bart","Moe",..}
04722                         {
04723                             // search in existing values
04724                             int searchPos = -1;
04725                             for ( int i=0;i<discreteValues[cellCnt].size();i++ )
04726                                 if ( discreteValues[cellCnt][i] == string ( buf1 ) )
04727                                     searchPos = i;
04728 
                                  // every value was collected in the first pass, so a miss is a logic error
04729                             if ( searchPos == -1 )
04730                                 assert ( false );
04731 
04732                             // assign value
04733                             if ( cellCnt+1 == targetColumn )
04734                             {
                                      // target column: store the value's index as the class
                                      // label; pos is not advanced
04735                                 labels[nLines] = searchPos;
04736                             }
04737                             else
04738                             {
                                      // one-hot: set the slot of the matching value to 1.0, then
                                      // skip over all slots of this column
04739                                 data[nLines*nFeat + pos + searchPos] = 1.0;
04740                                 pos += discreteValues[cellCnt].size();
04741                             }
04742                         }
04743                         else if ( columnType[cellCnt] == 'n' ) // numeric value like: 1.23 or .34 or 1.2e3
04744                         {
04745                             if ( ( buf1[0] >= '0' && buf1[0] <= '9' ) || buf1[0] == '.' || buf1[0] == '-' ) // is a numeric value
04746                             {
04747                                 float num;
04748                                 sscanf ( buf1,"%f",&num );
04749                                 data[nLines*nFeat + pos] = num;
04750                             }
04751                             else  // is an unknown numeric value
04752                             {
                                      // use the first-pass column mean; 0.0 if no numeric
                                      // value was seen for this column
04753                                 data[nLines*nFeat + pos] = 0.0;
04754                                 if ( numericMeanCnt[cellCnt] > 0 )
04755                                     data[nLines*nFeat + pos] = numericMean[cellCnt] / numericMeanCnt[cellCnt];
04756                             }
04757                             pos++;
04758                         }
04759                         else
04760                             assert ( false );
04761                     }
04762                     cellCnt++;
04763                 }
04764                 else if ( cnt1 == 0 && ( matchCnt == delimiterLength || buf0[cnt1+1] == 0 ) )
04765                     cnt0++;
04766                 cnt1++;
04767             }
04768 
04769             // if the line has content
04770             if ( cnt1 > 1 )
04771             {
04772                 if ( cellCnt != columnTypeSize && cellCnt > 1 )
04773                 {
04774                     cout<<"cellCnt:"<<cellCnt<<" columnTypeSize:"<<columnTypeSize<<endl;
04775                     assert ( false );
04776                 }
04777                 nLines++;
04778 
                      // every feature slot except the optional constant-one slot must be consumed
04779                 if ( pos != nFeat - ( int ) addConstantOne )
04780                 {
04781                     cout<<"pos:"<<pos<<" nFeat:"<<nFeat<<endl;
04782                     assert ( false );
04783                 }
04784             }
04785             memset ( buf0, 0, bufSize );
04786         }
04787         f.close();
04788 
04789         // check for NANs or INFs or too large numbers
04790         for ( int i=0;i<nLines*nFeat;i++ )
04791             if ( isnan ( data[i] ) || isinf ( data[i] ) || data[i]>1e10 || data[i]<-1e10 )
04792             {
04793                 cout<<"data["<<i<<"]:"<<data[i]<<endl;
04794                 assert ( false );
04795             }
              // NOTE(review): labels is int*, so presumably only the labels[i]<0
              // test can ever fire here; confirm the isnan/isinf calls are intended.
04796         for ( int i=0;i<nLines;i++ )
04797             if ( isnan ( labels[i] ) || isinf ( labels[i] ) || labels[i]<0 )
04798             {
04799                 cout<<"labels["<<i<<"]:"<<labels[i]<<endl;
04800                 assert ( false );
04801             }
04802 
04803     }
04804 
04805 }

void DatasetReader::makeNumericTrainAndTestTargets ( int  nClass,
int  nTrain,
int  nTest,
REAL  positiveTarget,
REAL  negativeTarget,
int *  trainLabel,
int *  testLabel,
REAL *&  trainTarget,
REAL *&  testTarget 
)

Makes the numeric train and test target vectors from the class labels.

Definition at line 4889 of file DatasetReader.cpp.

04890 {
          // Builds one row of nClass target values per sample: every entry is
          // negativeTarget except the entry at the sample's label index, which
          // is positiveTarget. Both output arrays are allocated here with new[]
          // and handed back through the pointer references; nothing in this
          // body frees them (presumably the caller owns them - TODO confirm).
          // NOTE(review): trainLabel[i] / testLabel[i] are used as a write index
          // without a check that they lie in [0, nClass).
04891     // train targets
04892     trainTarget = new REAL[nClass*nTrain];
04893     for ( int i=0;i<nTrain;i++ )
04894     {
04895         for ( int j=0;j<nClass;j++ )
04896             trainTarget[i*nClass + j] = negativeTarget;  // negative class labels
04897         trainTarget[i*nClass + trainLabel[i]] = positiveTarget;  // positive class label
04898     }
04899 
04900     // test targets
04901     testTarget = new REAL[nClass*nTest];
04902     for ( int i=0;i<nTest;i++ )
04903     {
04904         for ( int j=0;j<nClass;j++ )
04905             testTarget[i*nClass + j] = negativeTarget;  // negative class labels
04906         testTarget[i*nClass + testLabel[i]] = positiveTarget;  // positive class label
04907     }
04908 
          // Sanity check on what was just written: prints the offending entry
          // and asserts if a value is NaN, Inf or outside [-1e10, 1e10].
04909     // check for NANs or INFs or too large numbers
04910     for ( int i=0;i<nTrain*nClass;i++ )
04911         if ( isnan ( trainTarget[i] ) || isinf ( trainTarget[i] ) || trainTarget[i]>1e10 || trainTarget[i]<-1e10 )
04912         {
04913             cout<<"trainTarget["<<i<<"]:"<<trainTarget[i]<<endl;
04914             assert ( false );
04915         }
04916 
04917     for ( int i=0;i<nTest*nClass;i++ )
04918         if ( isnan ( testTarget[i] ) || isinf ( testTarget[i] ) || testTarget[i]>1e10 || testTarget[i]<-1e10 )
04919         {
04920             cout<<"testTarget["<<i<<"]:"<<testTarget[i]<<endl;
04921             assert ( false );
04922         }
04923 
04924 }

void DatasetReader::readADULT ( string  path,
REAL *&  train,
REAL *&  trainTarget,
int *&  trainLabel,
REAL *&  test,
REAL *&  testTarget,
int *&  testLabel,
uint &  nTrain,
uint &  nTest,
int &  nClass,
int &  nDomain,
int &  nFeat,
REAL  positiveTarget = 1.0,
REAL  negativeTarget = -1.0 
)

Reads the ADULT dataset (UCI). Files: adult.data (3974305 bytes), adult.names (5229 bytes), adult.test (2003153 bytes).

Definition at line 3266 of file DatasetReader.cpp.

03267 {
          // Loads <path>/adult.data as the train set and <path>/adult.test as the
          // test set. For each set getDataBounds is called twice: once to obtain
          // the sizes (nFeat, nClass, line count), then again with fillData=true
          // after the buffers have been allocated here with new[].
03268     cout<<"Read ADULT from: "<<path<<endl;
03269     nDomain = 1;
03270 
03271     // define data type and files
          // 15 columns; column 15 is the target. columnType holds one 'n'
          // (numeric) or 'd' (discrete) per column, and every column is enabled.
03272     int targetColumn = 15;
03273     char columnType[] = "ndndndddddnnndd";
03274     char enabledCol[] = "111111111111111";
          // NOTE(review): the two strings created with new are never deleted in
          // this body; their c_str() pointers stay valid for that reason. The
          // trailing 0 terminates the list.
03275     const char* dataFiles[] = { ( new string ( path+"/adult.data" ) )->c_str(), ( new string ( path+"/adult.test" ) )->c_str(),0};
03276 
03277     // === TRAIN SET ===
          // file index 0 = adult.data; this call only determines the sizes
03278     getDataBounds ( dataFiles, ", ", nFeat, nClass, nTrain, columnType, enabledCol, targetColumn, 0 );
03279 
03280     // allocate tmp mem
03281     train = new REAL[nTrain * nFeat];
03282     trainLabel = new int[nTrain];
03283 
03284     // fill data
03285     getDataBounds ( dataFiles, ", ", nFeat, nClass, nTrain, columnType, enabledCol, targetColumn, 0, true, train, trainLabel );
03286 
03287 
03288     // === TEST SET ===
          // file index 1 = adult.test; same two-step pattern as for the train set
03289     getDataBounds ( dataFiles, ", ", nFeat, nClass, nTest, columnType, enabledCol, targetColumn, 1 );
03290 
03291     // allocate tmp mem
03292     test = new REAL[nTest * nFeat];
03293     testLabel = new int[nTest];
03294 
03295     // fill data
03296     getDataBounds ( dataFiles, ", ", nFeat, nClass, nTest, columnType, enabledCol, targetColumn, 1, true, test, testLabel );
03297 
03298     // make numerical targets
          // allocates trainTarget / testTarget from the integer labels
03299     makeNumericTrainAndTestTargets ( nClass, nTrain, nTest, positiveTarget, negativeTarget, trainLabel, testLabel, trainTarget, testTarget );
03300 }

void DatasetReader::readARFF ( string  path,
REAL *&  train,
REAL *&  trainTarget,
int *&  trainLabel,
REAL *&  test,
REAL *&  testTarget,
int *&  testLabel,
uint &  nTrain,
uint &  nTest,
int &  nClass,
int &  nDomain,
int &  nFeat,
REAL  positiveTarget = 1.0,
REAL  negativeTarget = -1.0 
)

Reads the data in ARFF format; see http://www.cs.waikato.ac.nz/~ml/weka/arff.html

Definition at line 2083 of file DatasetReader.cpp.

02084 {  // Loads the ARFF train file named in <path>/settings.txt into train/trainTarget/trainLabel; no test set is produced
02085     cout<<"Read ARFF from: "<<path<<endl;
02086     nDomain = 1;
02087     
02088     char* buf = new char[1024*1024];  // line buffer for both getline loops; freed at the end of the function
02089     char del = 0;  // NOTE(review): not used anywhere in this function
02090     string trainName, trainTargetColumn;
02091     
02092     // read the settings file
02093     fstream fSetting(string(path+"/settings.txt").c_str(),ios::in);
02094     while ( fSetting.getline ( buf, 1024*1024 ) )
02095     {
02096         string s = buf;
02097         size_t pos = s.find_first_of('=');
02098         string token = s.substr(0,pos);  // key left of '='; the whole line when there is no '='
02099         cout<<token<<endl;
02100         if(token == "trainTargetColumn")
02101             trainTargetColumn = s.substr(pos+1);
02102         else if(token == "train")
02103             trainName = s.substr(pos+1);
02104     }
02105     fSetting.close();
02106     
02107     if(trainName=="" || trainTargetColumn=="")  // both settings keys are mandatory
02108         assert(false);  // NOTE(review): has no effect when compiled with NDEBUG; execution would continue with empty names
02109     
02110     // read training set
02111     fstream fTrain(string(path+"/"+trainName).c_str(),ios::in);  // NOTE(review): open result is not checked
02112     vector<vector<REAL> > targets;  // one target vector per data row
02113     vector<vector<REAL> > features;  // one feature vector per data row
02114     vector<string> featureNames;  // attribute names in declaration order
02115     vector<map<string,int> > featureValues;  // per attribute: category -> id; an empty map marks a real-valued attribute
02116     bool dataMode = false;
02117     while ( fTrain.getline ( buf, 1024*1024 ) )
02118     {
02119         string s = buf;  // the line
02120         
02121         if(s.length() == 0)  // no empty lines
02122             continue;
02123         if(s[0] == '%')  // skip comments
02124             continue;
02125         if(s[0] == '@')  // control sign
02126         {
02127             dataMode = false;  // any '@' line leaves data mode; only @data re-enters it below
02128             size_t spacePos0 = s.find_first_of(' ');
02129             string token = s.substr(0,spacePos0);  // token from beginning
02130             
02131             if(token == "@relation" || token == "@RELATION")  // only all-lower and all-upper spellings are recognised
02132                 cout<<"Dataset name:"<<s.substr(spacePos0+1)<<endl;
02133             else if(token == "@attribute" || token == "@ATTRIBUTE")
02134             {
02135                 // @attribute 'family' {'?','GB','GK','GS','TN','ZA','ZF','ZH','ZM','ZS'}
02136                 size_t spacePos1 = s.find_first_of(" \t", spacePos0+1);
02137                 string featureName = s.substr(spacePos0+1,spacePos1-spacePos0-1);  // name is stored verbatim, including any quotes
02138                 featureNames.push_back(featureName);
02139                 
02140                 map<string,int> values;
02141                 size_t curlyPos0 = s.find_first_of('{', spacePos1+1);
02142                 size_t curlyPos1 = s.find_first_of('}', spacePos1+1);
02143                 size_t pos = curlyPos0+1;  // only used inside the npos-checked branch below
02144                 if(curlyPos0 != string::npos && curlyPos1 != string::npos)
02145                 {
02146                     while(pos < s.length())
02147                     {
02148                         size_t delPos = s.find_first_of(',',pos);
02149                         if(delPos==string::npos)
02150                             delPos = curlyPos1;
02151                         string feature = s.substr(pos,delPos-pos);
02152                         while(*(feature.begin()) == ' ')  // remove leading spaces; NOTE(review): begin() is dereferenced even when feature is empty
02153                             feature = feature.substr(1);
02154                         if(feature.length() > 0)
02155                             while(feature[feature.length()-1] == ' ')  // remove ending spaces
02156                             {
02157                                 feature = feature.substr(0,feature.length()-1);
02158                                 if(feature.length() == 0)
02159                                     break;
02160                             }
02161                         if(feature.length() > 0)
02162                             values[feature] = values.size();  // assign new id; NOTE(review): operand evaluation order is unspecified before C++17, so size() may be read before or after the insertion
02163                         pos += feature.length()+1;  // NOTE(review): advances by the trimmed length, so a token that had surrounding spaces is partly re-scanned
02164                     }
02165                 }
02166                 featureValues.push_back(values);  // push empty map when having a "real" attribute
02167             }
02168             else if(token == "@data" || token == "@DATA")
02169                 dataMode = true;
02170         }
02171         else if(dataMode)
02172         {
02173             // '?','C','A',8,0,'?','S','?',0,'?','?','G','?','?','?','?','?','?','?','?','?','?','?','?','?','?','?','?','?','?','?','COIL',0.7,610,0,'?','0','?','3'
02174             //cout<<featureValues.size()<<endl;
02175             size_t pos = 0;
02176             uint valueCnt = 0;  // index of the attribute the current value belongs to
02177             vector<REAL> feature;  // expanded feature row built from this line
02178             while(pos < s.length())
02179             {
02180                 size_t delPos = s.find_first_of(',',pos);
02181                 if(delPos==string::npos)
02182                     delPos = s.length();
02183                 string value = s.substr(pos,delPos-pos);
02184                 //cout<<value<<" "<<featureNames[valueCnt]<<endl;
02185                 if(featureValues[valueCnt].size() == 0)  // check for real-valued attribute; NOTE(review): valueCnt is not bounds-checked against the attribute count
02186                 {
02187                     if(featureNames[valueCnt] == trainTargetColumn)
02188                     {
02189                         vector<REAL> target;
02190                         target.push_back(atof(value.c_str()));  // real-valued target: a single entry
02191                         targets.push_back(target);
02192                     }
02193                     else
02194                         feature.push_back(atof(value.c_str()));
02195                 }
02196                 else  // categorical type
02197                 {
02198                     uint catSize = featureValues[valueCnt].size();
02199                     if(featureNames[valueCnt] == trainTargetColumn)
02200                     {
02201                         vector<REAL> target;
02202                         map<string,int>::iterator it = featureValues[valueCnt].find(value);  // NOTE(review): unlike the feature branch below, the result is not checked against end()
02203                         for(int i=0;i<catSize;i++)
02204                             target.push_back(negativeTarget);
02205                         uint catPos = it->second;
02206                         target[catPos] = positiveTarget;  // one entry per category, positiveTarget at the matching id
02207                         targets.push_back(target);
02208                     }
02209                     else
02210                     {
02211                         map<string,int>::iterator it = featureValues[valueCnt].find(value);
02212                         if(it == featureValues[valueCnt].end())  // value was not declared in the @attribute line
02213                             assert(false);
02214                         for(int i=0;i<catSize;i++)
02215                             feature.push_back(-1.0);  // init with negative
02216                         uint catPos = it->second;
02217                         feature[feature.size()-catSize+catPos] = 1.0;  // set the matching slot inside the catSize entries just appended
02218                     }
02219                 }
02220                 valueCnt++;
02221                 pos += value.length()+1;
02222             }
02223             features.push_back(feature);
02224         }
02225     }
02226     fTrain.close();
02227     
02228     assert(features.size() == targets.size());  // every data row must have contributed exactly one target
02229     
02230     // print a short summary
02231     nTrain = features.size();
02232     nFeat = features[0].size();  // NOTE(review): reads element 0 even when no data rows were parsed
02233     nClass = targets[0].size();  // 1 for a real-valued target, number of categories otherwise
02234     cout<<"nTrain:"<<nTrain<<" nFeat:"<<nFeat<<" nClass:"<<nClass<<" nFeatureNames:"<<featureNames.size()<<endl;
02235     for(int i=0;i<featureNames.size();i++)
02236     {
02237         cout<<"name:"<<featureNames[i]<<" ";
02238         if(featureValues[i].size() == 0)
02239             cout<<"[REAL]";
02240         else
02241         {
02242             for(map<string,int>::iterator it = featureValues[i].begin();it != featureValues[i].end(); it++)
02243                 cout<<"\""<<it->first<<"\" ";
02244         }
02245         cout<<endl;
02246     }
02247     
02248     // allocate + fill train data
02249     train = new REAL[nFeat*nTrain];  // row-major: train[i*nFeat+j]
02250     trainTarget = new REAL[nClass*nTrain];
02251     trainLabel = nClass > 1 ? new int[nTrain] : 0;  // labels are only produced when there is more than one class
02252     
02253     for(int i=0;i<nTrain;i++)
02254     {
02255         for(int j=0;j<nFeat;j++)
02256             train[i*nFeat+j] = features[i][j];  // NOTE(review): assumes every row has at least nFeat entries (row 0 defines nFeat)
02257         for(int j=0;j<nClass;j++)
02258         {
02259             trainTarget[i*nClass+j] = targets[i][j];
02260             if(targets[i][j] == positiveTarget && nClass > 1)  // the label is the index of the entry holding positiveTarget
02261                 trainLabel[i] = j;
02262         }
02263     }
02264     
02265     // no test set
02266     test = 0;
02267     testTarget = 0;
02268     testLabel = 0;
02269     nTest = 0;
02270     
02271     delete[] buf;
02272 }

void DatasetReader::readAusDM2009 ( string  path,
REAL *&  train,
REAL *&  trainTarget,
int *&  trainLabel,
REAL *&  test,
REAL *&  testTarget,
int *&  testLabel,
uint &  nTrain,
uint &  nTest,
int &  nClass,
int &  nDomain,
int &  nFeat,
REAL  positiveTarget = 1.0,
REAL  negativeTarget = -1.0 
)

AusDM2009 competition (http://www.tiberius.biz/ausdm09/). Example files (sizes in bytes, all dated Sep 25 20:18): S_AUC_Score.csv (15136906), S_AUC_Train.csv (15131946), S_RMSE_Score.csv (15137106), S_RMSE_Train.csv (15171000). The listing below reads the L_* variants of these files.

Definition at line 2510 of file DatasetReader.cpp.

02511 {  // Reads the AusDM2009 "L" train/score CSV pair: RMSE files by default, AUC files when Framework::getDatasetType() == 1
02512     cout<<"Read AusDM2009 from: "<<path<<endl;
02513     //string nameTrain = "S_RMSE_Train.csv";
02514     //string nameTest = "S_RMSE_Score.csv";
02515     //string nameTrain = "M_RMSE_Train.csv";
02516     //string nameTest = "M_RMSE_Score.csv";
02517     string nameTrain = "L_RMSE_Train.csv";
02518     string nameTest = "L_RMSE_Score.csv";
02519     nClass = 1;
02520     bool addConstantOne = true;  // append a constant 1.0 as the last feature of every row
02521 
02522     if ( Framework::getDatasetType() == 1 ) // is classification
02523     {
02524         //nameTrain = "S_AUC_Train.csv";
02525         //nameTest = "S_AUC_Score.csv";
02526         //nameTrain = "M_AUC_Train.csv";
02527         //nameTest = "M_AUC_Score.csv";
02528         nameTrain = "L_AUC_Train.csv";
02529         nameTest = "L_AUC_Score.csv";
02530         nClass = 2;
02531     }
02532 
02533     cout<<"nameTrain:"<<nameTrain<<" nameTest:"<<nameTest<<endl;
02534 
02535     int bufSize = 1024*1024;
02536     char *buf = new char[bufSize];  // line buffer for every pass; freed at the end of the function
02537 
02538     nDomain = 1;
02539 
02540     fstream fTrainRMSE;
02541     fstream fTestRMSE;
02542 
02543     // determine #cols
02544     fTrainRMSE.open ( ( path+"/"+nameTrain ).c_str(), ios::in );
02545     fTrainRMSE.getline ( buf, bufSize );  // first line is skipped in every pass
02546     fTrainRMSE.getline ( buf, bufSize );  // second line is used to count the columns
02547     nFeat = 0;
02548     char *ptr = buf, *ptrLast = buf;
02549     int pos = 0, val, colCnt = 0;
02550     while ( ptr[pos] )
02551     {
02552         if ( ptr[pos] == ',' || ptr[pos+1] == 0 )  // a column ends at a comma or at the last character of the line
02553         {
02554             sscanf ( ptrLast,"%d",&val );
02555             ptrLast = ptr + pos + 1;
02556             colCnt++;
02557             if ( colCnt > 2 )  // columns 3.. are features; column 2 is the target, column 1 is not used
02558                 nFeat++;
02559         }
02560         pos++;
02561     }
02562     fTrainRMSE.close();
02563 
02564     if ( addConstantOne )
02565         nFeat++;  // constant one
02566 
02567     // determine #rows train
02568     fTrainRMSE.open ( ( path+"/"+nameTrain ).c_str(), ios::in );
02569     fTrainRMSE.getline ( buf, bufSize );
02570     nTrain = 0;
02571     while ( fTrainRMSE.getline ( buf, bufSize ) )
02572         nTrain++;
02573     fTrainRMSE.close();
02574 
02575     // determine #rows test
02576     fTestRMSE.open ( ( path+"/"+nameTest ).c_str(), ios::in );
02577     fTestRMSE.getline ( buf, bufSize );
02578     nTest = 0;
02579     while ( fTestRMSE.getline ( buf, bufSize ) )
02580         nTest++;
02581     fTestRMSE.close();
02582 
02583     // alloc mem
02584     train = new REAL[nFeat*nTrain];
02585     test = new REAL[nFeat*nTest];
02586     if ( Framework::getDatasetType() == 1 )
02587     {
02588         trainTarget = new REAL[nTrain*2];  // two target entries per row in classification mode
02589         trainLabel = new int[nTrain];
02590         testTarget = new REAL[nTest*2];
02591         testLabel = new int[nTest];
02592     }
02593     else
02594     {
02595         trainTarget = new REAL[nTrain];
02596         trainLabel = 0;
02597         testTarget = new REAL[nTest];
02598         testLabel = 0;
02599     }
02600 
02601     // read train
02602     fTrainRMSE.open ( ( path+"/"+nameTrain ).c_str(), ios::in );
02603     fTrainRMSE.getline ( buf, bufSize );
02604     nTrain = 0;  // reused as the running row index
02605     while ( fTrainRMSE.getline ( buf, bufSize ) )
02606     {
02607         ptr = buf;
02608         ptrLast = buf;
02609         pos = 0;
02610         colCnt = 0;
02611         while ( ptr[pos] )
02612         {
02613             if ( ptr[pos] == ',' || ptr[pos+1] == 0 )
02614             {
02615                 sscanf ( ptrLast,"%d",&val );
02616                 ptrLast = ptr + pos + 1;
02617                 colCnt++;
02618                 if ( colCnt == 2 )
02619                 {
02620                     if ( Framework::getDatasetType() == 1 )
02621                     {
02622                         trainLabel[nTrain] = val>0? 0 : 1;  // positive value -> class 0, otherwise class 1
02623                         trainTarget[2*nTrain+0] = val>0? positiveTarget : negativeTarget;
02624                         trainTarget[2*nTrain+1] = val>0? negativeTarget : positiveTarget;
02625                     }
02626                     else
02627                         trainTarget[nTrain] = ( REAL ) val * 0.001;  // integer value scaled by 0.001
02628                     //trainTarget[nTrain] = (REAL)val;
02629                 }
02630                 if ( colCnt > 2 )
02631                     train[nTrain*nFeat+colCnt-3] = ( REAL ) val * 0.001;
02632                 //train[nTrain*nFeat+colCnt-3] = (REAL)val;
02633             }
02634             pos++;
02635         }
02636         if ( ( colCnt-3 != nFeat-1 && addConstantOne == false ) || ( colCnt-3 != nFeat-2 && addConstantOne == true ) )  // every row must have the column count derived above
02637         {
02638             cout<<"colCnt:"<<colCnt<<" nFeat:"<<nFeat<<" addConstantOne:"<<addConstantOne<<endl;
02639             assert ( false );
02640         }
02641         if ( addConstantOne )
02642             train[nTrain*nFeat+nFeat-1] = 1.0;
02643         nTrain++;
02644     }
02645     fTrainRMSE.close();
02646 
02647     // read test
02648     fTestRMSE.open ( ( path+"/"+nameTest ).c_str(), ios::in );
02649     fTestRMSE.getline ( buf, bufSize );
02650     nTest = 0;  // reused as the running row index
02651     while ( fTestRMSE.getline ( buf, bufSize ) )
02652     {
02653         ptr = buf;
02654         ptrLast = buf;
02655         pos = 0;
02656         colCnt = 0;
02657         while ( ptr[pos] )
02658         {
02659             if ( ptr[pos] == ',' || ptr[pos+1] == 0 )
02660             {
02661                 sscanf ( ptrLast,"%d",&val );
02662                 ptrLast = ptr + pos + 1;
02663                 colCnt++;
02664                 if ( colCnt == 2 ) {  // fix: take the target from column 2 only, mirroring the train loop (it was overwritten by every column)
02665                 if ( Framework::getDatasetType() == 1 )
02666                 {
02667                     testLabel[nTest] = val>0? 0 : 1;  // fix: was written to testTarget[nTest], leaving testLabel unset and clobbering an earlier row's target
02668                     testTarget[2*nTest+0] = val>0? positiveTarget : negativeTarget;
02669                     testTarget[2*nTest+1] = val>0? negativeTarget : positiveTarget;
02670                 }
02671                 else
02672                     testTarget[nTest] = ( REAL ) val * 0.001;
02673                 }  // end of target column handling
02674                 if ( colCnt > 2 )
02675                     test[nTest*nFeat+colCnt-3] = ( REAL ) val * 0.001;
02676                 //test[nTest*nFeat+colCnt-3] = (REAL)val;
02677             }
02678             pos++;
02679         }
02680         if ( ( colCnt-3 != nFeat-1 && addConstantOne == false ) || ( colCnt-3 != nFeat-2 && addConstantOne == true ) )  // test rows must have the same column count as the train file
02681         {
02682             cout<<"colCnt:"<<colCnt<<" nFeat:"<<nFeat<<" addConstantOne:"<<addConstantOne<<endl;
02683             assert ( false );
02684         }
02685         if ( addConstantOne )
02686             test[nTest*nFeat+nFeat-1] = 1.0;
02687         nTest++;
02688     }
02689     fTestRMSE.close();
02690 
02691     /*
02692     // random subspace idea
02693     REAL subspace = 0.45;
02694     //REAL subspace = 1.0;
02695     bool* subspaceBit = new bool[nFeat];
02696     if(Framework::getFrameworkMode() == 0 && subspace < 1.0)  // training
02697     {
02698         //srand(time(0));
02699         cout<<"Create a random subspace:"<<subspace<<endl;
02700         fstream f((path+"/subspace.txt").c_str(), ios::out);
02701         for(int i=0;i<nFeat;i++)
02702         {
02703             subspaceBit[i] = (double)rand()/(double)RAND_MAX < subspace? true : false;
02704             subspaceBit[nFeat-1] = true;
02705             f<<(int)subspaceBit[i]<<endl;
02706             cout<<(int)subspaceBit[i]<<" ";
02707         }
02708         cout<<endl;
02709         f.close();
02710     }
02711     else if(subspace < 1.0) // prediction
02712     {
02713         cout<<"Read the random subspace"<<endl;
02714         fstream f((path+"/subspace.txt").c_str(), ios::in);
02715         for(int i=0;i<nFeat;i++)
02716         {
02717             f>>subspaceBit[i];
02718             cout<<(int)subspaceBit[i]<<" ";
02719         }
02720         cout<<endl;
02721         f.close();
02722     }
02723 
02724     if(subspace < 1.0)
02725     {
02726         int nFeatNew = 0;
02727         for(int i=0;i<nFeat;i++)
02728             nFeatNew += subspaceBit[i];
02729         cout<<"nFeatNew:"<<nFeatNew<<endl;
02730 
02731         REAL* trainNew = new REAL[nFeatNew*nTrain];
02732         REAL* testNew = new REAL[nFeatNew*nTest];
02733 
02734         for(int i=0;i<nTrain;i++)
02735         {
02736             int cnt = 0;
02737             for(int j=0;j<nFeat;j++)
02738             {
02739                 if(subspaceBit[j])
02740                 {
02741                     trainNew[cnt + i * nFeatNew] = train[j + i * nFeat];
02742                     cnt++;
02743                 }
02744             }
02745         }
02746 
02747         for(int i=0;i<nTest;i++)
02748         {
02749             int cnt = 0;
02750             for(int j=0;j<nFeat;j++)
02751             {
02752                 if(subspaceBit[j])
02753                 {
02754                     testNew[cnt + i * nFeatNew] = test[j + i * nFeat];
02755                     cnt++;
02756                 }
02757             }
02758         }
02759 
02760         delete[] train;
02761         delete[] test;
02762         train = trainNew;
02763         test = testNew;
02764         nFeat = nFeatNew;
02765     }
02766     */
02767     delete[] buf;
02768 }

void DatasetReader::readAUSTRALIAN ( string  path,
REAL *&  train,
REAL *&  trainTarget,
int *&  trainLabel,
REAL *&  test,
REAL *&  testTarget,
int *&  testLabel,
uint &  nTrain,
uint &  nTest,
int &  nClass,
int &  nDomain,
int &  nFeat,
REAL  positiveTarget = 1.0,
REAL  negativeTarget = -1.0 
)

Reads the AUSTRALIAN dataset (UCI): 9 folders with 95 different signs in total, 27 samples per sign.

Definition at line 3307 of file DatasetReader.cpp.

03308 {  // Reads every <path>/<dir>/<sign>-<1..3>.tsd file in two passes (size, then fill) and splits the result with splitRandomTestset (percentTest 0.2)
03309     cout<<"Read AUSTRALIAN from: "<<path<<endl;
03310     nDomain = 1;
03311 
03312     char* dirs[] = {"tctodd1","tctodd2","tctodd3","tctodd4","tctodd5","tctodd6","tctodd7","tctodd8","tctodd9",0};  // null-terminated list of sub-directories
03313 
03314     char* signs[] = {"alive","all","answer","boy","building","buy","change_mind_","cold","come","computer_PC_","cost","crazy","danger","deaf","different","draw","drink","eat","exit","flash-light","forget","girl","give","glove","go","God","happy","head","hear","hello","his_hers","hot","how","hurry","hurt","I","innocent","is_true_","joke","juice","know","later","lose","love","make","man","maybe","mine","money","more","name","no","Norway","not-my-problem","paper","pen","please","polite","question","read","ready","research","responsible","right","sad","same","science","share","shop","soon","sorry","spend","stubborn","surprise","take","temper","thank","think","tray","us","voluntary","wait_notyet_","what","when","where","which","who","why","wild","will","write","wrong","yes","you","zero",0};  // null-terminated; the index into this array becomes the class label
03315 
03316     nClass = 0;
03317     while ( signs[nClass] )  // count entries up to the terminating 0
03318         nClass++;
03319 
03320     cout<<"nClass:"<<nClass<<endl;
03321 
03322     fstream fTrain;
03323 
03324     // get data bounds
03325     int nTrainTmp = 0;  // number of files that could be opened
03326     int dirCnt = 0;
03327     char buf[10000];  // holds the file path first, then each line read from that file
03328     int maxFrames = 0;  // largest line count seen in any file
03329     int dataPerLine = 22;  // every line must contain exactly this many numbers
03330     while ( dirs[dirCnt] )
03331     {
03332         int signCnt = 0;
03333         while ( signs[signCnt] )
03334         {
03335             for ( int i=0;i<3;i++ )  // three files per sign and directory, numbered 1..3
03336             {
03337                 sprintf ( buf,"%s/%s/%s-%d.tsd",path.c_str(),dirs[dirCnt],signs[signCnt],i+1 );  // NOTE(review): unbounded write into buf; relies on path being short enough
03338                 fTrain.open ( buf, ios::in );
03339                 if ( fTrain.is_open() == false )
03340                     cout<<"Can not open "<<buf<<endl;  // missing files are reported and skipped
03341                 else
03342                 {
03343                     int lines = 0;
03344                     while ( fTrain.getline ( buf, 10000 ) ) // read all lines
03345                     {
03346                         stringstream ss ( buf );
03347                         REAL r;
03348                         int cnt = 0;
03349                         while ( ss>>r )
03350                             cnt++;
03351                         if ( cnt != dataPerLine )
03352                             assert ( false );
03353                         lines++;
03354                     }
03355                     if ( lines > maxFrames )
03356                         maxFrames = lines;
03357                     nTrainTmp++;
03358                 }
03359                 fTrain.close();
03360             }
03361             signCnt++;
03362         }
03363         dirCnt++;
03364     }
03365 
03366     cout<<"nTrainTmp:"<<nTrainTmp<<endl;
03367     cout<<"maxFrames:"<<maxFrames<<endl;
03368 
03369     nFeat = maxFrames * dataPerLine;  // each sample is the file's lines laid out one after another
03370     cout<<"nFeat:"<<nFeat<<" ("<<maxFrames<<"*"<<dataPerLine<<")"<<endl;
03371 
03372     // allocate tmp mem
03373     REAL* trainTmp = new REAL[nTrainTmp * nFeat];
03374     int* trainLabelTmp = new int[nTrainTmp];
03375     for ( int i=0;i<nTrainTmp * nFeat;i++ )
03376         trainTmp[i] = 0.0;  // files with fewer than maxFrames lines keep zeros in the remaining slots
03377     for ( int i=0;i<nTrainTmp;i++ )
03378         trainLabelTmp[i] = 0;
03379 
03380     // fill data
03381     nTrainTmp = 0;  // reused as the running sample index; same traversal order as the first pass
03382     dirCnt = 0;
03383     while ( dirs[dirCnt] )
03384     {
03385         int signCnt = 0;
03386         while ( signs[signCnt] )
03387         {
03388             for ( int i=0;i<3;i++ )
03389             {
03390                 sprintf ( buf,"%s/%s/%s-%d.tsd",path.c_str(),dirs[dirCnt],signs[signCnt],i+1 );
03391                 fTrain.open ( buf, ios::in );
03392                 if ( fTrain.is_open() == false )
03393                     cout<<"Can not open "<<buf<<endl;
03394                 else
03395                 {
03396                     int lines = 0;
03397                     while ( fTrain.getline ( buf, 10000 ) ) // read all lines
03398                     {
03399                         stringstream ss ( buf );
03400                         REAL r;
03401                         int cnt = 0;
03402                         while ( ss>>r )
03403                         {
03404                             trainTmp[nTrainTmp * nFeat + lines * dataPerLine + cnt] = r;  // NOTE(review): written before the cnt != dataPerLine check below
03405                             trainLabelTmp[nTrainTmp] = signCnt;
03406                             cnt++;
03407                         }
03408                         if ( cnt != dataPerLine )
03409                             assert ( false );
03410                         lines++;
03411                     }
03412                     nTrainTmp++;
03413                 }
03414                 fTrain.close();
03415             }
03416             signCnt++;
03417         }
03418         dirCnt++;
03419     }
03420 
03421     // split train and testset from trainTmp
03422     splitRandomTestset ( 0.2, trainTmp, trainLabelTmp, nTrainTmp, nFeat, nClass, train, trainLabel, trainTarget, test, testLabel, testTarget, nTrain, nTest, positiveTarget, negativeTarget );
03423 
03424     delete[] trainTmp;  // temporaries are released here; presumably splitRandomTestset copied what it needs - confirm
03425     delete[] trainLabelTmp;
03426 
03427 }

void DatasetReader::readAUSTRALIANCREDIT ( string  path,
REAL *&  train,
REAL *&  trainTarget,
int *&  trainLabel,
REAL *&  test,
REAL *&  testTarget,
int *&  testLabel,
uint &  nTrain,
uint &  nTest,
int &  nClass,
int &  nDomain,
int &  nFeat,
REAL  positiveTarget = 1.0,
REAL  negativeTarget = -1.0 
)

Reads the AUSTRALIAN-CREDIT dataset (UCI)

Files: australian.dat (28735 bytes), australian.doc (2467 bytes).

Definition at line 3553 of file DatasetReader.cpp.

03554 {  // Loads <path>/australian.dat through getDataBounds and splits off a test set with splitRandomTestset (percentTest 0.2)
03555     cout<<"Read AUSTRALIAN-CREDIT from: "<<path<<endl;
03556     nDomain = 1;
03557 
03558     // define data type and files
03559     int targetColumn = 15;
03560     uint nTrainTmp;  // receives the line count from getDataBounds
03561     char columnType[] = "dnndddnddnddnnd";  // one character per column; the meaning of 'd'/'n' is defined by getDataBounds (not shown here)
03562     char enabledCol[] = "111111111111111";  // one flag per column, all set to '1'
03563     const char* dataFiles[] = { ( new string ( path+"/australian.dat" ) )->c_str(),0};  // NOTE(review): the heap string is never deleted; it is what keeps the c_str() pointer valid
03564 
03565     // === TRAIN SET ===
03566     getDataBounds ( dataFiles, " ", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0 );  // first call, fillData left at its default (false): expected to report nFeat, nClass and nTrainTmp
03567 
03568     // allocate tmp mem
03569     REAL* trainTmp = new REAL[nTrainTmp * nFeat];
03570     int* trainLabelTmp = new int[nTrainTmp];
03571 
03572     // fill data
03573     getDataBounds ( dataFiles, " ", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0, true, trainTmp, trainLabelTmp );  // second call with fillData=true and the buffers to fill
03574 
03575     // split train and testset from trainTmp
03576     splitRandomTestset ( 0.2, trainTmp, trainLabelTmp, nTrainTmp, nFeat, nClass, train, trainLabel, trainTarget, test, testLabel, testTarget, nTrain, nTest, positiveTarget, negativeTarget );
03577 
03578     delete[] trainTmp;  // temporaries are released here; presumably splitRandomTestset copied what it needs - confirm
03579     delete[] trainLabelTmp;
03580 }

void DatasetReader::readBALANCE ( string  path,
REAL *&  train,
REAL *&  trainTarget,
int *&  trainLabel,
REAL *&  test,
REAL *&  testTarget,
int *&  testLabel,
uint &  nTrain,
uint &  nTest,
int &  nClass,
int &  nDomain,
int &  nFeat,
REAL  positiveTarget = 1.0,
REAL  negativeTarget = -1.0 
)

Reads the BALANCE dataset (UCI)

  • 6250Bytes balance-scale.data
  • 2222Bytes balance-scale.names

Definition at line 3436 of file DatasetReader.cpp.

03437 {  // Loads <path>/balance-scale.data through getDataBounds and splits off a test set with splitRandomTestset (percentTest 0.2)
03438     cout<<"Read BALANCE from: "<<path<<endl;
03439     nDomain = 1;
03440 
03441     // define data type and files
03442     int targetColumn = 1;  // here the target is the first column
03443     uint nTrainTmp;  // receives the line count from getDataBounds
03444     char columnType[] = "dnnnn";  // one character per column; the meaning of 'd'/'n' is defined by getDataBounds (not shown here)
03445     char enabledCol[] = "11111";  // one flag per column, all set to '1'
03446     const char* dataFiles[] = { ( new string ( path+"/balance-scale.data" ) )->c_str(),0};  // NOTE(review): the heap string is never deleted; it is what keeps the c_str() pointer valid
03447 
03448     // === TRAIN SET ===
03449     getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0 );  // first call, fillData left at its default (false): expected to report nFeat, nClass and nTrainTmp
03450 
03451     // allocate tmp mem
03452     REAL* trainTmp = new REAL[nTrainTmp * nFeat];
03453     int* trainLabelTmp = new int[nTrainTmp];
03454 
03455     // fill data
03456     getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0, true, trainTmp, trainLabelTmp );  // second call with fillData=true and the buffers to fill
03457 
03458     // split train and testset from trainTmp
03459     splitRandomTestset ( 0.2, trainTmp, trainLabelTmp, nTrainTmp, nFeat, nClass, train, trainLabel, trainTarget, test, testLabel, testTarget, nTrain, nTest, positiveTarget, negativeTarget );
03460 
03461     delete[] trainTmp;  // temporaries are released here; presumably splitRandomTestset copied what it needs - confirm
03462     delete[] trainLabelTmp;
03463 
03464 }

void DatasetReader::readBINARY ( string  path,
REAL *&  train,
REAL *&  trainTarget,
int *&  trainLabel,
REAL *&  test,
REAL *&  testTarget,
int *&  testLabel,
uint &  nTrain,
uint &  nTest,
int &  nClass,
int &  nDomain,
int &  nFeat,
REAL  positiveTarget = 1.0,
REAL  negativeTarget = -1.0 
)

Reads a binary dataset format (result from feature selection). Limited to classification datasets only.

Format: [nExamples (4-byte INT), nClass (4-byte INT), nDomain (4-byte INT), nFeat (4-byte INT), features (nExamples*nFeat REAL values), labels (nExamples*nDomain INT values)]

  • binary.train
  • binary.test

Definition at line 1818 of file DatasetReader.cpp.

01819 {  // Reads <path>/binary.test when Framework::getFrameworkMode() == 1, otherwise <path>/binary.train, and returns it as the test or train set respectively
01820     REAL* feat, *target;
01821     int* label, N;  // N: number of examples stored in the file
01822 
01823     fstream f;
01824     if ( Framework::getFrameworkMode() == 1 )
01825         f.open ( ( path+"/binary.test" ).c_str(), ios::in );  // NOTE(review): opened without ios::binary although raw bytes are read below; open result is not checked
01826     else
01827         f.open ( ( path+"/binary.train" ).c_str(), ios::in );
01828 
01829     // dataset bounds
01830     f.read ( ( char* ) &N, sizeof ( int ) );  // header: four raw ints in the order N, nClass, nDomain, nFeat
01831     f.read ( ( char* ) &nClass, sizeof ( int ) );
01832     f.read ( ( char* ) &nDomain, sizeof ( int ) );
01833     f.read ( ( char* ) &nFeat, sizeof ( int ) );
01834 
01835     feat = new REAL[N*nFeat];  // NOTE(review): sizes come straight from the file and are not validated
01836     target = new REAL[N*nClass*nDomain];
01837     label = new int[N*nDomain];
01838 
01839     // features and labels
01840     f.read ( ( char* ) feat, sizeof ( REAL ) *N*nFeat );
01841     f.read ( ( char* ) label, sizeof ( int ) *N*nDomain );  // labels are stored as int, one per example and domain
01842     f.close();
01843 
01844     for ( int i=0;i<N;i++ )
01845     {
01846         for ( int j=0;j<nClass*nDomain;j++ )
01847             target[i*nClass*nDomain+j] = negativeTarget;  // start with every class marked negative
01848         for ( int j=0;j<nDomain;j++ )
01849             target[i*nClass*nDomain + j*nClass + label[i*nDomain+j]] = positiveTarget;  // mark the labelled class inside domain j; NOTE(review): the label is not range-checked against nClass
01850     }
01851 
01852     if ( Framework::getFrameworkMode() == 1 )
01853     {
01854         nTest = N;  // hand the buffers over as the test set; the train outputs are zeroed
01855         test = feat;
01856         testTarget = target;
01857         testLabel = label;
01858         train = 0;
01859         trainTarget = 0;
01860         trainLabel = 0;
01861         nTrain = 0;
01862     }
01863     else
01864     {
01865         nTrain = N;  // hand the buffers over as the train set; the test outputs are zeroed
01866         train = feat;
01867         trainTarget = target;
01868         trainLabel = label;
01869         test = 0;
01870         testTarget = 0;
01871         testLabel = 0;
01872         nTest = 0;
01873     }
01874 
01875 }

void DatasetReader::readBREASTCANCERWISCONSIN ( string  path,
REAL *&  train,
REAL *&  trainTarget,
int *&  trainLabel,
REAL *&  test,
REAL *&  testTarget,
int *&  testLabel,
uint &  nTrain,
uint &  nTest,
int &  nClass,
int &  nDomain,
int &  nFeat,
REAL  positiveTarget = 1.0,
REAL  negativeTarget = -1.0 
)

Reads the BREAST-CANCER-WISCONSIN dataset (UCI)

Files: breast-cancer-wisconsin.data (19889 bytes), breast-cancer-wisconsin.names (5657 bytes), unformatted-data (21363 bytes), wdbc.data (124103 bytes), wdbc.names (4708 bytes), wpbc.data (44234 bytes), wpbc.names (5671 bytes).

Definition at line 3515 of file DatasetReader.cpp.

03516 {  // Loads <path>/breast-cancer-wisconsin.data through getDataBounds and splits off a test set with splitRandomTestset (percentTest 0.2)
03517     cout<<"Read BREAST-CANCER-WISCONSIN from: "<<path<<endl;
03518     nDomain = 1;
03519 
03520     // define data type and files
03521     int targetColumn = 11;
03522     uint nTrainTmp;  // receives the line count from getDataBounds
03523     char columnType[] = "nnnnnnnnnnd";  // one character per column; the meaning of 'd'/'n' is defined by getDataBounds (not shown here)
03524     char enabledCol[] = "11111111111";  // one flag per column, all set to '1'
03525     //char columnType[] = "ddddddddddd";
03526     const char* dataFiles[] = { ( new string ( path+"/breast-cancer-wisconsin.data" ) )->c_str(),0};  // NOTE(review): the heap string is never deleted; it is what keeps the c_str() pointer valid
03527 
03528     // === TRAIN SET ===
03529     getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0 );  // first call, fillData left at its default (false): expected to report nFeat, nClass and nTrainTmp
03530 
03531     // allocate tmp mem
03532     REAL* trainTmp = new REAL[nTrainTmp * nFeat];
03533     int* trainLabelTmp = new int[nTrainTmp];
03534 
03535     // fill data
03536     getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0, true, trainTmp, trainLabelTmp );  // second call with fillData=true and the buffers to fill
03537 
03538     // split train and testset from trainTmp
03539     splitRandomTestset ( 0.2, trainTmp, trainLabelTmp, nTrainTmp, nFeat, nClass, train, trainLabel, trainTarget, test, testLabel, testTarget, nTrain, nTest, positiveTarget, negativeTarget );
03540 
03541     delete[] trainTmp;  // temporaries are released here; presumably splitRandomTestset copied what it needs - confirm
03542     delete[] trainLabelTmp;
03543 
03544 }

void DatasetReader::readCSV ( string  path,
REAL *&  train,
REAL *&  trainTarget,
int *&  trainLabel,
REAL *&  test,
REAL *&  testTarget,
int *&  testLabel,
uint &  nTrain,
uint &  nTest,
int &  nClass,
int &  nDomain,
int &  nFeat,
REAL  positiveTarget = 1.0,
REAL  negativeTarget = -1.0 
)

Regression+Classification Reads a csv dataset format, fields separated by delimiter

Definition at line 1882 of file DatasetReader.cpp.

01883 {
01884     cout<<"Read CSV from: "<<path<<endl;
01885     nDomain = 1;
01886     
01887     char* buf = new char[1024*1024];
01888     char del = 0;
01889     int trainTargetColumn = -1;
01890     string trainName, testName;
01891     
01892     // read the settings file
01893     fstream fSetting(string(path+"/settings.txt").c_str(),ios::in);
01894     while ( fSetting.getline ( buf, 1024*1024 ) )
01895     {
01896         string s = buf;
01897         size_t pos = s.find_first_of('=');
01898         string token = s.substr(0,pos);
01899         //cout<<token<<endl;
01900         if(token == "delimiter")
01901             del = buf[pos+1];
01902         else if(token == "trainTargetColumn")
01903             trainTargetColumn = atoi(s.substr(pos+1).c_str());
01904         else if(token == "train")
01905             trainName = s.substr(pos+1);
01906         else if(token == "test")
01907             testName = s.substr(pos+1);
01908     }
01909     fSetting.close();
01910     
01911     // check if available
01912     if(trainTargetColumn == -1 || del == 0 || trainName == "" || (Framework::getFrameworkMode() && testName == ""))
01913         assert(false);
01914     
01915     // read training set
01916     fstream fTrain(string(path+"/"+trainName).c_str(),ios::in);
01917     vector<string> targets;
01918     map<string,int> targetMap;
01919     vector<vector<REAL> > features;
01920     while ( fTrain.getline ( buf, 1024*1024 ) )
01921     {
01922         string s = buf;
01923         size_t lastPos = 0;
01924         vector<REAL> feature;
01925         for(int i=0;i<s.length();i++)
01926         {
01927             if(s[i] == del || i == s.length()-1)
01928             {
01929                 string token = s.substr(lastPos,i-lastPos);  // tokens from beginning
01930                 if(i == s.length()-1)  // the last token in the line
01931                     token = s.substr(lastPos,i-lastPos+1);
01932                 if(feature.size() == trainTargetColumn)  // any value
01933                 {
01934                     targets.push_back(token);
01935                     if(Framework::getDatasetType())
01936                     {
01937                         map<string,int>::iterator it = targetMap.find(token);
01938                         if(it == targetMap.end())
01939                             targetMap[token] = targetMap.size();
01940                     }
01941                 }
01942                 else  // real value
01943                 {
01944                     if((token[0] == '-' || token[0] == '.' || token[0] >= '0' && token[0] <= '9') == 0)  // real value check
01945                         assert(false);
01946                     REAL value = atof(token.c_str());
01947                     //cout<<value<<" ";
01948                     feature.push_back(value);
01949                 }
01950                 lastPos = i+1;
01951             }
01952         }
01953         if(feature.size())
01954             features.push_back(feature);
01955         //cout<<targets[targets.size()-1]<<endl;
01956     }
01957     fTrain.close();
01958     
01959     // count the different targets in a classification problem
01960     nClass = 1;
01961     if(Framework::getDatasetType())
01962     {
01963         nClass = targetMap.size();
01964         map<string,int>::iterator it;
01965         cout<<"Target values: ";
01966         for(it=targetMap.begin();it!=targetMap.end();it++)
01967             cout<<"["<<it->second<<"]"<<it->first<<" ";
01968         cout<<endl;
01969     }
01970     
01971     // assign bounds and allocate mem
01972     nTrain = features.size();
01973     nTest = 0;
01974     nFeat = features[0].size();
01975     train = new REAL[nFeat*nTrain];
01976     trainTarget = new REAL[nClass*nTrain];
01977     if(Framework::getDatasetType())
01978         trainLabel = new int[nTrain];
01979     
01980     // fill train data
01981     for(int i=0;i<nTrain;i++)
01982     {
01983         for(int j=0;j<nFeat;j++)  // fill features
01984             train[i*nFeat+j] = features[i][j];
01985         if(Framework::getDatasetType())  // classification dataset ?
01986         {
01987             int label = targetMap[targets[i]];
01988             trainLabel[i] = label;
01989             for(int j=0;j<nClass;j++)
01990                 trainTarget[i*nClass+j] = (j==label? positiveTarget : negativeTarget);
01991         }
01992         else  // regression dataset
01993         {
01994             REAL target = atof(targets[i].c_str());
01995             trainTarget[i] = target;
01996         }
01997     }
01998     
01999     // read test set
02000     if(Framework::getFrameworkMode())
02001     {
02002         fstream fTest(string(path+"/"+testName).c_str(),ios::in);
02003         targets.clear();
02004         features.clear();
02005         while ( fTest.getline ( buf, 1024*1024 ) )
02006         {
02007             string s = buf;
02008             size_t lastPos = 0;
02009             vector<REAL> feature;
02010             for(int i=0;i<s.length();i++)
02011             {
02012                 if(s[i] == del || i == s.length()-1)
02013                 {
02014                     string token = s.substr(lastPos,i-lastPos);  // tokens from beginning
02015                     if(i == s.length()-1)  // the last token in the line
02016                         token = s.substr(lastPos,i-lastPos+1);
02017                     if(feature.size() == trainTargetColumn)  // any value
02018                     {
02019                         targets.push_back(token);
02020                         if(Framework::getDatasetType())
02021                         {
02022                             map<string,int>::iterator it = targetMap.find(token);
02023                             if(it == targetMap.end())
02024                                 targetMap[token] = targetMap.size();
02025                         }
02026                     }
02027                     else  // real value
02028                     {
02029                         if((token[0] == '-' || token[0] == '.' || token[0] >= '0' && token[0] <= '9') == 0)  // real value check
02030                             assert(false);
02031                         REAL value = atof(token.c_str());
02032                         //cout<<value<<" ";
02033                         feature.push_back(value);
02034                     }
02035                     lastPos = i+1;
02036                 }
02037             }
02038             if(feature.size())
02039                 features.push_back(feature);
02040             //cout<<targets[targets.size()-1]<<endl;
02041         }
02042         fTest.close();
02043         
02044         // assign bounds and allocate mem
02045         nTest = features.size();
02046         test = new REAL[nFeat*nTest];
02047         testTarget = new REAL[nClass*nTest];
02048         if(Framework::getDatasetType())
02049             testLabel = new int[nTrain];
02050         
02051         // fill train data
02052         for(int i=0;i<nTest;i++)
02053         {
02054             for(int j=0;j<nFeat;j++)  // fill features
02055                 test[i*nFeat+j] = features[i][j];
02056             if(targets.size() == features.size())
02057             {
02058                 if(Framework::getDatasetType())  // classification dataset ?
02059                 {
02060                     int label = targetMap[targets[i]];
02061                     testLabel[i] = label;
02062                     for(int j=0;j<nClass;j++)
02063                         testTarget[i*nClass+j] = (j==label? positiveTarget : negativeTarget);
02064                 }
02065                 else  // regression dataset
02066                 {
02067                     REAL target = atof(targets[i].c_str());
02068                     testTarget[i] = target;
02069                 }
02070             }
02071         }
02072         
02073     }
02074     
02075     delete[] buf;
02076 }

void DatasetReader::readCYLINDERBANDS ( string  path,
REAL *&  train,
REAL *&  trainTarget,
int *&  trainLabel,
REAL *&  test,
REAL *&  testTarget,
int *&  testLabel,
uint &  nTrain,
uint &  nTest,
int &  nClass,
int &  nDomain,
int &  nFeat,
REAL  positiveTarget = 1.0,
REAL  negativeTarget = -1.0 
)

Reads the CYLINDER-BANDS dataset (UCI)

  • 103300Bytes bands.data
  • 3481Bytes bands.names

Definition at line 3473 of file DatasetReader.cpp.

03474 {
03475     cout<<"Read CYLINDER-BANDS from: "<<path<<endl;
03476     nDomain = 1;
03477 
03478     // define data type and files
03479     int targetColumn = 40;
03480     uint nTrainTmp;
03481     char columnType[] = "ndddddddddddddddddddnnnnnnnnnnnnnnnnnnnd";
03482     char enabledCol[] = "1111111111111111111111111111111111111111";
03483     const char* dataFiles[] = { ( new string ( path+"/bands.data" ) )->c_str(),0};
03484 
03485     // === TRAIN SET ===
03486     getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0 );
03487 
03488     // allocate tmp mem
03489     REAL* trainTmp = new REAL[nTrainTmp * nFeat];
03490     int* trainLabelTmp = new int[nTrainTmp];
03491 
03492     // fill data
03493     getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0, true, trainTmp, trainLabelTmp );
03494 
03495     // split train and testset from trainTmp
03496     splitRandomTestset ( 0.2, trainTmp, trainLabelTmp, nTrainTmp, nFeat, nClass, train, trainLabel, trainTarget, test, testLabel, testTarget, nTrain, nTest, positiveTarget, negativeTarget );
03497 
03498     delete[] trainTmp;
03499     delete[] trainLabelTmp;
03500 
03501 }

void DatasetReader::readDIABETES ( string  path,
REAL *&  train,
REAL *&  trainTarget,
int *&  trainLabel,
REAL *&  test,
REAL *&  testTarget,
int *&  testLabel,
uint &  nTrain,
uint &  nTest,
int &  nClass,
int &  nDomain,
int &  nFeat,
REAL  positiveTarget = 1.0,
REAL  negativeTarget = -1.0 
)

Reads the DIABETES dataset (UCI)

23279Bytes pima-indians-diabetes.data 3067Bytes pima-indians-diabetes.names

Definition at line 3589 of file DatasetReader.cpp.

03590 {
03591     cout<<"Read DIABETES from: "<<path<<endl;
03592     nDomain = 1;
03593 
03594     // define data type and files
03595     int targetColumn = 9;
03596     uint nTrainTmp;
03597     char columnType[] = "nnnnnnnnd";
03598     char enabledCol[] = "111111111";
03599     const char* dataFiles[] = { ( new string ( path+"/pima-indians-diabetes.data" ) )->c_str(),0};
03600 
03601     // === TRAIN SET ===
03602     getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0 );
03603 
03604     // allocate tmp mem
03605     REAL* trainTmp = new REAL[nTrainTmp * nFeat];
03606     int* trainLabelTmp = new int[nTrainTmp];
03607 
03608     // fill data
03609     getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0, true, trainTmp, trainLabelTmp );
03610 
03611     // split train and testset from trainTmp
03612     splitRandomTestset ( 0.2, trainTmp, trainLabelTmp, nTrainTmp, nFeat, nClass, train, trainLabel, trainTarget, test, testLabel, testTarget, nTrain, nTest, positiveTarget, negativeTarget );
03613 
03614     delete[] trainTmp;
03615     delete[] trainLabelTmp;
03616 
03617 }

void DatasetReader::readGERMAN ( string  path,
REAL *&  train,
REAL *&  trainTarget,
int *&  trainLabel,
REAL *&  test,
REAL *&  testTarget,
int *&  testLabel,
uint &  nTrain,
uint &  nTest,
int &  nClass,
int &  nDomain,
int &  nFeat,
REAL  positiveTarget = 1.0,
REAL  negativeTarget = -1.0 
)

Reads the GERMAN dataset (UCI)

79793Bytes german.data 102000Bytes german.data-numeric 4679Bytes german.doc

Definition at line 3626 of file DatasetReader.cpp.

03627 {
03628     cout<<"Read GERMAN from: "<<path<<endl;
03629     nDomain = 1;
03630 
03631     // define data type and files
03632     int targetColumn = 21;
03633     uint nTrainTmp;
03634     char columnType[] = "dnddnddnddndnddndnddd";
03635     char enabledCol[] = "111111111111111111111";
03636     const char* dataFiles[] = { ( new string ( path+"/german.data" ) )->c_str(),0};
03637 
03638     // === TRAIN SET ===
03639     getDataBounds ( dataFiles, " ", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0 );
03640 
03641     // allocate tmp mem
03642     REAL* trainTmp = new REAL[nTrainTmp * nFeat];
03643     int* trainLabelTmp = new int[nTrainTmp];
03644 
03645     // fill data
03646     getDataBounds ( dataFiles, " ", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0, true, trainTmp, trainLabelTmp );
03647 
03648     // split train and testset from trainTmp
03649     splitRandomTestset ( 0.2, trainTmp, trainLabelTmp, nTrainTmp, nFeat, nClass, train, trainLabel, trainTarget, test, testLabel, testTarget, nTrain, nTest, positiveTarget, negativeTarget );
03650 
03651     delete[] trainTmp;
03652     delete[] trainLabelTmp;
03653 
03654 }

void DatasetReader::readGLASS ( string  path,
REAL *&  train,
REAL *&  trainTarget,
int *&  trainLabel,
REAL *&  test,
REAL *&  testTarget,
int *&  testLabel,
uint &  nTrain,
uint &  nTest,
int &  nClass,
int &  nDomain,
int &  nFeat,
REAL  positiveTarget = 1.0,
REAL  negativeTarget = -1.0 
)

Reads the GLASS dataset (UCI)

11903Bytes glass.data 3506Bytes glass.names 780Bytes glass.tag

Definition at line 3663 of file DatasetReader.cpp.

03664 {
03665     cout<<"Read GLASS from: "<<path<<endl;
03666     nDomain = 1;
03667 
03668     // define data type and files
03669     int targetColumn = 11;
03670     uint nTrainTmp;
03671     char columnType[] = "nnnnnnnnnnd";
03672     char enabledCol[] = "01111111111";
03673     const char* dataFiles[] = { ( new string ( path+"/glass.data" ) )->c_str(),0};
03674 
03675     // === TRAIN SET ===
03676     getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0 );
03677 
03678     // allocate tmp mem
03679     REAL* trainTmp = new REAL[nTrainTmp * nFeat];
03680     int* trainLabelTmp = new int[nTrainTmp];
03681 
03682     // fill data
03683     getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0, true, trainTmp, trainLabelTmp );
03684 
03685     // split train and testset from trainTmp
03686     splitRandomTestset ( 0.2, trainTmp, trainLabelTmp, nTrainTmp, nFeat, nClass, train, trainLabel, trainTarget, test, testLabel, testTarget, nTrain, nTest, positiveTarget, negativeTarget );
03687 
03688     delete[] trainTmp;
03689     delete[] trainLabelTmp;
03690 
03691 }

void DatasetReader::readHEART ( string  path,
REAL *&  train,
REAL *&  trainTarget,
int *&  trainLabel,
REAL *&  test,
REAL *&  testTarget,
int *&  testLabel,
uint &  nTrain,
uint &  nTest,
int &  nClass,
int &  nDomain,
int &  nFeat,
REAL  positiveTarget = 1.0,
REAL  negativeTarget = -1.0 
)

Reads the HEART dataset (UCI)

4979Bytes SPECTF.names 33459Bytes SPECTF.test 10797Bytes SPECTF.train

Definition at line 3700 of file DatasetReader.cpp.

03701 {
03702     cout<<"Read HEART from: "<<path<<endl;
03703     nDomain = 1;
03704 
03705     // define data type and files
03706     int targetColumn = 1;
03707     char columnType[] = "dnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn";
03708     char enabledCol[] = "111111111111111111111111111111111111111111111";
03709     const char* dataFiles[] = { ( new string ( path+"/SPECTF.train" ) )->c_str(), ( new string ( path+"/SPECTF.test" ) )->c_str(),0};
03710 
03711     // === TRAIN SET ===
03712     getDataBounds ( dataFiles, ",", nFeat, nClass, nTrain, columnType, enabledCol, targetColumn, 0 );
03713     train = new REAL[nFeat*nTrain];
03714     trainLabel = new int[nTrain];
03715     getDataBounds ( dataFiles, ",", nFeat, nClass, nTrain, columnType, enabledCol, targetColumn, 0, true, train, trainLabel );
03716 
03717     // === TEST SET ===
03718     getDataBounds ( dataFiles, ",", nFeat, nClass, nTest, columnType, enabledCol, targetColumn, 1 );
03719     test = new REAL[nFeat*nTest];
03720     testLabel = new int[nTest];
03721     getDataBounds ( dataFiles, ",", nFeat, nClass, nTest, columnType, enabledCol, targetColumn, 1, true, test, testLabel );
03722 
03723     // make numerical test targets
03724     makeNumericTrainAndTestTargets ( nClass, nTrain, nTest, positiveTarget, negativeTarget, trainLabel, testLabel, trainTarget, testTarget );
03725 
03726 }

void DatasetReader::readHEPATITIS ( string  path,
REAL *&  train,
REAL *&  trainTarget,
int *&  trainLabel,
REAL *&  test,
REAL *&  testTarget,
int *&  testLabel,
uint &  nTrain,
uint &  nTest,
int &  nClass,
int &  nDomain,
int &  nFeat,
REAL  positiveTarget = 1.0,
REAL  negativeTarget = -1.0 
)

Reads the HEPATITIS dataset (UCI)

7545Bytes hepatitis.data 3098Bytes hepatitis.names

Definition at line 3734 of file DatasetReader.cpp.

03735 {
03736     cout<<"Read HEPATITIS from: "<<path<<endl;
03737     nDomain = 1;
03738 
03739     // define data type and files
03740     int targetColumn = 1;
03741     uint nTrainTmp;
03742     char columnType[] = "dnnnnnnnnnnnnnnnnnnn";
03743     char enabledCol[] = "11111111111111111111";
03744     const char* dataFiles[] = { ( new string ( path+"/hepatitis.data" ) )->c_str(),0};
03745 
03746     // === TRAIN SET ===
03747     getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0 );
03748 
03749     // allocate tmp mem
03750     REAL* trainTmp = new REAL[nTrainTmp * nFeat];
03751     int* trainLabelTmp = new int[nTrainTmp];
03752 
03753     // fill data
03754     getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0, true, trainTmp, trainLabelTmp );
03755 
03756     // split train and testset from trainTmp
03757     splitRandomTestset ( 0.2, trainTmp, trainLabelTmp, nTrainTmp, nFeat, nClass, train, trainLabel, trainTarget, test, testLabel, testTarget, nTrain, nTest, positiveTarget, negativeTarget );
03758 
03759     delete[] trainTmp;
03760     delete[] trainLabelTmp;
03761 
03762 }

void DatasetReader::readIONOSPHERE ( string  path,
REAL *&  train,
REAL *&  trainTarget,
int *&  trainLabel,
REAL *&  test,
REAL *&  testTarget,
int *&  testLabel,
uint &  nTrain,
uint &  nTest,
int &  nClass,
int &  nDomain,
int &  nFeat,
REAL  positiveTarget = 1.0,
REAL  negativeTarget = -1.0 
)

Reads the IONOSPHERE dataset (UCI)

76467Bytes ionosphere.data 3116Bytes ionosphere.names

Definition at line 3770 of file DatasetReader.cpp.

03771 {
03772     cout<<"Read IONOSPHERE from: "<<path<<endl;
03773     nDomain = 1;
03774 
03775     // define data type and files
03776     int targetColumn = 35;
03777     uint nTrainTmp;
03778     char columnType[] = "nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnd";
03779     char enabledCol[] = "11111111111111111111111111111111111";
03780     const char* dataFiles[] = { ( new string ( path+"/ionosphere.data" ) )->c_str(),0};
03781 
03782     // === TRAIN SET ===
03783     getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0 );
03784 
03785     // allocate tmp mem
03786     REAL* trainTmp = new REAL[nTrainTmp * nFeat];
03787     int* trainLabelTmp = new int[nTrainTmp];
03788 
03789     // fill data
03790     getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0, true, trainTmp, trainLabelTmp );
03791 
03792     // split train and testset from trainTmp
03793     splitRandomTestset ( 0.2, trainTmp, trainLabelTmp, nTrainTmp, nFeat, nClass, train, trainLabel, trainTarget, test, testLabel, testTarget, nTrain, nTest, positiveTarget, negativeTarget );
03794 
03795     delete[] trainTmp;
03796     delete[] trainLabelTmp;
03797 
03798 }

void DatasetReader::readIRIS ( string  path,
REAL *&  train,
REAL *&  trainTarget,
int *&  trainLabel,
REAL *&  test,
REAL *&  testTarget,
int *&  testLabel,
uint &  nTrain,
uint &  nTest,
int &  nClass,
int &  nDomain,
int &  nFeat,
REAL  positiveTarget = 1.0,
REAL  negativeTarget = -1.0 
)

Reads the IRIS dataset (UCI)

4551Bytes iris.data 2998Bytes iris.names

Definition at line 3807 of file DatasetReader.cpp.

03808 {
03809     cout<<"Read IRIS from: "<<path<<endl;
03810     nDomain = 1;
03811 
03812     // define data type and files
03813     int targetColumn = 5;
03814     uint nTrainTmp;
03815     char columnType[] = "nnnnd";
03816     char enabledCol[] = "11111";
03817     const char* dataFiles[] = { ( new string ( path+"/iris.data" ) )->c_str(),0};
03818 
03819     // === TRAIN SET ===
03820     getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0 );
03821 
03822     // allocate tmp mem
03823     REAL* trainTmp = new REAL[nTrainTmp * nFeat];
03824     int* trainLabelTmp = new int[nTrainTmp];
03825 
03826     // fill data
03827     getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0, true, trainTmp, trainLabelTmp );
03828 
03829     // split train and testset from trainTmp
03830     splitRandomTestset ( 0.2, trainTmp, trainLabelTmp, nTrainTmp, nFeat, nClass, train, trainLabel, trainTarget, test, testLabel, testTarget, nTrain, nTest, positiveTarget, negativeTarget );
03831 
03832     delete[] trainTmp;
03833     delete[] trainLabelTmp;
03834 
03835 }

void DatasetReader::readKDDCup09Large ( string  path,
REAL *&  train,
REAL *&  trainTarget,
int *&  trainLabel,
REAL *&  test,
REAL *&  testTarget,
int *&  testLabel,
uint &  nTrain,
uint &  nTest,
int &  nClass,
int &  nDomain,
int &  nFeat,
REAL  positiveTarget = 1.0,
REAL  negativeTarget = -1.0 
)

Reads the KDDCup09Large dataset

Definition at line 120 of file DatasetReader.cpp.

00121 {
00122     if ( 0 )
00123     {
00124         readKDDCup09LargeBin ( path, train, trainTarget, trainLabel, test, testTarget, testLabel, nTrain, nTest, nClass, nDomain, nFeat, positiveTarget, negativeTarget );
00125         return;
00126     }
00127 
00128     time_t t0 = time ( 0 );
00129 
00130     nDomain = 3;
00131 
00132     cout<<"Read KDDCup09 from: "<<path<<endl;
00133 
00134     // 8-full-numeric inputs
00135     // (epoch=10) reg=0.00113318 .... [classErr:49.6833%] [rmse:0.99835] [probe:-0.498109]  [CalcBlend] lambda:0.001  [classErr:61.3693%] [rmse:1.12297] ERR=-0.648214 35[s] !min! [saveBest][SB]
00136 
00137     char* targetFiles[] =  //"orange_large_train_toy.labels",
00138     {
00139         "orange_large_train_churn.labels",
00140         "orange_large_train_appetency.labels",
00141         "orange_large_train_upselling.labels"
00142     };
00143 
00144     int nPreAlloc = 100000000;
00145     char *buf0 = new char[512], *buf1 = new char[512];
00146     char* lineBuf = new char[nPreAlloc];
00147     //REAL* support;
00148     //int* supportCnt;
00149 
00150     int NUM = 14740, CAT = 260, NLINES = 50000;
00151     int nFiles = 5;
00152     bool setNumZerosToMeans = false;
00153     bool setMissingToMeans = false;
00154     int numericMinMissing = 1;
00155     int numericMaxCluster = 0;  // add categoric (one-hot) from numeric input, max. occurence cnt
00156     int minAttributeOccurenceCategorical = 50*nFiles;  // 20
00157     int minAttributeOccurenceNumerical = 500*nFiles;  // 200
00158     REAL maxSTD = 1e10; // 10
00159     cout<<"nFiles:"<<nFiles<<" minAttrOccurCat:"<<minAttributeOccurenceCategorical<<" minAttrOccurNum:"<<minAttributeOccurenceNumerical<<endl;
00160     cout<<setNumZerosToMeans<<" "<<setMissingToMeans<<" "<<numericMaxCluster<<" "<<minAttributeOccurenceCategorical<<" "<<minAttributeOccurenceNumerical<<" "<<maxSTD<<endl;
00161 
00162     vector<string>* numericalAttributes = new vector<string>[NUM];
00163     vector<int>* numericalAttributesCnt = new vector<int>[NUM];
00164     vector<string>* categoricalAttributes = new vector<string>[CAT];
00165     vector<int>* categoricalAttributesCnt = new vector<int>[CAT];
00166     bool* categoricalHasMissingBin = new bool[CAT];
00167     int* categoricalMissingCnt = new int[CAT];
00168     bool* categoricalHasUnknownBin = new bool[CAT];
00169     for ( int i=0;i<CAT;i++ )
00170     {
00171         categoricalHasMissingBin[i] = false;
00172         categoricalHasUnknownBin[i] = false;
00173         categoricalMissingCnt[i] = 0;
00174     }
00175     int* numericNonZeroCnt = new int[NUM];
00176     int* numericMissingCnt = new int[NUM];
00177     bool* numericHasMissingBin = new bool[NUM];
00178     double* numericNonZeroPercent = new double[NUM];
00179     for ( int i=0;i<NUM;i++ )
00180     {
00181         numericMissingCnt[i] = 0;
00182         numericNonZeroCnt[i] = 0;
00183         numericNonZeroPercent[i] = 0.0;
00184         numericHasMissingBin[i] = false;
00185     }
00186 
00187     double* minValues = new double[100000];
00188     double* maxValues = new double[100000];
00189     double* maxNormValues = new double[100000];
00190     double* meanValues = new double[100000];
00191     double* stdValues = new double[100000];
00192     double* mean2Values = new double[100000];
00193     int* meanCnt = new int[100000];
00194     for ( int i=0;i<100000;i++ )
00195     {
00196         minValues[i] = 1e20;
00197         maxValues[i] = -1e20;
00198         maxNormValues[i] = 0.0;
00199         meanValues[i] = 0.0;
00200         mean2Values[i] = 0.0;
00201         meanCnt[i] = 0;
00202         stdValues[i] = 0.0;
00203     }
00204 
00205     //===========================================================================================================================
00206     //===========================================================================================================================
00207     // Loop over 2 states:
00208     // - State=0  read train values (+build index tables)
00209     // - State=1  store to features (train or test)
00210     //
00211     for ( int state=0;state<2;state++ )
00212     {
00213         int nTrainFill = 0;
00214         if ( state == 0 )
00215         {
00216             nTrain = 0;
00217         }
00218 
00219         //=======================================================================================================================
00220         //=======================================================================================================================
00221         // Loop over n files (file chunks)
00222         //
00223         for ( int file=0;file<nFiles;file++ )
00224         {
00225             // open train or test set
00226             if ( state == 0 )
00227                 sprintf ( buf0,"%s/orange_large_train.data.chunk%d",path.c_str(), file+1 );
00228             else
00229             {
00230                 if ( Framework::getFrameworkMode() == 1 )
00231                     sprintf ( buf0,"%s/orange_large_test.data.chunk%d",path.c_str(), file+1 );
00232                 else
00233                     sprintf ( buf0,"%s/orange_large_train.data.chunk%d",path.c_str(), file+1 );
00234             }
00235 
00236             cout<<"Open:"<<buf0<<endl;
00237             fstream f;
00238             f.open ( buf0, ios::in );
00239             if ( f.is_open() == false )
00240                 assert ( false );
00241 
00242             // read the first line in the first file (dummy)
00243             if ( file==0 )
00244                 f.getline ( lineBuf, nPreAlloc );
00245 
00246             // tmp and count vars
00247             double zeroRatio = 0.0;
00248             double sparse = 0.0;
00249             int nTrainTmp = 0;
00250 
00251             //===================================================================================================================
00252             //===================================================================================================================
00253             // Read all lines of chunk file n
00254             //
00255             while ( f.getline ( lineBuf, nPreAlloc ) )
00256             {
00257                 if ( nTrainTmp%1000 == 0 )
00258                     cout<<"."<<flush;
00259 
00260                 // tmp and count vars
00261                 int pos0 = 0, pos1 = 0;
00262                 int nF = 0, nMissing = 0, nZeros = 0;
00263                 int nFeatFill = 0;
00264                 int nrHot = 0;
00265                 double value;
00266 
00267                 if ( state == 1 )
00268                 {
00269                     // add constant one
00270                     train[nTrainFill*nFeat + nFeatFill] = 1.0;
00271                     nFeatFill++;
00272                 }
00273 
00274                 //===============================================================================================================
00275                 //===============================================================================================================
00276                 // Go through all characters of this line
00277                 //
00278                 while ( lineBuf[pos1] )
00279                 {
00280                     // search for next tabulator
00281                     while ( lineBuf[pos1] != '\t' && lineBuf[pos1] != 0 )
00282                         pos1++;
00283 
00284                     //===========================================================================================================
00285                     //===========================================================================================================
00286                     // If the feature has some content
00287                     // This means no consecutive tabs
00288                     //
00289                     if ( pos1 > pos0 && lineBuf[pos1]!=0 )
00290                     {
00291                         // copy to tmp buffer
00292                         if ( pos1-pos0 <=0 || pos1-pos0 >= 512 )
00293                             assert ( false );
00294                         for ( int j=0;j<pos1-pos0;j++ )
00295                             buf1[j] = lineBuf[pos0+j];
00296                         buf1[pos1-pos0] = 0;
00297 
00298 
00299                         //=======================================================================================================
00300                         //=======================================================================================================
00301                         // Read Numeric value (feature count < NUM)
00302                         //
00303                         if ( nF < NUM )
00304                         {
00305                             if ( ( buf1[0]>='0' && buf1[0] <='9' ) || buf1[0]=='-' )
00306                                 ;
00307                             else
00308                             {
00309                                 cout<<"BUF:"<<buf1<<endl;
00310                                 assert ( false );
00311                             }
00312 
00313                             //sscanf(buf1, "%f", &value);
00314                             value = atof ( buf1 );
00315 
00316                             if ( value == 0.0 )
00317                                 nZeros++;
00318 
00319                             // first run through train data
00320                             if ( state==0 )
00321                             {
00322                                 if ( minValues[nF] > value )
00323                                     minValues[nF] = value;
00324                                 if ( maxValues[nF] < value )
00325                                     maxValues[nF] = value;
00326 
00327                                 // histogram over numeric values
00328                                 int size = numericalAttributes[nF].size();
00329                                 if ( size < numericMaxCluster )
00330                                 {
00331                                     int foundIndex = -1;
00332                                     for ( int j=0;j<size;j++ )
00333                                         if ( numericalAttributes[nF][j] == buf1 )
00334                                         {
00335                                             foundIndex = j;
00336                                             break;
00337                                         }
00338                                     // add value
00339                                     if ( foundIndex == -1 )
00340                                     {
00341                                         numericalAttributes[nF].push_back ( buf1 );
00342                                         numericalAttributesCnt[nF].push_back ( 1 );
00343                                     }
00344                                     else
00345                                         numericalAttributesCnt[nF][foundIndex]++;
00346                                 }
00347 
00348                                 if ( value != 0.0 )
00349                                 {
00350                                     numericNonZeroCnt[nF]++;
00351                                     if ( numericNonZeroCnt[nF] > nTrain+nTrainTmp+1 )
00352                                     {
00353                                         cout<<"numericNonZeroCnt[nF]:"<<numericNonZeroCnt[nF]<<" nF:"<<nF<<" nTrainTmp:"<<nTrainTmp<<" nZeros:"<<nZeros<<" pos0:"<<pos0<<" pos1:"<<pos1<<endl;
00354                                         assert ( false );
00355                                     }
00356                                 }
00357 
00358                                 if ( value != 0.0 )
00359                                 {
00360                                     // calc mean over numeric input
00361                                     meanValues[nF] += value;
00362                                     mean2Values[nF] += value * value;
00363                                     meanCnt[nF]++;
00364                                 }
00365                             }
00366                             else if ( state==1 ) // second run, fill data tables
00367                             {
00368                                 if ( numericNonZeroCnt[nF] >= minAttributeOccurenceNumerical && maxNormValues[nF] < stdValues[nF]*maxSTD )
00369                                 {
00370                                     // numeric add
00371                                     if ( value == 0.0 && setNumZerosToMeans )
00372                                         train[nTrainFill*nFeat + nFeatFill] = meanValues[nF];
00373                                     else
00374                                         train[nTrainFill*nFeat + nFeatFill] = value;
00375                                     nFeatFill++;
00376 
00377                                     // numeric one hot add
00378                                     int size = numericalAttributes[nF].size();
00379                                     if ( size < numericMaxCluster && size > 1 )
00380                                     {
00381                                         int foundIndex = -1;
00382                                         for ( int j=0;j<size;j++ )
00383                                             if ( numericalAttributes[nF][j] == buf1 )
00384                                             {
00385                                                 foundIndex = j;
00386                                                 break;
00387                                             }
00388                                         // fill categorical
00389                                         int beforeHot = nrHot;
00390                                         for ( int j=0;j<size;j++ )
00391                                         {
00392                                             if ( foundIndex == j )
00393                                             {
00394                                                 train[nTrainFill*nFeat + nFeatFill] = 1.0;
00395                                                 nrHot++;
00396                                             }
00397                                             else
00398                                                 train[nTrainFill*nFeat + nFeatFill] = 0.0;
00399                                             nFeatFill++;
00400                                         }
00401                                         // fill missing
00402                                         /*if(nrHot == beforeHot)
00403                                             train[nTrainFill*nFeat + nFeatFill] = 0.0;
00404                                         else
00405                                             train[nTrainFill*nFeat + nFeatFill] = 1.0;
00406                                         nFeatFill++;*/
00407                                     }
00408 
00409                                 }
00410 
00411                                 // missing values one-hot encoded
00412                                 if ( numericHasMissingBin[nF] )
00413                                 {
00414                                     train[nTrainFill*nFeat + nFeatFill] = 0.0;  // <- missing
00415                                     nFeatFill++;
00416                                     train[nTrainFill*nFeat + nFeatFill] = 1.0;  // <- available
00417                                     nFeatFill++;
00418                                 }
00419                             }
00420                         }
00421                         //=======================================================================================================
00422                         //=======================================================================================================
00423                         // Read Categorical value (feature count >= NUM)
00424                         //
00425                         else
00426                         {
00427                             int index = nF-NUM;
00428                             if ( index >= CAT )
00429                                 assert ( false );
00430                             int size = categoricalAttributes[index].size();
00431                             int sizeCnt = categoricalAttributesCnt[index].size();
00432                             if ( size != sizeCnt )
00433                                 assert ( false );
00434 
00435                             int foundIndex = -1;
00436                             for ( int j=0;j<size;j++ )
00437                                 if ( categoricalAttributes[index][j] == buf1 )
00438                                 {
00439                                     foundIndex = j;
00440                                     break;
00441                                 }
00442 
00443                             // first run through train data
00444                             if ( state==0 )
00445                             {
00446                                 // add value
00447                                 if ( foundIndex == -1 )
00448                                 {
00449                                     categoricalAttributes[index].push_back ( buf1 );
00450                                     categoricalAttributesCnt[index].push_back ( 1 );
00451                                 }
00452                                 else // already exists
00453                                     categoricalAttributesCnt[index][foundIndex]++;
00454                             }
00455                             else if ( state==1 ) // second run, fill data tables
00456                             {
00457                                 // one-hot encoding
00458                                 int fillCnt = 0;
00459                                 int beforeHot = nrHot;
00460                                 for ( int j=0;j<size;j++ )
00461                                 {
00462                                     if ( categoricalAttributesCnt[index][j] >= minAttributeOccurenceCategorical && categoricalAttributesCnt[index][j] < nTrain )
00463                                     {
00464                                         if ( foundIndex == j )
00465                                         {
00466                                             train[nTrainFill*nFeat + nFeatFill] = 1.0;
00467                                             nrHot++;
00468                                         }
00469                                         else
00470                                             train[nTrainFill*nFeat + nFeatFill] = 0.0;
00471                                         fillCnt++;
00472                                         nFeatFill++;
00473                                     }
00474                                 }
00475 
00476                                 // no missing (no consecutive tabs here)
00477                                 if ( categoricalHasMissingBin[index] )
00478                                 {
00479                                     train[nTrainFill*nFeat + nFeatFill] = 0.0;
00480                                     fillCnt++;
00481                                     nFeatFill++;
00482                                 }
00483 
00484                                 // if found, but not in cache
00485                                 if ( categoricalHasUnknownBin[index] )
00486                                 {
00487                                     if ( beforeHot == nrHot )
00488                                     {
00489                                         train[nTrainFill*nFeat + nFeatFill] = 1.0;
00490                                         nrHot++;
00491                                     }
00492                                     else
00493                                         train[nTrainFill*nFeat + nFeatFill] = 0.0;
00494                                     fillCnt++;
00495                                     nFeatFill++;
00496                                 }
00497 
00498                                 if ( nrHot != beforeHot + 1 && fillCnt > 0 )
00499                                 {
00500                                     cout<<"WARNING: foundIndex:"<<foundIndex<<" "<<size<<" "<<index<<" "<<nrHot<<" "<<beforeHot<<" fill:"<<fillCnt<<endl;
00501                                     //assert(false);
00502                                 }
00503                             }
00504                         }
00505                     }
00506                     //===========================================================================================================
00507                     //===========================================================================================================
00508                     // If the feature has no content
00509                     // Missing value here
00510                     //
00511                     else
00512                     {
00513                         nMissing++;
00514 
00515                         if ( state==0 )
00516                         {
00517                             // numeric
00518                             if ( nF < NUM )
00519                             {
00520                                 numericMissingCnt[nF]++;
00521                             }
00522                             // categorical
00523                             if ( nF >= NUM )
00524                             {
00525                                 int index = nF-NUM;
00526                                 categoricalMissingCnt[index]++;
00527                             }
00528                         }
00529 
00530                         // second run, fill data tables with zeros
00531                         if ( state==1 )
00532                         {
00533                             //===================================================================================================
00534                             //===================================================================================================
00535                             // Read Numeric value (feature count < NUM)
00536                             //
00537                             if ( nF < NUM )
00538                             {
00539                                 if ( numericNonZeroCnt[nF] >= minAttributeOccurenceNumerical && maxNormValues[nF] < stdValues[nF]*maxSTD )
00540                                 {
00541                                     // numeric add
00542                                     if ( setMissingToMeans )
00543                                         train[nTrainFill*nFeat + nFeatFill] = meanValues[nF];
00544                                     else
00545                                         train[nTrainFill*nFeat + nFeatFill] = 0.0;
00546                                     nFeatFill++;
00547 
00548                                     // numeric one hot add
00549                                     int size = numericalAttributes[nF].size();
00550                                     if ( size < numericMaxCluster && size > 1 )
00551                                     {
00552                                         // fill categorical
00553                                         for ( int j=0;j<size;j++ )
00554                                         {
00555                                             train[nTrainFill*nFeat + nFeatFill] = 0.0;
00556                                             nFeatFill++;
00557                                         }
00558                                         // fill missing
00559                                         //train[nTrainFill*nFeat + nFeatFill] = 1.0;
00560                                         //nFeatFill++;
00561                                     }
00562                                 }
00563 
00564                                 // missing values one-hot encoded
00565                                 if ( numericHasMissingBin[nF] )
00566                                 {
00567                                     train[nTrainFill*nFeat + nFeatFill] = 1.0;  // <- missing
00568                                     nFeatFill++;
00569                                     train[nTrainFill*nFeat + nFeatFill] = 0.0;  // <- available
00570                                     nFeatFill++;
00571                                 }
00572                             }
00573                             //===================================================================================================
00574                             //===================================================================================================
00575                             // Read Categorical value (feature count >= NUM)
00576                             //
00577                             else
00578                             {
00579                                 int index = nF - NUM;
00580                                 if ( index >= CAT )
00581                                     assert ( false );
00582                                 int size = categoricalAttributes[index].size();
00583                                 int sizeCnt = categoricalAttributesCnt[index].size();
00584                                 if ( size != sizeCnt )
00585                                     assert ( false );
00586 
00587                                 // one-hot encoding
00588                                 int fillCnt = 0;
00589                                 int beforeHot = nrHot;
00590                                 for ( int j=0;j<size;j++ )
00591                                 {
00592                                     if ( categoricalAttributesCnt[index][j] >= minAttributeOccurenceCategorical && categoricalAttributesCnt[index][j] < nTrain )
00593                                     {
00594                                         train[nTrainFill*nFeat + nFeatFill] = 0.0;  // no here
00595                                         fillCnt++;
00596                                         nFeatFill++;
00597                                     }
00598                                 }
00599                                 if ( categoricalHasMissingBin[index] )
00600                                 {
00601                                     if ( fillCnt == 0 && categoricalHasUnknownBin[index] == false )
00602                                     {
00603                                         cout<<"categoricalMissingCnt["<<index<<"]:"<<categoricalMissingCnt[index]<<endl;
00604                                         assert ( false );
00605                                     }
00606                                     // set the input to "missing value"
00607                                     train[nTrainFill*nFeat + nFeatFill] = 1.0;
00608                                     nrHot++;
00609                                     fillCnt++;
00610                                     nFeatFill++;
00611                                 }
00612 
00613                                 if ( categoricalHasUnknownBin[index] )
00614                                 {
00615                                     // no unknown value
00616                                     train[nTrainFill*nFeat + nFeatFill] = 0.0;
00617                                     fillCnt++;
00618                                     nFeatFill++;
00619                                 }
00620 
00621                                 if ( nrHot != beforeHot + 1 && fillCnt > 0 )
00622                                 {
00623                                     cout<<"WARNING: "<<size<<" "<<index<<" "<<nrHot<<" "<<beforeHot<<" fill:"<<fillCnt<<endl;
00624                                     //assert(false);
00625                                 }
00626                             }
00627                         }
00628                     }
00629 
00630                     // check for last character
00631                     if ( lineBuf[pos1]!=0 )
00632                         pos1++;
00633 
00634                     // beginpos = endpos
00635                     pos0 = pos1;
00636 
00637                     // column count
00638                     nF++;
00639                 }
00640 
00641                 // valid checks
00642                 if ( nF != NUM + CAT )
00643                     assert ( false );
00644                 if ( state==1 )
00645                 {
00646                     if ( nFeatFill != nFeat )
00647                     {
00648                         cout<<"nFeatFill:"<<nFeatFill<<" nFeat:"<<nFeat<<endl;
00649                         assert ( false );
00650                     }
00651                     nTrainFill++;
00652                 }
00653 
00654                 nTrainTmp++;
00655 
00656                 sparse += nMissing / ( double ) nF;
00657                 zeroRatio += nZeros / ( double ) nF;
00658             }
00659 
00660             f.close();
00661 
00662             // ratio of sparseness and zeroPercent
00663             sparse /= ( double ) nTrainTmp;
00664             zeroRatio /= ( double ) nTrainTmp;
00665             cout<<"nTrainTmp:"<<nTrainTmp<<endl;
00666             cout<<"missing values:"<<100.0*sparse<<"%"<<endl;
00667             cout<<"zero values:"<<100.0*zeroRatio<<"%"<<endl;
00668 
00669             double min0 = 1e20, max0 = -1e20;
00670             for ( int i=0;i<100000;i++ )
00671             {
00672                 if ( min0 > minValues[i] )
00673                     min0 = minValues[i];
00674                 if ( max0 < maxValues[i] )
00675                     max0 = maxValues[i];
00676             }
00677             cout<<"min|max values: "<<min0<<"|"<<max0<<endl;
00678 
00679             int sum = 0;
00680             for ( int j=0;j<CAT;j++ )
00681                 sum += categoricalAttributes[j].size();
00682             cout<<"nCategoricalSum:"<<sum<<endl;
00683 
00684             if ( state == 0 )
00685                 nTrain += nTrainTmp;
00686 
00687         }
00688 
00689         // do some checks
00690         if ( state == 1 )
00691         {
00692             if ( nTrain != nTrainFill )
00693                 assert ( false );
00694 
00695             for ( int i=0;i<nTrain*nFeat;i++ )
00696                 if ( train[i] == 1e10 )
00697                 {
00698                     cout<<"i:"<<i<<endl;
00699                     assert ( false );
00700                 }
00701         }
00702 
00703         if ( state==0 )
00704         {
00705             for ( int i=0;i<NUM;i++ )
00706                 numericNonZeroPercent[i] = ( double ) numericNonZeroCnt[i]/ ( double ) nTrain;
00707             for ( int i=0;i<100000;i++ )
00708                 if ( meanCnt[i] > 0 )
00709                 {
00710                     meanValues[i] /= ( double ) meanCnt[i];
00711                     stdValues[i] = sqrt ( mean2Values[i]/ ( double ) meanCnt[i] - meanValues[i]/ ( double ) meanCnt[i] );
00712                     maxNormValues[i] = fabs ( maxValues[i] - meanValues[i] );
00713                     if ( maxNormValues[i] < fabs ( minValues[i] - meanValues[i] ) )
00714                         maxNormValues[i] = fabs ( minValues[i] - meanValues[i] );
00715                 }
00716 
00717             cout<<"nTrain:"<<nTrain<<endl;
00718 
00719             // === Calculate effective number of input features ===
00720             nFeat = 1; // const
00721             int nFeatNum = 0, nFeatNumRaw = 0, nFeatNumCat = 0, nFeatCat = 0, nUnknown = 0, nMissing = 0, nIn = 0, nNumMiss = 0;
00722             // numerical
00723             for ( int j=0;j<NUM;j++ )
00724             {
00725                 if ( numericNonZeroCnt[j] >= minAttributeOccurenceNumerical && maxNormValues[j] < stdValues[j]*maxSTD )
00726                 {
00727                     // standard numerical input
00728                     nFeat++;
00729                     nFeatNum++;
00730                     nFeatNumRaw++;
00731 
00732                     // numerical input with limited number of different values -> translate it to categorical input
00733                     if ( numericalAttributes[j].size() < numericMaxCluster && numericalAttributes[j].size() > 1 )
00734                     {
00735                         cout<<"nFeatNum:"<<nFeatNum<<" ";
00736                         for ( int k=0;k<numericalAttributes[j].size();k++ )
00737                         {
00738                             cout<<numericalAttributes[j][k]<<"("<<numericalAttributesCnt[j][k]<<") ";
00739                             nFeat++;
00740                             nFeatNum++;
00741                             nFeatNumCat++;
00742                         }
00743                         cout<<endl;
00744                         /*
00745                         // add one bin for "missing or unknown value"
00746                         nFeat++;
00747                         nFeatNum++;
00748                         nFeatNumCat++;*/
00749                     }
00750                     if ( numericMissingCnt[j] >= numericMinMissing )
00751                     {
00752                         numericHasMissingBin[j] = true;
00753                         nFeat+=2;
00754                         nNumMiss+=2;
00755                     }
00756                 }
00757             }
00758             // categorical
00759             for ( int j=0;j<CAT;j++ )
00760             {
00761                 int nUsed = 0, nUn = 0, nCat = 0, nMiss = 0, nUnk = 0;
00762                 for ( int k=0;k<categoricalAttributesCnt[j].size();k++ )
00763                 {
00764                     // count valid entries (with enough occurence)
00765                     if ( categoricalAttributesCnt[j][k] >= minAttributeOccurenceCategorical && categoricalAttributesCnt[j][k] < nTrain )
00766                     {
00767                         nFeat++;
00768                         nFeatCat++;
00769                         nUsed++;
00770                         nIn++;
00771                         nCat++;
00772                     }
00773                     else if ( categoricalAttributesCnt[j][k] < nTrain ) // not enough occurence -> put to unknown
00774                         nUn++;
00775                 }
00776                 // missing is like a normal categoric input
00777                 if ( ( categoricalMissingCnt[j] >= minAttributeOccurenceCategorical && categoricalMissingCnt[j] < nTrain ) || categoricalMissingCnt[j] > 0 && nCat > 0 )
00778                 {
00779                     // add a "missing value" input of this feature
00780                     nFeat++;
00781                     nFeatCat++;
00782                     nMissing++;
00783                     nMiss++;
00784                     categoricalHasMissingBin[j] = true;
00785                 }
00786                 if ( nUn > 0 && nCat + nMiss > 0 )
00787                 {
00788                     // add a "unknown value" input of this feature
00789                     nFeat++;
00790                     nFeatCat++;
00791                     nUnknown++;
00792                     nUnk++;
00793                     categoricalHasUnknownBin[j] = true;
00794                 }
00795 
00796                 if ( nCat + nMiss + nUnk == 1 )
00797                     assert ( false );
00798             }
00799 
00800             cout<<"nFeat:"<<nFeat<<" (numInputs:"<<nFeatNum<<" [rawNum:"<<nFeatNumRaw<<" nFeatNumCat:"<<nFeatNumCat<<"] catInputs:"<<nFeatCat<<" [nUnknown:"<<nUnknown<<" nMissing:"<<nMissing<<" nCat:"<<nIn<<"] numMissingHot:"<<nNumMiss<<" [+1const.])"<<endl;
00801 
00802             cout<<"Allocate train features: "<< ( double ) nTrain*nFeat/1e6*4.0<<" MB"<<endl;
00803             train = new REAL[nTrain*nFeat];
00804             for ( int i=0;i<nTrain*nFeat;i++ )
00805                 train[i] = 1e10;
00806 
00807             //support = new REAL[nFeat];
00808             //supportCnt = new int[nFeat];
00809             //for(int i=0;i<nFeat;i++)
00810             //{
00811             //support[i] = 0.0;
00812             //supportCnt[i] = 0;
00813             //}
00814 
00815             // read targets
00816             nClass = 2;
00817             trainTarget = new REAL[nTrain*nClass*nDomain];
00818             trainLabel = new int[nTrain*nDomain];
00819             for ( int d=0;d<nDomain;d++ )
00820             {
00821                 sprintf ( buf0,"%s/%s",path.c_str(),targetFiles[d] );
00822                 fstream f;
00823                 cout<<"Open targets:"<<buf0<<endl;
00824                 f.open ( buf0,ios::in );
00825                 if ( f.is_open() == false )
00826                     assert ( false );
00827                 int label;
00828                 for ( int i=0;i<nTrain;i++ )
00829                 {
00830                     f>>label;
00831                     if ( label==-1 )
00832                     {
00833                         trainTarget[i*nClass*nDomain + d*nClass + 0] = positiveTarget;
00834                         trainTarget[i*nClass*nDomain + d*nClass + 1] = negativeTarget;
00835                         trainLabel[i*nDomain + d] = 0;
00836                     }
00837                     else if ( label==1 )
00838                     {
00839                         trainTarget[i*nClass*nDomain + d*nClass + 0] = negativeTarget;
00840                         trainTarget[i*nClass*nDomain + d*nClass + 1] = positiveTarget;
00841                         trainLabel[i*nDomain + d] = 1;
00842                     }
00843                     else
00844                         assert ( false );
00845                 }
00846                 f.close();
00847             }
00848             // test set
00849             nTest = 0;
00850             test = 0;
00851             testTarget = 0;
00852             testLabel = 0;
00853 
00854         }
00855     }
00856 
00857     for ( int i=0;i<nTrain;i++ )
00858         for ( int j=0;j<nFeat;j++ )
00859             if ( train[i*nFeat+j] == 1e10 )
00860             {
00861                 cout<<"i:"<<i<<" j:"<<j<<" "<<train[i*nFeat+j]<<endl;
00862                 assert ( false );
00863             }
00864 
00865 
00866     fstream f;
00867     /*f.open("AAA.txt",ios::out);
00868     f<<"========= numerical ========="<<endl<<endl;
00869     for(int i=0;i<NUM;i++)
00870         if(numericNonZeroCnt[i] >= minAttributeOccurenceNumerical  && maxNormValues[i] < stdValues[i]*5.0)
00871             f<<i<<":"<<numericNonZeroCnt[i]<<"["<<numericNonZeroCnt[i]<<"]["<<minValues[i]<<"|"<<maxValues[i]<<"]"<<endl;
00872     f<<endl<<endl;
00873     f<<"========= categorical ========="<<endl<<endl;
00874     for(int i=0;i<CAT;i++)
00875     {
00876         int size = categoricalAttributes[i].size();
00877 
00878         int chkCnt = 0;
00879         for(int j=0;j<size;j++)
00880         if(categoricalAttributesCnt[i][j] >= minAttributeOccurenceCategorical && categoricalAttributesCnt[i][j] < NLINES)
00881                 chkCnt++;
00882 
00883         if(chkCnt > 0)
00884             f<<endl<<"Attrib."<<i<<"(#"<<chkCnt<<"):";
00885 
00886         // go over all possible values
00887         for(int j=0;j<size;j++)
00888         {
00889             // find the max support
00890             int ind = -1;
00891             int max = -1;
00892             for(int k=0;k<size;k++)
00893             {
00894                 if(categoricalAttributesCnt[i][k] > max)
00895                 {
00896                     max = categoricalAttributesCnt[i][k];
00897                     ind = k;
00898                 }
00899             }
00900             if(ind==-1)
00901                 assert(false);
00902 
00903             if(categoricalAttributesCnt[i][ind] >= minAttributeOccurenceCategorical && categoricalAttributesCnt[i][ind] < NLINES)
00904                 f<<j<<":"<<categoricalAttributes[i][ind]<<"["<<categoricalAttributesCnt[i][ind]<<"]("<<ind<<")  ";
00905 
00906             // mark as viewed
00907             categoricalAttributesCnt[i][ind] = -1;
00908         }
00909 
00910         if(categoricalHasMissingBin[i])
00911             f<<"+1xMissing"<<"[]("<<-1<<")  ";
00912 
00913         if(chkCnt > 0)
00914             f<<endl;
00915 
00916     }
00917     f.close();
00918     */
00919     if ( lineBuf )
00920     {
00921         delete[] lineBuf;
00922         lineBuf = 0;
00923     }
00924     if ( numericNonZeroCnt )
00925     {
00926         delete[] numericNonZeroCnt;
00927         numericNonZeroCnt = 0;
00928     }
00929     if ( numericNonZeroPercent )
00930     {
00931         delete[] numericNonZeroPercent;
00932         numericNonZeroPercent = 0;
00933     }
00934     if ( categoricalAttributes )
00935     {
00936         delete[] categoricalAttributes;
00937         categoricalAttributes = 0;
00938     }
00939     if ( meanValues )
00940     {
00941         delete[] meanValues;
00942         meanValues = 0;
00943     }
00944     if ( meanCnt )
00945     {
00946         delete[] meanCnt;
00947         meanCnt = 0;
00948     }
00949     if ( categoricalHasMissingBin )
00950     {
00951         delete[] categoricalHasMissingBin;
00952         categoricalHasMissingBin = 0;
00953     }
00954 
00955     // tmp print out of: train data
00956     f.open ( "A.txt",ios::out );
00957     double* mu = new double[nFeat];
00958     for ( int i=0;i<nFeat;i++ )
00959         mu[i] = 0.0;
00960     for ( int i=0;i<nTrain;i++ )
00961         for ( int j=0;j<nFeat;j++ )
00962             mu[j] += train[i*nFeat + j];
00963     for ( int i=0;i<nFeat;i++ )
00964         mu[i] /= ( double ) nTrain;
00965     for ( int i=0;i<nFeat;i++ )
00966         f<<mu[i]<<endl;
00967     f.close();
00968 
00969     //f.open("A.dat",ios::out);
00970     //f.write((char*)train,sizeof(REAL)*nTrain*nFeat);
00971     //f.close();
00972 
00973     /*f.open("A.txt",ios::out);
00974     for(int i=0;i<nTrain;i++)
00975     {
00976         for(int j=0;j<nFeat;j++)
00977             f<<train[i*nFeat+j]<<" ";
00978         f<<endl;
00979     }
00980     f.close();
00981 
00982     f.open("B.txt",ios::out);
00983     for(int i=0;i<nTrain;i++)
00984     {
00985         for(int j=0;j<nDomain;j++)
00986             f<<trainLabel[i*nDomain+j]<<" ";
00987         f<<endl;
00988     }
00989     f.close();
00990 
00991     f.open("C.txt",ios::out);
00992     for(int i=0;i<nTrain;i++)
00993     {
00994         for(int j=0;j<nDomain*nClass;j++)
00995             f<<trainTarget[i*nDomain*nClass+j]<<" ";
00996         f<<endl;
00997     }
00998     f.close();
00999     */
01000     if ( Framework::getFrameworkMode() == 1 )
01001     {
01002         cout<<endl<<"Set test data (and set train data to 0)"<<endl<<endl;
01003         test = train;
01004         train = 0;
01005         nTest = nTrain;
01006         nTrain = 0;
01007         testTarget = trainTarget;
01008         trainTarget = 0;
01009         testLabel = trainLabel;
01010         trainLabel = 0;
01011     }
01012     cout<<endl<<"Finished read in "<<time ( 0 )-t0<<"[s]"<<endl<<endl;
01013 }

void DatasetReader::readKDDCup09LargeBin ( string  path,
REAL *&  train,
REAL *&  trainTarget,
int *&  trainLabel,
REAL *&  test,
REAL *&  testTarget,
int *&  testLabel,
uint &  nTrain,
uint &  nTest,
int &  nClass,
int &  nDomain,
int &  nFeat,
REAL  positiveTarget = 1.0,
REAL  negativeTarget = -1.0 
)

Reads the large dataset from the KDDCup2009 (internal binary format)

Definition at line 24 of file DatasetReader.cpp.

00025 { // Loads the feature matrix from a raw dump of REAL values and the per-domain labels from text files.
00026     nDomain = 3;  // one domain per label file listed below
00027     char* targetFiles[] =  //"orange_large_train_toy.labels",
00028     {
00029         "orange_large_train_churn.labels",
00030         "orange_large_train_appetency.labels",
00031         "orange_large_train_upselling.labels"
00032     };
00033     // sample and feature counts are fixed here; they size the allocations and the read below
00034     nTrain = 50000;
00035     nFeat = 113;
00036     // the feature file name depends on the framework mode: 1 -> "..._test_...", otherwise "..._train_..."
00037     char buf[512];
00038     if ( Framework::getFrameworkMode() == 1 )
00039     {
00040         //sprintf(buf,"featureSelection_churn_test_%d_features.dat",nFeat);
00041         //sprintf(buf,"featureSelection_appetency_test_%d_features.dat",nFeat);
00042         //sprintf(buf,"featureSelection_upselling_test_%d_features.dat",nFeat);
00043         sprintf ( buf,"featureSelection_all_test_%d_features.dat",nFeat );
00044     }
00045     else
00046     {
00047         //sprintf(buf,"featureSelection_churn_train_%d_features.dat",nFeat);
00048         //sprintf(buf,"featureSelection_appetency_train_%d_features.dat",nFeat);
00049         //sprintf(buf,"featureSelection_upselling_train_%d_features.dat",nFeat);
00050         sprintf ( buf,"featureSelection_all_train_%d_features.dat",nFeat );
00051     }
00052     cout<<"Open:"<<buf<<endl;
00053     train = new REAL[nTrain * nFeat];  // allocated with new[] and handed back through the reference parameter; not freed here
00054     fstream f ( buf,ios::in );  // NOTE(review): buf is used without the path prefix, and the stream is opened without ios::binary although it is consumed with read() below
00055     if ( f.is_open() == false )
00056         assert ( false );  // NOTE(review): assert is compiled out when NDEBUG is defined, so this is the only open-failure handling
00057     f.read ( ( char* ) train, sizeof ( REAL ) *nTrain*nFeat );  // copies nTrain*nFeat REALs byte-wise into train; the read result is not checked
00058     f.close();
00059 
00060     // read targets: one label file per domain; trainTarget is indexed [sample][domain][class], trainLabel [sample][domain]
00061     nClass = 2;
00062     trainTarget = new REAL[nTrain*nClass*nDomain];
00063     trainLabel = new int[nTrain*nDomain];
00064     char buf0[512];
00065     for ( int d=0;d<nDomain;d++ )
00066     {
00067         sprintf ( buf0,"%s/%s",path.c_str(),targetFiles[d] );  // label files are looked up under path; the same "train" label names are used in both framework modes
00068         fstream f;  // shadows the feature-file stream f declared above
00069         cout<<"Open targets:"<<buf0<<endl;
00070         f.open ( buf0,ios::in );
00071         if ( f.is_open() == false )
00072             assert ( false );
00073         int label;
00074         for ( int i=0;i<nTrain;i++ )
00075         {
00076             f>>label;  // one integer label per sample; the extraction result is not checked
00077             if ( label==-1 )  // -1 -> class index 0 carries positiveTarget
00078             {
00079                 trainTarget[i*nClass*nDomain + d*nClass + 0] = positiveTarget;
00080                 trainTarget[i*nClass*nDomain + d*nClass + 1] = negativeTarget;
00081                 trainLabel[i*nDomain + d] = 0;
00082             }
00083             else if ( label==1 )  // +1 -> class index 1 carries positiveTarget
00084             {
00085                 trainTarget[i*nClass*nDomain + d*nClass + 0] = negativeTarget;
00086                 trainTarget[i*nClass*nDomain + d*nClass + 1] = positiveTarget;
00087                 trainLabel[i*nDomain + d] = 1;
00088             }
00089             else
00090                 assert ( false );  // any label other than -1 or 1 is treated as an error
00091         }
00092         f.close();
00093     }
00094 
00095     // test set: nothing is loaded for it here, so the test outputs start out empty
00096     nTest = 0;
00097     test = 0;
00098     testTarget = 0;
00099     testLabel = 0;
00100     // in framework mode 1 the arrays filled above are handed over as the test set and the train outputs are cleared
00101     if ( Framework::getFrameworkMode() == 1 )
00102     {
00103         cout<<endl<<"Set test data (and set train data to 0)"<<endl<<endl;
00104         test = train;
00105         train = 0;
00106         nTest = nTrain;
00107         nTrain = 0;
00108         testTarget = trainTarget;
00109         trainTarget = 0;
00110         testLabel = trainLabel;
00111         trainLabel = 0;
00112     }
00113 
00114 }

void DatasetReader::readKDDCup09Small ( string  path,
REAL *&  train,
REAL *&  trainTarget,
int *&  trainLabel,
REAL *&  test,
REAL *&  testTarget,
int *&  testLabel,
uint &  nTrain,
uint &  nTest,
int &  nClass,
int &  nDomain,
int &  nFeat,
REAL  positiveTarget = 1.0,
REAL  negativeTarget = -1.0 
)

Reads the KDDCup09Small dataset

Definition at line 1020 of file DatasetReader.cpp.

01021 {
01022     time_t t0 = time ( 0 );
01023 
01024     nDomain = 3;
01025 
01026     cout<<"Read KDDCup09 from: "<<path<<endl;
01027 
01028     char* targetFiles[] = {"orange_small_train_churn.labels"
01029                            ,"orange_small_train_appetency.labels"
01030                            ,"orange_small_train_upselling.labels"
01031                           };
01032 
01033     int nPreAlloc = 100000000;
01034     char *buf0 = new char[512], *buf1 = new char[512];
01035     char* lineBuf = new char[nPreAlloc];
01036 
01037     int NUM = 190, CAT = 40, NLINES = 50000;
01038     int nFiles = 1;
01039     bool setNumZerosToMeans = false;
01040     bool setMissingToMeans = false;
01041     int numericMinMissing = 1;
01042     int numericMaxCluster = 0;  // add categoric (one-hot) from numeric input, max. occurence cnt
01043     int minAttributeOccurenceCategorical = 200*nFiles;  // 20
01044     int minAttributeOccurenceNumerical = 500*nFiles;  // 50
01045     REAL maxSTD = 1e10; // 10
01046     cout<<"nFiles:"<<nFiles<<" minAttrOccurCat:"<<minAttributeOccurenceCategorical<<" minAttrOccurNum:"<<minAttributeOccurenceNumerical<<endl;
01047     cout<<setNumZerosToMeans<<" "<<setMissingToMeans<<" "<<numericMaxCluster<<" "<<minAttributeOccurenceCategorical<<" "<<minAttributeOccurenceNumerical<<" "<<maxSTD<<endl;
01048 
01049     vector<string>* numericalAttributes = new vector<string>[NUM];
01050     vector<int>* numericalAttributesCnt = new vector<int>[NUM];
01051     vector<string>* categoricalAttributes = new vector<string>[CAT];
01052     vector<int>* categoricalAttributesCnt = new vector<int>[CAT];
01053     bool* categoricalHasMissingBin = new bool[CAT];
01054     int* categoricalMissingCnt = new int[CAT];
01055     bool* categoricalHasUnknownBin = new bool[CAT];
01056     for ( int i=0;i<CAT;i++ )
01057     {
01058         categoricalHasMissingBin[i] = false;
01059         categoricalHasUnknownBin[i] = false;
01060         categoricalMissingCnt[i] = 0;
01061     }
01062     int* numericNonZeroCnt = new int[NUM];
01063     int* numericMissingCnt = new int[NUM];
01064     bool* numericHasMissingBin = new bool[NUM];
01065     double* numericNonZeroPercent = new double[NUM];
01066     for ( int i=0;i<NUM;i++ )
01067     {
01068         numericMissingCnt[i] = 0;
01069         numericNonZeroCnt[i] = 0;
01070         numericNonZeroPercent[i] = 0.0;
01071         numericHasMissingBin[i] = false;
01072     }
01073 
01074     double* minValues = new double[100000];
01075     double* maxValues = new double[100000];
01076     double* maxNormValues = new double[100000];
01077     double* meanValues = new double[100000];
01078     double* stdValues = new double[100000];
01079     double* mean2Values = new double[100000];
01080     int* meanCnt = new int[100000];
01081     for ( int i=0;i<100000;i++ )
01082     {
01083         minValues[i] = 1e20;
01084         maxValues[i] = -1e20;
01085         maxNormValues[i] = 0.0;
01086         meanValues[i] = 0.0;
01087         mean2Values[i] = 0.0;
01088         meanCnt[i] = 0;
01089         stdValues[i] = 0.0;
01090     }
01091 
01092     //===========================================================================================================================
01093     //===========================================================================================================================
01094     // Loop over 2 states:
01095     // - State=0  read train values (+build index tables)
01096     // - State=1  store to features (train or test)
01097     //
01098     for ( int state=0;state<2;state++ )
01099     {
01100         int nTrainFill = 0;
01101         if ( state == 0 )
01102         {
01103             nTrain = 0;
01104         }
01105 
01106         //=======================================================================================================================
01107         //=======================================================================================================================
01108         // Loop over n files (file chunks)
01109         //
01110         for ( int file=0;file<nFiles;file++ )
01111         {
01112             // open train or test set
01113             if ( state == 0 )
01114                 sprintf ( buf0,"%s/orange_small_train.data",path.c_str() );
01115             else
01116             {
01117                 if ( Framework::getFrameworkMode() == 1 )
01118                     sprintf ( buf0,"%s/orange_small_test.data",path.c_str() );
01119                 else
01120                     sprintf ( buf0,"%s/orange_small_train.data",path.c_str() );
01121             }
01122 
01123             cout<<"Open:"<<buf0<<endl;
01124             fstream f;
01125             f.open ( buf0, ios::in );
01126             if ( f.is_open() == false )
01127                 assert ( false );
01128 
01129             // read the first line in the first file (dummy)
01130             if ( file==0 )
01131                 f.getline ( lineBuf, nPreAlloc );
01132 
01133             // tmp and count vars
01134             double zeroRatio = 0.0;
01135             double sparse = 0.0;
01136             int nTrainTmp = 0;
01137 
01138             //===================================================================================================================
01139             //===================================================================================================================
01140             // Read all lines of chunk file n
01141             //
01142             while ( f.getline ( lineBuf, nPreAlloc ) )
01143             {
01144                 if ( nTrainTmp%1000 == 0 )
01145                     cout<<"."<<flush;
01146 
01147                 // tmp and count vars
01148                 int pos0 = 0, pos1 = 0;
01149                 int nF = 0, nMissing = 0, nZeros = 0;
01150                 int nFeatFill = 0;
01151                 int nrHot = 0;
01152                 double value;
01153 
01154                 if ( state == 1 )
01155                 {
01156                     // add constant one
01157                     train[nTrainFill*nFeat + nFeatFill] = 1.0;
01158                     nFeatFill++;
01159                 }
01160 
01161                 //===============================================================================================================
01162                 //===============================================================================================================
01163                 // Go through all characters of this line
01164                 //
01165                 while ( lineBuf[pos1] )
01166                 {
01167                     // search for next tabulator
01168                     while ( lineBuf[pos1] != '\t' && lineBuf[pos1] != 0 )
01169                         pos1++;
01170 
01171                     //===========================================================================================================
01172                     //===========================================================================================================
01173                     // If the feature has some content
01174                     // This means no consecutive tabs
01175                     //
01176                     if ( pos1 > pos0 && lineBuf[pos1]!=0 )
01177                     {
01178                         // copy to tmp buffer
01179                         if ( pos1-pos0 <=0 || pos1-pos0 >= 512 )
01180                             assert ( false );
01181                         for ( int j=0;j<pos1-pos0;j++ )
01182                             buf1[j] = lineBuf[pos0+j];
01183                         buf1[pos1-pos0] = 0;
01184 
01185 
01186                         //=======================================================================================================
01187                         //=======================================================================================================
01188                         // Read Numeric value (feature count < NUM)
01189                         //
01190                         if ( nF < NUM )
01191                         {
01192                             if ( ( buf1[0]>='0' && buf1[0] <='9' ) || buf1[0]=='-' )
01193                                 ;
01194                             else
01195                             {
01196                                 cout<<"BUF:"<<buf1<<endl;
01197                                 assert ( false );
01198                             }
01199 
01200                             //sscanf(buf1, "%f", &value);
01201                             value = atof ( buf1 );
01202 
01203                             if ( value == 0.0 )
01204                                 nZeros++;
01205 
01206                             // first run through train data
01207                             if ( state==0 )
01208                             {
01209                                 if ( minValues[nF] > value )
01210                                     minValues[nF] = value;
01211                                 if ( maxValues[nF] < value )
01212                                     maxValues[nF] = value;
01213 
01214                                 // histogram over numeric values
01215                                 int size = numericalAttributes[nF].size();
01216                                 if ( size < numericMaxCluster )
01217                                 {
01218                                     int foundIndex = -1;
01219                                     for ( int j=0;j<size;j++ )
01220                                         if ( numericalAttributes[nF][j] == buf1 )
01221                                         {
01222                                             foundIndex = j;
01223                                             break;
01224                                         }
01225                                     // add value
01226                                     if ( foundIndex == -1 )
01227                                     {
01228                                         numericalAttributes[nF].push_back ( buf1 );
01229                                         numericalAttributesCnt[nF].push_back ( 1 );
01230                                     }
01231                                     else
01232                                         numericalAttributesCnt[nF][foundIndex]++;
01233                                 }
01234 
01235                                 if ( value != 0.0 )
01236                                 {
01237                                     numericNonZeroCnt[nF]++;
01238                                     if ( numericNonZeroCnt[nF] > nTrain+nTrainTmp+1 )
01239                                     {
01240                                         cout<<"numericNonZeroCnt[nF]:"<<numericNonZeroCnt[nF]<<" nF:"<<nF<<" nTrainTmp:"<<nTrainTmp<<" nZeros:"<<nZeros<<" pos0:"<<pos0<<" pos1:"<<pos1<<endl;
01241                                         assert ( false );
01242                                     }
01243                                 }
01244 
01245                                 if ( value != 0.0 )
01246                                 {
01247                                     // calc mean over numeric input
01248                                     meanValues[nF] += value;
01249                                     mean2Values[nF] += value * value;
01250                                     meanCnt[nF]++;
01251                                 }
01252                             }
01253                             else if ( state==1 ) // second run, fill data tables
01254                             {
01255                                 if ( numericNonZeroCnt[nF] >= minAttributeOccurenceNumerical && maxNormValues[nF] < stdValues[nF]*maxSTD )
01256                                 {
01257                                     // numeric add
01258                                     if ( value == 0.0 && setNumZerosToMeans )
01259                                         train[nTrainFill*nFeat + nFeatFill] = meanValues[nF];
01260                                     else
01261                                         train[nTrainFill*nFeat + nFeatFill] = value;
01262                                     nFeatFill++;
01263 
01264                                     // numeric one hot add
01265                                     int size = numericalAttributes[nF].size();
01266                                     if ( size < numericMaxCluster && size > 1 )
01267                                     {
01268                                         int foundIndex = -1;
01269                                         for ( int j=0;j<size;j++ )
01270                                             if ( numericalAttributes[nF][j] == buf1 )
01271                                             {
01272                                                 foundIndex = j;
01273                                                 break;
01274                                             }
01275                                         // fill categorical
01276                                         int beforeHot = nrHot;
01277                                         for ( int j=0;j<size;j++ )
01278                                         {
01279                                             if ( foundIndex == j )
01280                                             {
01281                                                 train[nTrainFill*nFeat + nFeatFill] = 1.0;
01282                                                 nrHot++;
01283                                             }
01284                                             else
01285                                                 train[nTrainFill*nFeat + nFeatFill] = 0.0;
01286                                             nFeatFill++;
01287                                         }
01288                                         // fill missing
01289                                         /*if(nrHot == beforeHot)
01290                                             train[nTrainFill*nFeat + nFeatFill] = 0.0;
01291                                             else
01292                                             train[nTrainFill*nFeat + nFeatFill] = 1.0;
01293                                             nFeatFill++;*/
01294                                     }
01295 
01296                                 }
01297 
01298                                 // missing values one-hot encoded
01299                                 if ( numericHasMissingBin[nF] )
01300                                 {
01301                                     train[nTrainFill*nFeat + nFeatFill] = 0.0;  // <- missing
01302                                     nFeatFill++;
01303                                     train[nTrainFill*nFeat + nFeatFill] = 1.0;  // <- available
01304                                     nFeatFill++;
01305                                 }
01306                             }
01307                         }
01308                         //=======================================================================================================
01309                         //=======================================================================================================
01310                         // Read Categorical value (feature count >= NUM)
01311                         //
01312                         else
01313                         {
01314                             int index = nF-NUM;
01315                             if ( index >= CAT )
01316                                 assert ( false );
01317                             int size = categoricalAttributes[index].size();
01318                             int sizeCnt = categoricalAttributesCnt[index].size();
01319                             if ( size != sizeCnt )
01320                                 assert ( false );
01321 
01322                             int foundIndex = -1;
01323                             for ( int j=0;j<size;j++ )
01324                                 if ( categoricalAttributes[index][j] == buf1 )
01325                                 {
01326                                     foundIndex = j;
01327                                     break;
01328                                 }
01329 
01330                             // first run through train data
01331                             if ( state==0 )
01332                             {
01333                                 // add value
01334                                 if ( foundIndex == -1 )
01335                                 {
01336                                     categoricalAttributes[index].push_back ( buf1 );
01337                                     categoricalAttributesCnt[index].push_back ( 1 );
01338                                 }
01339                                 else // already exists
01340                                     categoricalAttributesCnt[index][foundIndex]++;
01341                             }
01342                             else if ( state==1 ) // second run, fill data tables
01343                             {
01344                                 // one-hot encoding
01345                                 int fillCnt = 0;
01346                                 int beforeHot = nrHot;
01347                                 for ( int j=0;j<size;j++ )
01348                                 {
01349                                     if ( categoricalAttributesCnt[index][j] >= minAttributeOccurenceCategorical && categoricalAttributesCnt[index][j] < nTrain )
01350                                     {
01351                                         if ( foundIndex == j )
01352                                         {
01353                                             train[nTrainFill*nFeat + nFeatFill] = 1.0;
01354                                             nrHot++;
01355                                         }
01356                                         else
01357                                             train[nTrainFill*nFeat + nFeatFill] = 0.0;
01358                                         fillCnt++;
01359                                         nFeatFill++;
01360                                     }
01361                                 }
01362 
01363                                 // no missing (no consecutive tabs here)
01364                                 if ( categoricalHasMissingBin[index] )
01365                                 {
01366                                     train[nTrainFill*nFeat + nFeatFill] = 0.0;
01367                                     fillCnt++;
01368                                     nFeatFill++;
01369                                 }
01370 
01371                                 // if found, but not in cache
01372                                 if ( categoricalHasUnknownBin[index] )
01373                                 {
01374                                     if ( beforeHot == nrHot )
01375                                     {
01376                                         train[nTrainFill*nFeat + nFeatFill] = 1.0;
01377                                         nrHot++;
01378                                     }
01379                                     else
01380                                         train[nTrainFill*nFeat + nFeatFill] = 0.0;
01381                                     fillCnt++;
01382                                     nFeatFill++;
01383                                 }
01384 
01385                                 if ( nrHot != beforeHot + 1 && fillCnt > 0 )
01386                                 {
01387                                     cout<<"WARNING: foundIndex:"<<foundIndex<<" "<<size<<" "<<index<<" "<<nrHot<<" "<<beforeHot<<" fill:"<<fillCnt<<endl;
01388                                     //assert(false);
01389                                 }
01390                             }
01391                         }
01392                     }
01393                     //===========================================================================================================
01394                     //===========================================================================================================
01395                     // If the feature has no content
01396                     // Missing value here
01397                     //
01398                     else
01399                     {
01400                         nMissing++;
01401 
01402                         if ( state==0 )
01403                         {
01404                             // numeric
01405                             if ( nF < NUM )
01406                             {
01407                                 numericMissingCnt[nF]++;
01408                             }
01409                             // categorical
01410                             if ( nF >= NUM )
01411                             {
01412                                 int index = nF-NUM;
01413                                 categoricalMissingCnt[index]++;
01414                             }
01415                         }
01416 
01417                         // second run, fill data tables with zeros
01418                         if ( state==1 )
01419                         {
01420                             //===================================================================================================
01421                             //===================================================================================================
01422                             // Read Numeric value (feature count < NUM)
01423                             //
01424                             if ( nF < NUM )
01425                             {
01426                                 if ( numericNonZeroCnt[nF] >= minAttributeOccurenceNumerical && maxNormValues[nF] < stdValues[nF]*maxSTD )
01427                                 {
01428                                     // numeric add
01429                                     if ( setMissingToMeans )
01430                                         train[nTrainFill*nFeat + nFeatFill] = meanValues[nF];
01431                                     else
01432                                         train[nTrainFill*nFeat + nFeatFill] = 0.0;
01433                                     nFeatFill++;
01434 
01435                                     // numeric one hot add
01436                                     int size = numericalAttributes[nF].size();
01437                                     if ( size < numericMaxCluster && size > 1 )
01438                                     {
01439                                         // fill categorical
01440                                         for ( int j=0;j<size;j++ )
01441                                         {
01442                                             train[nTrainFill*nFeat + nFeatFill] = 0.0;
01443                                             nFeatFill++;
01444                                         }
01445                                         // fill missing
01446                                         //train[nTrainFill*nFeat + nFeatFill] = 1.0;
01447                                         //nFeatFill++;
01448                                     }
01449                                 }
01450 
01451                                 // missing values one-hot encoded
01452                                 if ( numericHasMissingBin[nF] )
01453                                 {
01454                                     train[nTrainFill*nFeat + nFeatFill] = 1.0;  // <- missing
01455                                     nFeatFill++;
01456                                     train[nTrainFill*nFeat + nFeatFill] = 0.0;  // <- available
01457                                     nFeatFill++;
01458                                 }
01459                             }
01460                             //===================================================================================================
01461                             //===================================================================================================
01462                             // Read Categorical value (feature count >= NUM)
01463                             //
01464                             else
01465                             {
01466                                 int index = nF - NUM;
01467                                 if ( index >= CAT )
01468                                     assert ( false );
01469                                 int size = categoricalAttributes[index].size();
01470                                 int sizeCnt = categoricalAttributesCnt[index].size();
01471                                 if ( size != sizeCnt )
01472                                     assert ( false );
01473 
01474                                 // one-hot encoding
01475                                 int fillCnt = 0;
01476                                 int beforeHot = nrHot;
01477                                 for ( int j=0;j<size;j++ )
01478                                 {
01479                                     if ( categoricalAttributesCnt[index][j] >= minAttributeOccurenceCategorical && categoricalAttributesCnt[index][j] < nTrain )
01480                                     {
01481                                         train[nTrainFill*nFeat + nFeatFill] = 0.0;  // no here
01482                                         fillCnt++;
01483                                         nFeatFill++;
01484                                     }
01485                                 }
01486                                 if ( categoricalHasMissingBin[index] )
01487                                 {
01488                                     if ( fillCnt == 0 && categoricalHasUnknownBin[index] == false )
01489                                     {
01490                                         cout<<"categoricalMissingCnt["<<index<<"]:"<<categoricalMissingCnt[index]<<endl;
01491                                         assert ( false );
01492                                     }
01493                                     // set the input to "missing value"
01494                                     train[nTrainFill*nFeat + nFeatFill] = 1.0;
01495                                     nrHot++;
01496                                     fillCnt++;
01497                                     nFeatFill++;
01498                                 }
01499 
01500                                 if ( categoricalHasUnknownBin[index] )
01501                                 {
01502                                     // no unknown value
01503                                     train[nTrainFill*nFeat + nFeatFill] = 0.0;
01504                                     fillCnt++;
01505                                     nFeatFill++;
01506                                 }
01507 
01508                                 if ( nrHot != beforeHot + 1 && fillCnt > 0 )
01509                                 {
01510                                     cout<<"WARNING: "<<size<<" "<<index<<" "<<nrHot<<" "<<beforeHot<<" fill:"<<fillCnt<<endl;
01511                                     //assert(false);
01512                                 }
01513                             }
01514                         }
01515                     }
01516 
01517                     // check for last character
01518                     if ( lineBuf[pos1]!=0 )
01519                         pos1++;
01520 
01521                     // beginpos = endpos
01522                     pos0 = pos1;
01523 
01524                     // column count
01525                     nF++;
01526                 }
01527 
01528                 // valid checks
01529                 if ( nF != NUM + CAT )
01530                     assert ( false );
01531                 if ( state==1 )
01532                 {
01533                     if ( nFeatFill != nFeat )
01534                     {
01535                         cout<<"nFeatFill:"<<nFeatFill<<" nFeat:"<<nFeat<<endl;
01536                         assert ( false );
01537                     }
01538                     nTrainFill++;
01539                 }
01540 
01541                 nTrainTmp++;
01542 
01543                 sparse += nMissing / ( double ) nF;
01544                 zeroRatio += nZeros / ( double ) nF;
01545             }
01546 
01547             f.close();
01548 
01549             // ratio of sparseness and zeroPercent
01550             sparse /= ( double ) nTrainTmp;
01551             zeroRatio /= ( double ) nTrainTmp;
01552             cout<<"nTrainTmp:"<<nTrainTmp<<endl;
01553             cout<<"missing values:"<<100.0*sparse<<"%"<<endl;
01554             cout<<"zero values:"<<100.0*zeroRatio<<"%"<<endl;
01555 
01556             double min0 = 1e20, max0 = -1e20;
01557             for ( int i=0;i<100000;i++ )
01558             {
01559                 if ( min0 > minValues[i] )
01560                     min0 = minValues[i];
01561                 if ( max0 < maxValues[i] )
01562                     max0 = maxValues[i];
01563             }
01564             cout<<"min|max values: "<<min0<<"|"<<max0<<endl;
01565 
01566             int sum = 0;
01567             for ( int j=0;j<CAT;j++ )
01568                 sum += categoricalAttributes[j].size();
01569             cout<<"nCategoricalSum:"<<sum<<endl;
01570 
01571             if ( state == 0 )
01572                 nTrain += nTrainTmp;
01573 
01574         }
01575 
01576         // do some checks
01577         if ( state == 1 )
01578         {
01579             if ( nTrain != nTrainFill )
01580                 assert ( false );
01581 
01582             for ( int i=0;i<nTrain*nFeat;i++ )
01583                 if ( train[i] == 1e10 )
01584                 {
01585                     cout<<"i:"<<i<<endl;
01586                     assert ( false );
01587                 }
01588         }
01589 
01590         if ( state==0 )
01591         {
01592             for ( int i=0;i<NUM;i++ )
01593                 numericNonZeroPercent[i] = ( double ) numericNonZeroCnt[i]/ ( double ) nTrain;
01594             for ( int i=0;i<100000;i++ )
01595                 if ( meanCnt[i] > 0 )
01596                 {
01597                     meanValues[i] /= ( double ) meanCnt[i];
01598                     stdValues[i] = sqrt ( mean2Values[i]/ ( double ) meanCnt[i] - meanValues[i]/ ( double ) meanCnt[i] );
01599                     maxNormValues[i] = fabs ( maxValues[i] - meanValues[i] );
01600                     if ( maxNormValues[i] < fabs ( minValues[i] - meanValues[i] ) )
01601                         maxNormValues[i] = fabs ( minValues[i] - meanValues[i] );
01602                 }
01603 
01604             cout<<"nTrain:"<<nTrain<<endl;
01605 
01606             // === Calculate effective number of input features ===
01607             nFeat = 1; // const
01608             int nFeatNum = 0, nFeatNumRaw = 0, nFeatNumCat = 0, nFeatCat = 0, nUnknown = 0, nMissing = 0, nIn = 0, nNumMiss = 0;
01609             // numerical
01610             for ( int j=0;j<NUM;j++ )
01611             {
01612                 if ( numericNonZeroCnt[j] >= minAttributeOccurenceNumerical && maxNormValues[j] < stdValues[j]*maxSTD )
01613                 {
01614                     // standard numerical input
01615                     nFeat++;
01616                     nFeatNum++;
01617                     nFeatNumRaw++;
01618 
01619                     // numerical input with limited number of different values -> translate it to categorical input
01620                     if ( numericalAttributes[j].size() < numericMaxCluster && numericalAttributes[j].size() > 1 )
01621                     {
01622                         cout<<"nFeatNum:"<<nFeatNum<<" ";
01623                         for ( int k=0;k<numericalAttributes[j].size();k++ )
01624                         {
01625                             cout<<numericalAttributes[j][k]<<"("<<numericalAttributesCnt[j][k]<<") ";
01626                             nFeat++;
01627                             nFeatNum++;
01628                             nFeatNumCat++;
01629                         }
01630                         cout<<endl;
01631                         /*
01632                         // add one bin for "missing or unknown value"
01633                             nFeat++;
01634                             nFeatNum++;
01635                             nFeatNumCat++;*/
01636                     }
01637                     if ( numericMissingCnt[j] >= numericMinMissing )
01638                     {
01639                         numericHasMissingBin[j] = true;
01640                         nFeat+=2;
01641                         nNumMiss+=2;
01642                     }
01643                 }
01644             }
01645             // categorical
01646             for ( int j=0;j<CAT;j++ )
01647             {
01648                 int nUsed = 0, nUn = 0, nCat = 0, nMiss = 0, nUnk = 0;
01649                 for ( int k=0;k<categoricalAttributesCnt[j].size();k++ )
01650                 {
01651                     // count valid entries (with enough occurence)
01652                     if ( categoricalAttributesCnt[j][k] >= minAttributeOccurenceCategorical && categoricalAttributesCnt[j][k] < nTrain )
01653                     {
01654                         nFeat++;
01655                         nFeatCat++;
01656                         nUsed++;
01657                         nIn++;
01658                         nCat++;
01659                     }
01660                     else if ( categoricalAttributesCnt[j][k] < nTrain ) // not enough occurence -> put to unknown
01661                         nUn++;
01662                 }
01663                 // missing is like a normal categoric input
01664                 if ( ( categoricalMissingCnt[j] >= minAttributeOccurenceCategorical && categoricalMissingCnt[j] < nTrain ) || categoricalMissingCnt[j] > 0 && nCat > 0 )
01665                 {
01666                     // add a "missing value" input of this feature
01667                     nFeat++;
01668                     nFeatCat++;
01669                     nMissing++;
01670                     nMiss++;
01671                     categoricalHasMissingBin[j] = true;
01672                 }
01673                 if ( nUn > 0 && nCat + nMiss > 0 )
01674                 {
01675                     // add a "unknown value" input of this feature
01676                     nFeat++;
01677                     nFeatCat++;
01678                     nUnknown++;
01679                     nUnk++;
01680                     categoricalHasUnknownBin[j] = true;
01681                 }
01682 
01683                 if ( nCat + nMiss + nUnk == 1 )
01684                     assert ( false );
01685             }
01686 
01687             cout<<"nFeat:"<<nFeat<<" (numInputs:"<<nFeatNum<<" [rawNum:"<<nFeatNumRaw<<" nFeatNumCat:"<<nFeatNumCat<<"] catInputs:"<<nFeatCat<<" [nUnknown:"<<nUnknown<<" nMissing:"<<nMissing<<" nCat:"<<nIn<<"] numMissingHot:"<<nNumMiss<<" [+1const.])"<<endl;
01688 
01689             cout<<"Allocate train features: "<< ( double ) nTrain*nFeat/1e6*4.0<<" MB"<<endl;
01690             train = new REAL[nTrain*nFeat];
01691             for ( int i=0;i<nTrain*nFeat;i++ )
01692                 train[i] = 1e10;
01693 
01694             //support = new REAL[nFeat];
01695             //supportCnt = new int[nFeat];
01696             //for(int i=0;i<nFeat;i++)
01697             //{
01698             //support[i] = 0.0;
01699             //supportCnt[i] = 0;
01700             //}
01701 
01702             // read targets
01703             nClass = 2;
01704             trainTarget = new REAL[nTrain*nClass*nDomain];
01705             trainLabel = new int[nTrain*nDomain];
01706             for ( int d=0;d<nDomain;d++ )
01707             {
01708                 sprintf ( buf0,"%s/%s",path.c_str(),targetFiles[d] );
01709                 fstream f;
01710                 cout<<"Open targets:"<<buf0<<endl;
01711                 f.open ( buf0,ios::in );
01712                 if ( f.is_open() == false )
01713                     assert ( false );
01714                 int label;
01715                 for ( int i=0;i<nTrain;i++ )
01716                 {
01717                     f>>label;
01718                     if ( label==-1 )
01719                     {
01720                         trainTarget[i*nClass*nDomain + d*nClass + 0] = positiveTarget;
01721                         trainTarget[i*nClass*nDomain + d*nClass + 1] = negativeTarget;
01722                         trainLabel[i*nDomain + d] = 0;
01723                     }
01724                     else if ( label==1 )
01725                     {
01726                         trainTarget[i*nClass*nDomain + d*nClass + 0] = negativeTarget;
01727                         trainTarget[i*nClass*nDomain + d*nClass + 1] = positiveTarget;
01728                         trainLabel[i*nDomain + d] = 1;
01729                     }
01730                     else
01731                         assert ( false );
01732                 }
01733                 f.close();
01734             }
01735             // test set
01736             nTest = 0;
01737             test = 0;
01738             testTarget = 0;
01739             testLabel = 0;
01740 
01741         }
01742     }
01743 
01744     for ( int i=0;i<nTrain;i++ )
01745         for ( int j=0;j<nFeat;j++ )
01746             if ( train[i*nFeat+j] == 1e10 )
01747             {
01748                 cout<<"i:"<<i<<" j:"<<j<<" "<<train[i*nFeat+j]<<endl;
01749                 assert ( false );
01750             }
01751 
01752 
01753     fstream f;
01754     if ( lineBuf )
01755     {
01756         delete[] lineBuf;
01757         lineBuf = 0;
01758     }
01759     if ( numericNonZeroCnt )
01760     {
01761         delete[] numericNonZeroCnt;
01762         numericNonZeroCnt = 0;
01763     }
01764     if ( numericNonZeroPercent )
01765     {
01766         delete[] numericNonZeroPercent;
01767         numericNonZeroPercent = 0;
01768     }
01769     if ( categoricalAttributes )
01770     {
01771         delete[] categoricalAttributes;
01772         categoricalAttributes = 0;
01773     }
01774     if ( meanValues )
01775     {
01776         delete[] meanValues;
01777         meanValues = 0;
01778     }
01779     if ( meanCnt )
01780     {
01781         delete[] meanCnt;
01782         meanCnt = 0;
01783     }
01784     if ( categoricalHasMissingBin )
01785     {
01786         delete[] categoricalHasMissingBin;
01787         categoricalHasMissingBin = 0;
01788     }
01789 
01790     if ( Framework::getFrameworkMode() == 1 )
01791     {
01792         cout<<endl<<"Set test data (and set train data to 0)"<<endl<<endl;
01793         test = train;
01794         train = 0;
01795         nTest = nTrain;
01796         nTrain = 0;
01797         testTarget = trainTarget;
01798         trainTarget = 0;
01799         testLabel = trainLabel;
01800         trainLabel = 0;
01801     }
01802 
01803     cout<<endl<<"Finished read in "<<time ( 0 )-t0<<"[s]"<<endl<<endl;
01804 
01805 }

void DatasetReader::readLETTER ( string  path,
REAL *&  train,
REAL *&  trainTarget,
int *&  trainLabel,
REAL *&  test,
REAL *&  testTarget,
int *&  testLabel,
uint &  nTrain,
uint &  nTest,
int &  nClass,
int &  nDomain,
int &  nFeat,
REAL  positiveTarget = 1.0,
REAL  negativeTarget = -1.0 
)

Reads the LETTER dataset (UCI)

712565Bytes letter-recognition.data 2734Bytes letter-recognition.names

Definition at line 3843 of file DatasetReader.cpp.

03844 {  // Reads letter-recognition.data below `path` and splits it into a train and a test set.
03845     cout<<"Read LETTER from: "<<path<<endl;
03846     nDomain = 1;
03847 
03848     // define data type and files
03849     int targetColumn = 1;
03850     uint nTrainTmp;  // handed to getDataBounds by reference as its line count (nLines)
03851     char columnType[] = "dnnnnnnnnnnnnnnnn";  // one character per column; presumably 'd' = discrete, 'n' = numeric (confirm in getDataBounds)
03852     char enabledCol[] = "11111111111111111";  // one flag per column, all set to '1' here
03853     const string dataFile = path+"/letter-recognition.data";  // owned locally and released on return, so no heap-allocated string is leaked
03854     const char* dataFiles[] = { dataFile.c_str(),0};  // 0-terminated file list; the pointer stays valid while dataFile is in scope
03855     // === TRAIN SET ===
03856     getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0 );  // sizing pass: fillData is left at its default (false)
03857 
03858     // allocate tmp mem
03859     REAL* trainTmp = new REAL[nTrainTmp * nFeat];
03860     int* trainLabelTmp = new int[nTrainTmp];
03861 
03862     // fill data
03863     getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0, true, trainTmp, trainLabelTmp );  // fill pass: fillData=true with the temporary buffers
03864 
03865     // split train and testset from trainTmp
03866     splitRandomTestset ( 0.2, trainTmp, trainLabelTmp, nTrainTmp, nFeat, nClass, train, trainLabel, trainTarget, test, testLabel, testTarget, nTrain, nTest, positiveTarget, negativeTarget, true );  // true = take the last n percent (without random selection)
03867 
03868     delete[] trainTmp;  // temporary buffers are released after the split
03869     delete[] trainLabelTmp;
03870 
03871 }

void DatasetReader::readMNIST ( string  path,
REAL *&  train,
REAL *&  trainTarget,
int *&  trainLabel,
REAL *&  test,
REAL *&  testTarget,
int *&  testLabel,
uint &  nTrain,
uint &  nTest,
int &  nClass,
int &  nDomain,
int &  nFeat,
REAL  positiveTarget = 1.0,
REAL  negativeTarget = -1.0 
)

Reads the dataset from the following files. Trainset: train-images-idx3-ubyte (47040016 Bytes), train-labels-idx1-ubyte (60008 Bytes). Testset: t10k-images-idx3-ubyte (7840016 Bytes), t10k-labels-idx1-ubyte (10008 Bytes).

Definition at line 2365 of file DatasetReader.cpp.

02366 {  // Loads the four MNIST files below `path`, scales pixels by 1/255 and builds one target vector of nClass entries per sample.
02367     cout<<"Read MNIST from: "<<path<<endl;
02368 
02369     fstream fTrain ( ( path+"/"+string ( "train-images-idx3-ubyte" ) ).c_str(), ios::in | ios::binary );  // raw bytes: binary mode avoids newline/EOF translation where text mode differs
02370     fstream fTrainLabels ( ( path+"/"+string ( "train-labels-idx1-ubyte" ) ).c_str(), ios::in | ios::binary );
02371     fstream fTest ( ( path+"/"+string ( "t10k-images-idx3-ubyte" ) ).c_str(), ios::in | ios::binary );
02372     fstream fTestLabels ( ( path+"/"+string ( "t10k-labels-idx1-ubyte" ) ).c_str(), ios::in | ios::binary );
02373 
02374     if ( fTrain.is_open() ==false || fTrainLabels.is_open() ==false || fTest.is_open() ==false || fTestLabels.is_open() ==false )
02375     {
02376         cout<<"Error in opening the files"<<endl;
02377         exit ( 0 );  // NOTE(review): terminates with status 0 although this is an error path
02378     }
02379 
02380     // population
02381     nClass = 10;
02382     nDomain = 1;
02383     nTrain = 60000;
02384     nTest  = 10000;
02385     nFeat = 784;    // (28 x 28 pixel 8-Bit images)
02386 
02387     // allocate mem
02388     unsigned char* trainChar = new unsigned char[nTrain * nFeat];
02389     unsigned char* testChar = new unsigned char[nTest * nFeat];
02390     unsigned char* trainLabelChar = new unsigned char[nTrain];
02391     unsigned char* testLabelChar = new unsigned char[nTest];
02392 
02393     // load raw data
02394     unsigned int dummy;  // header fields are read and discarded; the sizes are the constants set above
02395     fTrain.read ( ( char* ) &dummy, sizeof ( int ) );  // magic number
02396     fTrain.read ( ( char* ) &dummy, sizeof ( int ) );  // #images
02397     fTrain.read ( ( char* ) &dummy, sizeof ( int ) );  // rows
02398     fTrain.read ( ( char* ) &dummy, sizeof ( int ) );  // cols
02399     fTrain.read ( ( char* ) trainChar, sizeof ( unsigned char ) *nTrain*nFeat );  // images
02400     fTrain.close();
02401 
02402     fTrainLabels.read ( ( char* ) &dummy, sizeof ( int ) );  // magic number
02403     fTrainLabels.read ( ( char* ) &dummy, sizeof ( int ) );  // #items
02404     fTrainLabels.read ( ( char* ) trainLabelChar, sizeof ( unsigned char ) *nTrain );  // labels
02405     fTrainLabels.close();
02406 
02407     fTest.read ( ( char* ) &dummy, sizeof ( int ) );  // magic number
02408     fTest.read ( ( char* ) &dummy, sizeof ( int ) );  // #images
02409     fTest.read ( ( char* ) &dummy, sizeof ( int ) );  // rows
02410     fTest.read ( ( char* ) &dummy, sizeof ( int ) );  // cols
02411     fTest.read ( ( char* ) testChar, sizeof ( unsigned char ) *nTest*nFeat );  // images
02412     fTest.close();
02413 
02414     fTestLabels.read ( ( char* ) &dummy, sizeof ( int ) );  // magic number
02415     fTestLabels.read ( ( char* ) &dummy, sizeof ( int ) );  // #items
02416     fTestLabels.read ( ( char* ) testLabelChar, sizeof ( unsigned char ) *nTest );  // labels
02417     fTestLabels.close();
02418 
02419     // row x col train images as test pgm file
02420     int rows = 50, cols = 100;  // the first rows*cols training images are tiled into one picture
02421     fstream fimg ( ( path + "/MNIST.pgm" ).c_str(),ios::out | ios::binary );  // P5 is a binary format: pixel bytes must be written untranslated
02422     char buf[256];
02423     sprintf ( buf,"P5\n%d %d\n255\n", cols*28, rows*28 );  // header: magic, width, height, max gray value
02424     fimg<<buf;
02425     // image
02426     for ( int I=0;I<rows;I++ )  // I: tile row
02427     {
02428         // write image
02429         for ( int j=0;j<28;j++ )  // j: pixel row inside the tile
02430         {
02431             for ( int i=0;i<cols;i++ )  // i: tile column
02432             {
02433                 for ( int k=0;k<28;k++ )  // k: pixel column inside the tile
02434                     fimg.write ( ( char* ) &trainChar[k + i*nFeat + j*28 + I*cols*nFeat], sizeof ( unsigned char ) );  // image index is I*cols+i, pixel offset is j*28+k
02435             }
02436         }
02437     }
02438     fimg.close();
02439 
02440     // allocate + write dataset
02441     train = new REAL[nTrain * nFeat];
02442     trainLabel = new int[nTrain];
02443     test = new REAL[nTest * nFeat];
02444     testLabel = new int[nTest];
02445 
02446     for ( int i=0;i<nTrain;i++ )
02447     {
02448         trainLabel[i] = ( int ) trainLabelChar[i];
02449         for ( int j=0;j<nFeat;j++ )
02450             train[i*nFeat + j] = ( REAL ) trainChar[i*nFeat + j] / 255.0;  // 8-bit pixel scaled by 1/255
02451     }
02452 
02453     for ( int i=0;i<nTest;i++ )
02454     {
02455         testLabel[i] = ( int ) testLabelChar[i];
02456         for ( int j=0;j<nFeat;j++ )
02457             test[i*nFeat + j] = ( REAL ) testChar[i*nFeat + j] / 255.0;  // 8-bit pixel scaled by 1/255
02458     }
02459 
02460     // train targets
02461     trainTarget = new REAL[nClass*nTrain];
02462     for ( int i=0;i<nTrain;i++ )
02463     {
02464         for ( int j=0;j<nClass;j++ )
02465             trainTarget[i*nClass + j] = negativeTarget;  // negative class labels
02466         trainTarget[i*nClass + trainLabel[i]] = positiveTarget;  // positive class label
02467     }
02468 
02469     // test targets
02470     testTarget = new REAL[nClass*nTest];
02471     for ( int i=0;i<nTest;i++ )
02472     {
02473         for ( int j=0;j<nClass;j++ )
02474             testTarget[i*nClass + j] = negativeTarget;  // negative class labels
02475         testTarget[i*nClass + testLabel[i]] = positiveTarget;  // positive class label
02476     }
02477 
02478     // free raw data
02479     if ( trainChar )
02480     {
02481         delete[] trainChar;
02482         trainChar = 0;
02483     }
02484     if ( testChar )
02485     {
02486         delete[] testChar;
02487         testChar = 0;
02488     }
02489     if ( trainLabelChar )
02490     {
02491         delete[] trainLabelChar;
02492         trainLabelChar = 0;
02493     }
02494     if ( testLabelChar )
02495     {
02496         delete[] testLabelChar;
02497         testLabelChar = 0;
02498     }
02499 }

void DatasetReader::readMONKS1 ( string  path,
REAL *&  train,
REAL *&  trainTarget,
int *&  trainLabel,
REAL *&  test,
REAL *&  testTarget,
int *&  testLabel,
uint &  nTrain,
uint &  nTest,
int &  nClass,
int &  nDomain,
int &  nFeat,
REAL  positiveTarget = 1.0,
REAL  negativeTarget = -1.0 
)

Reads the MONKS1 dataset (UCI)

10260Bytes monks-1.test 2947Bytes monks-1.train

Definition at line 3879 of file DatasetReader.cpp.

03880 {  // Reads monks-1.train and monks-1.test below `path` into the train and test outputs.
03881     cout<<"Read MONKS1 from: "<<path<<endl;
03882     nDomain = 1;
03883 
03884     // define data type and files
03885     int targetColumn = 1;
03886     char columnType[] = "dnnnnnnd";  // one character per column; presumably 'd' = discrete, 'n' = numeric (confirm in getDataBounds)
03887     char enabledCol[] = "11111110";  // one flag per column; the last column is flagged '0'
03888     const string trainFile = path+"/monks-1.train", testFile = path+"/monks-1.test";  // owned locally and released on return, so no heap-allocated strings are leaked
03889     const char* dataFiles[] = { trainFile.c_str(), testFile.c_str(),0};  // 0-terminated file list; index 0 = train file, index 1 = test file
03890     // === TRAIN SET ===
03891     getDataBounds ( dataFiles, " ", nFeat, nClass, nTrain, columnType, enabledCol, targetColumn, 0 );  // sizing pass on file 0: fillData is left at its default (false)
03892     train = new REAL[nFeat*nTrain];
03893     trainLabel = new int[nTrain];
03894     getDataBounds ( dataFiles, " ", nFeat, nClass, nTrain, columnType, enabledCol, targetColumn, 0, true, train, trainLabel );  // fill pass on file 0
03895 
03896     // === TEST SET ===
03897     getDataBounds ( dataFiles, " ", nFeat, nClass, nTest, columnType, enabledCol, targetColumn, 1 );  // sizing pass on file 1
03898     test = new REAL[nFeat*nTest];
03899     testLabel = new int[nTest];
03900     getDataBounds ( dataFiles, " ", nFeat, nClass, nTest, columnType, enabledCol, targetColumn, 1, true, test, testLabel );  // fill pass on file 1
03901 
03902     // make numerical test targets
03903     makeNumericTrainAndTestTargets ( nClass, nTrain, nTest, positiveTarget, negativeTarget, trainLabel, testLabel, trainTarget, testTarget );
03904 
03905 }

void DatasetReader::readMONKS2 ( string  path,
REAL *&  train,
REAL *&  trainTarget,
int *&  trainLabel,
REAL *&  test,
REAL *&  testTarget,
int *&  testLabel,
uint &  nTrain,
uint &  nTest,
int &  nClass,
int &  nDomain,
int &  nFeat,
REAL  positiveTarget = 1.0,
REAL  negativeTarget = -1.0 
)

Reads the MONKS2 dataset (UCI)

10260Bytes monks-2.test 4013Bytes monks-2.train

Definition at line 3913 of file DatasetReader.cpp.

03914 {  // Reads monks-2.train and monks-2.test below `path` into the train and test outputs.
03915     cout<<"Read MONKS2 from: "<<path<<endl;
03916     nDomain = 1;
03917 
03918     // define data type and files
03919     int targetColumn = 1;
03920     char columnType[] = "dnnnnnnd";  // one character per column; presumably 'd' = discrete, 'n' = numeric (confirm in getDataBounds)
03921     char enabledCol[] = "11111110";  // one flag per column; the last column is flagged '0'
03922     const string trainFile = path+"/monks-2.train", testFile = path+"/monks-2.test";  // owned locally and released on return, so no heap-allocated strings are leaked
03923     const char* dataFiles[] = { trainFile.c_str(), testFile.c_str(),0};  // 0-terminated file list; index 0 = train file, index 1 = test file
03924     // === TRAIN SET ===
03925     getDataBounds ( dataFiles, " ", nFeat, nClass, nTrain, columnType, enabledCol, targetColumn, 0 );  // sizing pass on file 0: fillData is left at its default (false)
03926     train = new REAL[nFeat*nTrain];
03927     trainLabel = new int[nTrain];
03928     getDataBounds ( dataFiles, " ", nFeat, nClass, nTrain, columnType, enabledCol, targetColumn, 0, true, train, trainLabel );  // fill pass on file 0
03929 
03930     // === TEST SET ===
03931     getDataBounds ( dataFiles, " ", nFeat, nClass, nTest, columnType, enabledCol, targetColumn, 1 );  // sizing pass on file 1
03932     test = new REAL[nFeat*nTest];
03933     testLabel = new int[nTest];
03934     getDataBounds ( dataFiles, " ", nFeat, nClass, nTest, columnType, enabledCol, targetColumn, 1, true, test, testLabel );  // fill pass on file 1
03935 
03936     // make numerical test targets
03937     makeNumericTrainAndTestTargets ( nClass, nTrain, nTest, positiveTarget, negativeTarget, trainLabel, testLabel, trainTarget, testTarget );
03938 
03939 }

void DatasetReader::readMONKS3 ( string  path,
REAL *&  train,
REAL *&  trainTarget,
int *&  trainLabel,
REAL *&  test,
REAL *&  testTarget,
int *&  testLabel,
uint &  nTrain,
uint &  nTest,
int &  nClass,
int &  nDomain,
int &  nFeat,
REAL  positiveTarget = 1.0,
REAL  negativeTarget = -1.0 
)

Reads the MONKS3 dataset (UCI)

10260Bytes monks-3.test 2886Bytes monks-3.train

Definition at line 3947 of file DatasetReader.cpp.

03948 {  // Reads monks-3.train and monks-3.test below `path` into the train and test outputs.
03949     cout<<"Read MONKS3 from: "<<path<<endl;
03950     nDomain = 1;
03951 
03952     // define data type and files
03953     int targetColumn = 1;
03954     char columnType[] = "dnnnnnnd";  // one character per column; presumably 'd' = discrete, 'n' = numeric (confirm in getDataBounds)
03955     char enabledCol[] = "11111110";  // one flag per column; the last column is flagged '0'
03956     const string trainFile = path+"/monks-3.train", testFile = path+"/monks-3.test";  // owned locally and released on return, so no heap-allocated strings are leaked
03957     const char* dataFiles[] = { trainFile.c_str(), testFile.c_str(),0};  // 0-terminated file list; index 0 = train file, index 1 = test file
03958     // === TRAIN SET ===
03959     getDataBounds ( dataFiles, " ", nFeat, nClass, nTrain, columnType, enabledCol, targetColumn, 0 );  // sizing pass on file 0: fillData is left at its default (false)
03960     train = new REAL[nFeat*nTrain];
03961     trainLabel = new int[nTrain];
03962     getDataBounds ( dataFiles, " ", nFeat, nClass, nTrain, columnType, enabledCol, targetColumn, 0, true, train, trainLabel );  // fill pass on file 0
03963 
03964     // === TEST SET ===
03965     getDataBounds ( dataFiles, " ", nFeat, nClass, nTest, columnType, enabledCol, targetColumn, 1 );  // sizing pass on file 1
03966     test = new REAL[nFeat*nTest];
03967     testLabel = new int[nTest];
03968     getDataBounds ( dataFiles, " ", nFeat, nClass, nTest, columnType, enabledCol, targetColumn, 1, true, test, testLabel );  // fill pass on file 1
03969 
03970     // make numerical test targets
03971     makeNumericTrainAndTestTargets ( nClass, nTrain, nTest, positiveTarget, negativeTarget, trainLabel, testLabel, trainTarget, testTarget );
03972 
03973 }

void DatasetReader::readMUSHROOM ( string  path,
REAL *&  train,
REAL *&  trainTarget,
int *&  trainLabel,
REAL *&  test,
REAL *&  testTarget,
int *&  testLabel,
uint &  nTrain,
uint &  nTest,
int &  nClass,
int &  nDomain,
int &  nFeat,
REAL  positiveTarget = 1.0,
REAL  negativeTarget = -1.0 
)

Reads the MUSHROOM dataset (UCI)

373704Bytes agaricus-lepiota.data 6816Bytes agaricus-lepiota.names

Definition at line 3981 of file DatasetReader.cpp.

03982 {  // Reads agaricus-lepiota.data below `path` and splits it into a train and a test set.
03983     cout<<"Read MUSHROOM from: "<<path<<endl;
03984     nDomain = 1;
03985 
03986     // define data type and files
03987     int targetColumn = 1;
03988     uint nTrainTmp;  // handed to getDataBounds by reference as its line count (nLines)
03989     char columnType[] = "ddddddddddddddddddddddd";  // one character per column, all 'd' here; presumably 'd' = discrete (confirm in getDataBounds)
03990     char enabledCol[] = "11111111111111111111111";  // one flag per column, all set to '1' here
03991     const string dataFile = path+"/agaricus-lepiota.data";  // owned locally and released on return, so no heap-allocated string is leaked
03992     const char* dataFiles[] = { dataFile.c_str(),0};  // 0-terminated file list; the pointer stays valid while dataFile is in scope
03993     // === TRAIN SET ===
03994     getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0 );  // sizing pass: fillData is left at its default (false)
03995 
03996     // allocate tmp mem
03997     REAL* trainTmp = new REAL[nTrainTmp * nFeat];
03998     int* trainLabelTmp = new int[nTrainTmp];
03999 
04000     // fill data
04001     getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0, true, trainTmp, trainLabelTmp );  // fill pass: fillData=true with the temporary buffers
04002 
04003     // split train and testset from trainTmp
04004     splitRandomTestset ( 0.2, trainTmp, trainLabelTmp, nTrainTmp, nFeat, nClass, train, trainLabel, trainTarget, test, testLabel, testTarget, nTrain, nTest, positiveTarget, negativeTarget );  // percentTest = 0.2; noRandom is left at its default (false)
04005 
04006     delete[] trainTmp;  // temporary buffers are released after the split
04007     delete[] trainLabelTmp;
04008 
04009 }

void DatasetReader::readNETFLIX ( string  path,
REAL *&  train,
REAL *&  trainTarget,
int *&  trainLabel,
REAL *&  test,
REAL *&  testTarget,
int *&  testLabel,
uint &  nTrain,
uint &  nTest,
int &  nClass,
int &  nDomain,
int &  nFeat,
REAL  positiveTarget = 1.0,
REAL  negativeTarget = -1.0 
)

Reads the binary NETFLIX prediction files in the DataFiles folder, e.g. prediction.dat: 16902104 bytes, a flat array of 4-byte floats ordered [probe, qualifying] (1408395 + 2817131 values).
probeRatings.txt: text file with the 1408395 probe ratings (1..5), e.g. 4 ...

Definition at line 2775 of file DatasetReader.cpp.

02776 {   // Builds a blending dataset from NETFLIX predictor outputs; the startup parameter selects one of three modes (see branches).
02777     cout<<"Read NETFLIX binary predictions from: "<<NETFLIX_DATA_DIR<<endl;
02778 
02779     if ( Framework::getAdditionalStartupParameter() < 0 )
02780     {
02781         // probeset subsampling (parameter < -100): keep each probe row with probability -parameter/nTrain
02782         if ( Framework::getAdditionalStartupParameter() < -100 )
02783         {
02784             srand ( Framework::getRandomSeed() );
02785 
02786             cout<<"Probeset subsampling"<<endl;
02787 
02788             // population
02789             nClass = 1;   // -> one regression target
02790             nDomain = 1;
02791             nTrain = 1408395;  // #probe
02792             nTest = 2817131;   // #qual
02793 
02794             bool* maskProbe = new bool[nTrain];
02795             for ( int i=0;i<nTrain;i++ )
02796                 maskProbe[i] = false;
02797             double p = - ( double ) ( Framework::getAdditionalStartupParameter() ) / ( double ) nTrain;
02798             int c = 0;  // number of selected probe rows
02799             for ( int i=0;i<nTrain;i++ )
02800                 if ( ( double ) rand() / ( double ) RAND_MAX < p )
02801                 {
02802                     maskProbe[i] = true;
02803                     c++;
02804                 }
02805             cout<<"Selected: "<<c<<" probe samples"<<endl;
02806 
02807 
02808             // get all the data files
02809             vector<string> files = Data::getDirectoryFileList ( NETFLIX_DATA_DIR );
02810             vector<string> predictionFiles;
02811 
02812             // read the *.dat files (prediction of probe+qual files)
02813             nFeat = 0;
02814             for ( int i=0;i<files.size();i++ )
02815             {
02816                 // names shorter than ".dat" cannot match; unguarded, substr would throw std::out_of_range
02817                 string fileEnding = files[i].length() >= 4 ? files[i].substr ( files[i].length()-4,4 ) : string();
02818                 if ( fileEnding == ".dat" )
02819                 {
02820                     predictionFiles.push_back ( files[i] );
02821                     nFeat++;
02822                 }
02823             }
02824 
02825             cout<<"nFeat: "<<nFeat<<endl;
02826             cout<<"nClass: "<<nClass<<endl;
02827 
02828             // probe targets
02829             //cout<<"Targets Read:"<<path+"/"+string("probeRatings.txt")<<endl;
02830             //fstream fProbeRatings((path+"/"+string("probeRatings.txt")).c_str(), ios::in);
02831             cout<<"Targets Read:"<<path+"/"+string ( "probeRatings.txt.rand" ) <<endl;
02832             fstream fProbeRatings ( ( path+"/"+string ( "probeRatings.txt.rand" ) ).c_str(), ios::in );
02833             float* ratingCache = new float[nTrain];
02834             for ( int i=0;i<nTrain;i++ )
02835                 fProbeRatings>>ratingCache[i];
02836             fProbeRatings.close();
02837 
02838 
02839             test = 0;
02840             testLabel = 0;
02841             testTarget = 0;
02842             if ( Framework::getFrameworkMode() == 1 )
02843             {
02844                 test = new REAL[ ( nTrain+nTest ) * nFeat];
02845                 testTarget = new REAL[nTrain+nTest];
02846                 for ( int i=0;i<nTrain+nTest;i++ )
02847                     testTarget[i] = 0.0;
02848             }
02849             train = new REAL[c * nFeat];
02850             trainLabel = 0;
02851             trainTarget = new REAL[c];
02852             int d = 0;  // write position among the selected rows
02853             for ( int j=0;j<nTrain;j++ )
02854             {
02855                 if ( maskProbe[j] )
02856                 {
02857                     trainTarget[d] = ratingCache[j];
02858                     d++;
02859                 }
02860             }
02861 
02862             // predictions: each file supplies one feature column (column index i)
02863             float* trainTmp = new float[nTrain+nTest];
02864             for ( int i=0;i<predictionFiles.size();i++ )
02865             {
02866                 fstream f ( predictionFiles[i].c_str(), ios::in );
02867                 f.read ( ( char* ) trainTmp, sizeof ( float ) * ( nTrain+nTest ) );
02868                 if ( Framework::getFrameworkMode() == 1 )
02869                     for ( int j=0;j<nTrain+nTest;j++ )
02870                         test[j*nFeat + i] = trainTmp[j];
02871                 d = 0;
02872                 for ( int j=0;j<nTrain;j++ )
02873                 {
02874                     if ( maskProbe[j] )
02875                     {
02876                         train[d*nFeat + i] = trainTmp[j];
02877                         d++;
02878                     }
02879                 }
02880                 f.close();
02881                 cout<<"Prediction file: "<<predictionFiles[i]<<endl;
02882             }
02883 
02884             delete[] trainTmp;
02885             delete[] ratingCache;
02886             delete[] maskProbe;
02887 
02888             nTest = nTrain + nTest;  // test set = all probe + qual rows
02889             nTrain = c;
02890             cout<<"nTrain:"<<nTrain<<endl<<"nTest:"<<nTest<<endl<<endl;
02891             return;
02892         }
02893 
02894 
02895         // population
02896         nClass = 1;   // -> one regression target
02897         nDomain = 1;
02898         //nTrain = 1408395;  // #probe
02899         //nTest = 2817131;   // #qual
02900 
02901         // HACK: divide probe into 2 halves
02902         nTrain = 704197;  // #probe
02903         nTest = 704198;   // #qual
02904 
02905         // get all the data files
02906         cout<<"read path from:"<<path+"/path.txt"<<endl;
02907         fstream fP ( ( path+"/path.txt" ).c_str(),ios::in );
02908         string predictorPath;
02909         fP>>predictorPath;
02910         cout<<"path:"<<predictorPath<<endl;
02911         fP.close();
02912         
02913         //vector<string> files = Data::getDirectoryFileList(NETFLIX_DATA_DIR);
02914         vector<string> files = Data::getDirectoryFileList ( predictorPath );
02915         sort(files.begin(), files.end());
02916         vector<string> predictionFiles;
02917 
02918         // read the *.dat files (prediction of probe+qual files)
02919         nFeat = 0;
02920         for ( int i=0;i<files.size();i++ )
02921         {
02922             // names shorter than ".dat" cannot match; unguarded, substr would throw std::out_of_range
02923             string fileEnding = files[i].length() >= 4 ? files[i].substr ( files[i].length()-4,4 ) : string();
02924             if ( fileEnding == ".dat" )
02925             {
02926                 predictionFiles.push_back ( files[i] );
02927                 nFeat++;
02928             }
02929         }
02930 
02931         // =============== write the qual parts ================
02932         int nProbe = 1408395;
02933         int nQual = 2817131;
02934         float* tmp = new float[nProbe+nQual];  // float, not REAL: filled by raw sizeof(float) reads below
02935         float* tmp2 = new float[predictionFiles.size()*nQual];
02936         int* tmp3 = new int[nQual];
02937         fstream ff((predictorPath+"/grand_prize/judging.txt").c_str(),ios::in);
02938         char buf[1024];
02939         int cnt = 0;
02940         while(ff.getline(buf,1024))  // read judging.txt
02941         {
02942             string line(buf);
02943             if(line.length() > 0)
02944             {
02945                 if(line.length() < 2 || line[line.length()-2] != ':')  // skip lines with ':' second-to-last (presumably "id:" headers - confirm)
02946                 {
02947                     int nr = atoi(line.c_str());
02948                     if(cnt < nQual) tmp3[cnt] = nr;  // never write past tmp3; a surplus is reported by the assert below
02949                     cnt++;
02950                 }
02951             }
02952         }
02953         assert(cnt==nQual);
02954         ff.close();
02955         for ( int i=0;i<predictionFiles.size();i++ )
02956         {
02957             fstream f ( predictionFiles[i].c_str(), ios::in );
02958             f.read ( ( char* ) tmp, sizeof ( float ) *(nProbe+nQual) );
02959             for(int j=0;j<nQual;j++)
02960                 tmp2[j*predictionFiles.size()+i] = tmp[nProbe+j];  // qual values follow the nProbe probe values
02961             f.close();
02962         }
02963         fstream trainCSV((path+"/testQual.csv").c_str(), ios::out);  // write CSV file
02964         for(int i=0;i<nQual;i++)
02965         {
02966             for(int j=0;j<predictionFiles.size();j++)
02967                 trainCSV<<tmp2[i*nFeat+j]<<",";  // nFeat equals predictionFiles.size() here
02968             trainCSV<<tmp3[i]<<endl;
02969         }
02970         trainCSV.close();
02971         exit(0);  // NOTE(review): unconditional exit - the rest of this branch is unreachable while this export block is active
02972         // =============== write the qual parts ================
02973         
02974         
02975         
02976         cout<<"nFeat: "<<nFeat<<endl;
02977         cout<<"nClass: "<<nClass<<endl;
02978 
02979         bool doClipping = true;
02980         if ( Framework::getAdditionalStartupParameter() == -2 )
02981             doClipping = false;
02982 
02983         // allocate complete dataset
02984         if ( Framework::getFrameworkMode() == 0 )
02985         {
02986             train = new REAL[nTrain * nFeat];
02987             trainLabel = 0; //new int[nTrain];
02988             trainTarget = new REAL[nTrain * nClass];
02989 
02990             // probe targets
02991             //cout<<"Targets Read:"<<path+"/"+string("probeRatings.txt")<<endl;
02992             //fstream fProbeRatings((path+"/"+string("probeRatings.txt")).c_str(), ios::in);
02993             cout<<"Targets Read:"<<path+"/"+string ( "probeRatings.txt.rand" ) <<endl;
02994             fstream fProbeRatings ( ( path+"/"+string ( "probeRatings.txt.rand" ) ).c_str(), ios::in );
02995             for ( int i=0;i<nTrain;i++ )
02996                 fProbeRatings>>trainTarget[i];
02997             fProbeRatings.close();
02998 
02999             float* trainTmp = new float[nTrain];
03000 
03001             // predictions
03002             for ( int i=0;i<predictionFiles.size();i++ )
03003             {
03004                 fstream f ( predictionFiles[i].c_str(), ios::in );
03005                 f.read ( ( char* ) trainTmp, sizeof ( float ) *nTrain );
03006                 double mean = 0.0;
03007                 for ( int j=0;j<nTrain;j++ )
03008                     mean += trainTmp[j];
03009                 mean /= ( double ) nTrain;
03010                 if ( mean > 1.0 && mean < 5.0 && doClipping )
03011                     cout<<"[clip] ";
03012                 for ( int j=0;j<nTrain;j++ )
03013                 {
03014                     train[j*nFeat + i] = trainTmp[j];
03015                     if ( mean > 1.0 && mean < 5.0 && doClipping )  // clip to [1,5] only when the column mean lies inside (1,5)
03016                     {
03017                         if ( train[j*nFeat + i] > 5.0 )
03018                             train[j*nFeat + i] = 5.0;
03019                         if ( train[j*nFeat + i] < 1.0 )
03020                             train[j*nFeat + i] = 1.0;
03021                     }
03022                 }
03023                 f.close();
03024                 cout<<"Prediction file: "<<predictionFiles[i]<<" mean:"<<mean<<endl;
03025             }
03026 
03027             if ( trainTmp )
03028             {
03029                 delete[] trainTmp;
03030                 trainTmp = 0;
03031             }
03032 
03033             test = 0;
03034             testLabel = 0;
03035             testTarget = 0;
03036             nTest = 0;
03037             
03038             // write CSV file
03039             /*fstream trainCSV((path+"/train.csv").c_str(), ios::out);
03040             for(int i=0;i<nTrain;i++)
03041             {
03042                 for(int j=0;j<nFeat;j++)
03043                     trainCSV<<train[i*nFeat+j]<<",";
03044                 trainCSV<<trainTarget[i]<<endl;
03045             }
03046             trainCSV.close();*/
03047         }
03048         
03049         if ( Framework::getFrameworkMode() == 1 )
03050         {
03051             cout<<"alloc: "<<nTest * ( uint ) nFeat<<endl;
03052             test = new REAL[nTest * ( uint ) nFeat];
03053             testLabel = 0; //new int[nTest];
03054             testTarget = new REAL[nTest * ( uint ) nClass];
03055 
03056             // dummy targets
03057             for ( int i=0;i<nTest;i++ )
03058                 testTarget[i] = 3.7;  // just a init value (not known in netflix prize)
03059 
03060             // HACK: read 2nd half of probe, this act as a test set
03061             cout<<"Targets Read:"<<path+"/"+string ( "probeRatings.txt.rand" ) <<endl;
03062             fstream fProbeRatings ( ( path+"/"+string ( "probeRatings.txt.rand" ) ).c_str(), ios::in );
03063             REAL dummy;
03064             for ( int i=0;i<nTrain;i++ )  // skip the first nTrain ratings
03065                 fProbeRatings>>dummy;
03066             for ( int i=0;i<nTest;i++ )
03067                 fProbeRatings>>testTarget[i];
03068             fProbeRatings.close();
03069 
03070             float* testTmp = new float[nTest];
03071 
03072             // predictions
03073             for ( uint i=0;i<predictionFiles.size();i++ )
03074             {
03075                 fstream f ( predictionFiles[i].c_str(), ios::in );
03076                 f.read ( ( char* ) testTmp, sizeof ( float ) *nTrain );  // probe read (dummy)
03077                 f.read ( ( char* ) testTmp, sizeof ( float ) *nTest );
03078                 double mean = 0.0;
03079                 for ( int j=0;j<nTest;j++ )
03080                     mean += testTmp[j];
03081                 mean /= ( double ) nTest;
03082                 if ( mean > 1.0 && mean < 5.0 && doClipping )
03083                     cout<<"[clip] ";
03084                 for ( uint j=0;j<nTest;j++ )
03085                 {
03086                     test[j* ( uint ) nFeat + i] = testTmp[j];
03087                     if ( mean > 1.0 && mean < 5.0 && doClipping )
03088                     {
03089                         if ( test[j* ( uint ) nFeat + i] > 5.0 )
03090                             test[j* ( uint ) nFeat + i] = 5.0;
03091                         if ( test[j* ( uint ) nFeat + i] < 1.0 )
03092                             test[j* ( uint ) nFeat + i] = 1.0;
03093                     }
03094                 }
03095                 f.close();
03096                 cout<<"Prediction file: "<<predictionFiles[i]<<" mean:"<<mean<<endl;
03097             }
03098 
03099             if ( testTmp )
03100             {
03101                 delete[] testTmp;
03102                 testTmp = 0;
03103             }
03104 
03105             train = 0;
03106             trainLabel = 0;
03107             trainTarget = 0;
03108             nTrain = 0;
03109 
03110             // write CSV file
03111             /*fstream testCSV((path+"/test.csv").c_str(), ios::out);
03112             for(int i=0;i<nTest;i++)
03113             {
03114                 for(int j=0;j<nFeat;j++)
03115                     testCSV<<test[i*nFeat+j]<<",";
03116                 testCSV<<testTarget[i]<<endl;
03117             }
03118             testCSV.close();*/
03119         }
03120     }
03121     else // slot blend: data comes from NETFLIX_SLOTDATA_ROOT_DIR/p<startup parameter>/
03122     {
03123         // population
03124         nClass = 1;   // -> one regression target
03125         nDomain = 1;
03126         char buf0[512];
03127         char buf1[512];
03128         char buf2[512];
03129         char buf3[512];
03130         char buf4[512];
03131         snprintf ( buf0,sizeof ( buf0 ),"%s/%s%d/",string ( NETFLIX_SLOTDATA_ROOT_DIR ).c_str(),"p",Framework::getAdditionalStartupParameter() );  // bounded: a long root dir truncates instead of overflowing
03132         snprintf ( buf1,sizeof ( buf1 ),"%s/%s%d/nProbe.data",string ( NETFLIX_SLOTDATA_ROOT_DIR ).c_str(),"p",Framework::getAdditionalStartupParameter() );
03133         snprintf ( buf2,sizeof ( buf2 ),"%s/%s%d/nQual.data",string ( NETFLIX_SLOTDATA_ROOT_DIR ).c_str(),"p",Framework::getAdditionalStartupParameter() );
03134         snprintf ( buf3,sizeof ( buf3 ),"%s/%s%d/ratings.data",string ( NETFLIX_SLOTDATA_ROOT_DIR ).c_str(),"p",Framework::getAdditionalStartupParameter() );
03135         snprintf ( buf4,sizeof ( buf4 ),"%s/%s%d/ratingsTest.data",string ( NETFLIX_SLOTDATA_ROOT_DIR ).c_str(),"p",Framework::getAdditionalStartupParameter() );
03136 
03137         fstream f;
03138 
03139         // nTrain
03140         f.open ( buf1,ios::in );
03141         f.read ( ( char* ) &nTrain,sizeof ( int ) );
03142         f.close();
03143 
03144         // nTest
03145         f.open ( buf2,ios::in );
03146         f.read ( ( char* ) &nTest,sizeof ( int ) );
03147         f.close();
03148 
03149         // targets (tmp is sized nTrain+nTest because it is reused for the prediction files below)
03150         float* tmp = new float[nTrain+nTest];
03151         f.open ( buf3,ios::in );
03152         f.read ( ( char* ) tmp,sizeof ( float ) *nTrain );
03153         f.close();
03154         trainTarget = new REAL[nTrain];
03155         for ( int i=0;i<nTrain;i++ )
03156             trainTarget[i] = tmp[i];
03157         testTarget = new REAL[nTest];
03158         //for(int i=0;i<nTest;i++)
03159         //    testTarget[i] = 3.7;
03160         f.open ( buf4,ios::in );
03161         f.read ( ( char* ) tmp,sizeof ( float ) *nTest );
03162         f.close();
03163         for ( int i=0;i<nTest;i++ )
03164             testTarget[i] = tmp[i];
03165 
03166         // get all the data files
03167         vector<string> files = Data::getDirectoryFileList ( buf0 );
03168         vector<string> predictionFiles;
03169 
03170         // read the *.dat files (prediction of probe+qual files)
03171         nFeat = 0;
03172         for ( int i=0;i<files.size();i++ )
03173         {
03174             string fileEnding = files[i].length() >= 4 ? files[i].substr ( files[i].length()-4,4 ) : string();  // guard: substr throws on names shorter than 4
03175             if ( fileEnding == ".dat" )
03176             {
03177                 predictionFiles.push_back ( files[i] );
03178                 nFeat++;
03179             }
03180         }
03181 
03182         cout<<"nFeat: "<<nFeat<<endl;
03183         cout<<"nClass: "<<nClass<<endl;
03184         cout<<"nTrain: "<<nTrain<<endl;
03185         cout<<"nTest: "<<nTest<<endl;
03186 
03187         // input features
03188         if ( Framework::getFrameworkMode() == 0 )
03189         {
03190             cout<<"allocate trainset: "<<nTrain * nFeat<<" elements"<<endl;
03191             train = new REAL[nTrain * nFeat];
03192             trainLabel = 0;
03193         }
03194         else
03195         {
03196             cout<<"allocate testset : "<< ( uint ) nTest * nFeat<<" elements"<<endl;
03197             test = new REAL[nTest * nFeat];
03198             testLabel = 0;
03199         }
03200 
03201         // predictions
03202         for ( int i=0;i<predictionFiles.size();i++ )
03203         {
03204             cout<<i<<"/"<< ( int ) predictionFiles.size() <<" ";
03205             f.open ( predictionFiles[i].c_str(), ios::in );
03206             f.read ( ( char* ) tmp, sizeof ( float ) * ( nTrain+nTest ) );
03207             f.close();
03208             double mean = 0.0;
03209             for ( int j=0;j<nTrain+nTest;j++ )
03210                 mean += tmp[j];
03211             mean /= ( double ) ( nTrain+nTest );
03212             if ( mean > 1.0 && mean < 5.0 )
03213                 cout<<"[clip] ";
03214             cout<<"mu:"<<mean<<" ";
03215             if ( Framework::getFrameworkMode() == 0 )
03216             {
03217                 // train
03218                 for ( int j=0;j<nTrain;j++ )
03219                 {
03220                     train[j*nFeat + i] = tmp[j];
03221                     if ( mean > 1.0 && mean < 5.0 )
03222                     {
03223                         if ( train[j*nFeat + i] > 5.0 )
03224                             train[j*nFeat + i] = 5.0;
03225                         if ( train[j*nFeat + i] < 1.0 )
03226                             train[j*nFeat + i] = 1.0;
03227                     }
03228                 }
03229             }
03230             else
03231             {
03232                 // test
03233                 for ( int j=0;j<nTest;j++ )
03234                 {
03235                     test[j*nFeat + i] = tmp[j+nTrain];
03236                     if ( mean > 1.0 && mean < 5.0 )
03237                     {
03238                         if ( test[j*nFeat + i] > 5.0 )
03239                             test[j*nFeat + i] = 5.0;
03240                         if ( test[j*nFeat + i] < 1.0 )
03241                             test[j*nFeat + i] = 1.0;
03242                     }
03243                 }
03244             }
03245             cout<<"Prediction file: "<<predictionFiles[i]<<endl;
03246         }
03247 
03248         if ( Framework::getFrameworkMode() == 0 )
03249             nTest = 0;
03250         else
03251             nTrain = 0;
03252 
03253         delete[] tmp;
03254 
03255     }
03256     
03257 }

void DatasetReader::readPOKER ( string  path,
REAL *&  train,
REAL *&  trainTarget,
int *&  trainLabel,
REAL *&  test,
REAL *&  testTarget,
int *&  testLabel,
uint &  nTrain,
uint &  nTest,
int &  nClass,
int &  nDomain,
int &  nFeat,
REAL  positiveTarget = 1.0,
REAL  negativeTarget = -1.0 
)

Reads the POKER dataset (UCI)

24538333Bytes poker-hand-testing.data 613694Bytes poker-hand-training-true.data 5946Bytes poker-hand.names

Definition at line 4235 of file DatasetReader.cpp.

04236 {   // Loads the POKER training and testing files into train/test and builds numeric targets from the labels.
04237     cout<<"Read POKER from: "<<path<<endl;
04238     nDomain = 1;
04239 
04240     // define data type and files
04241     int targetColumn = 11;
04242     char columnType[] = "ddddddddddd";
04243     char enabledCol[] = "11111111111";
04244     const string fileNames[] = { path+"/poker-hand-training-true.data", path+"/poker-hand-testing.data" };  // owned by this frame, so the names are no longer leaked
04245     const char* dataFiles[] = { fileNames[0].c_str(), fileNames[1].c_str(),0};  // filenameID 0 = training file, 1 = testing file
04246     // === TRAIN SET === (first call sizes nFeat/nClass/nTrain, second call fills the buffers)
04247     getDataBounds ( dataFiles, ",", nFeat, nClass, nTrain, columnType, enabledCol, targetColumn, 0 );
04248     train = new REAL[nFeat*nTrain];
04249     trainLabel = new int[nTrain];
04250     getDataBounds ( dataFiles, ",", nFeat, nClass, nTrain, columnType, enabledCol, targetColumn, 0, true, train, trainLabel );
04251 
04252     // === TEST SET ===
04253     getDataBounds ( dataFiles, ",", nFeat, nClass, nTest, columnType, enabledCol, targetColumn, 1 );
04254     test = new REAL[nFeat*nTest];
04255     testLabel = new int[nTest];
04256     getDataBounds ( dataFiles, ",", nFeat, nClass, nTest, columnType, enabledCol, targetColumn, 1, true, test, testLabel );
04257 
04258     // make numerical test targets
04259     makeNumericTrainAndTestTargets ( nClass, nTrain, nTest, positiveTarget, negativeTarget, trainLabel, testLabel, trainTarget, testTarget );
04260 
04261 }

void DatasetReader::readPRUDSYS ( string  path,
REAL *&  train,
REAL *&  trainTarget,
int *&  trainLabel,
REAL *&  test,
REAL *&  testTarget,
int *&  testLabel,
uint &  nTrain,
uint &  nTest,
int &  nClass,
int &  nDomain,
int &  nFeat,
REAL  positiveTarget = 1.0,
REAL  negativeTarget = -1.0 
)

PRUDSYS_DMC 2009: Data Mining Cup of Prudsys AG. Train set: dmc2009_train.txt (9239726 bytes). Test set: dmc2009_forecast.txt (9308436 bytes).

Definition at line 2279 of file DatasetReader.cpp.

02280 {   // Reads the DMC 2009 forecast file (framework mode 1) or train file (otherwise) into the test or train outputs.
02281     REAL* feat, *target;
02282     int* label, N;
02283 
02284     fstream f;
02285     if ( Framework::getFrameworkMode() == 1 )
02286     {
02287         f.open ( ( path+"/dmc2009_forecast.txt" ).c_str(), ios::in );
02288         nFeat = 1857+1;
02289         N = 2418;
02290         nClass = 1;
02291         nDomain = 8;
02292     }
02293     else
02294     {
02295         f.open ( ( path+"/dmc2009_train.txt" ).c_str(), ios::in );
02296         nFeat = 1857+1;
02297         N = 2394;
02298         nClass = 1;
02299         nDomain = 8;
02300     }
02301 
02302     feat = new REAL[N*nFeat];
02303     target = new REAL[N*nClass*nDomain]();  // zero-filled, so targets are defined even when a line carries no target columns
02304     label = 0;
02305 
02306     // features and labels
02307     char *buf = new char[100000];
02308     f.getline ( buf,100000 );  // the first line is read and discarded (presumably a header - confirm)
02309     // positiveTarget / negativeTarget are not used by this reader: targets are stored as the raw values read
02310 
02311     for ( int i=0;i<N;i++ )
02312     {
02313         f.getline ( buf,100000 );
02314         stringstream ss ( buf );
02315         REAL r;
02316         int cnt = 0;
02317         feat[nFeat*i + cnt] = 1.0;  // column 0 of every row is a constant 1.0; file values start at column 1
02318         cnt++;
02319         while ( ss>>r )
02320         {
02321             if ( cnt < nFeat )
02322                 feat[nFeat*i + cnt] = r;
02323             else if ( cnt < nFeat+nClass*nDomain && Framework::getFrameworkMode() == 0 )  // bound keeps surplus values from writing past target
02324                 target[nDomain*nClass*i + cnt - nFeat] = r;
02325             else if ( cnt < nFeat+nClass*nDomain && Framework::getFrameworkMode() == 1 )
02326                 target[nDomain*nClass*i + cnt - nFeat] = 0.0;
02327             cnt++;
02328         }
02329         if ( cnt != nFeat+nClass*nDomain && Framework::getFrameworkMode() == 0 )  // in mode 0 every row must have exactly the expected value count
02330             assert ( false );
02331     }
02332     f.close();
02333     delete[] buf;
02334 
02335     if ( Framework::getFrameworkMode() == 1 )
02336     {
02337         nTest = N;
02338         test = feat;
02339         testTarget = target;
02340         testLabel = label;
02341         train = 0;
02342         trainTarget = 0;
02343         trainLabel = 0;
02344         nTrain = 0;
02345     }
02346     else
02347     {
02348         nTrain = N;
02349         train = feat;
02350         trainTarget = target;
02351         trainLabel = label;
02352         test = 0;
02353         testTarget = 0;
02354         testLabel = 0;
02355         nTest = 0;
02356     }
02357 }

void DatasetReader::readSATIMAGE ( string  path,
REAL *&  train,
REAL *&  trainTarget,
int *&  trainLabel,
REAL *&  test,
REAL *&  testTarget,
int *&  testLabel,
uint &  nTrain,
uint &  nTest,
int &  nClass,
int &  nDomain,
int &  nFeat,
REAL  positiveTarget = 1.0,
REAL  negativeTarget = -1.0 
)

Reads the SATIMAGE dataset (UCI)

5254Bytes sat.doc 525830Bytes sat.trn 236745Bytes sat.tst

Definition at line 4018 of file DatasetReader.cpp.

04019 {   // Loads sat.trn / sat.tst into train/test and builds numeric targets from the labels.
04020     cout<<"Read SATIMAGE from: "<<path<<endl;
04021     nDomain = 1;
04022 
04023     // define data type and files
04024     int targetColumn = 37;
04025     char columnType[] = "nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnd";
04026     char enabledCol[] = "1111111111111111111111111111111111111";
04027     const string fileNames[] = { path+"/sat.trn", path+"/sat.tst" };  // owned by this frame, so the names are no longer leaked
04028     const char* dataFiles[] = { fileNames[0].c_str(), fileNames[1].c_str(),0};  // filenameID 0 = sat.trn, 1 = sat.tst
04029 
04030     // === TRAIN SET === (first call sizes nFeat/nClass/nTrain, second call fills the buffers)
04031     getDataBounds ( dataFiles, " ", nFeat, nClass, nTrain, columnType, enabledCol, targetColumn, 0 );
04032     train = new REAL[nFeat*nTrain];
04033     trainLabel = new int[nTrain];
04034     getDataBounds ( dataFiles, " ", nFeat, nClass, nTrain, columnType, enabledCol, targetColumn, 0, true, train, trainLabel );
04035 
04036     // === TEST SET ===
04037     getDataBounds ( dataFiles, " ", nFeat, nClass, nTest, columnType, enabledCol, targetColumn, 1 );
04038     test = new REAL[nFeat*nTest];
04039     testLabel = new int[nTest];
04040     getDataBounds ( dataFiles, " ", nFeat, nClass, nTest, columnType, enabledCol, targetColumn, 1, true, test, testLabel );
04041 
04042     // make numerical test targets
04043     makeNumericTrainAndTestTargets ( nClass, nTrain, nTest, positiveTarget, negativeTarget, trainLabel, testLabel, trainTarget, testTarget );
04044 
04045 }

void DatasetReader::readSEGMENTATION ( string  path,
REAL *&  train,
REAL *&  trainTarget,
int *&  trainLabel,
REAL *&  test,
REAL *&  testTarget,
int *&  testLabel,
uint &  nTrain,
uint &  nTest,
int &  nClass,
int &  nDomain,
int &  nFeat,
REAL  positiveTarget = 1.0,
REAL  negativeTarget = -1.0 
)

Reads the SEGMENTATION dataset (UCI)

34481Bytes segmentation.data 2458Bytes segmentation.names 344723Bytes segmentation.test

Definition at line 4054 of file DatasetReader.cpp.

04055 {   // Loads segmentation.data / segmentation.test into train/test and builds numeric targets from the labels.
04056     cout<<"Read SEGMENTATION from: "<<path<<endl;
04057     nDomain = 1;
04058 
04059     // define data type and files
04060     int targetColumn = 1;
04061     char columnType[] = "dnnnnnnnnnnnnnnnnnnn";
04062     char enabledCol[] = "11111111111111111111";
04063     const string fileNames[] = { path+"/segmentation.data", path+"/segmentation.test" };  // owned by this frame, so the names are no longer leaked
04064     const char* dataFiles[] = { fileNames[0].c_str(), fileNames[1].c_str(),0};  // filenameID 0 = .data, 1 = .test
04065 
04066     // === TRAIN SET === (first call sizes nFeat/nClass/nTrain, second call fills the buffers)
04067     getDataBounds ( dataFiles, ",", nFeat, nClass, nTrain, columnType, enabledCol, targetColumn, 0 );
04068     train = new REAL[nFeat*nTrain];
04069     trainLabel = new int[nTrain];
04070     getDataBounds ( dataFiles, ",", nFeat, nClass, nTrain, columnType, enabledCol, targetColumn, 0, true, train, trainLabel );
04071 
04072     // === TEST SET ===
04073     getDataBounds ( dataFiles, ",", nFeat, nClass, nTest, columnType, enabledCol, targetColumn, 1 );
04074     test = new REAL[nFeat*nTest];
04075     testLabel = new int[nTest];
04076     getDataBounds ( dataFiles, ",", nFeat, nClass, nTest, columnType, enabledCol, targetColumn, 1, true, test, testLabel );
04077 
04078     // make numerical test targets
04079     makeNumericTrainAndTestTargets ( nClass, nTrain, nTest, positiveTarget, negativeTarget, trainLabel, testLabel, trainTarget, testTarget );
04080 
04081 }

void DatasetReader::readSONAR ( string  path,
REAL *&  train,
REAL *&  trainTarget,
int *&  trainLabel,
REAL *&  test,
REAL *&  testTarget,
int *&  testLabel,
uint &  nTrain,
uint &  nTest,
int &  nClass,
int &  nDomain,
int &  nFeat,
REAL  positiveTarget = 1.0,
REAL  negativeTarget = -1.0 
)

Reads the SONAR dataset (UCI)

87776Bytes sonar.all-data 5872Bytes sonar.names

Definition at line 4089 of file DatasetReader.cpp.

04090 {   // Loads the single SONAR data file, then lets splitRandomTestset carve a test split out of it.
04091     cout<<"Read SONAR from: "<<path<<endl;
04092     nDomain = 1;
04093 
04094     // define data type and files
04095     int targetColumn = 61;
04096     uint nTrainTmp;
04097     char columnType[] = "nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnd";
04098     char enabledCol[] = "1111111111111111111111111111111111111111111111111111111111111";
04099     const string dataFile = path+"/sonar.all-data";  // owned by this frame, so the file name is no longer leaked
04100     const char* dataFiles[] = { dataFile.c_str(),0};  // 0-terminated list; the pointer stays valid until this function returns
04101     // === TRAIN SET === (sizing pass: no buffers are passed; nFeat, nClass and nTrainTmp come back by reference)
04102     getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0 );
04103 
04104     // allocate tmp mem
04105     REAL* trainTmp = new REAL[nTrainTmp * nFeat];
04106     int* trainLabelTmp = new int[nTrainTmp];
04107 
04108     // fill data
04109     getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0, true, trainTmp, trainLabelTmp );
04110 
04111     // split train and testset from trainTmp (0.2 is passed as percentTest)
04112     splitRandomTestset ( 0.2, trainTmp, trainLabelTmp, nTrainTmp, nFeat, nClass, train, trainLabel, trainTarget, test, testLabel, testTarget, nTrain, nTest, positiveTarget, negativeTarget );
04113 
04114     delete[] trainTmp;
04115     delete[] trainLabelTmp;
04116 
04117 }

void DatasetReader::readSPIDER ( string  path,
REAL *&  train,
REAL *&  trainTarget,
int *&  trainLabel,
REAL *&  test,
REAL *&  testTarget,
int *&  testLabel,
uint &  nTrain,
uint &  nTest,
int &  nClass,
int &  nDomain,
int &  nFeat,
REAL  positiveTarget = 1.0,
REAL  negativeTarget = -1.0 
)

Reads the artificial dataset generated by the Spider MATLAB framework

10CATBytes monks-1.test 2947Bytes monks-1.train

Definition at line 4341 of file DatasetReader.cpp.

04342 {   // Reads <path>/train.data and <path>/test.data: two values and a label per line, plus a constant 1.0 as third feature
04343     cout<<"Read SPIDER from: "<<path<<endl;
04344     nDomain = 1;
04345     nFeat = 3;   // two parsed values + the constant 1.0 written below
04346     nClass = 2;
04347 
04348     // the faster version for read-in
04349     int bufLen = 1024 * 1024;  // line buffer size in bytes
04350     char *buf = new char[bufLen];
04351 
04352     // trainset (first pass: count the lines to size the arrays)
04353     nTrain = 0;
04354     fstream f ( ( path+"/train.data" ).c_str(), ios::in );
04355     while ( f.getline ( buf,bufLen ) )
04356         nTrain++;
04357     f.close();
04358     train = new REAL[3*nTrain];
04359     trainTarget = new REAL[2*nTrain];
04360     trainLabel = new int[nTrain];
04361 
04362     f.open ( ( path+"/train.data" ).c_str(), ios::in );
04363     nTrain = 0;  // reused as the write index of the second pass
04364     while ( f.getline ( buf,bufLen ) )
04365     {
04366         sscanf ( buf,"%f %f %d",&train[3*nTrain],&train[3*nTrain+1],&trainLabel[nTrain] );  // NOTE(review): "%f" stores a float, which is only correct if REAL is float - confirm
04367         train[3*nTrain+2] = 1.0;
04368         if ( trainLabel[nTrain] > 0 )  // label > 0 becomes class 0, everything else class 1
04369         {
04370             trainTarget[2*nTrain] = positiveTarget;
04371             trainTarget[2*nTrain+1] = negativeTarget;
04372             trainLabel[nTrain] = 0;
04373         }
04374         else
04375         {
04376             trainTarget[2*nTrain] = negativeTarget;
04377             trainTarget[2*nTrain+1] = positiveTarget;
04378             trainLabel[nTrain] = 1;
04379         }
04380         nTrain++;
04381     }
04382     f.close();
04383 
04384     // testset (same two-pass scheme as the trainset)
04385     nTest = 0;
04386     f.open ( ( path+"/test.data" ).c_str(), ios::in );
04387     while ( f.getline ( buf,bufLen ) )
04388         nTest++;
04389     f.close();
04390     test = new REAL[3*nTest];
04391     testTarget = new REAL[2*nTest];
04392     testLabel = new int[nTest];
04393 
04394     f.open ( ( path+"/test.data" ).c_str(), ios::in );
04395     nTest = 0;  // reused as the write index of the second pass
04396     while ( f.getline ( buf,bufLen ) )
04397     {
04398         sscanf ( buf,"%f %f %d",&test[3*nTest],&test[3*nTest+1],&testLabel[nTest] );  // NOTE(review): "%f" stores a float, which is only correct if REAL is float - confirm
04399         test[3*nTest+2] = 1.0;
04400         if ( testLabel[nTest] > 0 )  // must index the row just parsed (nTest); indexing with nTrain reads an unrelated or out-of-range element
04401         {
04402             testTarget[2*nTest] = positiveTarget;
04403             testTarget[2*nTest+1] = negativeTarget;
04404             testLabel[nTest] = 0;
04405         }
04406         else
04407         {
04408             testTarget[2*nTest] = negativeTarget;
04409             testTarget[2*nTest+1] = positiveTarget;
04410             testLabel[nTest] = 1;
04411         }
04412         nTest++;
04413     }
04414     f.close();
04415 
04416     delete[] buf;
04417 
04418     /*
04419     // define data type and files
04420     int targetColumn = 3;
04421     char columnType[] = "nnd";
04422     char enabledCol[] = "111";
04423     const char* dataFiles[] = {(new string(path+"/train.data"))->c_str(),(new string(path+"/test.data"))->c_str(),0};
04424 
04425     bool addConstantOne = true;
04426 
04427     // === TRAIN SET ===
04428     getDataBounds(dataFiles, " ", nFeat, nClass, nTrain, columnType, enabledCol, targetColumn, 0, false, 0, 0, addConstantOne);
04429     train = new REAL[nFeat*nTrain];
04430     trainLabel = new int[nTrain];
04431     getDataBounds(dataFiles, " ", nFeat, nClass, nTrain, columnType, enabledCol, targetColumn, 0, true, train, trainLabel, addConstantOne);
04432 
04433     // === TEST SET ===
04434     getDataBounds(dataFiles, " ", nFeat, nClass, nTest, columnType, enabledCol, targetColumn, 1, false, 0, 0, addConstantOne);
04435     test = new REAL[nFeat*nTest];
04436     testLabel = new int[nTest];
04437     getDataBounds(dataFiles, " ", nFeat, nClass, nTest, columnType, enabledCol, targetColumn, 1, true, test, testLabel, addConstantOne);
04438 
04439     // make numerical test targets
04440     makeNumericTrainAndTestTargets(nClass, nTrain, nTest, positiveTarget, negativeTarget, trainLabel, testLabel, trainTarget, testTarget);
04441     */
04442 
04443     /*
04444     // simulate more domains
04445     nDomain = 3;
04446     int* trainLabelTmp = new int[nTrain*nDomain];
04447     for(int i=0;i<nTrain;i++)
04448         for(int d=0;d<nDomain;d++)
04449             trainLabelTmp[i*nDomain + d] = trainLabel[i];
04450     delete[] trainLabel;
04451     trainLabel = trainLabelTmp;
04452 
04453     int* testLabelTmp = new int[nTest*nDomain];
04454     for(int i=0;i<nTest;i++)
04455         for(int d=0;d<nDomain;d++)
04456             testLabelTmp[i*nDomain + d] = testLabel[i];
04457     delete[] testLabel;
04458     testLabel = testLabelTmp;
04459 
04460     // train targets
04461     trainTarget = new REAL[nClass*nDomain*nTrain];
04462     for(int i=0;i<nTrain;i++)
04463     {
04464         for(int d=0;d<nDomain;d++)
04465         {
04466             for(int j=0;j<nClass;j++)
04467                 trainTarget[i*nClass*nDomain + d*nClass + j] = d==1?positiveTarget:negativeTarget;  // negative class labels
04468             trainTarget[i*nClass*nDomain + d*nClass + trainLabel[i*nDomain + d]] = d==1?negativeTarget:positiveTarget;  // positive class label
04469         }
04470     }
04471 
04472     // test targets
04473     testTarget = new REAL[nClass*nDomain*nTest];
04474     for(int i=0;i<nTest;i++)
04475     {
04476         for(int d=0;d<nDomain;d++)
04477         {
04478             for(int j=0;j<nClass;j++)
04479                 testTarget[i*nClass*nDomain + d*nClass + j] = d==1?positiveTarget:negativeTarget;  // negative class labels
04480             testTarget[i*nClass*nDomain + d*nClass + testLabel[i*nDomain + d]] = d==1?negativeTarget:positiveTarget;  // positive class label
04481         }
04482     }
04483     */
04484 }

void DatasetReader::readSURVIVAL ( string  path,
REAL *&  train,
REAL *&  trainTarget,
int *&  trainLabel,
REAL *&  test,
REAL *&  testTarget,
int *&  testLabel,
uint &  nTrain,
uint &  nTest,
int &  nClass,
int &  nDomain,
int &  nFeat,
REAL  positiveTarget = 1.0,
REAL  negativeTarget = -1.0 
)

Reads the SURVIVAL dataset (UCI)

3103Bytes haberman.data 1368Bytes haberman.names

Definition at line 4305 of file DatasetReader.cpp.

04306 {   // Reads <path>/haberman.data through getDataBounds, then splits the rows into train and test set (percentTest = 0.2)
04307     cout<<"Read SURVIVAL from: "<<path<<endl;
04308     nDomain = 1;
04309 
04310     // define data type and files (the meaning of the per-column codes is defined by getDataBounds, not shown here)
04311     int targetColumn = 4;  // NOTE(review): presumably the 1-based index of the label column - confirm in getDataBounds
04312     uint nTrainTmp;
04313     char columnType[] = "nnnd";
04314     char enabledCol[] = "1111";
04315     const string dataFile = path+"/haberman.data"; const char* dataFiles[] = { dataFile.c_str(),0};  // a local string owns the file name, so no heap-allocated string is leaked
04316 
04317     // === TRAIN SET === (first call: fillData keeps its default and no buffers are passed; nFeat, nClass, nTrainTmp are taken from it for the allocations below)
04318     getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0 );
04319 
04320     // allocate tmp mem
04321     REAL* trainTmp = new REAL[nTrainTmp * nFeat];
04322     int* trainLabelTmp = new int[nTrainTmp];
04323 
04324     // fill data (second call: fillData = true, writes into the temporary buffers)
04325     getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0, true, trainTmp, trainLabelTmp );
04326 
04327     // split train and testset from trainTmp (the output arrays are allocated inside splitRandomTestset)
04328     splitRandomTestset ( 0.2, trainTmp, trainLabelTmp, nTrainTmp, nFeat, nClass, train, trainLabel, trainTarget, test, testLabel, testTarget, nTrain, nTest, positiveTarget, negativeTarget );
04329 
04330     delete[] trainTmp;  // temporaries are no longer needed once the split has copied the rows
04331     delete[] trainLabelTmp;
04332 
04333 }

void DatasetReader::readVEHICLE ( string  path,
REAL *&  train,
REAL *&  trainTarget,
int *&  trainLabel,
REAL *&  test,
REAL *&  testTarget,
int *&  testLabel,
uint &  nTrain,
uint &  nTest,
int &  nClass,
int &  nDomain,
int &  nFeat,
REAL  positiveTarget = 1.0,
REAL  negativeTarget = -1.0 
)

Reads the VEHICLE dataset (UCI)

55517Bytes train.data 6386Bytes vehicle.doc

Definition at line 4126 of file DatasetReader.cpp.

04127 {   // Reads <path>/train.data through getDataBounds, then splits the rows into train and test set (percentTest = 0.2)
04128     cout<<"Read VEHICLE from: "<<path<<endl;
04129     nDomain = 1;
04130 
04131     // define data type and files (the meaning of the per-column codes is defined by getDataBounds, not shown here)
04132     int targetColumn = 19;  // NOTE(review): presumably the 1-based index of the label column - confirm in getDataBounds
04133     uint nTrainTmp;
04134     char columnType[] = "nnnnnnnnnnnnnnnnnnd";
04135     char enabledCol[] = "1111111111111111111";
04136     const string dataFile = path+"/train.data"; const char* dataFiles[] = { dataFile.c_str(),0};  // a local string owns the file name, so no heap-allocated string is leaked
04137 
04138     // === TRAIN SET === (first call: fillData keeps its default and no buffers are passed; nFeat, nClass, nTrainTmp are taken from it for the allocations below)
04139     getDataBounds ( dataFiles, " ", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0 );
04140 
04141     // allocate tmp mem
04142     REAL* trainTmp = new REAL[nTrainTmp * nFeat];
04143     int* trainLabelTmp = new int[nTrainTmp];
04144 
04145     // fill data (second call: fillData = true, writes into the temporary buffers)
04146     getDataBounds ( dataFiles, " ", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0, true, trainTmp, trainLabelTmp );
04147 
04148     // split train and testset from trainTmp (the output arrays are allocated inside splitRandomTestset)
04149     splitRandomTestset ( 0.2, trainTmp, trainLabelTmp, nTrainTmp, nFeat, nClass, train, trainLabel, trainTarget, test, testLabel, testTarget, nTrain, nTest, positiveTarget, negativeTarget );
04150 
04151     delete[] trainTmp;  // temporaries are no longer needed once the split has copied the rows
04152     delete[] trainLabelTmp;
04153 
04154 }

void DatasetReader::readVOTES ( string  path,
REAL *&  train,
REAL *&  trainTarget,
int *&  trainLabel,
REAL *&  test,
REAL *&  testTarget,
int *&  testLabel,
uint &  nTrain,
uint &  nTest,
int &  nClass,
int &  nDomain,
int &  nFeat,
REAL  positiveTarget = 1.0,
REAL  negativeTarget = -1.0 
)

Reads the VOTES dataset (UCI)

18171Bytes house-votes-84.data 6868Bytes house-votes-84.names

Definition at line 4162 of file DatasetReader.cpp.

04163 {   // Reads <path>/house-votes-84.data through getDataBounds, then splits the rows into train and test set (percentTest = 0.2)
04164     cout<<"Read VOTES from: "<<path<<endl;
04165     nDomain = 1;
04166 
04167     // define data type and files (the meaning of the per-column codes is defined by getDataBounds, not shown here)
04168     int targetColumn = 1;  // NOTE(review): presumably the 1-based index of the label column - confirm in getDataBounds
04169     uint nTrainTmp;
04170     char columnType[] = "ddddddddddddddddd";
04171     char enabledCol[] = "11111111111111111";
04172     const string dataFile = path+"/house-votes-84.data"; const char* dataFiles[] = { dataFile.c_str(),0};  // a local string owns the file name, so no heap-allocated string is leaked
04173 
04174     // === TRAIN SET === (first call: fillData keeps its default and no buffers are passed; nFeat, nClass, nTrainTmp are taken from it for the allocations below)
04175     getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0 );
04176 
04177     // allocate tmp mem
04178     REAL* trainTmp = new REAL[nTrainTmp * nFeat];
04179     int* trainLabelTmp = new int[nTrainTmp];
04180 
04181     // fill data (second call: fillData = true, writes into the temporary buffers)
04182     getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0, true, trainTmp, trainLabelTmp );
04183 
04184     // split train and testset from trainTmp (the output arrays are allocated inside splitRandomTestset)
04185     splitRandomTestset ( 0.2, trainTmp, trainLabelTmp, nTrainTmp, nFeat, nClass, train, trainLabel, trainTarget, test, testLabel, testTarget, nTrain, nTest, positiveTarget, negativeTarget );
04186 
04187     delete[] trainTmp;  // temporaries are no longer needed once the split has copied the rows
04188     delete[] trainLabelTmp;
04189 
04190 }

void DatasetReader::readWINE ( string  path,
REAL *&  train,
REAL *&  trainTarget,
int *&  trainLabel,
REAL *&  test,
REAL *&  testTarget,
int *&  testLabel,
uint &  nTrain,
uint &  nTest,
int &  nClass,
int &  nDomain,
int &  nFeat,
REAL  positiveTarget = 1.0,
REAL  negativeTarget = -1.0 
)

Reads the WINE dataset (UCI)

10782Bytes wine.data 3036Bytes wine.names

Definition at line 4198 of file DatasetReader.cpp.

04199 {   // Reads <path>/wine.data through getDataBounds, then splits the rows into train and test set (percentTest = 0.2)
04200     cout<<"Read WINE from: "<<path<<endl;
04201     nDomain = 1;
04202 
04203     // define data type and files (the meaning of the per-column codes is defined by getDataBounds, not shown here)
04204     int targetColumn = 1;  // NOTE(review): presumably the 1-based index of the label column - confirm in getDataBounds
04205     uint nTrainTmp;
04206     char columnType[] = "dnnnnnnnnnnnnn";
04207     char enabledCol[] = "11111111111111";
04208     const string dataFile = path+"/wine.data"; const char* dataFiles[] = { dataFile.c_str(),0};  // a local string owns the file name, so no heap-allocated string is leaked
04209 
04210     // === TRAIN SET === (first call: fillData keeps its default and no buffers are passed; nFeat, nClass, nTrainTmp are taken from it for the allocations below)
04211     getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0 );
04212 
04213     // allocate tmp mem
04214     REAL* trainTmp = new REAL[nTrainTmp * nFeat];
04215     int* trainLabelTmp = new int[nTrainTmp];
04216 
04217     // fill data (second call: fillData = true, writes into the temporary buffers)
04218     getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0, true, trainTmp, trainLabelTmp );
04219 
04220     // split train and testset from trainTmp (the output arrays are allocated inside splitRandomTestset)
04221     splitRandomTestset ( 0.2, trainTmp, trainLabelTmp, nTrainTmp, nFeat, nClass, train, trainLabel, trainTarget, test, testLabel, testTarget, nTrain, nTest, positiveTarget, negativeTarget );
04222 
04223     delete[] trainTmp;  // temporaries are no longer needed once the split has copied the rows
04224     delete[] trainLabelTmp;
04225 
04226 }

void DatasetReader::readYEAST ( string  path,
REAL *&  train,
REAL *&  trainTarget,
int *&  trainLabel,
REAL *&  test,
REAL *&  testTarget,
int *&  testLabel,
uint &  nTrain,
uint &  nTest,
int &  nClass,
int &  nDomain,
int &  nFeat,
REAL  positiveTarget = 1.0,
REAL  negativeTarget = -1.0 
)

Reads the YEAST dataset (UCI)

94976Bytes yeast.data 3313Bytes yeast.names

Definition at line 4269 of file DatasetReader.cpp.

04270 {   // Reads <path>/yeast.data through getDataBounds, then splits the rows into train and test set (percentTest = 0.2)
04271     cout<<"Read YEAST from: "<<path<<endl;
04272     nDomain = 1;
04273 
04274     // define data type and files (the meaning of the per-column codes is defined by getDataBounds, not shown here)
04275     int targetColumn = 10;  // NOTE(review): presumably the 1-based index of the label column - confirm in getDataBounds
04276     uint nTrainTmp;
04277     char columnType[] = "dnnnnnnnnd";
04278     char enabledCol[] = "0111111111";  // NOTE(review): the leading '0' presumably disables the first column - confirm in getDataBounds
04279     const string dataFile = path+"/yeast.data"; const char* dataFiles[] = { dataFile.c_str(),0};  // a local string owns the file name, so no heap-allocated string is leaked
04280 
04281     // === TRAIN SET === (first call: fillData keeps its default and no buffers are passed; nFeat, nClass, nTrainTmp are taken from it for the allocations below)
04282     getDataBounds ( dataFiles, "  ", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0 );
04283 
04284     // allocate tmp mem
04285     REAL* trainTmp = new REAL[nTrainTmp * nFeat];
04286     int* trainLabelTmp = new int[nTrainTmp];
04287 
04288     // fill data (second call: fillData = true, writes into the temporary buffers)
04289     getDataBounds ( dataFiles, "  ", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0, true, trainTmp, trainLabelTmp );
04290 
04291     // split train and testset from trainTmp (the output arrays are allocated inside splitRandomTestset)
04292     splitRandomTestset ( 0.2, trainTmp, trainLabelTmp, nTrainTmp, nFeat, nClass, train, trainLabel, trainTarget, test, testLabel, testTarget, nTrain, nTest, positiveTarget, negativeTarget );
04293 
04294     delete[] trainTmp;  // temporaries are no longer needed once the split has copied the rows
04295     delete[] trainLabelTmp;
04296 
04297 }

void DatasetReader::splitRandomTestset ( REAL  percentTest,
REAL *  data,
int *  labels,
int  nData,
int  nFeat,
int  nClass,
REAL *&  train,
int *&  trainLabel,
REAL *&  trainTarget,
REAL *&  test,
int *&  testLabel,
REAL *&  testTarget,
uint &  nTrain,
uint &  nTest,
REAL  positiveTarget,
REAL  negativeTarget,
bool  noRandom = false 
)

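Splits the data into a train set and a test set, either at random or (with noRandom) by taking the last percentTest fraction of the rows as the test set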

Definition at line 4811 of file DatasetReader.cpp.

04812 {
04813     // split the train and test set: each of the nData rows (nFeat values, row i starts at data[i*nFeat]) is copied into newly allocated train/test arrays
04814     if ( noRandom )
04815         cout<<"take the last percentTest:"<<100.0*percentTest<<"[%]"<<endl;
04816     else
04817         cout<<"random percentTest:"<<100.0*percentTest<<"[%]"<<endl;
04818     // two passes over the rows with the same decision rule: pass 1 only counts, pass 2 copies once the arrays exist
04819     // set train and test bounds (pass 1: count rows per set)
04820     nTrain = 0;
04821     nTest = 0;
04822     srand ( getRandomSeed() );  // assumes getRandomSeed() returns the same value again before pass 2, so both passes see the same draws - TODO confirm
04823     for ( int i=0;i<nData;i++ )
04824     {
04825         REAL r = ( double ) rand() / ( double ) RAND_MAX;  // draw in [0,1]
04826         if ( noRandom ) // take the last x as testset (r is overwritten; the rand() call above still happens)
04827             r = ( double ) i/ ( double ) nData< ( 1.0 - percentTest ) ?1.0:0.0;  // 1.0 while i/nData < 1-percentTest, 0.0 afterwards
04828         if ( r < percentTest )  // row i belongs to the test set
04829             nTest++;
04830         else
04831             nTrain++;
04832     }
04833     cout<<"nTrain:"<<nTrain<<endl;
04834     cout<<"nTest:"<<nTest<<endl;
04835 
04836     // allocate mem (new[]; this function does not free these arrays)
04837     train = new REAL[nTrain * nFeat];
04838     trainLabel = new int[nTrain];
04839     test = new REAL[nTest * nFeat];
04840     testLabel = new int[nTest];
04841 
04842     // fill train and test set (pass 2: the counters restart at 0 and double as write positions)
04843     nTrain = 0;
04844     nTest = 0;
04845     srand ( getRandomSeed() );  // re-seed so the draws of pass 1 are repeated (same assumption about getRandomSeed() as in pass 1)
04846     for ( int i=0;i<nData;i++ )
04847     {
04848         REAL r = ( double ) rand() / ( double ) RAND_MAX;
04849         if ( noRandom ) // take the last x as testset
04850             r = ( double ) i/ ( double ) nData< ( 1.0 - percentTest ) ?1.0:0.0;
04851         if ( r < percentTest )
04852         {
04853             for ( int j=0;j<nFeat;j++ )
04854                 test[nTest*nFeat + j] = data[i*nFeat + j];  // copy row i to the next free test row
04855             testLabel[nTest] = labels[i];
04856             nTest++;
04857         }
04858         else
04859         {
04860             for ( int j=0;j<nFeat;j++ )
04861                 train[nTrain*nFeat + j] = data[i*nFeat + j];  // copy row i to the next free train row
04862             trainLabel[nTrain] = labels[i];
04863             nTrain++;
04864         }
04865     }
04866 
04867     // check for NANs or INFs or too large numbers (outside [-1e10, 1e10]): print the offending entry, then assert
04868     for ( int i=0;i<nTrain*nFeat;i++ )
04869         if ( isnan ( train[i] ) || isinf ( train[i] ) || train[i]>1e10 || train[i]<-1e10 )
04870         {
04871             cout<<"train["<<i<<"]:"<<train[i]<<endl;
04872             assert ( false );  // only stops the program when assertions are enabled
04873         }
04874 
04875     for ( int i=0;i<nTest*nFeat;i++ )
04876         if ( isnan ( test[i] ) || isinf ( test[i] ) || test[i]>1e10 || test[i]<-1e10 )
04877         {
04878             cout<<"test["<<i<<"]:"<<test[i]<<endl;
04879             assert ( false );  // only stops the program when assertions are enabled
04880         }
04881 
04882     makeNumericTrainAndTestTargets ( nClass, nTrain, nTest, positiveTarget, negativeTarget, trainLabel, testLabel, trainTarget, testTarget );  // presumably allocates and fills trainTarget/testTarget from the labels - defined elsewhere, confirm
04883 }


The documentation for this class was generated from the following files:

Generated on Tue Jan 26 09:21:06 2010 for ELF by  doxygen 1.5.8