#include <DatasetReader.h>
Public Member Functions | |
DatasetReader () | |
~DatasetReader () | |
void | getDataBounds (const char **filenames, string delimiter, int &nFeat, int &nClass, uint &nLines, char *columnType, char *enabledCol, int targetColumn, int filenameID, bool fillData=false, REAL *data=0, int *labels=0, bool addConstantOne=true, bool skipFirstLine=false) |
void | splitRandomTestset (REAL percentTest, REAL *data, int *labels, int nData, int nFeat, int nClass, REAL *&train, int *&trainLabel, REAL *&trainTarget, REAL *&test, int *&testLabel, REAL *&testTarget, uint &nTrain, uint &nTest, REAL positiveTarget, REAL negativeTarget, bool noRandom=false) |
void | makeNumericTrainAndTestTargets (int nClass, int nTrain, int nTest, REAL positiveTarget, REAL negativeTarget, int *trainLabel, int *testLabel, REAL *&trainTarget, REAL *&testTarget) |
void | readMNIST (string path, REAL *&train, REAL *&trainTarget, int *&trainLabel, REAL *&test, REAL *&testTarget, int *&testLabel, uint &nTrain, uint &nTest, int &nClass, int &nDomain, int &nFeat, REAL positiveTarget=1.0, REAL negativeTarget=-1.0) |
void | readNETFLIX (string path, REAL *&train, REAL *&trainTarget, int *&trainLabel, REAL *&test, REAL *&testTarget, int *&testLabel, uint &nTrain, uint &nTest, int &nClass, int &nDomain, int &nFeat, REAL positiveTarget=1.0, REAL negativeTarget=-1.0) |
void | readKDDCup09Large (string path, REAL *&train, REAL *&trainTarget, int *&trainLabel, REAL *&test, REAL *&testTarget, int *&testLabel, uint &nTrain, uint &nTest, int &nClass, int &nDomain, int &nFeat, REAL positiveTarget=1.0, REAL negativeTarget=-1.0) |
void | readKDDCup09LargeBin (string path, REAL *&train, REAL *&trainTarget, int *&trainLabel, REAL *&test, REAL *&testTarget, int *&testLabel, uint &nTrain, uint &nTest, int &nClass, int &nDomain, int &nFeat, REAL positiveTarget=1.0, REAL negativeTarget=-1.0) |
void | readKDDCup09Small (string path, REAL *&train, REAL *&trainTarget, int *&trainLabel, REAL *&test, REAL *&testTarget, int *&testLabel, uint &nTrain, uint &nTest, int &nClass, int &nDomain, int &nFeat, REAL positiveTarget=1.0, REAL negativeTarget=-1.0) |
void | readAusDM2009 (string path, REAL *&train, REAL *&trainTarget, int *&trainLabel, REAL *&test, REAL *&testTarget, int *&testLabel, uint &nTrain, uint &nTest, int &nClass, int &nDomain, int &nFeat, REAL positiveTarget=1.0, REAL negativeTarget=-1.0) |
void | readBINARY (string path, REAL *&train, REAL *&trainTarget, int *&trainLabel, REAL *&test, REAL *&testTarget, int *&testLabel, uint &nTrain, uint &nTest, int &nClass, int &nDomain, int &nFeat, REAL positiveTarget=1.0, REAL negativeTarget=-1.0) |
void | readCSV (string path, REAL *&train, REAL *&trainTarget, int *&trainLabel, REAL *&test, REAL *&testTarget, int *&testLabel, uint &nTrain, uint &nTest, int &nClass, int &nDomain, int &nFeat, REAL positiveTarget=1.0, REAL negativeTarget=-1.0) |
void | readARFF (string path, REAL *&train, REAL *&trainTarget, int *&trainLabel, REAL *&test, REAL *&testTarget, int *&testLabel, uint &nTrain, uint &nTest, int &nClass, int &nDomain, int &nFeat, REAL positiveTarget=1.0, REAL negativeTarget=-1.0) |
void | readPRUDSYS (string path, REAL *&train, REAL *&trainTarget, int *&trainLabel, REAL *&test, REAL *&testTarget, int *&testLabel, uint &nTrain, uint &nTest, int &nClass, int &nDomain, int &nFeat, REAL positiveTarget=1.0, REAL negativeTarget=-1.0) |
void | readADULT (string path, REAL *&train, REAL *&trainTarget, int *&trainLabel, REAL *&test, REAL *&testTarget, int *&testLabel, uint &nTrain, uint &nTest, int &nClass, int &nDomain, int &nFeat, REAL positiveTarget=1.0, REAL negativeTarget=-1.0) |
void | readAUSTRALIAN (string path, REAL *&train, REAL *&trainTarget, int *&trainLabel, REAL *&test, REAL *&testTarget, int *&testLabel, uint &nTrain, uint &nTest, int &nClass, int &nDomain, int &nFeat, REAL positiveTarget=1.0, REAL negativeTarget=-1.0) |
void | readBALANCE (string path, REAL *&train, REAL *&trainTarget, int *&trainLabel, REAL *&test, REAL *&testTarget, int *&testLabel, uint &nTrain, uint &nTest, int &nClass, int &nDomain, int &nFeat, REAL positiveTarget=1.0, REAL negativeTarget=-1.0) |
void | readCYLINDERBANDS (string path, REAL *&train, REAL *&trainTarget, int *&trainLabel, REAL *&test, REAL *&testTarget, int *&testLabel, uint &nTrain, uint &nTest, int &nClass, int &nDomain, int &nFeat, REAL positiveTarget=1.0, REAL negativeTarget=-1.0) |
void | readBREASTCANCERWISCONSIN (string path, REAL *&train, REAL *&trainTarget, int *&trainLabel, REAL *&test, REAL *&testTarget, int *&testLabel, uint &nTrain, uint &nTest, int &nClass, int &nDomain, int &nFeat, REAL positiveTarget=1.0, REAL negativeTarget=-1.0) |
void | readAUSTRALIANCREDIT (string path, REAL *&train, REAL *&trainTarget, int *&trainLabel, REAL *&test, REAL *&testTarget, int *&testLabel, uint &nTrain, uint &nTest, int &nClass, int &nDomain, int &nFeat, REAL positiveTarget=1.0, REAL negativeTarget=-1.0) |
void | readDIABETES (string path, REAL *&train, REAL *&trainTarget, int *&trainLabel, REAL *&test, REAL *&testTarget, int *&testLabel, uint &nTrain, uint &nTest, int &nClass, int &nDomain, int &nFeat, REAL positiveTarget=1.0, REAL negativeTarget=-1.0) |
void | readGERMAN (string path, REAL *&train, REAL *&trainTarget, int *&trainLabel, REAL *&test, REAL *&testTarget, int *&testLabel, uint &nTrain, uint &nTest, int &nClass, int &nDomain, int &nFeat, REAL positiveTarget=1.0, REAL negativeTarget=-1.0) |
void | readGLASS (string path, REAL *&train, REAL *&trainTarget, int *&trainLabel, REAL *&test, REAL *&testTarget, int *&testLabel, uint &nTrain, uint &nTest, int &nClass, int &nDomain, int &nFeat, REAL positiveTarget=1.0, REAL negativeTarget=-1.0) |
void | readHEART (string path, REAL *&train, REAL *&trainTarget, int *&trainLabel, REAL *&test, REAL *&testTarget, int *&testLabel, uint &nTrain, uint &nTest, int &nClass, int &nDomain, int &nFeat, REAL positiveTarget=1.0, REAL negativeTarget=-1.0) |
void | readHEPATITIS (string path, REAL *&train, REAL *&trainTarget, int *&trainLabel, REAL *&test, REAL *&testTarget, int *&testLabel, uint &nTrain, uint &nTest, int &nClass, int &nDomain, int &nFeat, REAL positiveTarget=1.0, REAL negativeTarget=-1.0) |
void | readIONOSPHERE (string path, REAL *&train, REAL *&trainTarget, int *&trainLabel, REAL *&test, REAL *&testTarget, int *&testLabel, uint &nTrain, uint &nTest, int &nClass, int &nDomain, int &nFeat, REAL positiveTarget=1.0, REAL negativeTarget=-1.0) |
void | readIRIS (string path, REAL *&train, REAL *&trainTarget, int *&trainLabel, REAL *&test, REAL *&testTarget, int *&testLabel, uint &nTrain, uint &nTest, int &nClass, int &nDomain, int &nFeat, REAL positiveTarget=1.0, REAL negativeTarget=-1.0) |
void | readLETTER (string path, REAL *&train, REAL *&trainTarget, int *&trainLabel, REAL *&test, REAL *&testTarget, int *&testLabel, uint &nTrain, uint &nTest, int &nClass, int &nDomain, int &nFeat, REAL positiveTarget=1.0, REAL negativeTarget=-1.0) |
void | readMONKS1 (string path, REAL *&train, REAL *&trainTarget, int *&trainLabel, REAL *&test, REAL *&testTarget, int *&testLabel, uint &nTrain, uint &nTest, int &nClass, int &nDomain, int &nFeat, REAL positiveTarget=1.0, REAL negativeTarget=-1.0) |
void | readMONKS2 (string path, REAL *&train, REAL *&trainTarget, int *&trainLabel, REAL *&test, REAL *&testTarget, int *&testLabel, uint &nTrain, uint &nTest, int &nClass, int &nDomain, int &nFeat, REAL positiveTarget=1.0, REAL negativeTarget=-1.0) |
void | readMONKS3 (string path, REAL *&train, REAL *&trainTarget, int *&trainLabel, REAL *&test, REAL *&testTarget, int *&testLabel, uint &nTrain, uint &nTest, int &nClass, int &nDomain, int &nFeat, REAL positiveTarget=1.0, REAL negativeTarget=-1.0) |
void | readMUSHROOM (string path, REAL *&train, REAL *&trainTarget, int *&trainLabel, REAL *&test, REAL *&testTarget, int *&testLabel, uint &nTrain, uint &nTest, int &nClass, int &nDomain, int &nFeat, REAL positiveTarget=1.0, REAL negativeTarget=-1.0) |
void | readSATIMAGE (string path, REAL *&train, REAL *&trainTarget, int *&trainLabel, REAL *&test, REAL *&testTarget, int *&testLabel, uint &nTrain, uint &nTest, int &nClass, int &nDomain, int &nFeat, REAL positiveTarget=1.0, REAL negativeTarget=-1.0) |
void | readSEGMENTATION (string path, REAL *&train, REAL *&trainTarget, int *&trainLabel, REAL *&test, REAL *&testTarget, int *&testLabel, uint &nTrain, uint &nTest, int &nClass, int &nDomain, int &nFeat, REAL positiveTarget=1.0, REAL negativeTarget=-1.0) |
void | readSONAR (string path, REAL *&train, REAL *&trainTarget, int *&trainLabel, REAL *&test, REAL *&testTarget, int *&testLabel, uint &nTrain, uint &nTest, int &nClass, int &nDomain, int &nFeat, REAL positiveTarget=1.0, REAL negativeTarget=-1.0) |
void | readVEHICLE (string path, REAL *&train, REAL *&trainTarget, int *&trainLabel, REAL *&test, REAL *&testTarget, int *&testLabel, uint &nTrain, uint &nTest, int &nClass, int &nDomain, int &nFeat, REAL positiveTarget=1.0, REAL negativeTarget=-1.0) |
void | readVOTES (string path, REAL *&train, REAL *&trainTarget, int *&trainLabel, REAL *&test, REAL *&testTarget, int *&testLabel, uint &nTrain, uint &nTest, int &nClass, int &nDomain, int &nFeat, REAL positiveTarget=1.0, REAL negativeTarget=-1.0) |
void | readWINE (string path, REAL *&train, REAL *&trainTarget, int *&trainLabel, REAL *&test, REAL *&testTarget, int *&testLabel, uint &nTrain, uint &nTest, int &nClass, int &nDomain, int &nFeat, REAL positiveTarget=1.0, REAL negativeTarget=-1.0) |
void | readPOKER (string path, REAL *&train, REAL *&trainTarget, int *&trainLabel, REAL *&test, REAL *&testTarget, int *&testLabel, uint &nTrain, uint &nTest, int &nClass, int &nDomain, int &nFeat, REAL positiveTarget=1.0, REAL negativeTarget=-1.0) |
void | readYEAST (string path, REAL *&train, REAL *&trainTarget, int *&trainLabel, REAL *&test, REAL *&testTarget, int *&testLabel, uint &nTrain, uint &nTest, int &nClass, int &nDomain, int &nFeat, REAL positiveTarget=1.0, REAL negativeTarget=-1.0) |
void | readSURVIVAL (string path, REAL *&train, REAL *&trainTarget, int *&trainLabel, REAL *&test, REAL *&testTarget, int *&testLabel, uint &nTrain, uint &nTest, int &nClass, int &nDomain, int &nFeat, REAL positiveTarget=1.0, REAL negativeTarget=-1.0) |
void | readSPIDER (string path, REAL *&train, REAL *&trainTarget, int *&trainLabel, REAL *&test, REAL *&testTarget, int *&testLabel, uint &nTrain, uint &nTest, int &nClass, int &nDomain, int &nFeat, REAL positiveTarget=1.0, REAL negativeTarget=-1.0) |
Definition at line 30 of file DatasetReader.h.
DatasetReader::DatasetReader | ( | ) |
DatasetReader::~DatasetReader | ( | ) |
void DatasetReader::getDataBounds | ( | const char ** | filenames, | |
string | delimiter, | |||
int & | nFeat, | |||
int & | nClass, | |||
uint & | nLines, | |||
char * | columnType, | |||
char * | enabledCol, | |||
int | targetColumn, | |||
int | filenameID, | |||
bool | fillData = false , |
|||
REAL * | data = 0 , |
|||
int * | labels = 0 , |
|||
bool | addConstantOne = true , |
|||
bool | skipFirstLine = false | |||
) |
Read from a standard data matrix. If a numerical value is undefined, the mean value of that column is assigned.
e.g.: (last column is target class) 19910108,X126,NO,LINE,YES,Motter94,1911,55,46,0.2,17,78,0.75,20,13.1,1700,50.5,36.4,0,0,2.5,1,34,40,105,100,band 19910109,X266,NO,LINE,YES,Motter94,?,55,46,0.3,15,80,0.75,20,6.6,1900,54.9,38.5,0,0,2.5,0.7,34,40,105,100,noband 19910104,B7,NO,LINE,YES,WoodHoe70,?,62,40,0.433,16,80,?,30,6.5,1850,53.8,39.8,0,0,2.8,0.9,40,40,103.87,100,noband 19910104,T133,NO,LINE,YES,WoodHoe70,1910,52,40,0.3,16,75,0.3125,30,5.6,1467,55.6,38.8,0,0,2.5,1.3,40,40,108.06,100,noband 19910111,J34,NO,LINE,YES,WoodHoe70,1910,50,46,0.3,17,80,0.75,30,0,2100,57.5,42.5,5,0,2.3,0.6,35,40,106.67,100,noband
filenames | The dataset names | |
delimiter | The delimiter string, e.g.: ", " or "," | |
nFeat | Reference to the number of features (output value) | |
nLines | Reference to the number of lines in the dataset | |
columnType | A char* that selects the data type per column: 'd' for a discrete string value, 'n' for a numeric value | |
enabledCol | A char* that holds '0' or '1' per column, to reject or select that data column | |
targetColumn | The number of the column, that holds target classes (begin with 0) | |
filenameID | Select the filename in filenames | |
fillData | If true: fill REAL* data and int* labels with data | |
data | Pointer to data (allocated here) | |
labels | Pointer to labels (allocated here) |
Definition at line 4510 of file DatasetReader.cpp.
04511 { 04512 int bufSize = 1024*1024; 04513 int nFiles = 0; 04514 while ( filenames[nFiles] ) 04515 nFiles++; 04516 cout<<"nFiles:"<<nFiles<<endl; 04517 04518 fstream f; 04519 04520 for ( int i=0;i<nFiles;i++ ) 04521 { 04522 f.open ( filenames[i], ios::in ); 04523 if ( f.is_open() == false ) 04524 { 04525 cout<<"Can not open "<<filenames[i]<<endl; 04526 exit ( 0 ); 04527 } 04528 f.close(); 04529 } 04530 04531 int columnTypeSize = 0; 04532 while ( columnType[columnTypeSize] ) 04533 columnTypeSize++; 04534 cout<<"columnTypeSize:"<<columnTypeSize<<endl; 04535 char buf0[bufSize], buf1[bufSize]; 04536 int delimiterLength = delimiter.length(); 04537 const char* delimiterCharPtr = delimiter.c_str(); 04538 vector<string>* discreteValues = new vector<string>[columnTypeSize]; 04539 double* numericMean = new double[columnTypeSize]; 04540 int* numericMeanCnt = new int[columnTypeSize]; 04541 for ( int i=0;i<columnTypeSize;i++ ) 04542 { 04543 numericMean[i] = 0.0; 04544 numericMeanCnt[i] = 0; 04545 } 04546 for ( int fileCnt=0;fileCnt<nFiles;fileCnt++ ) 04547 { 04548 f.open ( filenames[fileCnt], ios::in ); 04549 if ( fileCnt == filenameID ) 04550 nLines = 0; 04551 04552 if ( skipFirstLine ) 04553 f.getline ( buf0, bufSize ); 04554 04555 while ( f.getline ( buf0, bufSize ) ) // read all lines 04556 { 04557 int cnt0 = 0, cnt1 = 0, cellCnt = 0; 04558 while ( buf0[cnt1] != 0 && cnt1 < bufSize ) // read all chars per line 04559 { 04560 int matchCnt = 0; 04561 for ( int i=0;i<delimiterLength;i++ ) 04562 matchCnt += delimiterCharPtr[i] == buf0[cnt1+i]; 04563 04564 if ( buf0[cnt1+delimiterLength]!=' ' && cnt1 > 0 && ( matchCnt == delimiterLength || buf0[cnt1+1] == 0 || buf0[cnt1+1] == '\r' ) ) // a delimiter match is found, or end of line 04565 { 04566 if ( cellCnt >= columnTypeSize ) 04567 break; 04568 04569 int addOne = 0; 04570 if ( buf0[cnt1+1] == 0 || buf0[cnt1+1] == '\r' ) 04571 addOne = 1; 04572 strncpy ( buf1, buf0 + cnt0, cnt1 - cnt0 + addOne ); 04573 buf1[cnt1 - cnt0 + addOne] 
= 0; 04574 cnt0 = cnt1 + delimiterLength; 04575 if ( cnt1 < cnt0 - 1 ) 04576 cnt1 = cnt0 - 1; 04577 if ( enabledCol[cellCnt] == '1' ) 04578 { 04579 if ( columnType[cellCnt] == 'd' ) 04580 { 04581 // search for existing 04582 bool exists = false; 04583 for ( int i=0;i<discreteValues[cellCnt].size();i++ ) 04584 if ( discreteValues[cellCnt][i] == string ( buf1 ) ) 04585 exists = true; 04586 if ( exists == false ) 04587 discreteValues[cellCnt].push_back ( string ( buf1 ) ); 04588 } 04589 else if ( columnType[cellCnt] == 'n' ) 04590 { 04591 if ( ( buf1[0] >= '0' && buf1[0] <= '9' ) || buf1[0] == '.' || buf1[0] == '-' ) // is a numeric value 04592 { 04593 float num; 04594 sscanf ( buf1,"%f",&num ); 04595 if ( fileCnt == filenameID ) 04596 { 04597 numericMean[cellCnt] += num; 04598 numericMeanCnt[cellCnt]++; 04599 } 04600 } 04601 else // is an unknown numeric value 04602 { 04603 ; 04604 } 04605 } 04606 else 04607 assert ( false ); 04608 } 04609 //cout<<cellCnt<<":"<<string(buf1)<<"|"; 04610 cellCnt++; 04611 if ( buf0[cnt1+1] == 0 ) 04612 break; 04613 } 04614 else if ( cnt1 == 0 && ( matchCnt == delimiterLength || buf0[cnt1+1] == 0 ) ) 04615 cnt0++; 04616 cnt1++; 04617 } 04618 //cout<<endl; 04619 04620 // if the line has content 04621 if ( cnt1 > 1 ) 04622 { 04623 if ( cellCnt != columnTypeSize && cellCnt > 1 ) 04624 { 04625 cout<<"cellCnt:"<<cellCnt<<" columnTypeSize:"<<columnTypeSize<<endl; 04626 assert ( false ); 04627 } 04628 if ( fileCnt == filenameID ) 04629 nLines++; 04630 } 04631 memset ( buf0, 0, bufSize ); 04632 } 04633 f.close(); 04634 04635 } 04636 04637 // calculate the total number of features 04638 nFeat = 0; 04639 cout<<"ValuesPerDiscreteInput:"<<endl; 04640 for ( int i=0;i<columnTypeSize;i++ ) 04641 { 04642 if ( i+1 != targetColumn ) 04643 { 04644 if ( enabledCol[i] == '1' ) 04645 { 04646 if ( columnType[i] == 'd' ) 04647 { 04648 cout<<i<<": #"<< ( int ) discreteValues[i].size() <<" {"; 04649 for ( int j=0;j<discreteValues[i].size();j++ ) 04650 
cout<<discreteValues[i][j]<<","; 04651 cout<<"}"<<endl; 04652 nFeat += discreteValues[i].size(); 04653 } 04654 else if ( columnType[i] == 'n' ) 04655 nFeat++; 04656 else 04657 assert ( false ); 04658 } 04659 } 04660 } 04661 if ( addConstantOne ) 04662 nFeat++; 04663 cout<<endl; 04664 04665 nClass = discreteValues[targetColumn-1].size(); 04666 cout<<"#Targets:"<< ( int ) nClass<<" {"; 04667 for ( int j=0;j<nClass;j++ ) 04668 { 04669 string value = discreteValues[targetColumn-1][j]; 04670 cout<<value<<","<<flush; 04671 } 04672 cout<<"}"<<endl; 04673 04674 cout<<endl; 04675 cout<<"nFeat:"<<nFeat<<endl; 04676 cout<<"nLines:"<<nLines<<endl; 04677 04678 if ( fillData ) 04679 { 04680 // clear data 04681 for ( int i=0;i<nLines*nFeat;i++ ) 04682 data[i] = 0.0; 04683 if ( addConstantOne ) 04684 { 04685 for ( int i=0;i<nLines;i++ ) 04686 data[i*nFeat + nFeat-1] = 1.0; 04687 } 04688 for ( int i=0;i<nLines;i++ ) 04689 labels[i] = 0; 04690 04691 f.open ( filenames[filenameID], ios::in ); 04692 nLines = 0; 04693 04694 if ( skipFirstLine ) 04695 f.getline ( buf0, bufSize ); 04696 04697 while ( f.getline ( buf0, bufSize ) ) // read all lines 04698 { 04699 int cnt0 = 0, cnt1 = 0, cellCnt = 0, pos = 0; 04700 while ( buf0[cnt1] != 0 && cnt1 < bufSize ) // read all chars per line 04701 { 04702 int matchCnt = 0; 04703 for ( int i=0;i<delimiterLength;i++ ) 04704 matchCnt += delimiterCharPtr[i] == buf0[cnt1+i]; 04705 04706 if ( buf0[cnt1+delimiterLength]!=' ' && cnt1 > 0 && ( matchCnt == delimiterLength || buf0[cnt1+1] == 0 || buf0[cnt1+1] == '\r' ) ) // a delimiter match is found, or end of line 04707 { 04708 if ( cellCnt >= columnTypeSize ) 04709 break; 04710 04711 int addOne = 0; 04712 if ( buf0[cnt1+1] == 0 || buf0[cnt1+1] == '\r' ) 04713 addOne = 1; 04714 strncpy ( buf1, buf0 + cnt0, cnt1 - cnt0 + addOne ); 04715 buf1[cnt1 - cnt0 + addOne] = 0; 04716 cnt0 = cnt1 + delimiterLength; 04717 if ( cnt1 < cnt0 - 1 ) 04718 cnt1 = cnt0 - 1; 04719 if ( enabledCol[cellCnt] == '1' ) 04720 { 
04721 if ( columnType[cellCnt] == 'd' ) // discete value: {"Hugo","Bart","Moe",..} 04722 { 04723 // search in existing values 04724 int searchPos = -1; 04725 for ( int i=0;i<discreteValues[cellCnt].size();i++ ) 04726 if ( discreteValues[cellCnt][i] == string ( buf1 ) ) 04727 searchPos = i; 04728 04729 if ( searchPos == -1 ) 04730 assert ( false ); 04731 04732 // assign value 04733 if ( cellCnt+1 == targetColumn ) 04734 { 04735 labels[nLines] = searchPos; 04736 } 04737 else 04738 { 04739 data[nLines*nFeat + pos + searchPos] = 1.0; 04740 pos += discreteValues[cellCnt].size(); 04741 } 04742 } 04743 else if ( columnType[cellCnt] == 'n' ) // numeric value like: 1.23 or .34 or 1.2e3 04744 { 04745 if ( ( buf1[0] >= '0' && buf1[0] <= '9' ) || buf1[0] == '.' || buf1[0] == '-' ) // is a numeric value 04746 { 04747 float num; 04748 sscanf ( buf1,"%f",&num ); 04749 data[nLines*nFeat + pos] = num; 04750 } 04751 else // is an unknown numeric value 04752 { 04753 data[nLines*nFeat + pos] = 0.0; 04754 if ( numericMeanCnt[cellCnt] > 0 ) 04755 data[nLines*nFeat + pos] = numericMean[cellCnt] / numericMeanCnt[cellCnt]; 04756 } 04757 pos++; 04758 } 04759 else 04760 assert ( false ); 04761 } 04762 cellCnt++; 04763 } 04764 else if ( cnt1 == 0 && ( matchCnt == delimiterLength || buf0[cnt1+1] == 0 ) ) 04765 cnt0++; 04766 cnt1++; 04767 } 04768 04769 // if the line has content 04770 if ( cnt1 > 1 ) 04771 { 04772 if ( cellCnt != columnTypeSize && cellCnt > 1 ) 04773 { 04774 cout<<"cellCnt:"<<cellCnt<<" columnTypeSize:"<<columnTypeSize<<endl; 04775 assert ( false ); 04776 } 04777 nLines++; 04778 04779 if ( pos != nFeat - ( int ) addConstantOne ) 04780 { 04781 cout<<"pos:"<<pos<<" nFeat:"<<nFeat<<endl; 04782 assert ( false ); 04783 } 04784 } 04785 memset ( buf0, 0, bufSize ); 04786 } 04787 f.close(); 04788 04789 // check for NANs or INFs or too large numbers 04790 for ( int i=0;i<nLines*nFeat;i++ ) 04791 if ( isnan ( data[i] ) || isinf ( data[i] ) || data[i]>1e10 || data[i]<-1e10 ) 04792 { 04793 
cout<<"data["<<i<<"]:"<<data[i]<<endl; 04794 assert ( false ); 04795 } 04796 for ( int i=0;i<nLines;i++ ) 04797 if ( isnan ( labels[i] ) || isinf ( labels[i] ) || labels[i]<0 ) 04798 { 04799 cout<<"labels["<<i<<"]:"<<labels[i]<<endl; 04800 assert ( false ); 04801 } 04802 04803 } 04804 04805 }
void DatasetReader::makeNumericTrainAndTestTargets | ( | int | nClass, | |
int | nTrain, | |||
int | nTest, | |||
REAL | positiveTarget, | |||
REAL | negativeTarget, | |||
int * | trainLabel, | |||
int * | testLabel, | |||
REAL *& | trainTarget, | |||
REAL *& | testTarget | |||
) |
make numeric train and test target vectors
Definition at line 4889 of file DatasetReader.cpp.
04890 { 04891 // train targets 04892 trainTarget = new REAL[nClass*nTrain]; 04893 for ( int i=0;i<nTrain;i++ ) 04894 { 04895 for ( int j=0;j<nClass;j++ ) 04896 trainTarget[i*nClass + j] = negativeTarget; // negative class labels 04897 trainTarget[i*nClass + trainLabel[i]] = positiveTarget; // positive class label 04898 } 04899 04900 // test targets 04901 testTarget = new REAL[nClass*nTest]; 04902 for ( int i=0;i<nTest;i++ ) 04903 { 04904 for ( int j=0;j<nClass;j++ ) 04905 testTarget[i*nClass + j] = negativeTarget; // negative class labels 04906 testTarget[i*nClass + testLabel[i]] = positiveTarget; // positive class label 04907 } 04908 04909 // check for NANs or INFs or too large numbers 04910 for ( int i=0;i<nTrain*nClass;i++ ) 04911 if ( isnan ( trainTarget[i] ) || isinf ( trainTarget[i] ) || trainTarget[i]>1e10 || trainTarget[i]<-1e10 ) 04912 { 04913 cout<<"trainTarget["<<i<<"]:"<<trainTarget[i]<<endl; 04914 assert ( false ); 04915 } 04916 04917 for ( int i=0;i<nTest*nClass;i++ ) 04918 if ( isnan ( testTarget[i] ) || isinf ( testTarget[i] ) || testTarget[i]>1e10 || testTarget[i]<-1e10 ) 04919 { 04920 cout<<"testTarget["<<i<<"]:"<<testTarget[i]<<endl; 04921 assert ( false ); 04922 } 04923 04924 }
void DatasetReader::readADULT | ( | string | path, | |
REAL *& | train, | |||
REAL *& | trainTarget, | |||
int *& | trainLabel, | |||
REAL *& | test, | |||
REAL *& | testTarget, | |||
int *& | testLabel, | |||
uint & | nTrain, | |||
uint & | nTest, | |||
int & | nClass, | |||
int & | nDomain, | |||
int & | nFeat, | |||
REAL | positiveTarget = 1.0 , |
|||
REAL | negativeTarget = -1.0 | |||
) |
Reads the ADULT dataset (UCI) 3974305Bytes adult.data 5229Bytes adult.names 2003153Bytes adult.test
Definition at line 3266 of file DatasetReader.cpp.
03267 { 03268 cout<<"Read ADULT from: "<<path<<endl; 03269 nDomain = 1; 03270 03271 // define data type and files 03272 int targetColumn = 15; 03273 char columnType[] = "ndndndddddnnndd"; 03274 char enabledCol[] = "111111111111111"; 03275 const char* dataFiles[] = { ( new string ( path+"/adult.data" ) )->c_str(), ( new string ( path+"/adult.test" ) )->c_str(),0}; 03276 03277 // === TRAIN SET === 03278 getDataBounds ( dataFiles, ", ", nFeat, nClass, nTrain, columnType, enabledCol, targetColumn, 0 ); 03279 03280 // allocate tmp mem 03281 train = new REAL[nTrain * nFeat]; 03282 trainLabel = new int[nTrain]; 03283 03284 // fill data 03285 getDataBounds ( dataFiles, ", ", nFeat, nClass, nTrain, columnType, enabledCol, targetColumn, 0, true, train, trainLabel ); 03286 03287 03288 // === TEST SET === 03289 getDataBounds ( dataFiles, ", ", nFeat, nClass, nTest, columnType, enabledCol, targetColumn, 1 ); 03290 03291 // allocate tmp mem 03292 test = new REAL[nTest * nFeat]; 03293 testLabel = new int[nTest]; 03294 03295 // fill data 03296 getDataBounds ( dataFiles, ", ", nFeat, nClass, nTest, columnType, enabledCol, targetColumn, 1, true, test, testLabel ); 03297 03298 // make numerical targets 03299 makeNumericTrainAndTestTargets ( nClass, nTrain, nTest, positiveTarget, negativeTarget, trainLabel, testLabel, trainTarget, testTarget ); 03300 }
void DatasetReader::readARFF | ( | string | path, | |
REAL *& | train, | |||
REAL *& | trainTarget, | |||
int *& | trainLabel, | |||
REAL *& | test, | |||
REAL *& | testTarget, | |||
int *& | testLabel, | |||
uint & | nTrain, | |||
uint & | nTest, | |||
int & | nClass, | |||
int & | nDomain, | |||
int & | nFeat, | |||
REAL | positiveTarget = 1.0 , |
|||
REAL | negativeTarget = -1.0 | |||
) |
Read the data in ARFF format see: http://www.cs.waikato.ac.nz/~ml/weka/arff.html
Definition at line 2083 of file DatasetReader.cpp.
02084 { 02085 cout<<"Read ARFF from: "<<path<<endl; 02086 nDomain = 1; 02087 02088 char* buf = new char[1024*1024]; 02089 char del = 0; 02090 string trainName, trainTargetColumn; 02091 02092 // read the settings file 02093 fstream fSetting(string(path+"/settings.txt").c_str(),ios::in); 02094 while ( fSetting.getline ( buf, 1024*1024 ) ) 02095 { 02096 string s = buf; 02097 size_t pos = s.find_first_of('='); 02098 string token = s.substr(0,pos); 02099 cout<<token<<endl; 02100 if(token == "trainTargetColumn") 02101 trainTargetColumn = s.substr(pos+1); 02102 else if(token == "train") 02103 trainName = s.substr(pos+1); 02104 } 02105 fSetting.close(); 02106 02107 if(trainName=="" || trainTargetColumn=="") 02108 assert(false); 02109 02110 // read training set 02111 fstream fTrain(string(path+"/"+trainName).c_str(),ios::in); 02112 vector<vector<REAL> > targets; 02113 vector<vector<REAL> > features; 02114 vector<string> featureNames; 02115 vector<map<string,int> > featureValues; 02116 bool dataMode = false; 02117 while ( fTrain.getline ( buf, 1024*1024 ) ) 02118 { 02119 string s = buf; // the line 02120 02121 if(s.length() == 0) // no empty lines 02122 continue; 02123 if(s[0] == '%') // skip comments 02124 continue; 02125 if(s[0] == '@') // control sign 02126 { 02127 dataMode = false; 02128 size_t spacePos0 = s.find_first_of(' '); 02129 string token = s.substr(0,spacePos0); // token from beginning 02130 02131 if(token == "@relation" || token == "@RELATION") 02132 cout<<"Dataset name:"<<s.substr(spacePos0+1)<<endl; 02133 else if(token == "@attribute" || token == "@ATTRIBUTE") 02134 { 02135 // @attribute 'family' {'?','GB','GK','GS','TN','ZA','ZF','ZH','ZM','ZS'} 02136 size_t spacePos1 = s.find_first_of(" \t", spacePos0+1); 02137 string featureName = s.substr(spacePos0+1,spacePos1-spacePos0-1); 02138 featureNames.push_back(featureName); 02139 02140 map<string,int> values; 02141 size_t curlyPos0 = s.find_first_of('{', spacePos1+1); 02142 size_t curlyPos1 = s.find_first_of('}', 
spacePos1+1); 02143 size_t pos = curlyPos0+1; 02144 if(curlyPos0 != string::npos && curlyPos1 != string::npos) 02145 { 02146 while(pos < s.length()) 02147 { 02148 size_t delPos = s.find_first_of(',',pos); 02149 if(delPos==string::npos) 02150 delPos = curlyPos1; 02151 string feature = s.substr(pos,delPos-pos); 02152 while(*(feature.begin()) == ' ') // remove leading spaces 02153 feature = feature.substr(1); 02154 if(feature.length() > 0) 02155 while(feature[feature.length()-1] == ' ') // remove ending spaces 02156 { 02157 feature = feature.substr(0,feature.length()-1); 02158 if(feature.length() == 0) 02159 break; 02160 } 02161 if(feature.length() > 0) 02162 values[feature] = values.size(); // assign new id 02163 pos += feature.length()+1; 02164 } 02165 } 02166 featureValues.push_back(values); // push empty map when having a "real" attribute 02167 } 02168 else if(token == "@data" || token == "@DATA") 02169 dataMode = true; 02170 } 02171 else if(dataMode) 02172 { 02173 // '?','C','A',8,0,'?','S','?',0,'?','?','G','?','?','?','?','?','?','?','?','?','?','?','?','?','?','?','?','?','?','?','COIL',0.7,610,0,'?','0','?','3' 02174 //cout<<featureValues.size()<<endl; 02175 size_t pos = 0; 02176 uint valueCnt = 0; 02177 vector<REAL> feature; 02178 while(pos < s.length()) 02179 { 02180 size_t delPos = s.find_first_of(',',pos); 02181 if(delPos==string::npos) 02182 delPos = s.length(); 02183 string value = s.substr(pos,delPos-pos); 02184 //cout<<value<<" "<<featureNames[valueCnt]<<endl; 02185 if(featureValues[valueCnt].size() == 0) // check for real-valued attribute 02186 { 02187 if(featureNames[valueCnt] == trainTargetColumn) 02188 { 02189 vector<REAL> target; 02190 target.push_back(atof(value.c_str())); 02191 targets.push_back(target); 02192 } 02193 else 02194 feature.push_back(atof(value.c_str())); 02195 } 02196 else // categorical type 02197 { 02198 uint catSize = featureValues[valueCnt].size(); 02199 if(featureNames[valueCnt] == trainTargetColumn) 02200 { 02201 
vector<REAL> target; 02202 map<string,int>::iterator it = featureValues[valueCnt].find(value); 02203 for(int i=0;i<catSize;i++) 02204 target.push_back(negativeTarget); 02205 uint catPos = it->second; 02206 target[catPos] = positiveTarget; 02207 targets.push_back(target); 02208 } 02209 else 02210 { 02211 map<string,int>::iterator it = featureValues[valueCnt].find(value); 02212 if(it == featureValues[valueCnt].end()) 02213 assert(false); 02214 for(int i=0;i<catSize;i++) 02215 feature.push_back(-1.0); // init with negative 02216 uint catPos = it->second; 02217 feature[feature.size()-catSize+catPos] = 1.0; 02218 } 02219 } 02220 valueCnt++; 02221 pos += value.length()+1; 02222 } 02223 features.push_back(feature); 02224 } 02225 } 02226 fTrain.close(); 02227 02228 assert(features.size() == targets.size()); 02229 02230 // print a short summary 02231 nTrain = features.size(); 02232 nFeat = features[0].size(); 02233 nClass = targets[0].size(); 02234 cout<<"nTrain:"<<nTrain<<" nFeat:"<<nFeat<<" nClass:"<<nClass<<" nFeatureNames:"<<featureNames.size()<<endl; 02235 for(int i=0;i<featureNames.size();i++) 02236 { 02237 cout<<"name:"<<featureNames[i]<<" "; 02238 if(featureValues[i].size() == 0) 02239 cout<<"[REAL]"; 02240 else 02241 { 02242 for(map<string,int>::iterator it = featureValues[i].begin();it != featureValues[i].end(); it++) 02243 cout<<"\""<<it->first<<"\" "; 02244 } 02245 cout<<endl; 02246 } 02247 02248 // allocate + fill train data 02249 train = new REAL[nFeat*nTrain]; 02250 trainTarget = new REAL[nClass*nTrain]; 02251 trainLabel = nClass > 1 ? 
new int[nTrain] : 0; 02252 02253 for(int i=0;i<nTrain;i++) 02254 { 02255 for(int j=0;j<nFeat;j++) 02256 train[i*nFeat+j] = features[i][j]; 02257 for(int j=0;j<nClass;j++) 02258 { 02259 trainTarget[i*nClass+j] = targets[i][j]; 02260 if(targets[i][j] == positiveTarget && nClass > 1) 02261 trainLabel[i] = j; 02262 } 02263 } 02264 02265 // no test set 02266 test = 0; 02267 testTarget = 0; 02268 testLabel = 0; 02269 nTest = 0; 02270 02271 delete[] buf; 02272 }
void DatasetReader::readAusDM2009 | ( | string | path, | |
REAL *& | train, | |||
REAL *& | trainTarget, | |||
int *& | trainLabel, | |||
REAL *& | test, | |||
REAL *& | testTarget, | |||
int *& | testLabel, | |||
uint & | nTrain, | |||
uint & | nTest, | |||
int & | nClass, | |||
int & | nDomain, | |||
int & | nFeat, | |||
REAL | positiveTarget = 1.0 , |
|||
REAL | negativeTarget = -1.0 | |||
) |
AusDM2009 competition http://www.tiberius.biz/ausdm09/ -rw-r--r-- 1 15136906 Sep 25 20:18 S_AUC_Score.csv -rw-r--r-- 1 15131946 Sep 25 20:18 S_AUC_Train.csv -rw-r--r-- 1 15137106 Sep 25 20:18 S_RMSE_Score.csv -rw-r--r-- 1 15171000 Sep 25 20:18 S_RMSE_Train.csv
Definition at line 2510 of file DatasetReader.cpp.
02511 { 02512 cout<<"Read AusDM2009 from: "<<path<<endl; 02513 //string nameTrain = "S_RMSE_Train.csv"; 02514 //string nameTest = "S_RMSE_Score.csv"; 02515 //string nameTrain = "M_RMSE_Train.csv"; 02516 //string nameTest = "M_RMSE_Score.csv"; 02517 string nameTrain = "L_RMSE_Train.csv"; 02518 string nameTest = "L_RMSE_Score.csv"; 02519 nClass = 1; 02520 bool addConstantOne = true; 02521 02522 if ( Framework::getDatasetType() == 1 ) // is classification 02523 { 02524 //nameTrain = "S_AUC_Train.csv"; 02525 //nameTest = "S_AUC_Score.csv"; 02526 //nameTrain = "M_AUC_Train.csv"; 02527 //nameTest = "M_AUC_Score.csv"; 02528 nameTrain = "L_AUC_Train.csv"; 02529 nameTest = "L_AUC_Score.csv"; 02530 nClass = 2; 02531 } 02532 02533 cout<<"nameTrain:"<<nameTrain<<" nameTest:"<<nameTest<<endl; 02534 02535 int bufSize = 1024*1024; 02536 char *buf = new char[bufSize]; 02537 02538 nDomain = 1; 02539 02540 fstream fTrainRMSE; 02541 fstream fTestRMSE; 02542 02543 // determine #cols 02544 fTrainRMSE.open ( ( path+"/"+nameTrain ).c_str(), ios::in ); 02545 fTrainRMSE.getline ( buf, bufSize ); 02546 fTrainRMSE.getline ( buf, bufSize ); 02547 nFeat = 0; 02548 char *ptr = buf, *ptrLast = buf; 02549 int pos = 0, val, colCnt = 0; 02550 while ( ptr[pos] ) 02551 { 02552 if ( ptr[pos] == ',' || ptr[pos+1] == 0 ) 02553 { 02554 sscanf ( ptrLast,"%d",&val ); 02555 ptrLast = ptr + pos + 1; 02556 colCnt++; 02557 if ( colCnt > 2 ) 02558 nFeat++; 02559 } 02560 pos++; 02561 } 02562 fTrainRMSE.close(); 02563 02564 if ( addConstantOne ) 02565 nFeat++; // constant one 02566 02567 // determine #rows train 02568 fTrainRMSE.open ( ( path+"/"+nameTrain ).c_str(), ios::in ); 02569 fTrainRMSE.getline ( buf, bufSize ); 02570 nTrain = 0; 02571 while ( fTrainRMSE.getline ( buf, bufSize ) ) 02572 nTrain++; 02573 fTrainRMSE.close(); 02574 02575 // determine #rows test 02576 fTestRMSE.open ( ( path+"/"+nameTest ).c_str(), ios::in ); 02577 fTestRMSE.getline ( buf, bufSize ); 02578 nTest = 0; 02579 while ( 
fTestRMSE.getline ( buf, bufSize ) ) 02580 nTest++; 02581 fTestRMSE.close(); 02582 02583 // alloc mem 02584 train = new REAL[nFeat*nTrain]; 02585 test = new REAL[nFeat*nTest]; 02586 if ( Framework::getDatasetType() == 1 ) 02587 { 02588 trainTarget = new REAL[nTrain*2]; 02589 trainLabel = new int[nTrain]; 02590 testTarget = new REAL[nTest*2]; 02591 testLabel = new int[nTest]; 02592 } 02593 else 02594 { 02595 trainTarget = new REAL[nTrain]; 02596 trainLabel = 0; 02597 testTarget = new REAL[nTest]; 02598 testLabel = 0; 02599 } 02600 02601 // read train 02602 fTrainRMSE.open ( ( path+"/"+nameTrain ).c_str(), ios::in ); 02603 fTrainRMSE.getline ( buf, bufSize ); 02604 nTrain = 0; 02605 while ( fTrainRMSE.getline ( buf, bufSize ) ) 02606 { 02607 ptr = buf; 02608 ptrLast = buf; 02609 pos = 0; 02610 colCnt = 0; 02611 while ( ptr[pos] ) 02612 { 02613 if ( ptr[pos] == ',' || ptr[pos+1] == 0 ) 02614 { 02615 sscanf ( ptrLast,"%d",&val ); 02616 ptrLast = ptr + pos + 1; 02617 colCnt++; 02618 if ( colCnt == 2 ) 02619 { 02620 if ( Framework::getDatasetType() == 1 ) 02621 { 02622 trainLabel[nTrain] = val>0? 0 : 1; 02623 trainTarget[2*nTrain+0] = val>0? positiveTarget : negativeTarget; 02624 trainTarget[2*nTrain+1] = val>0? 
negativeTarget : positiveTarget; 02625 } 02626 else 02627 trainTarget[nTrain] = ( REAL ) val * 0.001; 02628 //trainTarget[nTrain] = (REAL)val; 02629 } 02630 if ( colCnt > 2 ) 02631 train[nTrain*nFeat+colCnt-3] = ( REAL ) val * 0.001; 02632 //train[nTrain*nFeat+colCnt-3] = (REAL)val; 02633 } 02634 pos++; 02635 } 02636 if ( ( colCnt-3 != nFeat-1 && addConstantOne == false ) || ( colCnt-3 != nFeat-2 && addConstantOne == true ) ) 02637 { 02638 cout<<"colCnt:"<<colCnt<<" nFeat:"<<nFeat<<" addConstantOne:"<<addConstantOne<<endl; 02639 assert ( false ); 02640 } 02641 if ( addConstantOne ) 02642 train[nTrain*nFeat+nFeat-1] = 1.0; 02643 nTrain++; 02644 } 02645 fTrainRMSE.close(); 02646 02647 // read test 02648 fTestRMSE.open ( ( path+"/"+nameTest ).c_str(), ios::in ); 02649 fTestRMSE.getline ( buf, bufSize ); 02650 nTest = 0; 02651 while ( fTestRMSE.getline ( buf, bufSize ) ) 02652 { 02653 ptr = buf; 02654 ptrLast = buf; 02655 pos = 0; 02656 colCnt = 0; 02657 while ( ptr[pos] ) 02658 { 02659 if ( ptr[pos] == ',' || ptr[pos+1] == 0 ) 02660 { 02661 sscanf ( ptrLast,"%d",&val ); 02662 ptrLast = ptr + pos + 1; 02663 colCnt++; 02664 02665 if ( Framework::getDatasetType() == 1 ) 02666 { 02667 testTarget[nTest] = val>0? 0 : 1; 02668 testTarget[2*nTest+0] = val>0? positiveTarget : negativeTarget; 02669 testTarget[2*nTest+1] = val>0? 
negativeTarget : positiveTarget; 02670 } 02671 else 02672 testTarget[nTest] = ( REAL ) val * 0.001; 02673 //testTarget[nTest] = (REAL)val; 02674 if ( colCnt > 2 ) 02675 test[nTest*nFeat+colCnt-3] = ( REAL ) val * 0.001; 02676 //test[nTest*nFeat+colCnt-3] = (REAL)val; 02677 } 02678 pos++; 02679 } 02680 if ( ( colCnt-3 != nFeat-1 && addConstantOne == false ) || ( colCnt-3 != nFeat-2 && addConstantOne == true ) ) 02681 { 02682 cout<<"colCnt:"<<colCnt<<" nFeat:"<<nFeat<<" addConstantOne:"<<addConstantOne<<endl; 02683 assert ( false ); 02684 } 02685 if ( addConstantOne ) 02686 test[nTest*nFeat+nFeat-1] = 1.0; 02687 nTest++; 02688 } 02689 fTestRMSE.close(); 02690 02691 /* 02692 // random subspace idea 02693 REAL subspace = 0.45; 02694 //REAL subspace = 1.0; 02695 bool* subspaceBit = new bool[nFeat]; 02696 if(Framework::getFrameworkMode() == 0 && subspace < 1.0) // training 02697 { 02698 //srand(time(0)); 02699 cout<<"Create a random subspace:"<<subspace<<endl; 02700 fstream f((path+"/subspace.txt").c_str(), ios::out); 02701 for(int i=0;i<nFeat;i++) 02702 { 02703 subspaceBit[i] = (double)rand()/(double)RAND_MAX < subspace? 
true : false; 02704 subspaceBit[nFeat-1] = true; 02705 f<<(int)subspaceBit[i]<<endl; 02706 cout<<(int)subspaceBit[i]<<" "; 02707 } 02708 cout<<endl; 02709 f.close(); 02710 } 02711 else if(subspace < 1.0) // prediction 02712 { 02713 cout<<"Read the random subspace"<<endl; 02714 fstream f((path+"/subspace.txt").c_str(), ios::in); 02715 for(int i=0;i<nFeat;i++) 02716 { 02717 f>>subspaceBit[i]; 02718 cout<<(int)subspaceBit[i]<<" "; 02719 } 02720 cout<<endl; 02721 f.close(); 02722 } 02723 02724 if(subspace < 1.0) 02725 { 02726 int nFeatNew = 0; 02727 for(int i=0;i<nFeat;i++) 02728 nFeatNew += subspaceBit[i]; 02729 cout<<"nFeatNew:"<<nFeatNew<<endl; 02730 02731 REAL* trainNew = new REAL[nFeatNew*nTrain]; 02732 REAL* testNew = new REAL[nFeatNew*nTest]; 02733 02734 for(int i=0;i<nTrain;i++) 02735 { 02736 int cnt = 0; 02737 for(int j=0;j<nFeat;j++) 02738 { 02739 if(subspaceBit[j]) 02740 { 02741 trainNew[cnt + i * nFeatNew] = train[j + i * nFeat]; 02742 cnt++; 02743 } 02744 } 02745 } 02746 02747 for(int i=0;i<nTest;i++) 02748 { 02749 int cnt = 0; 02750 for(int j=0;j<nFeat;j++) 02751 { 02752 if(subspaceBit[j]) 02753 { 02754 testNew[cnt + i * nFeatNew] = test[j + i * nFeat]; 02755 cnt++; 02756 } 02757 } 02758 } 02759 02760 delete[] train; 02761 delete[] test; 02762 train = trainNew; 02763 test = testNew; 02764 nFeat = nFeatNew; 02765 } 02766 */ 02767 delete[] buf; 02768 }
void DatasetReader::readAUSTRALIAN | ( | string | path, | |
REAL *& | train, | |||
REAL *& | trainTarget, | |||
int *& | trainLabel, | |||
REAL *& | test, | |||
REAL *& | testTarget, | |||
int *& | testLabel, | |||
uint & | nTrain, | |||
uint & | nTest, | |||
int & | nClass, | |||
int & | nDomain, | |||
int & | nFeat, | |||
REAL | positiveTarget = 1.0 , |
|||
REAL | negativeTarget = -1.0 | |||
) |
Reads the AUSTRALIAN dataset (UCI): 9 folders containing 95 different signs in total, with 27 samples per sign (3 recordings of each sign per folder).
Definition at line 3307 of file DatasetReader.cpp.
03308 { 03309 cout<<"Read AUSTRALIAN from: "<<path<<endl; 03310 nDomain = 1; 03311 03312 char* dirs[] = {"tctodd1","tctodd2","tctodd3","tctodd4","tctodd5","tctodd6","tctodd7","tctodd8","tctodd9",0}; 03313 03314 char* signs[] = {"alive","all","answer","boy","building","buy","change_mind_","cold","come","computer_PC_","cost","crazy","danger","deaf","different","draw","drink","eat","exit","flash-light","forget","girl","give","glove","go","God","happy","head","hear","hello","his_hers","hot","how","hurry","hurt","I","innocent","is_true_","joke","juice","know","later","lose","love","make","man","maybe","mine","money","more","name","no","Norway","not-my-problem","paper","pen","please","polite","question","read","ready","research","responsible","right","sad","same","science","share","shop","soon","sorry","spend","stubborn","surprise","take","temper","thank","think","tray","us","voluntary","wait_notyet_","what","when","where","which","who","why","wild","will","write","wrong","yes","you","zero",0}; 03315 03316 nClass = 0; 03317 while ( signs[nClass] ) 03318 nClass++; 03319 03320 cout<<"nClass:"<<nClass<<endl; 03321 03322 fstream fTrain; 03323 03324 // get data bounds 03325 int nTrainTmp = 0; 03326 int dirCnt = 0; 03327 char buf[10000]; 03328 int maxFrames = 0; 03329 int dataPerLine = 22; 03330 while ( dirs[dirCnt] ) 03331 { 03332 int signCnt = 0; 03333 while ( signs[signCnt] ) 03334 { 03335 for ( int i=0;i<3;i++ ) 03336 { 03337 sprintf ( buf,"%s/%s/%s-%d.tsd",path.c_str(),dirs[dirCnt],signs[signCnt],i+1 ); 03338 fTrain.open ( buf, ios::in ); 03339 if ( fTrain.is_open() == false ) 03340 cout<<"Can not open "<<buf<<endl; 03341 else 03342 { 03343 int lines = 0; 03344 while ( fTrain.getline ( buf, 10000 ) ) // read all lines 03345 { 03346 stringstream ss ( buf ); 03347 REAL r; 03348 int cnt = 0; 03349 while ( ss>>r ) 03350 cnt++; 03351 if ( cnt != dataPerLine ) 03352 assert ( false ); 03353 lines++; 03354 } 03355 if ( lines > maxFrames ) 03356 maxFrames = lines; 03357 
nTrainTmp++; 03358 } 03359 fTrain.close(); 03360 } 03361 signCnt++; 03362 } 03363 dirCnt++; 03364 } 03365 03366 cout<<"nTrainTmp:"<<nTrainTmp<<endl; 03367 cout<<"maxFrames:"<<maxFrames<<endl; 03368 03369 nFeat = maxFrames * dataPerLine; 03370 cout<<"nFeat:"<<nFeat<<" ("<<maxFrames<<"*"<<dataPerLine<<")"<<endl; 03371 03372 // allocate tmp mem 03373 REAL* trainTmp = new REAL[nTrainTmp * nFeat]; 03374 int* trainLabelTmp = new int[nTrainTmp]; 03375 for ( int i=0;i<nTrainTmp * nFeat;i++ ) 03376 trainTmp[i] = 0.0; 03377 for ( int i=0;i<nTrainTmp;i++ ) 03378 trainLabelTmp[i] = 0; 03379 03380 // fill data 03381 nTrainTmp = 0; 03382 dirCnt = 0; 03383 while ( dirs[dirCnt] ) 03384 { 03385 int signCnt = 0; 03386 while ( signs[signCnt] ) 03387 { 03388 for ( int i=0;i<3;i++ ) 03389 { 03390 sprintf ( buf,"%s/%s/%s-%d.tsd",path.c_str(),dirs[dirCnt],signs[signCnt],i+1 ); 03391 fTrain.open ( buf, ios::in ); 03392 if ( fTrain.is_open() == false ) 03393 cout<<"Can not open "<<buf<<endl; 03394 else 03395 { 03396 int lines = 0; 03397 while ( fTrain.getline ( buf, 10000 ) ) // read all lines 03398 { 03399 stringstream ss ( buf ); 03400 REAL r; 03401 int cnt = 0; 03402 while ( ss>>r ) 03403 { 03404 trainTmp[nTrainTmp * nFeat + lines * dataPerLine + cnt] = r; 03405 trainLabelTmp[nTrainTmp] = signCnt; 03406 cnt++; 03407 } 03408 if ( cnt != dataPerLine ) 03409 assert ( false ); 03410 lines++; 03411 } 03412 nTrainTmp++; 03413 } 03414 fTrain.close(); 03415 } 03416 signCnt++; 03417 } 03418 dirCnt++; 03419 } 03420 03421 // split train and testset from trainTmp 03422 splitRandomTestset ( 0.2, trainTmp, trainLabelTmp, nTrainTmp, nFeat, nClass, train, trainLabel, trainTarget, test, testLabel, testTarget, nTrain, nTest, positiveTarget, negativeTarget ); 03423 03424 delete[] trainTmp; 03425 delete[] trainLabelTmp; 03426 03427 }
void DatasetReader::readAUSTRALIANCREDIT | ( | string | path, | |
REAL *& | train, | |||
REAL *& | trainTarget, | |||
int *& | trainLabel, | |||
REAL *& | test, | |||
REAL *& | testTarget, | |||
int *& | testLabel, | |||
uint & | nTrain, | |||
uint & | nTest, | |||
int & | nClass, | |||
int & | nDomain, | |||
int & | nFeat, | |||
REAL | positiveTarget = 1.0 , |
|||
REAL | negativeTarget = -1.0 | |||
) |
Reads the AUSTRALIAN-CREDIT dataset (UCI)
28735Bytes australian.dat 2467Bytes australian.doc
Definition at line 3553 of file DatasetReader.cpp.
03554 { 03555 cout<<"Read AUSTRALIAN-CREDIT from: "<<path<<endl; 03556 nDomain = 1; 03557 03558 // define data type and files 03559 int targetColumn = 15; 03560 uint nTrainTmp; 03561 char columnType[] = "dnndddnddnddnnd"; 03562 char enabledCol[] = "111111111111111"; 03563 const char* dataFiles[] = { ( new string ( path+"/australian.dat" ) )->c_str(),0}; 03564 03565 // === TRAIN SET === 03566 getDataBounds ( dataFiles, " ", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0 ); 03567 03568 // allocate tmp mem 03569 REAL* trainTmp = new REAL[nTrainTmp * nFeat]; 03570 int* trainLabelTmp = new int[nTrainTmp]; 03571 03572 // fill data 03573 getDataBounds ( dataFiles, " ", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0, true, trainTmp, trainLabelTmp ); 03574 03575 // split train and testset from trainTmp 03576 splitRandomTestset ( 0.2, trainTmp, trainLabelTmp, nTrainTmp, nFeat, nClass, train, trainLabel, trainTarget, test, testLabel, testTarget, nTrain, nTest, positiveTarget, negativeTarget ); 03577 03578 delete[] trainTmp; 03579 delete[] trainLabelTmp; 03580 }
void DatasetReader::readBALANCE | ( | string | path, | |
REAL *& | train, | |||
REAL *& | trainTarget, | |||
int *& | trainLabel, | |||
REAL *& | test, | |||
REAL *& | testTarget, | |||
int *& | testLabel, | |||
uint & | nTrain, | |||
uint & | nTest, | |||
int & | nClass, | |||
int & | nDomain, | |||
int & | nFeat, | |||
REAL | positiveTarget = 1.0 , |
|||
REAL | negativeTarget = -1.0 | |||
) |
Reads the BALANCE dataset (UCI)
Definition at line 3436 of file DatasetReader.cpp.
03437 { 03438 cout<<"Read BALANCE from: "<<path<<endl; 03439 nDomain = 1; 03440 03441 // define data type and files 03442 int targetColumn = 1; 03443 uint nTrainTmp; 03444 char columnType[] = "dnnnn"; 03445 char enabledCol[] = "11111"; 03446 const char* dataFiles[] = { ( new string ( path+"/balance-scale.data" ) )->c_str(),0}; 03447 03448 // === TRAIN SET === 03449 getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0 ); 03450 03451 // allocate tmp mem 03452 REAL* trainTmp = new REAL[nTrainTmp * nFeat]; 03453 int* trainLabelTmp = new int[nTrainTmp]; 03454 03455 // fill data 03456 getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0, true, trainTmp, trainLabelTmp ); 03457 03458 // split train and testset from trainTmp 03459 splitRandomTestset ( 0.2, trainTmp, trainLabelTmp, nTrainTmp, nFeat, nClass, train, trainLabel, trainTarget, test, testLabel, testTarget, nTrain, nTest, positiveTarget, negativeTarget ); 03460 03461 delete[] trainTmp; 03462 delete[] trainLabelTmp; 03463 03464 }
void DatasetReader::readBINARY | ( | string | path, | |
REAL *& | train, | |||
REAL *& | trainTarget, | |||
int *& | trainLabel, | |||
REAL *& | test, | |||
REAL *& | testTarget, | |||
int *& | testLabel, | |||
uint & | nTrain, | |||
uint & | nTest, | |||
int & | nClass, | |||
int & | nDomain, | |||
int & | nFeat, | |||
REAL | positiveTarget = 1.0 , |
|||
REAL | negativeTarget = -1.0 | |||
) |
Reads a binary dataset format (the result of feature selection). Limited to classification datasets only.
Format: [nExamples (4-byte INT), nClass (4-byte INT), nDomain (4-byte INT), nFeat (4-byte INT), features (nExamples*nFeat values of type REAL), labels (nExamples*nDomain values of type INT)]
Definition at line 1818 of file DatasetReader.cpp.
01819 { 01820 REAL* feat, *target; 01821 int* label, N; 01822 01823 fstream f; 01824 if ( Framework::getFrameworkMode() == 1 ) 01825 f.open ( ( path+"/binary.test" ).c_str(), ios::in ); 01826 else 01827 f.open ( ( path+"/binary.train" ).c_str(), ios::in ); 01828 01829 // dataset bounds 01830 f.read ( ( char* ) &N, sizeof ( int ) ); 01831 f.read ( ( char* ) &nClass, sizeof ( int ) ); 01832 f.read ( ( char* ) &nDomain, sizeof ( int ) ); 01833 f.read ( ( char* ) &nFeat, sizeof ( int ) ); 01834 01835 feat = new REAL[N*nFeat]; 01836 target = new REAL[N*nClass*nDomain]; 01837 label = new int[N*nDomain]; 01838 01839 // features and labels 01840 f.read ( ( char* ) feat, sizeof ( REAL ) *N*nFeat ); 01841 f.read ( ( char* ) label, sizeof ( int ) *N*nDomain ); 01842 f.close(); 01843 01844 for ( int i=0;i<N;i++ ) 01845 { 01846 for ( int j=0;j<nClass*nDomain;j++ ) 01847 target[i*nClass*nDomain+j] = negativeTarget; 01848 for ( int j=0;j<nDomain;j++ ) 01849 target[i*nClass*nDomain + j*nClass + label[i*nDomain+j]] = positiveTarget; 01850 } 01851 01852 if ( Framework::getFrameworkMode() == 1 ) 01853 { 01854 nTest = N; 01855 test = feat; 01856 testTarget = target; 01857 testLabel = label; 01858 train = 0; 01859 trainTarget = 0; 01860 trainLabel = 0; 01861 nTrain = 0; 01862 } 01863 else 01864 { 01865 nTrain = N; 01866 train = feat; 01867 trainTarget = target; 01868 trainLabel = label; 01869 test = 0; 01870 testTarget = 0; 01871 testLabel = 0; 01872 nTest = 0; 01873 } 01874 01875 }
void DatasetReader::readBREASTCANCERWISCONSIN | ( | string | path, | |
REAL *& | train, | |||
REAL *& | trainTarget, | |||
int *& | trainLabel, | |||
REAL *& | test, | |||
REAL *& | testTarget, | |||
int *& | testLabel, | |||
uint & | nTrain, | |||
uint & | nTest, | |||
int & | nClass, | |||
int & | nDomain, | |||
int & | nFeat, | |||
REAL | positiveTarget = 1.0 , |
|||
REAL | negativeTarget = -1.0 | |||
) |
Reads the BREAST-CANCER-WISCONSIN dataset (UCI)
19889Bytes breast-cancer-wisconsin.data 5657Bytes breast-cancer-wisconsin.names 21363Bytes unformatted-data 124103Bytes wdbc.data 4708Bytes wdbc.names 44234Bytes wpbc.data 5671Bytes wpbc.names
Definition at line 3515 of file DatasetReader.cpp.
03516 { 03517 cout<<"Read BREAST-CANCER-WISCONSIN from: "<<path<<endl; 03518 nDomain = 1; 03519 03520 // define data type and files 03521 int targetColumn = 11; 03522 uint nTrainTmp; 03523 char columnType[] = "nnnnnnnnnnd"; 03524 char enabledCol[] = "11111111111"; 03525 //char columnType[] = "ddddddddddd"; 03526 const char* dataFiles[] = { ( new string ( path+"/breast-cancer-wisconsin.data" ) )->c_str(),0}; 03527 03528 // === TRAIN SET === 03529 getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0 ); 03530 03531 // allocate tmp mem 03532 REAL* trainTmp = new REAL[nTrainTmp * nFeat]; 03533 int* trainLabelTmp = new int[nTrainTmp]; 03534 03535 // fill data 03536 getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0, true, trainTmp, trainLabelTmp ); 03537 03538 // split train and testset from trainTmp 03539 splitRandomTestset ( 0.2, trainTmp, trainLabelTmp, nTrainTmp, nFeat, nClass, train, trainLabel, trainTarget, test, testLabel, testTarget, nTrain, nTest, positiveTarget, negativeTarget ); 03540 03541 delete[] trainTmp; 03542 delete[] trainLabelTmp; 03543 03544 }
void DatasetReader::readCSV | ( | string | path, | |
REAL *& | train, | |||
REAL *& | trainTarget, | |||
int *& | trainLabel, | |||
REAL *& | test, | |||
REAL *& | testTarget, | |||
int *& | testLabel, | |||
uint & | nTrain, | |||
uint & | nTest, | |||
int & | nClass, | |||
int & | nDomain, | |||
int & | nFeat, | |||
REAL | positiveTarget = 1.0 , |
|||
REAL | negativeTarget = -1.0 | |||
) |
Handles both regression and classification. Reads a dataset in CSV format, with fields separated by a delimiter.
Definition at line 1882 of file DatasetReader.cpp.
01883 { 01884 cout<<"Read CSV from: "<<path<<endl; 01885 nDomain = 1; 01886 01887 char* buf = new char[1024*1024]; 01888 char del = 0; 01889 int trainTargetColumn = -1; 01890 string trainName, testName; 01891 01892 // read the settings file 01893 fstream fSetting(string(path+"/settings.txt").c_str(),ios::in); 01894 while ( fSetting.getline ( buf, 1024*1024 ) ) 01895 { 01896 string s = buf; 01897 size_t pos = s.find_first_of('='); 01898 string token = s.substr(0,pos); 01899 //cout<<token<<endl; 01900 if(token == "delimiter") 01901 del = buf[pos+1]; 01902 else if(token == "trainTargetColumn") 01903 trainTargetColumn = atoi(s.substr(pos+1).c_str()); 01904 else if(token == "train") 01905 trainName = s.substr(pos+1); 01906 else if(token == "test") 01907 testName = s.substr(pos+1); 01908 } 01909 fSetting.close(); 01910 01911 // check if available 01912 if(trainTargetColumn == -1 || del == 0 || trainName == "" || (Framework::getFrameworkMode() && testName == "")) 01913 assert(false); 01914 01915 // read training set 01916 fstream fTrain(string(path+"/"+trainName).c_str(),ios::in); 01917 vector<string> targets; 01918 map<string,int> targetMap; 01919 vector<vector<REAL> > features; 01920 while ( fTrain.getline ( buf, 1024*1024 ) ) 01921 { 01922 string s = buf; 01923 size_t lastPos = 0; 01924 vector<REAL> feature; 01925 for(int i=0;i<s.length();i++) 01926 { 01927 if(s[i] == del || i == s.length()-1) 01928 { 01929 string token = s.substr(lastPos,i-lastPos); // tokens from beginning 01930 if(i == s.length()-1) // the last token in the line 01931 token = s.substr(lastPos,i-lastPos+1); 01932 if(feature.size() == trainTargetColumn) // any value 01933 { 01934 targets.push_back(token); 01935 if(Framework::getDatasetType()) 01936 { 01937 map<string,int>::iterator it = targetMap.find(token); 01938 if(it == targetMap.end()) 01939 targetMap[token] = targetMap.size(); 01940 } 01941 } 01942 else // real value 01943 { 01944 if((token[0] == '-' || token[0] == '.' 
|| token[0] >= '0' && token[0] <= '9') == 0) // real value check 01945 assert(false); 01946 REAL value = atof(token.c_str()); 01947 //cout<<value<<" "; 01948 feature.push_back(value); 01949 } 01950 lastPos = i+1; 01951 } 01952 } 01953 if(feature.size()) 01954 features.push_back(feature); 01955 //cout<<targets[targets.size()-1]<<endl; 01956 } 01957 fTrain.close(); 01958 01959 // count the different targets in a classification problem 01960 nClass = 1; 01961 if(Framework::getDatasetType()) 01962 { 01963 nClass = targetMap.size(); 01964 map<string,int>::iterator it; 01965 cout<<"Target values: "; 01966 for(it=targetMap.begin();it!=targetMap.end();it++) 01967 cout<<"["<<it->second<<"]"<<it->first<<" "; 01968 cout<<endl; 01969 } 01970 01971 // assign bounds and allocate mem 01972 nTrain = features.size(); 01973 nTest = 0; 01974 nFeat = features[0].size(); 01975 train = new REAL[nFeat*nTrain]; 01976 trainTarget = new REAL[nClass*nTrain]; 01977 if(Framework::getDatasetType()) 01978 trainLabel = new int[nTrain]; 01979 01980 // fill train data 01981 for(int i=0;i<nTrain;i++) 01982 { 01983 for(int j=0;j<nFeat;j++) // fill features 01984 train[i*nFeat+j] = features[i][j]; 01985 if(Framework::getDatasetType()) // classification dataset ? 01986 { 01987 int label = targetMap[targets[i]]; 01988 trainLabel[i] = label; 01989 for(int j=0;j<nClass;j++) 01990 trainTarget[i*nClass+j] = (j==label? 
positiveTarget : negativeTarget); 01991 } 01992 else // regression dataset 01993 { 01994 REAL target = atof(targets[i].c_str()); 01995 trainTarget[i] = target; 01996 } 01997 } 01998 01999 // read test set 02000 if(Framework::getFrameworkMode()) 02001 { 02002 fstream fTest(string(path+"/"+testName).c_str(),ios::in); 02003 targets.clear(); 02004 features.clear(); 02005 while ( fTest.getline ( buf, 1024*1024 ) ) 02006 { 02007 string s = buf; 02008 size_t lastPos = 0; 02009 vector<REAL> feature; 02010 for(int i=0;i<s.length();i++) 02011 { 02012 if(s[i] == del || i == s.length()-1) 02013 { 02014 string token = s.substr(lastPos,i-lastPos); // tokens from beginning 02015 if(i == s.length()-1) // the last token in the line 02016 token = s.substr(lastPos,i-lastPos+1); 02017 if(feature.size() == trainTargetColumn) // any value 02018 { 02019 targets.push_back(token); 02020 if(Framework::getDatasetType()) 02021 { 02022 map<string,int>::iterator it = targetMap.find(token); 02023 if(it == targetMap.end()) 02024 targetMap[token] = targetMap.size(); 02025 } 02026 } 02027 else // real value 02028 { 02029 if((token[0] == '-' || token[0] == '.' 
|| token[0] >= '0' && token[0] <= '9') == 0) // real value check 02030 assert(false); 02031 REAL value = atof(token.c_str()); 02032 //cout<<value<<" "; 02033 feature.push_back(value); 02034 } 02035 lastPos = i+1; 02036 } 02037 } 02038 if(feature.size()) 02039 features.push_back(feature); 02040 //cout<<targets[targets.size()-1]<<endl; 02041 } 02042 fTest.close(); 02043 02044 // assign bounds and allocate mem 02045 nTest = features.size(); 02046 test = new REAL[nFeat*nTest]; 02047 testTarget = new REAL[nClass*nTest]; 02048 if(Framework::getDatasetType()) 02049 testLabel = new int[nTrain]; 02050 02051 // fill train data 02052 for(int i=0;i<nTest;i++) 02053 { 02054 for(int j=0;j<nFeat;j++) // fill features 02055 test[i*nFeat+j] = features[i][j]; 02056 if(targets.size() == features.size()) 02057 { 02058 if(Framework::getDatasetType()) // classification dataset ? 02059 { 02060 int label = targetMap[targets[i]]; 02061 testLabel[i] = label; 02062 for(int j=0;j<nClass;j++) 02063 testTarget[i*nClass+j] = (j==label? positiveTarget : negativeTarget); 02064 } 02065 else // regression dataset 02066 { 02067 REAL target = atof(targets[i].c_str()); 02068 testTarget[i] = target; 02069 } 02070 } 02071 } 02072 02073 } 02074 02075 delete[] buf; 02076 }
void DatasetReader::readCYLINDERBANDS | ( | string | path, | |
REAL *& | train, | |||
REAL *& | trainTarget, | |||
int *& | trainLabel, | |||
REAL *& | test, | |||
REAL *& | testTarget, | |||
int *& | testLabel, | |||
uint & | nTrain, | |||
uint & | nTest, | |||
int & | nClass, | |||
int & | nDomain, | |||
int & | nFeat, | |||
REAL | positiveTarget = 1.0 , |
|||
REAL | negativeTarget = -1.0 | |||
) |
Reads the CYLINDER-BANDS dataset (UCI)
Definition at line 3473 of file DatasetReader.cpp.
03474 { 03475 cout<<"Read CYLINDER-BANDS from: "<<path<<endl; 03476 nDomain = 1; 03477 03478 // define data type and files 03479 int targetColumn = 40; 03480 uint nTrainTmp; 03481 char columnType[] = "ndddddddddddddddddddnnnnnnnnnnnnnnnnnnnd"; 03482 char enabledCol[] = "1111111111111111111111111111111111111111"; 03483 const char* dataFiles[] = { ( new string ( path+"/bands.data" ) )->c_str(),0}; 03484 03485 // === TRAIN SET === 03486 getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0 ); 03487 03488 // allocate tmp mem 03489 REAL* trainTmp = new REAL[nTrainTmp * nFeat]; 03490 int* trainLabelTmp = new int[nTrainTmp]; 03491 03492 // fill data 03493 getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0, true, trainTmp, trainLabelTmp ); 03494 03495 // split train and testset from trainTmp 03496 splitRandomTestset ( 0.2, trainTmp, trainLabelTmp, nTrainTmp, nFeat, nClass, train, trainLabel, trainTarget, test, testLabel, testTarget, nTrain, nTest, positiveTarget, negativeTarget ); 03497 03498 delete[] trainTmp; 03499 delete[] trainLabelTmp; 03500 03501 }
void DatasetReader::readDIABETES | ( | string | path, | |
REAL *& | train, | |||
REAL *& | trainTarget, | |||
int *& | trainLabel, | |||
REAL *& | test, | |||
REAL *& | testTarget, | |||
int *& | testLabel, | |||
uint & | nTrain, | |||
uint & | nTest, | |||
int & | nClass, | |||
int & | nDomain, | |||
int & | nFeat, | |||
REAL | positiveTarget = 1.0 , |
|||
REAL | negativeTarget = -1.0 | |||
) |
Reads the DIABETES dataset (UCI)
23279Bytes pima-indians-diabetes.data 3067Bytes pima-indians-diabetes.names
Definition at line 3589 of file DatasetReader.cpp.
03590 { 03591 cout<<"Read DIABETES from: "<<path<<endl; 03592 nDomain = 1; 03593 03594 // define data type and files 03595 int targetColumn = 9; 03596 uint nTrainTmp; 03597 char columnType[] = "nnnnnnnnd"; 03598 char enabledCol[] = "111111111"; 03599 const char* dataFiles[] = { ( new string ( path+"/pima-indians-diabetes.data" ) )->c_str(),0}; 03600 03601 // === TRAIN SET === 03602 getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0 ); 03603 03604 // allocate tmp mem 03605 REAL* trainTmp = new REAL[nTrainTmp * nFeat]; 03606 int* trainLabelTmp = new int[nTrainTmp]; 03607 03608 // fill data 03609 getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0, true, trainTmp, trainLabelTmp ); 03610 03611 // split train and testset from trainTmp 03612 splitRandomTestset ( 0.2, trainTmp, trainLabelTmp, nTrainTmp, nFeat, nClass, train, trainLabel, trainTarget, test, testLabel, testTarget, nTrain, nTest, positiveTarget, negativeTarget ); 03613 03614 delete[] trainTmp; 03615 delete[] trainLabelTmp; 03616 03617 }
void DatasetReader::readGERMAN | ( | string | path, | |
REAL *& | train, | |||
REAL *& | trainTarget, | |||
int *& | trainLabel, | |||
REAL *& | test, | |||
REAL *& | testTarget, | |||
int *& | testLabel, | |||
uint & | nTrain, | |||
uint & | nTest, | |||
int & | nClass, | |||
int & | nDomain, | |||
int & | nFeat, | |||
REAL | positiveTarget = 1.0 , |
|||
REAL | negativeTarget = -1.0 | |||
) |
Reads the GERMAN dataset (UCI)
79793Bytes german.data 102000Bytes german.data-numeric 4679Bytes german.doc
Definition at line 3626 of file DatasetReader.cpp.
03627 { 03628 cout<<"Read GERMAN from: "<<path<<endl; 03629 nDomain = 1; 03630 03631 // define data type and files 03632 int targetColumn = 21; 03633 uint nTrainTmp; 03634 char columnType[] = "dnddnddnddndnddndnddd"; 03635 char enabledCol[] = "111111111111111111111"; 03636 const char* dataFiles[] = { ( new string ( path+"/german.data" ) )->c_str(),0}; 03637 03638 // === TRAIN SET === 03639 getDataBounds ( dataFiles, " ", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0 ); 03640 03641 // allocate tmp mem 03642 REAL* trainTmp = new REAL[nTrainTmp * nFeat]; 03643 int* trainLabelTmp = new int[nTrainTmp]; 03644 03645 // fill data 03646 getDataBounds ( dataFiles, " ", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0, true, trainTmp, trainLabelTmp ); 03647 03648 // split train and testset from trainTmp 03649 splitRandomTestset ( 0.2, trainTmp, trainLabelTmp, nTrainTmp, nFeat, nClass, train, trainLabel, trainTarget, test, testLabel, testTarget, nTrain, nTest, positiveTarget, negativeTarget ); 03650 03651 delete[] trainTmp; 03652 delete[] trainLabelTmp; 03653 03654 }
void DatasetReader::readGLASS | ( | string | path, | |
REAL *& | train, | |||
REAL *& | trainTarget, | |||
int *& | trainLabel, | |||
REAL *& | test, | |||
REAL *& | testTarget, | |||
int *& | testLabel, | |||
uint & | nTrain, | |||
uint & | nTest, | |||
int & | nClass, | |||
int & | nDomain, | |||
int & | nFeat, | |||
REAL | positiveTarget = 1.0 , |
|||
REAL | negativeTarget = -1.0 | |||
) |
Reads the GLASS dataset (UCI)
11903Bytes glass.data 3506Bytes glass.names 780Bytes glass.tag
Definition at line 3663 of file DatasetReader.cpp.
03664 { 03665 cout<<"Read GLASS from: "<<path<<endl; 03666 nDomain = 1; 03667 03668 // define data type and files 03669 int targetColumn = 11; 03670 uint nTrainTmp; 03671 char columnType[] = "nnnnnnnnnnd"; 03672 char enabledCol[] = "01111111111"; 03673 const char* dataFiles[] = { ( new string ( path+"/glass.data" ) )->c_str(),0}; 03674 03675 // === TRAIN SET === 03676 getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0 ); 03677 03678 // allocate tmp mem 03679 REAL* trainTmp = new REAL[nTrainTmp * nFeat]; 03680 int* trainLabelTmp = new int[nTrainTmp]; 03681 03682 // fill data 03683 getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0, true, trainTmp, trainLabelTmp ); 03684 03685 // split train and testset from trainTmp 03686 splitRandomTestset ( 0.2, trainTmp, trainLabelTmp, nTrainTmp, nFeat, nClass, train, trainLabel, trainTarget, test, testLabel, testTarget, nTrain, nTest, positiveTarget, negativeTarget ); 03687 03688 delete[] trainTmp; 03689 delete[] trainLabelTmp; 03690 03691 }
void DatasetReader::readHEART | ( | string | path, | |
REAL *& | train, | |||
REAL *& | trainTarget, | |||
int *& | trainLabel, | |||
REAL *& | test, | |||
REAL *& | testTarget, | |||
int *& | testLabel, | |||
uint & | nTrain, | |||
uint & | nTest, | |||
int & | nClass, | |||
int & | nDomain, | |||
int & | nFeat, | |||
REAL | positiveTarget = 1.0 , |
|||
REAL | negativeTarget = -1.0 | |||
) |
Reads the HEART dataset (UCI)
4979Bytes SPECTF.names 33459Bytes SPECTF.test 10797Bytes SPECTF.train
Definition at line 3700 of file DatasetReader.cpp.
03701 { 03702 cout<<"Read HEART from: "<<path<<endl; 03703 nDomain = 1; 03704 03705 // define data type and files 03706 int targetColumn = 1; 03707 char columnType[] = "dnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn"; 03708 char enabledCol[] = "111111111111111111111111111111111111111111111"; 03709 const char* dataFiles[] = { ( new string ( path+"/SPECTF.train" ) )->c_str(), ( new string ( path+"/SPECTF.test" ) )->c_str(),0}; 03710 03711 // === TRAIN SET === 03712 getDataBounds ( dataFiles, ",", nFeat, nClass, nTrain, columnType, enabledCol, targetColumn, 0 ); 03713 train = new REAL[nFeat*nTrain]; 03714 trainLabel = new int[nTrain]; 03715 getDataBounds ( dataFiles, ",", nFeat, nClass, nTrain, columnType, enabledCol, targetColumn, 0, true, train, trainLabel ); 03716 03717 // === TEST SET === 03718 getDataBounds ( dataFiles, ",", nFeat, nClass, nTest, columnType, enabledCol, targetColumn, 1 ); 03719 test = new REAL[nFeat*nTest]; 03720 testLabel = new int[nTest]; 03721 getDataBounds ( dataFiles, ",", nFeat, nClass, nTest, columnType, enabledCol, targetColumn, 1, true, test, testLabel ); 03722 03723 // make numerical test targets 03724 makeNumericTrainAndTestTargets ( nClass, nTrain, nTest, positiveTarget, negativeTarget, trainLabel, testLabel, trainTarget, testTarget ); 03725 03726 }
void DatasetReader::readHEPATITIS | ( | string | path, | |
REAL *& | train, | |||
REAL *& | trainTarget, | |||
int *& | trainLabel, | |||
REAL *& | test, | |||
REAL *& | testTarget, | |||
int *& | testLabel, | |||
uint & | nTrain, | |||
uint & | nTest, | |||
int & | nClass, | |||
int & | nDomain, | |||
int & | nFeat, | |||
REAL | positiveTarget = 1.0 , |
|||
REAL | negativeTarget = -1.0 | |||
) |
Reads the HEPATITIS dataset (UCI)
7545Bytes hepatitis.data 3098Bytes hepatitis.names
Definition at line 3734 of file DatasetReader.cpp.
03735 { 03736 cout<<"Read HEPATITIS from: "<<path<<endl; 03737 nDomain = 1; 03738 03739 // define data type and files 03740 int targetColumn = 1; 03741 uint nTrainTmp; 03742 char columnType[] = "dnnnnnnnnnnnnnnnnnnn"; 03743 char enabledCol[] = "11111111111111111111"; 03744 const char* dataFiles[] = { ( new string ( path+"/hepatitis.data" ) )->c_str(),0}; 03745 03746 // === TRAIN SET === 03747 getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0 ); 03748 03749 // allocate tmp mem 03750 REAL* trainTmp = new REAL[nTrainTmp * nFeat]; 03751 int* trainLabelTmp = new int[nTrainTmp]; 03752 03753 // fill data 03754 getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0, true, trainTmp, trainLabelTmp ); 03755 03756 // split train and testset from trainTmp 03757 splitRandomTestset ( 0.2, trainTmp, trainLabelTmp, nTrainTmp, nFeat, nClass, train, trainLabel, trainTarget, test, testLabel, testTarget, nTrain, nTest, positiveTarget, negativeTarget ); 03758 03759 delete[] trainTmp; 03760 delete[] trainLabelTmp; 03761 03762 }
void DatasetReader::readIONOSPHERE | ( | string | path, | |
REAL *& | train, | |||
REAL *& | trainTarget, | |||
int *& | trainLabel, | |||
REAL *& | test, | |||
REAL *& | testTarget, | |||
int *& | testLabel, | |||
uint & | nTrain, | |||
uint & | nTest, | |||
int & | nClass, | |||
int & | nDomain, | |||
int & | nFeat, | |||
REAL | positiveTarget = 1.0 , |
|||
REAL | negativeTarget = -1.0 | |||
) |
Reads the IONOSPHERE dataset (UCI)
76467Bytes ionosphere.data 3116Bytes ionosphere.names
Definition at line 3770 of file DatasetReader.cpp.
03771 { 03772 cout<<"Read IONOSPHERE from: "<<path<<endl; 03773 nDomain = 1; 03774 03775 // define data type and files 03776 int targetColumn = 35; 03777 uint nTrainTmp; 03778 char columnType[] = "nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnd"; 03779 char enabledCol[] = "11111111111111111111111111111111111"; 03780 const char* dataFiles[] = { ( new string ( path+"/ionosphere.data" ) )->c_str(),0}; 03781 03782 // === TRAIN SET === 03783 getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0 ); 03784 03785 // allocate tmp mem 03786 REAL* trainTmp = new REAL[nTrainTmp * nFeat]; 03787 int* trainLabelTmp = new int[nTrainTmp]; 03788 03789 // fill data 03790 getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0, true, trainTmp, trainLabelTmp ); 03791 03792 // split train and testset from trainTmp 03793 splitRandomTestset ( 0.2, trainTmp, trainLabelTmp, nTrainTmp, nFeat, nClass, train, trainLabel, trainTarget, test, testLabel, testTarget, nTrain, nTest, positiveTarget, negativeTarget ); 03794 03795 delete[] trainTmp; 03796 delete[] trainLabelTmp; 03797 03798 }
void DatasetReader::readIRIS | ( | string | path, | |
REAL *& | train, | |||
REAL *& | trainTarget, | |||
int *& | trainLabel, | |||
REAL *& | test, | |||
REAL *& | testTarget, | |||
int *& | testLabel, | |||
uint & | nTrain, | |||
uint & | nTest, | |||
int & | nClass, | |||
int & | nDomain, | |||
int & | nFeat, | |||
REAL | positiveTarget = 1.0 , |
|||
REAL | negativeTarget = -1.0 | |||
) |
Reads the IRIS dataset (UCI)
4551Bytes iris.data 2998Bytes iris.names
Definition at line 3807 of file DatasetReader.cpp.
03808 { 03809 cout<<"Read IRIS from: "<<path<<endl; 03810 nDomain = 1; 03811 03812 // define data type and files 03813 int targetColumn = 5; 03814 uint nTrainTmp; 03815 char columnType[] = "nnnnd"; 03816 char enabledCol[] = "11111"; 03817 const char* dataFiles[] = { ( new string ( path+"/iris.data" ) )->c_str(),0}; 03818 03819 // === TRAIN SET === 03820 getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0 ); 03821 03822 // allocate tmp mem 03823 REAL* trainTmp = new REAL[nTrainTmp * nFeat]; 03824 int* trainLabelTmp = new int[nTrainTmp]; 03825 03826 // fill data 03827 getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0, true, trainTmp, trainLabelTmp ); 03828 03829 // split train and testset from trainTmp 03830 splitRandomTestset ( 0.2, trainTmp, trainLabelTmp, nTrainTmp, nFeat, nClass, train, trainLabel, trainTarget, test, testLabel, testTarget, nTrain, nTest, positiveTarget, negativeTarget ); 03831 03832 delete[] trainTmp; 03833 delete[] trainLabelTmp; 03834 03835 }
void DatasetReader::readKDDCup09Large | ( | string | path, | |
REAL *& | train, | |||
REAL *& | trainTarget, | |||
int *& | trainLabel, | |||
REAL *& | test, | |||
REAL *& | testTarget, | |||
int *& | testLabel, | |||
uint & | nTrain, | |||
uint & | nTest, | |||
int & | nClass, | |||
int & | nDomain, | |||
int & | nFeat, | |||
REAL | positiveTarget = 1.0 , |
|||
REAL | negativeTarget = -1.0 | |||
) |
Reads the KDDCup09Large dataset
Definition at line 120 of file DatasetReader.cpp.
00121 { 00122 if ( 0 ) 00123 { 00124 readKDDCup09LargeBin ( path, train, trainTarget, trainLabel, test, testTarget, testLabel, nTrain, nTest, nClass, nDomain, nFeat, positiveTarget, negativeTarget ); 00125 return; 00126 } 00127 00128 time_t t0 = time ( 0 ); 00129 00130 nDomain = 3; 00131 00132 cout<<"Read KDDCup09 from: "<<path<<endl; 00133 00134 // 8-full-numeric inputs 00135 // (epoch=10) reg=0.00113318 .... [classErr:49.6833%] [rmse:0.99835] [probe:-0.498109] [CalcBlend] lambda:0.001 [classErr:61.3693%] [rmse:1.12297] ERR=-0.648214 35[s] !min! [saveBest][SB] 00136 00137 char* targetFiles[] = //"orange_large_train_toy.labels", 00138 { 00139 "orange_large_train_churn.labels", 00140 "orange_large_train_appetency.labels", 00141 "orange_large_train_upselling.labels" 00142 }; 00143 00144 int nPreAlloc = 100000000; 00145 char *buf0 = new char[512], *buf1 = new char[512]; 00146 char* lineBuf = new char[nPreAlloc]; 00147 //REAL* support; 00148 //int* supportCnt; 00149 00150 int NUM = 14740, CAT = 260, NLINES = 50000; 00151 int nFiles = 5; 00152 bool setNumZerosToMeans = false; 00153 bool setMissingToMeans = false; 00154 int numericMinMissing = 1; 00155 int numericMaxCluster = 0; // add categoric (one-hot) from numeric input, max. 
occurence cnt 00156 int minAttributeOccurenceCategorical = 50*nFiles; // 20 00157 int minAttributeOccurenceNumerical = 500*nFiles; // 200 00158 REAL maxSTD = 1e10; // 10 00159 cout<<"nFiles:"<<nFiles<<" minAttrOccurCat:"<<minAttributeOccurenceCategorical<<" minAttrOccurNum:"<<minAttributeOccurenceNumerical<<endl; 00160 cout<<setNumZerosToMeans<<" "<<setMissingToMeans<<" "<<numericMaxCluster<<" "<<minAttributeOccurenceCategorical<<" "<<minAttributeOccurenceNumerical<<" "<<maxSTD<<endl; 00161 00162 vector<string>* numericalAttributes = new vector<string>[NUM]; 00163 vector<int>* numericalAttributesCnt = new vector<int>[NUM]; 00164 vector<string>* categoricalAttributes = new vector<string>[CAT]; 00165 vector<int>* categoricalAttributesCnt = new vector<int>[CAT]; 00166 bool* categoricalHasMissingBin = new bool[CAT]; 00167 int* categoricalMissingCnt = new int[CAT]; 00168 bool* categoricalHasUnknownBin = new bool[CAT]; 00169 for ( int i=0;i<CAT;i++ ) 00170 { 00171 categoricalHasMissingBin[i] = false; 00172 categoricalHasUnknownBin[i] = false; 00173 categoricalMissingCnt[i] = 0; 00174 } 00175 int* numericNonZeroCnt = new int[NUM]; 00176 int* numericMissingCnt = new int[NUM]; 00177 bool* numericHasMissingBin = new bool[NUM]; 00178 double* numericNonZeroPercent = new double[NUM]; 00179 for ( int i=0;i<NUM;i++ ) 00180 { 00181 numericMissingCnt[i] = 0; 00182 numericNonZeroCnt[i] = 0; 00183 numericNonZeroPercent[i] = 0.0; 00184 numericHasMissingBin[i] = false; 00185 } 00186 00187 double* minValues = new double[100000]; 00188 double* maxValues = new double[100000]; 00189 double* maxNormValues = new double[100000]; 00190 double* meanValues = new double[100000]; 00191 double* stdValues = new double[100000]; 00192 double* mean2Values = new double[100000]; 00193 int* meanCnt = new int[100000]; 00194 for ( int i=0;i<100000;i++ ) 00195 { 00196 minValues[i] = 1e20; 00197 maxValues[i] = -1e20; 00198 maxNormValues[i] = 0.0; 00199 meanValues[i] = 0.0; 00200 mean2Values[i] = 0.0; 00201 
meanCnt[i] = 0; 00202 stdValues[i] = 0.0; 00203 } 00204 00205 //=========================================================================================================================== 00206 //=========================================================================================================================== 00207 // Loop over 2 states: 00208 // - State=0 read train values (+build index tables) 00209 // - State=1 store to features (train or test) 00210 // 00211 for ( int state=0;state<2;state++ ) 00212 { 00213 int nTrainFill = 0; 00214 if ( state == 0 ) 00215 { 00216 nTrain = 0; 00217 } 00218 00219 //======================================================================================================================= 00220 //======================================================================================================================= 00221 // Loop over n files (file chunks) 00222 // 00223 for ( int file=0;file<nFiles;file++ ) 00224 { 00225 // open train or test set 00226 if ( state == 0 ) 00227 sprintf ( buf0,"%s/orange_large_train.data.chunk%d",path.c_str(), file+1 ); 00228 else 00229 { 00230 if ( Framework::getFrameworkMode() == 1 ) 00231 sprintf ( buf0,"%s/orange_large_test.data.chunk%d",path.c_str(), file+1 ); 00232 else 00233 sprintf ( buf0,"%s/orange_large_train.data.chunk%d",path.c_str(), file+1 ); 00234 } 00235 00236 cout<<"Open:"<<buf0<<endl; 00237 fstream f; 00238 f.open ( buf0, ios::in ); 00239 if ( f.is_open() == false ) 00240 assert ( false ); 00241 00242 // read the first line in the first file (dummy) 00243 if ( file==0 ) 00244 f.getline ( lineBuf, nPreAlloc ); 00245 00246 // tmp and count vars 00247 double zeroRatio = 0.0; 00248 double sparse = 0.0; 00249 int nTrainTmp = 0; 00250 00251 //=================================================================================================================== 00252 //=================================================================================================================== 00253 
// Read all lines of chunk file n 00254 // 00255 while ( f.getline ( lineBuf, nPreAlloc ) ) 00256 { 00257 if ( nTrainTmp%1000 == 0 ) 00258 cout<<"."<<flush; 00259 00260 // tmp and count vars 00261 int pos0 = 0, pos1 = 0; 00262 int nF = 0, nMissing = 0, nZeros = 0; 00263 int nFeatFill = 0; 00264 int nrHot = 0; 00265 double value; 00266 00267 if ( state == 1 ) 00268 { 00269 // add constant one 00270 train[nTrainFill*nFeat + nFeatFill] = 1.0; 00271 nFeatFill++; 00272 } 00273 00274 //=============================================================================================================== 00275 //=============================================================================================================== 00276 // Go through all characters of this line 00277 // 00278 while ( lineBuf[pos1] ) 00279 { 00280 // search for next tabulator 00281 while ( lineBuf[pos1] != '\t' && lineBuf[pos1] != 0 ) 00282 pos1++; 00283 00284 //=========================================================================================================== 00285 //=========================================================================================================== 00286 // If the feature has some content 00287 // This means no consecutive tabs 00288 // 00289 if ( pos1 > pos0 && lineBuf[pos1]!=0 ) 00290 { 00291 // copy to tmp buffer 00292 if ( pos1-pos0 <=0 || pos1-pos0 >= 512 ) 00293 assert ( false ); 00294 for ( int j=0;j<pos1-pos0;j++ ) 00295 buf1[j] = lineBuf[pos0+j]; 00296 buf1[pos1-pos0] = 0; 00297 00298 00299 //======================================================================================================= 00300 //======================================================================================================= 00301 // Read Numeric value (feature count < NUM) 00302 // 00303 if ( nF < NUM ) 00304 { 00305 if ( ( buf1[0]>='0' && buf1[0] <='9' ) || buf1[0]=='-' ) 00306 ; 00307 else 00308 { 00309 cout<<"BUF:"<<buf1<<endl; 00310 assert ( false ); 00311 } 00312 00313 
//sscanf(buf1, "%f", &value); 00314 value = atof ( buf1 ); 00315 00316 if ( value == 0.0 ) 00317 nZeros++; 00318 00319 // first run through train data 00320 if ( state==0 ) 00321 { 00322 if ( minValues[nF] > value ) 00323 minValues[nF] = value; 00324 if ( maxValues[nF] < value ) 00325 maxValues[nF] = value; 00326 00327 // histogram over numeric values 00328 int size = numericalAttributes[nF].size(); 00329 if ( size < numericMaxCluster ) 00330 { 00331 int foundIndex = -1; 00332 for ( int j=0;j<size;j++ ) 00333 if ( numericalAttributes[nF][j] == buf1 ) 00334 { 00335 foundIndex = j; 00336 break; 00337 } 00338 // add value 00339 if ( foundIndex == -1 ) 00340 { 00341 numericalAttributes[nF].push_back ( buf1 ); 00342 numericalAttributesCnt[nF].push_back ( 1 ); 00343 } 00344 else 00345 numericalAttributesCnt[nF][foundIndex]++; 00346 } 00347 00348 if ( value != 0.0 ) 00349 { 00350 numericNonZeroCnt[nF]++; 00351 if ( numericNonZeroCnt[nF] > nTrain+nTrainTmp+1 ) 00352 { 00353 cout<<"numericNonZeroCnt[nF]:"<<numericNonZeroCnt[nF]<<" nF:"<<nF<<" nTrainTmp:"<<nTrainTmp<<" nZeros:"<<nZeros<<" pos0:"<<pos0<<" pos1:"<<pos1<<endl; 00354 assert ( false ); 00355 } 00356 } 00357 00358 if ( value != 0.0 ) 00359 { 00360 // calc mean over numeric input 00361 meanValues[nF] += value; 00362 mean2Values[nF] += value * value; 00363 meanCnt[nF]++; 00364 } 00365 } 00366 else if ( state==1 ) // second run, fill data tables 00367 { 00368 if ( numericNonZeroCnt[nF] >= minAttributeOccurenceNumerical && maxNormValues[nF] < stdValues[nF]*maxSTD ) 00369 { 00370 // numeric add 00371 if ( value == 0.0 && setNumZerosToMeans ) 00372 train[nTrainFill*nFeat + nFeatFill] = meanValues[nF]; 00373 else 00374 train[nTrainFill*nFeat + nFeatFill] = value; 00375 nFeatFill++; 00376 00377 // numeric one hot add 00378 int size = numericalAttributes[nF].size(); 00379 if ( size < numericMaxCluster && size > 1 ) 00380 { 00381 int foundIndex = -1; 00382 for ( int j=0;j<size;j++ ) 00383 if ( numericalAttributes[nF][j] == 
buf1 ) 00384 { 00385 foundIndex = j; 00386 break; 00387 } 00388 // fill categorical 00389 int beforeHot = nrHot; 00390 for ( int j=0;j<size;j++ ) 00391 { 00392 if ( foundIndex == j ) 00393 { 00394 train[nTrainFill*nFeat + nFeatFill] = 1.0; 00395 nrHot++; 00396 } 00397 else 00398 train[nTrainFill*nFeat + nFeatFill] = 0.0; 00399 nFeatFill++; 00400 } 00401 // fill missing 00402 /*if(nrHot == beforeHot) 00403 train[nTrainFill*nFeat + nFeatFill] = 0.0; 00404 else 00405 train[nTrainFill*nFeat + nFeatFill] = 1.0; 00406 nFeatFill++;*/ 00407 } 00408 00409 } 00410 00411 // missing values one-hot encoded 00412 if ( numericHasMissingBin[nF] ) 00413 { 00414 train[nTrainFill*nFeat + nFeatFill] = 0.0; // <- missing 00415 nFeatFill++; 00416 train[nTrainFill*nFeat + nFeatFill] = 1.0; // <- available 00417 nFeatFill++; 00418 } 00419 } 00420 } 00421 //======================================================================================================= 00422 //======================================================================================================= 00423 // Read Categorical value (feature count >= NUM) 00424 // 00425 else 00426 { 00427 int index = nF-NUM; 00428 if ( index >= CAT ) 00429 assert ( false ); 00430 int size = categoricalAttributes[index].size(); 00431 int sizeCnt = categoricalAttributesCnt[index].size(); 00432 if ( size != sizeCnt ) 00433 assert ( false ); 00434 00435 int foundIndex = -1; 00436 for ( int j=0;j<size;j++ ) 00437 if ( categoricalAttributes[index][j] == buf1 ) 00438 { 00439 foundIndex = j; 00440 break; 00441 } 00442 00443 // first run through train data 00444 if ( state==0 ) 00445 { 00446 // add value 00447 if ( foundIndex == -1 ) 00448 { 00449 categoricalAttributes[index].push_back ( buf1 ); 00450 categoricalAttributesCnt[index].push_back ( 1 ); 00451 } 00452 else // already exists 00453 categoricalAttributesCnt[index][foundIndex]++; 00454 } 00455 else if ( state==1 ) // second run, fill data tables 00456 { 00457 // one-hot encoding 00458 int 
fillCnt = 0; 00459 int beforeHot = nrHot; 00460 for ( int j=0;j<size;j++ ) 00461 { 00462 if ( categoricalAttributesCnt[index][j] >= minAttributeOccurenceCategorical && categoricalAttributesCnt[index][j] < nTrain ) 00463 { 00464 if ( foundIndex == j ) 00465 { 00466 train[nTrainFill*nFeat + nFeatFill] = 1.0; 00467 nrHot++; 00468 } 00469 else 00470 train[nTrainFill*nFeat + nFeatFill] = 0.0; 00471 fillCnt++; 00472 nFeatFill++; 00473 } 00474 } 00475 00476 // no missing (no consecutive tabs here) 00477 if ( categoricalHasMissingBin[index] ) 00478 { 00479 train[nTrainFill*nFeat + nFeatFill] = 0.0; 00480 fillCnt++; 00481 nFeatFill++; 00482 } 00483 00484 // if found, but not in cache 00485 if ( categoricalHasUnknownBin[index] ) 00486 { 00487 if ( beforeHot == nrHot ) 00488 { 00489 train[nTrainFill*nFeat + nFeatFill] = 1.0; 00490 nrHot++; 00491 } 00492 else 00493 train[nTrainFill*nFeat + nFeatFill] = 0.0; 00494 fillCnt++; 00495 nFeatFill++; 00496 } 00497 00498 if ( nrHot != beforeHot + 1 && fillCnt > 0 ) 00499 { 00500 cout<<"WARNING: foundIndex:"<<foundIndex<<" "<<size<<" "<<index<<" "<<nrHot<<" "<<beforeHot<<" fill:"<<fillCnt<<endl; 00501 //assert(false); 00502 } 00503 } 00504 } 00505 } 00506 //=========================================================================================================== 00507 //=========================================================================================================== 00508 // If the feature has no content 00509 // Missing value here 00510 // 00511 else 00512 { 00513 nMissing++; 00514 00515 if ( state==0 ) 00516 { 00517 // numeric 00518 if ( nF < NUM ) 00519 { 00520 numericMissingCnt[nF]++; 00521 } 00522 // categorical 00523 if ( nF >= NUM ) 00524 { 00525 int index = nF-NUM; 00526 categoricalMissingCnt[index]++; 00527 } 00528 } 00529 00530 // second run, fill data tables with zeros 00531 if ( state==1 ) 00532 { 00533 //=================================================================================================== 00534 
//=================================================================================================== 00535 // Read Numeric value (feature count < NUM) 00536 // 00537 if ( nF < NUM ) 00538 { 00539 if ( numericNonZeroCnt[nF] >= minAttributeOccurenceNumerical && maxNormValues[nF] < stdValues[nF]*maxSTD ) 00540 { 00541 // numeric add 00542 if ( setMissingToMeans ) 00543 train[nTrainFill*nFeat + nFeatFill] = meanValues[nF]; 00544 else 00545 train[nTrainFill*nFeat + nFeatFill] = 0.0; 00546 nFeatFill++; 00547 00548 // numeric one hot add 00549 int size = numericalAttributes[nF].size(); 00550 if ( size < numericMaxCluster && size > 1 ) 00551 { 00552 // fill categorical 00553 for ( int j=0;j<size;j++ ) 00554 { 00555 train[nTrainFill*nFeat + nFeatFill] = 0.0; 00556 nFeatFill++; 00557 } 00558 // fill missing 00559 //train[nTrainFill*nFeat + nFeatFill] = 1.0; 00560 //nFeatFill++; 00561 } 00562 } 00563 00564 // missing values one-hot encoded 00565 if ( numericHasMissingBin[nF] ) 00566 { 00567 train[nTrainFill*nFeat + nFeatFill] = 1.0; // <- missing 00568 nFeatFill++; 00569 train[nTrainFill*nFeat + nFeatFill] = 0.0; // <- available 00570 nFeatFill++; 00571 } 00572 } 00573 //=================================================================================================== 00574 //=================================================================================================== 00575 // Read Categorical value (feature count >= NUM) 00576 // 00577 else 00578 { 00579 int index = nF - NUM; 00580 if ( index >= CAT ) 00581 assert ( false ); 00582 int size = categoricalAttributes[index].size(); 00583 int sizeCnt = categoricalAttributesCnt[index].size(); 00584 if ( size != sizeCnt ) 00585 assert ( false ); 00586 00587 // one-hot encoding 00588 int fillCnt = 0; 00589 int beforeHot = nrHot; 00590 for ( int j=0;j<size;j++ ) 00591 { 00592 if ( categoricalAttributesCnt[index][j] >= minAttributeOccurenceCategorical && categoricalAttributesCnt[index][j] < nTrain ) 00593 { 00594 
train[nTrainFill*nFeat + nFeatFill] = 0.0; // no here 00595 fillCnt++; 00596 nFeatFill++; 00597 } 00598 } 00599 if ( categoricalHasMissingBin[index] ) 00600 { 00601 if ( fillCnt == 0 && categoricalHasUnknownBin[index] == false ) 00602 { 00603 cout<<"categoricalMissingCnt["<<index<<"]:"<<categoricalMissingCnt[index]<<endl; 00604 assert ( false ); 00605 } 00606 // set the input to "missing value" 00607 train[nTrainFill*nFeat + nFeatFill] = 1.0; 00608 nrHot++; 00609 fillCnt++; 00610 nFeatFill++; 00611 } 00612 00613 if ( categoricalHasUnknownBin[index] ) 00614 { 00615 // no unknown value 00616 train[nTrainFill*nFeat + nFeatFill] = 0.0; 00617 fillCnt++; 00618 nFeatFill++; 00619 } 00620 00621 if ( nrHot != beforeHot + 1 && fillCnt > 0 ) 00622 { 00623 cout<<"WARNING: "<<size<<" "<<index<<" "<<nrHot<<" "<<beforeHot<<" fill:"<<fillCnt<<endl; 00624 //assert(false); 00625 } 00626 } 00627 } 00628 } 00629 00630 // check for last character 00631 if ( lineBuf[pos1]!=0 ) 00632 pos1++; 00633 00634 // beginpos = endpos 00635 pos0 = pos1; 00636 00637 // column count 00638 nF++; 00639 } 00640 00641 // valid checks 00642 if ( nF != NUM + CAT ) 00643 assert ( false ); 00644 if ( state==1 ) 00645 { 00646 if ( nFeatFill != nFeat ) 00647 { 00648 cout<<"nFeatFill:"<<nFeatFill<<" nFeat:"<<nFeat<<endl; 00649 assert ( false ); 00650 } 00651 nTrainFill++; 00652 } 00653 00654 nTrainTmp++; 00655 00656 sparse += nMissing / ( double ) nF; 00657 zeroRatio += nZeros / ( double ) nF; 00658 } 00659 00660 f.close(); 00661 00662 // ratio of sparseness and zeroPercent 00663 sparse /= ( double ) nTrainTmp; 00664 zeroRatio /= ( double ) nTrainTmp; 00665 cout<<"nTrainTmp:"<<nTrainTmp<<endl; 00666 cout<<"missing values:"<<100.0*sparse<<"%"<<endl; 00667 cout<<"zero values:"<<100.0*zeroRatio<<"%"<<endl; 00668 00669 double min0 = 1e20, max0 = -1e20; 00670 for ( int i=0;i<100000;i++ ) 00671 { 00672 if ( min0 > minValues[i] ) 00673 min0 = minValues[i]; 00674 if ( max0 < maxValues[i] ) 00675 max0 = maxValues[i]; 
00676 } 00677 cout<<"min|max values: "<<min0<<"|"<<max0<<endl; 00678 00679 int sum = 0; 00680 for ( int j=0;j<CAT;j++ ) 00681 sum += categoricalAttributes[j].size(); 00682 cout<<"nCategoricalSum:"<<sum<<endl; 00683 00684 if ( state == 0 ) 00685 nTrain += nTrainTmp; 00686 00687 } 00688 00689 // do some checks 00690 if ( state == 1 ) 00691 { 00692 if ( nTrain != nTrainFill ) 00693 assert ( false ); 00694 00695 for ( int i=0;i<nTrain*nFeat;i++ ) 00696 if ( train[i] == 1e10 ) 00697 { 00698 cout<<"i:"<<i<<endl; 00699 assert ( false ); 00700 } 00701 } 00702 00703 if ( state==0 ) 00704 { 00705 for ( int i=0;i<NUM;i++ ) 00706 numericNonZeroPercent[i] = ( double ) numericNonZeroCnt[i]/ ( double ) nTrain; 00707 for ( int i=0;i<100000;i++ ) 00708 if ( meanCnt[i] > 0 ) 00709 { 00710 meanValues[i] /= ( double ) meanCnt[i]; 00711 stdValues[i] = sqrt ( mean2Values[i]/ ( double ) meanCnt[i] - meanValues[i]/ ( double ) meanCnt[i] ); 00712 maxNormValues[i] = fabs ( maxValues[i] - meanValues[i] ); 00713 if ( maxNormValues[i] < fabs ( minValues[i] - meanValues[i] ) ) 00714 maxNormValues[i] = fabs ( minValues[i] - meanValues[i] ); 00715 } 00716 00717 cout<<"nTrain:"<<nTrain<<endl; 00718 00719 // === Calculate effective number of input features === 00720 nFeat = 1; // const 00721 int nFeatNum = 0, nFeatNumRaw = 0, nFeatNumCat = 0, nFeatCat = 0, nUnknown = 0, nMissing = 0, nIn = 0, nNumMiss = 0; 00722 // numerical 00723 for ( int j=0;j<NUM;j++ ) 00724 { 00725 if ( numericNonZeroCnt[j] >= minAttributeOccurenceNumerical && maxNormValues[j] < stdValues[j]*maxSTD ) 00726 { 00727 // standard numerical input 00728 nFeat++; 00729 nFeatNum++; 00730 nFeatNumRaw++; 00731 00732 // numerical input with limited number of different values -> translate it to categorical input 00733 if ( numericalAttributes[j].size() < numericMaxCluster && numericalAttributes[j].size() > 1 ) 00734 { 00735 cout<<"nFeatNum:"<<nFeatNum<<" "; 00736 for ( int k=0;k<numericalAttributes[j].size();k++ ) 00737 { 00738 
cout<<numericalAttributes[j][k]<<"("<<numericalAttributesCnt[j][k]<<") "; 00739 nFeat++; 00740 nFeatNum++; 00741 nFeatNumCat++; 00742 } 00743 cout<<endl; 00744 /* 00745 // add one bin for "missing or unknown value" 00746 nFeat++; 00747 nFeatNum++; 00748 nFeatNumCat++;*/ 00749 } 00750 if ( numericMissingCnt[j] >= numericMinMissing ) 00751 { 00752 numericHasMissingBin[j] = true; 00753 nFeat+=2; 00754 nNumMiss+=2; 00755 } 00756 } 00757 } 00758 // categorical 00759 for ( int j=0;j<CAT;j++ ) 00760 { 00761 int nUsed = 0, nUn = 0, nCat = 0, nMiss = 0, nUnk = 0; 00762 for ( int k=0;k<categoricalAttributesCnt[j].size();k++ ) 00763 { 00764 // count valid entries (with enough occurence) 00765 if ( categoricalAttributesCnt[j][k] >= minAttributeOccurenceCategorical && categoricalAttributesCnt[j][k] < nTrain ) 00766 { 00767 nFeat++; 00768 nFeatCat++; 00769 nUsed++; 00770 nIn++; 00771 nCat++; 00772 } 00773 else if ( categoricalAttributesCnt[j][k] < nTrain ) // not enough occurence -> put to unknown 00774 nUn++; 00775 } 00776 // missing is like a normal categoric input 00777 if ( ( categoricalMissingCnt[j] >= minAttributeOccurenceCategorical && categoricalMissingCnt[j] < nTrain ) || categoricalMissingCnt[j] > 0 && nCat > 0 ) 00778 { 00779 // add a "missing value" input of this feature 00780 nFeat++; 00781 nFeatCat++; 00782 nMissing++; 00783 nMiss++; 00784 categoricalHasMissingBin[j] = true; 00785 } 00786 if ( nUn > 0 && nCat + nMiss > 0 ) 00787 { 00788 // add a "unknown value" input of this feature 00789 nFeat++; 00790 nFeatCat++; 00791 nUnknown++; 00792 nUnk++; 00793 categoricalHasUnknownBin[j] = true; 00794 } 00795 00796 if ( nCat + nMiss + nUnk == 1 ) 00797 assert ( false ); 00798 } 00799 00800 cout<<"nFeat:"<<nFeat<<" (numInputs:"<<nFeatNum<<" [rawNum:"<<nFeatNumRaw<<" nFeatNumCat:"<<nFeatNumCat<<"] catInputs:"<<nFeatCat<<" [nUnknown:"<<nUnknown<<" nMissing:"<<nMissing<<" nCat:"<<nIn<<"] numMissingHot:"<<nNumMiss<<" [+1const.])"<<endl; 00801 00802 cout<<"Allocate train 
features: "<< ( double ) nTrain*nFeat/1e6*4.0<<" MB"<<endl; 00803 train = new REAL[nTrain*nFeat]; 00804 for ( int i=0;i<nTrain*nFeat;i++ ) 00805 train[i] = 1e10; 00806 00807 //support = new REAL[nFeat]; 00808 //supportCnt = new int[nFeat]; 00809 //for(int i=0;i<nFeat;i++) 00810 //{ 00811 //support[i] = 0.0; 00812 //supportCnt[i] = 0; 00813 //} 00814 00815 // read targets 00816 nClass = 2; 00817 trainTarget = new REAL[nTrain*nClass*nDomain]; 00818 trainLabel = new int[nTrain*nDomain]; 00819 for ( int d=0;d<nDomain;d++ ) 00820 { 00821 sprintf ( buf0,"%s/%s",path.c_str(),targetFiles[d] ); 00822 fstream f; 00823 cout<<"Open targets:"<<buf0<<endl; 00824 f.open ( buf0,ios::in ); 00825 if ( f.is_open() == false ) 00826 assert ( false ); 00827 int label; 00828 for ( int i=0;i<nTrain;i++ ) 00829 { 00830 f>>label; 00831 if ( label==-1 ) 00832 { 00833 trainTarget[i*nClass*nDomain + d*nClass + 0] = positiveTarget; 00834 trainTarget[i*nClass*nDomain + d*nClass + 1] = negativeTarget; 00835 trainLabel[i*nDomain + d] = 0; 00836 } 00837 else if ( label==1 ) 00838 { 00839 trainTarget[i*nClass*nDomain + d*nClass + 0] = negativeTarget; 00840 trainTarget[i*nClass*nDomain + d*nClass + 1] = positiveTarget; 00841 trainLabel[i*nDomain + d] = 1; 00842 } 00843 else 00844 assert ( false ); 00845 } 00846 f.close(); 00847 } 00848 // test set 00849 nTest = 0; 00850 test = 0; 00851 testTarget = 0; 00852 testLabel = 0; 00853 00854 } 00855 } 00856 00857 for ( int i=0;i<nTrain;i++ ) 00858 for ( int j=0;j<nFeat;j++ ) 00859 if ( train[i*nFeat+j] == 1e10 ) 00860 { 00861 cout<<"i:"<<i<<" j:"<<j<<" "<<train[i*nFeat+j]<<endl; 00862 assert ( false ); 00863 } 00864 00865 00866 fstream f; 00867 /*f.open("AAA.txt",ios::out); 00868 f<<"========= numerical ========="<<endl<<endl; 00869 for(int i=0;i<NUM;i++) 00870 if(numericNonZeroCnt[i] >= minAttributeOccurenceNumerical && maxNormValues[i] < stdValues[i]*5.0) 00871 
f<<i<<":"<<numericNonZeroCnt[i]<<"["<<numericNonZeroCnt[i]<<"]["<<minValues[i]<<"|"<<maxValues[i]<<"]"<<endl; 00872 f<<endl<<endl; 00873 f<<"========= categorical ========="<<endl<<endl; 00874 for(int i=0;i<CAT;i++) 00875 { 00876 int size = categoricalAttributes[i].size(); 00877 00878 int chkCnt = 0; 00879 for(int j=0;j<size;j++) 00880 if(categoricalAttributesCnt[i][j] >= minAttributeOccurenceCategorical && categoricalAttributesCnt[i][j] < NLINES) 00881 chkCnt++; 00882 00883 if(chkCnt > 0) 00884 f<<endl<<"Attrib."<<i<<"(#"<<chkCnt<<"):"; 00885 00886 // go over all possible values 00887 for(int j=0;j<size;j++) 00888 { 00889 // find the max support 00890 int ind = -1; 00891 int max = -1; 00892 for(int k=0;k<size;k++) 00893 { 00894 if(categoricalAttributesCnt[i][k] > max) 00895 { 00896 max = categoricalAttributesCnt[i][k]; 00897 ind = k; 00898 } 00899 } 00900 if(ind==-1) 00901 assert(false); 00902 00903 if(categoricalAttributesCnt[i][ind] >= minAttributeOccurenceCategorical && categoricalAttributesCnt[i][ind] < NLINES) 00904 f<<j<<":"<<categoricalAttributes[i][ind]<<"["<<categoricalAttributesCnt[i][ind]<<"]("<<ind<<") "; 00905 00906 // mark as viewed 00907 categoricalAttributesCnt[i][ind] = -1; 00908 } 00909 00910 if(categoricalHasMissingBin[i]) 00911 f<<"+1xMissing"<<"[]("<<-1<<") "; 00912 00913 if(chkCnt > 0) 00914 f<<endl; 00915 00916 } 00917 f.close(); 00918 */ 00919 if ( lineBuf ) 00920 { 00921 delete[] lineBuf; 00922 lineBuf = 0; 00923 } 00924 if ( numericNonZeroCnt ) 00925 { 00926 delete[] numericNonZeroCnt; 00927 numericNonZeroCnt = 0; 00928 } 00929 if ( numericNonZeroPercent ) 00930 { 00931 delete[] numericNonZeroPercent; 00932 numericNonZeroPercent = 0; 00933 } 00934 if ( categoricalAttributes ) 00935 { 00936 delete[] categoricalAttributes; 00937 categoricalAttributes = 0; 00938 } 00939 if ( meanValues ) 00940 { 00941 delete[] meanValues; 00942 meanValues = 0; 00943 } 00944 if ( meanCnt ) 00945 { 00946 delete[] meanCnt; 00947 meanCnt = 0; 00948 } 00949 if ( 
categoricalHasMissingBin ) 00950 { 00951 delete[] categoricalHasMissingBin; 00952 categoricalHasMissingBin = 0; 00953 } 00954 00955 // tmp print out of: train data 00956 f.open ( "A.txt",ios::out ); 00957 double* mu = new double[nFeat]; 00958 for ( int i=0;i<nFeat;i++ ) 00959 mu[i] = 0.0; 00960 for ( int i=0;i<nTrain;i++ ) 00961 for ( int j=0;j<nFeat;j++ ) 00962 mu[j] += train[i*nFeat + j]; 00963 for ( int i=0;i<nFeat;i++ ) 00964 mu[i] /= ( double ) nTrain; 00965 for ( int i=0;i<nFeat;i++ ) 00966 f<<mu[i]<<endl; 00967 f.close(); 00968 00969 //f.open("A.dat",ios::out); 00970 //f.write((char*)train,sizeof(REAL)*nTrain*nFeat); 00971 //f.close(); 00972 00973 /*f.open("A.txt",ios::out); 00974 for(int i=0;i<nTrain;i++) 00975 { 00976 for(int j=0;j<nFeat;j++) 00977 f<<train[i*nFeat+j]<<" "; 00978 f<<endl; 00979 } 00980 f.close(); 00981 00982 f.open("B.txt",ios::out); 00983 for(int i=0;i<nTrain;i++) 00984 { 00985 for(int j=0;j<nDomain;j++) 00986 f<<trainLabel[i*nDomain+j]<<" "; 00987 f<<endl; 00988 } 00989 f.close(); 00990 00991 f.open("C.txt",ios::out); 00992 for(int i=0;i<nTrain;i++) 00993 { 00994 for(int j=0;j<nDomain*nClass;j++) 00995 f<<trainTarget[i*nDomain*nClass+j]<<" "; 00996 f<<endl; 00997 } 00998 f.close(); 00999 */ 01000 if ( Framework::getFrameworkMode() == 1 ) 01001 { 01002 cout<<endl<<"Set test data (and set train data to 0)"<<endl<<endl; 01003 test = train; 01004 train = 0; 01005 nTest = nTrain; 01006 nTrain = 0; 01007 testTarget = trainTarget; 01008 trainTarget = 0; 01009 testLabel = trainLabel; 01010 trainLabel = 0; 01011 } 01012 cout<<endl<<"Finished read in "<<time ( 0 )-t0<<"[s]"<<endl<<endl; 01013 }
void DatasetReader::readKDDCup09LargeBin | ( | string | path, | |
REAL *& | train, | |||
REAL *& | trainTarget, | |||
int *& | trainLabel, | |||
REAL *& | test, | |||
REAL *& | testTarget, | |||
int *& | testLabel, | |||
uint & | nTrain, | |||
uint & | nTest, | |||
int & | nClass, | |||
int & | nDomain, | |||
int & | nFeat, | |||
REAL | positiveTarget = 1.0 , |
|||
REAL | negativeTarget = -1.0 | |||
) |
Read the large dataset from the KDDCup2009 (internal binary type)
Definition at line 24 of file DatasetReader.cpp.
00025 { 00026 nDomain = 3; 00027 char* targetFiles[] = //"orange_large_train_toy.labels", 00028 { 00029 "orange_large_train_churn.labels", 00030 "orange_large_train_appetency.labels", 00031 "orange_large_train_upselling.labels" 00032 }; 00033 00034 nTrain = 50000; 00035 nFeat = 113; 00036 00037 char buf[512]; 00038 if ( Framework::getFrameworkMode() == 1 ) 00039 { 00040 //sprintf(buf,"featureSelection_churn_test_%d_features.dat",nFeat); 00041 //sprintf(buf,"featureSelection_appetency_test_%d_features.dat",nFeat); 00042 //sprintf(buf,"featureSelection_upselling_test_%d_features.dat",nFeat); 00043 sprintf ( buf,"featureSelection_all_test_%d_features.dat",nFeat ); 00044 } 00045 else 00046 { 00047 //sprintf(buf,"featureSelection_churn_train_%d_features.dat",nFeat); 00048 //sprintf(buf,"featureSelection_appetency_train_%d_features.dat",nFeat); 00049 //sprintf(buf,"featureSelection_upselling_train_%d_features.dat",nFeat); 00050 sprintf ( buf,"featureSelection_all_train_%d_features.dat",nFeat ); 00051 } 00052 cout<<"Open:"<<buf<<endl; 00053 train = new REAL[nTrain * nFeat]; 00054 fstream f ( buf,ios::in ); 00055 if ( f.is_open() == false ) 00056 assert ( false ); 00057 f.read ( ( char* ) train, sizeof ( REAL ) *nTrain*nFeat ); 00058 f.close(); 00059 00060 // read targets 00061 nClass = 2; 00062 trainTarget = new REAL[nTrain*nClass*nDomain]; 00063 trainLabel = new int[nTrain*nDomain]; 00064 char buf0[512]; 00065 for ( int d=0;d<nDomain;d++ ) 00066 { 00067 sprintf ( buf0,"%s/%s",path.c_str(),targetFiles[d] ); 00068 fstream f; 00069 cout<<"Open targets:"<<buf0<<endl; 00070 f.open ( buf0,ios::in ); 00071 if ( f.is_open() == false ) 00072 assert ( false ); 00073 int label; 00074 for ( int i=0;i<nTrain;i++ ) 00075 { 00076 f>>label; 00077 if ( label==-1 ) 00078 { 00079 trainTarget[i*nClass*nDomain + d*nClass + 0] = positiveTarget; 00080 trainTarget[i*nClass*nDomain + d*nClass + 1] = negativeTarget; 00081 trainLabel[i*nDomain + d] = 0; 00082 } 00083 else if ( label==1 ) 00084 { 
00085 trainTarget[i*nClass*nDomain + d*nClass + 0] = negativeTarget; 00086 trainTarget[i*nClass*nDomain + d*nClass + 1] = positiveTarget; 00087 trainLabel[i*nDomain + d] = 1; 00088 } 00089 else 00090 assert ( false ); 00091 } 00092 f.close(); 00093 } 00094 00095 // test set 00096 nTest = 0; 00097 test = 0; 00098 testTarget = 0; 00099 testLabel = 0; 00100 00101 if ( Framework::getFrameworkMode() == 1 ) 00102 { 00103 cout<<endl<<"Set test data (and set train data to 0)"<<endl<<endl; 00104 test = train; 00105 train = 0; 00106 nTest = nTrain; 00107 nTrain = 0; 00108 testTarget = trainTarget; 00109 trainTarget = 0; 00110 testLabel = trainLabel; 00111 trainLabel = 0; 00112 } 00113 00114 }
void DatasetReader::readKDDCup09Small | ( | string | path, | |
REAL *& | train, | |||
REAL *& | trainTarget, | |||
int *& | trainLabel, | |||
REAL *& | test, | |||
REAL *& | testTarget, | |||
int *& | testLabel, | |||
uint & | nTrain, | |||
uint & | nTest, | |||
int & | nClass, | |||
int & | nDomain, | |||
int & | nFeat, | |||
REAL | positiveTarget = 1.0 , |
|||
REAL | negativeTarget = -1.0 | |||
) |
Reads the KDDCup09Small dataset
Definition at line 1020 of file DatasetReader.cpp.
01021 { 01022 time_t t0 = time ( 0 ); 01023 01024 nDomain = 3; 01025 01026 cout<<"Read KDDCup09 from: "<<path<<endl; 01027 01028 char* targetFiles[] = {"orange_small_train_churn.labels" 01029 ,"orange_small_train_appetency.labels" 01030 ,"orange_small_train_upselling.labels" 01031 }; 01032 01033 int nPreAlloc = 100000000; 01034 char *buf0 = new char[512], *buf1 = new char[512]; 01035 char* lineBuf = new char[nPreAlloc]; 01036 01037 int NUM = 190, CAT = 40, NLINES = 50000; 01038 int nFiles = 1; 01039 bool setNumZerosToMeans = false; 01040 bool setMissingToMeans = false; 01041 int numericMinMissing = 1; 01042 int numericMaxCluster = 0; // add categoric (one-hot) from numeric input, max. occurence cnt 01043 int minAttributeOccurenceCategorical = 200*nFiles; // 20 01044 int minAttributeOccurenceNumerical = 500*nFiles; // 50 01045 REAL maxSTD = 1e10; // 10 01046 cout<<"nFiles:"<<nFiles<<" minAttrOccurCat:"<<minAttributeOccurenceCategorical<<" minAttrOccurNum:"<<minAttributeOccurenceNumerical<<endl; 01047 cout<<setNumZerosToMeans<<" "<<setMissingToMeans<<" "<<numericMaxCluster<<" "<<minAttributeOccurenceCategorical<<" "<<minAttributeOccurenceNumerical<<" "<<maxSTD<<endl; 01048 01049 vector<string>* numericalAttributes = new vector<string>[NUM]; 01050 vector<int>* numericalAttributesCnt = new vector<int>[NUM]; 01051 vector<string>* categoricalAttributes = new vector<string>[CAT]; 01052 vector<int>* categoricalAttributesCnt = new vector<int>[CAT]; 01053 bool* categoricalHasMissingBin = new bool[CAT]; 01054 int* categoricalMissingCnt = new int[CAT]; 01055 bool* categoricalHasUnknownBin = new bool[CAT]; 01056 for ( int i=0;i<CAT;i++ ) 01057 { 01058 categoricalHasMissingBin[i] = false; 01059 categoricalHasUnknownBin[i] = false; 01060 categoricalMissingCnt[i] = 0; 01061 } 01062 int* numericNonZeroCnt = new int[NUM]; 01063 int* numericMissingCnt = new int[NUM]; 01064 bool* numericHasMissingBin = new bool[NUM]; 01065 double* numericNonZeroPercent = new double[NUM]; 01066 for ( 
int i=0;i<NUM;i++ ) 01067 { 01068 numericMissingCnt[i] = 0; 01069 numericNonZeroCnt[i] = 0; 01070 numericNonZeroPercent[i] = 0.0; 01071 numericHasMissingBin[i] = false; 01072 } 01073 01074 double* minValues = new double[100000]; 01075 double* maxValues = new double[100000]; 01076 double* maxNormValues = new double[100000]; 01077 double* meanValues = new double[100000]; 01078 double* stdValues = new double[100000]; 01079 double* mean2Values = new double[100000]; 01080 int* meanCnt = new int[100000]; 01081 for ( int i=0;i<100000;i++ ) 01082 { 01083 minValues[i] = 1e20; 01084 maxValues[i] = -1e20; 01085 maxNormValues[i] = 0.0; 01086 meanValues[i] = 0.0; 01087 mean2Values[i] = 0.0; 01088 meanCnt[i] = 0; 01089 stdValues[i] = 0.0; 01090 } 01091 01092 //=========================================================================================================================== 01093 //=========================================================================================================================== 01094 // Loop over 2 states: 01095 // - State=0 read train values (+build index tables) 01096 // - State=1 store to features (train or test) 01097 // 01098 for ( int state=0;state<2;state++ ) 01099 { 01100 int nTrainFill = 0; 01101 if ( state == 0 ) 01102 { 01103 nTrain = 0; 01104 } 01105 01106 //======================================================================================================================= 01107 //======================================================================================================================= 01108 // Loop over n files (file chunks) 01109 // 01110 for ( int file=0;file<nFiles;file++ ) 01111 { 01112 // open train or test set 01113 if ( state == 0 ) 01114 sprintf ( buf0,"%s/orange_small_train.data",path.c_str() ); 01115 else 01116 { 01117 if ( Framework::getFrameworkMode() == 1 ) 01118 sprintf ( buf0,"%s/orange_small_test.data",path.c_str() ); 01119 else 01120 sprintf ( buf0,"%s/orange_small_train.data",path.c_str() ); 01121 } 
01122 01123 cout<<"Open:"<<buf0<<endl; 01124 fstream f; 01125 f.open ( buf0, ios::in ); 01126 if ( f.is_open() == false ) 01127 assert ( false ); 01128 01129 // read the first line in the first file (dummy) 01130 if ( file==0 ) 01131 f.getline ( lineBuf, nPreAlloc ); 01132 01133 // tmp and count vars 01134 double zeroRatio = 0.0; 01135 double sparse = 0.0; 01136 int nTrainTmp = 0; 01137 01138 //=================================================================================================================== 01139 //=================================================================================================================== 01140 // Read all lines of chunk file n 01141 // 01142 while ( f.getline ( lineBuf, nPreAlloc ) ) 01143 { 01144 if ( nTrainTmp%1000 == 0 ) 01145 cout<<"."<<flush; 01146 01147 // tmp and count vars 01148 int pos0 = 0, pos1 = 0; 01149 int nF = 0, nMissing = 0, nZeros = 0; 01150 int nFeatFill = 0; 01151 int nrHot = 0; 01152 double value; 01153 01154 if ( state == 1 ) 01155 { 01156 // add constant one 01157 train[nTrainFill*nFeat + nFeatFill] = 1.0; 01158 nFeatFill++; 01159 } 01160 01161 //=============================================================================================================== 01162 //=============================================================================================================== 01163 // Go through all characters of this line 01164 // 01165 while ( lineBuf[pos1] ) 01166 { 01167 // search for next tabulator 01168 while ( lineBuf[pos1] != '\t' && lineBuf[pos1] != 0 ) 01169 pos1++; 01170 01171 //=========================================================================================================== 01172 //=========================================================================================================== 01173 // If the feature has some content 01174 // This means no consecutive tabs 01175 // 01176 if ( pos1 > pos0 && lineBuf[pos1]!=0 ) 01177 { 01178 // copy to tmp buffer 01179 if ( pos1-pos0 <=0 
|| pos1-pos0 >= 512 ) 01180 assert ( false ); 01181 for ( int j=0;j<pos1-pos0;j++ ) 01182 buf1[j] = lineBuf[pos0+j]; 01183 buf1[pos1-pos0] = 0; 01184 01185 01186 //======================================================================================================= 01187 //======================================================================================================= 01188 // Read Numeric value (feature count < NUM) 01189 // 01190 if ( nF < NUM ) 01191 { 01192 if ( ( buf1[0]>='0' && buf1[0] <='9' ) || buf1[0]=='-' ) 01193 ; 01194 else 01195 { 01196 cout<<"BUF:"<<buf1<<endl; 01197 assert ( false ); 01198 } 01199 01200 //sscanf(buf1, "%f", &value); 01201 value = atof ( buf1 ); 01202 01203 if ( value == 0.0 ) 01204 nZeros++; 01205 01206 // first run through train data 01207 if ( state==0 ) 01208 { 01209 if ( minValues[nF] > value ) 01210 minValues[nF] = value; 01211 if ( maxValues[nF] < value ) 01212 maxValues[nF] = value; 01213 01214 // histogram over numeric values 01215 int size = numericalAttributes[nF].size(); 01216 if ( size < numericMaxCluster ) 01217 { 01218 int foundIndex = -1; 01219 for ( int j=0;j<size;j++ ) 01220 if ( numericalAttributes[nF][j] == buf1 ) 01221 { 01222 foundIndex = j; 01223 break; 01224 } 01225 // add value 01226 if ( foundIndex == -1 ) 01227 { 01228 numericalAttributes[nF].push_back ( buf1 ); 01229 numericalAttributesCnt[nF].push_back ( 1 ); 01230 } 01231 else 01232 numericalAttributesCnt[nF][foundIndex]++; 01233 } 01234 01235 if ( value != 0.0 ) 01236 { 01237 numericNonZeroCnt[nF]++; 01238 if ( numericNonZeroCnt[nF] > nTrain+nTrainTmp+1 ) 01239 { 01240 cout<<"numericNonZeroCnt[nF]:"<<numericNonZeroCnt[nF]<<" nF:"<<nF<<" nTrainTmp:"<<nTrainTmp<<" nZeros:"<<nZeros<<" pos0:"<<pos0<<" pos1:"<<pos1<<endl; 01241 assert ( false ); 01242 } 01243 } 01244 01245 if ( value != 0.0 ) 01246 { 01247 // calc mean over numeric input 01248 meanValues[nF] += value; 01249 mean2Values[nF] += value * value; 01250 meanCnt[nF]++; 01251 } 01252 } 01253 
else if ( state==1 ) // second run, fill data tables 01254 { 01255 if ( numericNonZeroCnt[nF] >= minAttributeOccurenceNumerical && maxNormValues[nF] < stdValues[nF]*maxSTD ) 01256 { 01257 // numeric add 01258 if ( value == 0.0 && setNumZerosToMeans ) 01259 train[nTrainFill*nFeat + nFeatFill] = meanValues[nF]; 01260 else 01261 train[nTrainFill*nFeat + nFeatFill] = value; 01262 nFeatFill++; 01263 01264 // numeric one hot add 01265 int size = numericalAttributes[nF].size(); 01266 if ( size < numericMaxCluster && size > 1 ) 01267 { 01268 int foundIndex = -1; 01269 for ( int j=0;j<size;j++ ) 01270 if ( numericalAttributes[nF][j] == buf1 ) 01271 { 01272 foundIndex = j; 01273 break; 01274 } 01275 // fill categorical 01276 int beforeHot = nrHot; 01277 for ( int j=0;j<size;j++ ) 01278 { 01279 if ( foundIndex == j ) 01280 { 01281 train[nTrainFill*nFeat + nFeatFill] = 1.0; 01282 nrHot++; 01283 } 01284 else 01285 train[nTrainFill*nFeat + nFeatFill] = 0.0; 01286 nFeatFill++; 01287 } 01288 // fill missing 01289 /*if(nrHot == beforeHot) 01290 train[nTrainFill*nFeat + nFeatFill] = 0.0; 01291 else 01292 train[nTrainFill*nFeat + nFeatFill] = 1.0; 01293 nFeatFill++;*/ 01294 } 01295 01296 } 01297 01298 // missing values one-hot encoded 01299 if ( numericHasMissingBin[nF] ) 01300 { 01301 train[nTrainFill*nFeat + nFeatFill] = 0.0; // <- missing 01302 nFeatFill++; 01303 train[nTrainFill*nFeat + nFeatFill] = 1.0; // <- available 01304 nFeatFill++; 01305 } 01306 } 01307 } 01308 //======================================================================================================= 01309 //======================================================================================================= 01310 // Read Categorical value (feature count >= NUM) 01311 // 01312 else 01313 { 01314 int index = nF-NUM; 01315 if ( index >= CAT ) 01316 assert ( false ); 01317 int size = categoricalAttributes[index].size(); 01318 int sizeCnt = categoricalAttributesCnt[index].size(); 01319 if ( size != sizeCnt ) 
01320 assert ( false ); 01321 01322 int foundIndex = -1; 01323 for ( int j=0;j<size;j++ ) 01324 if ( categoricalAttributes[index][j] == buf1 ) 01325 { 01326 foundIndex = j; 01327 break; 01328 } 01329 01330 // first run through train data 01331 if ( state==0 ) 01332 { 01333 // add value 01334 if ( foundIndex == -1 ) 01335 { 01336 categoricalAttributes[index].push_back ( buf1 ); 01337 categoricalAttributesCnt[index].push_back ( 1 ); 01338 } 01339 else // already exists 01340 categoricalAttributesCnt[index][foundIndex]++; 01341 } 01342 else if ( state==1 ) // second run, fill data tables 01343 { 01344 // one-hot encoding 01345 int fillCnt = 0; 01346 int beforeHot = nrHot; 01347 for ( int j=0;j<size;j++ ) 01348 { 01349 if ( categoricalAttributesCnt[index][j] >= minAttributeOccurenceCategorical && categoricalAttributesCnt[index][j] < nTrain ) 01350 { 01351 if ( foundIndex == j ) 01352 { 01353 train[nTrainFill*nFeat + nFeatFill] = 1.0; 01354 nrHot++; 01355 } 01356 else 01357 train[nTrainFill*nFeat + nFeatFill] = 0.0; 01358 fillCnt++; 01359 nFeatFill++; 01360 } 01361 } 01362 01363 // no missing (no consecutive tabs here) 01364 if ( categoricalHasMissingBin[index] ) 01365 { 01366 train[nTrainFill*nFeat + nFeatFill] = 0.0; 01367 fillCnt++; 01368 nFeatFill++; 01369 } 01370 01371 // if found, but not in cache 01372 if ( categoricalHasUnknownBin[index] ) 01373 { 01374 if ( beforeHot == nrHot ) 01375 { 01376 train[nTrainFill*nFeat + nFeatFill] = 1.0; 01377 nrHot++; 01378 } 01379 else 01380 train[nTrainFill*nFeat + nFeatFill] = 0.0; 01381 fillCnt++; 01382 nFeatFill++; 01383 } 01384 01385 if ( nrHot != beforeHot + 1 && fillCnt > 0 ) 01386 { 01387 cout<<"WARNING: foundIndex:"<<foundIndex<<" "<<size<<" "<<index<<" "<<nrHot<<" "<<beforeHot<<" fill:"<<fillCnt<<endl; 01388 //assert(false); 01389 } 01390 } 01391 } 01392 } 01393 //=========================================================================================================== 01394 
//=========================================================================================================== 01395 // If the feature has no content 01396 // Missing value here 01397 // 01398 else 01399 { 01400 nMissing++; 01401 01402 if ( state==0 ) 01403 { 01404 // numeric 01405 if ( nF < NUM ) 01406 { 01407 numericMissingCnt[nF]++; 01408 } 01409 // categorical 01410 if ( nF >= NUM ) 01411 { 01412 int index = nF-NUM; 01413 categoricalMissingCnt[index]++; 01414 } 01415 } 01416 01417 // second run, fill data tables with zeros 01418 if ( state==1 ) 01419 { 01420 //=================================================================================================== 01421 //=================================================================================================== 01422 // Read Numeric value (feature count < NUM) 01423 // 01424 if ( nF < NUM ) 01425 { 01426 if ( numericNonZeroCnt[nF] >= minAttributeOccurenceNumerical && maxNormValues[nF] < stdValues[nF]*maxSTD ) 01427 { 01428 // numeric add 01429 if ( setMissingToMeans ) 01430 train[nTrainFill*nFeat + nFeatFill] = meanValues[nF]; 01431 else 01432 train[nTrainFill*nFeat + nFeatFill] = 0.0; 01433 nFeatFill++; 01434 01435 // numeric one hot add 01436 int size = numericalAttributes[nF].size(); 01437 if ( size < numericMaxCluster && size > 1 ) 01438 { 01439 // fill categorical 01440 for ( int j=0;j<size;j++ ) 01441 { 01442 train[nTrainFill*nFeat + nFeatFill] = 0.0; 01443 nFeatFill++; 01444 } 01445 // fill missing 01446 //train[nTrainFill*nFeat + nFeatFill] = 1.0; 01447 //nFeatFill++; 01448 } 01449 } 01450 01451 // missing values one-hot encoded 01452 if ( numericHasMissingBin[nF] ) 01453 { 01454 train[nTrainFill*nFeat + nFeatFill] = 1.0; // <- missing 01455 nFeatFill++; 01456 train[nTrainFill*nFeat + nFeatFill] = 0.0; // <- available 01457 nFeatFill++; 01458 } 01459 } 01460 //=================================================================================================== 01461 
//=================================================================================================== 01462 // Read Categorical value (feature count >= NUM) 01463 // 01464 else 01465 { 01466 int index = nF - NUM; 01467 if ( index >= CAT ) 01468 assert ( false ); 01469 int size = categoricalAttributes[index].size(); 01470 int sizeCnt = categoricalAttributesCnt[index].size(); 01471 if ( size != sizeCnt ) 01472 assert ( false ); 01473 01474 // one-hot encoding 01475 int fillCnt = 0; 01476 int beforeHot = nrHot; 01477 for ( int j=0;j<size;j++ ) 01478 { 01479 if ( categoricalAttributesCnt[index][j] >= minAttributeOccurenceCategorical && categoricalAttributesCnt[index][j] < nTrain ) 01480 { 01481 train[nTrainFill*nFeat + nFeatFill] = 0.0; // no here 01482 fillCnt++; 01483 nFeatFill++; 01484 } 01485 } 01486 if ( categoricalHasMissingBin[index] ) 01487 { 01488 if ( fillCnt == 0 && categoricalHasUnknownBin[index] == false ) 01489 { 01490 cout<<"categoricalMissingCnt["<<index<<"]:"<<categoricalMissingCnt[index]<<endl; 01491 assert ( false ); 01492 } 01493 // set the input to "missing value" 01494 train[nTrainFill*nFeat + nFeatFill] = 1.0; 01495 nrHot++; 01496 fillCnt++; 01497 nFeatFill++; 01498 } 01499 01500 if ( categoricalHasUnknownBin[index] ) 01501 { 01502 // no unknown value 01503 train[nTrainFill*nFeat + nFeatFill] = 0.0; 01504 fillCnt++; 01505 nFeatFill++; 01506 } 01507 01508 if ( nrHot != beforeHot + 1 && fillCnt > 0 ) 01509 { 01510 cout<<"WARNING: "<<size<<" "<<index<<" "<<nrHot<<" "<<beforeHot<<" fill:"<<fillCnt<<endl; 01511 //assert(false); 01512 } 01513 } 01514 } 01515 } 01516 01517 // check for last character 01518 if ( lineBuf[pos1]!=0 ) 01519 pos1++; 01520 01521 // beginpos = endpos 01522 pos0 = pos1; 01523 01524 // column count 01525 nF++; 01526 } 01527 01528 // valid checks 01529 if ( nF != NUM + CAT ) 01530 assert ( false ); 01531 if ( state==1 ) 01532 { 01533 if ( nFeatFill != nFeat ) 01534 { 01535 cout<<"nFeatFill:"<<nFeatFill<<" nFeat:"<<nFeat<<endl; 
01536 assert ( false ); 01537 } 01538 nTrainFill++; 01539 } 01540 01541 nTrainTmp++; 01542 01543 sparse += nMissing / ( double ) nF; 01544 zeroRatio += nZeros / ( double ) nF; 01545 } 01546 01547 f.close(); 01548 01549 // ratio of sparseness and zeroPercent 01550 sparse /= ( double ) nTrainTmp; 01551 zeroRatio /= ( double ) nTrainTmp; 01552 cout<<"nTrainTmp:"<<nTrainTmp<<endl; 01553 cout<<"missing values:"<<100.0*sparse<<"%"<<endl; 01554 cout<<"zero values:"<<100.0*zeroRatio<<"%"<<endl; 01555 01556 double min0 = 1e20, max0 = -1e20; 01557 for ( int i=0;i<100000;i++ ) 01558 { 01559 if ( min0 > minValues[i] ) 01560 min0 = minValues[i]; 01561 if ( max0 < maxValues[i] ) 01562 max0 = maxValues[i]; 01563 } 01564 cout<<"min|max values: "<<min0<<"|"<<max0<<endl; 01565 01566 int sum = 0; 01567 for ( int j=0;j<CAT;j++ ) 01568 sum += categoricalAttributes[j].size(); 01569 cout<<"nCategoricalSum:"<<sum<<endl; 01570 01571 if ( state == 0 ) 01572 nTrain += nTrainTmp; 01573 01574 } 01575 01576 // do some checks 01577 if ( state == 1 ) 01578 { 01579 if ( nTrain != nTrainFill ) 01580 assert ( false ); 01581 01582 for ( int i=0;i<nTrain*nFeat;i++ ) 01583 if ( train[i] == 1e10 ) 01584 { 01585 cout<<"i:"<<i<<endl; 01586 assert ( false ); 01587 } 01588 } 01589 01590 if ( state==0 ) 01591 { 01592 for ( int i=0;i<NUM;i++ ) 01593 numericNonZeroPercent[i] = ( double ) numericNonZeroCnt[i]/ ( double ) nTrain; 01594 for ( int i=0;i<100000;i++ ) 01595 if ( meanCnt[i] > 0 ) 01596 { 01597 meanValues[i] /= ( double ) meanCnt[i]; 01598 stdValues[i] = sqrt ( mean2Values[i]/ ( double ) meanCnt[i] - meanValues[i]/ ( double ) meanCnt[i] ); 01599 maxNormValues[i] = fabs ( maxValues[i] - meanValues[i] ); 01600 if ( maxNormValues[i] < fabs ( minValues[i] - meanValues[i] ) ) 01601 maxNormValues[i] = fabs ( minValues[i] - meanValues[i] ); 01602 } 01603 01604 cout<<"nTrain:"<<nTrain<<endl; 01605 01606 // === Calculate effective number of input features === 01607 nFeat = 1; // const 01608 int nFeatNum = 0, 
nFeatNumRaw = 0, nFeatNumCat = 0, nFeatCat = 0, nUnknown = 0, nMissing = 0, nIn = 0, nNumMiss = 0; 01609 // numerical 01610 for ( int j=0;j<NUM;j++ ) 01611 { 01612 if ( numericNonZeroCnt[j] >= minAttributeOccurenceNumerical && maxNormValues[j] < stdValues[j]*maxSTD ) 01613 { 01614 // standard numerical input 01615 nFeat++; 01616 nFeatNum++; 01617 nFeatNumRaw++; 01618 01619 // numerical input with limited number of different values -> translate it to categorical input 01620 if ( numericalAttributes[j].size() < numericMaxCluster && numericalAttributes[j].size() > 1 ) 01621 { 01622 cout<<"nFeatNum:"<<nFeatNum<<" "; 01623 for ( int k=0;k<numericalAttributes[j].size();k++ ) 01624 { 01625 cout<<numericalAttributes[j][k]<<"("<<numericalAttributesCnt[j][k]<<") "; 01626 nFeat++; 01627 nFeatNum++; 01628 nFeatNumCat++; 01629 } 01630 cout<<endl; 01631 /* 01632 // add one bin for "missing or unknown value" 01633 nFeat++; 01634 nFeatNum++; 01635 nFeatNumCat++;*/ 01636 } 01637 if ( numericMissingCnt[j] >= numericMinMissing ) 01638 { 01639 numericHasMissingBin[j] = true; 01640 nFeat+=2; 01641 nNumMiss+=2; 01642 } 01643 } 01644 } 01645 // categorical 01646 for ( int j=0;j<CAT;j++ ) 01647 { 01648 int nUsed = 0, nUn = 0, nCat = 0, nMiss = 0, nUnk = 0; 01649 for ( int k=0;k<categoricalAttributesCnt[j].size();k++ ) 01650 { 01651 // count valid entries (with enough occurence) 01652 if ( categoricalAttributesCnt[j][k] >= minAttributeOccurenceCategorical && categoricalAttributesCnt[j][k] < nTrain ) 01653 { 01654 nFeat++; 01655 nFeatCat++; 01656 nUsed++; 01657 nIn++; 01658 nCat++; 01659 } 01660 else if ( categoricalAttributesCnt[j][k] < nTrain ) // not enough occurence -> put to unknown 01661 nUn++; 01662 } 01663 // missing is like a normal categoric input 01664 if ( ( categoricalMissingCnt[j] >= minAttributeOccurenceCategorical && categoricalMissingCnt[j] < nTrain ) || categoricalMissingCnt[j] > 0 && nCat > 0 ) 01665 { 01666 // add a "missing value" input of this feature 01667 nFeat++; 
01668 nFeatCat++; 01669 nMissing++; 01670 nMiss++; 01671 categoricalHasMissingBin[j] = true; 01672 } 01673 if ( nUn > 0 && nCat + nMiss > 0 ) 01674 { 01675 // add a "unknown value" input of this feature 01676 nFeat++; 01677 nFeatCat++; 01678 nUnknown++; 01679 nUnk++; 01680 categoricalHasUnknownBin[j] = true; 01681 } 01682 01683 if ( nCat + nMiss + nUnk == 1 ) 01684 assert ( false ); 01685 } 01686 01687 cout<<"nFeat:"<<nFeat<<" (numInputs:"<<nFeatNum<<" [rawNum:"<<nFeatNumRaw<<" nFeatNumCat:"<<nFeatNumCat<<"] catInputs:"<<nFeatCat<<" [nUnknown:"<<nUnknown<<" nMissing:"<<nMissing<<" nCat:"<<nIn<<"] numMissingHot:"<<nNumMiss<<" [+1const.])"<<endl; 01688 01689 cout<<"Allocate train features: "<< ( double ) nTrain*nFeat/1e6*4.0<<" MB"<<endl; 01690 train = new REAL[nTrain*nFeat]; 01691 for ( int i=0;i<nTrain*nFeat;i++ ) 01692 train[i] = 1e10; 01693 01694 //support = new REAL[nFeat]; 01695 //supportCnt = new int[nFeat]; 01696 //for(int i=0;i<nFeat;i++) 01697 //{ 01698 //support[i] = 0.0; 01699 //supportCnt[i] = 0; 01700 //} 01701 01702 // read targets 01703 nClass = 2; 01704 trainTarget = new REAL[nTrain*nClass*nDomain]; 01705 trainLabel = new int[nTrain*nDomain]; 01706 for ( int d=0;d<nDomain;d++ ) 01707 { 01708 sprintf ( buf0,"%s/%s",path.c_str(),targetFiles[d] ); 01709 fstream f; 01710 cout<<"Open targets:"<<buf0<<endl; 01711 f.open ( buf0,ios::in ); 01712 if ( f.is_open() == false ) 01713 assert ( false ); 01714 int label; 01715 for ( int i=0;i<nTrain;i++ ) 01716 { 01717 f>>label; 01718 if ( label==-1 ) 01719 { 01720 trainTarget[i*nClass*nDomain + d*nClass + 0] = positiveTarget; 01721 trainTarget[i*nClass*nDomain + d*nClass + 1] = negativeTarget; 01722 trainLabel[i*nDomain + d] = 0; 01723 } 01724 else if ( label==1 ) 01725 { 01726 trainTarget[i*nClass*nDomain + d*nClass + 0] = negativeTarget; 01727 trainTarget[i*nClass*nDomain + d*nClass + 1] = positiveTarget; 01728 trainLabel[i*nDomain + d] = 1; 01729 } 01730 else 01731 assert ( false ); 01732 } 01733 f.close(); 
01734 } 01735 // test set 01736 nTest = 0; 01737 test = 0; 01738 testTarget = 0; 01739 testLabel = 0; 01740 01741 } 01742 } 01743 01744 for ( int i=0;i<nTrain;i++ ) 01745 for ( int j=0;j<nFeat;j++ ) 01746 if ( train[i*nFeat+j] == 1e10 ) 01747 { 01748 cout<<"i:"<<i<<" j:"<<j<<" "<<train[i*nFeat+j]<<endl; 01749 assert ( false ); 01750 } 01751 01752 01753 fstream f; 01754 if ( lineBuf ) 01755 { 01756 delete[] lineBuf; 01757 lineBuf = 0; 01758 } 01759 if ( numericNonZeroCnt ) 01760 { 01761 delete[] numericNonZeroCnt; 01762 numericNonZeroCnt = 0; 01763 } 01764 if ( numericNonZeroPercent ) 01765 { 01766 delete[] numericNonZeroPercent; 01767 numericNonZeroPercent = 0; 01768 } 01769 if ( categoricalAttributes ) 01770 { 01771 delete[] categoricalAttributes; 01772 categoricalAttributes = 0; 01773 } 01774 if ( meanValues ) 01775 { 01776 delete[] meanValues; 01777 meanValues = 0; 01778 } 01779 if ( meanCnt ) 01780 { 01781 delete[] meanCnt; 01782 meanCnt = 0; 01783 } 01784 if ( categoricalHasMissingBin ) 01785 { 01786 delete[] categoricalHasMissingBin; 01787 categoricalHasMissingBin = 0; 01788 } 01789 01790 if ( Framework::getFrameworkMode() == 1 ) 01791 { 01792 cout<<endl<<"Set test data (and set train data to 0)"<<endl<<endl; 01793 test = train; 01794 train = 0; 01795 nTest = nTrain; 01796 nTrain = 0; 01797 testTarget = trainTarget; 01798 trainTarget = 0; 01799 testLabel = trainLabel; 01800 trainLabel = 0; 01801 } 01802 01803 cout<<endl<<"Finished read in "<<time ( 0 )-t0<<"[s]"<<endl<<endl; 01804 01805 }
void DatasetReader::readLETTER | ( | string | path, | |
REAL *& | train, | |||
REAL *& | trainTarget, | |||
int *& | trainLabel, | |||
REAL *& | test, | |||
REAL *& | testTarget, | |||
int *& | testLabel, | |||
uint & | nTrain, | |||
uint & | nTest, | |||
int & | nClass, | |||
int & | nDomain, | |||
int & | nFeat, | |||
REAL | positiveTarget = 1.0 , |
|||
REAL | negativeTarget = -1.0 | |||
) |
Reads the LETTER dataset (UCI)
Files: letter-recognition.data (712565 bytes), letter-recognition.names (2734 bytes)
Definition at line 3843 of file DatasetReader.cpp.
03844 { 03845 cout<<"Read LETTER from: "<<path<<endl; 03846 nDomain = 1; 03847 03848 // define data type and files 03849 int targetColumn = 1; 03850 uint nTrainTmp; 03851 char columnType[] = "dnnnnnnnnnnnnnnnn"; 03852 char enabledCol[] = "11111111111111111"; 03853 const char* dataFiles[] = { ( new string ( path+"/letter-recognition.data" ) )->c_str(),0}; 03854 03855 // === TRAIN SET === 03856 getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0 ); 03857 03858 // allocate tmp mem 03859 REAL* trainTmp = new REAL[nTrainTmp * nFeat]; 03860 int* trainLabelTmp = new int[nTrainTmp]; 03861 03862 // fill data 03863 getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0, true, trainTmp, trainLabelTmp ); 03864 03865 // split train and testset from trainTmp 03866 splitRandomTestset ( 0.2, trainTmp, trainLabelTmp, nTrainTmp, nFeat, nClass, train, trainLabel, trainTarget, test, testLabel, testTarget, nTrain, nTest, positiveTarget, negativeTarget, true ); // true = take the last n percent (without random selection) 03867 03868 delete[] trainTmp; 03869 delete[] trainLabelTmp; 03870 03871 }
void DatasetReader::readMNIST | ( | string | path, | |
REAL *& | train, | |||
REAL *& | trainTarget, | |||
int *& | trainLabel, | |||
REAL *& | test, | |||
REAL *& | testTarget, | |||
int *& | testLabel, | |||
uint & | nTrain, | |||
uint & | nTest, | |||
int & | nClass, | |||
int & | nDomain, | |||
int & | nFeat, | |||
REAL | positiveTarget = 1.0 , |
|||
REAL | negativeTarget = -1.0 | |||
) |
Reads the MNIST dataset from the following files. Trainset: train-images-idx3-ubyte (47040016 bytes), train-labels-idx1-ubyte (60008 bytes). Testset: t10k-images-idx3-ubyte (7840016 bytes), t10k-labels-idx1-ubyte (10008 bytes).
Definition at line 2365 of file DatasetReader.cpp.
02366 { 02367 cout<<"Read MNIST from: "<<path<<endl; 02368 02369 fstream fTrain ( ( path+"/"+string ( "train-images-idx3-ubyte" ) ).c_str(), ios::in ); 02370 fstream fTrainLabels ( ( path+"/"+string ( "train-labels-idx1-ubyte" ) ).c_str(), ios::in ); 02371 fstream fTest ( ( path+"/"+string ( "t10k-images-idx3-ubyte" ) ).c_str(), ios::in ); 02372 fstream fTestLabels ( ( path+"/"+string ( "t10k-labels-idx1-ubyte" ) ).c_str(), ios::in ); 02373 02374 if ( fTrain.is_open() ==false || fTrainLabels.is_open() ==false || fTest.is_open() ==false || fTestLabels.is_open() ==false ) 02375 { 02376 cout<<"Error in opening the files"<<endl; 02377 exit ( 0 ); 02378 } 02379 02380 // population 02381 nClass = 10; 02382 nDomain = 1; 02383 nTrain = 60000; 02384 nTest = 10000; 02385 nFeat = 784; // (28 x 28 pixel 8-Bit images) 02386 02387 // allocate mem 02388 unsigned char* trainChar = new unsigned char[nTrain * nFeat]; 02389 unsigned char* testChar = new unsigned char[nTest * nFeat]; 02390 unsigned char* trainLabelChar = new unsigned char[nTrain]; 02391 unsigned char* testLabelChar = new unsigned char[nTest]; 02392 02393 // load raw data 02394 unsigned int dummy; 02395 fTrain.read ( ( char* ) &dummy, sizeof ( int ) ); // magic number 02396 fTrain.read ( ( char* ) &dummy, sizeof ( int ) ); // #images 02397 fTrain.read ( ( char* ) &dummy, sizeof ( int ) ); // rows 02398 fTrain.read ( ( char* ) &dummy, sizeof ( int ) ); // cols 02399 fTrain.read ( ( char* ) trainChar, sizeof ( unsigned char ) *nTrain*nFeat ); // images 02400 fTrain.close(); 02401 02402 fTrainLabels.read ( ( char* ) &dummy, sizeof ( int ) ); // magic number 02403 fTrainLabels.read ( ( char* ) &dummy, sizeof ( int ) ); // #items 02404 fTrainLabels.read ( ( char* ) trainLabelChar, sizeof ( unsigned char ) *nTrain ); // labels 02405 fTrainLabels.close(); 02406 02407 fTest.read ( ( char* ) &dummy, sizeof ( int ) ); // magic number 02408 fTest.read ( ( char* ) &dummy, sizeof ( int ) ); // #images 02409 fTest.read ( ( char* ) 
&dummy, sizeof ( int ) ); // rows 02410 fTest.read ( ( char* ) &dummy, sizeof ( int ) ); // cols 02411 fTest.read ( ( char* ) testChar, sizeof ( unsigned char ) *nTest*nFeat ); // images 02412 fTest.close(); 02413 02414 fTestLabels.read ( ( char* ) &dummy, sizeof ( int ) ); // magic number 02415 fTestLabels.read ( ( char* ) &dummy, sizeof ( int ) ); // #items 02416 fTestLabels.read ( ( char* ) testLabelChar, sizeof ( unsigned char ) *nTest ); // labels 02417 fTestLabels.close(); 02418 02419 // row x col train images as test pgm file 02420 int rows = 50, cols = 100; 02421 fstream fimg ( ( path + "/MNIST.pgm" ).c_str(),ios::out ); 02422 char buf[256]; 02423 sprintf ( buf,"P5\n%d %d\n255\n", cols*28, rows*28 ); 02424 fimg<<buf; 02425 // image 02426 for ( int I=0;I<rows;I++ ) 02427 { 02428 // write image 02429 for ( int j=0;j<28;j++ ) 02430 { 02431 for ( int i=0;i<cols;i++ ) 02432 { 02433 for ( int k=0;k<28;k++ ) 02434 fimg.write ( ( char* ) &trainChar[k + i*nFeat + j*28 + I*cols*nFeat], sizeof ( unsigned char ) ); 02435 } 02436 } 02437 } 02438 fimg.close(); 02439 02440 // allocate + write dataset 02441 train = new REAL[nTrain * nFeat]; 02442 trainLabel = new int[nTrain]; 02443 test = new REAL[nTest * nFeat]; 02444 testLabel = new int[nTest]; 02445 02446 for ( int i=0;i<nTrain;i++ ) 02447 { 02448 trainLabel[i] = ( int ) trainLabelChar[i]; 02449 for ( int j=0;j<nFeat;j++ ) 02450 train[i*nFeat + j] = ( REAL ) trainChar[i*nFeat + j] / 255.0; 02451 } 02452 02453 for ( int i=0;i<nTest;i++ ) 02454 { 02455 testLabel[i] = ( int ) testLabelChar[i]; 02456 for ( int j=0;j<nFeat;j++ ) 02457 test[i*nFeat + j] = ( REAL ) testChar[i*nFeat + j] / 255.0; 02458 } 02459 02460 // train targets 02461 trainTarget = new REAL[nClass*nTrain]; 02462 for ( int i=0;i<nTrain;i++ ) 02463 { 02464 for ( int j=0;j<nClass;j++ ) 02465 trainTarget[i*nClass + j] = negativeTarget; // negative class labels 02466 trainTarget[i*nClass + trainLabel[i]] = positiveTarget; // positive class label 02467 } 02468 
02469 // test targets 02470 testTarget = new REAL[nClass*nTest]; 02471 for ( int i=0;i<nTest;i++ ) 02472 { 02473 for ( int j=0;j<nClass;j++ ) 02474 testTarget[i*nClass + j] = negativeTarget; // negative class labels 02475 testTarget[i*nClass + testLabel[i]] = positiveTarget; // positive class label 02476 } 02477 02478 // free raw data 02479 if ( trainChar ) 02480 { 02481 delete[] trainChar; 02482 trainChar = 0; 02483 } 02484 if ( testChar ) 02485 { 02486 delete[] testChar; 02487 testChar = 0; 02488 } 02489 if ( trainLabelChar ) 02490 { 02491 delete[] trainLabelChar; 02492 trainLabelChar = 0; 02493 } 02494 if ( testLabelChar ) 02495 { 02496 delete[] testLabelChar; 02497 testLabelChar = 0; 02498 } 02499 }
void DatasetReader::readMONKS1 | ( | string | path, | |
REAL *& | train, | |||
REAL *& | trainTarget, | |||
int *& | trainLabel, | |||
REAL *& | test, | |||
REAL *& | testTarget, | |||
int *& | testLabel, | |||
uint & | nTrain, | |||
uint & | nTest, | |||
int & | nClass, | |||
int & | nDomain, | |||
int & | nFeat, | |||
REAL | positiveTarget = 1.0 , |
|||
REAL | negativeTarget = -1.0 | |||
) |
Reads the MONKS1 dataset (UCI)
10CATBytes monks-1.test 2947Bytes monks-1.train
Definition at line 3879 of file DatasetReader.cpp.
03880 { 03881 cout<<"Read MONKS1 from: "<<path<<endl; 03882 nDomain = 1; 03883 03884 // define data type and files 03885 int targetColumn = 1; 03886 char columnType[] = "dnnnnnnd"; 03887 char enabledCol[] = "11111110"; 03888 const char* dataFiles[] = { ( new string ( path+"/monks-1.train" ) )->c_str(), ( new string ( path+"/monks-1.test" ) )->c_str(),0}; 03889 03890 // === TRAIN SET === 03891 getDataBounds ( dataFiles, " ", nFeat, nClass, nTrain, columnType, enabledCol, targetColumn, 0 ); 03892 train = new REAL[nFeat*nTrain]; 03893 trainLabel = new int[nTrain]; 03894 getDataBounds ( dataFiles, " ", nFeat, nClass, nTrain, columnType, enabledCol, targetColumn, 0, true, train, trainLabel ); 03895 03896 // === TEST SET === 03897 getDataBounds ( dataFiles, " ", nFeat, nClass, nTest, columnType, enabledCol, targetColumn, 1 ); 03898 test = new REAL[nFeat*nTest]; 03899 testLabel = new int[nTest]; 03900 getDataBounds ( dataFiles, " ", nFeat, nClass, nTest, columnType, enabledCol, targetColumn, 1, true, test, testLabel ); 03901 03902 // make numerical test targets 03903 makeNumericTrainAndTestTargets ( nClass, nTrain, nTest, positiveTarget, negativeTarget, trainLabel, testLabel, trainTarget, testTarget ); 03904 03905 }
void DatasetReader::readMONKS2 | ( | string | path, | |
REAL *& | train, | |||
REAL *& | trainTarget, | |||
int *& | trainLabel, | |||
REAL *& | test, | |||
REAL *& | testTarget, | |||
int *& | testLabel, | |||
uint & | nTrain, | |||
uint & | nTest, | |||
int & | nClass, | |||
int & | nDomain, | |||
int & | nFeat, | |||
REAL | positiveTarget = 1.0 , |
|||
REAL | negativeTarget = -1.0 | |||
) |
Reads the MONKS2 dataset (UCI)
10CATBytes monks-2.test 4013Bytes monks-2.train
Definition at line 3913 of file DatasetReader.cpp.
03914 { 03915 cout<<"Read MONKS2 from: "<<path<<endl; 03916 nDomain = 1; 03917 03918 // define data type and files 03919 int targetColumn = 1; 03920 char columnType[] = "dnnnnnnd"; 03921 char enabledCol[] = "11111110"; 03922 const char* dataFiles[] = { ( new string ( path+"/monks-2.train" ) )->c_str(), ( new string ( path+"/monks-2.test" ) )->c_str(),0}; 03923 03924 // === TRAIN SET === 03925 getDataBounds ( dataFiles, " ", nFeat, nClass, nTrain, columnType, enabledCol, targetColumn, 0 ); 03926 train = new REAL[nFeat*nTrain]; 03927 trainLabel = new int[nTrain]; 03928 getDataBounds ( dataFiles, " ", nFeat, nClass, nTrain, columnType, enabledCol, targetColumn, 0, true, train, trainLabel ); 03929 03930 // === TEST SET === 03931 getDataBounds ( dataFiles, " ", nFeat, nClass, nTest, columnType, enabledCol, targetColumn, 1 ); 03932 test = new REAL[nFeat*nTest]; 03933 testLabel = new int[nTest]; 03934 getDataBounds ( dataFiles, " ", nFeat, nClass, nTest, columnType, enabledCol, targetColumn, 1, true, test, testLabel ); 03935 03936 // make numerical test targets 03937 makeNumericTrainAndTestTargets ( nClass, nTrain, nTest, positiveTarget, negativeTarget, trainLabel, testLabel, trainTarget, testTarget ); 03938 03939 }
void DatasetReader::readMONKS3 | ( | string | path, | |
REAL *& | train, | |||
REAL *& | trainTarget, | |||
int *& | trainLabel, | |||
REAL *& | test, | |||
REAL *& | testTarget, | |||
int *& | testLabel, | |||
uint & | nTrain, | |||
uint & | nTest, | |||
int & | nClass, | |||
int & | nDomain, | |||
int & | nFeat, | |||
REAL | positiveTarget = 1.0 , |
|||
REAL | negativeTarget = -1.0 | |||
) |
Reads the MONKS3 dataset (UCI)
10CATBytes monks-3.test 2886Bytes monks-3.train
Definition at line 3947 of file DatasetReader.cpp.
03948 { 03949 cout<<"Read MONKS3 from: "<<path<<endl; 03950 nDomain = 1; 03951 03952 // define data type and files 03953 int targetColumn = 1; 03954 char columnType[] = "dnnnnnnd"; 03955 char enabledCol[] = "11111110"; 03956 const char* dataFiles[] = { ( new string ( path+"/monks-3.train" ) )->c_str(), ( new string ( path+"/monks-3.test" ) )->c_str(),0}; 03957 03958 // === TRAIN SET === 03959 getDataBounds ( dataFiles, " ", nFeat, nClass, nTrain, columnType, enabledCol, targetColumn, 0 ); 03960 train = new REAL[nFeat*nTrain]; 03961 trainLabel = new int[nTrain]; 03962 getDataBounds ( dataFiles, " ", nFeat, nClass, nTrain, columnType, enabledCol, targetColumn, 0, true, train, trainLabel ); 03963 03964 // === TEST SET === 03965 getDataBounds ( dataFiles, " ", nFeat, nClass, nTest, columnType, enabledCol, targetColumn, 1 ); 03966 test = new REAL[nFeat*nTest]; 03967 testLabel = new int[nTest]; 03968 getDataBounds ( dataFiles, " ", nFeat, nClass, nTest, columnType, enabledCol, targetColumn, 1, true, test, testLabel ); 03969 03970 // make numerical test targets 03971 makeNumericTrainAndTestTargets ( nClass, nTrain, nTest, positiveTarget, negativeTarget, trainLabel, testLabel, trainTarget, testTarget ); 03972 03973 }
void DatasetReader::readMUSHROOM | ( | string | path, | |
REAL *& | train, | |||
REAL *& | trainTarget, | |||
int *& | trainLabel, | |||
REAL *& | test, | |||
REAL *& | testTarget, | |||
int *& | testLabel, | |||
uint & | nTrain, | |||
uint & | nTest, | |||
int & | nClass, | |||
int & | nDomain, | |||
int & | nFeat, | |||
REAL | positiveTarget = 1.0 , |
|||
REAL | negativeTarget = -1.0 | |||
) |
Reads the MUSHROOM dataset (UCI)
373704Bytes agaricus-lepiota.data 6816Bytes agaricus-lepiota.names
Definition at line 3981 of file DatasetReader.cpp.
03982 { 03983 cout<<"Read MUSHROOM from: "<<path<<endl; 03984 nDomain = 1; 03985 03986 // define data type and files 03987 int targetColumn = 1; 03988 uint nTrainTmp; 03989 char columnType[] = "ddddddddddddddddddddddd"; 03990 char enabledCol[] = "11111111111111111111111"; 03991 const char* dataFiles[] = { ( new string ( path+"/agaricus-lepiota.data" ) )->c_str(),0}; 03992 03993 // === TRAIN SET === 03994 getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0 ); 03995 03996 // allocate tmp mem 03997 REAL* trainTmp = new REAL[nTrainTmp * nFeat]; 03998 int* trainLabelTmp = new int[nTrainTmp]; 03999 04000 // fill data 04001 getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0, true, trainTmp, trainLabelTmp ); 04002 04003 // split train and testset from trainTmp 04004 splitRandomTestset ( 0.2, trainTmp, trainLabelTmp, nTrainTmp, nFeat, nClass, train, trainLabel, trainTarget, test, testLabel, testTarget, nTrain, nTest, positiveTarget, negativeTarget ); 04005 04006 delete[] trainTmp; 04007 delete[] trainLabelTmp; 04008 04009 }
void DatasetReader::readNETFLIX | ( | string | path, | |
REAL *& | train, | |||
REAL *& | trainTarget, | |||
int *& | trainLabel, | |||
REAL *& | test, | |||
REAL *& | testTarget, | |||
int *& | testLabel, | |||
uint & | nTrain, | |||
uint & | nTest, | |||
int & | nClass, | |||
int & | nDomain, | |||
int & | nFeat, | |||
REAL | positiveTarget = 1.0 , |
|||
REAL | negativeTarget = -1.0 | |||
) |
Reads the binary NETFLIX prediction files in the DataFiles folder e.g. prediction.dat: 16902104 Bytes, linear float precision [probe,qualifying] probeRatings.txt: 4
... 1408395 ratings (1..5)
Definition at line 2775 of file DatasetReader.cpp.
02776 { 02777 cout<<"Read NETFLIX binary predictions from: "<<NETFLIX_DATA_DIR<<endl; 02778 02779 if ( Framework::getAdditionalStartupParameter() < 0 ) 02780 { 02781 // probeset subsampling 02782 if ( Framework::getAdditionalStartupParameter() < -100 ) 02783 { 02784 srand ( Framework::getRandomSeed() ); 02785 02786 cout<<"Probeset subsampling"<<endl; 02787 02788 // population 02789 nClass = 1; // -> one regression target 02790 nDomain = 1; 02791 nTrain = 1408395; // #probe 02792 nTest = 2817131; // #qual 02793 02794 bool* maskProbe = new bool[nTrain]; 02795 for ( int i=0;i<nTrain;i++ ) 02796 maskProbe[i] = false; 02797 double p = - ( double ) ( Framework::getAdditionalStartupParameter() ) / ( double ) nTrain; 02798 int c = 0; 02799 for ( int i=0;i<nTrain;i++ ) 02800 if ( ( double ) rand() / ( double ) RAND_MAX < p ) 02801 { 02802 maskProbe[i] = true; 02803 c++; 02804 } 02805 cout<<"Selected: "<<c<<" probe samples"<<endl; 02806 02807 02808 // get all the data files 02809 vector<string> files = Data::getDirectoryFileList ( NETFLIX_DATA_DIR ); 02810 vector<string> predictionFiles; 02811 02812 // read the *.dat files (prediction of probe+qual files) 02813 nFeat = 0; 02814 for ( int i=0;i<files.size();i++ ) 02815 { 02816 int pos = files[i].find ( ".dat" ); 02817 string fileEnding = files[i].substr ( files[i].length()-4,4 ); 02818 if ( fileEnding == ".dat" ) 02819 { 02820 predictionFiles.push_back ( files[i] ); 02821 nFeat++; 02822 } 02823 } 02824 02825 cout<<"nFeat: "<<nFeat<<endl; 02826 cout<<"nClass: "<<nClass<<endl; 02827 02828 // probe targets 02829 //cout<<"Targets Read:"<<path+"/"+string("probeRatings.txt")<<endl; 02830 //fstream fProbeRatings((path+"/"+string("probeRatings.txt")).c_str(), ios::in); 02831 cout<<"Targets Read:"<<path+"/"+string ( "probeRatings.txt.rand" ) <<endl; 02832 fstream fProbeRatings ( ( path+"/"+string ( "probeRatings.txt.rand" ) ).c_str(), ios::in ); 02833 float* ratingCache = new float[nTrain]; 02834 for ( int i=0;i<nTrain;i++ ) 02835 
fProbeRatings>>ratingCache[i]; 02836 fProbeRatings.close(); 02837 02838 02839 test = 0; 02840 testLabel = 0; 02841 testTarget = 0; 02842 if ( Framework::getFrameworkMode() == 1 ) 02843 { 02844 test = new REAL[ ( nTrain+nTest ) * nFeat]; 02845 testTarget = new REAL[nTrain+nTest]; 02846 for ( int i=0;i<nTrain+nTest;i++ ) 02847 testTarget[i] = 0.0; 02848 } 02849 train = new REAL[c * nFeat]; 02850 trainLabel = 0; 02851 trainTarget = new REAL[c]; 02852 int d = 0; 02853 for ( int j=0;j<nTrain;j++ ) 02854 { 02855 if ( maskProbe[j] ) 02856 { 02857 trainTarget[d] = ratingCache[j]; 02858 d++; 02859 } 02860 } 02861 02862 // predictions 02863 float* trainTmp = new float[nTrain+nTest]; 02864 for ( int i=0;i<predictionFiles.size();i++ ) 02865 { 02866 fstream f ( predictionFiles[i].c_str(), ios::in ); 02867 f.read ( ( char* ) trainTmp, sizeof ( float ) * ( nTrain+nTest ) ); 02868 if ( Framework::getFrameworkMode() == 1 ) 02869 for ( int j=0;j<nTrain+nTest;j++ ) 02870 test[j*nFeat + i] = trainTmp[j]; 02871 d = 0; 02872 for ( int j=0;j<nTrain;j++ ) 02873 { 02874 if ( maskProbe[j] ) 02875 { 02876 train[d*nFeat + i] = trainTmp[j]; 02877 d++; 02878 } 02879 } 02880 f.close(); 02881 cout<<"Prediction file: "<<predictionFiles[i]<<endl; 02882 } 02883 02884 delete[] trainTmp; 02885 delete[] ratingCache; 02886 delete[] maskProbe; 02887 02888 nTest = nTrain + nTest; 02889 nTrain = c; 02890 cout<<"nTrain:"<<nTrain<<endl<<"nTest:"<<nTest<<endl<<endl; 02891 return; 02892 } 02893 02894 02895 // population 02896 nClass = 1; // -> one regression target 02897 nDomain = 1; 02898 //nTrain = 1408395; // #probe 02899 //nTest = 2817131; // #qual 02900 02901 // HACK: divide probe into 2 halfs 02902 nTrain = 704197; // #probe 02903 nTest = 704198; // #qual 02904 02905 // get all the data files 02906 cout<<"read path from:"<<path+"/path.txt"<<endl; 02907 fstream fP ( ( path+"/path.txt" ).c_str(),ios::in ); 02908 string predictorPath; 02909 fP>>predictorPath; 02910 cout<<"path:"<<predictorPath<<endl; 02911 
fP.close(); 02912 02913 //vector<string> files = Data::getDirectoryFileList(NETFLIX_DATA_DIR); 02914 vector<string> files = Data::getDirectoryFileList ( predictorPath ); 02915 sort(files.begin(), files.end()); 02916 vector<string> predictionFiles; 02917 02918 // read the *.dat files (prediction of probe+qual files) 02919 nFeat = 0; 02920 for ( int i=0;i<files.size();i++ ) 02921 { 02922 int pos = files[i].find ( ".dat" ); 02923 string fileEnding = files[i].substr ( files[i].length()-4,4 ); 02924 if ( fileEnding == ".dat" ) 02925 { 02926 predictionFiles.push_back ( files[i] ); 02927 nFeat++; 02928 } 02929 } 02930 02931 // =============== write the qual parts ================ 02932 int nProbe = 1408395; 02933 int nQual = 2817131; 02934 REAL* tmp = new float[nProbe+nQual]; 02935 REAL* tmp2 = new float[predictionFiles.size()*nQual]; 02936 int* tmp3 = new int[nQual]; 02937 fstream ff((predictorPath+"/grand_prize/judging.txt").c_str(),ios::in); 02938 char buf[1024]; 02939 int cnt = 0; 02940 while(ff.getline(buf,1024)) // read judging.txt 02941 { 02942 string line(buf); 02943 if(line.length() > 0) 02944 { 02945 if(line[line.length()-2] != ':') 02946 { 02947 int nr = atoi(line.c_str()); 02948 tmp3[cnt] = nr; 02949 cnt++; 02950 } 02951 } 02952 } 02953 assert(cnt==nQual); 02954 ff.close(); 02955 for ( int i=0;i<predictionFiles.size();i++ ) 02956 { 02957 fstream f ( predictionFiles[i].c_str(), ios::in ); 02958 f.read ( ( char* ) tmp, sizeof ( float ) *(nProbe+nQual) ); 02959 for(int j=0;j<nQual;j++) 02960 tmp2[j*predictionFiles.size()+i] = tmp[nProbe+j]; 02961 f.close(); 02962 } 02963 fstream trainCSV((path+"/testQual.csv").c_str(), ios::out); // write CSV file 02964 for(int i=0;i<nQual;i++) 02965 { 02966 for(int j=0;j<predictionFiles.size();j++) 02967 trainCSV<<tmp2[i*nFeat+j]<<","; 02968 trainCSV<<tmp3[i]<<endl; 02969 } 02970 trainCSV.close(); 02971 exit(0); 02972 // =============== write the qual parts ================ 02973 02974 02975 02976 cout<<"nFeat: "<<nFeat<<endl; 
02977 cout<<"nClass: "<<nClass<<endl; 02978 02979 bool doClipping = true; 02980 if ( Framework::getAdditionalStartupParameter() == -2 ) 02981 doClipping = false; 02982 02983 // allocate complete dataset 02984 if ( Framework::getFrameworkMode() == 0 ) 02985 { 02986 train = new REAL[nTrain * nFeat]; 02987 trainLabel = 0; //new int[nTrain]; 02988 trainTarget = new REAL[nTrain * nClass]; 02989 02990 // probe targets 02991 //cout<<"Targets Read:"<<path+"/"+string("probeRatings.txt")<<endl; 02992 //fstream fProbeRatings((path+"/"+string("probeRatings.txt")).c_str(), ios::in); 02993 cout<<"Targets Read:"<<path+"/"+string ( "probeRatings.txt.rand" ) <<endl; 02994 fstream fProbeRatings ( ( path+"/"+string ( "probeRatings.txt.rand" ) ).c_str(), ios::in ); 02995 for ( int i=0;i<nTrain;i++ ) 02996 fProbeRatings>>trainTarget[i]; 02997 fProbeRatings.close(); 02998 02999 float* trainTmp = new float[nTrain]; 03000 03001 // predictions 03002 for ( int i=0;i<predictionFiles.size();i++ ) 03003 { 03004 fstream f ( predictionFiles[i].c_str(), ios::in ); 03005 f.read ( ( char* ) trainTmp, sizeof ( float ) *nTrain ); 03006 double mean = 0.0; 03007 for ( int j=0;j<nTrain;j++ ) 03008 mean += trainTmp[j]; 03009 mean /= ( double ) nTrain; 03010 if ( mean > 1.0 && mean < 5.0 && doClipping ) 03011 cout<<"[clip] "; 03012 for ( int j=0;j<nTrain;j++ ) 03013 { 03014 train[j*nFeat + i] = trainTmp[j]; 03015 if ( mean > 1.0 && mean < 5.0 && doClipping ) 03016 { 03017 if ( train[j*nFeat + i] > 5.0 ) 03018 train[j*nFeat + i] = 5.0; 03019 if ( train[j*nFeat + i] < 1.0 ) 03020 train[j*nFeat + i] = 1.0; 03021 } 03022 } 03023 f.close(); 03024 cout<<"Prediction file: "<<predictionFiles[i]<<" mean:"<<mean<<endl; 03025 } 03026 03027 if ( trainTmp ) 03028 { 03029 delete[] trainTmp; 03030 trainTmp = 0; 03031 } 03032 03033 test = 0; 03034 testLabel = 0; 03035 testTarget = 0; 03036 nTest = 0; 03037 03038 // write CSV file 03039 /*fstream trainCSV((path+"/train.csv").c_str(), ios::out); 03040 for(int 
i=0;i<nTrain;i++) 03041 { 03042 for(int j=0;j<nFeat;j++) 03043 trainCSV<<train[i*nFeat+j]<<","; 03044 trainCSV<<trainTarget[i]<<endl; 03045 } 03046 trainCSV.close();*/ 03047 } 03048 03049 if ( Framework::getFrameworkMode() == 1 ) 03050 { 03051 cout<<"alloc: "<<nTest * ( uint ) nFeat<<endl; 03052 test = new REAL[nTest * ( uint ) nFeat]; 03053 testLabel = 0; //new int[nTest]; 03054 testTarget = new REAL[nTest * ( uint ) nClass]; 03055 03056 // dummy targets 03057 for ( int i=0;i<nTest;i++ ) 03058 testTarget[i] = 3.7; // just a init value (not known in netflix prize) 03059 03060 // HACK: read 2nd half of probe, this act as a test set 03061 cout<<"Targets Read:"<<path+"/"+string ( "probeRatings.txt.rand" ) <<endl; 03062 fstream fProbeRatings ( ( path+"/"+string ( "probeRatings.txt.rand" ) ).c_str(), ios::in ); 03063 REAL dummy; 03064 for ( int i=0;i<nTrain;i++ ) 03065 fProbeRatings>>dummy; 03066 for ( int i=0;i<nTest;i++ ) 03067 fProbeRatings>>testTarget[i]; 03068 fProbeRatings.close(); 03069 03070 float* testTmp = new float[nTest]; 03071 03072 // predictions 03073 for ( uint i=0;i<predictionFiles.size();i++ ) 03074 { 03075 fstream f ( predictionFiles[i].c_str(), ios::in ); 03076 f.read ( ( char* ) testTmp, sizeof ( float ) *nTrain ); // probe read (dummy) 03077 f.read ( ( char* ) testTmp, sizeof ( float ) *nTest ); 03078 double mean = 0.0; 03079 for ( int j=0;j<nTest;j++ ) 03080 mean += testTmp[j]; 03081 mean /= ( double ) nTest; 03082 if ( mean > 1.0 && mean < 5.0 && doClipping ) 03083 cout<<"[clip] "; 03084 for ( uint j=0;j<nTest;j++ ) 03085 { 03086 test[j* ( uint ) nFeat + i] = testTmp[j]; 03087 if ( mean > 1.0 && mean < 5.0 && doClipping ) 03088 { 03089 if ( test[j* ( uint ) nFeat + i] > 5.0 ) 03090 test[j* ( uint ) nFeat + i] = 5.0; 03091 if ( test[j* ( uint ) nFeat + i] < 1.0 ) 03092 test[j* ( uint ) nFeat + i] = 1.0; 03093 } 03094 } 03095 f.close(); 03096 cout<<"Prediction file: "<<predictionFiles[i]<<" mean:"<<mean<<endl; 03097 } 03098 03099 if ( testTmp ) 
03100 { 03101 delete[] testTmp; 03102 testTmp = 0; 03103 } 03104 03105 train = 0; 03106 trainLabel = 0; 03107 trainTarget = 0; 03108 nTrain = 0; 03109 03110 // write CSV file 03111 /*fstream testCSV((path+"/test.csv").c_str(), ios::out); 03112 for(int i=0;i<nTest;i++) 03113 { 03114 for(int j=0;j<nFeat;j++) 03115 testCSV<<test[i*nFeat+j]<<","; 03116 testCSV<<testTarget[i]<<endl; 03117 } 03118 testCSV.close();*/ 03119 } 03120 } 03121 else // slot blend 03122 { 03123 // population 03124 nClass = 1; // -> one regression target 03125 nDomain = 1; 03126 char buf0[512]; 03127 char buf1[512]; 03128 char buf2[512]; 03129 char buf3[512]; 03130 char buf4[512]; 03131 sprintf ( buf0,"%s/%s%d/",string ( NETFLIX_SLOTDATA_ROOT_DIR ).c_str(),"p",Framework::getAdditionalStartupParameter() ); 03132 sprintf ( buf1,"%s/%s%d/nProbe.data",string ( NETFLIX_SLOTDATA_ROOT_DIR ).c_str(),"p",Framework::getAdditionalStartupParameter() ); 03133 sprintf ( buf2,"%s/%s%d/nQual.data",string ( NETFLIX_SLOTDATA_ROOT_DIR ).c_str(),"p",Framework::getAdditionalStartupParameter() ); 03134 sprintf ( buf3,"%s/%s%d/ratings.data",string ( NETFLIX_SLOTDATA_ROOT_DIR ).c_str(),"p",Framework::getAdditionalStartupParameter() ); 03135 sprintf ( buf4,"%s/%s%d/ratingsTest.data",string ( NETFLIX_SLOTDATA_ROOT_DIR ).c_str(),"p",Framework::getAdditionalStartupParameter() ); 03136 03137 fstream f; 03138 03139 // nTrain 03140 f.open ( buf1,ios::in ); 03141 f.read ( ( char* ) &nTrain,sizeof ( int ) ); 03142 f.close(); 03143 03144 // nTest 03145 f.open ( buf2,ios::in ); 03146 f.read ( ( char* ) &nTest,sizeof ( int ) ); 03147 f.close(); 03148 03149 // targets 03150 float* tmp = new float[nTrain+nTest]; 03151 f.open ( buf3,ios::in ); 03152 f.read ( ( char* ) tmp,sizeof ( float ) *nTrain ); 03153 f.close(); 03154 trainTarget = new REAL[nTrain]; 03155 for ( int i=0;i<nTrain;i++ ) 03156 trainTarget[i] = tmp[i]; 03157 testTarget = new REAL[nTest]; 03158 //for(int i=0;i<nTest;i++) 03159 // testTarget[i] = 3.7; 03160 f.open ( 
buf4,ios::in ); 03161 f.read ( ( char* ) tmp,sizeof ( float ) *nTest ); 03162 f.close(); 03163 for ( int i=0;i<nTest;i++ ) 03164 testTarget[i] = tmp[i]; 03165 03166 // get all the data files 03167 vector<string> files = Data::getDirectoryFileList ( buf0 ); 03168 vector<string> predictionFiles; 03169 03170 // read the *.dat files (prediction of probe+qual files) 03171 nFeat = 0; 03172 for ( int i=0;i<files.size();i++ ) 03173 { 03174 string fileEnding = files[i].substr ( files[i].length()-4,4 ); 03175 if ( fileEnding == ".dat" ) 03176 { 03177 predictionFiles.push_back ( files[i] ); 03178 nFeat++; 03179 } 03180 } 03181 03182 cout<<"nFeat: "<<nFeat<<endl; 03183 cout<<"nClass: "<<nClass<<endl; 03184 cout<<"nTrain: "<<nTrain<<endl; 03185 cout<<"nTest: "<<nTest<<endl; 03186 03187 // input features 03188 if ( Framework::getFrameworkMode() == 0 ) 03189 { 03190 cout<<"allocate trainset: "<<nTrain * nFeat<<" elements"<<endl; 03191 train = new REAL[nTrain * nFeat]; 03192 trainLabel = 0; 03193 } 03194 else 03195 { 03196 cout<<"allocate testset : "<< ( uint ) nTest * nFeat<<" elements"<<endl; 03197 test = new REAL[nTest * nFeat]; 03198 testLabel = 0; 03199 } 03200 03201 // predictions 03202 for ( int i=0;i<predictionFiles.size();i++ ) 03203 { 03204 cout<<i<<"/"<< ( int ) predictionFiles.size() <<" "; 03205 f.open ( predictionFiles[i].c_str(), ios::in ); 03206 f.read ( ( char* ) tmp, sizeof ( float ) * ( nTrain+nTest ) ); 03207 f.close(); 03208 double mean = 0.0; 03209 for ( int j=0;j<nTrain+nTest;j++ ) 03210 mean += tmp[j]; 03211 mean /= ( double ) ( nTrain+nTest ); 03212 if ( mean > 1.0 && mean < 5.0 ) 03213 cout<<"[clip] "; 03214 cout<<"mu:"<<mean<<" "; 03215 if ( Framework::getFrameworkMode() == 0 ) 03216 { 03217 // train 03218 for ( int j=0;j<nTrain;j++ ) 03219 { 03220 train[j*nFeat + i] = tmp[j]; 03221 if ( mean > 1.0 && mean < 5.0 ) 03222 { 03223 if ( train[j*nFeat + i] > 5.0 ) 03224 train[j*nFeat + i] = 5.0; 03225 if ( train[j*nFeat + i] < 1.0 ) 03226 train[j*nFeat + i] = 
1.0; 03227 } 03228 } 03229 } 03230 else 03231 { 03232 // test 03233 for ( int j=0;j<nTest;j++ ) 03234 { 03235 test[j*nFeat + i] = tmp[j+nTrain]; 03236 if ( mean > 1.0 && mean < 5.0 ) 03237 { 03238 if ( test[j*nFeat + i] > 5.0 ) 03239 test[j*nFeat + i] = 5.0; 03240 if ( test[j*nFeat + i] < 1.0 ) 03241 test[j*nFeat + i] = 1.0; 03242 } 03243 } 03244 } 03245 cout<<"Prediction file: "<<predictionFiles[i]<<endl; 03246 } 03247 03248 if ( Framework::getFrameworkMode() == 0 ) 03249 nTest = 0; 03250 else 03251 nTrain = 0; 03252 03253 delete[] tmp; 03254 03255 } 03256 03257 }
void DatasetReader::readPOKER | ( | string | path, | |
REAL *& | train, | |||
REAL *& | trainTarget, | |||
int *& | trainLabel, | |||
REAL *& | test, | |||
REAL *& | testTarget, | |||
int *& | testLabel, | |||
uint & | nTrain, | |||
uint & | nTest, | |||
int & | nClass, | |||
int & | nDomain, | |||
int & | nFeat, | |||
REAL | positiveTarget = 1.0 , |
|||
REAL | negativeTarget = -1.0 | |||
) |
Reads the POKER dataset (UCI)
24538333Bytes poker-hand-testing.data 613694Bytes poker-hand-training-true.data 5946Bytes poker-hand.names
Definition at line 4235 of file DatasetReader.cpp.
04236 { 04237 cout<<"Read POKER from: "<<path<<endl; 04238 nDomain = 1; 04239 04240 // define data type and files 04241 int targetColumn = 11; 04242 char columnType[] = "ddddddddddd"; 04243 char enabledCol[] = "11111111111"; 04244 const char* dataFiles[] = { ( new string ( path+"/poker-hand-training-true.data" ) )->c_str(), ( new string ( path+"/poker-hand-testing.data" ) )->c_str(),0}; 04245 04246 // === TRAIN SET === 04247 getDataBounds ( dataFiles, ",", nFeat, nClass, nTrain, columnType, enabledCol, targetColumn, 0 ); 04248 train = new REAL[nFeat*nTrain]; 04249 trainLabel = new int[nTrain]; 04250 getDataBounds ( dataFiles, ",", nFeat, nClass, nTrain, columnType, enabledCol, targetColumn, 0, true, train, trainLabel ); 04251 04252 // === TEST SET === 04253 getDataBounds ( dataFiles, ",", nFeat, nClass, nTest, columnType, enabledCol, targetColumn, 1 ); 04254 test = new REAL[nFeat*nTest]; 04255 testLabel = new int[nTest]; 04256 getDataBounds ( dataFiles, ",", nFeat, nClass, nTest, columnType, enabledCol, targetColumn, 1, true, test, testLabel ); 04257 04258 // make numerical test targets 04259 makeNumericTrainAndTestTargets ( nClass, nTrain, nTest, positiveTarget, negativeTarget, trainLabel, testLabel, trainTarget, testTarget ); 04260 04261 }
void DatasetReader::readPRUDSYS | ( | string | path, | |
REAL *& | train, | |||
REAL *& | trainTarget, | |||
int *& | trainLabel, | |||
REAL *& | test, | |||
REAL *& | testTarget, | |||
int *& | testLabel, | |||
uint & | nTrain, | |||
uint & | nTest, | |||
int & | nClass, | |||
int & | nDomain, | |||
int & | nFeat, | |||
REAL | positiveTarget = 1.0 , |
|||
REAL | negativeTarget = -1.0 | |||
) |
PRUDSYS_DMC 2009: data mining cup Prudsys AG Trainset: dmc2009_train.txt (9239726 Bytes) Testset: dmc2009_forecast.txt (9308436 Bytes)
Definition at line 2279 of file DatasetReader.cpp.
02280 { 02281 REAL* feat, *target; 02282 int* label, N; 02283 02284 fstream f; 02285 if ( Framework::getFrameworkMode() == 1 ) 02286 { 02287 f.open ( ( path+"/dmc2009_forecast.txt" ).c_str(), ios::in ); 02288 nFeat = 1857+1; 02289 N = 2418; 02290 nClass = 1; 02291 nDomain = 8; 02292 } 02293 else 02294 { 02295 f.open ( ( path+"/dmc2009_train.txt" ).c_str(), ios::in ); 02296 nFeat = 1857+1; 02297 N = 2394; 02298 nClass = 1; 02299 nDomain = 8; 02300 } 02301 02302 feat = new REAL[N*nFeat]; 02303 target = new REAL[N*nClass*nDomain]; 02304 label = 0; 02305 02306 // features and labels 02307 char *buf = new char[100000]; 02308 f.getline ( buf,100000 ); 02309 positiveTarget = -1e10; 02310 negativeTarget = 1e10; 02311 for ( int i=0;i<N;i++ ) 02312 { 02313 f.getline ( buf,100000 ); 02314 stringstream ss ( buf ); 02315 REAL r; 02316 int cnt = 0; 02317 feat[nFeat*i + cnt] = 1.0; 02318 cnt++; 02319 while ( ss>>r ) 02320 { 02321 if ( cnt < nFeat ) 02322 feat[nFeat*i + cnt] = r; 02323 else if ( Framework::getFrameworkMode() == 0 ) 02324 target[nDomain*nClass*i + cnt - nFeat] = r; 02325 else if ( Framework::getFrameworkMode() == 1 ) 02326 target[nDomain*nClass*i + cnt - nFeat] = 0.0; 02327 cnt++; 02328 } 02329 if ( cnt != nFeat+nClass*nDomain && Framework::getFrameworkMode() == 0 ) 02330 assert ( false ); 02331 } 02332 f.close(); 02333 delete[] buf; 02334 02335 if ( Framework::getFrameworkMode() == 1 ) 02336 { 02337 nTest = N; 02338 test = feat; 02339 testTarget = target; 02340 testLabel = label; 02341 train = 0; 02342 trainTarget = 0; 02343 trainLabel = 0; 02344 nTrain = 0; 02345 } 02346 else 02347 { 02348 nTrain = N; 02349 train = feat; 02350 trainTarget = target; 02351 trainLabel = label; 02352 test = 0; 02353 testTarget = 0; 02354 testLabel = 0; 02355 nTest = 0; 02356 } 02357 }
void DatasetReader::readSATIMAGE | ( | string | path, | |
REAL *& | train, | |||
REAL *& | trainTarget, | |||
int *& | trainLabel, | |||
REAL *& | test, | |||
REAL *& | testTarget, | |||
int *& | testLabel, | |||
uint & | nTrain, | |||
uint & | nTest, | |||
int & | nClass, | |||
int & | nDomain, | |||
int & | nFeat, | |||
REAL | positiveTarget = 1.0 , |
|||
REAL | negativeTarget = -1.0 | |||
) |
Reads the SATIMAGE dataset (UCI)
5254Bytes sat.doc 525830Bytes sat.trn 236745Bytes sat.tst
Definition at line 4018 of file DatasetReader.cpp.
04019 { 04020 cout<<"Read SATIMAGE from: "<<path<<endl; 04021 nDomain = 1; 04022 04023 // define data type and files 04024 int targetColumn = 37; 04025 uint nTrainTmp; 04026 char columnType[] = "nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnd"; 04027 char enabledCol[] = "1111111111111111111111111111111111111"; 04028 const char* dataFiles[] = { ( new string ( path+"/sat.trn" ) )->c_str(), ( new string ( path+"/sat.tst" ) )->c_str(),0}; 04029 04030 // === TRAIN SET === 04031 getDataBounds ( dataFiles, " ", nFeat, nClass, nTrain, columnType, enabledCol, targetColumn, 0 ); 04032 train = new REAL[nFeat*nTrain]; 04033 trainLabel = new int[nTrain]; 04034 getDataBounds ( dataFiles, " ", nFeat, nClass, nTrain, columnType, enabledCol, targetColumn, 0, true, train, trainLabel ); 04035 04036 // === TEST SET === 04037 getDataBounds ( dataFiles, " ", nFeat, nClass, nTest, columnType, enabledCol, targetColumn, 1 ); 04038 test = new REAL[nFeat*nTest]; 04039 testLabel = new int[nTest]; 04040 getDataBounds ( dataFiles, " ", nFeat, nClass, nTest, columnType, enabledCol, targetColumn, 1, true, test, testLabel ); 04041 04042 // make numerical test targets 04043 makeNumericTrainAndTestTargets ( nClass, nTrain, nTest, positiveTarget, negativeTarget, trainLabel, testLabel, trainTarget, testTarget ); 04044 04045 }
void DatasetReader::readSEGMENTATION | ( | string | path, | |
REAL *& | train, | |||
REAL *& | trainTarget, | |||
int *& | trainLabel, | |||
REAL *& | test, | |||
REAL *& | testTarget, | |||
int *& | testLabel, | |||
uint & | nTrain, | |||
uint & | nTest, | |||
int & | nClass, | |||
int & | nDomain, | |||
int & | nFeat, | |||
REAL | positiveTarget = 1.0 , |
|||
REAL | negativeTarget = -1.0 | |||
) |
Reads the SEGMENTATION dataset (UCI)
34481Bytes segmentation.data 2458Bytes segmentation.names 344723Bytes segmentation.test
Definition at line 4054 of file DatasetReader.cpp.
04055 { 04056 cout<<"Read SEGMENTATION from: "<<path<<endl; 04057 nDomain = 1; 04058 04059 // define data type and files 04060 int targetColumn = 1; 04061 uint nTrainTmp; 04062 char columnType[] = "dnnnnnnnnnnnnnnnnnnn"; 04063 char enabledCol[] = "11111111111111111111"; 04064 const char* dataFiles[] = { ( new string ( path+"/segmentation.data" ) )->c_str(), ( new string ( path+"/segmentation.test" ) )->c_str(),0}; 04065 04066 // === TRAIN SET === 04067 getDataBounds ( dataFiles, ",", nFeat, nClass, nTrain, columnType, enabledCol, targetColumn, 0 ); 04068 train = new REAL[nFeat*nTrain]; 04069 trainLabel = new int[nTrain]; 04070 getDataBounds ( dataFiles, ",", nFeat, nClass, nTrain, columnType, enabledCol, targetColumn, 0, true, train, trainLabel ); 04071 04072 // === TEST SET === 04073 getDataBounds ( dataFiles, ",", nFeat, nClass, nTest, columnType, enabledCol, targetColumn, 1 ); 04074 test = new REAL[nFeat*nTest]; 04075 testLabel = new int[nTest]; 04076 getDataBounds ( dataFiles, ",", nFeat, nClass, nTest, columnType, enabledCol, targetColumn, 1, true, test, testLabel ); 04077 04078 // make numerical test targets 04079 makeNumericTrainAndTestTargets ( nClass, nTrain, nTest, positiveTarget, negativeTarget, trainLabel, testLabel, trainTarget, testTarget ); 04080 04081 }
void DatasetReader::readSONAR | ( | string | path, | |
REAL *& | train, | |||
REAL *& | trainTarget, | |||
int *& | trainLabel, | |||
REAL *& | test, | |||
REAL *& | testTarget, | |||
int *& | testLabel, | |||
uint & | nTrain, | |||
uint & | nTest, | |||
int & | nClass, | |||
int & | nDomain, | |||
int & | nFeat, | |||
REAL | positiveTarget = 1.0 , |
|||
REAL | negativeTarget = -1.0 | |||
) |
Reads the SONAR dataset (UCI)
87776Bytes sonar.all-data 5872Bytes sonar.names
Definition at line 4089 of file DatasetReader.cpp.
04090 { 04091 cout<<"Read SONAR from: "<<path<<endl; 04092 nDomain = 1; 04093 04094 // define data type and files 04095 int targetColumn = 61; 04096 uint nTrainTmp; 04097 char columnType[] = "nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnd"; 04098 char enabledCol[] = "1111111111111111111111111111111111111111111111111111111111111"; 04099 const char* dataFiles[] = { ( new string ( path+"/sonar.all-data" ) )->c_str(),0}; 04100 04101 // === TRAIN SET === 04102 getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0 ); 04103 04104 // allocate tmp mem 04105 REAL* trainTmp = new REAL[nTrainTmp * nFeat]; 04106 int* trainLabelTmp = new int[nTrainTmp]; 04107 04108 // fill data 04109 getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0, true, trainTmp, trainLabelTmp ); 04110 04111 // split train and testset from trainTmp 04112 splitRandomTestset ( 0.2, trainTmp, trainLabelTmp, nTrainTmp, nFeat, nClass, train, trainLabel, trainTarget, test, testLabel, testTarget, nTrain, nTest, positiveTarget, negativeTarget ); 04113 04114 delete[] trainTmp; 04115 delete[] trainLabelTmp; 04116 04117 }
void DatasetReader::readSPIDER | ( | string | path, | |
REAL *& | train, | |||
REAL *& | trainTarget, | |||
int *& | trainLabel, | |||
REAL *& | test, | |||
REAL *& | testTarget, | |||
int *& | testLabel, | |||
uint & | nTrain, | |||
uint & | nTest, | |||
int & | nClass, | |||
int & | nDomain, | |||
int & | nFeat, | |||
REAL | positiveTarget = 1.0 , |
|||
REAL | negativeTarget = -1.0 | |||
) |
Reads the artificial dataset generated by spider matlab framework
10CATBytes monks-1.test 2947Bytes monks-1.train
Definition at line 4341 of file DatasetReader.cpp.
04342 { 04343 cout<<"Read SPIDER from: "<<path<<endl; 04344 nDomain = 1; 04345 nFeat = 3; 04346 nClass = 2; 04347 04348 // the faster version for read-in 04349 int bufLen = 1024 * 1024; 04350 char *buf = new char[bufLen]; 04351 04352 // trainset 04353 nTrain = 0; 04354 fstream f ( ( path+"/train.data" ).c_str(), ios::in ); 04355 while ( f.getline ( buf,bufLen ) ) 04356 nTrain++; 04357 f.close(); 04358 train = new REAL[3*nTrain]; 04359 trainTarget = new REAL[2*nTrain]; 04360 trainLabel = new int[nTrain]; 04361 04362 f.open ( ( path+"/train.data" ).c_str(), ios::in ); 04363 nTrain = 0; 04364 while ( f.getline ( buf,bufLen ) ) 04365 { 04366 sscanf ( buf,"%f %f %d",&train[3*nTrain],&train[3*nTrain+1],&trainLabel[nTrain] ); 04367 train[3*nTrain+2] = 1.0; 04368 if ( trainLabel[nTrain] > 0 ) 04369 { 04370 trainTarget[2*nTrain] = positiveTarget; 04371 trainTarget[2*nTrain+1] = negativeTarget; 04372 trainLabel[nTrain] = 0; 04373 } 04374 else 04375 { 04376 trainTarget[2*nTrain] = negativeTarget; 04377 trainTarget[2*nTrain+1] = positiveTarget; 04378 trainLabel[nTrain] = 1; 04379 } 04380 nTrain++; 04381 } 04382 f.close(); 04383 04384 // testset 04385 nTest = 0; 04386 f.open ( ( path+"/test.data" ).c_str(), ios::in ); 04387 while ( f.getline ( buf,bufLen ) ) 04388 nTest++; 04389 f.close(); 04390 test = new REAL[3*nTest]; 04391 testTarget = new REAL[2*nTest]; 04392 testLabel = new int[nTest]; 04393 04394 f.open ( ( path+"/test.data" ).c_str(), ios::in ); 04395 nTest = 0; 04396 while ( f.getline ( buf,bufLen ) ) 04397 { 04398 sscanf ( buf,"%f %f %d",&test[3*nTest],&test[3*nTest+1],&testLabel[nTest] ); 04399 test[3*nTest+2] = 1.0; 04400 if ( testLabel[nTrain] > 0 ) 04401 { 04402 testTarget[2*nTest] = positiveTarget; 04403 testTarget[2*nTest+1] = negativeTarget; 04404 testLabel[nTest] = 0; 04405 } 04406 else 04407 { 04408 testTarget[2*nTest] = negativeTarget; 04409 testTarget[2*nTest+1] = positiveTarget; 04410 testLabel[nTest] = 1; 04411 } 04412 nTest++; 04413 } 04414 f.close(); 
04415 04416 delete[] buf; 04417 04418 /* 04419 // define data type and files 04420 int targetColumn = 3; 04421 char columnType[] = "nnd"; 04422 char enabledCol[] = "111"; 04423 const char* dataFiles[] = {(new string(path+"/train.data"))->c_str(),(new string(path+"/test.data"))->c_str(),0}; 04424 04425 bool addConstantOne = true; 04426 04427 // === TRAIN SET === 04428 getDataBounds(dataFiles, " ", nFeat, nClass, nTrain, columnType, enabledCol, targetColumn, 0, false, 0, 0, addConstantOne); 04429 train = new REAL[nFeat*nTrain]; 04430 trainLabel = new int[nTrain]; 04431 getDataBounds(dataFiles, " ", nFeat, nClass, nTrain, columnType, enabledCol, targetColumn, 0, true, train, trainLabel, addConstantOne); 04432 04433 // === TEST SET === 04434 getDataBounds(dataFiles, " ", nFeat, nClass, nTest, columnType, enabledCol, targetColumn, 1, false, 0, 0, addConstantOne); 04435 test = new REAL[nFeat*nTest]; 04436 testLabel = new int[nTest]; 04437 getDataBounds(dataFiles, " ", nFeat, nClass, nTest, columnType, enabledCol, targetColumn, 1, true, test, testLabel, addConstantOne); 04438 04439 // make numerical test targets 04440 makeNumericTrainAndTestTargets(nClass, nTrain, nTest, positiveTarget, negativeTarget, trainLabel, testLabel, trainTarget, testTarget); 04441 */ 04442 04443 /* 04444 // simulate more domains 04445 nDomain = 3; 04446 int* trainLabelTmp = new int[nTrain*nDomain]; 04447 for(int i=0;i<nTrain;i++) 04448 for(int d=0;d<nDomain;d++) 04449 trainLabelTmp[i*nDomain + d] = trainLabel[i]; 04450 delete[] trainLabel; 04451 trainLabel = trainLabelTmp; 04452 04453 int* testLabelTmp = new int[nTest*nDomain]; 04454 for(int i=0;i<nTest;i++) 04455 for(int d=0;d<nDomain;d++) 04456 testLabelTmp[i*nDomain + d] = testLabel[i]; 04457 delete[] testLabel; 04458 testLabel = testLabelTmp; 04459 04460 // train targets 04461 trainTarget = new REAL[nClass*nDomain*nTrain]; 04462 for(int i=0;i<nTrain;i++) 04463 { 04464 for(int d=0;d<nDomain;d++) 04465 { 04466 for(int j=0;j<nClass;j++) 04467 
trainTarget[i*nClass*nDomain + d*nClass + j] = d==1?positiveTarget:negativeTarget; // negative class labels 04468 trainTarget[i*nClass*nDomain + d*nClass + trainLabel[i*nDomain + d]] = d==1?negativeTarget:positiveTarget; // positive class label 04469 } 04470 } 04471 04472 // test targets 04473 testTarget = new REAL[nClass*nDomain*nTest]; 04474 for(int i=0;i<nTest;i++) 04475 { 04476 for(int d=0;d<nDomain;d++) 04477 { 04478 for(int j=0;j<nClass;j++) 04479 testTarget[i*nClass*nDomain + d*nClass + j] = d==1?positiveTarget:negativeTarget; // negative class labels 04480 testTarget[i*nClass*nDomain + d*nClass + testLabel[i*nDomain + d]] = d==1?negativeTarget:positiveTarget; // positive class label 04481 } 04482 } 04483 */ 04484 }
void DatasetReader::readSURVIVAL | ( | string | path, | |
REAL *& | train, | |||
REAL *& | trainTarget, | |||
int *& | trainLabel, | |||
REAL *& | test, | |||
REAL *& | testTarget, | |||
int *& | testLabel, | |||
uint & | nTrain, | |||
uint & | nTest, | |||
int & | nClass, | |||
int & | nDomain, | |||
int & | nFeat, | |||
REAL | positiveTarget = 1.0 , |
|||
REAL | negativeTarget = -1.0 | |||
) |
Reads the SURVIVAL dataset (UCI)
Files: haberman.data (3103 bytes), haberman.names (1368 bytes).
Definition at line 4305 of file DatasetReader.cpp.
04306 { 04307 cout<<"Read SURVIVAL from: "<<path<<endl; 04308 nDomain = 1; 04309 04310 // define data type and files 04311 int targetColumn = 4; 04312 uint nTrainTmp; 04313 char columnType[] = "nnnd"; 04314 char enabledCol[] = "1111"; 04315 const char* dataFiles[] = { ( new string ( path+"/haberman.data" ) )->c_str(),0}; 04316 04317 // === TRAIN SET === 04318 getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0 ); 04319 04320 // allocate tmp mem 04321 REAL* trainTmp = new REAL[nTrainTmp * nFeat]; 04322 int* trainLabelTmp = new int[nTrainTmp]; 04323 04324 // fill data 04325 getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0, true, trainTmp, trainLabelTmp ); 04326 04327 // split train and testset from trainTmp 04328 splitRandomTestset ( 0.2, trainTmp, trainLabelTmp, nTrainTmp, nFeat, nClass, train, trainLabel, trainTarget, test, testLabel, testTarget, nTrain, nTest, positiveTarget, negativeTarget ); 04329 04330 delete[] trainTmp; 04331 delete[] trainLabelTmp; 04332 04333 }
void DatasetReader::readVEHICLE | ( | string | path, | |
REAL *& | train, | |||
REAL *& | trainTarget, | |||
int *& | trainLabel, | |||
REAL *& | test, | |||
REAL *& | testTarget, | |||
int *& | testLabel, | |||
uint & | nTrain, | |||
uint & | nTest, | |||
int & | nClass, | |||
int & | nDomain, | |||
int & | nFeat, | |||
REAL | positiveTarget = 1.0 , |
|||
REAL | negativeTarget = -1.0 | |||
) |
Reads the VEHICLE dataset (UCI)
Files: train.data (55517 bytes), vehicle.doc (6386 bytes).
Definition at line 4126 of file DatasetReader.cpp.
04127 { 04128 cout<<"Read VEHICLE from: "<<path<<endl; 04129 nDomain = 1; 04130 04131 // define data type and files 04132 int targetColumn = 19; 04133 uint nTrainTmp; 04134 char columnType[] = "nnnnnnnnnnnnnnnnnnd"; 04135 char enabledCol[] = "1111111111111111111"; 04136 const char* dataFiles[] = { ( new string ( path+"/train.data" ) )->c_str(),0}; 04137 04138 // === TRAIN SET === 04139 getDataBounds ( dataFiles, " ", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0 ); 04140 04141 // allocate tmp mem 04142 REAL* trainTmp = new REAL[nTrainTmp * nFeat]; 04143 int* trainLabelTmp = new int[nTrainTmp]; 04144 04145 // fill data 04146 getDataBounds ( dataFiles, " ", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0, true, trainTmp, trainLabelTmp ); 04147 04148 // split train and testset from trainTmp 04149 splitRandomTestset ( 0.2, trainTmp, trainLabelTmp, nTrainTmp, nFeat, nClass, train, trainLabel, trainTarget, test, testLabel, testTarget, nTrain, nTest, positiveTarget, negativeTarget ); 04150 04151 delete[] trainTmp; 04152 delete[] trainLabelTmp; 04153 04154 }
void DatasetReader::readVOTES | ( | string | path, | |
REAL *& | train, | |||
REAL *& | trainTarget, | |||
int *& | trainLabel, | |||
REAL *& | test, | |||
REAL *& | testTarget, | |||
int *& | testLabel, | |||
uint & | nTrain, | |||
uint & | nTest, | |||
int & | nClass, | |||
int & | nDomain, | |||
int & | nFeat, | |||
REAL | positiveTarget = 1.0 , |
|||
REAL | negativeTarget = -1.0 | |||
) |
Reads the VOTES dataset (UCI)
Files: house-votes-84.data (18171 bytes), house-votes-84.names (6868 bytes).
Definition at line 4162 of file DatasetReader.cpp.
04163 { 04164 cout<<"Read VOTES from: "<<path<<endl; 04165 nDomain = 1; 04166 04167 // define data type and files 04168 int targetColumn = 1; 04169 uint nTrainTmp; 04170 char columnType[] = "ddddddddddddddddd"; 04171 char enabledCol[] = "11111111111111111"; 04172 const char* dataFiles[] = { ( new string ( path+"/house-votes-84.data" ) )->c_str(),0}; 04173 04174 // === TRAIN SET === 04175 getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0 ); 04176 04177 // allocate tmp mem 04178 REAL* trainTmp = new REAL[nTrainTmp * nFeat]; 04179 int* trainLabelTmp = new int[nTrainTmp]; 04180 04181 // fill data 04182 getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0, true, trainTmp, trainLabelTmp ); 04183 04184 // split train and testset from trainTmp 04185 splitRandomTestset ( 0.2, trainTmp, trainLabelTmp, nTrainTmp, nFeat, nClass, train, trainLabel, trainTarget, test, testLabel, testTarget, nTrain, nTest, positiveTarget, negativeTarget ); 04186 04187 delete[] trainTmp; 04188 delete[] trainLabelTmp; 04189 04190 }
void DatasetReader::readWINE | ( | string | path, | |
REAL *& | train, | |||
REAL *& | trainTarget, | |||
int *& | trainLabel, | |||
REAL *& | test, | |||
REAL *& | testTarget, | |||
int *& | testLabel, | |||
uint & | nTrain, | |||
uint & | nTest, | |||
int & | nClass, | |||
int & | nDomain, | |||
int & | nFeat, | |||
REAL | positiveTarget = 1.0 , |
|||
REAL | negativeTarget = -1.0 | |||
) |
Reads the WINE dataset (UCI)
Files: wine.data (10782 bytes), wine.names (3036 bytes).
Definition at line 4198 of file DatasetReader.cpp.
04199 { 04200 cout<<"Read WINE from: "<<path<<endl; 04201 nDomain = 1; 04202 04203 // define data type and files 04204 int targetColumn = 1; 04205 uint nTrainTmp; 04206 char columnType[] = "dnnnnnnnnnnnnn"; 04207 char enabledCol[] = "11111111111111"; 04208 const char* dataFiles[] = { ( new string ( path+"/wine.data" ) )->c_str(),0}; 04209 04210 // === TRAIN SET === 04211 getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0 ); 04212 04213 // allocate tmp mem 04214 REAL* trainTmp = new REAL[nTrainTmp * nFeat]; 04215 int* trainLabelTmp = new int[nTrainTmp]; 04216 04217 // fill data 04218 getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0, true, trainTmp, trainLabelTmp ); 04219 04220 // split train and testset from trainTmp 04221 splitRandomTestset ( 0.2, trainTmp, trainLabelTmp, nTrainTmp, nFeat, nClass, train, trainLabel, trainTarget, test, testLabel, testTarget, nTrain, nTest, positiveTarget, negativeTarget ); 04222 04223 delete[] trainTmp; 04224 delete[] trainLabelTmp; 04225 04226 }
void DatasetReader::readYEAST | ( | string | path, | |
REAL *& | train, | |||
REAL *& | trainTarget, | |||
int *& | trainLabel, | |||
REAL *& | test, | |||
REAL *& | testTarget, | |||
int *& | testLabel, | |||
uint & | nTrain, | |||
uint & | nTest, | |||
int & | nClass, | |||
int & | nDomain, | |||
int & | nFeat, | |||
REAL | positiveTarget = 1.0 , |
|||
REAL | negativeTarget = -1.0 | |||
) |
Reads the YEAST dataset (UCI)
Files: yeast.data (94976 bytes), yeast.names (3313 bytes).
Definition at line 4269 of file DatasetReader.cpp.
04270 { 04271 cout<<"Read YEAST from: "<<path<<endl; 04272 nDomain = 1; 04273 04274 // define data type and files 04275 int targetColumn = 10; 04276 uint nTrainTmp; 04277 char columnType[] = "dnnnnnnnnd"; 04278 char enabledCol[] = "0111111111"; 04279 const char* dataFiles[] = { ( new string ( path+"/yeast.data" ) )->c_str(),0}; 04280 04281 // === TRAIN SET === 04282 getDataBounds ( dataFiles, " ", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0 ); 04283 04284 // allocate tmp mem 04285 REAL* trainTmp = new REAL[nTrainTmp * nFeat]; 04286 int* trainLabelTmp = new int[nTrainTmp]; 04287 04288 // fill data 04289 getDataBounds ( dataFiles, " ", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0, true, trainTmp, trainLabelTmp ); 04290 04291 // split train and testset from trainTmp 04292 splitRandomTestset ( 0.2, trainTmp, trainLabelTmp, nTrainTmp, nFeat, nClass, train, trainLabel, trainTarget, test, testLabel, testTarget, nTrain, nTest, positiveTarget, negativeTarget ); 04293 04294 delete[] trainTmp; 04295 delete[] trainLabelTmp; 04296 04297 }
void DatasetReader::splitRandomTestset | ( | REAL | percentTest, | |
REAL * | data, | |||
int * | labels, | |||
int | nData, | |||
int | nFeat, | |||
int | nClass, | |||
REAL *& | train, | |||
int *& | trainLabel, | |||
REAL *& | trainTarget, | |||
REAL *& | test, | |||
int *& | testLabel, | |||
REAL *& | testTarget, | |||
uint & | nTrain, | |||
uint & | nTest, | |||
REAL | positiveTarget, | |||
REAL | negativeTarget, | |||
bool | noRandom = false | |||
) |
Splits the data into a random training set and test set.
Definition at line 4811 of file DatasetReader.cpp.
04812 { 04813 // split the train and test set 04814 if ( noRandom ) 04815 cout<<"take the last percentTest:"<<100.0*percentTest<<"[%]"<<endl; 04816 else 04817 cout<<"random percentTest:"<<100.0*percentTest<<"[%]"<<endl; 04818 04819 // set train and test bounds 04820 nTrain = 0; 04821 nTest = 0; 04822 srand ( getRandomSeed() ); 04823 for ( int i=0;i<nData;i++ ) 04824 { 04825 REAL r = ( double ) rand() / ( double ) RAND_MAX; 04826 if ( noRandom ) // take the last x as testset 04827 r = ( double ) i/ ( double ) nData< ( 1.0 - percentTest ) ?1.0:0.0; 04828 if ( r < percentTest ) 04829 nTest++; 04830 else 04831 nTrain++; 04832 } 04833 cout<<"nTrain:"<<nTrain<<endl; 04834 cout<<"nTest:"<<nTest<<endl; 04835 04836 // allocate mem 04837 train = new REAL[nTrain * nFeat]; 04838 trainLabel = new int[nTrain]; 04839 test = new REAL[nTest * nFeat]; 04840 testLabel = new int[nTest]; 04841 04842 // fill train and test set 04843 nTrain = 0; 04844 nTest = 0; 04845 srand ( getRandomSeed() ); 04846 for ( int i=0;i<nData;i++ ) 04847 { 04848 REAL r = ( double ) rand() / ( double ) RAND_MAX; 04849 if ( noRandom ) // take the last x as testset 04850 r = ( double ) i/ ( double ) nData< ( 1.0 - percentTest ) ?1.0:0.0; 04851 if ( r < percentTest ) 04852 { 04853 for ( int j=0;j<nFeat;j++ ) 04854 test[nTest*nFeat + j] = data[i*nFeat + j]; 04855 testLabel[nTest] = labels[i]; 04856 nTest++; 04857 } 04858 else 04859 { 04860 for ( int j=0;j<nFeat;j++ ) 04861 train[nTrain*nFeat + j] = data[i*nFeat + j]; 04862 trainLabel[nTrain] = labels[i]; 04863 nTrain++; 04864 } 04865 } 04866 04867 // check for NANs or INFs or too large numbers 04868 for ( int i=0;i<nTrain*nFeat;i++ ) 04869 if ( isnan ( train[i] ) || isinf ( train[i] ) || train[i]>1e10 || train[i]<-1e10 ) 04870 { 04871 cout<<"train["<<i<<"]:"<<train[i]<<endl; 04872 assert ( false ); 04873 } 04874 04875 for ( int i=0;i<nTest*nFeat;i++ ) 04876 if ( isnan ( test[i] ) || isinf ( test[i] ) || test[i]>1e10 || test[i]<-1e10 ) 04877 { 04878 
cout<<"test["<<i<<"]:"<<test[i]<<endl; 04879 assert ( false ); 04880 } 04881 04882 makeNumericTrainAndTestTargets ( nClass, nTrain, nTest, positiveTarget, negativeTarget, trainLabel, testLabel, trainTarget, testTarget ); 04883 }