00001 #include "DatasetReader.h"
00002
00003 extern StreamOutput cout;
00004
00008 DatasetReader::DatasetReader()
00009 {
00010 cout<<"DatasetReader"<<endl;
00011 }
00012
00016 DatasetReader::~DatasetReader()
00017 {
00018 cout<<"descructor DatasetReader"<<endl;
00019 }
00020
00024 void DatasetReader::readKDDCup09LargeBin ( string path, REAL* &train, REAL* &trainTarget, int* &trainLabel, REAL* &test, REAL* &testTarget, int* &testLabel, uint& nTrain, uint& nTest, int& nClass, int& nDomain, int& nFeat, REAL positiveTarget, REAL negativeTarget )
00025 {
00026 nDomain = 3;
00027 char* targetFiles[] =
00028 {
00029 "orange_large_train_churn.labels",
00030 "orange_large_train_appetency.labels",
00031 "orange_large_train_upselling.labels"
00032 };
00033
00034 nTrain = 50000;
00035 nFeat = 113;
00036
00037 char buf[512];
00038 if ( Framework::getFrameworkMode() == 1 )
00039 {
00040
00041
00042
00043 sprintf ( buf,"featureSelection_all_test_%d_features.dat",nFeat );
00044 }
00045 else
00046 {
00047
00048
00049
00050 sprintf ( buf,"featureSelection_all_train_%d_features.dat",nFeat );
00051 }
00052 cout<<"Open:"<<buf<<endl;
00053 train = new REAL[nTrain * nFeat];
00054 fstream f ( buf,ios::in );
00055 if ( f.is_open() == false )
00056 assert ( false );
00057 f.read ( ( char* ) train, sizeof ( REAL ) *nTrain*nFeat );
00058 f.close();
00059
00060
00061 nClass = 2;
00062 trainTarget = new REAL[nTrain*nClass*nDomain];
00063 trainLabel = new int[nTrain*nDomain];
00064 char buf0[512];
00065 for ( int d=0;d<nDomain;d++ )
00066 {
00067 sprintf ( buf0,"%s/%s",path.c_str(),targetFiles[d] );
00068 fstream f;
00069 cout<<"Open targets:"<<buf0<<endl;
00070 f.open ( buf0,ios::in );
00071 if ( f.is_open() == false )
00072 assert ( false );
00073 int label;
00074 for ( int i=0;i<nTrain;i++ )
00075 {
00076 f>>label;
00077 if ( label==-1 )
00078 {
00079 trainTarget[i*nClass*nDomain + d*nClass + 0] = positiveTarget;
00080 trainTarget[i*nClass*nDomain + d*nClass + 1] = negativeTarget;
00081 trainLabel[i*nDomain + d] = 0;
00082 }
00083 else if ( label==1 )
00084 {
00085 trainTarget[i*nClass*nDomain + d*nClass + 0] = negativeTarget;
00086 trainTarget[i*nClass*nDomain + d*nClass + 1] = positiveTarget;
00087 trainLabel[i*nDomain + d] = 1;
00088 }
00089 else
00090 assert ( false );
00091 }
00092 f.close();
00093 }
00094
00095
00096 nTest = 0;
00097 test = 0;
00098 testTarget = 0;
00099 testLabel = 0;
00100
00101 if ( Framework::getFrameworkMode() == 1 )
00102 {
00103 cout<<endl<<"Set test data (and set train data to 0)"<<endl<<endl;
00104 test = train;
00105 train = 0;
00106 nTest = nTrain;
00107 nTrain = 0;
00108 testTarget = trainTarget;
00109 trainTarget = 0;
00110 testLabel = trainLabel;
00111 trainLabel = 0;
00112 }
00113
00114 }
00115
00120 void DatasetReader::readKDDCup09Large ( string path, REAL* &train, REAL* &trainTarget, int* &trainLabel, REAL* &test, REAL* &testTarget, int* &testLabel, uint& nTrain, uint& nTest, int& nClass, int& nDomain, int& nFeat, REAL positiveTarget, REAL negativeTarget )
00121 {
00122 if ( 0 )
00123 {
00124 readKDDCup09LargeBin ( path, train, trainTarget, trainLabel, test, testTarget, testLabel, nTrain, nTest, nClass, nDomain, nFeat, positiveTarget, negativeTarget );
00125 return;
00126 }
00127
00128 time_t t0 = time ( 0 );
00129
00130 nDomain = 3;
00131
00132 cout<<"Read KDDCup09 from: "<<path<<endl;
00133
00134
00135
00136
00137 char* targetFiles[] =
00138 {
00139 "orange_large_train_churn.labels",
00140 "orange_large_train_appetency.labels",
00141 "orange_large_train_upselling.labels"
00142 };
00143
00144 int nPreAlloc = 100000000;
00145 char *buf0 = new char[512], *buf1 = new char[512];
00146 char* lineBuf = new char[nPreAlloc];
00147
00148
00149
00150 int NUM = 14740, CAT = 260, NLINES = 50000;
00151 int nFiles = 5;
00152 bool setNumZerosToMeans = false;
00153 bool setMissingToMeans = false;
00154 int numericMinMissing = 1;
00155 int numericMaxCluster = 0;
00156 int minAttributeOccurenceCategorical = 50*nFiles;
00157 int minAttributeOccurenceNumerical = 500*nFiles;
00158 REAL maxSTD = 1e10;
00159 cout<<"nFiles:"<<nFiles<<" minAttrOccurCat:"<<minAttributeOccurenceCategorical<<" minAttrOccurNum:"<<minAttributeOccurenceNumerical<<endl;
00160 cout<<setNumZerosToMeans<<" "<<setMissingToMeans<<" "<<numericMaxCluster<<" "<<minAttributeOccurenceCategorical<<" "<<minAttributeOccurenceNumerical<<" "<<maxSTD<<endl;
00161
00162 vector<string>* numericalAttributes = new vector<string>[NUM];
00163 vector<int>* numericalAttributesCnt = new vector<int>[NUM];
00164 vector<string>* categoricalAttributes = new vector<string>[CAT];
00165 vector<int>* categoricalAttributesCnt = new vector<int>[CAT];
00166 bool* categoricalHasMissingBin = new bool[CAT];
00167 int* categoricalMissingCnt = new int[CAT];
00168 bool* categoricalHasUnknownBin = new bool[CAT];
00169 for ( int i=0;i<CAT;i++ )
00170 {
00171 categoricalHasMissingBin[i] = false;
00172 categoricalHasUnknownBin[i] = false;
00173 categoricalMissingCnt[i] = 0;
00174 }
00175 int* numericNonZeroCnt = new int[NUM];
00176 int* numericMissingCnt = new int[NUM];
00177 bool* numericHasMissingBin = new bool[NUM];
00178 double* numericNonZeroPercent = new double[NUM];
00179 for ( int i=0;i<NUM;i++ )
00180 {
00181 numericMissingCnt[i] = 0;
00182 numericNonZeroCnt[i] = 0;
00183 numericNonZeroPercent[i] = 0.0;
00184 numericHasMissingBin[i] = false;
00185 }
00186
00187 double* minValues = new double[100000];
00188 double* maxValues = new double[100000];
00189 double* maxNormValues = new double[100000];
00190 double* meanValues = new double[100000];
00191 double* stdValues = new double[100000];
00192 double* mean2Values = new double[100000];
00193 int* meanCnt = new int[100000];
00194 for ( int i=0;i<100000;i++ )
00195 {
00196 minValues[i] = 1e20;
00197 maxValues[i] = -1e20;
00198 maxNormValues[i] = 0.0;
00199 meanValues[i] = 0.0;
00200 mean2Values[i] = 0.0;
00201 meanCnt[i] = 0;
00202 stdValues[i] = 0.0;
00203 }
00204
00205
00206
00207
00208
00209
00210
00211 for ( int state=0;state<2;state++ )
00212 {
00213 int nTrainFill = 0;
00214 if ( state == 0 )
00215 {
00216 nTrain = 0;
00217 }
00218
00219
00220
00221
00222
00223 for ( int file=0;file<nFiles;file++ )
00224 {
00225
00226 if ( state == 0 )
00227 sprintf ( buf0,"%s/orange_large_train.data.chunk%d",path.c_str(), file+1 );
00228 else
00229 {
00230 if ( Framework::getFrameworkMode() == 1 )
00231 sprintf ( buf0,"%s/orange_large_test.data.chunk%d",path.c_str(), file+1 );
00232 else
00233 sprintf ( buf0,"%s/orange_large_train.data.chunk%d",path.c_str(), file+1 );
00234 }
00235
00236 cout<<"Open:"<<buf0<<endl;
00237 fstream f;
00238 f.open ( buf0, ios::in );
00239 if ( f.is_open() == false )
00240 assert ( false );
00241
00242
00243 if ( file==0 )
00244 f.getline ( lineBuf, nPreAlloc );
00245
00246
00247 double zeroRatio = 0.0;
00248 double sparse = 0.0;
00249 int nTrainTmp = 0;
00250
00251
00252
00253
00254
00255 while ( f.getline ( lineBuf, nPreAlloc ) )
00256 {
00257 if ( nTrainTmp%1000 == 0 )
00258 cout<<"."<<flush;
00259
00260
00261 int pos0 = 0, pos1 = 0;
00262 int nF = 0, nMissing = 0, nZeros = 0;
00263 int nFeatFill = 0;
00264 int nrHot = 0;
00265 double value;
00266
00267 if ( state == 1 )
00268 {
00269
00270 train[nTrainFill*nFeat + nFeatFill] = 1.0;
00271 nFeatFill++;
00272 }
00273
00274
00275
00276
00277
00278 while ( lineBuf[pos1] )
00279 {
00280
00281 while ( lineBuf[pos1] != '\t' && lineBuf[pos1] != 0 )
00282 pos1++;
00283
00284
00285
00286
00287
00288
00289 if ( pos1 > pos0 && lineBuf[pos1]!=0 )
00290 {
00291
00292 if ( pos1-pos0 <=0 || pos1-pos0 >= 512 )
00293 assert ( false );
00294 for ( int j=0;j<pos1-pos0;j++ )
00295 buf1[j] = lineBuf[pos0+j];
00296 buf1[pos1-pos0] = 0;
00297
00298
00299
00300
00301
00302
00303 if ( nF < NUM )
00304 {
00305 if ( ( buf1[0]>='0' && buf1[0] <='9' ) || buf1[0]=='-' )
00306 ;
00307 else
00308 {
00309 cout<<"BUF:"<<buf1<<endl;
00310 assert ( false );
00311 }
00312
00313
00314 value = atof ( buf1 );
00315
00316 if ( value == 0.0 )
00317 nZeros++;
00318
00319
00320 if ( state==0 )
00321 {
00322 if ( minValues[nF] > value )
00323 minValues[nF] = value;
00324 if ( maxValues[nF] < value )
00325 maxValues[nF] = value;
00326
00327
00328 int size = numericalAttributes[nF].size();
00329 if ( size < numericMaxCluster )
00330 {
00331 int foundIndex = -1;
00332 for ( int j=0;j<size;j++ )
00333 if ( numericalAttributes[nF][j] == buf1 )
00334 {
00335 foundIndex = j;
00336 break;
00337 }
00338
00339 if ( foundIndex == -1 )
00340 {
00341 numericalAttributes[nF].push_back ( buf1 );
00342 numericalAttributesCnt[nF].push_back ( 1 );
00343 }
00344 else
00345 numericalAttributesCnt[nF][foundIndex]++;
00346 }
00347
00348 if ( value != 0.0 )
00349 {
00350 numericNonZeroCnt[nF]++;
00351 if ( numericNonZeroCnt[nF] > nTrain+nTrainTmp+1 )
00352 {
00353 cout<<"numericNonZeroCnt[nF]:"<<numericNonZeroCnt[nF]<<" nF:"<<nF<<" nTrainTmp:"<<nTrainTmp<<" nZeros:"<<nZeros<<" pos0:"<<pos0<<" pos1:"<<pos1<<endl;
00354 assert ( false );
00355 }
00356 }
00357
00358 if ( value != 0.0 )
00359 {
00360
00361 meanValues[nF] += value;
00362 mean2Values[nF] += value * value;
00363 meanCnt[nF]++;
00364 }
00365 }
00366 else if ( state==1 )
00367 {
00368 if ( numericNonZeroCnt[nF] >= minAttributeOccurenceNumerical && maxNormValues[nF] < stdValues[nF]*maxSTD )
00369 {
00370
00371 if ( value == 0.0 && setNumZerosToMeans )
00372 train[nTrainFill*nFeat + nFeatFill] = meanValues[nF];
00373 else
00374 train[nTrainFill*nFeat + nFeatFill] = value;
00375 nFeatFill++;
00376
00377
00378 int size = numericalAttributes[nF].size();
00379 if ( size < numericMaxCluster && size > 1 )
00380 {
00381 int foundIndex = -1;
00382 for ( int j=0;j<size;j++ )
00383 if ( numericalAttributes[nF][j] == buf1 )
00384 {
00385 foundIndex = j;
00386 break;
00387 }
00388
00389 int beforeHot = nrHot;
00390 for ( int j=0;j<size;j++ )
00391 {
00392 if ( foundIndex == j )
00393 {
00394 train[nTrainFill*nFeat + nFeatFill] = 1.0;
00395 nrHot++;
00396 }
00397 else
00398 train[nTrainFill*nFeat + nFeatFill] = 0.0;
00399 nFeatFill++;
00400 }
00401
00402
00403
00404
00405
00406
00407 }
00408
00409 }
00410
00411
00412 if ( numericHasMissingBin[nF] )
00413 {
00414 train[nTrainFill*nFeat + nFeatFill] = 0.0;
00415 nFeatFill++;
00416 train[nTrainFill*nFeat + nFeatFill] = 1.0;
00417 nFeatFill++;
00418 }
00419 }
00420 }
00421
00422
00423
00424
00425 else
00426 {
00427 int index = nF-NUM;
00428 if ( index >= CAT )
00429 assert ( false );
00430 int size = categoricalAttributes[index].size();
00431 int sizeCnt = categoricalAttributesCnt[index].size();
00432 if ( size != sizeCnt )
00433 assert ( false );
00434
00435 int foundIndex = -1;
00436 for ( int j=0;j<size;j++ )
00437 if ( categoricalAttributes[index][j] == buf1 )
00438 {
00439 foundIndex = j;
00440 break;
00441 }
00442
00443
00444 if ( state==0 )
00445 {
00446
00447 if ( foundIndex == -1 )
00448 {
00449 categoricalAttributes[index].push_back ( buf1 );
00450 categoricalAttributesCnt[index].push_back ( 1 );
00451 }
00452 else
00453 categoricalAttributesCnt[index][foundIndex]++;
00454 }
00455 else if ( state==1 )
00456 {
00457
00458 int fillCnt = 0;
00459 int beforeHot = nrHot;
00460 for ( int j=0;j<size;j++ )
00461 {
00462 if ( categoricalAttributesCnt[index][j] >= minAttributeOccurenceCategorical && categoricalAttributesCnt[index][j] < nTrain )
00463 {
00464 if ( foundIndex == j )
00465 {
00466 train[nTrainFill*nFeat + nFeatFill] = 1.0;
00467 nrHot++;
00468 }
00469 else
00470 train[nTrainFill*nFeat + nFeatFill] = 0.0;
00471 fillCnt++;
00472 nFeatFill++;
00473 }
00474 }
00475
00476
00477 if ( categoricalHasMissingBin[index] )
00478 {
00479 train[nTrainFill*nFeat + nFeatFill] = 0.0;
00480 fillCnt++;
00481 nFeatFill++;
00482 }
00483
00484
00485 if ( categoricalHasUnknownBin[index] )
00486 {
00487 if ( beforeHot == nrHot )
00488 {
00489 train[nTrainFill*nFeat + nFeatFill] = 1.0;
00490 nrHot++;
00491 }
00492 else
00493 train[nTrainFill*nFeat + nFeatFill] = 0.0;
00494 fillCnt++;
00495 nFeatFill++;
00496 }
00497
00498 if ( nrHot != beforeHot + 1 && fillCnt > 0 )
00499 {
00500 cout<<"WARNING: foundIndex:"<<foundIndex<<" "<<size<<" "<<index<<" "<<nrHot<<" "<<beforeHot<<" fill:"<<fillCnt<<endl;
00501
00502 }
00503 }
00504 }
00505 }
00506
00507
00508
00509
00510
00511 else
00512 {
00513 nMissing++;
00514
00515 if ( state==0 )
00516 {
00517
00518 if ( nF < NUM )
00519 {
00520 numericMissingCnt[nF]++;
00521 }
00522
00523 if ( nF >= NUM )
00524 {
00525 int index = nF-NUM;
00526 categoricalMissingCnt[index]++;
00527 }
00528 }
00529
00530
00531 if ( state==1 )
00532 {
00533
00534
00535
00536
00537 if ( nF < NUM )
00538 {
00539 if ( numericNonZeroCnt[nF] >= minAttributeOccurenceNumerical && maxNormValues[nF] < stdValues[nF]*maxSTD )
00540 {
00541
00542 if ( setMissingToMeans )
00543 train[nTrainFill*nFeat + nFeatFill] = meanValues[nF];
00544 else
00545 train[nTrainFill*nFeat + nFeatFill] = 0.0;
00546 nFeatFill++;
00547
00548
00549 int size = numericalAttributes[nF].size();
00550 if ( size < numericMaxCluster && size > 1 )
00551 {
00552
00553 for ( int j=0;j<size;j++ )
00554 {
00555 train[nTrainFill*nFeat + nFeatFill] = 0.0;
00556 nFeatFill++;
00557 }
00558
00559
00560
00561 }
00562 }
00563
00564
00565 if ( numericHasMissingBin[nF] )
00566 {
00567 train[nTrainFill*nFeat + nFeatFill] = 1.0;
00568 nFeatFill++;
00569 train[nTrainFill*nFeat + nFeatFill] = 0.0;
00570 nFeatFill++;
00571 }
00572 }
00573
00574
00575
00576
00577 else
00578 {
00579 int index = nF - NUM;
00580 if ( index >= CAT )
00581 assert ( false );
00582 int size = categoricalAttributes[index].size();
00583 int sizeCnt = categoricalAttributesCnt[index].size();
00584 if ( size != sizeCnt )
00585 assert ( false );
00586
00587
00588 int fillCnt = 0;
00589 int beforeHot = nrHot;
00590 for ( int j=0;j<size;j++ )
00591 {
00592 if ( categoricalAttributesCnt[index][j] >= minAttributeOccurenceCategorical && categoricalAttributesCnt[index][j] < nTrain )
00593 {
00594 train[nTrainFill*nFeat + nFeatFill] = 0.0;
00595 fillCnt++;
00596 nFeatFill++;
00597 }
00598 }
00599 if ( categoricalHasMissingBin[index] )
00600 {
00601 if ( fillCnt == 0 && categoricalHasUnknownBin[index] == false )
00602 {
00603 cout<<"categoricalMissingCnt["<<index<<"]:"<<categoricalMissingCnt[index]<<endl;
00604 assert ( false );
00605 }
00606
00607 train[nTrainFill*nFeat + nFeatFill] = 1.0;
00608 nrHot++;
00609 fillCnt++;
00610 nFeatFill++;
00611 }
00612
00613 if ( categoricalHasUnknownBin[index] )
00614 {
00615
00616 train[nTrainFill*nFeat + nFeatFill] = 0.0;
00617 fillCnt++;
00618 nFeatFill++;
00619 }
00620
00621 if ( nrHot != beforeHot + 1 && fillCnt > 0 )
00622 {
00623 cout<<"WARNING: "<<size<<" "<<index<<" "<<nrHot<<" "<<beforeHot<<" fill:"<<fillCnt<<endl;
00624
00625 }
00626 }
00627 }
00628 }
00629
00630
00631 if ( lineBuf[pos1]!=0 )
00632 pos1++;
00633
00634
00635 pos0 = pos1;
00636
00637
00638 nF++;
00639 }
00640
00641
00642 if ( nF != NUM + CAT )
00643 assert ( false );
00644 if ( state==1 )
00645 {
00646 if ( nFeatFill != nFeat )
00647 {
00648 cout<<"nFeatFill:"<<nFeatFill<<" nFeat:"<<nFeat<<endl;
00649 assert ( false );
00650 }
00651 nTrainFill++;
00652 }
00653
00654 nTrainTmp++;
00655
00656 sparse += nMissing / ( double ) nF;
00657 zeroRatio += nZeros / ( double ) nF;
00658 }
00659
00660 f.close();
00661
00662
00663 sparse /= ( double ) nTrainTmp;
00664 zeroRatio /= ( double ) nTrainTmp;
00665 cout<<"nTrainTmp:"<<nTrainTmp<<endl;
00666 cout<<"missing values:"<<100.0*sparse<<"%"<<endl;
00667 cout<<"zero values:"<<100.0*zeroRatio<<"%"<<endl;
00668
00669 double min0 = 1e20, max0 = -1e20;
00670 for ( int i=0;i<100000;i++ )
00671 {
00672 if ( min0 > minValues[i] )
00673 min0 = minValues[i];
00674 if ( max0 < maxValues[i] )
00675 max0 = maxValues[i];
00676 }
00677 cout<<"min|max values: "<<min0<<"|"<<max0<<endl;
00678
00679 int sum = 0;
00680 for ( int j=0;j<CAT;j++ )
00681 sum += categoricalAttributes[j].size();
00682 cout<<"nCategoricalSum:"<<sum<<endl;
00683
00684 if ( state == 0 )
00685 nTrain += nTrainTmp;
00686
00687 }
00688
00689
00690 if ( state == 1 )
00691 {
00692 if ( nTrain != nTrainFill )
00693 assert ( false );
00694
00695 for ( int i=0;i<nTrain*nFeat;i++ )
00696 if ( train[i] == 1e10 )
00697 {
00698 cout<<"i:"<<i<<endl;
00699 assert ( false );
00700 }
00701 }
00702
00703 if ( state==0 )
00704 {
00705 for ( int i=0;i<NUM;i++ )
00706 numericNonZeroPercent[i] = ( double ) numericNonZeroCnt[i]/ ( double ) nTrain;
00707 for ( int i=0;i<100000;i++ )
00708 if ( meanCnt[i] > 0 )
00709 {
00710 meanValues[i] /= ( double ) meanCnt[i];
00711 stdValues[i] = sqrt ( mean2Values[i]/ ( double ) meanCnt[i] - meanValues[i]/ ( double ) meanCnt[i] );
00712 maxNormValues[i] = fabs ( maxValues[i] - meanValues[i] );
00713 if ( maxNormValues[i] < fabs ( minValues[i] - meanValues[i] ) )
00714 maxNormValues[i] = fabs ( minValues[i] - meanValues[i] );
00715 }
00716
00717 cout<<"nTrain:"<<nTrain<<endl;
00718
00719
00720 nFeat = 1;
00721 int nFeatNum = 0, nFeatNumRaw = 0, nFeatNumCat = 0, nFeatCat = 0, nUnknown = 0, nMissing = 0, nIn = 0, nNumMiss = 0;
00722
00723 for ( int j=0;j<NUM;j++ )
00724 {
00725 if ( numericNonZeroCnt[j] >= minAttributeOccurenceNumerical && maxNormValues[j] < stdValues[j]*maxSTD )
00726 {
00727
00728 nFeat++;
00729 nFeatNum++;
00730 nFeatNumRaw++;
00731
00732
00733 if ( numericalAttributes[j].size() < numericMaxCluster && numericalAttributes[j].size() > 1 )
00734 {
00735 cout<<"nFeatNum:"<<nFeatNum<<" ";
00736 for ( int k=0;k<numericalAttributes[j].size();k++ )
00737 {
00738 cout<<numericalAttributes[j][k]<<"("<<numericalAttributesCnt[j][k]<<") ";
00739 nFeat++;
00740 nFeatNum++;
00741 nFeatNumCat++;
00742 }
00743 cout<<endl;
00744
00745
00746
00747
00748
00749 }
00750 if ( numericMissingCnt[j] >= numericMinMissing )
00751 {
00752 numericHasMissingBin[j] = true;
00753 nFeat+=2;
00754 nNumMiss+=2;
00755 }
00756 }
00757 }
00758
00759 for ( int j=0;j<CAT;j++ )
00760 {
00761 int nUsed = 0, nUn = 0, nCat = 0, nMiss = 0, nUnk = 0;
00762 for ( int k=0;k<categoricalAttributesCnt[j].size();k++ )
00763 {
00764
00765 if ( categoricalAttributesCnt[j][k] >= minAttributeOccurenceCategorical && categoricalAttributesCnt[j][k] < nTrain )
00766 {
00767 nFeat++;
00768 nFeatCat++;
00769 nUsed++;
00770 nIn++;
00771 nCat++;
00772 }
00773 else if ( categoricalAttributesCnt[j][k] < nTrain )
00774 nUn++;
00775 }
00776
00777 if ( ( categoricalMissingCnt[j] >= minAttributeOccurenceCategorical && categoricalMissingCnt[j] < nTrain ) || categoricalMissingCnt[j] > 0 && nCat > 0 )
00778 {
00779
00780 nFeat++;
00781 nFeatCat++;
00782 nMissing++;
00783 nMiss++;
00784 categoricalHasMissingBin[j] = true;
00785 }
00786 if ( nUn > 0 && nCat + nMiss > 0 )
00787 {
00788
00789 nFeat++;
00790 nFeatCat++;
00791 nUnknown++;
00792 nUnk++;
00793 categoricalHasUnknownBin[j] = true;
00794 }
00795
00796 if ( nCat + nMiss + nUnk == 1 )
00797 assert ( false );
00798 }
00799
00800 cout<<"nFeat:"<<nFeat<<" (numInputs:"<<nFeatNum<<" [rawNum:"<<nFeatNumRaw<<" nFeatNumCat:"<<nFeatNumCat<<"] catInputs:"<<nFeatCat<<" [nUnknown:"<<nUnknown<<" nMissing:"<<nMissing<<" nCat:"<<nIn<<"] numMissingHot:"<<nNumMiss<<" [+1const.])"<<endl;
00801
00802 cout<<"Allocate train features: "<< ( double ) nTrain*nFeat/1e6*4.0<<" MB"<<endl;
00803 train = new REAL[nTrain*nFeat];
00804 for ( int i=0;i<nTrain*nFeat;i++ )
00805 train[i] = 1e10;
00806
00807
00808
00809
00810
00811
00812
00813
00814
00815
00816 nClass = 2;
00817 trainTarget = new REAL[nTrain*nClass*nDomain];
00818 trainLabel = new int[nTrain*nDomain];
00819 for ( int d=0;d<nDomain;d++ )
00820 {
00821 sprintf ( buf0,"%s/%s",path.c_str(),targetFiles[d] );
00822 fstream f;
00823 cout<<"Open targets:"<<buf0<<endl;
00824 f.open ( buf0,ios::in );
00825 if ( f.is_open() == false )
00826 assert ( false );
00827 int label;
00828 for ( int i=0;i<nTrain;i++ )
00829 {
00830 f>>label;
00831 if ( label==-1 )
00832 {
00833 trainTarget[i*nClass*nDomain + d*nClass + 0] = positiveTarget;
00834 trainTarget[i*nClass*nDomain + d*nClass + 1] = negativeTarget;
00835 trainLabel[i*nDomain + d] = 0;
00836 }
00837 else if ( label==1 )
00838 {
00839 trainTarget[i*nClass*nDomain + d*nClass + 0] = negativeTarget;
00840 trainTarget[i*nClass*nDomain + d*nClass + 1] = positiveTarget;
00841 trainLabel[i*nDomain + d] = 1;
00842 }
00843 else
00844 assert ( false );
00845 }
00846 f.close();
00847 }
00848
00849 nTest = 0;
00850 test = 0;
00851 testTarget = 0;
00852 testLabel = 0;
00853
00854 }
00855 }
00856
00857 for ( int i=0;i<nTrain;i++ )
00858 for ( int j=0;j<nFeat;j++ )
00859 if ( train[i*nFeat+j] == 1e10 )
00860 {
00861 cout<<"i:"<<i<<" j:"<<j<<" "<<train[i*nFeat+j]<<endl;
00862 assert ( false );
00863 }
00864
00865
00866 fstream f;
00867
00868
00869
00870
00871
00872
00873
00874
00875
00876
00877
00878
00879
00880
00881
00882
00883
00884
00885
00886
00887
00888
00889
00890
00891
00892
00893
00894
00895
00896
00897
00898
00899
00900
00901
00902
00903
00904
00905
00906
00907
00908
00909
00910
00911
00912
00913
00914
00915
00916
00917
00918
00919 if ( lineBuf )
00920 {
00921 delete[] lineBuf;
00922 lineBuf = 0;
00923 }
00924 if ( numericNonZeroCnt )
00925 {
00926 delete[] numericNonZeroCnt;
00927 numericNonZeroCnt = 0;
00928 }
00929 if ( numericNonZeroPercent )
00930 {
00931 delete[] numericNonZeroPercent;
00932 numericNonZeroPercent = 0;
00933 }
00934 if ( categoricalAttributes )
00935 {
00936 delete[] categoricalAttributes;
00937 categoricalAttributes = 0;
00938 }
00939 if ( meanValues )
00940 {
00941 delete[] meanValues;
00942 meanValues = 0;
00943 }
00944 if ( meanCnt )
00945 {
00946 delete[] meanCnt;
00947 meanCnt = 0;
00948 }
00949 if ( categoricalHasMissingBin )
00950 {
00951 delete[] categoricalHasMissingBin;
00952 categoricalHasMissingBin = 0;
00953 }
00954
00955
00956 f.open ( "A.txt",ios::out );
00957 double* mu = new double[nFeat];
00958 for ( int i=0;i<nFeat;i++ )
00959 mu[i] = 0.0;
00960 for ( int i=0;i<nTrain;i++ )
00961 for ( int j=0;j<nFeat;j++ )
00962 mu[j] += train[i*nFeat + j];
00963 for ( int i=0;i<nFeat;i++ )
00964 mu[i] /= ( double ) nTrain;
00965 for ( int i=0;i<nFeat;i++ )
00966 f<<mu[i]<<endl;
00967 f.close();
00968
00969
00970
00971
00972
00973
00974
00975
00976
00977
00978
00979
00980
00981
00982
00983
00984
00985
00986
00987
00988
00989
00990
00991
00992
00993
00994
00995
00996
00997
00998
00999
01000 if ( Framework::getFrameworkMode() == 1 )
01001 {
01002 cout<<endl<<"Set test data (and set train data to 0)"<<endl<<endl;
01003 test = train;
01004 train = 0;
01005 nTest = nTrain;
01006 nTrain = 0;
01007 testTarget = trainTarget;
01008 trainTarget = 0;
01009 testLabel = trainLabel;
01010 trainLabel = 0;
01011 }
01012 cout<<endl<<"Finished read in "<<time ( 0 )-t0<<"[s]"<<endl<<endl;
01013 }
01014
01015
01020 void DatasetReader::readKDDCup09Small ( string path, REAL* &train, REAL* &trainTarget, int* &trainLabel, REAL* &test, REAL* &testTarget, int* &testLabel, uint& nTrain, uint& nTest, int& nClass, int& nDomain, int& nFeat, REAL positiveTarget, REAL negativeTarget )
01021 {
01022 time_t t0 = time ( 0 );
01023
01024 nDomain = 3;
01025
01026 cout<<"Read KDDCup09 from: "<<path<<endl;
01027
01028 char* targetFiles[] = {"orange_small_train_churn.labels"
01029 ,"orange_small_train_appetency.labels"
01030 ,"orange_small_train_upselling.labels"
01031 };
01032
01033 int nPreAlloc = 100000000;
01034 char *buf0 = new char[512], *buf1 = new char[512];
01035 char* lineBuf = new char[nPreAlloc];
01036
01037 int NUM = 190, CAT = 40, NLINES = 50000;
01038 int nFiles = 1;
01039 bool setNumZerosToMeans = false;
01040 bool setMissingToMeans = false;
01041 int numericMinMissing = 1;
01042 int numericMaxCluster = 0;
01043 int minAttributeOccurenceCategorical = 200*nFiles;
01044 int minAttributeOccurenceNumerical = 500*nFiles;
01045 REAL maxSTD = 1e10;
01046 cout<<"nFiles:"<<nFiles<<" minAttrOccurCat:"<<minAttributeOccurenceCategorical<<" minAttrOccurNum:"<<minAttributeOccurenceNumerical<<endl;
01047 cout<<setNumZerosToMeans<<" "<<setMissingToMeans<<" "<<numericMaxCluster<<" "<<minAttributeOccurenceCategorical<<" "<<minAttributeOccurenceNumerical<<" "<<maxSTD<<endl;
01048
01049 vector<string>* numericalAttributes = new vector<string>[NUM];
01050 vector<int>* numericalAttributesCnt = new vector<int>[NUM];
01051 vector<string>* categoricalAttributes = new vector<string>[CAT];
01052 vector<int>* categoricalAttributesCnt = new vector<int>[CAT];
01053 bool* categoricalHasMissingBin = new bool[CAT];
01054 int* categoricalMissingCnt = new int[CAT];
01055 bool* categoricalHasUnknownBin = new bool[CAT];
01056 for ( int i=0;i<CAT;i++ )
01057 {
01058 categoricalHasMissingBin[i] = false;
01059 categoricalHasUnknownBin[i] = false;
01060 categoricalMissingCnt[i] = 0;
01061 }
01062 int* numericNonZeroCnt = new int[NUM];
01063 int* numericMissingCnt = new int[NUM];
01064 bool* numericHasMissingBin = new bool[NUM];
01065 double* numericNonZeroPercent = new double[NUM];
01066 for ( int i=0;i<NUM;i++ )
01067 {
01068 numericMissingCnt[i] = 0;
01069 numericNonZeroCnt[i] = 0;
01070 numericNonZeroPercent[i] = 0.0;
01071 numericHasMissingBin[i] = false;
01072 }
01073
01074 double* minValues = new double[100000];
01075 double* maxValues = new double[100000];
01076 double* maxNormValues = new double[100000];
01077 double* meanValues = new double[100000];
01078 double* stdValues = new double[100000];
01079 double* mean2Values = new double[100000];
01080 int* meanCnt = new int[100000];
01081 for ( int i=0;i<100000;i++ )
01082 {
01083 minValues[i] = 1e20;
01084 maxValues[i] = -1e20;
01085 maxNormValues[i] = 0.0;
01086 meanValues[i] = 0.0;
01087 mean2Values[i] = 0.0;
01088 meanCnt[i] = 0;
01089 stdValues[i] = 0.0;
01090 }
01091
01092
01093
01094
01095
01096
01097
01098 for ( int state=0;state<2;state++ )
01099 {
01100 int nTrainFill = 0;
01101 if ( state == 0 )
01102 {
01103 nTrain = 0;
01104 }
01105
01106
01107
01108
01109
01110 for ( int file=0;file<nFiles;file++ )
01111 {
01112
01113 if ( state == 0 )
01114 sprintf ( buf0,"%s/orange_small_train.data",path.c_str() );
01115 else
01116 {
01117 if ( Framework::getFrameworkMode() == 1 )
01118 sprintf ( buf0,"%s/orange_small_test.data",path.c_str() );
01119 else
01120 sprintf ( buf0,"%s/orange_small_train.data",path.c_str() );
01121 }
01122
01123 cout<<"Open:"<<buf0<<endl;
01124 fstream f;
01125 f.open ( buf0, ios::in );
01126 if ( f.is_open() == false )
01127 assert ( false );
01128
01129
01130 if ( file==0 )
01131 f.getline ( lineBuf, nPreAlloc );
01132
01133
01134 double zeroRatio = 0.0;
01135 double sparse = 0.0;
01136 int nTrainTmp = 0;
01137
01138
01139
01140
01141
01142 while ( f.getline ( lineBuf, nPreAlloc ) )
01143 {
01144 if ( nTrainTmp%1000 == 0 )
01145 cout<<"."<<flush;
01146
01147
01148 int pos0 = 0, pos1 = 0;
01149 int nF = 0, nMissing = 0, nZeros = 0;
01150 int nFeatFill = 0;
01151 int nrHot = 0;
01152 double value;
01153
01154 if ( state == 1 )
01155 {
01156
01157 train[nTrainFill*nFeat + nFeatFill] = 1.0;
01158 nFeatFill++;
01159 }
01160
01161
01162
01163
01164
01165 while ( lineBuf[pos1] )
01166 {
01167
01168 while ( lineBuf[pos1] != '\t' && lineBuf[pos1] != 0 )
01169 pos1++;
01170
01171
01172
01173
01174
01175
01176 if ( pos1 > pos0 && lineBuf[pos1]!=0 )
01177 {
01178
01179 if ( pos1-pos0 <=0 || pos1-pos0 >= 512 )
01180 assert ( false );
01181 for ( int j=0;j<pos1-pos0;j++ )
01182 buf1[j] = lineBuf[pos0+j];
01183 buf1[pos1-pos0] = 0;
01184
01185
01186
01187
01188
01189
01190 if ( nF < NUM )
01191 {
01192 if ( ( buf1[0]>='0' && buf1[0] <='9' ) || buf1[0]=='-' )
01193 ;
01194 else
01195 {
01196 cout<<"BUF:"<<buf1<<endl;
01197 assert ( false );
01198 }
01199
01200
01201 value = atof ( buf1 );
01202
01203 if ( value == 0.0 )
01204 nZeros++;
01205
01206
01207 if ( state==0 )
01208 {
01209 if ( minValues[nF] > value )
01210 minValues[nF] = value;
01211 if ( maxValues[nF] < value )
01212 maxValues[nF] = value;
01213
01214
01215 int size = numericalAttributes[nF].size();
01216 if ( size < numericMaxCluster )
01217 {
01218 int foundIndex = -1;
01219 for ( int j=0;j<size;j++ )
01220 if ( numericalAttributes[nF][j] == buf1 )
01221 {
01222 foundIndex = j;
01223 break;
01224 }
01225
01226 if ( foundIndex == -1 )
01227 {
01228 numericalAttributes[nF].push_back ( buf1 );
01229 numericalAttributesCnt[nF].push_back ( 1 );
01230 }
01231 else
01232 numericalAttributesCnt[nF][foundIndex]++;
01233 }
01234
01235 if ( value != 0.0 )
01236 {
01237 numericNonZeroCnt[nF]++;
01238 if ( numericNonZeroCnt[nF] > nTrain+nTrainTmp+1 )
01239 {
01240 cout<<"numericNonZeroCnt[nF]:"<<numericNonZeroCnt[nF]<<" nF:"<<nF<<" nTrainTmp:"<<nTrainTmp<<" nZeros:"<<nZeros<<" pos0:"<<pos0<<" pos1:"<<pos1<<endl;
01241 assert ( false );
01242 }
01243 }
01244
01245 if ( value != 0.0 )
01246 {
01247
01248 meanValues[nF] += value;
01249 mean2Values[nF] += value * value;
01250 meanCnt[nF]++;
01251 }
01252 }
01253 else if ( state==1 )
01254 {
01255 if ( numericNonZeroCnt[nF] >= minAttributeOccurenceNumerical && maxNormValues[nF] < stdValues[nF]*maxSTD )
01256 {
01257
01258 if ( value == 0.0 && setNumZerosToMeans )
01259 train[nTrainFill*nFeat + nFeatFill] = meanValues[nF];
01260 else
01261 train[nTrainFill*nFeat + nFeatFill] = value;
01262 nFeatFill++;
01263
01264
01265 int size = numericalAttributes[nF].size();
01266 if ( size < numericMaxCluster && size > 1 )
01267 {
01268 int foundIndex = -1;
01269 for ( int j=0;j<size;j++ )
01270 if ( numericalAttributes[nF][j] == buf1 )
01271 {
01272 foundIndex = j;
01273 break;
01274 }
01275
01276 int beforeHot = nrHot;
01277 for ( int j=0;j<size;j++ )
01278 {
01279 if ( foundIndex == j )
01280 {
01281 train[nTrainFill*nFeat + nFeatFill] = 1.0;
01282 nrHot++;
01283 }
01284 else
01285 train[nTrainFill*nFeat + nFeatFill] = 0.0;
01286 nFeatFill++;
01287 }
01288
01289
01290
01291
01292
01293
01294 }
01295
01296 }
01297
01298
01299 if ( numericHasMissingBin[nF] )
01300 {
01301 train[nTrainFill*nFeat + nFeatFill] = 0.0;
01302 nFeatFill++;
01303 train[nTrainFill*nFeat + nFeatFill] = 1.0;
01304 nFeatFill++;
01305 }
01306 }
01307 }
01308
01309
01310
01311
01312 else
01313 {
01314 int index = nF-NUM;
01315 if ( index >= CAT )
01316 assert ( false );
01317 int size = categoricalAttributes[index].size();
01318 int sizeCnt = categoricalAttributesCnt[index].size();
01319 if ( size != sizeCnt )
01320 assert ( false );
01321
01322 int foundIndex = -1;
01323 for ( int j=0;j<size;j++ )
01324 if ( categoricalAttributes[index][j] == buf1 )
01325 {
01326 foundIndex = j;
01327 break;
01328 }
01329
01330
01331 if ( state==0 )
01332 {
01333
01334 if ( foundIndex == -1 )
01335 {
01336 categoricalAttributes[index].push_back ( buf1 );
01337 categoricalAttributesCnt[index].push_back ( 1 );
01338 }
01339 else
01340 categoricalAttributesCnt[index][foundIndex]++;
01341 }
01342 else if ( state==1 )
01343 {
01344
01345 int fillCnt = 0;
01346 int beforeHot = nrHot;
01347 for ( int j=0;j<size;j++ )
01348 {
01349 if ( categoricalAttributesCnt[index][j] >= minAttributeOccurenceCategorical && categoricalAttributesCnt[index][j] < nTrain )
01350 {
01351 if ( foundIndex == j )
01352 {
01353 train[nTrainFill*nFeat + nFeatFill] = 1.0;
01354 nrHot++;
01355 }
01356 else
01357 train[nTrainFill*nFeat + nFeatFill] = 0.0;
01358 fillCnt++;
01359 nFeatFill++;
01360 }
01361 }
01362
01363
01364 if ( categoricalHasMissingBin[index] )
01365 {
01366 train[nTrainFill*nFeat + nFeatFill] = 0.0;
01367 fillCnt++;
01368 nFeatFill++;
01369 }
01370
01371
01372 if ( categoricalHasUnknownBin[index] )
01373 {
01374 if ( beforeHot == nrHot )
01375 {
01376 train[nTrainFill*nFeat + nFeatFill] = 1.0;
01377 nrHot++;
01378 }
01379 else
01380 train[nTrainFill*nFeat + nFeatFill] = 0.0;
01381 fillCnt++;
01382 nFeatFill++;
01383 }
01384
01385 if ( nrHot != beforeHot + 1 && fillCnt > 0 )
01386 {
01387 cout<<"WARNING: foundIndex:"<<foundIndex<<" "<<size<<" "<<index<<" "<<nrHot<<" "<<beforeHot<<" fill:"<<fillCnt<<endl;
01388
01389 }
01390 }
01391 }
01392 }
01393
01394
01395
01396
01397
01398 else
01399 {
01400 nMissing++;
01401
01402 if ( state==0 )
01403 {
01404
01405 if ( nF < NUM )
01406 {
01407 numericMissingCnt[nF]++;
01408 }
01409
01410 if ( nF >= NUM )
01411 {
01412 int index = nF-NUM;
01413 categoricalMissingCnt[index]++;
01414 }
01415 }
01416
01417
01418 if ( state==1 )
01419 {
01420
01421
01422
01423
01424 if ( nF < NUM )
01425 {
01426 if ( numericNonZeroCnt[nF] >= minAttributeOccurenceNumerical && maxNormValues[nF] < stdValues[nF]*maxSTD )
01427 {
01428
01429 if ( setMissingToMeans )
01430 train[nTrainFill*nFeat + nFeatFill] = meanValues[nF];
01431 else
01432 train[nTrainFill*nFeat + nFeatFill] = 0.0;
01433 nFeatFill++;
01434
01435
01436 int size = numericalAttributes[nF].size();
01437 if ( size < numericMaxCluster && size > 1 )
01438 {
01439
01440 for ( int j=0;j<size;j++ )
01441 {
01442 train[nTrainFill*nFeat + nFeatFill] = 0.0;
01443 nFeatFill++;
01444 }
01445
01446
01447
01448 }
01449 }
01450
01451
01452 if ( numericHasMissingBin[nF] )
01453 {
01454 train[nTrainFill*nFeat + nFeatFill] = 1.0;
01455 nFeatFill++;
01456 train[nTrainFill*nFeat + nFeatFill] = 0.0;
01457 nFeatFill++;
01458 }
01459 }
01460
01461
01462
01463
01464 else
01465 {
01466 int index = nF - NUM;
01467 if ( index >= CAT )
01468 assert ( false );
01469 int size = categoricalAttributes[index].size();
01470 int sizeCnt = categoricalAttributesCnt[index].size();
01471 if ( size != sizeCnt )
01472 assert ( false );
01473
01474
01475 int fillCnt = 0;
01476 int beforeHot = nrHot;
01477 for ( int j=0;j<size;j++ )
01478 {
01479 if ( categoricalAttributesCnt[index][j] >= minAttributeOccurenceCategorical && categoricalAttributesCnt[index][j] < nTrain )
01480 {
01481 train[nTrainFill*nFeat + nFeatFill] = 0.0;
01482 fillCnt++;
01483 nFeatFill++;
01484 }
01485 }
01486 if ( categoricalHasMissingBin[index] )
01487 {
01488 if ( fillCnt == 0 && categoricalHasUnknownBin[index] == false )
01489 {
01490 cout<<"categoricalMissingCnt["<<index<<"]:"<<categoricalMissingCnt[index]<<endl;
01491 assert ( false );
01492 }
01493
01494 train[nTrainFill*nFeat + nFeatFill] = 1.0;
01495 nrHot++;
01496 fillCnt++;
01497 nFeatFill++;
01498 }
01499
01500 if ( categoricalHasUnknownBin[index] )
01501 {
01502
01503 train[nTrainFill*nFeat + nFeatFill] = 0.0;
01504 fillCnt++;
01505 nFeatFill++;
01506 }
01507
01508 if ( nrHot != beforeHot + 1 && fillCnt > 0 )
01509 {
01510 cout<<"WARNING: "<<size<<" "<<index<<" "<<nrHot<<" "<<beforeHot<<" fill:"<<fillCnt<<endl;
01511
01512 }
01513 }
01514 }
01515 }
01516
01517
01518 if ( lineBuf[pos1]!=0 )
01519 pos1++;
01520
01521
01522 pos0 = pos1;
01523
01524
01525 nF++;
01526 }
01527
01528
01529 if ( nF != NUM + CAT )
01530 assert ( false );
01531 if ( state==1 )
01532 {
01533 if ( nFeatFill != nFeat )
01534 {
01535 cout<<"nFeatFill:"<<nFeatFill<<" nFeat:"<<nFeat<<endl;
01536 assert ( false );
01537 }
01538 nTrainFill++;
01539 }
01540
01541 nTrainTmp++;
01542
01543 sparse += nMissing / ( double ) nF;
01544 zeroRatio += nZeros / ( double ) nF;
01545 }
01546
01547 f.close();
01548
01549
01550 sparse /= ( double ) nTrainTmp;
01551 zeroRatio /= ( double ) nTrainTmp;
01552 cout<<"nTrainTmp:"<<nTrainTmp<<endl;
01553 cout<<"missing values:"<<100.0*sparse<<"%"<<endl;
01554 cout<<"zero values:"<<100.0*zeroRatio<<"%"<<endl;
01555
01556 double min0 = 1e20, max0 = -1e20;
01557 for ( int i=0;i<100000;i++ )
01558 {
01559 if ( min0 > minValues[i] )
01560 min0 = minValues[i];
01561 if ( max0 < maxValues[i] )
01562 max0 = maxValues[i];
01563 }
01564 cout<<"min|max values: "<<min0<<"|"<<max0<<endl;
01565
01566 int sum = 0;
01567 for ( int j=0;j<CAT;j++ )
01568 sum += categoricalAttributes[j].size();
01569 cout<<"nCategoricalSum:"<<sum<<endl;
01570
01571 if ( state == 0 )
01572 nTrain += nTrainTmp;
01573
01574 }
01575
01576
01577 if ( state == 1 )
01578 {
01579 if ( nTrain != nTrainFill )
01580 assert ( false );
01581
01582 for ( int i=0;i<nTrain*nFeat;i++ )
01583 if ( train[i] == 1e10 )
01584 {
01585 cout<<"i:"<<i<<endl;
01586 assert ( false );
01587 }
01588 }
01589
01590 if ( state==0 )
01591 {
01592 for ( int i=0;i<NUM;i++ )
01593 numericNonZeroPercent[i] = ( double ) numericNonZeroCnt[i]/ ( double ) nTrain;
01594 for ( int i=0;i<100000;i++ )
01595 if ( meanCnt[i] > 0 )
01596 {
01597 meanValues[i] /= ( double ) meanCnt[i];
01598 stdValues[i] = sqrt ( mean2Values[i]/ ( double ) meanCnt[i] - meanValues[i]/ ( double ) meanCnt[i] );
01599 maxNormValues[i] = fabs ( maxValues[i] - meanValues[i] );
01600 if ( maxNormValues[i] < fabs ( minValues[i] - meanValues[i] ) )
01601 maxNormValues[i] = fabs ( minValues[i] - meanValues[i] );
01602 }
01603
01604 cout<<"nTrain:"<<nTrain<<endl;
01605
01606
01607 nFeat = 1;
01608 int nFeatNum = 0, nFeatNumRaw = 0, nFeatNumCat = 0, nFeatCat = 0, nUnknown = 0, nMissing = 0, nIn = 0, nNumMiss = 0;
01609
01610 for ( int j=0;j<NUM;j++ )
01611 {
01612 if ( numericNonZeroCnt[j] >= minAttributeOccurenceNumerical && maxNormValues[j] < stdValues[j]*maxSTD )
01613 {
01614
01615 nFeat++;
01616 nFeatNum++;
01617 nFeatNumRaw++;
01618
01619
01620 if ( numericalAttributes[j].size() < numericMaxCluster && numericalAttributes[j].size() > 1 )
01621 {
01622 cout<<"nFeatNum:"<<nFeatNum<<" ";
01623 for ( int k=0;k<numericalAttributes[j].size();k++ )
01624 {
01625 cout<<numericalAttributes[j][k]<<"("<<numericalAttributesCnt[j][k]<<") ";
01626 nFeat++;
01627 nFeatNum++;
01628 nFeatNumCat++;
01629 }
01630 cout<<endl;
01631
01632
01633
01634
01635
01636 }
01637 if ( numericMissingCnt[j] >= numericMinMissing )
01638 {
01639 numericHasMissingBin[j] = true;
01640 nFeat+=2;
01641 nNumMiss+=2;
01642 }
01643 }
01644 }
01645
01646 for ( int j=0;j<CAT;j++ )
01647 {
01648 int nUsed = 0, nUn = 0, nCat = 0, nMiss = 0, nUnk = 0;
01649 for ( int k=0;k<categoricalAttributesCnt[j].size();k++ )
01650 {
01651
01652 if ( categoricalAttributesCnt[j][k] >= minAttributeOccurenceCategorical && categoricalAttributesCnt[j][k] < nTrain )
01653 {
01654 nFeat++;
01655 nFeatCat++;
01656 nUsed++;
01657 nIn++;
01658 nCat++;
01659 }
01660 else if ( categoricalAttributesCnt[j][k] < nTrain )
01661 nUn++;
01662 }
01663
01664 if ( ( categoricalMissingCnt[j] >= minAttributeOccurenceCategorical && categoricalMissingCnt[j] < nTrain ) || categoricalMissingCnt[j] > 0 && nCat > 0 )
01665 {
01666
01667 nFeat++;
01668 nFeatCat++;
01669 nMissing++;
01670 nMiss++;
01671 categoricalHasMissingBin[j] = true;
01672 }
01673 if ( nUn > 0 && nCat + nMiss > 0 )
01674 {
01675
01676 nFeat++;
01677 nFeatCat++;
01678 nUnknown++;
01679 nUnk++;
01680 categoricalHasUnknownBin[j] = true;
01681 }
01682
01683 if ( nCat + nMiss + nUnk == 1 )
01684 assert ( false );
01685 }
01686
01687 cout<<"nFeat:"<<nFeat<<" (numInputs:"<<nFeatNum<<" [rawNum:"<<nFeatNumRaw<<" nFeatNumCat:"<<nFeatNumCat<<"] catInputs:"<<nFeatCat<<" [nUnknown:"<<nUnknown<<" nMissing:"<<nMissing<<" nCat:"<<nIn<<"] numMissingHot:"<<nNumMiss<<" [+1const.])"<<endl;
01688
01689 cout<<"Allocate train features: "<< ( double ) nTrain*nFeat/1e6*4.0<<" MB"<<endl;
01690 train = new REAL[nTrain*nFeat];
01691 for ( int i=0;i<nTrain*nFeat;i++ )
01692 train[i] = 1e10;
01693
01694
01695
01696
01697
01698
01699
01700
01701
01702
01703 nClass = 2;
01704 trainTarget = new REAL[nTrain*nClass*nDomain];
01705 trainLabel = new int[nTrain*nDomain];
01706 for ( int d=0;d<nDomain;d++ )
01707 {
01708 sprintf ( buf0,"%s/%s",path.c_str(),targetFiles[d] );
01709 fstream f;
01710 cout<<"Open targets:"<<buf0<<endl;
01711 f.open ( buf0,ios::in );
01712 if ( f.is_open() == false )
01713 assert ( false );
01714 int label;
01715 for ( int i=0;i<nTrain;i++ )
01716 {
01717 f>>label;
01718 if ( label==-1 )
01719 {
01720 trainTarget[i*nClass*nDomain + d*nClass + 0] = positiveTarget;
01721 trainTarget[i*nClass*nDomain + d*nClass + 1] = negativeTarget;
01722 trainLabel[i*nDomain + d] = 0;
01723 }
01724 else if ( label==1 )
01725 {
01726 trainTarget[i*nClass*nDomain + d*nClass + 0] = negativeTarget;
01727 trainTarget[i*nClass*nDomain + d*nClass + 1] = positiveTarget;
01728 trainLabel[i*nDomain + d] = 1;
01729 }
01730 else
01731 assert ( false );
01732 }
01733 f.close();
01734 }
01735
01736 nTest = 0;
01737 test = 0;
01738 testTarget = 0;
01739 testLabel = 0;
01740
01741 }
01742 }
01743
01744 for ( int i=0;i<nTrain;i++ )
01745 for ( int j=0;j<nFeat;j++ )
01746 if ( train[i*nFeat+j] == 1e10 )
01747 {
01748 cout<<"i:"<<i<<" j:"<<j<<" "<<train[i*nFeat+j]<<endl;
01749 assert ( false );
01750 }
01751
01752
01753 fstream f;
01754 if ( lineBuf )
01755 {
01756 delete[] lineBuf;
01757 lineBuf = 0;
01758 }
01759 if ( numericNonZeroCnt )
01760 {
01761 delete[] numericNonZeroCnt;
01762 numericNonZeroCnt = 0;
01763 }
01764 if ( numericNonZeroPercent )
01765 {
01766 delete[] numericNonZeroPercent;
01767 numericNonZeroPercent = 0;
01768 }
01769 if ( categoricalAttributes )
01770 {
01771 delete[] categoricalAttributes;
01772 categoricalAttributes = 0;
01773 }
01774 if ( meanValues )
01775 {
01776 delete[] meanValues;
01777 meanValues = 0;
01778 }
01779 if ( meanCnt )
01780 {
01781 delete[] meanCnt;
01782 meanCnt = 0;
01783 }
01784 if ( categoricalHasMissingBin )
01785 {
01786 delete[] categoricalHasMissingBin;
01787 categoricalHasMissingBin = 0;
01788 }
01789
01790 if ( Framework::getFrameworkMode() == 1 )
01791 {
01792 cout<<endl<<"Set test data (and set train data to 0)"<<endl<<endl;
01793 test = train;
01794 train = 0;
01795 nTest = nTrain;
01796 nTrain = 0;
01797 testTarget = trainTarget;
01798 trainTarget = 0;
01799 testLabel = trainLabel;
01800 trainLabel = 0;
01801 }
01802
01803 cout<<endl<<"Finished read in "<<time ( 0 )-t0<<"[s]"<<endl<<endl;
01804
01805 }
01806
01818 void DatasetReader::readBINARY ( string path, REAL* &train, REAL* &trainTarget, int* &trainLabel, REAL* &test, REAL* &testTarget, int* &testLabel, uint& nTrain, uint& nTest, int& nClass, int& nDomain, int& nFeat, REAL positiveTarget, REAL negativeTarget )
01819 {
01820 REAL* feat, *target;
01821 int* label, N;
01822
01823 fstream f;
01824 if ( Framework::getFrameworkMode() == 1 )
01825 f.open ( ( path+"/binary.test" ).c_str(), ios::in );
01826 else
01827 f.open ( ( path+"/binary.train" ).c_str(), ios::in );
01828
01829
01830 f.read ( ( char* ) &N, sizeof ( int ) );
01831 f.read ( ( char* ) &nClass, sizeof ( int ) );
01832 f.read ( ( char* ) &nDomain, sizeof ( int ) );
01833 f.read ( ( char* ) &nFeat, sizeof ( int ) );
01834
01835 feat = new REAL[N*nFeat];
01836 target = new REAL[N*nClass*nDomain];
01837 label = new int[N*nDomain];
01838
01839
01840 f.read ( ( char* ) feat, sizeof ( REAL ) *N*nFeat );
01841 f.read ( ( char* ) label, sizeof ( int ) *N*nDomain );
01842 f.close();
01843
01844 for ( int i=0;i<N;i++ )
01845 {
01846 for ( int j=0;j<nClass*nDomain;j++ )
01847 target[i*nClass*nDomain+j] = negativeTarget;
01848 for ( int j=0;j<nDomain;j++ )
01849 target[i*nClass*nDomain + j*nClass + label[i*nDomain+j]] = positiveTarget;
01850 }
01851
01852 if ( Framework::getFrameworkMode() == 1 )
01853 {
01854 nTest = N;
01855 test = feat;
01856 testTarget = target;
01857 testLabel = label;
01858 train = 0;
01859 trainTarget = 0;
01860 trainLabel = 0;
01861 nTrain = 0;
01862 }
01863 else
01864 {
01865 nTrain = N;
01866 train = feat;
01867 trainTarget = target;
01868 trainLabel = label;
01869 test = 0;
01870 testTarget = 0;
01871 testLabel = 0;
01872 nTest = 0;
01873 }
01874
01875 }
01876
01882 void DatasetReader::readCSV ( string path, REAL* &train, REAL* &trainTarget, int* &trainLabel, REAL* &test, REAL* &testTarget, int* &testLabel, uint& nTrain, uint& nTest, int& nClass, int& nDomain, int& nFeat, REAL positiveTarget, REAL negativeTarget )
01883 {
01884 cout<<"Read CSV from: "<<path<<endl;
01885 nDomain = 1;
01886
01887 char* buf = new char[1024*1024];
01888 char del = 0;
01889 int trainTargetColumn = -1;
01890 string trainName, testName;
01891
01892
01893 fstream fSetting(string(path+"/settings.txt").c_str(),ios::in);
01894 while ( fSetting.getline ( buf, 1024*1024 ) )
01895 {
01896 string s = buf;
01897 size_t pos = s.find_first_of('=');
01898 string token = s.substr(0,pos);
01899
01900 if(token == "delimiter")
01901 del = buf[pos+1];
01902 else if(token == "trainTargetColumn")
01903 trainTargetColumn = atoi(s.substr(pos+1).c_str());
01904 else if(token == "train")
01905 trainName = s.substr(pos+1);
01906 else if(token == "test")
01907 testName = s.substr(pos+1);
01908 }
01909 fSetting.close();
01910
01911
01912 if(trainTargetColumn == -1 || del == 0 || trainName == "" || (Framework::getFrameworkMode() && testName == ""))
01913 assert(false);
01914
01915
01916 fstream fTrain(string(path+"/"+trainName).c_str(),ios::in);
01917 vector<string> targets;
01918 map<string,int> targetMap;
01919 vector<vector<REAL> > features;
01920 while ( fTrain.getline ( buf, 1024*1024 ) )
01921 {
01922 string s = buf;
01923 size_t lastPos = 0;
01924 vector<REAL> feature;
01925 for(int i=0;i<s.length();i++)
01926 {
01927 if(s[i] == del || i == s.length()-1)
01928 {
01929 string token = s.substr(lastPos,i-lastPos);
01930 if(i == s.length()-1)
01931 token = s.substr(lastPos,i-lastPos+1);
01932 if(feature.size() == trainTargetColumn)
01933 {
01934 targets.push_back(token);
01935 if(Framework::getDatasetType())
01936 {
01937 map<string,int>::iterator it = targetMap.find(token);
01938 if(it == targetMap.end())
01939 targetMap[token] = targetMap.size();
01940 }
01941 }
01942 else
01943 {
01944 if((token[0] == '-' || token[0] == '.' || token[0] >= '0' && token[0] <= '9') == 0)
01945 assert(false);
01946 REAL value = atof(token.c_str());
01947
01948 feature.push_back(value);
01949 }
01950 lastPos = i+1;
01951 }
01952 }
01953 if(feature.size())
01954 features.push_back(feature);
01955
01956 }
01957 fTrain.close();
01958
01959
01960 nClass = 1;
01961 if(Framework::getDatasetType())
01962 {
01963 nClass = targetMap.size();
01964 map<string,int>::iterator it;
01965 cout<<"Target values: ";
01966 for(it=targetMap.begin();it!=targetMap.end();it++)
01967 cout<<"["<<it->second<<"]"<<it->first<<" ";
01968 cout<<endl;
01969 }
01970
01971
01972 nTrain = features.size();
01973 nTest = 0;
01974 nFeat = features[0].size();
01975 train = new REAL[nFeat*nTrain];
01976 trainTarget = new REAL[nClass*nTrain];
01977 if(Framework::getDatasetType())
01978 trainLabel = new int[nTrain];
01979
01980
01981 for(int i=0;i<nTrain;i++)
01982 {
01983 for(int j=0;j<nFeat;j++)
01984 train[i*nFeat+j] = features[i][j];
01985 if(Framework::getDatasetType())
01986 {
01987 int label = targetMap[targets[i]];
01988 trainLabel[i] = label;
01989 for(int j=0;j<nClass;j++)
01990 trainTarget[i*nClass+j] = (j==label? positiveTarget : negativeTarget);
01991 }
01992 else
01993 {
01994 REAL target = atof(targets[i].c_str());
01995 trainTarget[i] = target;
01996 }
01997 }
01998
01999
02000 if(Framework::getFrameworkMode())
02001 {
02002 fstream fTest(string(path+"/"+testName).c_str(),ios::in);
02003 targets.clear();
02004 features.clear();
02005 while ( fTest.getline ( buf, 1024*1024 ) )
02006 {
02007 string s = buf;
02008 size_t lastPos = 0;
02009 vector<REAL> feature;
02010 for(int i=0;i<s.length();i++)
02011 {
02012 if(s[i] == del || i == s.length()-1)
02013 {
02014 string token = s.substr(lastPos,i-lastPos);
02015 if(i == s.length()-1)
02016 token = s.substr(lastPos,i-lastPos+1);
02017 if(feature.size() == trainTargetColumn)
02018 {
02019 targets.push_back(token);
02020 if(Framework::getDatasetType())
02021 {
02022 map<string,int>::iterator it = targetMap.find(token);
02023 if(it == targetMap.end())
02024 targetMap[token] = targetMap.size();
02025 }
02026 }
02027 else
02028 {
02029 if((token[0] == '-' || token[0] == '.' || token[0] >= '0' && token[0] <= '9') == 0)
02030 assert(false);
02031 REAL value = atof(token.c_str());
02032
02033 feature.push_back(value);
02034 }
02035 lastPos = i+1;
02036 }
02037 }
02038 if(feature.size())
02039 features.push_back(feature);
02040
02041 }
02042 fTest.close();
02043
02044
02045 nTest = features.size();
02046 test = new REAL[nFeat*nTest];
02047 testTarget = new REAL[nClass*nTest];
02048 if(Framework::getDatasetType())
02049 testLabel = new int[nTrain];
02050
02051
02052 for(int i=0;i<nTest;i++)
02053 {
02054 for(int j=0;j<nFeat;j++)
02055 test[i*nFeat+j] = features[i][j];
02056 if(targets.size() == features.size())
02057 {
02058 if(Framework::getDatasetType())
02059 {
02060 int label = targetMap[targets[i]];
02061 testLabel[i] = label;
02062 for(int j=0;j<nClass;j++)
02063 testTarget[i*nClass+j] = (j==label? positiveTarget : negativeTarget);
02064 }
02065 else
02066 {
02067 REAL target = atof(targets[i].c_str());
02068 testTarget[i] = target;
02069 }
02070 }
02071 }
02072
02073 }
02074
02075 delete[] buf;
02076 }
02077
02083 void DatasetReader::readARFF ( string path, REAL* &train, REAL* &trainTarget, int* &trainLabel, REAL* &test, REAL* &testTarget, int* &testLabel, uint& nTrain, uint& nTest, int& nClass, int& nDomain, int& nFeat, REAL positiveTarget, REAL negativeTarget )
02084 {
02085 cout<<"Read ARFF from: "<<path<<endl;
02086 nDomain = 1;
02087
02088 char* buf = new char[1024*1024];
02089 char del = 0;
02090 string trainName, trainTargetColumn;
02091
02092
02093 fstream fSetting(string(path+"/settings.txt").c_str(),ios::in);
02094 while ( fSetting.getline ( buf, 1024*1024 ) )
02095 {
02096 string s = buf;
02097 size_t pos = s.find_first_of('=');
02098 string token = s.substr(0,pos);
02099 cout<<token<<endl;
02100 if(token == "trainTargetColumn")
02101 trainTargetColumn = s.substr(pos+1);
02102 else if(token == "train")
02103 trainName = s.substr(pos+1);
02104 }
02105 fSetting.close();
02106
02107 if(trainName=="" || trainTargetColumn=="")
02108 assert(false);
02109
02110
02111 fstream fTrain(string(path+"/"+trainName).c_str(),ios::in);
02112 vector<vector<REAL> > targets;
02113 vector<vector<REAL> > features;
02114 vector<string> featureNames;
02115 vector<map<string,int> > featureValues;
02116 bool dataMode = false;
02117 while ( fTrain.getline ( buf, 1024*1024 ) )
02118 {
02119 string s = buf;
02120
02121 if(s.length() == 0)
02122 continue;
02123 if(s[0] == '%')
02124 continue;
02125 if(s[0] == '@')
02126 {
02127 dataMode = false;
02128 size_t spacePos0 = s.find_first_of(' ');
02129 string token = s.substr(0,spacePos0);
02130
02131 if(token == "@relation" || token == "@RELATION")
02132 cout<<"Dataset name:"<<s.substr(spacePos0+1)<<endl;
02133 else if(token == "@attribute" || token == "@ATTRIBUTE")
02134 {
02135
02136 size_t spacePos1 = s.find_first_of(" \t", spacePos0+1);
02137 string featureName = s.substr(spacePos0+1,spacePos1-spacePos0-1);
02138 featureNames.push_back(featureName);
02139
02140 map<string,int> values;
02141 size_t curlyPos0 = s.find_first_of('{', spacePos1+1);
02142 size_t curlyPos1 = s.find_first_of('}', spacePos1+1);
02143 size_t pos = curlyPos0+1;
02144 if(curlyPos0 != string::npos && curlyPos1 != string::npos)
02145 {
02146 while(pos < s.length())
02147 {
02148 size_t delPos = s.find_first_of(',',pos);
02149 if(delPos==string::npos)
02150 delPos = curlyPos1;
02151 string feature = s.substr(pos,delPos-pos);
02152 while(*(feature.begin()) == ' ')
02153 feature = feature.substr(1);
02154 if(feature.length() > 0)
02155 while(feature[feature.length()-1] == ' ')
02156 {
02157 feature = feature.substr(0,feature.length()-1);
02158 if(feature.length() == 0)
02159 break;
02160 }
02161 if(feature.length() > 0)
02162 values[feature] = values.size();
02163 pos += feature.length()+1;
02164 }
02165 }
02166 featureValues.push_back(values);
02167 }
02168 else if(token == "@data" || token == "@DATA")
02169 dataMode = true;
02170 }
02171 else if(dataMode)
02172 {
02173
02174
02175 size_t pos = 0;
02176 uint valueCnt = 0;
02177 vector<REAL> feature;
02178 while(pos < s.length())
02179 {
02180 size_t delPos = s.find_first_of(',',pos);
02181 if(delPos==string::npos)
02182 delPos = s.length();
02183 string value = s.substr(pos,delPos-pos);
02184
02185 if(featureValues[valueCnt].size() == 0)
02186 {
02187 if(featureNames[valueCnt] == trainTargetColumn)
02188 {
02189 vector<REAL> target;
02190 target.push_back(atof(value.c_str()));
02191 targets.push_back(target);
02192 }
02193 else
02194 feature.push_back(atof(value.c_str()));
02195 }
02196 else
02197 {
02198 uint catSize = featureValues[valueCnt].size();
02199 if(featureNames[valueCnt] == trainTargetColumn)
02200 {
02201 vector<REAL> target;
02202 map<string,int>::iterator it = featureValues[valueCnt].find(value);
02203 for(int i=0;i<catSize;i++)
02204 target.push_back(negativeTarget);
02205 uint catPos = it->second;
02206 target[catPos] = positiveTarget;
02207 targets.push_back(target);
02208 }
02209 else
02210 {
02211 map<string,int>::iterator it = featureValues[valueCnt].find(value);
02212 if(it == featureValues[valueCnt].end())
02213 assert(false);
02214 for(int i=0;i<catSize;i++)
02215 feature.push_back(-1.0);
02216 uint catPos = it->second;
02217 feature[feature.size()-catSize+catPos] = 1.0;
02218 }
02219 }
02220 valueCnt++;
02221 pos += value.length()+1;
02222 }
02223 features.push_back(feature);
02224 }
02225 }
02226 fTrain.close();
02227
02228 assert(features.size() == targets.size());
02229
02230
02231 nTrain = features.size();
02232 nFeat = features[0].size();
02233 nClass = targets[0].size();
02234 cout<<"nTrain:"<<nTrain<<" nFeat:"<<nFeat<<" nClass:"<<nClass<<" nFeatureNames:"<<featureNames.size()<<endl;
02235 for(int i=0;i<featureNames.size();i++)
02236 {
02237 cout<<"name:"<<featureNames[i]<<" ";
02238 if(featureValues[i].size() == 0)
02239 cout<<"[REAL]";
02240 else
02241 {
02242 for(map<string,int>::iterator it = featureValues[i].begin();it != featureValues[i].end(); it++)
02243 cout<<"\""<<it->first<<"\" ";
02244 }
02245 cout<<endl;
02246 }
02247
02248
02249 train = new REAL[nFeat*nTrain];
02250 trainTarget = new REAL[nClass*nTrain];
02251 trainLabel = nClass > 1 ? new int[nTrain] : 0;
02252
02253 for(int i=0;i<nTrain;i++)
02254 {
02255 for(int j=0;j<nFeat;j++)
02256 train[i*nFeat+j] = features[i][j];
02257 for(int j=0;j<nClass;j++)
02258 {
02259 trainTarget[i*nClass+j] = targets[i][j];
02260 if(targets[i][j] == positiveTarget && nClass > 1)
02261 trainLabel[i] = j;
02262 }
02263 }
02264
02265
02266 test = 0;
02267 testTarget = 0;
02268 testLabel = 0;
02269 nTest = 0;
02270
02271 delete[] buf;
02272 }
02273
02279 void DatasetReader::readPRUDSYS ( string path, REAL* &train, REAL* &trainTarget, int* &trainLabel, REAL* &test, REAL* &testTarget, int* &testLabel, uint& nTrain, uint& nTest, int& nClass, int& nDomain, int& nFeat, REAL positiveTarget, REAL negativeTarget )
02280 {
02281 REAL* feat, *target;
02282 int* label, N;
02283
02284 fstream f;
02285 if ( Framework::getFrameworkMode() == 1 )
02286 {
02287 f.open ( ( path+"/dmc2009_forecast.txt" ).c_str(), ios::in );
02288 nFeat = 1857+1;
02289 N = 2418;
02290 nClass = 1;
02291 nDomain = 8;
02292 }
02293 else
02294 {
02295 f.open ( ( path+"/dmc2009_train.txt" ).c_str(), ios::in );
02296 nFeat = 1857+1;
02297 N = 2394;
02298 nClass = 1;
02299 nDomain = 8;
02300 }
02301
02302 feat = new REAL[N*nFeat];
02303 target = new REAL[N*nClass*nDomain];
02304 label = 0;
02305
02306
02307 char *buf = new char[100000];
02308 f.getline ( buf,100000 );
02309 positiveTarget = -1e10;
02310 negativeTarget = 1e10;
02311 for ( int i=0;i<N;i++ )
02312 {
02313 f.getline ( buf,100000 );
02314 stringstream ss ( buf );
02315 REAL r;
02316 int cnt = 0;
02317 feat[nFeat*i + cnt] = 1.0;
02318 cnt++;
02319 while ( ss>>r )
02320 {
02321 if ( cnt < nFeat )
02322 feat[nFeat*i + cnt] = r;
02323 else if ( Framework::getFrameworkMode() == 0 )
02324 target[nDomain*nClass*i + cnt - nFeat] = r;
02325 else if ( Framework::getFrameworkMode() == 1 )
02326 target[nDomain*nClass*i + cnt - nFeat] = 0.0;
02327 cnt++;
02328 }
02329 if ( cnt != nFeat+nClass*nDomain && Framework::getFrameworkMode() == 0 )
02330 assert ( false );
02331 }
02332 f.close();
02333 delete[] buf;
02334
02335 if ( Framework::getFrameworkMode() == 1 )
02336 {
02337 nTest = N;
02338 test = feat;
02339 testTarget = target;
02340 testLabel = label;
02341 train = 0;
02342 trainTarget = 0;
02343 trainLabel = 0;
02344 nTrain = 0;
02345 }
02346 else
02347 {
02348 nTrain = N;
02349 train = feat;
02350 trainTarget = target;
02351 trainLabel = label;
02352 test = 0;
02353 testTarget = 0;
02354 testLabel = 0;
02355 nTest = 0;
02356 }
02357 }
02358
02359
02365 void DatasetReader::readMNIST ( string path, REAL* &train, REAL* &trainTarget, int* &trainLabel, REAL* &test, REAL* &testTarget, int* &testLabel, uint& nTrain, uint& nTest, int& nClass, int& nDomain, int& nFeat, REAL positiveTarget, REAL negativeTarget )
02366 {
02367 cout<<"Read MNIST from: "<<path<<endl;
02368
02369 fstream fTrain ( ( path+"/"+string ( "train-images-idx3-ubyte" ) ).c_str(), ios::in );
02370 fstream fTrainLabels ( ( path+"/"+string ( "train-labels-idx1-ubyte" ) ).c_str(), ios::in );
02371 fstream fTest ( ( path+"/"+string ( "t10k-images-idx3-ubyte" ) ).c_str(), ios::in );
02372 fstream fTestLabels ( ( path+"/"+string ( "t10k-labels-idx1-ubyte" ) ).c_str(), ios::in );
02373
02374 if ( fTrain.is_open() ==false || fTrainLabels.is_open() ==false || fTest.is_open() ==false || fTestLabels.is_open() ==false )
02375 {
02376 cout<<"Error in opening the files"<<endl;
02377 exit ( 0 );
02378 }
02379
02380
02381 nClass = 10;
02382 nDomain = 1;
02383 nTrain = 60000;
02384 nTest = 10000;
02385 nFeat = 784;
02386
02387
02388 unsigned char* trainChar = new unsigned char[nTrain * nFeat];
02389 unsigned char* testChar = new unsigned char[nTest * nFeat];
02390 unsigned char* trainLabelChar = new unsigned char[nTrain];
02391 unsigned char* testLabelChar = new unsigned char[nTest];
02392
02393
02394 unsigned int dummy;
02395 fTrain.read ( ( char* ) &dummy, sizeof ( int ) );
02396 fTrain.read ( ( char* ) &dummy, sizeof ( int ) );
02397 fTrain.read ( ( char* ) &dummy, sizeof ( int ) );
02398 fTrain.read ( ( char* ) &dummy, sizeof ( int ) );
02399 fTrain.read ( ( char* ) trainChar, sizeof ( unsigned char ) *nTrain*nFeat );
02400 fTrain.close();
02401
02402 fTrainLabels.read ( ( char* ) &dummy, sizeof ( int ) );
02403 fTrainLabels.read ( ( char* ) &dummy, sizeof ( int ) );
02404 fTrainLabels.read ( ( char* ) trainLabelChar, sizeof ( unsigned char ) *nTrain );
02405 fTrainLabels.close();
02406
02407 fTest.read ( ( char* ) &dummy, sizeof ( int ) );
02408 fTest.read ( ( char* ) &dummy, sizeof ( int ) );
02409 fTest.read ( ( char* ) &dummy, sizeof ( int ) );
02410 fTest.read ( ( char* ) &dummy, sizeof ( int ) );
02411 fTest.read ( ( char* ) testChar, sizeof ( unsigned char ) *nTest*nFeat );
02412 fTest.close();
02413
02414 fTestLabels.read ( ( char* ) &dummy, sizeof ( int ) );
02415 fTestLabels.read ( ( char* ) &dummy, sizeof ( int ) );
02416 fTestLabels.read ( ( char* ) testLabelChar, sizeof ( unsigned char ) *nTest );
02417 fTestLabels.close();
02418
02419
02420 int rows = 50, cols = 100;
02421 fstream fimg ( ( path + "/MNIST.pgm" ).c_str(),ios::out );
02422 char buf[256];
02423 sprintf ( buf,"P5\n%d %d\n255\n", cols*28, rows*28 );
02424 fimg<<buf;
02425
02426 for ( int I=0;I<rows;I++ )
02427 {
02428
02429 for ( int j=0;j<28;j++ )
02430 {
02431 for ( int i=0;i<cols;i++ )
02432 {
02433 for ( int k=0;k<28;k++ )
02434 fimg.write ( ( char* ) &trainChar[k + i*nFeat + j*28 + I*cols*nFeat], sizeof ( unsigned char ) );
02435 }
02436 }
02437 }
02438 fimg.close();
02439
02440
02441 train = new REAL[nTrain * nFeat];
02442 trainLabel = new int[nTrain];
02443 test = new REAL[nTest * nFeat];
02444 testLabel = new int[nTest];
02445
02446 for ( int i=0;i<nTrain;i++ )
02447 {
02448 trainLabel[i] = ( int ) trainLabelChar[i];
02449 for ( int j=0;j<nFeat;j++ )
02450 train[i*nFeat + j] = ( REAL ) trainChar[i*nFeat + j] / 255.0;
02451 }
02452
02453 for ( int i=0;i<nTest;i++ )
02454 {
02455 testLabel[i] = ( int ) testLabelChar[i];
02456 for ( int j=0;j<nFeat;j++ )
02457 test[i*nFeat + j] = ( REAL ) testChar[i*nFeat + j] / 255.0;
02458 }
02459
02460
02461 trainTarget = new REAL[nClass*nTrain];
02462 for ( int i=0;i<nTrain;i++ )
02463 {
02464 for ( int j=0;j<nClass;j++ )
02465 trainTarget[i*nClass + j] = negativeTarget;
02466 trainTarget[i*nClass + trainLabel[i]] = positiveTarget;
02467 }
02468
02469
02470 testTarget = new REAL[nClass*nTest];
02471 for ( int i=0;i<nTest;i++ )
02472 {
02473 for ( int j=0;j<nClass;j++ )
02474 testTarget[i*nClass + j] = negativeTarget;
02475 testTarget[i*nClass + testLabel[i]] = positiveTarget;
02476 }
02477
02478
02479 if ( trainChar )
02480 {
02481 delete[] trainChar;
02482 trainChar = 0;
02483 }
02484 if ( testChar )
02485 {
02486 delete[] testChar;
02487 testChar = 0;
02488 }
02489 if ( trainLabelChar )
02490 {
02491 delete[] trainLabelChar;
02492 trainLabelChar = 0;
02493 }
02494 if ( testLabelChar )
02495 {
02496 delete[] testLabelChar;
02497 testLabelChar = 0;
02498 }
02499 }
02500
02510 void DatasetReader::readAusDM2009 ( string path, REAL* &train, REAL* &trainTarget, int* &trainLabel, REAL* &test, REAL* &testTarget, int* &testLabel, uint& nTrain, uint& nTest, int& nClass, int& nDomain, int& nFeat, REAL positiveTarget, REAL negativeTarget )
02511 {
02512 cout<<"Read AusDM2009 from: "<<path<<endl;
02513
02514
02515
02516
02517 string nameTrain = "L_RMSE_Train.csv";
02518 string nameTest = "L_RMSE_Score.csv";
02519 nClass = 1;
02520 bool addConstantOne = true;
02521
02522 if ( Framework::getDatasetType() == 1 )
02523 {
02524
02525
02526
02527
02528 nameTrain = "L_AUC_Train.csv";
02529 nameTest = "L_AUC_Score.csv";
02530 nClass = 2;
02531 }
02532
02533 cout<<"nameTrain:"<<nameTrain<<" nameTest:"<<nameTest<<endl;
02534
02535 int bufSize = 1024*1024;
02536 char *buf = new char[bufSize];
02537
02538 nDomain = 1;
02539
02540 fstream fTrainRMSE;
02541 fstream fTestRMSE;
02542
02543
02544 fTrainRMSE.open ( ( path+"/"+nameTrain ).c_str(), ios::in );
02545 fTrainRMSE.getline ( buf, bufSize );
02546 fTrainRMSE.getline ( buf, bufSize );
02547 nFeat = 0;
02548 char *ptr = buf, *ptrLast = buf;
02549 int pos = 0, val, colCnt = 0;
02550 while ( ptr[pos] )
02551 {
02552 if ( ptr[pos] == ',' || ptr[pos+1] == 0 )
02553 {
02554 sscanf ( ptrLast,"%d",&val );
02555 ptrLast = ptr + pos + 1;
02556 colCnt++;
02557 if ( colCnt > 2 )
02558 nFeat++;
02559 }
02560 pos++;
02561 }
02562 fTrainRMSE.close();
02563
02564 if ( addConstantOne )
02565 nFeat++;
02566
02567
02568 fTrainRMSE.open ( ( path+"/"+nameTrain ).c_str(), ios::in );
02569 fTrainRMSE.getline ( buf, bufSize );
02570 nTrain = 0;
02571 while ( fTrainRMSE.getline ( buf, bufSize ) )
02572 nTrain++;
02573 fTrainRMSE.close();
02574
02575
02576 fTestRMSE.open ( ( path+"/"+nameTest ).c_str(), ios::in );
02577 fTestRMSE.getline ( buf, bufSize );
02578 nTest = 0;
02579 while ( fTestRMSE.getline ( buf, bufSize ) )
02580 nTest++;
02581 fTestRMSE.close();
02582
02583
02584 train = new REAL[nFeat*nTrain];
02585 test = new REAL[nFeat*nTest];
02586 if ( Framework::getDatasetType() == 1 )
02587 {
02588 trainTarget = new REAL[nTrain*2];
02589 trainLabel = new int[nTrain];
02590 testTarget = new REAL[nTest*2];
02591 testLabel = new int[nTest];
02592 }
02593 else
02594 {
02595 trainTarget = new REAL[nTrain];
02596 trainLabel = 0;
02597 testTarget = new REAL[nTest];
02598 testLabel = 0;
02599 }
02600
02601
02602 fTrainRMSE.open ( ( path+"/"+nameTrain ).c_str(), ios::in );
02603 fTrainRMSE.getline ( buf, bufSize );
02604 nTrain = 0;
02605 while ( fTrainRMSE.getline ( buf, bufSize ) )
02606 {
02607 ptr = buf;
02608 ptrLast = buf;
02609 pos = 0;
02610 colCnt = 0;
02611 while ( ptr[pos] )
02612 {
02613 if ( ptr[pos] == ',' || ptr[pos+1] == 0 )
02614 {
02615 sscanf ( ptrLast,"%d",&val );
02616 ptrLast = ptr + pos + 1;
02617 colCnt++;
02618 if ( colCnt == 2 )
02619 {
02620 if ( Framework::getDatasetType() == 1 )
02621 {
02622 trainLabel[nTrain] = val>0? 0 : 1;
02623 trainTarget[2*nTrain+0] = val>0? positiveTarget : negativeTarget;
02624 trainTarget[2*nTrain+1] = val>0? negativeTarget : positiveTarget;
02625 }
02626 else
02627 trainTarget[nTrain] = ( REAL ) val * 0.001;
02628
02629 }
02630 if ( colCnt > 2 )
02631 train[nTrain*nFeat+colCnt-3] = ( REAL ) val * 0.001;
02632
02633 }
02634 pos++;
02635 }
02636 if ( ( colCnt-3 != nFeat-1 && addConstantOne == false ) || ( colCnt-3 != nFeat-2 && addConstantOne == true ) )
02637 {
02638 cout<<"colCnt:"<<colCnt<<" nFeat:"<<nFeat<<" addConstantOne:"<<addConstantOne<<endl;
02639 assert ( false );
02640 }
02641 if ( addConstantOne )
02642 train[nTrain*nFeat+nFeat-1] = 1.0;
02643 nTrain++;
02644 }
02645 fTrainRMSE.close();
02646
02647
02648 fTestRMSE.open ( ( path+"/"+nameTest ).c_str(), ios::in );
02649 fTestRMSE.getline ( buf, bufSize );
02650 nTest = 0;
02651 while ( fTestRMSE.getline ( buf, bufSize ) )
02652 {
02653 ptr = buf;
02654 ptrLast = buf;
02655 pos = 0;
02656 colCnt = 0;
02657 while ( ptr[pos] )
02658 {
02659 if ( ptr[pos] == ',' || ptr[pos+1] == 0 )
02660 {
02661 sscanf ( ptrLast,"%d",&val );
02662 ptrLast = ptr + pos + 1;
02663 colCnt++;
02664
02665 if ( Framework::getDatasetType() == 1 )
02666 {
02667 testTarget[nTest] = val>0? 0 : 1;
02668 testTarget[2*nTest+0] = val>0? positiveTarget : negativeTarget;
02669 testTarget[2*nTest+1] = val>0? negativeTarget : positiveTarget;
02670 }
02671 else
02672 testTarget[nTest] = ( REAL ) val * 0.001;
02673
02674 if ( colCnt > 2 )
02675 test[nTest*nFeat+colCnt-3] = ( REAL ) val * 0.001;
02676
02677 }
02678 pos++;
02679 }
02680 if ( ( colCnt-3 != nFeat-1 && addConstantOne == false ) || ( colCnt-3 != nFeat-2 && addConstantOne == true ) )
02681 {
02682 cout<<"colCnt:"<<colCnt<<" nFeat:"<<nFeat<<" addConstantOne:"<<addConstantOne<<endl;
02683 assert ( false );
02684 }
02685 if ( addConstantOne )
02686 test[nTest*nFeat+nFeat-1] = 1.0;
02687 nTest++;
02688 }
02689 fTestRMSE.close();
02690
02691
02692
02693
02694
02695
02696
02697
02698
02699
02700
02701
02702
02703
02704
02705
02706
02707
02708
02709
02710
02711
02712
02713
02714
02715
02716
02717
02718
02719
02720
02721
02722
02723
02724
02725
02726
02727
02728
02729
02730
02731
02732
02733
02734
02735
02736
02737
02738
02739
02740
02741
02742
02743
02744
02745
02746
02747
02748
02749
02750
02751
02752
02753
02754
02755
02756
02757
02758
02759
02760
02761
02762
02763
02764
02765
02766
02767 delete[] buf;
02768 }
02769
02775 void DatasetReader::readNETFLIX ( string path, REAL* &train, REAL* &trainTarget, int* &trainLabel, REAL* &test, REAL* &testTarget, int* &testLabel, uint& nTrain, uint& nTest, int& nClass, int& nDomain, int& nFeat, REAL positiveTarget, REAL negativeTarget )
02776 {
02777 cout<<"Read NETFLIX binary predictions from: "<<NETFLIX_DATA_DIR<<endl;
02778
02779 if ( Framework::getAdditionalStartupParameter() < 0 )
02780 {
02781
02782 if ( Framework::getAdditionalStartupParameter() < -100 )
02783 {
02784 srand ( Framework::getRandomSeed() );
02785
02786 cout<<"Probeset subsampling"<<endl;
02787
02788
02789 nClass = 1;
02790 nDomain = 1;
02791 nTrain = 1408395;
02792 nTest = 2817131;
02793
02794 bool* maskProbe = new bool[nTrain];
02795 for ( int i=0;i<nTrain;i++ )
02796 maskProbe[i] = false;
02797 double p = - ( double ) ( Framework::getAdditionalStartupParameter() ) / ( double ) nTrain;
02798 int c = 0;
02799 for ( int i=0;i<nTrain;i++ )
02800 if ( ( double ) rand() / ( double ) RAND_MAX < p )
02801 {
02802 maskProbe[i] = true;
02803 c++;
02804 }
02805 cout<<"Selected: "<<c<<" probe samples"<<endl;
02806
02807
02808
02809 vector<string> files = Data::getDirectoryFileList ( NETFLIX_DATA_DIR );
02810 vector<string> predictionFiles;
02811
02812
02813 nFeat = 0;
02814 for ( int i=0;i<files.size();i++ )
02815 {
02816 int pos = files[i].find ( ".dat" );
02817 string fileEnding = files[i].substr ( files[i].length()-4,4 );
02818 if ( fileEnding == ".dat" )
02819 {
02820 predictionFiles.push_back ( files[i] );
02821 nFeat++;
02822 }
02823 }
02824
02825 cout<<"nFeat: "<<nFeat<<endl;
02826 cout<<"nClass: "<<nClass<<endl;
02827
02828
02829
02830
02831 cout<<"Targets Read:"<<path+"/"+string ( "probeRatings.txt.rand" ) <<endl;
02832 fstream fProbeRatings ( ( path+"/"+string ( "probeRatings.txt.rand" ) ).c_str(), ios::in );
02833 float* ratingCache = new float[nTrain];
02834 for ( int i=0;i<nTrain;i++ )
02835 fProbeRatings>>ratingCache[i];
02836 fProbeRatings.close();
02837
02838
02839 test = 0;
02840 testLabel = 0;
02841 testTarget = 0;
02842 if ( Framework::getFrameworkMode() == 1 )
02843 {
02844 test = new REAL[ ( nTrain+nTest ) * nFeat];
02845 testTarget = new REAL[nTrain+nTest];
02846 for ( int i=0;i<nTrain+nTest;i++ )
02847 testTarget[i] = 0.0;
02848 }
02849 train = new REAL[c * nFeat];
02850 trainLabel = 0;
02851 trainTarget = new REAL[c];
02852 int d = 0;
02853 for ( int j=0;j<nTrain;j++ )
02854 {
02855 if ( maskProbe[j] )
02856 {
02857 trainTarget[d] = ratingCache[j];
02858 d++;
02859 }
02860 }
02861
02862
02863 float* trainTmp = new float[nTrain+nTest];
02864 for ( int i=0;i<predictionFiles.size();i++ )
02865 {
02866 fstream f ( predictionFiles[i].c_str(), ios::in );
02867 f.read ( ( char* ) trainTmp, sizeof ( float ) * ( nTrain+nTest ) );
02868 if ( Framework::getFrameworkMode() == 1 )
02869 for ( int j=0;j<nTrain+nTest;j++ )
02870 test[j*nFeat + i] = trainTmp[j];
02871 d = 0;
02872 for ( int j=0;j<nTrain;j++ )
02873 {
02874 if ( maskProbe[j] )
02875 {
02876 train[d*nFeat + i] = trainTmp[j];
02877 d++;
02878 }
02879 }
02880 f.close();
02881 cout<<"Prediction file: "<<predictionFiles[i]<<endl;
02882 }
02883
02884 delete[] trainTmp;
02885 delete[] ratingCache;
02886 delete[] maskProbe;
02887
02888 nTest = nTrain + nTest;
02889 nTrain = c;
02890 cout<<"nTrain:"<<nTrain<<endl<<"nTest:"<<nTest<<endl<<endl;
02891 return;
02892 }
02893
02894
02895
02896 nClass = 1;
02897 nDomain = 1;
02898
02899
02900
02901
02902 nTrain = 704197;
02903 nTest = 704198;
02904
02905
02906 cout<<"read path from:"<<path+"/path.txt"<<endl;
02907 fstream fP ( ( path+"/path.txt" ).c_str(),ios::in );
02908 string predictorPath;
02909 fP>>predictorPath;
02910 cout<<"path:"<<predictorPath<<endl;
02911 fP.close();
02912
02913
02914 vector<string> files = Data::getDirectoryFileList ( predictorPath );
02915 sort(files.begin(), files.end());
02916 vector<string> predictionFiles;
02917
02918
02919 nFeat = 0;
02920 for ( int i=0;i<files.size();i++ )
02921 {
02922 int pos = files[i].find ( ".dat" );
02923 string fileEnding = files[i].substr ( files[i].length()-4,4 );
02924 if ( fileEnding == ".dat" )
02925 {
02926 predictionFiles.push_back ( files[i] );
02927 nFeat++;
02928 }
02929 }
02930
02931
02932 int nProbe = 1408395;
02933 int nQual = 2817131;
02934 REAL* tmp = new float[nProbe+nQual];
02935 REAL* tmp2 = new float[predictionFiles.size()*nQual];
02936 int* tmp3 = new int[nQual];
02937 fstream ff((predictorPath+"/grand_prize/judging.txt").c_str(),ios::in);
02938 char buf[1024];
02939 int cnt = 0;
02940 while(ff.getline(buf,1024))
02941 {
02942 string line(buf);
02943 if(line.length() > 0)
02944 {
02945 if(line[line.length()-2] != ':')
02946 {
02947 int nr = atoi(line.c_str());
02948 tmp3[cnt] = nr;
02949 cnt++;
02950 }
02951 }
02952 }
02953 assert(cnt==nQual);
02954 ff.close();
02955 for ( int i=0;i<predictionFiles.size();i++ )
02956 {
02957 fstream f ( predictionFiles[i].c_str(), ios::in );
02958 f.read ( ( char* ) tmp, sizeof ( float ) *(nProbe+nQual) );
02959 for(int j=0;j<nQual;j++)
02960 tmp2[j*predictionFiles.size()+i] = tmp[nProbe+j];
02961 f.close();
02962 }
02963 fstream trainCSV((path+"/testQual.csv").c_str(), ios::out);
02964 for(int i=0;i<nQual;i++)
02965 {
02966 for(int j=0;j<predictionFiles.size();j++)
02967 trainCSV<<tmp2[i*nFeat+j]<<",";
02968 trainCSV<<tmp3[i]<<endl;
02969 }
02970 trainCSV.close();
02971 exit(0);
02972
02973
02974
02975
02976 cout<<"nFeat: "<<nFeat<<endl;
02977 cout<<"nClass: "<<nClass<<endl;
02978
02979 bool doClipping = true;
02980 if ( Framework::getAdditionalStartupParameter() == -2 )
02981 doClipping = false;
02982
02983
02984 if ( Framework::getFrameworkMode() == 0 )
02985 {
02986 train = new REAL[nTrain * nFeat];
02987 trainLabel = 0;
02988 trainTarget = new REAL[nTrain * nClass];
02989
02990
02991
02992
02993 cout<<"Targets Read:"<<path+"/"+string ( "probeRatings.txt.rand" ) <<endl;
02994 fstream fProbeRatings ( ( path+"/"+string ( "probeRatings.txt.rand" ) ).c_str(), ios::in );
02995 for ( int i=0;i<nTrain;i++ )
02996 fProbeRatings>>trainTarget[i];
02997 fProbeRatings.close();
02998
02999 float* trainTmp = new float[nTrain];
03000
03001
03002 for ( int i=0;i<predictionFiles.size();i++ )
03003 {
03004 fstream f ( predictionFiles[i].c_str(), ios::in );
03005 f.read ( ( char* ) trainTmp, sizeof ( float ) *nTrain );
03006 double mean = 0.0;
03007 for ( int j=0;j<nTrain;j++ )
03008 mean += trainTmp[j];
03009 mean /= ( double ) nTrain;
03010 if ( mean > 1.0 && mean < 5.0 && doClipping )
03011 cout<<"[clip] ";
03012 for ( int j=0;j<nTrain;j++ )
03013 {
03014 train[j*nFeat + i] = trainTmp[j];
03015 if ( mean > 1.0 && mean < 5.0 && doClipping )
03016 {
03017 if ( train[j*nFeat + i] > 5.0 )
03018 train[j*nFeat + i] = 5.0;
03019 if ( train[j*nFeat + i] < 1.0 )
03020 train[j*nFeat + i] = 1.0;
03021 }
03022 }
03023 f.close();
03024 cout<<"Prediction file: "<<predictionFiles[i]<<" mean:"<<mean<<endl;
03025 }
03026
03027 if ( trainTmp )
03028 {
03029 delete[] trainTmp;
03030 trainTmp = 0;
03031 }
03032
03033 test = 0;
03034 testLabel = 0;
03035 testTarget = 0;
03036 nTest = 0;
03037
03038
03039
03040
03041
03042
03043
03044
03045
03046
03047 }
03048
03049 if ( Framework::getFrameworkMode() == 1 )
03050 {
03051 cout<<"alloc: "<<nTest * ( uint ) nFeat<<endl;
03052 test = new REAL[nTest * ( uint ) nFeat];
03053 testLabel = 0;
03054 testTarget = new REAL[nTest * ( uint ) nClass];
03055
03056
03057 for ( int i=0;i<nTest;i++ )
03058 testTarget[i] = 3.7;
03059
03060
03061 cout<<"Targets Read:"<<path+"/"+string ( "probeRatings.txt.rand" ) <<endl;
03062 fstream fProbeRatings ( ( path+"/"+string ( "probeRatings.txt.rand" ) ).c_str(), ios::in );
03063 REAL dummy;
03064 for ( int i=0;i<nTrain;i++ )
03065 fProbeRatings>>dummy;
03066 for ( int i=0;i<nTest;i++ )
03067 fProbeRatings>>testTarget[i];
03068 fProbeRatings.close();
03069
03070 float* testTmp = new float[nTest];
03071
03072
03073 for ( uint i=0;i<predictionFiles.size();i++ )
03074 {
03075 fstream f ( predictionFiles[i].c_str(), ios::in );
03076 f.read ( ( char* ) testTmp, sizeof ( float ) *nTrain );
03077 f.read ( ( char* ) testTmp, sizeof ( float ) *nTest );
03078 double mean = 0.0;
03079 for ( int j=0;j<nTest;j++ )
03080 mean += testTmp[j];
03081 mean /= ( double ) nTest;
03082 if ( mean > 1.0 && mean < 5.0 && doClipping )
03083 cout<<"[clip] ";
03084 for ( uint j=0;j<nTest;j++ )
03085 {
03086 test[j* ( uint ) nFeat + i] = testTmp[j];
03087 if ( mean > 1.0 && mean < 5.0 && doClipping )
03088 {
03089 if ( test[j* ( uint ) nFeat + i] > 5.0 )
03090 test[j* ( uint ) nFeat + i] = 5.0;
03091 if ( test[j* ( uint ) nFeat + i] < 1.0 )
03092 test[j* ( uint ) nFeat + i] = 1.0;
03093 }
03094 }
03095 f.close();
03096 cout<<"Prediction file: "<<predictionFiles[i]<<" mean:"<<mean<<endl;
03097 }
03098
03099 if ( testTmp )
03100 {
03101 delete[] testTmp;
03102 testTmp = 0;
03103 }
03104
03105 train = 0;
03106 trainLabel = 0;
03107 trainTarget = 0;
03108 nTrain = 0;
03109
03110
03111
03112
03113
03114
03115
03116
03117
03118
03119 }
03120 }
03121 else
03122 {
03123
03124 nClass = 1;
03125 nDomain = 1;
03126 char buf0[512];
03127 char buf1[512];
03128 char buf2[512];
03129 char buf3[512];
03130 char buf4[512];
03131 sprintf ( buf0,"%s/%s%d/",string ( NETFLIX_SLOTDATA_ROOT_DIR ).c_str(),"p",Framework::getAdditionalStartupParameter() );
03132 sprintf ( buf1,"%s/%s%d/nProbe.data",string ( NETFLIX_SLOTDATA_ROOT_DIR ).c_str(),"p",Framework::getAdditionalStartupParameter() );
03133 sprintf ( buf2,"%s/%s%d/nQual.data",string ( NETFLIX_SLOTDATA_ROOT_DIR ).c_str(),"p",Framework::getAdditionalStartupParameter() );
03134 sprintf ( buf3,"%s/%s%d/ratings.data",string ( NETFLIX_SLOTDATA_ROOT_DIR ).c_str(),"p",Framework::getAdditionalStartupParameter() );
03135 sprintf ( buf4,"%s/%s%d/ratingsTest.data",string ( NETFLIX_SLOTDATA_ROOT_DIR ).c_str(),"p",Framework::getAdditionalStartupParameter() );
03136
03137 fstream f;
03138
03139
03140 f.open ( buf1,ios::in );
03141 f.read ( ( char* ) &nTrain,sizeof ( int ) );
03142 f.close();
03143
03144
03145 f.open ( buf2,ios::in );
03146 f.read ( ( char* ) &nTest,sizeof ( int ) );
03147 f.close();
03148
03149
03150 float* tmp = new float[nTrain+nTest];
03151 f.open ( buf3,ios::in );
03152 f.read ( ( char* ) tmp,sizeof ( float ) *nTrain );
03153 f.close();
03154 trainTarget = new REAL[nTrain];
03155 for ( int i=0;i<nTrain;i++ )
03156 trainTarget[i] = tmp[i];
03157 testTarget = new REAL[nTest];
03158
03159
03160 f.open ( buf4,ios::in );
03161 f.read ( ( char* ) tmp,sizeof ( float ) *nTest );
03162 f.close();
03163 for ( int i=0;i<nTest;i++ )
03164 testTarget[i] = tmp[i];
03165
03166
03167 vector<string> files = Data::getDirectoryFileList ( buf0 );
03168 vector<string> predictionFiles;
03169
03170
03171 nFeat = 0;
03172 for ( int i=0;i<files.size();i++ )
03173 {
03174 string fileEnding = files[i].substr ( files[i].length()-4,4 );
03175 if ( fileEnding == ".dat" )
03176 {
03177 predictionFiles.push_back ( files[i] );
03178 nFeat++;
03179 }
03180 }
03181
03182 cout<<"nFeat: "<<nFeat<<endl;
03183 cout<<"nClass: "<<nClass<<endl;
03184 cout<<"nTrain: "<<nTrain<<endl;
03185 cout<<"nTest: "<<nTest<<endl;
03186
03187
03188 if ( Framework::getFrameworkMode() == 0 )
03189 {
03190 cout<<"allocate trainset: "<<nTrain * nFeat<<" elements"<<endl;
03191 train = new REAL[nTrain * nFeat];
03192 trainLabel = 0;
03193 }
03194 else
03195 {
03196 cout<<"allocate testset : "<< ( uint ) nTest * nFeat<<" elements"<<endl;
03197 test = new REAL[nTest * nFeat];
03198 testLabel = 0;
03199 }
03200
03201
03202 for ( int i=0;i<predictionFiles.size();i++ )
03203 {
03204 cout<<i<<"/"<< ( int ) predictionFiles.size() <<" ";
03205 f.open ( predictionFiles[i].c_str(), ios::in );
03206 f.read ( ( char* ) tmp, sizeof ( float ) * ( nTrain+nTest ) );
03207 f.close();
03208 double mean = 0.0;
03209 for ( int j=0;j<nTrain+nTest;j++ )
03210 mean += tmp[j];
03211 mean /= ( double ) ( nTrain+nTest );
03212 if ( mean > 1.0 && mean < 5.0 )
03213 cout<<"[clip] ";
03214 cout<<"mu:"<<mean<<" ";
03215 if ( Framework::getFrameworkMode() == 0 )
03216 {
03217
03218 for ( int j=0;j<nTrain;j++ )
03219 {
03220 train[j*nFeat + i] = tmp[j];
03221 if ( mean > 1.0 && mean < 5.0 )
03222 {
03223 if ( train[j*nFeat + i] > 5.0 )
03224 train[j*nFeat + i] = 5.0;
03225 if ( train[j*nFeat + i] < 1.0 )
03226 train[j*nFeat + i] = 1.0;
03227 }
03228 }
03229 }
03230 else
03231 {
03232
03233 for ( int j=0;j<nTest;j++ )
03234 {
03235 test[j*nFeat + i] = tmp[j+nTrain];
03236 if ( mean > 1.0 && mean < 5.0 )
03237 {
03238 if ( test[j*nFeat + i] > 5.0 )
03239 test[j*nFeat + i] = 5.0;
03240 if ( test[j*nFeat + i] < 1.0 )
03241 test[j*nFeat + i] = 1.0;
03242 }
03243 }
03244 }
03245 cout<<"Prediction file: "<<predictionFiles[i]<<endl;
03246 }
03247
03248 if ( Framework::getFrameworkMode() == 0 )
03249 nTest = 0;
03250 else
03251 nTrain = 0;
03252
03253 delete[] tmp;
03254
03255 }
03256
03257 }
03258
03266 void DatasetReader::readADULT ( string path, REAL* &train, REAL* &trainTarget, int* &trainLabel, REAL* &test, REAL* &testTarget, int* &testLabel, uint& nTrain, uint& nTest, int& nClass, int& nDomain, int& nFeat, REAL positiveTarget, REAL negativeTarget )
03267 {
03268 cout<<"Read ADULT from: "<<path<<endl;
03269 nDomain = 1;
03270
03271
03272 int targetColumn = 15;
03273 char columnType[] = "ndndndddddnnndd";
03274 char enabledCol[] = "111111111111111";
03275 const char* dataFiles[] = { ( new string ( path+"/adult.data" ) )->c_str(), ( new string ( path+"/adult.test" ) )->c_str(),0};
03276
03277
03278 getDataBounds ( dataFiles, ", ", nFeat, nClass, nTrain, columnType, enabledCol, targetColumn, 0 );
03279
03280
03281 train = new REAL[nTrain * nFeat];
03282 trainLabel = new int[nTrain];
03283
03284
03285 getDataBounds ( dataFiles, ", ", nFeat, nClass, nTrain, columnType, enabledCol, targetColumn, 0, true, train, trainLabel );
03286
03287
03288
03289 getDataBounds ( dataFiles, ", ", nFeat, nClass, nTest, columnType, enabledCol, targetColumn, 1 );
03290
03291
03292 test = new REAL[nTest * nFeat];
03293 testLabel = new int[nTest];
03294
03295
03296 getDataBounds ( dataFiles, ", ", nFeat, nClass, nTest, columnType, enabledCol, targetColumn, 1, true, test, testLabel );
03297
03298
03299 makeNumericTrainAndTestTargets ( nClass, nTrain, nTest, positiveTarget, negativeTarget, trainLabel, testLabel, trainTarget, testTarget );
03300 }
03301
03307 void DatasetReader::readAUSTRALIAN ( string path, REAL* &train, REAL* &trainTarget, int* &trainLabel, REAL* &test, REAL* &testTarget, int* &testLabel, uint& nTrain, uint& nTest, int& nClass, int& nDomain, int& nFeat, REAL positiveTarget, REAL negativeTarget )
03308 {
03309 cout<<"Read AUSTRALIAN from: "<<path<<endl;
03310 nDomain = 1;
03311
03312 char* dirs[] = {"tctodd1","tctodd2","tctodd3","tctodd4","tctodd5","tctodd6","tctodd7","tctodd8","tctodd9",0};
03313
03314 char* signs[] = {"alive","all","answer","boy","building","buy","change_mind_","cold","come","computer_PC_","cost","crazy","danger","deaf","different","draw","drink","eat","exit","flash-light","forget","girl","give","glove","go","God","happy","head","hear","hello","his_hers","hot","how","hurry","hurt","I","innocent","is_true_","joke","juice","know","later","lose","love","make","man","maybe","mine","money","more","name","no","Norway","not-my-problem","paper","pen","please","polite","question","read","ready","research","responsible","right","sad","same","science","share","shop","soon","sorry","spend","stubborn","surprise","take","temper","thank","think","tray","us","voluntary","wait_notyet_","what","when","where","which","who","why","wild","will","write","wrong","yes","you","zero",0};
03315
03316 nClass = 0;
03317 while ( signs[nClass] )
03318 nClass++;
03319
03320 cout<<"nClass:"<<nClass<<endl;
03321
03322 fstream fTrain;
03323
03324
03325 int nTrainTmp = 0;
03326 int dirCnt = 0;
03327 char buf[10000];
03328 int maxFrames = 0;
03329 int dataPerLine = 22;
03330 while ( dirs[dirCnt] )
03331 {
03332 int signCnt = 0;
03333 while ( signs[signCnt] )
03334 {
03335 for ( int i=0;i<3;i++ )
03336 {
03337 sprintf ( buf,"%s/%s/%s-%d.tsd",path.c_str(),dirs[dirCnt],signs[signCnt],i+1 );
03338 fTrain.open ( buf, ios::in );
03339 if ( fTrain.is_open() == false )
03340 cout<<"Can not open "<<buf<<endl;
03341 else
03342 {
03343 int lines = 0;
03344 while ( fTrain.getline ( buf, 10000 ) )
03345 {
03346 stringstream ss ( buf );
03347 REAL r;
03348 int cnt = 0;
03349 while ( ss>>r )
03350 cnt++;
03351 if ( cnt != dataPerLine )
03352 assert ( false );
03353 lines++;
03354 }
03355 if ( lines > maxFrames )
03356 maxFrames = lines;
03357 nTrainTmp++;
03358 }
03359 fTrain.close();
03360 }
03361 signCnt++;
03362 }
03363 dirCnt++;
03364 }
03365
03366 cout<<"nTrainTmp:"<<nTrainTmp<<endl;
03367 cout<<"maxFrames:"<<maxFrames<<endl;
03368
03369 nFeat = maxFrames * dataPerLine;
03370 cout<<"nFeat:"<<nFeat<<" ("<<maxFrames<<"*"<<dataPerLine<<")"<<endl;
03371
03372
03373 REAL* trainTmp = new REAL[nTrainTmp * nFeat];
03374 int* trainLabelTmp = new int[nTrainTmp];
03375 for ( int i=0;i<nTrainTmp * nFeat;i++ )
03376 trainTmp[i] = 0.0;
03377 for ( int i=0;i<nTrainTmp;i++ )
03378 trainLabelTmp[i] = 0;
03379
03380
03381 nTrainTmp = 0;
03382 dirCnt = 0;
03383 while ( dirs[dirCnt] )
03384 {
03385 int signCnt = 0;
03386 while ( signs[signCnt] )
03387 {
03388 for ( int i=0;i<3;i++ )
03389 {
03390 sprintf ( buf,"%s/%s/%s-%d.tsd",path.c_str(),dirs[dirCnt],signs[signCnt],i+1 );
03391 fTrain.open ( buf, ios::in );
03392 if ( fTrain.is_open() == false )
03393 cout<<"Can not open "<<buf<<endl;
03394 else
03395 {
03396 int lines = 0;
03397 while ( fTrain.getline ( buf, 10000 ) )
03398 {
03399 stringstream ss ( buf );
03400 REAL r;
03401 int cnt = 0;
03402 while ( ss>>r )
03403 {
03404 trainTmp[nTrainTmp * nFeat + lines * dataPerLine + cnt] = r;
03405 trainLabelTmp[nTrainTmp] = signCnt;
03406 cnt++;
03407 }
03408 if ( cnt != dataPerLine )
03409 assert ( false );
03410 lines++;
03411 }
03412 nTrainTmp++;
03413 }
03414 fTrain.close();
03415 }
03416 signCnt++;
03417 }
03418 dirCnt++;
03419 }
03420
03421
03422 splitRandomTestset ( 0.2, trainTmp, trainLabelTmp, nTrainTmp, nFeat, nClass, train, trainLabel, trainTarget, test, testLabel, testTarget, nTrain, nTest, positiveTarget, negativeTarget );
03423
03424 delete[] trainTmp;
03425 delete[] trainLabelTmp;
03426
03427 }
03428
03436 void DatasetReader::readBALANCE ( string path, REAL* &train, REAL* &trainTarget, int* &trainLabel, REAL* &test, REAL* &testTarget, int* &testLabel, uint& nTrain, uint& nTest, int& nClass, int& nDomain, int& nFeat, REAL positiveTarget, REAL negativeTarget )
03437 {
03438 cout<<"Read BALANCE from: "<<path<<endl;
03439 nDomain = 1;
03440
03441
03442 int targetColumn = 1;
03443 uint nTrainTmp;
03444 char columnType[] = "dnnnn";
03445 char enabledCol[] = "11111";
03446 const char* dataFiles[] = { ( new string ( path+"/balance-scale.data" ) )->c_str(),0};
03447
03448
03449 getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0 );
03450
03451
03452 REAL* trainTmp = new REAL[nTrainTmp * nFeat];
03453 int* trainLabelTmp = new int[nTrainTmp];
03454
03455
03456 getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0, true, trainTmp, trainLabelTmp );
03457
03458
03459 splitRandomTestset ( 0.2, trainTmp, trainLabelTmp, nTrainTmp, nFeat, nClass, train, trainLabel, trainTarget, test, testLabel, testTarget, nTrain, nTest, positiveTarget, negativeTarget );
03460
03461 delete[] trainTmp;
03462 delete[] trainLabelTmp;
03463
03464 }
03465
03473 void DatasetReader::readCYLINDERBANDS ( string path, REAL* &train, REAL* &trainTarget, int* &trainLabel, REAL* &test, REAL* &testTarget, int* &testLabel, uint& nTrain, uint& nTest, int& nClass, int& nDomain, int& nFeat, REAL positiveTarget, REAL negativeTarget )
03474 {
03475 cout<<"Read CYLINDER-BANDS from: "<<path<<endl;
03476 nDomain = 1;
03477
03478
03479 int targetColumn = 40;
03480 uint nTrainTmp;
03481 char columnType[] = "ndddddddddddddddddddnnnnnnnnnnnnnnnnnnnd";
03482 char enabledCol[] = "1111111111111111111111111111111111111111";
03483 const char* dataFiles[] = { ( new string ( path+"/bands.data" ) )->c_str(),0};
03484
03485
03486 getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0 );
03487
03488
03489 REAL* trainTmp = new REAL[nTrainTmp * nFeat];
03490 int* trainLabelTmp = new int[nTrainTmp];
03491
03492
03493 getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0, true, trainTmp, trainLabelTmp );
03494
03495
03496 splitRandomTestset ( 0.2, trainTmp, trainLabelTmp, nTrainTmp, nFeat, nClass, train, trainLabel, trainTarget, test, testLabel, testTarget, nTrain, nTest, positiveTarget, negativeTarget );
03497
03498 delete[] trainTmp;
03499 delete[] trainLabelTmp;
03500
03501 }
03502
03515 void DatasetReader::readBREASTCANCERWISCONSIN ( string path, REAL* &train, REAL* &trainTarget, int* &trainLabel, REAL* &test, REAL* &testTarget, int* &testLabel, uint& nTrain, uint& nTest, int& nClass, int& nDomain, int& nFeat, REAL positiveTarget, REAL negativeTarget )
03516 {
03517 cout<<"Read BREAST-CANCER-WISCONSIN from: "<<path<<endl;
03518 nDomain = 1;
03519
03520
03521 int targetColumn = 11;
03522 uint nTrainTmp;
03523 char columnType[] = "nnnnnnnnnnd";
03524 char enabledCol[] = "11111111111";
03525
03526 const char* dataFiles[] = { ( new string ( path+"/breast-cancer-wisconsin.data" ) )->c_str(),0};
03527
03528
03529 getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0 );
03530
03531
03532 REAL* trainTmp = new REAL[nTrainTmp * nFeat];
03533 int* trainLabelTmp = new int[nTrainTmp];
03534
03535
03536 getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0, true, trainTmp, trainLabelTmp );
03537
03538
03539 splitRandomTestset ( 0.2, trainTmp, trainLabelTmp, nTrainTmp, nFeat, nClass, train, trainLabel, trainTarget, test, testLabel, testTarget, nTrain, nTest, positiveTarget, negativeTarget );
03540
03541 delete[] trainTmp;
03542 delete[] trainLabelTmp;
03543
03544 }
03545
03553 void DatasetReader::readAUSTRALIANCREDIT ( string path, REAL* &train, REAL* &trainTarget, int* &trainLabel, REAL* &test, REAL* &testTarget, int* &testLabel, uint& nTrain, uint& nTest, int& nClass, int& nDomain, int& nFeat, REAL positiveTarget, REAL negativeTarget )
03554 {
03555 cout<<"Read AUSTRALIAN-CREDIT from: "<<path<<endl;
03556 nDomain = 1;
03557
03558
03559 int targetColumn = 15;
03560 uint nTrainTmp;
03561 char columnType[] = "dnndddnddnddnnd";
03562 char enabledCol[] = "111111111111111";
03563 const char* dataFiles[] = { ( new string ( path+"/australian.dat" ) )->c_str(),0};
03564
03565
03566 getDataBounds ( dataFiles, " ", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0 );
03567
03568
03569 REAL* trainTmp = new REAL[nTrainTmp * nFeat];
03570 int* trainLabelTmp = new int[nTrainTmp];
03571
03572
03573 getDataBounds ( dataFiles, " ", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0, true, trainTmp, trainLabelTmp );
03574
03575
03576 splitRandomTestset ( 0.2, trainTmp, trainLabelTmp, nTrainTmp, nFeat, nClass, train, trainLabel, trainTarget, test, testLabel, testTarget, nTrain, nTest, positiveTarget, negativeTarget );
03577
03578 delete[] trainTmp;
03579 delete[] trainLabelTmp;
03580 }
03581
03582
03589 void DatasetReader::readDIABETES ( string path, REAL* &train, REAL* &trainTarget, int* &trainLabel, REAL* &test, REAL* &testTarget, int* &testLabel, uint& nTrain, uint& nTest, int& nClass, int& nDomain, int& nFeat, REAL positiveTarget, REAL negativeTarget )
03590 {
03591 cout<<"Read DIABETES from: "<<path<<endl;
03592 nDomain = 1;
03593
03594
03595 int targetColumn = 9;
03596 uint nTrainTmp;
03597 char columnType[] = "nnnnnnnnd";
03598 char enabledCol[] = "111111111";
03599 const char* dataFiles[] = { ( new string ( path+"/pima-indians-diabetes.data" ) )->c_str(),0};
03600
03601
03602 getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0 );
03603
03604
03605 REAL* trainTmp = new REAL[nTrainTmp * nFeat];
03606 int* trainLabelTmp = new int[nTrainTmp];
03607
03608
03609 getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0, true, trainTmp, trainLabelTmp );
03610
03611
03612 splitRandomTestset ( 0.2, trainTmp, trainLabelTmp, nTrainTmp, nFeat, nClass, train, trainLabel, trainTarget, test, testLabel, testTarget, nTrain, nTest, positiveTarget, negativeTarget );
03613
03614 delete[] trainTmp;
03615 delete[] trainLabelTmp;
03616
03617 }
03618
03626 void DatasetReader::readGERMAN ( string path, REAL* &train, REAL* &trainTarget, int* &trainLabel, REAL* &test, REAL* &testTarget, int* &testLabel, uint& nTrain, uint& nTest, int& nClass, int& nDomain, int& nFeat, REAL positiveTarget, REAL negativeTarget )
03627 {
03628 cout<<"Read GERMAN from: "<<path<<endl;
03629 nDomain = 1;
03630
03631
03632 int targetColumn = 21;
03633 uint nTrainTmp;
03634 char columnType[] = "dnddnddnddndnddndnddd";
03635 char enabledCol[] = "111111111111111111111";
03636 const char* dataFiles[] = { ( new string ( path+"/german.data" ) )->c_str(),0};
03637
03638
03639 getDataBounds ( dataFiles, " ", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0 );
03640
03641
03642 REAL* trainTmp = new REAL[nTrainTmp * nFeat];
03643 int* trainLabelTmp = new int[nTrainTmp];
03644
03645
03646 getDataBounds ( dataFiles, " ", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0, true, trainTmp, trainLabelTmp );
03647
03648
03649 splitRandomTestset ( 0.2, trainTmp, trainLabelTmp, nTrainTmp, nFeat, nClass, train, trainLabel, trainTarget, test, testLabel, testTarget, nTrain, nTest, positiveTarget, negativeTarget );
03650
03651 delete[] trainTmp;
03652 delete[] trainLabelTmp;
03653
03654 }
03655
03663 void DatasetReader::readGLASS ( string path, REAL* &train, REAL* &trainTarget, int* &trainLabel, REAL* &test, REAL* &testTarget, int* &testLabel, uint& nTrain, uint& nTest, int& nClass, int& nDomain, int& nFeat, REAL positiveTarget, REAL negativeTarget )
03664 {
03665 cout<<"Read GLASS from: "<<path<<endl;
03666 nDomain = 1;
03667
03668
03669 int targetColumn = 11;
03670 uint nTrainTmp;
03671 char columnType[] = "nnnnnnnnnnd";
03672 char enabledCol[] = "01111111111";
03673 const char* dataFiles[] = { ( new string ( path+"/glass.data" ) )->c_str(),0};
03674
03675
03676 getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0 );
03677
03678
03679 REAL* trainTmp = new REAL[nTrainTmp * nFeat];
03680 int* trainLabelTmp = new int[nTrainTmp];
03681
03682
03683 getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0, true, trainTmp, trainLabelTmp );
03684
03685
03686 splitRandomTestset ( 0.2, trainTmp, trainLabelTmp, nTrainTmp, nFeat, nClass, train, trainLabel, trainTarget, test, testLabel, testTarget, nTrain, nTest, positiveTarget, negativeTarget );
03687
03688 delete[] trainTmp;
03689 delete[] trainLabelTmp;
03690
03691 }
03692
03700 void DatasetReader::readHEART ( string path, REAL* &train, REAL* &trainTarget, int* &trainLabel, REAL* &test, REAL* &testTarget, int* &testLabel, uint& nTrain, uint& nTest, int& nClass, int& nDomain, int& nFeat, REAL positiveTarget, REAL negativeTarget )
03701 {
03702 cout<<"Read HEART from: "<<path<<endl;
03703 nDomain = 1;
03704
03705
03706 int targetColumn = 1;
03707 char columnType[] = "dnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn";
03708 char enabledCol[] = "111111111111111111111111111111111111111111111";
03709 const char* dataFiles[] = { ( new string ( path+"/SPECTF.train" ) )->c_str(), ( new string ( path+"/SPECTF.test" ) )->c_str(),0};
03710
03711
03712 getDataBounds ( dataFiles, ",", nFeat, nClass, nTrain, columnType, enabledCol, targetColumn, 0 );
03713 train = new REAL[nFeat*nTrain];
03714 trainLabel = new int[nTrain];
03715 getDataBounds ( dataFiles, ",", nFeat, nClass, nTrain, columnType, enabledCol, targetColumn, 0, true, train, trainLabel );
03716
03717
03718 getDataBounds ( dataFiles, ",", nFeat, nClass, nTest, columnType, enabledCol, targetColumn, 1 );
03719 test = new REAL[nFeat*nTest];
03720 testLabel = new int[nTest];
03721 getDataBounds ( dataFiles, ",", nFeat, nClass, nTest, columnType, enabledCol, targetColumn, 1, true, test, testLabel );
03722
03723
03724 makeNumericTrainAndTestTargets ( nClass, nTrain, nTest, positiveTarget, negativeTarget, trainLabel, testLabel, trainTarget, testTarget );
03725
03726 }
03727
03734 void DatasetReader::readHEPATITIS ( string path, REAL* &train, REAL* &trainTarget, int* &trainLabel, REAL* &test, REAL* &testTarget, int* &testLabel, uint& nTrain, uint& nTest, int& nClass, int& nDomain, int& nFeat, REAL positiveTarget, REAL negativeTarget )
03735 {
03736 cout<<"Read HEPATITIS from: "<<path<<endl;
03737 nDomain = 1;
03738
03739
03740 int targetColumn = 1;
03741 uint nTrainTmp;
03742 char columnType[] = "dnnnnnnnnnnnnnnnnnnn";
03743 char enabledCol[] = "11111111111111111111";
03744 const char* dataFiles[] = { ( new string ( path+"/hepatitis.data" ) )->c_str(),0};
03745
03746
03747 getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0 );
03748
03749
03750 REAL* trainTmp = new REAL[nTrainTmp * nFeat];
03751 int* trainLabelTmp = new int[nTrainTmp];
03752
03753
03754 getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0, true, trainTmp, trainLabelTmp );
03755
03756
03757 splitRandomTestset ( 0.2, trainTmp, trainLabelTmp, nTrainTmp, nFeat, nClass, train, trainLabel, trainTarget, test, testLabel, testTarget, nTrain, nTest, positiveTarget, negativeTarget );
03758
03759 delete[] trainTmp;
03760 delete[] trainLabelTmp;
03761
03762 }
03763
03770 void DatasetReader::readIONOSPHERE ( string path, REAL* &train, REAL* &trainTarget, int* &trainLabel, REAL* &test, REAL* &testTarget, int* &testLabel, uint& nTrain, uint& nTest, int& nClass, int& nDomain, int& nFeat, REAL positiveTarget, REAL negativeTarget )
03771 {
03772 cout<<"Read IONOSPHERE from: "<<path<<endl;
03773 nDomain = 1;
03774
03775
03776 int targetColumn = 35;
03777 uint nTrainTmp;
03778 char columnType[] = "nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnd";
03779 char enabledCol[] = "11111111111111111111111111111111111";
03780 const char* dataFiles[] = { ( new string ( path+"/ionosphere.data" ) )->c_str(),0};
03781
03782
03783 getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0 );
03784
03785
03786 REAL* trainTmp = new REAL[nTrainTmp * nFeat];
03787 int* trainLabelTmp = new int[nTrainTmp];
03788
03789
03790 getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0, true, trainTmp, trainLabelTmp );
03791
03792
03793 splitRandomTestset ( 0.2, trainTmp, trainLabelTmp, nTrainTmp, nFeat, nClass, train, trainLabel, trainTarget, test, testLabel, testTarget, nTrain, nTest, positiveTarget, negativeTarget );
03794
03795 delete[] trainTmp;
03796 delete[] trainLabelTmp;
03797
03798 }
03799
03800
03807 void DatasetReader::readIRIS ( string path, REAL* &train, REAL* &trainTarget, int* &trainLabel, REAL* &test, REAL* &testTarget, int* &testLabel, uint& nTrain, uint& nTest, int& nClass, int& nDomain, int& nFeat, REAL positiveTarget, REAL negativeTarget )
03808 {
03809 cout<<"Read IRIS from: "<<path<<endl;
03810 nDomain = 1;
03811
03812
03813 int targetColumn = 5;
03814 uint nTrainTmp;
03815 char columnType[] = "nnnnd";
03816 char enabledCol[] = "11111";
03817 const char* dataFiles[] = { ( new string ( path+"/iris.data" ) )->c_str(),0};
03818
03819
03820 getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0 );
03821
03822
03823 REAL* trainTmp = new REAL[nTrainTmp * nFeat];
03824 int* trainLabelTmp = new int[nTrainTmp];
03825
03826
03827 getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0, true, trainTmp, trainLabelTmp );
03828
03829
03830 splitRandomTestset ( 0.2, trainTmp, trainLabelTmp, nTrainTmp, nFeat, nClass, train, trainLabel, trainTarget, test, testLabel, testTarget, nTrain, nTest, positiveTarget, negativeTarget );
03831
03832 delete[] trainTmp;
03833 delete[] trainLabelTmp;
03834
03835 }
03836
03843 void DatasetReader::readLETTER ( string path, REAL* &train, REAL* &trainTarget, int* &trainLabel, REAL* &test, REAL* &testTarget, int* &testLabel, uint& nTrain, uint& nTest, int& nClass, int& nDomain, int& nFeat, REAL positiveTarget, REAL negativeTarget )
03844 {
03845 cout<<"Read LETTER from: "<<path<<endl;
03846 nDomain = 1;
03847
03848
03849 int targetColumn = 1;
03850 uint nTrainTmp;
03851 char columnType[] = "dnnnnnnnnnnnnnnnn";
03852 char enabledCol[] = "11111111111111111";
03853 const char* dataFiles[] = { ( new string ( path+"/letter-recognition.data" ) )->c_str(),0};
03854
03855
03856 getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0 );
03857
03858
03859 REAL* trainTmp = new REAL[nTrainTmp * nFeat];
03860 int* trainLabelTmp = new int[nTrainTmp];
03861
03862
03863 getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0, true, trainTmp, trainLabelTmp );
03864
03865
03866 splitRandomTestset ( 0.2, trainTmp, trainLabelTmp, nTrainTmp, nFeat, nClass, train, trainLabel, trainTarget, test, testLabel, testTarget, nTrain, nTest, positiveTarget, negativeTarget, true );
03867
03868 delete[] trainTmp;
03869 delete[] trainLabelTmp;
03870
03871 }
03872
03879 void DatasetReader::readMONKS1 ( string path, REAL* &train, REAL* &trainTarget, int* &trainLabel, REAL* &test, REAL* &testTarget, int* &testLabel, uint& nTrain, uint& nTest, int& nClass, int& nDomain, int& nFeat, REAL positiveTarget, REAL negativeTarget )
03880 {
03881 cout<<"Read MONKS1 from: "<<path<<endl;
03882 nDomain = 1;
03883
03884
03885 int targetColumn = 1;
03886 char columnType[] = "dnnnnnnd";
03887 char enabledCol[] = "11111110";
03888 const char* dataFiles[] = { ( new string ( path+"/monks-1.train" ) )->c_str(), ( new string ( path+"/monks-1.test" ) )->c_str(),0};
03889
03890
03891 getDataBounds ( dataFiles, " ", nFeat, nClass, nTrain, columnType, enabledCol, targetColumn, 0 );
03892 train = new REAL[nFeat*nTrain];
03893 trainLabel = new int[nTrain];
03894 getDataBounds ( dataFiles, " ", nFeat, nClass, nTrain, columnType, enabledCol, targetColumn, 0, true, train, trainLabel );
03895
03896
03897 getDataBounds ( dataFiles, " ", nFeat, nClass, nTest, columnType, enabledCol, targetColumn, 1 );
03898 test = new REAL[nFeat*nTest];
03899 testLabel = new int[nTest];
03900 getDataBounds ( dataFiles, " ", nFeat, nClass, nTest, columnType, enabledCol, targetColumn, 1, true, test, testLabel );
03901
03902
03903 makeNumericTrainAndTestTargets ( nClass, nTrain, nTest, positiveTarget, negativeTarget, trainLabel, testLabel, trainTarget, testTarget );
03904
03905 }
03906
03913 void DatasetReader::readMONKS2 ( string path, REAL* &train, REAL* &trainTarget, int* &trainLabel, REAL* &test, REAL* &testTarget, int* &testLabel, uint& nTrain, uint& nTest, int& nClass, int& nDomain, int& nFeat, REAL positiveTarget, REAL negativeTarget )
03914 {
03915 cout<<"Read MONKS2 from: "<<path<<endl;
03916 nDomain = 1;
03917
03918
03919 int targetColumn = 1;
03920 char columnType[] = "dnnnnnnd";
03921 char enabledCol[] = "11111110";
03922 const char* dataFiles[] = { ( new string ( path+"/monks-2.train" ) )->c_str(), ( new string ( path+"/monks-2.test" ) )->c_str(),0};
03923
03924
03925 getDataBounds ( dataFiles, " ", nFeat, nClass, nTrain, columnType, enabledCol, targetColumn, 0 );
03926 train = new REAL[nFeat*nTrain];
03927 trainLabel = new int[nTrain];
03928 getDataBounds ( dataFiles, " ", nFeat, nClass, nTrain, columnType, enabledCol, targetColumn, 0, true, train, trainLabel );
03929
03930
03931 getDataBounds ( dataFiles, " ", nFeat, nClass, nTest, columnType, enabledCol, targetColumn, 1 );
03932 test = new REAL[nFeat*nTest];
03933 testLabel = new int[nTest];
03934 getDataBounds ( dataFiles, " ", nFeat, nClass, nTest, columnType, enabledCol, targetColumn, 1, true, test, testLabel );
03935
03936
03937 makeNumericTrainAndTestTargets ( nClass, nTrain, nTest, positiveTarget, negativeTarget, trainLabel, testLabel, trainTarget, testTarget );
03938
03939 }
03940
03947 void DatasetReader::readMONKS3 ( string path, REAL* &train, REAL* &trainTarget, int* &trainLabel, REAL* &test, REAL* &testTarget, int* &testLabel, uint& nTrain, uint& nTest, int& nClass, int& nDomain, int& nFeat, REAL positiveTarget, REAL negativeTarget )
03948 {
03949 cout<<"Read MONKS3 from: "<<path<<endl;
03950 nDomain = 1;
03951
03952
03953 int targetColumn = 1;
03954 char columnType[] = "dnnnnnnd";
03955 char enabledCol[] = "11111110";
03956 const char* dataFiles[] = { ( new string ( path+"/monks-3.train" ) )->c_str(), ( new string ( path+"/monks-3.test" ) )->c_str(),0};
03957
03958
03959 getDataBounds ( dataFiles, " ", nFeat, nClass, nTrain, columnType, enabledCol, targetColumn, 0 );
03960 train = new REAL[nFeat*nTrain];
03961 trainLabel = new int[nTrain];
03962 getDataBounds ( dataFiles, " ", nFeat, nClass, nTrain, columnType, enabledCol, targetColumn, 0, true, train, trainLabel );
03963
03964
03965 getDataBounds ( dataFiles, " ", nFeat, nClass, nTest, columnType, enabledCol, targetColumn, 1 );
03966 test = new REAL[nFeat*nTest];
03967 testLabel = new int[nTest];
03968 getDataBounds ( dataFiles, " ", nFeat, nClass, nTest, columnType, enabledCol, targetColumn, 1, true, test, testLabel );
03969
03970
03971 makeNumericTrainAndTestTargets ( nClass, nTrain, nTest, positiveTarget, negativeTarget, trainLabel, testLabel, trainTarget, testTarget );
03972
03973 }
03974
03981 void DatasetReader::readMUSHROOM ( string path, REAL* &train, REAL* &trainTarget, int* &trainLabel, REAL* &test, REAL* &testTarget, int* &testLabel, uint& nTrain, uint& nTest, int& nClass, int& nDomain, int& nFeat, REAL positiveTarget, REAL negativeTarget )
03982 {
03983 cout<<"Read MUSHROOM from: "<<path<<endl;
03984 nDomain = 1;
03985
03986
03987 int targetColumn = 1;
03988 uint nTrainTmp;
03989 char columnType[] = "ddddddddddddddddddddddd";
03990 char enabledCol[] = "11111111111111111111111";
03991 const char* dataFiles[] = { ( new string ( path+"/agaricus-lepiota.data" ) )->c_str(),0};
03992
03993
03994 getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0 );
03995
03996
03997 REAL* trainTmp = new REAL[nTrainTmp * nFeat];
03998 int* trainLabelTmp = new int[nTrainTmp];
03999
04000
04001 getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0, true, trainTmp, trainLabelTmp );
04002
04003
04004 splitRandomTestset ( 0.2, trainTmp, trainLabelTmp, nTrainTmp, nFeat, nClass, train, trainLabel, trainTarget, test, testLabel, testTarget, nTrain, nTest, positiveTarget, negativeTarget );
04005
04006 delete[] trainTmp;
04007 delete[] trainLabelTmp;
04008
04009 }
04010
04018 void DatasetReader::readSATIMAGE ( string path, REAL* &train, REAL* &trainTarget, int* &trainLabel, REAL* &test, REAL* &testTarget, int* &testLabel, uint& nTrain, uint& nTest, int& nClass, int& nDomain, int& nFeat, REAL positiveTarget, REAL negativeTarget )
04019 {
04020 cout<<"Read SATIMAGE from: "<<path<<endl;
04021 nDomain = 1;
04022
04023
04024 int targetColumn = 37;
04025 uint nTrainTmp;
04026 char columnType[] = "nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnd";
04027 char enabledCol[] = "1111111111111111111111111111111111111";
04028 const char* dataFiles[] = { ( new string ( path+"/sat.trn" ) )->c_str(), ( new string ( path+"/sat.tst" ) )->c_str(),0};
04029
04030
04031 getDataBounds ( dataFiles, " ", nFeat, nClass, nTrain, columnType, enabledCol, targetColumn, 0 );
04032 train = new REAL[nFeat*nTrain];
04033 trainLabel = new int[nTrain];
04034 getDataBounds ( dataFiles, " ", nFeat, nClass, nTrain, columnType, enabledCol, targetColumn, 0, true, train, trainLabel );
04035
04036
04037 getDataBounds ( dataFiles, " ", nFeat, nClass, nTest, columnType, enabledCol, targetColumn, 1 );
04038 test = new REAL[nFeat*nTest];
04039 testLabel = new int[nTest];
04040 getDataBounds ( dataFiles, " ", nFeat, nClass, nTest, columnType, enabledCol, targetColumn, 1, true, test, testLabel );
04041
04042
04043 makeNumericTrainAndTestTargets ( nClass, nTrain, nTest, positiveTarget, negativeTarget, trainLabel, testLabel, trainTarget, testTarget );
04044
04045 }
04046
04054 void DatasetReader::readSEGMENTATION ( string path, REAL* &train, REAL* &trainTarget, int* &trainLabel, REAL* &test, REAL* &testTarget, int* &testLabel, uint& nTrain, uint& nTest, int& nClass, int& nDomain, int& nFeat, REAL positiveTarget, REAL negativeTarget )
04055 {
04056 cout<<"Read SEGMENTATION from: "<<path<<endl;
04057 nDomain = 1;
04058
04059
04060 int targetColumn = 1;
04061 uint nTrainTmp;
04062 char columnType[] = "dnnnnnnnnnnnnnnnnnnn";
04063 char enabledCol[] = "11111111111111111111";
04064 const char* dataFiles[] = { ( new string ( path+"/segmentation.data" ) )->c_str(), ( new string ( path+"/segmentation.test" ) )->c_str(),0};
04065
04066
04067 getDataBounds ( dataFiles, ",", nFeat, nClass, nTrain, columnType, enabledCol, targetColumn, 0 );
04068 train = new REAL[nFeat*nTrain];
04069 trainLabel = new int[nTrain];
04070 getDataBounds ( dataFiles, ",", nFeat, nClass, nTrain, columnType, enabledCol, targetColumn, 0, true, train, trainLabel );
04071
04072
04073 getDataBounds ( dataFiles, ",", nFeat, nClass, nTest, columnType, enabledCol, targetColumn, 1 );
04074 test = new REAL[nFeat*nTest];
04075 testLabel = new int[nTest];
04076 getDataBounds ( dataFiles, ",", nFeat, nClass, nTest, columnType, enabledCol, targetColumn, 1, true, test, testLabel );
04077
04078
04079 makeNumericTrainAndTestTargets ( nClass, nTrain, nTest, positiveTarget, negativeTarget, trainLabel, testLabel, trainTarget, testTarget );
04080
04081 }
04082
04089 void DatasetReader::readSONAR ( string path, REAL* &train, REAL* &trainTarget, int* &trainLabel, REAL* &test, REAL* &testTarget, int* &testLabel, uint& nTrain, uint& nTest, int& nClass, int& nDomain, int& nFeat, REAL positiveTarget, REAL negativeTarget )
04090 {
04091 cout<<"Read SONAR from: "<<path<<endl;
04092 nDomain = 1;
04093
04094
04095 int targetColumn = 61;
04096 uint nTrainTmp;
04097 char columnType[] = "nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnd";
04098 char enabledCol[] = "1111111111111111111111111111111111111111111111111111111111111";
04099 const char* dataFiles[] = { ( new string ( path+"/sonar.all-data" ) )->c_str(),0};
04100
04101
04102 getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0 );
04103
04104
04105 REAL* trainTmp = new REAL[nTrainTmp * nFeat];
04106 int* trainLabelTmp = new int[nTrainTmp];
04107
04108
04109 getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0, true, trainTmp, trainLabelTmp );
04110
04111
04112 splitRandomTestset ( 0.2, trainTmp, trainLabelTmp, nTrainTmp, nFeat, nClass, train, trainLabel, trainTarget, test, testLabel, testTarget, nTrain, nTest, positiveTarget, negativeTarget );
04113
04114 delete[] trainTmp;
04115 delete[] trainLabelTmp;
04116
04117 }
04118
04119
04126 void DatasetReader::readVEHICLE ( string path, REAL* &train, REAL* &trainTarget, int* &trainLabel, REAL* &test, REAL* &testTarget, int* &testLabel, uint& nTrain, uint& nTest, int& nClass, int& nDomain, int& nFeat, REAL positiveTarget, REAL negativeTarget )
04127 {
04128 cout<<"Read VEHICLE from: "<<path<<endl;
04129 nDomain = 1;
04130
04131
04132 int targetColumn = 19;
04133 uint nTrainTmp;
04134 char columnType[] = "nnnnnnnnnnnnnnnnnnd";
04135 char enabledCol[] = "1111111111111111111";
04136 const char* dataFiles[] = { ( new string ( path+"/train.data" ) )->c_str(),0};
04137
04138
04139 getDataBounds ( dataFiles, " ", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0 );
04140
04141
04142 REAL* trainTmp = new REAL[nTrainTmp * nFeat];
04143 int* trainLabelTmp = new int[nTrainTmp];
04144
04145
04146 getDataBounds ( dataFiles, " ", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0, true, trainTmp, trainLabelTmp );
04147
04148
04149 splitRandomTestset ( 0.2, trainTmp, trainLabelTmp, nTrainTmp, nFeat, nClass, train, trainLabel, trainTarget, test, testLabel, testTarget, nTrain, nTest, positiveTarget, negativeTarget );
04150
04151 delete[] trainTmp;
04152 delete[] trainLabelTmp;
04153
04154 }
04155
04162 void DatasetReader::readVOTES ( string path, REAL* &train, REAL* &trainTarget, int* &trainLabel, REAL* &test, REAL* &testTarget, int* &testLabel, uint& nTrain, uint& nTest, int& nClass, int& nDomain, int& nFeat, REAL positiveTarget, REAL negativeTarget )
04163 {
04164 cout<<"Read VOTES from: "<<path<<endl;
04165 nDomain = 1;
04166
04167
04168 int targetColumn = 1;
04169 uint nTrainTmp;
04170 char columnType[] = "ddddddddddddddddd";
04171 char enabledCol[] = "11111111111111111";
04172 const char* dataFiles[] = { ( new string ( path+"/house-votes-84.data" ) )->c_str(),0};
04173
04174
04175 getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0 );
04176
04177
04178 REAL* trainTmp = new REAL[nTrainTmp * nFeat];
04179 int* trainLabelTmp = new int[nTrainTmp];
04180
04181
04182 getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0, true, trainTmp, trainLabelTmp );
04183
04184
04185 splitRandomTestset ( 0.2, trainTmp, trainLabelTmp, nTrainTmp, nFeat, nClass, train, trainLabel, trainTarget, test, testLabel, testTarget, nTrain, nTest, positiveTarget, negativeTarget );
04186
04187 delete[] trainTmp;
04188 delete[] trainLabelTmp;
04189
04190 }
04191
04198 void DatasetReader::readWINE ( string path, REAL* &train, REAL* &trainTarget, int* &trainLabel, REAL* &test, REAL* &testTarget, int* &testLabel, uint& nTrain, uint& nTest, int& nClass, int& nDomain, int& nFeat, REAL positiveTarget, REAL negativeTarget )
04199 {
04200 cout<<"Read WINE from: "<<path<<endl;
04201 nDomain = 1;
04202
04203
04204 int targetColumn = 1;
04205 uint nTrainTmp;
04206 char columnType[] = "dnnnnnnnnnnnnn";
04207 char enabledCol[] = "11111111111111";
04208 const char* dataFiles[] = { ( new string ( path+"/wine.data" ) )->c_str(),0};
04209
04210
04211 getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0 );
04212
04213
04214 REAL* trainTmp = new REAL[nTrainTmp * nFeat];
04215 int* trainLabelTmp = new int[nTrainTmp];
04216
04217
04218 getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0, true, trainTmp, trainLabelTmp );
04219
04220
04221 splitRandomTestset ( 0.2, trainTmp, trainLabelTmp, nTrainTmp, nFeat, nClass, train, trainLabel, trainTarget, test, testLabel, testTarget, nTrain, nTest, positiveTarget, negativeTarget );
04222
04223 delete[] trainTmp;
04224 delete[] trainLabelTmp;
04225
04226 }
04227
04235 void DatasetReader::readPOKER ( string path, REAL* &train, REAL* &trainTarget, int* &trainLabel, REAL* &test, REAL* &testTarget, int* &testLabel, uint& nTrain, uint& nTest, int& nClass, int& nDomain, int& nFeat, REAL positiveTarget, REAL negativeTarget )
04236 {
04237 cout<<"Read POKER from: "<<path<<endl;
04238 nDomain = 1;
04239
04240
04241 int targetColumn = 11;
04242 char columnType[] = "ddddddddddd";
04243 char enabledCol[] = "11111111111";
04244 const char* dataFiles[] = { ( new string ( path+"/poker-hand-training-true.data" ) )->c_str(), ( new string ( path+"/poker-hand-testing.data" ) )->c_str(),0};
04245
04246
04247 getDataBounds ( dataFiles, ",", nFeat, nClass, nTrain, columnType, enabledCol, targetColumn, 0 );
04248 train = new REAL[nFeat*nTrain];
04249 trainLabel = new int[nTrain];
04250 getDataBounds ( dataFiles, ",", nFeat, nClass, nTrain, columnType, enabledCol, targetColumn, 0, true, train, trainLabel );
04251
04252
04253 getDataBounds ( dataFiles, ",", nFeat, nClass, nTest, columnType, enabledCol, targetColumn, 1 );
04254 test = new REAL[nFeat*nTest];
04255 testLabel = new int[nTest];
04256 getDataBounds ( dataFiles, ",", nFeat, nClass, nTest, columnType, enabledCol, targetColumn, 1, true, test, testLabel );
04257
04258
04259 makeNumericTrainAndTestTargets ( nClass, nTrain, nTest, positiveTarget, negativeTarget, trainLabel, testLabel, trainTarget, testTarget );
04260
04261 }
04262
04269 void DatasetReader::readYEAST ( string path, REAL* &train, REAL* &trainTarget, int* &trainLabel, REAL* &test, REAL* &testTarget, int* &testLabel, uint& nTrain, uint& nTest, int& nClass, int& nDomain, int& nFeat, REAL positiveTarget, REAL negativeTarget )
04270 {
04271 cout<<"Read YEAST from: "<<path<<endl;
04272 nDomain = 1;
04273
04274
04275 int targetColumn = 10;
04276 uint nTrainTmp;
04277 char columnType[] = "dnnnnnnnnd";
04278 char enabledCol[] = "0111111111";
04279 const char* dataFiles[] = { ( new string ( path+"/yeast.data" ) )->c_str(),0};
04280
04281
04282 getDataBounds ( dataFiles, " ", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0 );
04283
04284
04285 REAL* trainTmp = new REAL[nTrainTmp * nFeat];
04286 int* trainLabelTmp = new int[nTrainTmp];
04287
04288
04289 getDataBounds ( dataFiles, " ", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0, true, trainTmp, trainLabelTmp );
04290
04291
04292 splitRandomTestset ( 0.2, trainTmp, trainLabelTmp, nTrainTmp, nFeat, nClass, train, trainLabel, trainTarget, test, testLabel, testTarget, nTrain, nTest, positiveTarget, negativeTarget );
04293
04294 delete[] trainTmp;
04295 delete[] trainLabelTmp;
04296
04297 }
04298
04305 void DatasetReader::readSURVIVAL ( string path, REAL* &train, REAL* &trainTarget, int* &trainLabel, REAL* &test, REAL* &testTarget, int* &testLabel, uint& nTrain, uint& nTest, int& nClass, int& nDomain, int& nFeat, REAL positiveTarget, REAL negativeTarget )
04306 {
04307 cout<<"Read SURVIVAL from: "<<path<<endl;
04308 nDomain = 1;
04309
04310
04311 int targetColumn = 4;
04312 uint nTrainTmp;
04313 char columnType[] = "nnnd";
04314 char enabledCol[] = "1111";
04315 const char* dataFiles[] = { ( new string ( path+"/haberman.data" ) )->c_str(),0};
04316
04317
04318 getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0 );
04319
04320
04321 REAL* trainTmp = new REAL[nTrainTmp * nFeat];
04322 int* trainLabelTmp = new int[nTrainTmp];
04323
04324
04325 getDataBounds ( dataFiles, ",", nFeat, nClass, nTrainTmp, columnType, enabledCol, targetColumn, 0, true, trainTmp, trainLabelTmp );
04326
04327
04328 splitRandomTestset ( 0.2, trainTmp, trainLabelTmp, nTrainTmp, nFeat, nClass, train, trainLabel, trainTarget, test, testLabel, testTarget, nTrain, nTest, positiveTarget, negativeTarget );
04329
04330 delete[] trainTmp;
04331 delete[] trainLabelTmp;
04332
04333 }
04334
04341 void DatasetReader::readSPIDER ( string path, REAL* &train, REAL* &trainTarget, int* &trainLabel, REAL* &test, REAL* &testTarget, int* &testLabel, uint& nTrain, uint& nTest, int& nClass, int& nDomain, int& nFeat, REAL positiveTarget, REAL negativeTarget )
04342 {
04343 cout<<"Read SPIDER from: "<<path<<endl;
04344 nDomain = 1;
04345 nFeat = 3;
04346 nClass = 2;
04347
04348
04349 int bufLen = 1024 * 1024;
04350 char *buf = new char[bufLen];
04351
04352
04353 nTrain = 0;
04354 fstream f ( ( path+"/train.data" ).c_str(), ios::in );
04355 while ( f.getline ( buf,bufLen ) )
04356 nTrain++;
04357 f.close();
04358 train = new REAL[3*nTrain];
04359 trainTarget = new REAL[2*nTrain];
04360 trainLabel = new int[nTrain];
04361
04362 f.open ( ( path+"/train.data" ).c_str(), ios::in );
04363 nTrain = 0;
04364 while ( f.getline ( buf,bufLen ) )
04365 {
04366 sscanf ( buf,"%f %f %d",&train[3*nTrain],&train[3*nTrain+1],&trainLabel[nTrain] );
04367 train[3*nTrain+2] = 1.0;
04368 if ( trainLabel[nTrain] > 0 )
04369 {
04370 trainTarget[2*nTrain] = positiveTarget;
04371 trainTarget[2*nTrain+1] = negativeTarget;
04372 trainLabel[nTrain] = 0;
04373 }
04374 else
04375 {
04376 trainTarget[2*nTrain] = negativeTarget;
04377 trainTarget[2*nTrain+1] = positiveTarget;
04378 trainLabel[nTrain] = 1;
04379 }
04380 nTrain++;
04381 }
04382 f.close();
04383
04384
04385 nTest = 0;
04386 f.open ( ( path+"/test.data" ).c_str(), ios::in );
04387 while ( f.getline ( buf,bufLen ) )
04388 nTest++;
04389 f.close();
04390 test = new REAL[3*nTest];
04391 testTarget = new REAL[2*nTest];
04392 testLabel = new int[nTest];
04393
04394 f.open ( ( path+"/test.data" ).c_str(), ios::in );
04395 nTest = 0;
04396 while ( f.getline ( buf,bufLen ) )
04397 {
04398 sscanf ( buf,"%f %f %d",&test[3*nTest],&test[3*nTest+1],&testLabel[nTest] );
04399 test[3*nTest+2] = 1.0;
04400 if ( testLabel[nTrain] > 0 )
04401 {
04402 testTarget[2*nTest] = positiveTarget;
04403 testTarget[2*nTest+1] = negativeTarget;
04404 testLabel[nTest] = 0;
04405 }
04406 else
04407 {
04408 testTarget[2*nTest] = negativeTarget;
04409 testTarget[2*nTest+1] = positiveTarget;
04410 testLabel[nTest] = 1;
04411 }
04412 nTest++;
04413 }
04414 f.close();
04415
04416 delete[] buf;
04417
04418
04419
04420
04421
04422
04423
04424
04425
04426
04427
04428
04429
04430
04431
04432
04433
04434
04435
04436
04437
04438
04439
04440
04441
04442
04443
04444
04445
04446
04447
04448
04449
04450
04451
04452
04453
04454
04455
04456
04457
04458
04459
04460
04461
04462
04463
04464
04465
04466
04467
04468
04469
04470
04471
04472
04473
04474
04475
04476
04477
04478
04479
04480
04481
04482
04483
04484 }
04485
04486
04510 void DatasetReader::getDataBounds ( const char** filenames, string delimiter, int& nFeat, int& nClass, uint& nLines, char* columnType, char* enabledCol, int targetColumn, int filenameID, bool fillData, REAL* data, int* labels, bool addConstantOne, bool skipFirstLine )
04511 {
04512 int bufSize = 1024*1024;
04513 int nFiles = 0;
04514 while ( filenames[nFiles] )
04515 nFiles++;
04516 cout<<"nFiles:"<<nFiles<<endl;
04517
04518 fstream f;
04519
04520 for ( int i=0;i<nFiles;i++ )
04521 {
04522 f.open ( filenames[i], ios::in );
04523 if ( f.is_open() == false )
04524 {
04525 cout<<"Can not open "<<filenames[i]<<endl;
04526 exit ( 0 );
04527 }
04528 f.close();
04529 }
04530
04531 int columnTypeSize = 0;
04532 while ( columnType[columnTypeSize] )
04533 columnTypeSize++;
04534 cout<<"columnTypeSize:"<<columnTypeSize<<endl;
04535 char buf0[bufSize], buf1[bufSize];
04536 int delimiterLength = delimiter.length();
04537 const char* delimiterCharPtr = delimiter.c_str();
04538 vector<string>* discreteValues = new vector<string>[columnTypeSize];
04539 double* numericMean = new double[columnTypeSize];
04540 int* numericMeanCnt = new int[columnTypeSize];
04541 for ( int i=0;i<columnTypeSize;i++ )
04542 {
04543 numericMean[i] = 0.0;
04544 numericMeanCnt[i] = 0;
04545 }
04546 for ( int fileCnt=0;fileCnt<nFiles;fileCnt++ )
04547 {
04548 f.open ( filenames[fileCnt], ios::in );
04549 if ( fileCnt == filenameID )
04550 nLines = 0;
04551
04552 if ( skipFirstLine )
04553 f.getline ( buf0, bufSize );
04554
04555 while ( f.getline ( buf0, bufSize ) )
04556 {
04557 int cnt0 = 0, cnt1 = 0, cellCnt = 0;
04558 while ( buf0[cnt1] != 0 && cnt1 < bufSize )
04559 {
04560 int matchCnt = 0;
04561 for ( int i=0;i<delimiterLength;i++ )
04562 matchCnt += delimiterCharPtr[i] == buf0[cnt1+i];
04563
04564 if ( buf0[cnt1+delimiterLength]!=' ' && cnt1 > 0 && ( matchCnt == delimiterLength || buf0[cnt1+1] == 0 || buf0[cnt1+1] == '\r' ) )
04565 {
04566 if ( cellCnt >= columnTypeSize )
04567 break;
04568
04569 int addOne = 0;
04570 if ( buf0[cnt1+1] == 0 || buf0[cnt1+1] == '\r' )
04571 addOne = 1;
04572 strncpy ( buf1, buf0 + cnt0, cnt1 - cnt0 + addOne );
04573 buf1[cnt1 - cnt0 + addOne] = 0;
04574 cnt0 = cnt1 + delimiterLength;
04575 if ( cnt1 < cnt0 - 1 )
04576 cnt1 = cnt0 - 1;
04577 if ( enabledCol[cellCnt] == '1' )
04578 {
04579 if ( columnType[cellCnt] == 'd' )
04580 {
04581
04582 bool exists = false;
04583 for ( int i=0;i<discreteValues[cellCnt].size();i++ )
04584 if ( discreteValues[cellCnt][i] == string ( buf1 ) )
04585 exists = true;
04586 if ( exists == false )
04587 discreteValues[cellCnt].push_back ( string ( buf1 ) );
04588 }
04589 else if ( columnType[cellCnt] == 'n' )
04590 {
04591 if ( ( buf1[0] >= '0' && buf1[0] <= '9' ) || buf1[0] == '.' || buf1[0] == '-' )
04592 {
04593 float num;
04594 sscanf ( buf1,"%f",&num );
04595 if ( fileCnt == filenameID )
04596 {
04597 numericMean[cellCnt] += num;
04598 numericMeanCnt[cellCnt]++;
04599 }
04600 }
04601 else
04602 {
04603 ;
04604 }
04605 }
04606 else
04607 assert ( false );
04608 }
04609
04610 cellCnt++;
04611 if ( buf0[cnt1+1] == 0 )
04612 break;
04613 }
04614 else if ( cnt1 == 0 && ( matchCnt == delimiterLength || buf0[cnt1+1] == 0 ) )
04615 cnt0++;
04616 cnt1++;
04617 }
04618
04619
04620
04621 if ( cnt1 > 1 )
04622 {
04623 if ( cellCnt != columnTypeSize && cellCnt > 1 )
04624 {
04625 cout<<"cellCnt:"<<cellCnt<<" columnTypeSize:"<<columnTypeSize<<endl;
04626 assert ( false );
04627 }
04628 if ( fileCnt == filenameID )
04629 nLines++;
04630 }
04631 memset ( buf0, 0, bufSize );
04632 }
04633 f.close();
04634
04635 }
04636
04637
04638 nFeat = 0;
04639 cout<<"ValuesPerDiscreteInput:"<<endl;
04640 for ( int i=0;i<columnTypeSize;i++ )
04641 {
04642 if ( i+1 != targetColumn )
04643 {
04644 if ( enabledCol[i] == '1' )
04645 {
04646 if ( columnType[i] == 'd' )
04647 {
04648 cout<<i<<": #"<< ( int ) discreteValues[i].size() <<" {";
04649 for ( int j=0;j<discreteValues[i].size();j++ )
04650 cout<<discreteValues[i][j]<<",";
04651 cout<<"}"<<endl;
04652 nFeat += discreteValues[i].size();
04653 }
04654 else if ( columnType[i] == 'n' )
04655 nFeat++;
04656 else
04657 assert ( false );
04658 }
04659 }
04660 }
04661 if ( addConstantOne )
04662 nFeat++;
04663 cout<<endl;
04664
04665 nClass = discreteValues[targetColumn-1].size();
04666 cout<<"#Targets:"<< ( int ) nClass<<" {";
04667 for ( int j=0;j<nClass;j++ )
04668 {
04669 string value = discreteValues[targetColumn-1][j];
04670 cout<<value<<","<<flush;
04671 }
04672 cout<<"}"<<endl;
04673
04674 cout<<endl;
04675 cout<<"nFeat:"<<nFeat<<endl;
04676 cout<<"nLines:"<<nLines<<endl;
04677
04678 if ( fillData )
04679 {
04680
04681 for ( int i=0;i<nLines*nFeat;i++ )
04682 data[i] = 0.0;
04683 if ( addConstantOne )
04684 {
04685 for ( int i=0;i<nLines;i++ )
04686 data[i*nFeat + nFeat-1] = 1.0;
04687 }
04688 for ( int i=0;i<nLines;i++ )
04689 labels[i] = 0;
04690
04691 f.open ( filenames[filenameID], ios::in );
04692 nLines = 0;
04693
04694 if ( skipFirstLine )
04695 f.getline ( buf0, bufSize );
04696
04697 while ( f.getline ( buf0, bufSize ) )
04698 {
04699 int cnt0 = 0, cnt1 = 0, cellCnt = 0, pos = 0;
04700 while ( buf0[cnt1] != 0 && cnt1 < bufSize )
04701 {
04702 int matchCnt = 0;
04703 for ( int i=0;i<delimiterLength;i++ )
04704 matchCnt += delimiterCharPtr[i] == buf0[cnt1+i];
04705
04706 if ( buf0[cnt1+delimiterLength]!=' ' && cnt1 > 0 && ( matchCnt == delimiterLength || buf0[cnt1+1] == 0 || buf0[cnt1+1] == '\r' ) )
04707 {
04708 if ( cellCnt >= columnTypeSize )
04709 break;
04710
04711 int addOne = 0;
04712 if ( buf0[cnt1+1] == 0 || buf0[cnt1+1] == '\r' )
04713 addOne = 1;
04714 strncpy ( buf1, buf0 + cnt0, cnt1 - cnt0 + addOne );
04715 buf1[cnt1 - cnt0 + addOne] = 0;
04716 cnt0 = cnt1 + delimiterLength;
04717 if ( cnt1 < cnt0 - 1 )
04718 cnt1 = cnt0 - 1;
04719 if ( enabledCol[cellCnt] == '1' )
04720 {
04721 if ( columnType[cellCnt] == 'd' )
04722 {
04723
04724 int searchPos = -1;
04725 for ( int i=0;i<discreteValues[cellCnt].size();i++ )
04726 if ( discreteValues[cellCnt][i] == string ( buf1 ) )
04727 searchPos = i;
04728
04729 if ( searchPos == -1 )
04730 assert ( false );
04731
04732
04733 if ( cellCnt+1 == targetColumn )
04734 {
04735 labels[nLines] = searchPos;
04736 }
04737 else
04738 {
04739 data[nLines*nFeat + pos + searchPos] = 1.0;
04740 pos += discreteValues[cellCnt].size();
04741 }
04742 }
04743 else if ( columnType[cellCnt] == 'n' )
04744 {
04745 if ( ( buf1[0] >= '0' && buf1[0] <= '9' ) || buf1[0] == '.' || buf1[0] == '-' )
04746 {
04747 float num;
04748 sscanf ( buf1,"%f",&num );
04749 data[nLines*nFeat + pos] = num;
04750 }
04751 else
04752 {
04753 data[nLines*nFeat + pos] = 0.0;
04754 if ( numericMeanCnt[cellCnt] > 0 )
04755 data[nLines*nFeat + pos] = numericMean[cellCnt] / numericMeanCnt[cellCnt];
04756 }
04757 pos++;
04758 }
04759 else
04760 assert ( false );
04761 }
04762 cellCnt++;
04763 }
04764 else if ( cnt1 == 0 && ( matchCnt == delimiterLength || buf0[cnt1+1] == 0 ) )
04765 cnt0++;
04766 cnt1++;
04767 }
04768
04769
04770 if ( cnt1 > 1 )
04771 {
04772 if ( cellCnt != columnTypeSize && cellCnt > 1 )
04773 {
04774 cout<<"cellCnt:"<<cellCnt<<" columnTypeSize:"<<columnTypeSize<<endl;
04775 assert ( false );
04776 }
04777 nLines++;
04778
04779 if ( pos != nFeat - ( int ) addConstantOne )
04780 {
04781 cout<<"pos:"<<pos<<" nFeat:"<<nFeat<<endl;
04782 assert ( false );
04783 }
04784 }
04785 memset ( buf0, 0, bufSize );
04786 }
04787 f.close();
04788
04789
04790 for ( int i=0;i<nLines*nFeat;i++ )
04791 if ( isnan ( data[i] ) || isinf ( data[i] ) || data[i]>1e10 || data[i]<-1e10 )
04792 {
04793 cout<<"data["<<i<<"]:"<<data[i]<<endl;
04794 assert ( false );
04795 }
04796 for ( int i=0;i<nLines;i++ )
04797 if ( isnan ( labels[i] ) || isinf ( labels[i] ) || labels[i]<0 )
04798 {
04799 cout<<"labels["<<i<<"]:"<<labels[i]<<endl;
04800 assert ( false );
04801 }
04802
04803 }
04804
04805 }
04806
04811 void DatasetReader::splitRandomTestset ( REAL percentTest, REAL* data, int* labels, int nData, int nFeat, int nClass, REAL* &train, int* &trainLabel, REAL* &trainTarget, REAL* &test, int* &testLabel, REAL* &testTarget, uint& nTrain, uint& nTest, REAL positiveTarget, REAL negativeTarget, bool noRandom )
04812 {
04813
04814 if ( noRandom )
04815 cout<<"take the last percentTest:"<<100.0*percentTest<<"[%]"<<endl;
04816 else
04817 cout<<"random percentTest:"<<100.0*percentTest<<"[%]"<<endl;
04818
04819
04820 nTrain = 0;
04821 nTest = 0;
04822 srand ( getRandomSeed() );
04823 for ( int i=0;i<nData;i++ )
04824 {
04825 REAL r = ( double ) rand() / ( double ) RAND_MAX;
04826 if ( noRandom )
04827 r = ( double ) i/ ( double ) nData< ( 1.0 - percentTest ) ?1.0:0.0;
04828 if ( r < percentTest )
04829 nTest++;
04830 else
04831 nTrain++;
04832 }
04833 cout<<"nTrain:"<<nTrain<<endl;
04834 cout<<"nTest:"<<nTest<<endl;
04835
04836
04837 train = new REAL[nTrain * nFeat];
04838 trainLabel = new int[nTrain];
04839 test = new REAL[nTest * nFeat];
04840 testLabel = new int[nTest];
04841
04842
04843 nTrain = 0;
04844 nTest = 0;
04845 srand ( getRandomSeed() );
04846 for ( int i=0;i<nData;i++ )
04847 {
04848 REAL r = ( double ) rand() / ( double ) RAND_MAX;
04849 if ( noRandom )
04850 r = ( double ) i/ ( double ) nData< ( 1.0 - percentTest ) ?1.0:0.0;
04851 if ( r < percentTest )
04852 {
04853 for ( int j=0;j<nFeat;j++ )
04854 test[nTest*nFeat + j] = data[i*nFeat + j];
04855 testLabel[nTest] = labels[i];
04856 nTest++;
04857 }
04858 else
04859 {
04860 for ( int j=0;j<nFeat;j++ )
04861 train[nTrain*nFeat + j] = data[i*nFeat + j];
04862 trainLabel[nTrain] = labels[i];
04863 nTrain++;
04864 }
04865 }
04866
04867
04868 for ( int i=0;i<nTrain*nFeat;i++ )
04869 if ( isnan ( train[i] ) || isinf ( train[i] ) || train[i]>1e10 || train[i]<-1e10 )
04870 {
04871 cout<<"train["<<i<<"]:"<<train[i]<<endl;
04872 assert ( false );
04873 }
04874
04875 for ( int i=0;i<nTest*nFeat;i++ )
04876 if ( isnan ( test[i] ) || isinf ( test[i] ) || test[i]>1e10 || test[i]<-1e10 )
04877 {
04878 cout<<"test["<<i<<"]:"<<test[i]<<endl;
04879 assert ( false );
04880 }
04881
04882 makeNumericTrainAndTestTargets ( nClass, nTrain, nTest, positiveTarget, negativeTarget, trainLabel, testLabel, trainTarget, testTarget );
04883 }
04884
04889 void DatasetReader::makeNumericTrainAndTestTargets ( int nClass, int nTrain, int nTest, REAL positiveTarget, REAL negativeTarget, int* trainLabel, int* testLabel, REAL* &trainTarget, REAL* &testTarget )
04890 {
04891
04892 trainTarget = new REAL[nClass*nTrain];
04893 for ( int i=0;i<nTrain;i++ )
04894 {
04895 for ( int j=0;j<nClass;j++ )
04896 trainTarget[i*nClass + j] = negativeTarget;
04897 trainTarget[i*nClass + trainLabel[i]] = positiveTarget;
04898 }
04899
04900
04901 testTarget = new REAL[nClass*nTest];
04902 for ( int i=0;i<nTest;i++ )
04903 {
04904 for ( int j=0;j<nClass;j++ )
04905 testTarget[i*nClass + j] = negativeTarget;
04906 testTarget[i*nClass + testLabel[i]] = positiveTarget;
04907 }
04908
04909
04910 for ( int i=0;i<nTrain*nClass;i++ )
04911 if ( isnan ( trainTarget[i] ) || isinf ( trainTarget[i] ) || trainTarget[i]>1e10 || trainTarget[i]<-1e10 )
04912 {
04913 cout<<"trainTarget["<<i<<"]:"<<trainTarget[i]<<endl;
04914 assert ( false );
04915 }
04916
04917 for ( int i=0;i<nTest*nClass;i++ )
04918 if ( isnan ( testTarget[i] ) || isinf ( testTarget[i] ) || testTarget[i]>1e10 || testTarget[i]<-1e10 )
04919 {
04920 cout<<"testTarget["<<i<<"]:"<<testTarget[i]<<endl;
04921 assert ( false );
04922 }
04923
04924 }
04925