// modules/ml/src/gbt.cpp -- Gradient Boosted Trees (CvGBTrees) implementation

#include "precomp.hpp"
#include <string>
#include <time.h>

using namespace std;

#define pCvSeq CvSeq*
#define pCvDTreeNode CvDTreeNode*

#define CV_CMP_FLOAT(a,b) ((a) < (b))
static CV_IMPLEMENT_QSORT_EX( icvSortFloat, float, CV_CMP_FLOAT, float)
#define CV_CMP_INT(a,b) ((a) < (b))
static CV_IMPLEMENT_QSORT_EX( icvSortInt, int, CV_CMP_INT, int)

//===========================================================================
static string ToString(int i)
{
    stringstream tmp;
    tmp << i;

    return tmp.str();
}


//===========================================================================
//----------------------------- CvGBTreesParams -----------------------------
//===========================================================================

CvGBTreesParams::CvGBTreesParams()
            : CvDTreeParams( 3, 10, 0, false, 10, 0, false, false, 0 )
{
    weak_count = 200;
    loss_function_type = CvGBTrees::SQUARED_LOSS;
    subsample_portion = 0.8f;
    shrinkage = 0.01f;
}

//===========================================================================

CvGBTreesParams::CvGBTreesParams( int _loss_function_type, int _weak_count,
                         float _shrinkage, float _subsample_portion,
                         int _max_depth, bool _use_surrogates )
            : CvDTreeParams( 3, 10, 0, false, 10, 0, false, false, 0 )
{
    loss_function_type = _loss_function_type;
    weak_count = _weak_count;
    shrinkage = _shrinkage;
    subsample_portion = _subsample_portion;
    max_depth = _max_depth;
    use_surrogates = _use_surrogates;
}

//===========================================================================
//------------------------------- CvGBTrees ---------------------------------
//===========================================================================

CvGBTrees::CvGBTrees()
{
    data = 0;
    weak = 0;
    default_model_name = "my_boost_tree";
    orig_response = sum_response = sum_response_tmp = 0;
    subsample_train = subsample_test = 0;
    missing = sample_idx = 0;
    class_labels = 0;
    class_count = 1;
    delta = 0.0f;

    clear();
}

//===========================================================================

int CvGBTrees::get_len(const CvMat* mat) const
{
    return (mat->cols > mat->rows) ? mat->cols : mat->rows;
}

//===========================================================================

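// Frees all per-model state: deletes every weak tree in each per-class
// sequence, releases the shared CvDTreeTrainData, and releases all cached
// response/index matrices. Called from the destructor and before retraining.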
void CvGBTrees::clear()
{
    if( weak )
    {
        CvSeqReader reader;
        CvSlice slice = CV_WHOLE_SEQ;
        CvDTree* tree;

        //data->shared = false;
        for (int i=0; i<class_count; ++i)
        {
            int weak_count = cvSliceLength( slice, weak[i] );
            if ((weak[i]) && (weak_count))
            {
                cvStartReadSeq( weak[i], &reader );
                cvSetSeqReaderPos( &reader, slice.start_index );
                for (int j=0; j<weak_count; ++j)
                {
                    CV_READ_SEQ_ELEM( tree, reader );
                    delete tree;
                    tree = 0;
                }
            }
        }
        for (int i=0; i<class_count; ++i)
            if (weak[i]) cvReleaseMemStorage( &(weak[i]->storage) );
        delete[] weak;
    }
    if (data)
    {
        data->shared = false;
        delete data;
    }
    weak = 0;
    data = 0;
    delta = 0.0f;
    cvReleaseMat( &orig_response );
    cvReleaseMat( &sum_response );
    cvReleaseMat( &sum_response_tmp );
    cvReleaseMat( &subsample_train );
    cvReleaseMat( &subsample_test );
    cvReleaseMat( &sample_idx );
    cvReleaseMat( &missing );
    cvReleaseMat( &class_labels );
}

//===========================================================================

CvGBTrees::~CvGBTrees()
{
    clear();
}

//===========================================================================

CvGBTrees::CvGBTrees( const CvMat* _train_data, int _tflag,
                  const CvMat* _responses, const CvMat* _var_idx,
                  const CvMat* _sample_idx, const CvMat* _var_type,
                  const CvMat* _missing_mask, CvGBTreesParams _params )
{
    weak = 0;
    data = 0;
    default_model_name = "my_boost_tree";
    orig_response = sum_response = sum_response_tmp = 0;
    subsample_train = subsample_test = 0;
    missing = sample_idx = 0;
    class_labels = 0;
    class_count = 1;
    delta = 0.0f;

    train( _train_data, _tflag, _responses, _var_idx, _sample_idx,
           _var_type, _missing_mask, _params );
}

//===========================================================================

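// Maps the chosen loss onto the problem type: DEVIANCE_LOSS means
// classification (returns false); all other losses are treated as regression.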
bool CvGBTrees::problem_type() const
{
    switch (params.loss_function_type)
    {
    case DEVIANCE_LOSS: return false;
    default: return true;
    }
}

//===========================================================================

bool
CvGBTrees::train( CvMLData* _data, CvGBTreesParams _params, bool update )
{
    bool result;
    result = train ( _data->get_values(), CV_ROW_SAMPLE,
            _data->get_responses(), _data->get_var_idx(),
            _data->get_train_sample_idx(), _data->get_var_types(),
            _data->get_missing(), _params, update);
                                         // update is not supported
    return result;
}

//===========================================================================


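// Main training routine. Builds the shared CvDTreeTrainData, normalizes the
// responses and sample indices, sets the initial prediction (base_value), and
// then runs the gradient boosting loop: for each of params.weak_count
// iterations and each class, compute the negative gradient of the loss, fit a
// regression tree to it on a random subsample, and add the shrinkage-scaled
// tree response to the running prediction sum.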
bool
CvGBTrees::train( const CvMat* _train_data, int _tflag,
              const CvMat* _responses, const CvMat* _var_idx,
              const CvMat* _sample_idx, const CvMat* _var_type,
              const CvMat* _missing_mask,
              CvGBTreesParams _params, bool /*_update*/ ) // update is not supported
{
    CvMemStorage* storage = 0;

    params = _params;
    bool is_regression = problem_type();

    clear();
    /*
      n - number of samples
      m - number of variables
    */
    int n = _train_data->rows;
    int m = _train_data->cols;
    if (_tflag != CV_ROW_SAMPLE)
    {
        int tmp;
        CV_SWAP(n,m,tmp);
    }

    CvMat* new_responses = cvCreateMat( n, 1, CV_32F);
    cvZero(new_responses);

    data = new CvDTreeTrainData( _train_data, _tflag, new_responses, _var_idx,
        _sample_idx, _var_type, _missing_mask, _params, true, true );
    if (_missing_mask)
    {
        missing = cvCreateMat(_missing_mask->rows, _missing_mask->cols,
                              _missing_mask->type);
        cvCopy( _missing_mask, missing);
    }

    orig_response = cvCreateMat( 1, n, CV_32F );
    int step = (_responses->cols > _responses->rows) ? 1 : _responses->step / CV_ELEM_SIZE(_responses->type);
    switch (CV_MAT_TYPE(_responses->type))
    {
        case CV_32FC1:
        {
            for (int i=0; i<n; ++i)
                orig_response->data.fl[i] = _responses->data.fl[i*step];
        }; break;
        case CV_32SC1:
        {
            for (int i=0; i<n; ++i)
                orig_response->data.fl[i] = (float) _responses->data.i[i*step];
        }; break;
        default:
            CV_Error(CV_StsUnmatchedFormats, "Response should be a 32fC1 or 32sC1 vector.");
    }

    if (!is_regression)
    {
        class_count = 0;
        unsigned char * mask = new unsigned char[n];
        memset(mask, 0, n);
        // count the number of distinct output classes
        for (int i=0; i<n; ++i)
            if (!mask[i])
            {
                class_count++;
                for (int j=i; j<n; ++j)
                    if (int(orig_response->data.fl[j]) == int(orig_response->data.fl[i]))
                        mask[j] = 1;
            }
        delete[] mask;

        class_labels = cvCreateMat(1, class_count, CV_32S);
        class_labels->data.i[0] = int(orig_response->data.fl[0]);
        int j = 1;
        for (int i=1; i<n; ++i)
        {
            int k = 0;
            while ((k<j) && (int(orig_response->data.fl[i]) - class_labels->data.i[k]))
                k++;
            if (k == j)
            {
                class_labels->data.i[k] = int(orig_response->data.fl[i]);
                j++;
            }
        }
    }

    // inside the GBT learning process only regression decision trees are built
    data->is_classifier = false;

    // preprocessing sample indices
    if (_sample_idx)
    {
        int sample_idx_len = get_len(_sample_idx);

        switch (CV_MAT_TYPE(_sample_idx->type))
        {
            case CV_32SC1:
            {
                sample_idx = cvCreateMat( 1, sample_idx_len, CV_32S );
                for (int i=0; i<sample_idx_len; ++i)
                    sample_idx->data.i[i] = _sample_idx->data.i[i];
                icvSortInt(sample_idx->data.i, sample_idx_len, 0);
            } break;
            case CV_8S:
            case CV_8U:
            {
                int active_samples_count = 0;
                for (int i=0; i<sample_idx_len; ++i)
                    active_samples_count += int( _sample_idx->data.ptr[i] );
                sample_idx = cvCreateMat( 1, active_samples_count, CV_32S );
                active_samples_count = 0;
                for (int i=0; i<sample_idx_len; ++i)
                    if (int( _sample_idx->data.ptr[i] ))
                        sample_idx->data.i[active_samples_count++] = i;

            } break;
            default: CV_Error(CV_StsUnmatchedFormats, "_sample_idx should be a 32sC1, 8sC1 or 8uC1 vector.");
        }
    }
    else
    {
        sample_idx = cvCreateMat( 1, n, CV_32S );
        for (int i=0; i<n; ++i)
            sample_idx->data.i[i] = i;
    }

    sum_response = cvCreateMat(class_count, n, CV_32F);
    sum_response_tmp = cvCreateMat(class_count, n, CV_32F);
    cvZero(sum_response);

    delta = 0.0f;
    /*
      in the case of a regression problem the initial guess (the zero term
      in the sum) is set to the mean of all the training responses, which is
      the best constant model
    */
    if (is_regression) base_value = find_optimal_value(sample_idx);
    /*
      in the case of a classification problem the initial guess (the zero term
      in the sum) is set to zero for all the tree sequences
    */
    else base_value = 0.0f;
    /*
      the current prediction on all training samples is set to base_value
    */
    cvSet( sum_response, cvScalar(base_value) );

    weak = new pCvSeq[class_count];
    for (int i=0; i<class_count; ++i)
    {
        storage = cvCreateMemStorage();
        weak[i] = cvCreateSeq( 0, sizeof(CvSeq), sizeof(CvDTree*), storage );
        storage = 0;
    }

    // subsample params and data
    rng = &cv::theRNG();

    int samples_count = get_len(sample_idx);

    params.subsample_portion = params.subsample_portion <= FLT_EPSILON ||
        1 - params.subsample_portion <= FLT_EPSILON
        ? 1 : params.subsample_portion;
    int train_sample_count = cvFloor(params.subsample_portion * samples_count);
    if (train_sample_count == 0)
        train_sample_count = samples_count;
    int test_sample_count = samples_count - train_sample_count;
    int* idx_data = new int[samples_count];
    subsample_train = cvCreateMatHeader( 1, train_sample_count, CV_32SC1 );
    *subsample_train = cvMat( 1, train_sample_count, CV_32SC1, idx_data );
    if (test_sample_count)
    {
        subsample_test  = cvCreateMatHeader( 1, test_sample_count, CV_32SC1 );
        *subsample_test = cvMat( 1, test_sample_count, CV_32SC1,
                                 idx_data + train_sample_count );
    }

    // training procedure

    for ( int i=0; i < params.weak_count; ++i )
    {
        do_subsample();
        for ( int k=0; k < class_count; ++k )
        {
            find_gradient(k);
            CvDTree* tree = new CvDTree;
            tree->train( data, subsample_train );
            change_values(tree, k);

            if (subsample_test)
            {
                CvMat x;
                CvMat x_miss;
                int* sample_data = sample_idx->data.i;
                int* subsample_data = subsample_test->data.i;
                int s_step = (sample_idx->cols > sample_idx->rows) ? 1
                             : sample_idx->step/CV_ELEM_SIZE(sample_idx->type);
                for (int j=0; j<get_len(subsample_test); ++j)
                {
                    int idx = *(sample_data + subsample_data[j]*s_step);
                    float res = 0.0f;
                    if (_tflag == CV_ROW_SAMPLE)
                        cvGetRow( data->train_data, &x, idx);
                    else
                        cvGetCol( data->train_data, &x, idx);

                    if (missing)
                    {
                        if (_tflag == CV_ROW_SAMPLE)
                            cvGetRow( missing, &x_miss, idx);
                        else
                            cvGetCol( missing, &x_miss, idx);

                        res = (float)tree->predict(&x, &x_miss)->value;
                    }
                    else
                    {
                        res = (float)tree->predict(&x)->value;
                    }
                    sum_response_tmp->data.fl[idx + k*n] =
                                    sum_response->data.fl[idx + k*n] +
                                    params.shrinkage * res;
                }
            }

            cvSeqPush( weak[k], &tree );
            tree = 0;
        } // k=0..class_count

        // swap the current and the updated prediction sums
        CvMat* tmp;
        tmp = sum_response_tmp;
        sum_response_tmp = sum_response;
        sum_response = tmp;
        tmp = 0;
    } // i=0..params.weak_count

    delete[] idx_data;
    cvReleaseMat(&new_responses);
    data->free_train_data();

    return true;

} // CvGBTrees::train(...)

//===========================================================================

inline float Sign(float x)
{
    if (x < 0.0f) return -1.0f;
    else if (x > 0.0f) return 1.0f;
    return 0.0f;
}

//===========================================================================

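// Writes the negative gradient of the loss, evaluated at the current
// prediction sum for class k, into data->responses for every training
// subsample element; the next tree is then fit to these pseudo-residuals.
// SQUARED_LOSS: residual r = y - F(x); ABSOLUTE_LOSS: Sign(r);
// HUBER_LOSS: r clipped to +/- delta, where delta is the alpha-quantile
// of |r|; DEVIANCE_LOSS: 1{class(x) == k} - softmax_k(F(x)).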
void CvGBTrees::find_gradient(const int k)
{
    int* sample_data = sample_idx->data.i;
    int* subsample_data = subsample_train->data.i;
    float* grad_data = data->responses->data.fl;
    float* resp_data = orig_response->data.fl;
    float* current_data = sum_response->data.fl;

    switch (params.loss_function_type)
    // loss_function_type in
    // {SQUARED_LOSS, ABSOLUTE_LOSS, HUBER_LOSS, DEVIANCE_LOSS}
    {
        case SQUARED_LOSS:
        {
            for (int i=0; i<get_len(subsample_train); ++i)
            {
                int s_step = (sample_idx->cols > sample_idx->rows) ? 1
                             : sample_idx->step/CV_ELEM_SIZE(sample_idx->type);
                int idx = *(sample_data + subsample_data[i]*s_step);
                grad_data[idx] = resp_data[idx] - current_data[idx];
            }
        }; break;

        case ABSOLUTE_LOSS:
        {
            for (int i=0; i<get_len(subsample_train); ++i)
            {
                int s_step = (sample_idx->cols > sample_idx->rows) ? 1
                             : sample_idx->step/CV_ELEM_SIZE(sample_idx->type);
                int idx = *(sample_data + subsample_data[i]*s_step);
                grad_data[idx] = Sign(resp_data[idx] - current_data[idx]);
            }
        }; break;

        case HUBER_LOSS:
        {
            float alpha = 0.2f;
            int n = get_len(subsample_train);
            int s_step = (sample_idx->cols > sample_idx->rows) ? 1
                         : sample_idx->step/CV_ELEM_SIZE(sample_idx->type);

            float* residuals = new float[n];
            for (int i=0; i<n; ++i)
            {
                int idx = *(sample_data + subsample_data[i]*s_step);
                residuals[i] = fabs(resp_data[idx] - current_data[idx]);
            }
            icvSortFloat(residuals, n, 0.0f);

            // alpha-quantile of the absolute residuals; the index is clamped
            // so that n == 1 does not read past the end of the array
            delta = residuals[MIN(n-1, int(ceil(n*alpha)))];

            for (int i=0; i<n; ++i)
            {
                int idx = *(sample_data + subsample_data[i]*s_step);
                float r = resp_data[idx] - current_data[idx];
                grad_data[idx] = (fabs(r) > delta) ? delta*Sign(r) : r;
            }
            delete[] residuals;

        }; break;

        case DEVIANCE_LOSS:
        {
            for (int i=0; i<get_len(subsample_train); ++i)
            {
                double exp_fk = 0;
                double exp_sfi = 0;
                int s_step = (sample_idx->cols > sample_idx->rows) ? 1
                             : sample_idx->step/CV_ELEM_SIZE(sample_idx->type);
                int idx = *(sample_data + subsample_data[i]*s_step);

                for (int j=0; j<class_count; ++j)
                {
                    double res;
                    res = current_data[idx + j*sum_response->cols];
                    res = exp(res);
                    if (j == k) exp_fk = res;
                    exp_sfi += res;
                }
                int orig_label = int(resp_data[idx]);
                int ensemble_label = 0;
                while (class_labels->data.i[ensemble_label] - orig_label)
                    ensemble_label++;

                grad_data[idx] = (float)(!(k-ensemble_label)) -
                                 (float)(exp_fk / exp_sfi);
            }
        }; break;

        default: break;
    }

} // CvGBTrees::find_gradient(...)

//===========================================================================

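// Sets each leaf of the newly fit tree to the loss-optimal constant for the
// subsample elements that fall into it (found by find_optimal_value), then
// updates the running prediction sum for class _k by shrinkage * leaf value.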
void CvGBTrees::change_values(CvDTree* tree, const int _k)
{
    CvDTreeNode** predictions = new pCvDTreeNode[get_len(subsample_train)];

    int* sample_data = sample_idx->data.i;
    int* subsample_data = subsample_train->data.i;
    int s_step = (sample_idx->cols > sample_idx->rows) ? 1
                 : sample_idx->step/CV_ELEM_SIZE(sample_idx->type);

    CvMat x;
    CvMat miss_x;

    for (int i=0; i<get_len(subsample_train); ++i)
    {
        int idx = *(sample_data + subsample_data[i]*s_step);
        if (data->tflag == CV_ROW_SAMPLE)
            cvGetRow( data->train_data, &x, idx);
        else
            cvGetCol( data->train_data, &x, idx);

        if (missing)
        {
            if (data->tflag == CV_ROW_SAMPLE)
                cvGetRow( missing, &miss_x, idx);
            else
                cvGetCol( missing, &miss_x, idx);

            predictions[i] = tree->predict(&x, &miss_x);
        }
        else
            predictions[i] = tree->predict(&x);
    }


    CvDTreeNode** leaves;
    int leaves_count = 0;
    leaves = GetLeaves( tree, leaves_count);

    for (int i=0; i<leaves_count; ++i)
    {
        int samples_in_leaf = 0;
        for (int j=0; j<get_len(subsample_train); ++j)
        {
            if (leaves[i] == predictions[j]) samples_in_leaf++;
        }

        if (!samples_in_leaf) // no subsample reached this leaf; guard anyway
        {
            leaves[i]->value = 0.0;
            continue;
        }

        CvMat* leaf_idx = cvCreateMat(1, samples_in_leaf, CV_32S);
        int* leaf_idx_data = leaf_idx->data.i;

        for (int j=0; j<get_len(subsample_train); ++j)
        {
            int idx = *(sample_data + subsample_data[j]*s_step);
            if (leaves[i] == predictions[j])
                *leaf_idx_data++ = idx;
        }

        float value = find_optimal_value(leaf_idx);
        leaves[i]->value = value;

        leaf_idx_data = leaf_idx->data.i;

        int len = sum_response_tmp->cols;
        for (int j=0; j<get_len(leaf_idx); ++j)
        {
            int idx = leaf_idx_data[j];
            sum_response_tmp->data.fl[idx + _k*len] =
                                    sum_response->data.fl[idx + _k*len] +
                                    params.shrinkage * value;
        }
        leaf_idx_data = 0;
        cvReleaseMat(&leaf_idx);
    }

    // releasing the memory
    for (int i=0; i<get_len(subsample_train); ++i)
    {
        predictions[i] = 0;
    }
    delete[] predictions;

    for (int i=0; i<leaves_count; ++i)
    {
        leaves[i] = 0;
    }
    delete[] leaves;

}

//===========================================================================
/*
// An alternative change_values() implementation that reuses the sample
// indices stored in the training data instead of re-running prediction;
// kept for reference.
void CvGBTrees::change_values(CvDTree* tree, const int _k)
{

    CvDTreeNode** leaves;
    int leaves_count = 0;
    int offset = _k*sum_response_tmp->cols;
    CvMat leaf_idx;
    leaf_idx.rows = 1;

    leaves = GetLeaves( tree, leaves_count);

    for (int i=0; i<leaves_count; ++i)
    {
        int n = leaves[i]->sample_count;
        int* leaf_idx_data = new int[n];
        data->get_sample_indices(leaves[i], leaf_idx_data);
        leaf_idx.cols = n;
        leaf_idx.data.i = leaf_idx_data;

        float value = find_optimal_value(&leaf_idx);
        leaves[i]->value = value;
        float val = params.shrinkage * value;


        for (int j=0; j<n; ++j)
        {
            int idx = leaf_idx_data[j] + offset;
            sum_response_tmp->data.fl[idx] = sum_response->data.fl[idx] + val;
        }
        leaf_idx.data.i = 0;
        delete[] leaf_idx_data;
    }

    // releasing the memory
    for (int i=0; i<leaves_count; ++i)
    {
        leaves[i] = 0;
    }
    delete[] leaves;

}    // change_values(...);
*/
//===========================================================================

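// Computes the loss-optimal constant (the "gamma" of gradient boosting) for
// the samples listed in _Idx. SQUARED_LOSS: mean residual; ABSOLUTE_LOSS:
// median residual; HUBER_LOSS: median plus the averaged clipped deviations;
// DEVIANCE_LOSS: the Newton step (K-1)/K * sum(g) / sum(|g|*(1-|g|)).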
float CvGBTrees::find_optimal_value( const CvMat* _Idx )
{

    double gamma = (double)0.0;

    int* idx = _Idx->data.i;
    float* resp_data = orig_response->data.fl;
    float* cur_data = sum_response->data.fl;
    int n = get_len(_Idx);

    switch (params.loss_function_type)
    // SQUARED_LOSS=0, ABSOLUTE_LOSS=1, HUBER_LOSS=3, DEVIANCE_LOSS=4
    {
    case SQUARED_LOSS:
        {
            for (int i=0; i<n; ++i)
                gamma += resp_data[idx[i]] - cur_data[idx[i]];
            gamma /= (double)n;
        }; break;

    case ABSOLUTE_LOSS:
        {
            float* residuals = new float[n];
            for (int i=0; i<n; ++i, ++idx)
                residuals[i] = (resp_data[*idx] - cur_data[*idx]);
            icvSortFloat(residuals, n, 0.0f);
            if (n % 2)
                gamma = residuals[n/2];
            else gamma = (residuals[n/2-1] + residuals[n/2]) / 2.0f;
            delete[] residuals;
        }; break;

    case HUBER_LOSS:
        {
            float* residuals = new float[n];
            for (int i=0; i<n; ++i, ++idx)
                residuals[i] = (resp_data[*idx] - cur_data[*idx]);
            icvSortFloat(residuals, n, 0.0f);

            int n_half = n >> 1;
            float r_median = (n == n_half<<1) ?
                        (residuals[n_half-1] + residuals[n_half]) / 2.0f :
                        residuals[n_half];

            for (int i=0; i<n; ++i)
            {
                float dif = residuals[i] - r_median;
                gamma += (delta < fabs(dif)) ? Sign(dif)*delta : dif;
            }
            gamma /= (double)n;
            gamma += r_median;
            delete[] residuals;

        }; break;

    case DEVIANCE_LOSS:
        {
            float* grad_data = data->responses->data.fl;
            double tmp1 = 0;
            double tmp2 = 0;
            double tmp  = 0;
            for (int i=0; i<n; ++i)
            {
                tmp = grad_data[idx[i]];
                tmp1 += tmp;
                tmp2 += fabs(tmp)*(1-fabs(tmp));
            };
            if (tmp2 == 0)
            {
                tmp2 = 1;
            }

            gamma = ((double)(class_count-1)) / (double)class_count * (tmp1/tmp2);
        }; break;

    default: break;
    }

    return float(gamma);

} // CvGBTrees::find_optimal_value

//===========================================================================


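// Recursively collects the leaf nodes of a decision tree into a flat array.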
void CvGBTrees::leaves_get( CvDTreeNode** leaves, int& count, CvDTreeNode* node )
{
    if (node->left != NULL)  leaves_get(leaves, count, node->left);
    if (node->right != NULL) leaves_get(leaves, count, node->right);
    if ((node->left == NULL) && (node->right == NULL))
        leaves[count++] = node;
}

//---------------------------------------------------------------------------

CvDTreeNode** CvGBTrees::GetLeaves( const CvDTree* dtree, int& len )
{
    len = 0;
    CvDTreeNode** leaves = new pCvDTreeNode[(size_t)1 << params.max_depth];
    leaves_get(leaves, len, const_cast<pCvDTreeNode>(dtree->get_root()));
    return leaves;
}

//===========================================================================

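// Fills the shared index buffer with a random permutation of 0..n-1: the
// first train_sample_count entries (subsample_train) form the training
// subsample, the remainder (subsample_test) the out-of-bag test subsample.
// When no subsampling is used, the identity permutation is kept.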
void CvGBTrees::do_subsample()
{

    int n = get_len(sample_idx);
    int* idx = subsample_train->data.i;

    for (int i = 0; i < n; i++ )
        idx[i] = i;

    if (subsample_test)
        for (int i = 0; i < n; i++)
        {
            int a = (*rng)(n);
            int b = (*rng)(n);
            int t;
            CV_SWAP( idx[a], idx[b], t );
        }

/*
    int n = get_len(sample_idx);
    if (subsample_train == 0)
        subsample_train = cvCreateMat(1, n, CV_32S);
    int* subsample_data = subsample_train->data.i;
    for (int i=0; i<n; ++i)
        subsample_data[i] = i;
    subsample_test = 0;
*/
}

//===========================================================================

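// Serial (single-threaded) prediction: sums the shrinkage-scaled responses of
// the trees in the given slice for each class, adds base_value, and returns
// the raw sum (regression, or a specific class k >= 0) or the original label
// of the class with the maximal sum. Optionally records each tree's raw
// response in weak_responses.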
float CvGBTrees::predict_serial( const CvMat* _sample, const CvMat* _missing,
        CvMat* weak_responses, CvSlice slice, int k) const
{
    float result = 0.0f;

    if (!weak) return 0.0f;

    CvSeqReader reader;
    int weak_count = cvSliceLength( slice, weak[class_count-1] );
    CvDTree* tree;

    if (weak_responses)
    {
        if (CV_MAT_TYPE(weak_responses->type) != CV_32F)
            return 0.0f;
        if ((k >= 0) && (k<class_count) && (weak_responses->rows != 1))
            return 0.0f;
        if ((k == -1) && (weak_responses->rows != class_count))
            return 0.0f;
        if (weak_responses->cols != weak_count)
            return 0.0f;
    }

    float* sum = new float[class_count];
    memset(sum, 0, class_count*sizeof(float));

    for (int i=0; i<class_count; ++i)
    {
        if ((weak[i]) && (weak_count))
        {
            cvStartReadSeq( weak[i], &reader );
            cvSetSeqReaderPos( &reader, slice.start_index );
            for (int j=0; j<weak_count; ++j)
            {
                CV_READ_SEQ_ELEM( tree, reader );
                float p = (float)(tree->predict(_sample, _missing)->value);
                sum[i] += params.shrinkage * p;
                if (weak_responses)
                    weak_responses->data.fl[i*weak_count+j] = p;
            }
        }
    }

    for (int i=0; i<class_count; ++i)
        sum[i] += base_value;

    if (class_count == 1)
    {
        result = sum[0];
        delete[] sum;
        return result;
    }

    if ((k>=0) && (k<class_count))
    {
        result = sum[k];
        delete[] sum;
        return result;
    }

    float max = sum[0];
    int class_label = 0;
    for (int i=1; i<class_count; ++i)
        if (sum[i] > max)
        {
            max = sum[i];
            class_label = i;
        }

    delete[] sum;

    int orig_class_label = class_labels->data.i[class_label];

    return float(orig_class_label);
}


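// Parallel loop body used by CvGBTrees::predict(): each worker predicts over
// a sub-range of trees and accumulates shrinkage-scaled responses into the
// shared per-class sum array under a mutex.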
class Tree_predictor : public cv::ParallelLoopBody
{
private:
    pCvSeq* weak;
    float* sum;
    const int k;
    const CvMat* sample;
    const CvMat* missing;
    const float shrinkage;

    static cv::Mutex SumMutex;


public:
    Tree_predictor() : weak(0), sum(0), k(0), sample(0), missing(0), shrinkage(1.0f) {}
    Tree_predictor(pCvSeq* _weak, const int _k, const float _shrinkage,
                   const CvMat* _sample, const CvMat* _missing, float* _sum ) :
                   weak(_weak), sum(_sum), k(_k), sample(_sample),
                   missing(_missing), shrinkage(_shrinkage)
    {}

    Tree_predictor( const Tree_predictor& p, cv::Split ) :
            weak(p.weak), sum(p.sum), k(p.k), sample(p.sample),
            missing(p.missing), shrinkage(p.shrinkage)
    {}

    // intentionally a no-op: the const members cannot be reassigned
    Tree_predictor& operator=( const Tree_predictor& )
    { return *this; }

    virtual void operator()(const cv::Range& range) const
    {
        CvSeqReader reader;
        int begin = range.start;
        int end = range.end;

        int weak_count = end - begin;
        CvDTree* tree;

        for (int i=0; i<k; ++i)
        {
            float tmp_sum = 0.0f;
            if ((weak[i]) && (weak_count))
            {
                cvStartReadSeq( weak[i], &reader );
                cvSetSeqReaderPos( &reader, begin );
                for (int j=0; j<weak_count; ++j)
                {
                    CV_READ_SEQ_ELEM( tree, reader );
                    tmp_sum += shrinkage*(float)(tree->predict(sample, missing)->value);
                }
            }

            {
                cv::AutoLock lock(SumMutex);
                sum[i] += tmp_sum;
            }
        }
    } // Tree_predictor::operator()

    virtual ~Tree_predictor() {}

}; // class Tree_predictor

cv::Mutex Tree_predictor::SumMutex;


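// Parallel prediction over the trees of the requested slice; semantics match
// predict_serial() except that per-tree weak responses are not reported.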
float CvGBTrees::predict( const CvMat* _sample, const CvMat* _missing,
            CvMat* /*weak_responses*/, CvSlice slice, int k) const
{
    float result = 0.0f;
    if (!weak) return 0.0f;
    float* sum = new float[class_count];
    for (int i=0; i<class_count; ++i)
        sum[i] = 0.0f;
    int begin = slice.start_index;
    int end = begin + cvSliceLength( slice, weak[0] );

    pCvSeq* weak_seq = weak;
    Tree_predictor predictor = Tree_predictor(weak_seq, class_count,
                                params.shrinkage, _sample, _missing, sum);

    cv::parallel_for_(cv::Range(begin, end), predictor);

    for (int i=0; i<class_count; ++i)
        sum[i] += base_value; // shrinkage is already applied per tree

    if (class_count == 1)
    {
        result = sum[0];
        delete[] sum;
        return result;
    }

    if ((k>=0) && (k<class_count))
    {
        result = sum[k];
        delete[] sum;
        return result;
    }

    float max = sum[0];
    int class_label = 0;
    for (int i=1; i<class_count; ++i)
        if (sum[i] > max)
        {
            max = sum[i];
            class_label = i;
        }

    delete[] sum;
    int orig_class_label = class_labels->data.i[class_label];

    return float(orig_class_label);
}


//===========================================================================

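// Serialization: writes the training parameters (loss function by name,
// ensemble length, shrinkage, subsample portion, class labels) followed by
// the CvDTreeTrainData parameters.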
void CvGBTrees::write_params( CvFileStorage* fs ) const
{
    const char* loss_function_type_str =
        params.loss_function_type == SQUARED_LOSS ? "SquaredLoss" :
        params.loss_function_type == ABSOLUTE_LOSS ? "AbsoluteLoss" :
        params.loss_function_type == HUBER_LOSS ? "HuberLoss" :
        params.loss_function_type == DEVIANCE_LOSS ? "DevianceLoss" : 0;


    if( loss_function_type_str )
        cvWriteString( fs, "loss_function", loss_function_type_str );
    else
        cvWriteInt( fs, "loss_function", params.loss_function_type );

    cvWriteInt( fs, "ensemble_length", params.weak_count );
    cvWriteReal( fs, "shrinkage", params.shrinkage );
    cvWriteReal( fs, "subsample_portion", params.subsample_portion );
    //cvWriteInt( fs, "max_tree_depth", params.max_depth );
    //cvWriteString( fs, "use_surrogate_splits", params.use_surrogates ? "true" : "false");
    if (class_labels) cvWrite( fs, "class_labels", class_labels);

    data->is_classifier = !problem_type();
    data->write_params( fs );
    data->is_classifier = false;
}


//===========================================================================

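// Deserialization counterpart of write_params(): restores the tree-training
// parameters through CvDTreeTrainData, then reads the boosting parameters
// and validates the loss-function value.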
void CvGBTrees::read_params( CvFileStorage* fs, CvFileNode* fnode )
{
    CV_FUNCNAME( "CvGBTrees::read_params" );
    __BEGIN__;


    CvFileNode* temp;

    if( !fnode || !CV_NODE_IS_MAP(fnode->tag) )
        return;

    data = new CvDTreeTrainData();
    CV_CALL( data->read_params(fs, fnode));
    data->shared = true;

    params.max_depth = data->params.max_depth;
    params.min_sample_count = data->params.min_sample_count;
    params.max_categories = data->params.max_categories;
    params.priors = data->params.priors;
    params.regression_accuracy = data->params.regression_accuracy;
    params.use_surrogates = data->params.use_surrogates;

    temp = cvGetFileNodeByName( fs, fnode, "loss_function" );
    if( !temp )
        EXIT;

    if( temp && CV_NODE_IS_STRING(temp->tag) )
    {
        const char* loss_function_type_str = cvReadString( temp, "" );
        params.loss_function_type = strcmp( loss_function_type_str, "SquaredLoss" ) == 0 ? SQUARED_LOSS :
                            strcmp( loss_function_type_str, "AbsoluteLoss" ) == 0 ? ABSOLUTE_LOSS :
                            strcmp( loss_function_type_str, "HuberLoss" ) == 0 ? HUBER_LOSS :
                            strcmp( loss_function_type_str, "DevianceLoss" ) == 0 ? DEVIANCE_LOSS : -1;
    }
    else
        params.loss_function_type = cvReadInt( temp, -1 );


    // 2 is an unused value in the loss-function enum
    if( params.loss_function_type < SQUARED_LOSS || params.loss_function_type > DEVIANCE_LOSS || params.loss_function_type == 2)
        CV_ERROR( CV_StsBadArg, "Unknown loss function" );

    params.weak_count = cvReadIntByName( fs, fnode, "ensemble_length" );
    params.shrinkage = (float)cvReadRealByName( fs, fnode, "shrinkage", 0.1 );
    params.subsample_portion = (float)cvReadRealByName( fs, fnode, "subsample_portion", 1.0 );

    if (data->is_classifier)
    {
        class_labels = (CvMat*)cvReadByName( fs, fnode, "class_labels" );
        if( class_labels && !CV_IS_MAT(class_labels))
            CV_ERROR( CV_StsParseError, "class_labels must be stored as a matrix");
    }
    data->is_classifier = false;

    __END__;
}



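// Writes the whole model: parameters, base_value, class_count, and one
// "trees_<k>" sequence of serialized CvDTree objects per class.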
void CvGBTrees::write( CvFileStorage* fs, const char* name ) const
{
    CV_FUNCNAME( "CvGBTrees::write" );

    __BEGIN__;

    CvSeqReader reader;
    int i;
    std::string s;

    cvStartWriteStruct( fs, name, CV_NODE_MAP, CV_TYPE_NAME_ML_GBT );

    if( !weak )
        CV_ERROR( CV_StsBadArg, "The model has not been trained yet" );

    write_params( fs );
    cvWriteReal( fs, "base_value", base_value);
    cvWriteInt( fs, "class_count", class_count);

    for ( int j=0; j < class_count; ++j )
    {
        s = "trees_";
        s += ToString(j);
        cvStartWriteStruct( fs, s.c_str(), CV_NODE_SEQ );

        cvStartReadSeq( weak[j], &reader );

        for( i = 0; i < weak[j]->total; i++ )
        {
            CvDTree* tree;
            CV_READ_SEQ_ELEM( tree, reader );
            cvStartWriteStruct( fs, 0, CV_NODE_MAP );
            tree->write( fs );
            cvEndWriteStruct( fs );
        }

        cvEndWriteStruct( fs );
    }

    cvEndWriteStruct( fs );

    __END__;
}


//===========================================================================

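// Reads a model previously stored by write(): restores the parameters, then
// rebuilds each per-class tree sequence from its "trees_<k>" node.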
void CvGBTrees::read( CvFileStorage* fs, CvFileNode* node )
{

    CV_FUNCNAME( "CvGBTrees::read" );

    __BEGIN__;

    CvSeqReader reader;
    CvFileNode* trees_fnode;
    CvMemStorage* storage;
    int i, ntrees;
    std::string s;

    clear();
    read_params( fs, node );

    if( !data )
        EXIT;

    base_value = (float)cvReadRealByName( fs, node, "base_value", 0.0 );
    class_count = cvReadIntByName( fs, node, "class_count", 1 );

    weak = new pCvSeq[class_count];


    for (int j=0; j<class_count; ++j)
    {
        s = "trees_";
        s += ToString(j);

        trees_fnode = cvGetFileNodeByName( fs, node, s.c_str() );
        if( !trees_fnode || !CV_NODE_IS_SEQ(trees_fnode->tag) )
            CV_ERROR( CV_StsParseError, "<trees_x> tag is missing" );

        cvStartReadSeq( trees_fnode->data.seq, &reader );
        ntrees = trees_fnode->data.seq->total;

        if( ntrees != params.weak_count )
            CV_ERROR( CV_StsUnmatchedSizes,
            "The number of trees stored does not match <ntrees> tag value" );

        CV_CALL( storage = cvCreateMemStorage() );
        weak[j] = cvCreateSeq( 0, sizeof(CvSeq), sizeof(CvDTree*), storage );

        for( i = 0; i < ntrees; i++ )
        {
            CvDTree* tree = new CvDTree();
            CV_CALL(tree->read( fs, (CvFileNode*)reader.ptr, data ));
            CV_NEXT_SEQ_ELEM( reader.seq->elem_size, reader );
            cvSeqPush( weak[j], &tree );
        }
    }

    __END__;
}

//===========================================================================

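// Parallel loop body used by calc_error(): predicts each sample of the
// evaluated set independently via predict_serial().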
class Sample_predictor : public cv::ParallelLoopBody
{
private:
    const CvGBTrees* gbt;
    float* predictions;
    const CvMat* samples;
    const CvMat* missing;
    const CvMat* idx;
    CvSlice slice;

public:
    Sample_predictor() : gbt(0), predictions(0), samples(0), missing(0),
                         idx(0), slice(CV_WHOLE_SEQ)
    {}

    Sample_predictor(const CvGBTrees* _gbt, float* _predictions,
                   const CvMat* _samples, const CvMat* _missing,
                   const CvMat* _idx, CvSlice _slice=CV_WHOLE_SEQ) :
                   gbt(_gbt), predictions(_predictions), samples(_samples),
                   missing(_missing), idx(_idx), slice(_slice)
    {}


    Sample_predictor( const Sample_predictor& p, cv::Split ) :
            gbt(p.gbt), predictions(p.predictions),
            samples(p.samples), missing(p.missing), idx(p.idx),
            slice(p.slice)
    {}


    virtual void operator()(const cv::Range& range) const
    {
        int begin = range.start;
        int end = range.end;

        CvMat x;
        CvMat miss;

        for (int i=begin; i<end; ++i)
        {
            int j = idx ? idx->data.i[i] : i;
            cvGetRow(samples, &x, j);
            if (!missing)
            {
                predictions[i] = gbt->predict_serial(&x,0,0,slice);
            }
            else
            {
                cvGetRow(missing, &miss, j);
                predictions[i] = gbt->predict_serial(&x,&miss,0,slice);
            }
        }
    } // Sample_predictor::operator()

    virtual ~Sample_predictor() {}

}; // class Sample_predictor



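// Computes the train or test error of the model on the data set held by
// _data: the misclassification percentage for classification, the mean
// squared error for regression.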
// type in {CV_TRAIN_ERROR, CV_TEST_ERROR}
float
CvGBTrees::calc_error( CvMLData* _data, int type, std::vector<float> *resp )
{

    float err = 0.0f;
    const CvMat* _sample_idx = (type == CV_TRAIN_ERROR) ?
                              _data->get_train_sample_idx() :
                              _data->get_test_sample_idx();
    const CvMat* response = _data->get_responses();

    int n = _sample_idx ? get_len(_sample_idx) : 0;
    n = (type == CV_TRAIN_ERROR && n == 0) ? _data->get_values()->rows : n;

    if (!n)
        return -FLT_MAX;

    float* pred_resp = 0;
    bool needsFreeing = false;

    if (resp)
    {
        resp->resize(n);
        pred_resp = &((*resp)[0]);
    }
    else
    {
        pred_resp = new float[n];
        needsFreeing = true;
    }

    Sample_predictor predictor = Sample_predictor(this, pred_resp, _data->get_values(),
            _data->get_missing(), _sample_idx);

    cv::parallel_for_(cv::Range(0,n), predictor);

    int* sidx = _sample_idx ? _sample_idx->data.i : 0;
    int r_step = CV_IS_MAT_CONT(response->type) ?
                1 : response->step / CV_ELEM_SIZE(response->type);


    if ( !problem_type() )
    {
        // classification: percentage of misclassified samples
        for( int i = 0; i < n; i++ )
        {
            int si = sidx ? sidx[i] : i;
            int d = fabs((double)pred_resp[i] - response->data.fl[si*r_step]) <= FLT_EPSILON ? 0 : 1;
            err += d;
        }
        err = err / (float)n * 100.0f;
    }
    else
    {
        // regression: mean squared error
        for( int i = 0; i < n; i++ )
        {
            int si = sidx ? sidx[i] : i;
            float d = pred_resp[i] - response->data.fl[si*r_step];
            err += d*d;
        }
        err = err / (float)n;
    }

    if (needsFreeing)
        delete[] pred_resp;

    return err;
}


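// cv::Mat-based convenience wrappers: they build CvMat headers over the input
// matrices and forward to the CvMat* interface above. A minimal usage sketch
// (hypothetical `values`/`responses` cv::Mat inputs; optional arguments take
// their header defaults):
//
//     CvGBTreesParams p(CvGBTrees::SQUARED_LOSS, 100, 0.1f, 0.8f, 3, false);
//     CvGBTrees gbt;
//     gbt.train(values, CV_ROW_SAMPLE, responses,
//               cv::Mat(), cv::Mat(), cv::Mat(), cv::Mat(), p);
//     float y = gbt.predict(sample);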
CvGBTrees::CvGBTrees( const cv::Mat& trainData, int tflag,
          const cv::Mat& responses, const cv::Mat& varIdx,
          const cv::Mat& sampleIdx, const cv::Mat& varType,
          const cv::Mat& missingDataMask,
          CvGBTreesParams _params )
{
    data = 0;
    weak = 0;
    default_model_name = "my_boost_tree";
    orig_response = sum_response = sum_response_tmp = 0;
    subsample_train = subsample_test = 0;
    missing = sample_idx = 0;
    class_labels = 0;
    class_count = 1;
    delta = 0.0f;

    clear();

    train(trainData, tflag, responses, varIdx, sampleIdx, varType, missingDataMask, _params, false);
}

bool CvGBTrees::train( const cv::Mat& trainData, int tflag,
                   const cv::Mat& responses, const cv::Mat& varIdx,
                   const cv::Mat& sampleIdx, const cv::Mat& varType,
                   const cv::Mat& missingDataMask,
                   CvGBTreesParams _params,
                   bool update )
{
    CvMat _trainData = trainData, _responses = responses;
    CvMat _varIdx = varIdx, _sampleIdx = sampleIdx, _varType = varType;
    CvMat _missingDataMask = missingDataMask;

    return train( &_trainData, tflag, &_responses, varIdx.empty() ? 0 : &_varIdx,
                  sampleIdx.empty() ? 0 : &_sampleIdx, varType.empty() ? 0 : &_varType,
                  missingDataMask.empty() ? 0 : &_missingDataMask, _params, update);
}

float CvGBTrees::predict( const cv::Mat& sample, const cv::Mat& _missing,
                          const cv::Range& slice, int k ) const
{
    CvMat _sample = sample, miss = _missing;
    return predict(&_sample, _missing.empty() ? 0 : &miss, 0,
                   slice==cv::Range::all() ? CV_WHOLE_SEQ : cvSlice(slice.start, slice.end), k);
}