// modules/ml/src/gbt.cpp -- Gradient Boosted Trees (CvGBTrees) implementation

#include "precomp.hpp"
#include <string>
#include <time.h>

using namespace std;

#define pCvSeq CvSeq*
#define pCvDTreeNode CvDTreeNode*

#define CV_CMP_FLOAT(a,b) ((a) < (b))
static CV_IMPLEMENT_QSORT_EX( icvSortFloat, float, CV_CMP_FLOAT, float)
#define CV_CMP_INT(a,b) ((a) < (b))
static CV_IMPLEMENT_QSORT_EX( icvSortInt, int, CV_CMP_INT, int)

//===========================================================================
static string ToString(int i)
{
    stringstream tmp;
    tmp << i;

    return tmp.str();
}


//===========================================================================
//----------------------------- CvGBTreesParams -----------------------------
//===========================================================================

CvGBTreesParams::CvGBTreesParams()
            : CvDTreeParams( 3, 10, 0, false, 10, 0, false, false, 0 )
{
    weak_count = 200;
    loss_function_type = CvGBTrees::SQUARED_LOSS;
    subsample_portion = 0.8f;
    shrinkage = 0.01f;
}

//===========================================================================

CvGBTreesParams::CvGBTreesParams( int _loss_function_type, int _weak_count,
                         float _shrinkage, float _subsample_portion,
                         int _max_depth, bool _use_surrogates )
            : CvDTreeParams( 3, 10, 0, false, 10, 0, false, false, 0 )
{
    loss_function_type = _loss_function_type;
    weak_count = _weak_count;
    shrinkage = _shrinkage;
    subsample_portion = _subsample_portion;
    max_depth = _max_depth;
    use_surrogates = _use_surrogates;
}

//===========================================================================
//------------------------------- CvGBTrees ---------------------------------
//===========================================================================

CvGBTrees::CvGBTrees()
{
    data = 0;
    weak = 0;
    default_model_name = "my_boost_tree";
    orig_response = sum_response = sum_response_tmp = 0;
    subsample_train = subsample_test = 0;
    missing = sample_idx = 0;
    class_labels = 0;
    class_count = 1;
    delta = 0.0f;

    clear();
}

//===========================================================================

int CvGBTrees::get_len(const CvMat* mat) const
{
    return (mat->cols > mat->rows) ? mat->cols : mat->rows;
}

//===========================================================================

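// Frees all per-model state: deletes every weak tree in each per-class
// sequence, releases the shared CvDTreeTrainData, and releases all cached
// response/index matrices. Called from the destructor and before retraining.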
void CvGBTrees::clear()
{
    if( weak )
    {
        CvSeqReader reader;
        CvSlice slice = CV_WHOLE_SEQ;
        CvDTree* tree;

        //data->shared = false;
        for (int i=0; i<class_count; ++i)
        {
            int weak_count = cvSliceLength( slice, weak[i] );
            if ((weak[i]) && (weak_count))
            {
                cvStartReadSeq( weak[i], &reader );
                cvSetSeqReaderPos( &reader, slice.start_index );
                for (int j=0; j<weak_count; ++j)
                {
                    CV_READ_SEQ_ELEM( tree, reader );
                    delete tree;
                    tree = 0;
                }
            }
        }
        for (int i=0; i<class_count; ++i)
            if (weak[i]) cvReleaseMemStorage( &(weak[i]->storage) );
        delete[] weak;
    }
    if (data)
    {
        data->shared = false;
        delete data;
    }
    weak = 0;
    data = 0;
    delta = 0.0f;
    cvReleaseMat( &orig_response );
    cvReleaseMat( &sum_response );
    cvReleaseMat( &sum_response_tmp );
    cvReleaseMat( &subsample_train );
    cvReleaseMat( &subsample_test );
    cvReleaseMat( &sample_idx );
    cvReleaseMat( &missing );
    cvReleaseMat( &class_labels );
}

//===========================================================================

CvGBTrees::~CvGBTrees()
{
    clear();
}

//===========================================================================

CvGBTrees::CvGBTrees( const CvMat* _train_data, int _tflag,
                  const CvMat* _responses, const CvMat* _var_idx,
                  const CvMat* _sample_idx, const CvMat* _var_type,
                  const CvMat* _missing_mask, CvGBTreesParams _params )
{
    weak = 0;
    data = 0;
    default_model_name = "my_boost_tree";
    orig_response = sum_response = sum_response_tmp = 0;
    subsample_train = subsample_test = 0;
    missing = sample_idx = 0;
    class_labels = 0;
    class_count = 1;
    delta = 0.0f;

    train( _train_data, _tflag, _responses, _var_idx, _sample_idx,
           _var_type, _missing_mask, _params );
}

//===========================================================================

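// Maps the chosen loss onto the problem type: DEVIANCE_LOSS means
// classification (returns false); all other losses are treated as regression.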
bool CvGBTrees::problem_type() const
{
    switch (params.loss_function_type)
    {
    case DEVIANCE_LOSS: return false;
    default: return true;
    }
}

//===========================================================================

bool
CvGBTrees::train( CvMLData* _data, CvGBTreesParams _params, bool update )
{
    bool result;
    result = train ( _data->get_values(), CV_ROW_SAMPLE,
            _data->get_responses(), _data->get_var_idx(),
            _data->get_train_sample_idx(), _data->get_var_types(),
            _data->get_missing(), _params, update);
                                         // update is not supported
    return result;
}

//===========================================================================


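// Main training routine. Builds the shared CvDTreeTrainData, normalizes the
// responses and sample indices, sets the initial prediction (base_value), and
// then runs the gradient boosting loop: for each of params.weak_count
// iterations and each class, compute the negative gradient of the loss, fit a
// regression tree to it on a random subsample, and add the shrinkage-scaled
// tree response to the running prediction sum.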
bool
CvGBTrees::train( const CvMat* _train_data, int _tflag,
              const CvMat* _responses, const CvMat* _var_idx,
              const CvMat* _sample_idx, const CvMat* _var_type,
              const CvMat* _missing_mask,
              CvGBTreesParams _params, bool /*_update*/ ) // update is not supported
{
    CvMemStorage* storage = 0;

    params = _params;
    bool is_regression = problem_type();

    clear();
    /*
      n - number of samples
      m - number of variables
    */
    int n = _train_data->rows;
    int m = _train_data->cols;
    if (_tflag != CV_ROW_SAMPLE)
    {
        int tmp;
        CV_SWAP(n,m,tmp);
    }

    CvMat* new_responses = cvCreateMat( n, 1, CV_32F);
    cvZero(new_responses);

    data = new CvDTreeTrainData( _train_data, _tflag, new_responses, _var_idx,
        _sample_idx, _var_type, _missing_mask, _params, true, true );
    if (_missing_mask)
    {
        missing = cvCreateMat(_missing_mask->rows, _missing_mask->cols,
                              _missing_mask->type);
        cvCopy( _missing_mask, missing);
    }

    orig_response = cvCreateMat( 1, n, CV_32F );
    int step = (_responses->cols > _responses->rows) ? 1 : _responses->step / CV_ELEM_SIZE(_responses->type);
    switch (CV_MAT_TYPE(_responses->type))
    {
        case CV_32FC1:
        {
            for (int i=0; i<n; ++i)
                orig_response->data.fl[i] = _responses->data.fl[i*step];
        }; break;
        case CV_32SC1:
        {
            for (int i=0; i<n; ++i)
                orig_response->data.fl[i] = (float) _responses->data.i[i*step];
        }; break;
        default:
            CV_Error(CV_StsUnmatchedFormats, "Response should be a 32fC1 or 32sC1 vector.");
    }

    if (!is_regression)
    {
        class_count = 0;
        unsigned char * mask = new unsigned char[n];
        memset(mask, 0, n);
        // count the number of distinct output classes
        for (int i=0; i<n; ++i)
            if (!mask[i])
            {
                class_count++;
                for (int j=i; j<n; ++j)
                    if (int(orig_response->data.fl[j]) == int(orig_response->data.fl[i]))
                        mask[j] = 1;
            }
        delete[] mask;

        class_labels = cvCreateMat(1, class_count, CV_32S);
        class_labels->data.i[0] = int(orig_response->data.fl[0]);
        int j = 1;
        for (int i=1; i<n; ++i)
        {
            int k = 0;
            while ((k<j) && (int(orig_response->data.fl[i]) - class_labels->data.i[k]))
                k++;
            if (k == j)
            {
                class_labels->data.i[k] = int(orig_response->data.fl[i]);
                j++;
            }
        }
    }

    // inside the GBT learning process only regression decision trees are built
    data->is_classifier = false;

    // preprocessing sample indices
    if (_sample_idx)
    {
        int sample_idx_len = get_len(_sample_idx);

        switch (CV_MAT_TYPE(_sample_idx->type))
        {
            case CV_32SC1:
            {
                sample_idx = cvCreateMat( 1, sample_idx_len, CV_32S );
                for (int i=0; i<sample_idx_len; ++i)
                    sample_idx->data.i[i] = _sample_idx->data.i[i];
                icvSortInt(sample_idx->data.i, sample_idx_len, 0);
            } break;
            case CV_8S:
            case CV_8U:
            {
                int active_samples_count = 0;
                for (int i=0; i<sample_idx_len; ++i)
                    active_samples_count += int( _sample_idx->data.ptr[i] );
                sample_idx = cvCreateMat( 1, active_samples_count, CV_32S );
                active_samples_count = 0;
                for (int i=0; i<sample_idx_len; ++i)
                    if (int( _sample_idx->data.ptr[i] ))
                        sample_idx->data.i[active_samples_count++] = i;

            } break;
            default: CV_Error(CV_StsUnmatchedFormats, "_sample_idx should be a 32sC1, 8sC1 or 8uC1 vector.");
        }
    }
    else
    {
        sample_idx = cvCreateMat( 1, n, CV_32S );
        for (int i=0; i<n; ++i)
            sample_idx->data.i[i] = i;
    }

    sum_response = cvCreateMat(class_count, n, CV_32F);
    sum_response_tmp = cvCreateMat(class_count, n, CV_32F);
    cvZero(sum_response);

    delta = 0.0f;
    /*
      in the case of a regression problem the initial guess (the zero term
      in the sum) is set to the mean of all the training responses, which is
      the best constant model
    */
    if (is_regression) base_value = find_optimal_value(sample_idx);
    /*
      in the case of a classification problem the initial guess (the zero term
      in the sum) is set to zero for all the tree sequences
    */
    else base_value = 0.0f;
    /*
      the current prediction on all training samples is set to base_value
    */
    cvSet( sum_response, cvScalar(base_value) );

    weak = new pCvSeq[class_count];
    for (int i=0; i<class_count; ++i)
    {
        storage = cvCreateMemStorage();
        weak[i] = cvCreateSeq( 0, sizeof(CvSeq), sizeof(CvDTree*), storage );
        storage = 0;
    }

    // subsample params and data
    rng = &cv::theRNG();

    int samples_count = get_len(sample_idx);

    params.subsample_portion = params.subsample_portion <= FLT_EPSILON ||
        1 - params.subsample_portion <= FLT_EPSILON
        ? 1 : params.subsample_portion;
    int train_sample_count = cvFloor(params.subsample_portion * samples_count);
    if (train_sample_count == 0)
        train_sample_count = samples_count;
    int test_sample_count = samples_count - train_sample_count;
    int* idx_data = new int[samples_count];
    subsample_train = cvCreateMatHeader( 1, train_sample_count, CV_32SC1 );
    *subsample_train = cvMat( 1, train_sample_count, CV_32SC1, idx_data );
    if (test_sample_count)
    {
        subsample_test  = cvCreateMatHeader( 1, test_sample_count, CV_32SC1 );
        *subsample_test = cvMat( 1, test_sample_count, CV_32SC1,
                                 idx_data + train_sample_count );
    }

    // training procedure

    for ( int i=0; i < params.weak_count; ++i )
    {
        do_subsample();
        for ( int k=0; k < class_count; ++k )
        {
            find_gradient(k);
            CvDTree* tree = new CvDTree;
            tree->train( data, subsample_train );
            change_values(tree, k);

            if (subsample_test)
            {
                CvMat x;
                CvMat x_miss;
                int* sample_data = sample_idx->data.i;
                int* subsample_data = subsample_test->data.i;
                int s_step = (sample_idx->cols > sample_idx->rows) ? 1
                             : sample_idx->step/CV_ELEM_SIZE(sample_idx->type);
                for (int j=0; j<get_len(subsample_test); ++j)
                {
                    int idx = *(sample_data + subsample_data[j]*s_step);
                    float res = 0.0f;
                    if (_tflag == CV_ROW_SAMPLE)
                        cvGetRow( data->train_data, &x, idx);
                    else
                        cvGetCol( data->train_data, &x, idx);

                    if (missing)
                    {
                        if (_tflag == CV_ROW_SAMPLE)
                            cvGetRow( missing, &x_miss, idx);
                        else
                            cvGetCol( missing, &x_miss, idx);

                        res = (float)tree->predict(&x, &x_miss)->value;
                    }
                    else
                    {
                        res = (float)tree->predict(&x)->value;
                    }
                    sum_response_tmp->data.fl[idx + k*n] =
                                    sum_response->data.fl[idx + k*n] +
                                    params.shrinkage * res;
                }
            }

            cvSeqPush( weak[k], &tree );
            tree = 0;
        } // k=0..class_count

        // swap the current and the updated prediction sums
        CvMat* tmp;
        tmp = sum_response_tmp;
        sum_response_tmp = sum_response;
        sum_response = tmp;
        tmp = 0;
    } // i=0..params.weak_count

    delete[] idx_data;
    cvReleaseMat(&new_responses);
    data->free_train_data();

    return true;

} // CvGBTrees::train(...)

//===========================================================================

inline float Sign(float x)
{
    if (x < 0.0f) return -1.0f;
    else if (x > 0.0f) return 1.0f;
    return 0.0f;
}

//===========================================================================

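// Writes the negative gradient of the loss, evaluated at the current
// prediction sum for class k, into data->responses for every training
// subsample element; the next tree is then fit to these pseudo-residuals.
// SQUARED_LOSS: residual r = y - F(x); ABSOLUTE_LOSS: Sign(r);
// HUBER_LOSS: r clipped to +/- delta, where delta is the alpha-quantile
// of |r|; DEVIANCE_LOSS: 1{class(x) == k} - softmax_k(F(x)).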
void CvGBTrees::find_gradient(const int k)
{
    int* sample_data = sample_idx->data.i;
    int* subsample_data = subsample_train->data.i;
    float* grad_data = data->responses->data.fl;
    float* resp_data = orig_response->data.fl;
    float* current_data = sum_response->data.fl;

    switch (params.loss_function_type)
    // loss_function_type in
    // {SQUARED_LOSS, ABSOLUTE_LOSS, HUBER_LOSS, DEVIANCE_LOSS}
    {
        case SQUARED_LOSS:
        {
            for (int i=0; i<get_len(subsample_train); ++i)
            {
                int s_step = (sample_idx->cols > sample_idx->rows) ? 1
                             : sample_idx->step/CV_ELEM_SIZE(sample_idx->type);
                int idx = *(sample_data + subsample_data[i]*s_step);
                grad_data[idx] = resp_data[idx] - current_data[idx];
            }
        }; break;

        case ABSOLUTE_LOSS:
        {
            for (int i=0; i<get_len(subsample_train); ++i)
            {
                int s_step = (sample_idx->cols > sample_idx->rows) ? 1
                             : sample_idx->step/CV_ELEM_SIZE(sample_idx->type);
                int idx = *(sample_data + subsample_data[i]*s_step);
                grad_data[idx] = Sign(resp_data[idx] - current_data[idx]);
            }
        }; break;

        case HUBER_LOSS:
        {
            float alpha = 0.2f;
            int n = get_len(subsample_train);
            int s_step = (sample_idx->cols > sample_idx->rows) ? 1
                         : sample_idx->step/CV_ELEM_SIZE(sample_idx->type);

            float* residuals = new float[n];
            for (int i=0; i<n; ++i)
            {
                int idx = *(sample_data + subsample_data[i]*s_step);
                residuals[i] = fabs(resp_data[idx] - current_data[idx]);
            }
            icvSortFloat(residuals, n, 0.0f);

            // alpha-quantile of the absolute residuals; the index is clamped
            // so that n == 1 does not read past the end of the array
            delta = residuals[MIN(n-1, int(ceil(n*alpha)))];

            for (int i=0; i<n; ++i)
            {
                int idx = *(sample_data + subsample_data[i]*s_step);
                float r = resp_data[idx] - current_data[idx];
                grad_data[idx] = (fabs(r) > delta) ? delta*Sign(r) : r;
            }
            delete[] residuals;

        }; break;

        case DEVIANCE_LOSS:
        {
            for (int i=0; i<get_len(subsample_train); ++i)
            {
                double exp_fk = 0;
                double exp_sfi = 0;
                int s_step = (sample_idx->cols > sample_idx->rows) ? 1
                             : sample_idx->step/CV_ELEM_SIZE(sample_idx->type);
                int idx = *(sample_data + subsample_data[i]*s_step);

                for (int j=0; j<class_count; ++j)
                {
                    double res;
                    res = current_data[idx + j*sum_response->cols];
                    res = exp(res);
                    if (j == k) exp_fk = res;
                    exp_sfi += res;
                }
                int orig_label = int(resp_data[idx]);
                int ensemble_label = 0;
                while (class_labels->data.i[ensemble_label] - orig_label)
                    ensemble_label++;

                grad_data[idx] = (float)(!(k-ensemble_label)) -
                                 (float)(exp_fk / exp_sfi);
            }
        }; break;

        default: break;
    }

} // CvGBTrees::find_gradient(...)

//===========================================================================

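// Sets each leaf of the newly fit tree to the loss-optimal constant for the
// subsample elements that fall into it (found by find_optimal_value), then
// updates the running prediction sum for class _k by shrinkage * leaf value.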
void CvGBTrees::change_values(CvDTree* tree, const int _k)
{
    CvDTreeNode** predictions = new pCvDTreeNode[get_len(subsample_train)];

    int* sample_data = sample_idx->data.i;
    int* subsample_data = subsample_train->data.i;
    int s_step = (sample_idx->cols > sample_idx->rows) ? 1
                 : sample_idx->step/CV_ELEM_SIZE(sample_idx->type);

    CvMat x;
    CvMat miss_x;

    for (int i=0; i<get_len(subsample_train); ++i)
    {
        int idx = *(sample_data + subsample_data[i]*s_step);
        if (data->tflag == CV_ROW_SAMPLE)
            cvGetRow( data->train_data, &x, idx);
        else
            cvGetCol( data->train_data, &x, idx);

        if (missing)
        {
            if (data->tflag == CV_ROW_SAMPLE)
                cvGetRow( missing, &miss_x, idx);
            else
                cvGetCol( missing, &miss_x, idx);

            predictions[i] = tree->predict(&x, &miss_x);
        }
        else
            predictions[i] = tree->predict(&x);
    }


    CvDTreeNode** leaves;
    int leaves_count = 0;
    leaves = GetLeaves( tree, leaves_count);

    for (int i=0; i<leaves_count; ++i)
    {
        int samples_in_leaf = 0;
        for (int j=0; j<get_len(subsample_train); ++j)
        {
            if (leaves[i] == predictions[j]) samples_in_leaf++;
        }

        if (!samples_in_leaf) // no subsample reached this leaf; guard anyway
        {
            leaves[i]->value = 0.0;
            continue;
        }

        CvMat* leaf_idx = cvCreateMat(1, samples_in_leaf, CV_32S);
        int* leaf_idx_data = leaf_idx->data.i;

        for (int j=0; j<get_len(subsample_train); ++j)
        {
            int idx = *(sample_data + subsample_data[j]*s_step);
            if (leaves[i] == predictions[j])
                *leaf_idx_data++ = idx;
        }

        float value = find_optimal_value(leaf_idx);
        leaves[i]->value = value;

        leaf_idx_data = leaf_idx->data.i;

        int len = sum_response_tmp->cols;
        for (int j=0; j<get_len(leaf_idx); ++j)
        {
            int idx = leaf_idx_data[j];
            sum_response_tmp->data.fl[idx + _k*len] =
                                    sum_response->data.fl[idx + _k*len] +
                                    params.shrinkage * value;
        }
        leaf_idx_data = 0;
        cvReleaseMat(&leaf_idx);
    }

    // releasing the memory
    for (int i=0; i<get_len(subsample_train); ++i)
    {
        predictions[i] = 0;
    }
    delete[] predictions;

    for (int i=0; i<leaves_count; ++i)
    {
        leaves[i] = 0;
    }
    delete[] leaves;

}

//===========================================================================
/*
// An alternative change_values() implementation that reuses the sample
// indices stored in the training data instead of re-running prediction;
// kept for reference.
void CvGBTrees::change_values(CvDTree* tree, const int _k)
{

    CvDTreeNode** leaves;
    int leaves_count = 0;
    int offset = _k*sum_response_tmp->cols;
    CvMat leaf_idx;
    leaf_idx.rows = 1;

    leaves = GetLeaves( tree, leaves_count);

    for (int i=0; i<leaves_count; ++i)
    {
        int n = leaves[i]->sample_count;
        int* leaf_idx_data = new int[n];
        data->get_sample_indices(leaves[i], leaf_idx_data);
        leaf_idx.cols = n;
        leaf_idx.data.i = leaf_idx_data;

        float value = find_optimal_value(&leaf_idx);
        leaves[i]->value = value;
        float val = params.shrinkage * value;


        for (int j=0; j<n; ++j)
        {
            int idx = leaf_idx_data[j] + offset;
            sum_response_tmp->data.fl[idx] = sum_response->data.fl[idx] + val;
        }
        leaf_idx.data.i = 0;
        delete[] leaf_idx_data;
    }

    // releasing the memory
    for (int i=0; i<leaves_count; ++i)
    {
        leaves[i] = 0;
    }
    delete[] leaves;

}    // change_values(...);
*/
//===========================================================================

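// Computes the loss-optimal constant (the "gamma" of gradient boosting) for
// the samples listed in _Idx. SQUARED_LOSS: mean residual; ABSOLUTE_LOSS:
// median residual; HUBER_LOSS: median plus the averaged clipped deviations;
// DEVIANCE_LOSS: the Newton step (K-1)/K * sum(g) / sum(|g|*(1-|g|)).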
float CvGBTrees::find_optimal_value( const CvMat* _Idx )
{

    double gamma = (double)0.0;

    int* idx = _Idx->data.i;
    float* resp_data = orig_response->data.fl;
    float* cur_data = sum_response->data.fl;
    int n = get_len(_Idx);

    switch (params.loss_function_type)
    // SQUARED_LOSS=0, ABSOLUTE_LOSS=1, HUBER_LOSS=3, DEVIANCE_LOSS=4
    {
    case SQUARED_LOSS:
        {
            for (int i=0; i<n; ++i)
                gamma += resp_data[idx[i]] - cur_data[idx[i]];
            gamma /= (double)n;
        }; break;

    case ABSOLUTE_LOSS:
        {
            float* residuals = new float[n];
            for (int i=0; i<n; ++i, ++idx)
                residuals[i] = (resp_data[*idx] - cur_data[*idx]);
            icvSortFloat(residuals, n, 0.0f);
            if (n % 2)
                gamma = residuals[n/2];
            else gamma = (residuals[n/2-1] + residuals[n/2]) / 2.0f;
            delete[] residuals;
        }; break;

    case HUBER_LOSS:
        {
            float* residuals = new float[n];
            for (int i=0; i<n; ++i, ++idx)
                residuals[i] = (resp_data[*idx] - cur_data[*idx]);
            icvSortFloat(residuals, n, 0.0f);

            int n_half = n >> 1;
            float r_median = (n == n_half<<1) ?
                        (residuals[n_half-1] + residuals[n_half]) / 2.0f :
                        residuals[n_half];

            for (int i=0; i<n; ++i)
            {
                float dif = residuals[i] - r_median;
                gamma += (delta < fabs(dif)) ? Sign(dif)*delta : dif;
            }
            gamma /= (double)n;
            gamma += r_median;
            delete[] residuals;

        }; break;

    case DEVIANCE_LOSS:
        {
            float* grad_data = data->responses->data.fl;
            double tmp1 = 0;
            double tmp2 = 0;
            double tmp  = 0;
            for (int i=0; i<n; ++i)
            {
                tmp = grad_data[idx[i]];
                tmp1 += tmp;
                tmp2 += fabs(tmp)*(1-fabs(tmp));
            };
            if (tmp2 == 0)
            {
                tmp2 = 1;
            }

            gamma = ((double)(class_count-1)) / (double)class_count * (tmp1/tmp2);
        }; break;

    default: break;
    }

    return float(gamma);

} // CvGBTrees::find_optimal_value

//===========================================================================


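// Recursively collects the leaf nodes of a decision tree into a flat array.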
void CvGBTrees::leaves_get( CvDTreeNode** leaves, int& count, CvDTreeNode* node )
{
    if (node->left != NULL)  leaves_get(leaves, count, node->left);
    if (node->right != NULL) leaves_get(leaves, count, node->right);
    if ((node->left == NULL) && (node->right == NULL))
        leaves[count++] = node;
}

//---------------------------------------------------------------------------

CvDTreeNode** CvGBTrees::GetLeaves( const CvDTree* dtree, int& len )
{
    len = 0;
    CvDTreeNode** leaves = new pCvDTreeNode[(size_t)1 << params.max_depth];
    leaves_get(leaves, len, const_cast<pCvDTreeNode>(dtree->get_root()));
    return leaves;
}

//===========================================================================

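// Fills the shared index buffer with a random permutation of 0..n-1: the
// first train_sample_count entries (subsample_train) form the training
// subsample, the remainder (subsample_test) the out-of-bag test subsample.
// When no subsampling is used, the identity permutation is kept.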
void CvGBTrees::do_subsample()
{

    int n = get_len(sample_idx);
    int* idx = subsample_train->data.i;

    for (int i = 0; i < n; i++ )
        idx[i] = i;

    if (subsample_test)
        for (int i = 0; i < n; i++)
        {
            int a = (*rng)(n);
            int b = (*rng)(n);
            int t;
            CV_SWAP( idx[a], idx[b], t );
        }

/*
    int n = get_len(sample_idx);
    if (subsample_train == 0)
        subsample_train = cvCreateMat(1, n, CV_32S);
    int* subsample_data = subsample_train->data.i;
    for (int i=0; i<n; ++i)
        subsample_data[i] = i;
    subsample_test = 0;
*/
}

//===========================================================================

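// Serial (single-threaded) prediction: sums the shrinkage-scaled responses of
// the trees in the given slice for each class, adds base_value, and returns
// the raw sum (regression, or a specific class k >= 0) or the original label
// of the class with the maximal sum. Optionally records each tree's raw
// response in weak_responses.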
float CvGBTrees::predict_serial( const CvMat* _sample, const CvMat* _missing,
        CvMat* weak_responses, CvSlice slice, int k) const
{
    float result = 0.0f;

    if (!weak) return 0.0f;

    CvSeqReader reader;
    int weak_count = cvSliceLength( slice, weak[class_count-1] );
    CvDTree* tree;

    if (weak_responses)
    {
        if (CV_MAT_TYPE(weak_responses->type) != CV_32F)
            return 0.0f;
        if ((k >= 0) && (k<class_count) && (weak_responses->rows != 1))
            return 0.0f;
        if ((k == -1) && (weak_responses->rows != class_count))
            return 0.0f;
        if (weak_responses->cols != weak_count)
            return 0.0f;
    }

    float* sum = new float[class_count];
    memset(sum, 0, class_count*sizeof(float));

    for (int i=0; i<class_count; ++i)
    {
        if ((weak[i]) && (weak_count))
        {
            cvStartReadSeq( weak[i], &reader );
            cvSetSeqReaderPos( &reader, slice.start_index );
            for (int j=0; j<weak_count; ++j)
            {
                CV_READ_SEQ_ELEM( tree, reader );
                float p = (float)(tree->predict(_sample, _missing)->value);
                sum[i] += params.shrinkage * p;
                if (weak_responses)
                    weak_responses->data.fl[i*weak_count+j] = p;
            }
        }
    }

    for (int i=0; i<class_count; ++i)
        sum[i] += base_value;

    if (class_count == 1)
    {
        result = sum[0];
        delete[] sum;
        return result;
    }

    if ((k>=0) && (k<class_count))
    {
        result = sum[k];
        delete[] sum;
        return result;
    }

    float max = sum[0];
    int class_label = 0;
    for (int i=1; i<class_count; ++i)
        if (sum[i] > max)
        {
            max = sum[i];
            class_label = i;
        }

    delete[] sum;

    int orig_class_label = class_labels->data.i[class_label];

    return float(orig_class_label);
}


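// Parallel loop body used by CvGBTrees::predict(): each worker predicts over
// a sub-range of trees and accumulates shrinkage-scaled responses into the
// shared per-class sum array under a mutex.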
class Tree_predictor : public cv::ParallelLoopBody
{
private:
    pCvSeq* weak;
    float* sum;
    const int k;
    const CvMat* sample;
    const CvMat* missing;
    const float shrinkage;

    static cv::Mutex SumMutex;


public:
    Tree_predictor() : weak(0), sum(0), k(0), sample(0), missing(0), shrinkage(1.0f) {}
    Tree_predictor(pCvSeq* _weak, const int _k, const float _shrinkage,
                   const CvMat* _sample, const CvMat* _missing, float* _sum ) :
                   weak(_weak), sum(_sum), k(_k), sample(_sample),
                   missing(_missing), shrinkage(_shrinkage)
    {}

    Tree_predictor( const Tree_predictor& p, cv::Split ) :
            weak(p.weak), sum(p.sum), k(p.k), sample(p.sample),
            missing(p.missing), shrinkage(p.shrinkage)
    {}

    // intentionally a no-op: the const members cannot be reassigned
    Tree_predictor& operator=( const Tree_predictor& )
    { return *this; }

    virtual void operator()(const cv::Range& range) const
    {
        CvSeqReader reader;
        int begin = range.start;
        int end = range.end;

        int weak_count = end - begin;
        CvDTree* tree;

        for (int i=0; i<k; ++i)
        {
            float tmp_sum = 0.0f;
            if ((weak[i]) && (weak_count))
            {
                cvStartReadSeq( weak[i], &reader );
                cvSetSeqReaderPos( &reader, begin );
                for (int j=0; j<weak_count; ++j)
                {
                    CV_READ_SEQ_ELEM( tree, reader );
                    tmp_sum += shrinkage*(float)(tree->predict(sample, missing)->value);
                }
            }

            {
                cv::AutoLock lock(SumMutex);
                sum[i] += tmp_sum;
            }
        }
    } // Tree_predictor::operator()

    virtual ~Tree_predictor() {}

}; // class Tree_predictor

cv::Mutex Tree_predictor::SumMutex;


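// Parallel prediction over the trees of the requested slice; semantics match
// predict_serial() except that per-tree weak responses are not reported.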
float CvGBTrees::predict( const CvMat* _sample, const CvMat* _missing,
            CvMat* /*weak_responses*/, CvSlice slice, int k) const
{
    float result = 0.0f;
    if (!weak) return 0.0f;
    float* sum = new float[class_count];
    for (int i=0; i<class_count; ++i)
        sum[i] = 0.0f;
    int begin = slice.start_index;
    int end = begin + cvSliceLength( slice, weak[0] );

    pCvSeq* weak_seq = weak;
    Tree_predictor predictor = Tree_predictor(weak_seq, class_count,
                                params.shrinkage, _sample, _missing, sum);

    cv::parallel_for_(cv::Range(begin, end), predictor);

    for (int i=0; i<class_count; ++i)
        sum[i] += base_value; // shrinkage is already applied per tree

    if (class_count == 1)
    {
        result = sum[0];
        delete[] sum;
        return result;
    }

    if ((k>=0) && (k<class_count))
    {
        result = sum[k];
        delete[] sum;
        return result;
    }

    float max = sum[0];
    int class_label = 0;
    for (int i=1; i<class_count; ++i)
        if (sum[i] > max)
        {
            max = sum[i];
            class_label = i;
        }

    delete[] sum;
    int orig_class_label = class_labels->data.i[class_label];

    return float(orig_class_label);
}


//===========================================================================

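// Serialization: writes the training parameters (loss function by name,
// ensemble length, shrinkage, subsample portion, class labels) followed by
// the CvDTreeTrainData parameters.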
void CvGBTrees::write_params( CvFileStorage* fs ) const
{
    const char* loss_function_type_str =
        params.loss_function_type == SQUARED_LOSS ? "SquaredLoss" :
        params.loss_function_type == ABSOLUTE_LOSS ? "AbsoluteLoss" :
        params.loss_function_type == HUBER_LOSS ? "HuberLoss" :
        params.loss_function_type == DEVIANCE_LOSS ? "DevianceLoss" : 0;


    if( loss_function_type_str )
        cvWriteString( fs, "loss_function", loss_function_type_str );
    else
        cvWriteInt( fs, "loss_function", params.loss_function_type );

    cvWriteInt( fs, "ensemble_length", params.weak_count );
    cvWriteReal( fs, "shrinkage", params.shrinkage );
    cvWriteReal( fs, "subsample_portion", params.subsample_portion );
    //cvWriteInt( fs, "max_tree_depth", params.max_depth );
    //cvWriteString( fs, "use_surrogate_splits", params.use_surrogates ? "true" : "false");
    if (class_labels) cvWrite( fs, "class_labels", class_labels);

    data->is_classifier = !problem_type();
    data->write_params( fs );
    data->is_classifier = false;
}


//===========================================================================

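// Deserialization counterpart of write_params(): restores the tree-training
// parameters through CvDTreeTrainData, then reads the boosting parameters
// and validates the loss-function value.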
void CvGBTrees::read_params( CvFileStorage* fs, CvFileNode* fnode )
{
    CV_FUNCNAME( "CvGBTrees::read_params" );
    __BEGIN__;


    CvFileNode* temp;

    if( !fnode || !CV_NODE_IS_MAP(fnode->tag) )
        return;

    data = new CvDTreeTrainData();
    CV_CALL( data->read_params(fs, fnode));
    data->shared = true;

    params.max_depth = data->params.max_depth;
    params.min_sample_count = data->params.min_sample_count;
    params.max_categories = data->params.max_categories;
    params.priors = data->params.priors;
    params.regression_accuracy = data->params.regression_accuracy;
    params.use_surrogates = data->params.use_surrogates;

    temp = cvGetFileNodeByName( fs, fnode, "loss_function" );
    if( !temp )
        EXIT;

    if( temp && CV_NODE_IS_STRING(temp->tag) )
    {
        const char* loss_function_type_str = cvReadString( temp, "" );
        params.loss_function_type = strcmp( loss_function_type_str, "SquaredLoss" ) == 0 ? SQUARED_LOSS :
                            strcmp( loss_function_type_str, "AbsoluteLoss" ) == 0 ? ABSOLUTE_LOSS :
                            strcmp( loss_function_type_str, "HuberLoss" ) == 0 ? HUBER_LOSS :
                            strcmp( loss_function_type_str, "DevianceLoss" ) == 0 ? DEVIANCE_LOSS : -1;
    }
    else
        params.loss_function_type = cvReadInt( temp, -1 );


    // 2 is an unused value in the loss-function enum
    if( params.loss_function_type < SQUARED_LOSS || params.loss_function_type > DEVIANCE_LOSS || params.loss_function_type == 2)
        CV_ERROR( CV_StsBadArg, "Unknown loss function" );

    params.weak_count = cvReadIntByName( fs, fnode, "ensemble_length" );
    params.shrinkage = (float)cvReadRealByName( fs, fnode, "shrinkage", 0.1 );
    params.subsample_portion = (float)cvReadRealByName( fs, fnode, "subsample_portion", 1.0 );

    if (data->is_classifier)
    {
        class_labels = (CvMat*)cvReadByName( fs, fnode, "class_labels" );
        if( class_labels && !CV_IS_MAT(class_labels))
            CV_ERROR( CV_StsParseError, "class_labels must be stored as a matrix");
    }
    data->is_classifier = false;

    __END__;
}



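// Writes the whole model: parameters, base_value, class_count, and one
// "trees_<k>" sequence of serialized CvDTree objects per class.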
void CvGBTrees::write( CvFileStorage* fs, const char* name ) const
{
    CV_FUNCNAME( "CvGBTrees::write" );

    __BEGIN__;

    CvSeqReader reader;
    int i;
    std::string s;

    cvStartWriteStruct( fs, name, CV_NODE_MAP, CV_TYPE_NAME_ML_GBT );

    if( !weak )
        CV_ERROR( CV_StsBadArg, "The model has not been trained yet" );

    write_params( fs );
    cvWriteReal( fs, "base_value", base_value);
    cvWriteInt( fs, "class_count", class_count);

    for ( int j=0; j < class_count; ++j )
    {
        s = "trees_";
        s += ToString(j);
        cvStartWriteStruct( fs, s.c_str(), CV_NODE_SEQ );

        cvStartReadSeq( weak[j], &reader );

        for( i = 0; i < weak[j]->total; i++ )
        {
            CvDTree* tree;
            CV_READ_SEQ_ELEM( tree, reader );
            cvStartWriteStruct( fs, 0, CV_NODE_MAP );
            tree->write( fs );
            cvEndWriteStruct( fs );
        }

        cvEndWriteStruct( fs );
    }

    cvEndWriteStruct( fs );

    __END__;
}


//===========================================================================

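// Reads a model previously stored by write(): restores the parameters, then
// rebuilds each per-class tree sequence from its "trees_<k>" node.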
void CvGBTrees::read( CvFileStorage* fs, CvFileNode* node )
{

    CV_FUNCNAME( "CvGBTrees::read" );

    __BEGIN__;

    CvSeqReader reader;
    CvFileNode* trees_fnode;
    CvMemStorage* storage;
    int i, ntrees;
    std::string s;

    clear();
    read_params( fs, node );

    if( !data )
        EXIT;

    base_value = (float)cvReadRealByName( fs, node, "base_value", 0.0 );
    class_count = cvReadIntByName( fs, node, "class_count", 1 );

    weak = new pCvSeq[class_count];


    for (int j=0; j<class_count; ++j)
    {
        s = "trees_";
        s += ToString(j);

        trees_fnode = cvGetFileNodeByName( fs, node, s.c_str() );
        if( !trees_fnode || !CV_NODE_IS_SEQ(trees_fnode->tag) )
            CV_ERROR( CV_StsParseError, "<trees_x> tag is missing" );

        cvStartReadSeq( trees_fnode->data.seq, &reader );
        ntrees = trees_fnode->data.seq->total;

        if( ntrees != params.weak_count )
            CV_ERROR( CV_StsUnmatchedSizes,
            "The number of trees stored does not match <ntrees> tag value" );

        CV_CALL( storage = cvCreateMemStorage() );
        weak[j] = cvCreateSeq( 0, sizeof(CvSeq), sizeof(CvDTree*), storage );

        for( i = 0; i < ntrees; i++ )
        {
            CvDTree* tree = new CvDTree();
            CV_CALL(tree->read( fs, (CvFileNode*)reader.ptr, data ));
            CV_NEXT_SEQ_ELEM( reader.seq->elem_size, reader );
            cvSeqPush( weak[j], &tree );
        }
    }

    __END__;
}

//===========================================================================

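// Parallel loop body used by calc_error(): predicts each sample of the
// evaluated set independently via predict_serial().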
class Sample_predictor : public cv::ParallelLoopBody
{
private:
    const CvGBTrees* gbt;
    float* predictions;
    const CvMat* samples;
    const CvMat* missing;
    const CvMat* idx;
    CvSlice slice;

public:
    Sample_predictor() : gbt(0), predictions(0), samples(0), missing(0),
                         idx(0), slice(CV_WHOLE_SEQ)
    {}

    Sample_predictor(const CvGBTrees* _gbt, float* _predictions,
                   const CvMat* _samples, const CvMat* _missing,
                   const CvMat* _idx, CvSlice _slice=CV_WHOLE_SEQ) :
                   gbt(_gbt), predictions(_predictions), samples(_samples),
                   missing(_missing), idx(_idx), slice(_slice)
    {}


    Sample_predictor( const Sample_predictor& p, cv::Split ) :
            gbt(p.gbt), predictions(p.predictions),
            samples(p.samples), missing(p.missing), idx(p.idx),
            slice(p.slice)
    {}


    virtual void operator()(const cv::Range& range) const
    {
        int begin = range.start;
        int end = range.end;

        CvMat x;
        CvMat miss;

        for (int i=begin; i<end; ++i)
        {
            int j = idx ? idx->data.i[i] : i;
            cvGetRow(samples, &x, j);
            if (!missing)
            {
                predictions[i] = gbt->predict_serial(&x,0,0,slice);
            }
            else
            {
                cvGetRow(missing, &miss, j);
                predictions[i] = gbt->predict_serial(&x,&miss,0,slice);
            }
        }
    } // Sample_predictor::operator()

    virtual ~Sample_predictor() {}

}; // class Sample_predictor



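// Computes the train or test error of the model on the data set held by
// _data: the misclassification percentage for classification, the mean
// squared error for regression.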
// type in {CV_TRAIN_ERROR, CV_TEST_ERROR}
float
CvGBTrees::calc_error( CvMLData* _data, int type, std::vector<float> *resp )
{

    float err = 0.0f;
    const CvMat* _sample_idx = (type == CV_TRAIN_ERROR) ?
                              _data->get_train_sample_idx() :
                              _data->get_test_sample_idx();
    const CvMat* response = _data->get_responses();

    int n = _sample_idx ? get_len(_sample_idx) : 0;
    n = (type == CV_TRAIN_ERROR && n == 0) ? _data->get_values()->rows : n;

    if (!n)
        return -FLT_MAX;

    float* pred_resp = 0;
    bool needsFreeing = false;

    if (resp)
    {
        resp->resize(n);
        pred_resp = &((*resp)[0]);
    }
    else
    {
        pred_resp = new float[n];
        needsFreeing = true;
    }

    Sample_predictor predictor = Sample_predictor(this, pred_resp, _data->get_values(),
            _data->get_missing(), _sample_idx);

    cv::parallel_for_(cv::Range(0,n), predictor);

    int* sidx = _sample_idx ? _sample_idx->data.i : 0;
    int r_step = CV_IS_MAT_CONT(response->type) ?
                1 : response->step / CV_ELEM_SIZE(response->type);


    if ( !problem_type() )
    {
        // classification: percentage of misclassified samples
        for( int i = 0; i < n; i++ )
        {
            int si = sidx ? sidx[i] : i;
            int d = fabs((double)pred_resp[i] - response->data.fl[si*r_step]) <= FLT_EPSILON ? 0 : 1;
            err += d;
        }
        err = err / (float)n * 100.0f;
    }
    else
    {
        // regression: mean squared error
        for( int i = 0; i < n; i++ )
        {
            int si = sidx ? sidx[i] : i;
            float d = pred_resp[i] - response->data.fl[si*r_step];
            err += d*d;
        }
        err = err / (float)n;
    }

    if (needsFreeing)
        delete[] pred_resp;

    return err;
}


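// cv::Mat-based convenience wrappers: they build CvMat headers over the input
// matrices and forward to the CvMat* interface above. A minimal usage sketch
// (hypothetical `values`/`responses` cv::Mat inputs; optional arguments take
// their header defaults):
//
//     CvGBTreesParams p(CvGBTrees::SQUARED_LOSS, 100, 0.1f, 0.8f, 3, false);
//     CvGBTrees gbt;
//     gbt.train(values, CV_ROW_SAMPLE, responses,
//               cv::Mat(), cv::Mat(), cv::Mat(), cv::Mat(), p);
//     float y = gbt.predict(sample);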
CvGBTrees::CvGBTrees( const cv::Mat& trainData, int tflag,
          const cv::Mat& responses, const cv::Mat& varIdx,
          const cv::Mat& sampleIdx, const cv::Mat& varType,
          const cv::Mat& missingDataMask,
          CvGBTreesParams _params )
{
    data = 0;
    weak = 0;
    default_model_name = "my_boost_tree";
    orig_response = sum_response = sum_response_tmp = 0;
    subsample_train = subsample_test = 0;
    missing = sample_idx = 0;
    class_labels = 0;
    class_count = 1;
    delta = 0.0f;

    clear();

    train(trainData, tflag, responses, varIdx, sampleIdx, varType, missingDataMask, _params, false);
}

bool CvGBTrees::train( const cv::Mat& trainData, int tflag,
                   const cv::Mat& responses, const cv::Mat& varIdx,
                   const cv::Mat& sampleIdx, const cv::Mat& varType,
                   const cv::Mat& missingDataMask,
                   CvGBTreesParams _params,
                   bool update )
{
    CvMat _trainData = trainData, _responses = responses;
    CvMat _varIdx = varIdx, _sampleIdx = sampleIdx, _varType = varType;
    CvMat _missingDataMask = missingDataMask;

    return train( &_trainData, tflag, &_responses, varIdx.empty() ? 0 : &_varIdx,
                  sampleIdx.empty() ? 0 : &_sampleIdx, varType.empty() ? 0 : &_varType,
                  missingDataMask.empty() ? 0 : &_missingDataMask, _params, update);
}

float CvGBTrees::predict( const cv::Mat& sample, const cv::Mat& _missing,
                          const cv::Range& slice, int k ) const
{
    CvMat _sample = sample, miss = _missing;
    return predict(&_sample, _missing.empty() ? 0 : &miss, 0,
                   slice==cv::Range::all() ? CV_WHOLE_SEQ : cvSlice(slice.start, slice.end), k);
}