2 #include "precomp.hpp"
\r
8 #define pCvSeq CvSeq*
\r
9 #define pCvDTreeNode CvDTreeNode*
\r
11 #define CV_CMP_FLOAT(a,b) ((a) < (b))
\r
12 static CV_IMPLEMENT_QSORT_EX( icvSortFloat, float, CV_CMP_FLOAT, float)
\r
14 //===========================================================================
\r
// Converts an integer to its decimal string representation
// (used for building the "trees_<k>" storage node names).
static std::string ToString(int i)
{
    std::stringstream tmp;
    tmp << i;
    return tmp.str();
}
24 //===========================================================================
\r
25 //----------------------------- CvGBTreesParams -----------------------------
\r
26 //===========================================================================
\r
28 CvGBTreesParams::CvGBTreesParams()
\r
29 : CvDTreeParams( 3, 10, 0, false, 10, 0, false, false, 0 )
\r
32 loss_function_type = CvGBTrees::SQUARED_LOSS;
\r
33 subsample_portion = 0.8f;
\r
37 //===========================================================================
\r
39 CvGBTreesParams::CvGBTreesParams( int _loss_function_type, int _weak_count,
\r
40 float _shrinkage, float _subsample_portion,
\r
41 int _max_depth, bool _use_surrogates )
\r
42 : CvDTreeParams( 3, 10, 0, false, 10, 0, false, false, 0 )
\r
44 loss_function_type = _loss_function_type;
\r
45 weak_count = _weak_count;
\r
46 shrinkage = _shrinkage;
\r
47 subsample_portion = _subsample_portion;
\r
48 max_depth = _max_depth;
\r
49 use_surrogates = _use_surrogates;
\r
52 //===========================================================================
\r
53 //------------------------------- CvGBTrees ---------------------------------
\r
54 //===========================================================================
\r
56 CvGBTrees::CvGBTrees()
\r
60 default_model_name = "my_boost_tree";
\r
61 orig_response = sum_response = sum_response_tmp = 0;
\r
62 subsample_train = subsample_test = 0;
\r
63 missing = sample_idx = 0;
\r
71 //===========================================================================
\r
73 int CvGBTrees::get_len(const CvMat* mat) const
\r
75 return (mat->cols > mat->rows) ? mat->cols : mat->rows;
\r
78 //===========================================================================
\r
80 void CvGBTrees::clear()
\r
85 CvSlice slice = CV_WHOLE_SEQ;
\r
88 //data->shared = false;
\r
89 for (int i=0; i<class_count; ++i)
\r
91 int weak_count = cvSliceLength( slice, weak[i] );
\r
92 if ((weak[i]) && (weak_count))
\r
94 cvStartReadSeq( weak[i], &reader );
\r
95 cvSetSeqReaderPos( &reader, slice.start_index );
\r
96 for (int j=0; j<weak_count; ++j)
\r
98 CV_READ_SEQ_ELEM( tree, reader );
\r
105 for (int i=0; i<class_count; ++i)
\r
106 if (weak[i]) cvReleaseMemStorage( &(weak[i]->storage) );
\r
111 data->shared = false;
\r
117 cvReleaseMat( &orig_response );
\r
118 cvReleaseMat( &sum_response );
\r
119 cvReleaseMat( &sum_response_tmp );
\r
120 cvReleaseMat( &subsample_train );
\r
121 cvReleaseMat( &subsample_test );
\r
122 cvReleaseMat( &sample_idx );
\r
123 cvReleaseMat( &missing );
\r
124 cvReleaseMat( &class_labels );
\r
127 //===========================================================================
\r
129 CvGBTrees::~CvGBTrees()
\r
134 //===========================================================================
\r
136 CvGBTrees::CvGBTrees( const CvMat* _train_data, int _tflag,
\r
137 const CvMat* _responses, const CvMat* _var_idx,
\r
138 const CvMat* _sample_idx, const CvMat* _var_type,
\r
139 const CvMat* _missing_mask, CvGBTreesParams _params )
\r
143 default_model_name = "my_boost_tree";
\r
144 orig_response = sum_response = sum_response_tmp = 0;
\r
145 subsample_train = subsample_test = 0;
\r
146 missing = sample_idx = 0;
\r
151 train( _train_data, _tflag, _responses, _var_idx, _sample_idx,
\r
152 _var_type, _missing_mask, _params );
\r
155 //===========================================================================
\r
157 bool CvGBTrees::problem_type() const
\r
159 switch (params.loss_function_type)
\r
161 case DEVIANCE_LOSS: return false;
\r
162 default: return true;
\r
166 //===========================================================================
\r
169 CvGBTrees::train( CvMLData* _data, CvGBTreesParams _params, bool update )
\r
172 result = train ( _data->get_values(), CV_ROW_SAMPLE,
\r
173 _data->get_responses(), _data->get_var_idx(),
\r
174 _data->get_train_sample_idx(), _data->get_var_types(),
\r
175 _data->get_missing(), _params, update);
\r
176 //update is not supported
\r
180 //===========================================================================
\r
184 CvGBTrees::train( const CvMat* _train_data, int _tflag,
\r
185 const CvMat* _responses, const CvMat* _var_idx,
\r
186 const CvMat* _sample_idx, const CvMat* _var_type,
\r
187 const CvMat* _missing_mask,
\r
188 CvGBTreesParams _params, bool /*_update*/ ) //update is not supported
\r
190 CvMemStorage* storage = 0;
\r
193 bool is_regression = problem_type();
\r
197 n - count of samples
\r
198 m - count of variables
\r
200 int n = _train_data->rows;
\r
201 int m = _train_data->cols;
\r
202 if (_tflag != CV_ROW_SAMPLE)
\r
208 CvMat* new_responses = cvCreateMat( n, 1, CV_32F);
\r
209 cvZero(new_responses);
\r
211 data = new CvDTreeTrainData( _train_data, _tflag, new_responses, _var_idx,
\r
212 _sample_idx, _var_type, _missing_mask, _params, true, true );
\r
215 missing = cvCreateMat(_missing_mask->rows, _missing_mask->cols,
\r
216 _missing_mask->type);
\r
217 cvCopy( _missing_mask, missing);
\r
220 orig_response = cvCreateMat( 1, n, CV_32F );
\r
221 int step = (_responses->cols > _responses->rows) ? 1 : _responses->step / CV_ELEM_SIZE(_responses->type);
\r
222 switch (CV_MAT_TYPE(_responses->type))
\r
226 for (int i=0; i<n; ++i)
\r
227 orig_response->data.fl[i] = _responses->data.fl[i*step];
\r
231 for (int i=0; i<n; ++i)
\r
232 orig_response->data.fl[i] = (float) _responses->data.i[i*step];
\r
235 CV_Error(CV_StsUnmatchedFormats, "Response should be a 32fC1 or 32sC1 vector.");
\r
238 if (!is_regression)
\r
241 unsigned char * mask = new unsigned char[n];
\r
242 memset(mask, 0, n);
\r
243 // compute the count of different output classes
\r
244 for (int i=0; i<n; ++i)
\r
248 for (int j=i; j<n; ++j)
\r
249 if (int(orig_response->data.fl[j]) == int(orig_response->data.fl[i]))
\r
254 class_labels = cvCreateMat(1, class_count, CV_32S);
\r
255 class_labels->data.i[0] = int(orig_response->data.fl[0]);
\r
257 for (int i=1; i<n; ++i)
\r
260 while ((int(orig_response->data.fl[i]) - class_labels->data.i[k]) && (k<j))
\r
264 class_labels->data.i[k] = int(orig_response->data.fl[i]);
\r
270 // inside gbt learning proccess only regression decision trees are built
\r
271 data->is_classifier = false;
\r
273 // preproccessing sample indices
\r
276 int sample_idx_len = get_len(_sample_idx);
\r
278 switch (CV_MAT_TYPE(_sample_idx->type))
\r
282 sample_idx = cvCreateMat( 1, sample_idx_len, CV_32S );
\r
283 for (int i=0; i<sample_idx_len; ++i)
\r
284 sample_idx->data.i[i] = _sample_idx->data.i[i];
\r
289 int active_samples_count = 0;
\r
290 for (int i=0; i<sample_idx_len; ++i)
\r
291 active_samples_count += int( _sample_idx->data.ptr[i] );
\r
292 sample_idx = cvCreateMat( 1, active_samples_count, CV_32S );
\r
293 active_samples_count = 0;
\r
294 for (int i=0; i<sample_idx_len; ++i)
\r
295 if (int( _sample_idx->data.ptr[i] ))
\r
296 sample_idx->data.i[active_samples_count++] = i;
\r
299 default: CV_Error(CV_StsUnmatchedFormats, "_sample_idx should be a 32sC1, 8sC1 or 8uC1 vector.");
\r
301 icvSortFloat(sample_idx->data.fl, sample_idx_len, 0);
\r
305 sample_idx = cvCreateMat( 1, n, CV_32S );
\r
306 for (int i=0; i<n; ++i)
\r
307 sample_idx->data.i[i] = i;
\r
310 sum_response = cvCreateMat(class_count, n, CV_32F);
\r
311 sum_response_tmp = cvCreateMat(class_count, n, CV_32F);
\r
312 cvZero(sum_response);
\r
316 in the case of a regression problem the initial guess (the zero term
\r
317 in the sum) is set to the mean of all the training responses, that is
\r
318 the best constant model
\r
320 if (is_regression) base_value = find_optimal_value(sample_idx);
\r
322 in the case of a classification problem the initial guess (the zero term
\r
323 in the sum) is set to zero for all the trees sequences
\r
325 else base_value = 0.0f;
\r
327 current predicition on all training samples is set to be
\r
328 equal to the base_value
\r
330 cvSet( sum_response, cvScalar(base_value) );
\r
332 weak = new pCvSeq[class_count];
\r
333 for (int i=0; i<class_count; ++i)
\r
335 storage = cvCreateMemStorage();
\r
336 weak[i] = cvCreateSeq( 0, sizeof(CvSeq), sizeof(CvDTree*), storage );
\r
340 // subsample params and data
\r
341 rng = &cv::theRNG();
\r
343 int samples_count = get_len(sample_idx);
\r
345 params.subsample_portion = params.subsample_portion <= FLT_EPSILON ||
\r
346 1 - params.subsample_portion <= FLT_EPSILON
\r
347 ? 1 : params.subsample_portion;
\r
348 int train_sample_count = cvFloor(params.subsample_portion * samples_count);
\r
349 if (train_sample_count == 0)
\r
350 train_sample_count = samples_count;
\r
351 int test_sample_count = samples_count - train_sample_count;
\r
352 int* idx_data = new int[samples_count];
\r
353 subsample_train = cvCreateMatHeader( 1, train_sample_count, CV_32SC1 );
\r
354 *subsample_train = cvMat( 1, train_sample_count, CV_32SC1, idx_data );
\r
355 if (test_sample_count)
\r
357 subsample_test = cvCreateMatHeader( 1, test_sample_count, CV_32SC1 );
\r
358 *subsample_test = cvMat( 1, test_sample_count, CV_32SC1,
\r
359 idx_data + train_sample_count );
\r
362 // training procedure
\r
364 for ( int i=0; i < params.weak_count; ++i )
\r
367 for ( int k=0; k < class_count; ++k )
\r
370 CvDTree* tree = new CvDTree;
\r
371 tree->train( data, subsample_train );
\r
372 change_values(tree, k);
\r
374 if (subsample_test)
\r
378 int* sample_data = sample_idx->data.i;
\r
379 int* subsample_data = subsample_test->data.i;
\r
380 int s_step = (sample_idx->cols > sample_idx->rows) ? 1
\r
381 : sample_idx->step/CV_ELEM_SIZE(sample_idx->type);
\r
382 for (int j=0; j<get_len(subsample_test); ++j)
\r
384 int idx = *(sample_data + subsample_data[j]*s_step);
\r
386 if (_tflag == CV_ROW_SAMPLE)
\r
387 cvGetRow( data->train_data, &x, idx);
\r
389 cvGetCol( data->train_data, &x, idx);
\r
393 if (_tflag == CV_ROW_SAMPLE)
\r
394 cvGetRow( missing, &x_miss, idx);
\r
396 cvGetCol( missing, &x_miss, idx);
\r
398 res = (float)tree->predict(&x, &x_miss)->value;
\r
402 res = (float)tree->predict(&x)->value;
\r
404 sum_response_tmp->data.fl[idx + k*n] =
\r
405 sum_response->data.fl[idx + k*n] +
\r
406 params.shrinkage * res;
\r
410 cvSeqPush( weak[k], &tree );
\r
412 } // k=0..class_count
\r
414 tmp = sum_response_tmp;
\r
415 sum_response_tmp = sum_response;
\r
416 sum_response = tmp;
\r
418 } // i=0..params.weak_count
\r
421 cvReleaseMat(&new_responses);
\r
422 data->free_train_data();
\r
426 } // CvGBTrees::train(...)
\r
428 //===========================================================================
\r
// Returns the sign of x as a float: -1 for negative, +1 for positive,
// 0 for exactly zero (the zero case was lost in the garbled text).
inline float Sign(float x)
{
    if (x < 0.0f) return -1.0f;
    else if (x > 0.0f) return 1.0f;
    return 0.0f;
}
437 //===========================================================================
\r
439 void CvGBTrees::find_gradient(const int k)
\r
441 int* sample_data = sample_idx->data.i;
\r
442 int* subsample_data = subsample_train->data.i;
\r
443 float* grad_data = data->responses->data.fl;
\r
444 float* resp_data = orig_response->data.fl;
\r
445 float* current_data = sum_response->data.fl;
\r
447 switch (params.loss_function_type)
\r
448 // loss_function_type in
\r
449 // {SQUARED_LOSS, ABSOLUTE_LOSS, HUBER_LOSS, DEVIANCE_LOSS}
\r
453 for (int i=0; i<get_len(subsample_train); ++i)
\r
455 int s_step = (sample_idx->cols > sample_idx->rows) ? 1
\r
456 : sample_idx->step/CV_ELEM_SIZE(sample_idx->type);
\r
457 int idx = *(sample_data + subsample_data[i]*s_step);
\r
458 grad_data[idx] = resp_data[idx] - current_data[idx];
\r
462 case ABSOLUTE_LOSS:
\r
464 for (int i=0; i<get_len(subsample_train); ++i)
\r
466 int s_step = (sample_idx->cols > sample_idx->rows) ? 1
\r
467 : sample_idx->step/CV_ELEM_SIZE(sample_idx->type);
\r
468 int idx = *(sample_data + subsample_data[i]*s_step);
\r
469 grad_data[idx] = Sign(resp_data[idx] - current_data[idx]);
\r
475 float alpha = 0.2f;
\r
476 int n = get_len(subsample_train);
\r
477 int s_step = (sample_idx->cols > sample_idx->rows) ? 1
\r
478 : sample_idx->step/CV_ELEM_SIZE(sample_idx->type);
\r
480 float* residuals = new float[n];
\r
481 for (int i=0; i<n; ++i)
\r
483 int idx = *(sample_data + subsample_data[i]*s_step);
\r
484 residuals[i] = fabs(resp_data[idx] - current_data[idx]);
\r
486 icvSortFloat(residuals, n, 0.0f);
\r
488 delta = residuals[int(ceil(n*alpha))];
\r
490 for (int i=0; i<n; ++i)
\r
492 int idx = *(sample_data + subsample_data[i]*s_step);
\r
493 float r = resp_data[idx] - current_data[idx];
\r
494 grad_data[idx] = (fabs(r) > delta) ? delta*Sign(r) : r;
\r
496 delete[] residuals;
\r
500 case DEVIANCE_LOSS:
\r
502 for (int i=0; i<get_len(subsample_train); ++i)
\r
505 double exp_sfi = 0;
\r
506 int s_step = (sample_idx->cols > sample_idx->rows) ? 1
\r
507 : sample_idx->step/CV_ELEM_SIZE(sample_idx->type);
\r
508 int idx = *(sample_data + subsample_data[i]*s_step);
\r
510 for (int j=0; j<class_count; ++j)
\r
513 res = current_data[idx + j*sum_response->cols];
\r
515 if (j == k) exp_fk = res;
\r
518 int orig_label = int(resp_data[idx]);
\r
520 grad_data[idx] = (float)(!(k-class_labels->data.i[orig_label]+1)) -
\r
521 (float)(exp_fk / exp_sfi);
\r
523 int ensemble_label = 0;
\r
524 while (class_labels->data.i[ensemble_label] - orig_label)
\r
527 grad_data[idx] = (float)(!(k-ensemble_label)) -
\r
528 (float)(exp_fk / exp_sfi);
\r
535 } // CvGBTrees::find_gradient(...)
\r
537 //===========================================================================
\r
// NOTE(review): this earlier implementation of change_values() duplicates
// the signature of the active version below (which ends with the
// "} //change_values(...);" marker used for live functions in this file),
// so it is kept for reference but disabled — confirm against upstream
// history before removing it entirely.
#if 0
void CvGBTrees::change_values(CvDTree* tree, const int _k)
{
    CvDTreeNode** predictions = new pCvDTreeNode[get_len(subsample_train)];

    CvMat x;
    CvMat miss_x;

    int* sample_data = sample_idx->data.i;
    int* subsample_data = subsample_train->data.i;
    int s_step = (sample_idx->cols > sample_idx->rows) ? 1
                 : sample_idx->step/CV_ELEM_SIZE(sample_idx->type);

    // predict every training subsample with the freshly built tree to
    // learn which leaf each sample falls into
    for (int i=0; i<get_len(subsample_train); ++i)
    {
        int idx = *(sample_data + subsample_data[i]*s_step);
        if (data->tflag == CV_ROW_SAMPLE)
            cvGetRow( data->train_data, &x, idx);
        else
            cvGetCol( data->train_data, &x, idx);

        if (missing)
        {
            if (data->tflag == CV_ROW_SAMPLE)
                cvGetRow( missing, &miss_x, idx);
            else
                cvGetCol( missing, &miss_x, idx);

            predictions[i] = tree->predict(&x, &miss_x);
        }
        else
            predictions[i] = tree->predict(&x);
    }

    CvDTreeNode** leaves;
    int leaves_count = 0;
    leaves = GetLeaves( tree, leaves_count);

    for (int i=0; i<leaves_count; ++i)
    {
        int samples_in_leaf = 0;
        for (int j=0; j<get_len(subsample_train); ++j)
            if (leaves[i] == predictions[j]) samples_in_leaf++;

        if (!samples_in_leaf) // It should not be done anyways! but...
        {
            leaves[i]->value = 0.0;
            continue;
        }

        CvMat* leaf_idx = cvCreateMat(1, samples_in_leaf, CV_32S);
        int* leaf_idx_data = leaf_idx->data.i;

        for (int j=0; j<get_len(subsample_train); ++j)
        {
            int idx = *(sample_data + subsample_data[j]*s_step);
            if (leaves[i] == predictions[j])
                *leaf_idx_data++ = idx;
        }

        // replace the leaf value with the loss-optimal constant
        float value = find_optimal_value(leaf_idx);
        leaves[i]->value = value;

        leaf_idx_data = leaf_idx->data.i;

        int len = sum_response_tmp->cols;
        for (int j=0; j<get_len(leaf_idx); ++j)
        {
            int idx = leaf_idx_data[j];
            sum_response_tmp->data.fl[idx + _k*len] =
                sum_response->data.fl[idx + _k*len] +
                params.shrinkage * value;
        }
        cvReleaseMat(&leaf_idx);
    }

    // releasing the memory
    for (int i=0; i<get_len(subsample_train); ++i)
        predictions[i] = 0;
    delete[] predictions;

    for (int i=0; i<leaves_count; ++i)
        leaves[i] = 0;
    delete[] leaves;
}
#endif
633 //===========================================================================
\r
635 void CvGBTrees::change_values(CvDTree* tree, const int _k)
\r
638 CvDTreeNode** leaves;
\r
639 int leaves_count = 0;
\r
640 int offset = _k*sum_response_tmp->cols;
\r
644 leaves = GetLeaves( tree, leaves_count);
\r
646 for (int i=0; i<leaves_count; ++i)
\r
648 int n = leaves[i]->sample_count;
\r
649 int* leaf_idx_data = new int[n];
\r
650 data->get_sample_indices(leaves[i], leaf_idx_data);
\r
651 //CvMat* leaf_idx = new CvMat();
\r
652 //cvInitMatHeader(leaf_idx, n, 1, CV_32S, leaf_idx_data);
\r
654 leaf_idx.data.i = leaf_idx_data;
\r
656 float value = find_optimal_value(&leaf_idx);
\r
657 leaves[i]->value = value;
\r
658 float val = params.shrinkage * value;
\r
661 for (int j=0; j<n; ++j)
\r
663 int idx = leaf_idx_data[j] + offset;
\r
664 sum_response_tmp->data.fl[idx] = sum_response->data.fl[idx] + val;
\r
666 //leaf_idx_data = 0;
\r
667 //cvReleaseMat(&leaf_idx);
\r
668 leaf_idx.data.i = 0;
\r
670 delete[] leaf_idx_data;
\r
673 // releasing the memory
\r
674 for (int i=0; i<leaves_count; ++i)
\r
680 } //change_values(...);
\r
682 //===========================================================================
\r
684 float CvGBTrees::find_optimal_value( const CvMat* _Idx )
\r
687 double gamma = (double)0.0;
\r
689 int* idx = _Idx->data.i;
\r
690 float* resp_data = orig_response->data.fl;
\r
691 float* cur_data = sum_response->data.fl;
\r
692 int n = get_len(_Idx);
\r
694 switch (params.loss_function_type)
\r
695 // SQUARED_LOSS=0, ABSOLUTE_LOSS=1, HUBER_LOSS=3, DEVIANCE_LOSS=4
\r
699 for (int i=0; i<n; ++i)
\r
700 gamma += resp_data[idx[i]] - cur_data[idx[i]];
\r
701 gamma /= (double)n;
\r
704 case ABSOLUTE_LOSS:
\r
706 float* residuals = new float[n];
\r
707 for (int i=0; i<n; ++i, ++idx)
\r
708 residuals[i] = (resp_data[*idx] - cur_data[*idx]);
\r
709 icvSortFloat(residuals, n, 0.0f);
\r
711 gamma = residuals[n/2];
\r
712 else gamma = (residuals[n/2-1] + residuals[n/2]) / 2.0f;
\r
713 delete[] residuals;
\r
718 float* residuals = new float[n];
\r
719 for (int i=0; i<n; ++i, ++idx)
\r
720 residuals[i] = (resp_data[*idx] - cur_data[*idx]);
\r
721 icvSortFloat(residuals, n, 0.0f);
\r
723 int n_half = n >> 1;
\r
724 float r_median = (n == n_half<<1) ?
\r
725 (residuals[n_half-1] + residuals[n_half]) / 2.0f :
\r
728 for (int i=0; i<n; ++i)
\r
730 float dif = residuals[i] - r_median;
\r
731 gamma += (delta < fabs(dif)) ? Sign(dif)*delta : dif;
\r
733 gamma /= (double)n;
\r
735 delete[] residuals;
\r
739 case DEVIANCE_LOSS:
\r
741 float* grad_data = data->responses->data.fl;
\r
745 for (int i=0; i<n; ++i)
\r
747 tmp = grad_data[idx[i]];
\r
749 tmp2 += fabs(tmp)*(1-fabs(tmp));
\r
756 gamma = ((double)(class_count-1)) / (double)class_count * (tmp1/tmp2);
\r
762 return float(gamma);
\r
764 } // CvGBTrees::find_optimal_value
\r
766 //===========================================================================
\r
769 void CvGBTrees::leaves_get( CvDTreeNode** leaves, int& count, CvDTreeNode* node )
\r
771 if (node->left != NULL) leaves_get(leaves, count, node->left);
\r
772 if (node->right != NULL) leaves_get(leaves, count, node->right);
\r
773 if ((node->left == NULL) && (node->right == NULL))
\r
774 leaves[count++] = node;
\r
777 //---------------------------------------------------------------------------
\r
779 CvDTreeNode** CvGBTrees::GetLeaves( const CvDTree* dtree, int& len )
\r
782 CvDTreeNode** leaves = new pCvDTreeNode[(size_t)1 << params.max_depth];
\r
783 leaves_get(leaves, len, const_cast<pCvDTreeNode>(dtree->get_root()));
\r
787 //===========================================================================
\r
789 void CvGBTrees::do_subsample()
\r
792 int n = get_len(sample_idx);
\r
793 int* idx = subsample_train->data.i;
\r
795 for (int i = 0; i < n; i++ )
\r
798 if (subsample_test)
\r
799 for (int i = 0; i < n; i++)
\r
804 CV_SWAP( idx[a], idx[b], t );
\r
808 int n = get_len(sample_idx);
\r
809 if (subsample_train == 0)
\r
810 subsample_train = cvCreateMat(1, n, CV_32S);
\r
811 int* subsample_data = subsample_train->data.i;
\r
812 for (int i=0; i<n; ++i)
\r
813 subsample_data[i] = i;
\r
814 subsample_test = 0;
\r
818 //===========================================================================
\r
820 float CvGBTrees::predict_serial( const CvMat* _sample, const CvMat* _missing,
\r
821 CvMat* weak_responses, CvSlice slice, int k) const
\r
823 float result = 0.0f;
\r
825 if (!weak) return 0.0f;
\r
827 CvSeqReader reader;
\r
828 int weak_count = cvSliceLength( slice, weak[class_count-1] );
\r
831 if (weak_responses)
\r
833 if (CV_MAT_TYPE(weak_responses->type) != CV_32F)
\r
835 if ((k >= 0) && (k<class_count) && (weak_responses->rows != 1))
\r
837 if ((k == -1) && (weak_responses->rows != class_count))
\r
839 if (weak_responses->cols != weak_count)
\r
843 float* sum = new float[class_count];
\r
844 memset(sum, 0, class_count*sizeof(float));
\r
846 for (int i=0; i<class_count; ++i)
\r
848 if ((weak[i]) && (weak_count))
\r
850 cvStartReadSeq( weak[i], &reader );
\r
851 cvSetSeqReaderPos( &reader, slice.start_index );
\r
852 for (int j=0; j<weak_count; ++j)
\r
854 CV_READ_SEQ_ELEM( tree, reader );
\r
855 float p = (float)(tree->predict(_sample, _missing)->value);
\r
856 sum[i] += params.shrinkage * p;
\r
857 if (weak_responses)
\r
858 weak_responses->data.fl[i*weak_count+j] = p;
\r
863 for (int i=0; i<class_count; ++i)
\r
864 sum[i] += base_value;
\r
866 if (class_count == 1)
\r
873 if ((k>=0) && (k<class_count))
\r
880 float max = sum[0];
\r
881 int class_label = 0;
\r
882 for (int i=1; i<class_count; ++i)
\r
892 int orig_class_label = -1;
\r
893 for (int i=0; i<get_len(class_labels); ++i)
\r
894 if (class_labels->data.i[i] == class_label+1)
\r
895 orig_class_label = i;
\r
897 int orig_class_label = class_labels->data.i[class_label];
\r
899 return float(orig_class_label);
\r
903 class Tree_predictor
\r
909 const CvMat* sample;
\r
910 const CvMat* missing;
\r
911 const float shrinkage;
\r
914 static tbb::spin_mutex SumMutex;
\r
919 Tree_predictor() : weak(0), sum(0), k(0), sample(0), missing(0), shrinkage(1.0f) {}
\r
920 Tree_predictor(pCvSeq* _weak, const int _k, const float _shrinkage,
\r
921 const CvMat* _sample, const CvMat* _missing, float* _sum ) :
\r
922 weak(_weak), sum(_sum), k(_k), sample(_sample),
\r
923 missing(_missing), shrinkage(_shrinkage)
\r
926 Tree_predictor( const Tree_predictor& p, cv::Split ) :
\r
927 weak(p.weak), sum(p.sum), k(p.k), sample(p.sample),
\r
928 missing(p.missing), shrinkage(p.shrinkage)
\r
931 Tree_predictor& operator=( const Tree_predictor& )
\r
934 virtual void operator()(const cv::BlockedRange& range) const
\r
937 tbb::spin_mutex::scoped_lock lock;
\r
939 CvSeqReader reader;
\r
940 int begin = range.begin();
\r
941 int end = range.end();
\r
943 int weak_count = end - begin;
\r
946 for (int i=0; i<k; ++i)
\r
948 float tmp_sum = 0.0f;
\r
949 if ((weak[i]) && (weak_count))
\r
951 cvStartReadSeq( weak[i], &reader );
\r
952 cvSetSeqReaderPos( &reader, begin );
\r
953 for (int j=0; j<weak_count; ++j)
\r
955 CV_READ_SEQ_ELEM( tree, reader );
\r
956 tmp_sum += shrinkage*(float)(tree->predict(sample, missing)->value);
\r
960 lock.acquire(SumMutex);
\r
967 } // Tree_predictor::operator()
\r
969 virtual ~Tree_predictor() {}
\r
971 }; // class Tree_predictor
\r
975 tbb::spin_mutex Tree_predictor::SumMutex;
\r
980 float CvGBTrees::predict( const CvMat* _sample, const CvMat* _missing,
\r
981 CvMat* /*weak_responses*/, CvSlice slice, int k) const
\r
983 float result = 0.0f;
\r
984 if (!weak) return 0.0f;
\r
985 float* sum = new float[class_count];
\r
986 for (int i=0; i<class_count; ++i)
\r
988 int begin = slice.start_index;
\r
989 int end = begin + cvSliceLength( slice, weak[0] );
\r
991 pCvSeq* weak_seq = weak;
\r
992 Tree_predictor predictor = Tree_predictor(weak_seq, class_count,
\r
993 params.shrinkage, _sample, _missing, sum);
\r
996 // tbb::parallel_for(cv::BlockedRange(begin, end), predictor,
\r
997 // tbb::auto_partitioner());
\r
999 cv::parallel_for(cv::BlockedRange(begin, end), predictor);
\r
1002 for (int i=0; i<class_count; ++i)
\r
1003 sum[i] = sum[i] /** params.shrinkage*/ + base_value;
\r
1005 if (class_count == 1)
\r
1012 if ((k>=0) && (k<class_count))
\r
1019 float max = sum[0];
\r
1020 int class_label = 0;
\r
1021 for (int i=1; i<class_count; ++i)
\r
1029 int orig_class_label = class_labels->data.i[class_label];
\r
1031 return float(orig_class_label);
\r
1035 //===========================================================================
\r
1037 void CvGBTrees::write_params( CvFileStorage* fs ) const
\r
1039 const char* loss_function_type_str =
\r
1040 params.loss_function_type == SQUARED_LOSS ? "SquaredLoss" :
\r
1041 params.loss_function_type == ABSOLUTE_LOSS ? "AbsoluteLoss" :
\r
1042 params.loss_function_type == HUBER_LOSS ? "HuberLoss" :
\r
1043 params.loss_function_type == DEVIANCE_LOSS ? "DevianceLoss" : 0;
\r
1046 if( loss_function_type_str )
\r
1047 cvWriteString( fs, "loss_function", loss_function_type_str );
\r
1049 cvWriteInt( fs, "loss_function", params.loss_function_type );
\r
1051 cvWriteInt( fs, "ensemble_length", params.weak_count );
\r
1052 cvWriteReal( fs, "shrinkage", params.shrinkage );
\r
1053 cvWriteReal( fs, "subsample_portion", params.subsample_portion );
\r
1054 //cvWriteInt( fs, "max_tree_depth", params.max_depth );
\r
1055 //cvWriteString( fs, "use_surrogate_splits", params.use_surrogates ? "true" : "false");
\r
1056 if (class_labels) cvWrite( fs, "class_labels", class_labels);
\r
1058 data->is_classifier = !problem_type();
\r
1059 data->write_params( fs );
\r
1060 data->is_classifier = 0;
\r
1064 //===========================================================================
\r
1066 void CvGBTrees::read_params( CvFileStorage* fs, CvFileNode* fnode )
\r
1068 CV_FUNCNAME( "CvGBTrees::read_params" );
\r
1074 if( !fnode || !CV_NODE_IS_MAP(fnode->tag) )
\r
1077 data = new CvDTreeTrainData();
\r
1078 CV_CALL( data->read_params(fs, fnode));
\r
1079 data->shared = true;
\r
1081 params.max_depth = data->params.max_depth;
\r
1082 params.min_sample_count = data->params.min_sample_count;
\r
1083 params.max_categories = data->params.max_categories;
\r
1084 params.priors = data->params.priors;
\r
1085 params.regression_accuracy = data->params.regression_accuracy;
\r
1086 params.use_surrogates = data->params.use_surrogates;
\r
1088 temp = cvGetFileNodeByName( fs, fnode, "loss_function" );
\r
1092 if( temp && CV_NODE_IS_STRING(temp->tag) )
\r
1094 const char* loss_function_type_str = cvReadString( temp, "" );
\r
1095 params.loss_function_type = strcmp( loss_function_type_str, "SquaredLoss" ) == 0 ? SQUARED_LOSS :
\r
1096 strcmp( loss_function_type_str, "AbsoluteLoss" ) == 0 ? ABSOLUTE_LOSS :
\r
1097 strcmp( loss_function_type_str, "HuberLoss" ) == 0 ? HUBER_LOSS :
\r
1098 strcmp( loss_function_type_str, "DevianceLoss" ) == 0 ? DEVIANCE_LOSS : -1;
\r
1101 params.loss_function_type = cvReadInt( temp, -1 );
\r
1104 if( params.loss_function_type < SQUARED_LOSS || params.loss_function_type > DEVIANCE_LOSS || params.loss_function_type == 2)
\r
1105 CV_ERROR( CV_StsBadArg, "Unknown loss function" );
\r
1107 params.weak_count = cvReadIntByName( fs, fnode, "ensemble_length" );
\r
1108 params.shrinkage = (float)cvReadRealByName( fs, fnode, "shrinkage", 0.1 );
\r
1109 params.subsample_portion = (float)cvReadRealByName( fs, fnode, "subsample_portion", 1.0 );
\r
1111 if (data->is_classifier)
\r
1113 class_labels = (CvMat*)cvReadByName( fs, fnode, "class_labels" );
\r
1114 if( class_labels && !CV_IS_MAT(class_labels))
\r
1115 CV_ERROR( CV_StsParseError, "class_labels must stored as a matrix");
\r
1117 data->is_classifier = 0;
\r
1125 void CvGBTrees::write( CvFileStorage* fs, const char* name ) const
\r
1127 CV_FUNCNAME( "CvGBTrees::write" );
\r
1131 CvSeqReader reader;
\r
1135 cvStartWriteStruct( fs, name, CV_NODE_MAP, CV_TYPE_NAME_ML_GBT );
\r
1138 CV_ERROR( CV_StsBadArg, "The model has not been trained yet" );
\r
1140 write_params( fs );
\r
1141 cvWriteReal( fs, "base_value", base_value);
\r
1142 cvWriteInt( fs, "class_count", class_count);
\r
1144 for ( int j=0; j < class_count; ++j )
\r
1148 cvStartWriteStruct( fs, s.c_str(), CV_NODE_SEQ );
\r
1150 cvStartReadSeq( weak[j], &reader );
\r
1152 for( i = 0; i < weak[j]->total; i++ )
\r
1155 CV_READ_SEQ_ELEM( tree, reader );
\r
1156 cvStartWriteStruct( fs, 0, CV_NODE_MAP );
\r
1157 tree->write( fs );
\r
1158 cvEndWriteStruct( fs );
\r
1161 cvEndWriteStruct( fs );
\r
1164 cvEndWriteStruct( fs );
\r
1170 //===========================================================================
\r
1173 void CvGBTrees::read( CvFileStorage* fs, CvFileNode* node )
\r
1176 CV_FUNCNAME( "CvGBTrees::read" );
\r
1180 CvSeqReader reader;
\r
1181 CvFileNode* trees_fnode;
\r
1182 CvMemStorage* storage;
\r
1187 read_params( fs, node );
\r
1192 base_value = (float)cvReadRealByName( fs, node, "base_value", 0.0 );
\r
1193 class_count = cvReadIntByName( fs, node, "class_count", 1 );
\r
1195 weak = new pCvSeq[class_count];
\r
1198 for (int j=0; j<class_count; ++j)
\r
1203 trees_fnode = cvGetFileNodeByName( fs, node, s.c_str() );
\r
1204 if( !trees_fnode || !CV_NODE_IS_SEQ(trees_fnode->tag) )
\r
1205 CV_ERROR( CV_StsParseError, "<trees_x> tag is missing" );
\r
1207 cvStartReadSeq( trees_fnode->data.seq, &reader );
\r
1208 ntrees = trees_fnode->data.seq->total;
\r
1210 if( ntrees != params.weak_count )
\r
1211 CV_ERROR( CV_StsUnmatchedSizes,
\r
1212 "The number of trees stored does not match <ntrees> tag value" );
\r
1214 CV_CALL( storage = cvCreateMemStorage() );
\r
1215 weak[j] = cvCreateSeq( 0, sizeof(CvSeq), sizeof(CvDTree*), storage );
\r
1217 for( i = 0; i < ntrees; i++ )
\r
1219 CvDTree* tree = new CvDTree();
\r
1220 CV_CALL(tree->read( fs, (CvFileNode*)reader.ptr, data ));
\r
1221 CV_NEXT_SEQ_ELEM( reader.seq->elem_size, reader );
\r
1222 cvSeqPush( weak[j], &tree );
\r
1229 //===========================================================================
\r
1231 class Sample_predictor
\r
1234 const CvGBTrees* gbt;
\r
1235 float* predictions;
\r
1236 const CvMat* samples;
\r
1237 const CvMat* missing;
\r
1242 Sample_predictor() : gbt(0), predictions(0), samples(0), missing(0),
\r
1243 idx(0), slice(CV_WHOLE_SEQ)
\r
1246 Sample_predictor(const CvGBTrees* _gbt, float* _predictions,
\r
1247 const CvMat* _samples, const CvMat* _missing,
\r
1248 const CvMat* _idx, CvSlice _slice=CV_WHOLE_SEQ) :
\r
1249 gbt(_gbt), predictions(_predictions), samples(_samples),
\r
1250 missing(_missing), idx(_idx), slice(_slice)
\r
1254 Sample_predictor( const Sample_predictor& p, cv::Split ) :
\r
1255 gbt(p.gbt), predictions(p.predictions),
\r
1256 samples(p.samples), missing(p.missing), idx(p.idx),
\r
1261 virtual void operator()(const cv::BlockedRange& range) const
\r
1263 int begin = range.begin();
\r
1264 int end = range.end();
\r
1269 for (int i=begin; i<end; ++i)
\r
1271 int j = idx ? idx->data.i[i] : i;
\r
1272 cvGetRow(samples, &x, j);
\r
1275 predictions[i] = gbt->predict_serial(&x,0,0,slice);
\r
1279 cvGetRow(missing, &miss, j);
\r
1280 predictions[i] = gbt->predict_serial(&x,&miss,0,slice);
\r
1283 } // Sample_predictor::operator()
\r
1285 virtual ~Sample_predictor() {}
\r
1287 }; // class Sample_predictor
\r
1291 // type in {CV_TRAIN_ERROR, CV_TEST_ERROR}
\r
1293 CvGBTrees::calc_error( CvMLData* _data, int type, std::vector<float> *resp )
\r
1297 const CvMat* _sample_idx = (type == CV_TRAIN_ERROR) ?
\r
1298 _data->get_train_sample_idx() :
\r
1299 _data->get_test_sample_idx();
\r
1300 const CvMat* response = _data->get_responses();
\r
1302 int n = _sample_idx ? get_len(_sample_idx) : 0;
\r
1303 n = (type == CV_TRAIN_ERROR && n == 0) ? _data->get_values()->rows : n;
\r
1308 float* pred_resp = 0;
\r
1312 pred_resp = &((*resp)[0]);
\r
1315 pred_resp = new float[n];
\r
1317 Sample_predictor predictor = Sample_predictor(this, pred_resp, _data->get_values(),
\r
1318 _data->get_missing(), _sample_idx);
\r
1321 // tbb::parallel_for(cv::BlockedRange(0,n), predictor, tbb::auto_partitioner());
\r
1323 cv::parallel_for(cv::BlockedRange(0,n), predictor);
\r
1326 int* sidx = _sample_idx ? _sample_idx->data.i : 0;
\r
1327 int r_step = CV_IS_MAT_CONT(response->type) ?
\r
1328 1 : response->step / CV_ELEM_SIZE(response->type);
\r
1331 if ( !problem_type() )
\r
1333 for( int i = 0; i < n; i++ )
\r
1335 int si = sidx ? sidx[i] : i;
\r
1336 int d = fabs((double)pred_resp[i] - response->data.fl[si*r_step]) <= FLT_EPSILON ? 0 : 1;
\r
1339 err = err / (float)n * 100.0f;
\r
1343 for( int i = 0; i < n; i++ )
\r
1345 int si = sidx ? sidx[i] : i;
\r
1346 float d = pred_resp[i] - response->data.fl[si*r_step];
\r
1349 err = err / (float)n;
\r
1356 CvGBTrees::CvGBTrees( const cv::Mat& trainData, int tflag,
\r
1357 const cv::Mat& responses, const cv::Mat& varIdx,
\r
1358 const cv::Mat& sampleIdx, const cv::Mat& varType,
\r
1359 const cv::Mat& missingDataMask,
\r
1360 CvGBTreesParams _params )
\r
1364 default_model_name = "my_boost_tree";
\r
1365 orig_response = sum_response = sum_response_tmp = 0;
\r
1366 subsample_train = subsample_test = 0;
\r
1367 missing = sample_idx = 0;
\r
1374 train(trainData, tflag, responses, varIdx, sampleIdx, varType, missingDataMask, _params, false);
\r
1377 bool CvGBTrees::train( const cv::Mat& trainData, int tflag,
\r
1378 const cv::Mat& responses, const cv::Mat& varIdx,
\r
1379 const cv::Mat& sampleIdx, const cv::Mat& varType,
\r
1380 const cv::Mat& missingDataMask,
\r
1381 CvGBTreesParams _params,
\r
1384 CvMat _trainData = trainData, _responses = responses;
\r
1385 CvMat _varIdx = varIdx, _sampleIdx = sampleIdx, _varType = varType;
\r
1386 CvMat _missingDataMask = missingDataMask;
\r
1388 return train( &_trainData, tflag, &_responses, varIdx.empty() ? 0 : &_varIdx,
\r
1389 sampleIdx.empty() ? 0 : &_sampleIdx, varType.empty() ? 0 : &_varType,
\r
1390 missingDataMask.empty() ? 0 : &_missingDataMask, _params, update);
\r
1393 float CvGBTrees::predict( const cv::Mat& sample, const cv::Mat& _missing,
\r
1394 const cv::Range& slice, int k ) const
\r
1396 CvMat _sample = sample, miss = _missing;
\r
1397 return predict(&_sample, _missing.empty() ? 0 : &miss, 0,
\r
1398 slice==cv::Range::all() ? CV_WHOLE_SEQ : cvSlice(slice.start, slice.end), k);
\r