1 /*M///////////////////////////////////////////////////////////////////////////////////////
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
10 // Intel License Agreement
12 // Copyright (C) 2000, Intel Corporation, all rights reserved.
13 // Third party copyrights are property of their respective owners.
15 // Redistribution and use in source and binary forms, with or without modification,
16 // are permitted provided that the following conditions are met:
18 // * Redistribution's of source code must retain the above copyright notice,
19 // this list of conditions and the following disclaimer.
21 // * Redistribution's in binary form must reproduce the above copyright notice,
22 // this list of conditions and the following disclaimer in the documentation
23 // and/or other materials provided with the distribution.
25 // * The name of Intel Corporation may not be used to endorse or promote products
26 // derived from this software without specific prior written permission.
28 // This software is provided by the copyright holders and contributors "as is" and
29 // any express or implied warranties, including, but not limited to, the implied
30 // warranties of merchantability and fitness for a particular purpose are disclaimed.
31 // In no event shall the Intel Corporation or contributors be liable for any direct,
32 // indirect, incidental, special, exemplary, or consequential damages
33 // (including, but not limited to, procurement of substitute goods or services;
34 // loss of use, data, or profits; or business interruption) however caused
35 // and on any theory of liability, whether in contract, strict liability,
36 // or tort (including negligence or otherwise) arising in any way out of
37 // the use of this software, even if advised of the possibility of such damage.
41 #include "precomp.hpp"
44 log_ratio( double val )
46 const double eps = 1e-5;
48 val = MAX( val, eps );
49 val = MIN( val, 1. - eps );
50 return log( val/(1. - val) );
54 CvBoostParams::CvBoostParams()
56 boost_type = CvBoost::REAL;
58 weight_trim_rate = 0.95;
64 CvBoostParams::CvBoostParams( int _boost_type, int _weak_count,
65 double _weight_trim_rate, int _max_depth,
66 bool _use_surrogates, const float* _priors )
68 boost_type = _boost_type;
69 weak_count = _weak_count;
70 weight_trim_rate = _weight_trim_rate;
71 split_criteria = CvBoost::DEFAULT;
73 max_depth = _max_depth;
74 use_surrogates = _use_surrogates;
80 ///////////////////////////////// CvBoostTree ///////////////////////////////////
82 CvBoostTree::CvBoostTree()
88 CvBoostTree::~CvBoostTree()
103 CvBoostTree::train( CvDTreeTrainData* _train_data,
104 const CvMat* _subsample_idx, CvBoost* _ensemble )
107 ensemble = _ensemble;
110 return do_train( _subsample_idx );
115 CvBoostTree::train( const CvMat*, int, const CvMat*, const CvMat*,
116 const CvMat*, const CvMat*, const CvMat*, CvDTreeParams )
124 CvBoostTree::train( CvDTreeTrainData*, const CvMat* )
132 CvBoostTree::scale( double scale )
134 CvDTreeNode* node = root;
136 // traverse the tree and scale all the node values
142 node->value *= scale;
148 for( parent = node->parent; parent && parent->right == node;
149 node = parent, parent = parent->parent )
155 node = parent->right;
161 CvBoostTree::try_split_node( CvDTreeNode* node )
163 CvDTree::try_split_node( node );
167 // if the node has not been split,
168 // store the responses for the corresponding training samples
169 double* weak_eval = ensemble->get_weak_response()->data.db;
170 cv::AutoBuffer<int> inn_buf(node->sample_count);
171 const int* labels = data->get_cv_labels( node, (int*)inn_buf );
172 int i, count = node->sample_count;
173 double value = node->value;
175 for( i = 0; i < count; i++ )
176 weak_eval[labels[i]] = value;
182 CvBoostTree::calc_node_dir( CvDTreeNode* node )
184 char* dir = (char*)data->direction->data.ptr;
185 const double* weights = ensemble->get_subtree_weights()->data.db;
186 int i, n = node->sample_count, vi = node->split->var_idx;
189 assert( !node->split->inversed );
191 if( data->get_var_type(vi) >= 0 ) // split on categorical var
193 cv::AutoBuffer<int> inn_buf(n);
194 const int* cat_labels = data->get_cat_var_data( node, vi, (int*)inn_buf );
195 const int* subset = node->split->subset;
196 double sum = 0, sum_abs = 0;
198 for( i = 0; i < n; i++ )
200 int idx = ((cat_labels[i] == 65535) && data->is_buf_16u) ? -1 : cat_labels[i];
201 double w = weights[i];
202 int d = idx >= 0 ? CV_DTREE_CAT_DIR(idx,subset) : 0;
203 sum += d*w; sum_abs += (d & 1)*w;
207 R = (sum_abs + sum) * 0.5;
208 L = (sum_abs - sum) * 0.5;
210 else // split on ordered var
212 cv::AutoBuffer<uchar> inn_buf(2*n*sizeof(int)+n*sizeof(float));
213 float* values_buf = (float*)(uchar*)inn_buf;
214 int* sorted_indices_buf = (int*)(values_buf + n);
215 int* sample_indices_buf = sorted_indices_buf + n;
216 const float* values = 0;
217 const int* sorted_indices = 0;
218 data->get_ord_var_data( node, vi, values_buf, sorted_indices_buf, &values, &sorted_indices, sample_indices_buf );
219 int split_point = node->split->ord.split_point;
220 int n1 = node->get_num_valid(vi);
222 assert( 0 <= split_point && split_point < n1-1 );
225 for( i = 0; i <= split_point; i++ )
227 int idx = sorted_indices[i];
228 double w = weights[idx];
235 int idx = sorted_indices[i];
236 double w = weights[idx];
242 dir[sorted_indices[i]] = (char)0;
245 node->maxlr = MAX( L, R );
246 return node->split->quality/(L + R);
251 CvBoostTree::find_split_ord_class( CvDTreeNode* node, int vi, float init_quality,
252 CvDTreeSplit* _split, uchar* _ext_buf )
254 const float epsilon = FLT_EPSILON*2;
256 const double* weights = ensemble->get_subtree_weights()->data.db;
257 int n = node->sample_count;
258 int n1 = node->get_num_valid(vi);
260 cv::AutoBuffer<uchar> inn_buf;
262 inn_buf.allocate(n*(3*sizeof(int)+sizeof(float)));
263 uchar* ext_buf = _ext_buf ? _ext_buf : (uchar*)inn_buf;
264 float* values_buf = (float*)ext_buf;
265 int* sorted_indices_buf = (int*)(values_buf + n);
266 int* sample_indices_buf = sorted_indices_buf + n;
267 const float* values = 0;
268 const int* sorted_indices = 0;
269 data->get_ord_var_data( node, vi, values_buf, sorted_indices_buf, &values, &sorted_indices, sample_indices_buf );
270 int* responses_buf = sorted_indices_buf + n;
271 const int* responses = data->get_class_labels( node, responses_buf );
272 const double* rcw0 = weights + n;
273 double lcw[2] = {0,0}, rcw[2];
275 double best_val = init_quality;
276 int boost_type = ensemble->get_params().boost_type;
277 int split_criteria = ensemble->get_params().split_criteria;
279 rcw[0] = rcw0[0]; rcw[1] = rcw0[1];
280 for( i = n1; i < n; i++ )
282 int idx = sorted_indices[i];
283 double w = weights[idx];
284 rcw[responses[idx]] -= w;
287 if( split_criteria != CvBoost::GINI && split_criteria != CvBoost::MISCLASS )
288 split_criteria = boost_type == CvBoost::DISCRETE ? CvBoost::MISCLASS : CvBoost::GINI;
290 if( split_criteria == CvBoost::GINI )
292 double L = 0, R = rcw[0] + rcw[1];
293 double lsum2 = 0, rsum2 = rcw[0]*rcw[0] + rcw[1]*rcw[1];
295 for( i = 0; i < n1 - 1; i++ )
297 int idx = sorted_indices[i];
298 double w = weights[idx], w2 = w*w;
300 idx = responses[idx];
302 lv = lcw[idx]; rv = rcw[idx];
303 lsum2 += 2*lv*w + w2;
304 rsum2 -= 2*rv*w - w2;
305 lcw[idx] = lv + w; rcw[idx] = rv - w;
307 if( values[i] + epsilon < values[i+1] )
309 double val = (lsum2*R + rsum2*L)/(L*R);
320 for( i = 0; i < n1 - 1; i++ )
322 int idx = sorted_indices[i];
323 double w = weights[idx];
324 idx = responses[idx];
328 if( values[i] + epsilon < values[i+1] )
330 double val = lcw[0] + rcw[1], val2 = lcw[1] + rcw[0];
331 val = MAX(val, val2);
341 CvDTreeSplit* split = 0;
344 split = _split ? _split : data->new_split_ord( 0, 0.0f, 0, 0, 0.0f );
346 split->ord.c = (values[best_i] + values[best_i+1])*0.5f;
347 split->ord.split_point = best_i;
349 split->quality = (float)best_val;
355 #define CV_CMP_NUM_PTR(a,b) (*(a) < *(b))
356 static CV_IMPLEMENT_QSORT_EX( icvSortDblPtr, double*, CV_CMP_NUM_PTR, int )
359 CvBoostTree::find_split_cat_class( CvDTreeNode* node, int vi, float init_quality, CvDTreeSplit* _split, uchar* _ext_buf )
361 int ci = data->get_var_type(vi);
362 int n = node->sample_count;
363 int mi = data->cat_count->data.i[ci];
365 int base_size = (2*mi+3)*sizeof(double) + mi*sizeof(double*);
366 cv::AutoBuffer<uchar> inn_buf((2*mi+3)*sizeof(double) + mi*sizeof(double*));
368 inn_buf.allocate( base_size + 2*n*sizeof(int) );
369 uchar* base_buf = (uchar*)inn_buf;
370 uchar* ext_buf = _ext_buf ? _ext_buf : base_buf + base_size;
372 int* cat_labels_buf = (int*)ext_buf;
373 const int* cat_labels = data->get_cat_var_data(node, vi, cat_labels_buf);
374 int* responses_buf = cat_labels_buf + n;
375 const int* responses = data->get_class_labels(node, responses_buf);
376 double lcw[2]={0,0}, rcw[2]={0,0};
378 double* cjk = (double*)cv::alignPtr(base_buf,sizeof(double))+2;
379 const double* weights = ensemble->get_subtree_weights()->data.db;
380 double** dbl_ptr = (double**)(cjk + 2*mi);
383 double best_val = init_quality;
384 int best_subset = -1, subset_i;
385 int boost_type = ensemble->get_params().boost_type;
386 int split_criteria = ensemble->get_params().split_criteria;
388 // init array of counters:
389 // c_{jk} - number of samples that have vi-th input variable = j and response = k.
390 for( j = -1; j < mi; j++ )
391 cjk[j*2] = cjk[j*2+1] = 0;
393 for( i = 0; i < n; i++ )
395 double w = weights[i];
396 j = ((cat_labels[i] == 65535) && data->is_buf_16u) ? -1 : cat_labels[i];
401 for( j = 0; j < mi; j++ )
404 rcw[1] += cjk[j*2+1];
405 dbl_ptr[j] = cjk + j*2 + 1;
410 if( split_criteria != CvBoost::GINI && split_criteria != CvBoost::MISCLASS )
411 split_criteria = boost_type == CvBoost::DISCRETE ? CvBoost::MISCLASS : CvBoost::GINI;
413 // sort rows of c_jk by increasing c_j,1
414 // (i.e. by the weight of samples in j-th category that belong to class 1)
415 icvSortDblPtr( dbl_ptr, mi, 0 );
417 for( subset_i = 0; subset_i < mi-1; subset_i++ )
419 idx = (int)(dbl_ptr[subset_i] - cjk)/2;
420 const double* crow = cjk + idx*2;
421 double w0 = crow[0], w1 = crow[1];
422 double weight = w0 + w1;
424 if( weight < FLT_EPSILON )
427 lcw[0] += w0; rcw[0] -= w0;
428 lcw[1] += w1; rcw[1] -= w1;
430 if( split_criteria == CvBoost::GINI )
432 double lsum2 = lcw[0]*lcw[0] + lcw[1]*lcw[1];
433 double rsum2 = rcw[0]*rcw[0] + rcw[1]*rcw[1];
438 if( L > FLT_EPSILON && R > FLT_EPSILON )
440 double val = (lsum2*R + rsum2*L)/(L*R);
444 best_subset = subset_i;
450 double val = lcw[0] + rcw[1];
451 double val2 = lcw[1] + rcw[0];
453 val = MAX(val, val2);
457 best_subset = subset_i;
462 CvDTreeSplit* split = 0;
463 if( best_subset >= 0 )
465 split = _split ? _split : data->new_split_cat( 0, -1.0f);
467 split->quality = (float)best_val;
468 memset( split->subset, 0, (data->max_c_count + 31)/32 * sizeof(int));
469 for( i = 0; i <= best_subset; i++ )
471 idx = (int)(dbl_ptr[i] - cjk) >> 1;
472 split->subset[idx >> 5] |= 1 << (idx & 31);
480 CvBoostTree::find_split_ord_reg( CvDTreeNode* node, int vi, float init_quality, CvDTreeSplit* _split, uchar* _ext_buf )
482 const float epsilon = FLT_EPSILON*2;
483 const double* weights = ensemble->get_subtree_weights()->data.db;
484 int n = node->sample_count;
485 int n1 = node->get_num_valid(vi);
487 cv::AutoBuffer<uchar> inn_buf;
489 inn_buf.allocate(2*n*(sizeof(int)+sizeof(float)));
490 uchar* ext_buf = _ext_buf ? _ext_buf : (uchar*)inn_buf;
492 float* values_buf = (float*)ext_buf;
493 int* indices_buf = (int*)(values_buf + n);
494 int* sample_indices_buf = indices_buf + n;
495 const float* values = 0;
496 const int* indices = 0;
497 data->get_ord_var_data( node, vi, values_buf, indices_buf, &values, &indices, sample_indices_buf );
498 float* responses_buf = (float*)(indices_buf + n);
499 const float* responses = data->get_ord_responses( node, responses_buf, sample_indices_buf );
502 double L = 0, R = weights[n];
503 double best_val = init_quality, lsum = 0, rsum = node->value*R;
505 // compensate for missing values
506 for( i = n1; i < n; i++ )
508 int idx = indices[i];
509 double w = weights[idx];
510 rsum -= responses[idx]*w;
514 // find the optimal split
515 for( i = 0; i < n1 - 1; i++ )
517 int idx = indices[i];
518 double w = weights[idx];
519 double t = responses[idx]*w;
521 lsum += t; rsum -= t;
523 if( values[i] + epsilon < values[i+1] )
525 double val = (lsum*lsum*R + rsum*rsum*L)/(L*R);
534 CvDTreeSplit* split = 0;
537 split = _split ? _split : data->new_split_ord( 0, 0.0f, 0, 0, 0.0f );
539 split->ord.c = (values[best_i] + values[best_i+1])*0.5f;
540 split->ord.split_point = best_i;
542 split->quality = (float)best_val;
549 CvBoostTree::find_split_cat_reg( CvDTreeNode* node, int vi, float init_quality, CvDTreeSplit* _split, uchar* _ext_buf )
551 const double* weights = ensemble->get_subtree_weights()->data.db;
552 int ci = data->get_var_type(vi);
553 int n = node->sample_count;
554 int mi = data->cat_count->data.i[ci];
555 int base_size = (2*mi+3)*sizeof(double) + mi*sizeof(double*);
556 cv::AutoBuffer<uchar> inn_buf(base_size);
558 inn_buf.allocate(base_size + n*(2*sizeof(int) + sizeof(float)));
559 uchar* base_buf = (uchar*)inn_buf;
560 uchar* ext_buf = _ext_buf ? _ext_buf : base_buf + base_size;
562 int* cat_labels_buf = (int*)ext_buf;
563 const int* cat_labels = data->get_cat_var_data(node, vi, cat_labels_buf);
564 float* responses_buf = (float*)(cat_labels_buf + n);
565 int* sample_indices_buf = (int*)(responses_buf + n);
566 const float* responses = data->get_ord_responses(node, responses_buf, sample_indices_buf);
568 double* sum = (double*)cv::alignPtr(base_buf,sizeof(double)) + 1;
569 double* counts = sum + mi + 1;
570 double** sum_ptr = (double**)(counts + mi);
571 double L = 0, R = 0, best_val = init_quality, lsum = 0, rsum = 0;
572 int i, best_subset = -1, subset_i;
574 for( i = -1; i < mi; i++ )
575 sum[i] = counts[i] = 0;
577 // calculate sum response and weight of each category of the input var
578 for( i = 0; i < n; i++ )
580 int idx = ((cat_labels[i] == 65535) && data->is_buf_16u) ? -1 : cat_labels[i];
581 double w = weights[i];
582 double s = sum[idx] + responses[i]*w;
583 double nc = counts[idx] + w;
588 // calculate average response in each category
589 for( i = 0; i < mi; i++ )
593 sum[i] = fabs(counts[i]) > DBL_EPSILON ? sum[i]/counts[i] : 0;
594 sum_ptr[i] = sum + i;
597 icvSortDblPtr( sum_ptr, mi, 0 );
599 // revert back to unnormalized sums
600 // (there should be a very little loss in accuracy)
601 for( i = 0; i < mi; i++ )
604 for( subset_i = 0; subset_i < mi-1; subset_i++ )
606 int idx = (int)(sum_ptr[subset_i] - sum);
607 double ni = counts[idx];
609 if( ni > FLT_EPSILON )
615 if( L > FLT_EPSILON && R > FLT_EPSILON )
617 double val = (lsum*lsum*R + rsum*rsum*L)/(L*R);
621 best_subset = subset_i;
627 CvDTreeSplit* split = 0;
628 if( best_subset >= 0 )
630 split = _split ? _split : data->new_split_cat( 0, -1.0f);
632 split->quality = (float)best_val;
633 memset( split->subset, 0, (data->max_c_count + 31)/32 * sizeof(int));
634 for( i = 0; i <= best_subset; i++ )
636 int idx = (int)(sum_ptr[i] - sum);
637 split->subset[idx >> 5] |= 1 << (idx & 31);
645 CvBoostTree::find_surrogate_split_ord( CvDTreeNode* node, int vi, uchar* _ext_buf )
647 const float epsilon = FLT_EPSILON*2;
648 int n = node->sample_count;
649 cv::AutoBuffer<uchar> inn_buf;
651 inn_buf.allocate(n*(2*sizeof(int)+sizeof(float)));
652 uchar* ext_buf = _ext_buf ? _ext_buf : (uchar*)inn_buf;
653 float* values_buf = (float*)ext_buf;
654 int* indices_buf = (int*)(values_buf + n);
655 int* sample_indices_buf = indices_buf + n;
656 const float* values = 0;
657 const int* indices = 0;
658 data->get_ord_var_data( node, vi, values_buf, indices_buf, &values, &indices, sample_indices_buf );
660 const double* weights = ensemble->get_subtree_weights()->data.db;
661 const char* dir = (char*)data->direction->data.ptr;
662 int n1 = node->get_num_valid(vi);
663 // LL - number of samples that both the primary and the surrogate splits send to the left
664 // LR - ... primary split sends to the left and the surrogate split sends to the right
665 // RL - ... primary split sends to the right and the surrogate split sends to the left
666 // RR - ... both send to the right
667 int i, best_i = -1, best_inversed = 0;
669 double LL = 0, RL = 0, LR, RR;
670 double worst_val = node->maxlr;
671 double sum = 0, sum_abs = 0;
672 best_val = worst_val;
674 for( i = 0; i < n1; i++ )
676 int idx = indices[i];
677 double w = weights[idx];
679 sum += d*w; sum_abs += (d & 1)*w;
682 // sum_abs = R + L; sum = R - L
683 RR = (sum_abs + sum)*0.5;
684 LR = (sum_abs - sum)*0.5;
686 // initially all the samples are sent to the right by the surrogate split,
687 // LR of them are sent to the left by primary split, and RR - to the right.
688 // now iteratively compute LL, LR, RL and RR for every possible surrogate split value.
689 for( i = 0; i < n1 - 1; i++ )
691 int idx = indices[i];
692 double w = weights[idx];
698 if( LL + RR > best_val && values[i] + epsilon < values[i+1] )
701 best_i = i; best_inversed = 0;
707 if( RL + LR > best_val && values[i] + epsilon < values[i+1] )
710 best_i = i; best_inversed = 1;
715 return best_i >= 0 && best_val > node->maxlr ? data->new_split_ord( vi,
716 (values[best_i] + values[best_i+1])*0.5f, best_i,
717 best_inversed, (float)best_val ) : 0;
722 CvBoostTree::find_surrogate_split_cat( CvDTreeNode* node, int vi, uchar* _ext_buf )
724 const char* dir = (char*)data->direction->data.ptr;
725 const double* weights = ensemble->get_subtree_weights()->data.db;
726 int n = node->sample_count;
727 int i, mi = data->cat_count->data.i[data->get_var_type(vi)];
729 int base_size = (2*mi+3)*sizeof(double);
730 cv::AutoBuffer<uchar> inn_buf(base_size);
732 inn_buf.allocate(base_size + n*sizeof(int));
733 uchar* ext_buf = _ext_buf ? _ext_buf : (uchar*)inn_buf;
734 int* cat_labels_buf = (int*)ext_buf;
735 const int* cat_labels = data->get_cat_var_data(node, vi, cat_labels_buf);
737 // LL - number of samples that both the primary and the surrogate splits send to the left
738 // LR - ... primary split sends to the left and the surrogate split sends to the right
739 // RL - ... primary split sends to the right and the surrogate split sends to the left
740 // RR - ... both send to the right
741 CvDTreeSplit* split = data->new_split_cat( vi, 0 );
743 double* lc = (double*)cv::alignPtr(cat_labels_buf + n, sizeof(double)) + 1;
744 double* rc = lc + mi + 1;
746 for( i = -1; i < mi; i++ )
749 // 1. for each category calculate the weight of samples
750 // sent to the left (lc) and to the right (rc) by the primary split
751 for( i = 0; i < n; i++ )
753 int idx = ((cat_labels[i] == 65535) && data->is_buf_16u) ? -1 : cat_labels[i];
754 double w = weights[i];
756 double sum = lc[idx] + d*w;
757 double sum_abs = rc[idx] + (d & 1)*w;
758 lc[idx] = sum; rc[idx] = sum_abs;
761 for( i = 0; i < mi; i++ )
764 double sum_abs = rc[i];
765 lc[i] = (sum_abs - sum) * 0.5;
766 rc[i] = (sum_abs + sum) * 0.5;
769 // 2. now form the split.
770 // in each category send all the samples to the same direction as majority
771 for( i = 0; i < mi; i++ )
773 double lval = lc[i], rval = rc[i];
776 split->subset[i >> 5] |= 1 << (i & 31);
783 split->quality = (float)best_val;
784 if( split->quality <= node->maxlr )
785 cvSetRemoveByPtr( data->split_heap, split ), split = 0;
792 CvBoostTree::calc_node_value( CvDTreeNode* node )
794 int i, n = node->sample_count;
795 const double* weights = ensemble->get_weights()->data.db;
796 cv::AutoBuffer<uchar> inn_buf(n*(sizeof(int) + ( data->is_classifier ? sizeof(int) : sizeof(int) + sizeof(float))));
797 int* labels_buf = (int*)(uchar*)inn_buf;
798 const int* labels = data->get_cv_labels(node, labels_buf);
799 double* subtree_weights = ensemble->get_subtree_weights()->data.db;
800 double rcw[2] = {0,0};
801 int boost_type = ensemble->get_params().boost_type;
803 if( data->is_classifier )
805 int* _responses_buf = labels_buf + n;
806 const int* _responses = data->get_class_labels(node, _responses_buf);
807 int m = data->get_num_classes();
808 int* cls_count = data->counts->data.i;
809 for( int k = 0; k < m; k++ )
812 for( i = 0; i < n; i++ )
815 double w = weights[idx];
816 int r = _responses[i];
819 subtree_weights[i] = w;
822 node->class_idx = rcw[1] > rcw[0];
824 if( boost_type == CvBoost::DISCRETE )
826 // ignore cat_map for responses, and use {-1,1},
827 // as the whole ensemble response is computes as sign(sum_i(weak_response_i)
828 node->value = node->class_idx*2 - 1;
832 double p = rcw[1]/(rcw[0] + rcw[1]);
833 assert( boost_type == CvBoost::REAL );
835 // store log-ratio of the probability
836 node->value = 0.5*log_ratio(p);
841 // in case of regression tree:
842 // * node value is 1/n*sum_i(Y_i), where Y_i is i-th response,
843 // n is the number of samples in the node.
844 // * node risk is the sum of squared errors: sum_i((Y_i - <node_value>)^2)
845 double sum = 0, sum2 = 0, iw;
846 float* values_buf = (float*)(labels_buf + n);
847 int* sample_indices_buf = (int*)(values_buf + n);
848 const float* values = data->get_ord_responses(node, values_buf, sample_indices_buf);
850 for( i = 0; i < n; i++ )
853 double w = weights[idx]/*priors[values[i] > 0]*/;
854 double t = values[i];
856 subtree_weights[i] = w;
862 node->value = sum*iw;
863 node->node_risk = sum2 - (sum*iw)*sum;
865 // renormalize the risk, as in try_split_node the unweighted formula
866 // sqrt(risk)/n is used, rather than sqrt(risk)/sum(weights_i)
867 node->node_risk *= n*iw*n*iw;
870 // store summary weights
871 subtree_weights[n] = rcw[0];
872 subtree_weights[n+1] = rcw[1];
876 void CvBoostTree::read( CvFileStorage* fs, CvFileNode* fnode, CvBoost* _ensemble, CvDTreeTrainData* _data )
878 CvDTree::read( fs, fnode, _data );
879 ensemble = _ensemble;
883 void CvBoostTree::read( CvFileStorage*, CvFileNode* )
888 void CvBoostTree::read( CvFileStorage* _fs, CvFileNode* _node,
889 CvDTreeTrainData* _data )
891 CvDTree::read( _fs, _node, _data );
895 /////////////////////////////////// CvBoost /////////////////////////////////////
901 default_model_name = "my_boost_tree";
903 active_vars = active_vars_abs = orig_response = sum_response = weak_eval =
904 subsample_mask = weights = subtree_weights = 0;
905 have_active_cat_vars = have_subsample = false;
911 void CvBoost::prune( CvSlice slice )
916 int i, count = cvSliceLength( slice, weak );
918 cvStartReadSeq( weak, &reader );
919 cvSetSeqReaderPos( &reader, slice.start_index );
921 for( i = 0; i < count; i++ )
924 CV_READ_SEQ_ELEM( w, reader );
928 cvSeqRemoveSlice( weak, slice );
933 void CvBoost::clear()
937 prune( CV_WHOLE_SEQ );
938 cvReleaseMemStorage( &weak->storage );
944 cvReleaseMat( &active_vars );
945 cvReleaseMat( &active_vars_abs );
946 cvReleaseMat( &orig_response );
947 cvReleaseMat( &sum_response );
948 cvReleaseMat( &weak_eval );
949 cvReleaseMat( &subsample_mask );
950 cvReleaseMat( &weights );
951 cvReleaseMat( &subtree_weights );
953 have_subsample = false;
963 CvBoost::CvBoost( const CvMat* _train_data, int _tflag,
964 const CvMat* _responses, const CvMat* _var_idx,
965 const CvMat* _sample_idx, const CvMat* _var_type,
966 const CvMat* _missing_mask, CvBoostParams _params )
970 default_model_name = "my_boost_tree";
972 active_vars = active_vars_abs = orig_response = sum_response = weak_eval =
973 subsample_mask = weights = subtree_weights = 0;
975 train( _train_data, _tflag, _responses, _var_idx, _sample_idx,
976 _var_type, _missing_mask, _params );
981 CvBoost::set_params( const CvBoostParams& _params )
985 CV_FUNCNAME( "CvBoost::set_params" );
990 if( params.boost_type != DISCRETE && params.boost_type != REAL &&
991 params.boost_type != LOGIT && params.boost_type != GENTLE )
992 CV_ERROR( CV_StsBadArg, "Unknown/unsupported boosting type" );
994 params.weak_count = MAX( params.weak_count, 1 );
995 params.weight_trim_rate = MAX( params.weight_trim_rate, 0. );
996 params.weight_trim_rate = MIN( params.weight_trim_rate, 1. );
997 if( params.weight_trim_rate < FLT_EPSILON )
998 params.weight_trim_rate = 1.f;
1000 if( params.boost_type == DISCRETE &&
1001 params.split_criteria != GINI && params.split_criteria != MISCLASS )
1002 params.split_criteria = MISCLASS;
1003 if( params.boost_type == REAL &&
1004 params.split_criteria != GINI && params.split_criteria != MISCLASS )
1005 params.split_criteria = GINI;
1006 if( (params.boost_type == LOGIT || params.boost_type == GENTLE) &&
1007 params.split_criteria != SQERR )
1008 params.split_criteria = SQERR;
1019 CvBoost::train( const CvMat* _train_data, int _tflag,
1020 const CvMat* _responses, const CvMat* _var_idx,
1021 const CvMat* _sample_idx, const CvMat* _var_type,
1022 const CvMat* _missing_mask,
1023 CvBoostParams _params, bool _update )
1026 CvMemStorage* storage = 0;
1028 CV_FUNCNAME( "CvBoost::train" );
1034 set_params( _params );
1036 cvReleaseMat( &active_vars );
1037 cvReleaseMat( &active_vars_abs );
1039 if( !_update || !data )
1042 data = new CvDTreeTrainData( _train_data, _tflag, _responses, _var_idx,
1043 _sample_idx, _var_type, _missing_mask, _params, true, true );
1045 if( data->get_num_classes() != 2 )
1046 CV_ERROR( CV_StsNotImplemented,
1047 "Boosted trees can only be used for 2-class classification." );
1048 CV_CALL( storage = cvCreateMemStorage() );
1049 weak = cvCreateSeq( 0, sizeof(CvSeq), sizeof(CvBoostTree*), storage );
1054 data->set_data( _train_data, _tflag, _responses, _var_idx,
1055 _sample_idx, _var_type, _missing_mask, _params, true, true, true );
1058 if ( (_params.boost_type == LOGIT) || (_params.boost_type == GENTLE) )
1059 data->do_responses_copy();
1061 update_weights( 0 );
1063 for( i = 0; i < params.weak_count; i++ )
1065 CvBoostTree* tree = new CvBoostTree;
1066 if( !tree->train( data, subsample_mask, this ) )
1071 //cvCheckArr( get_weak_response());
1072 cvSeqPush( weak, &tree );
1073 update_weights( tree );
1075 if( cvCountNonZero(subsample_mask) == 0 )
1079 get_active_vars(); // recompute active_vars* maps and condensed_idx's in the splits.
1080 data->is_classifier = true;
1083 data->free_train_data();
1090 bool CvBoost::train( CvMLData* _data,
1091 CvBoostParams params,
1094 bool result = false;
1096 CV_FUNCNAME( "CvBoost::train" );
1100 const CvMat* values = _data->get_values();
1101 const CvMat* response = _data->get_responses();
1102 const CvMat* missing = _data->get_missing();
1103 const CvMat* var_types = _data->get_var_types();
1104 const CvMat* train_sidx = _data->get_train_sample_idx();
1105 const CvMat* var_idx = _data->get_var_idx();
1107 CV_CALL( result = train( values, CV_ROW_SAMPLE, response, var_idx,
1108 train_sidx, var_types, missing, params, update ) );
1116 CvBoost::update_weights( CvBoostTree* tree )
1118 CV_FUNCNAME( "CvBoost::update_weights" );
1122 int i, n = data->sample_count;
1126 int *sample_idx_buf;
1127 const int* sample_idx = 0;
1128 cv::AutoBuffer<uchar> inn_buf;
1129 int _buf_size = (params.boost_type == LOGIT) || (params.boost_type == GENTLE) ? data->sample_count*sizeof(int) : 0;
1131 _buf_size += n*sizeof(int);
1134 if( have_subsample )
1135 _buf_size += data->buf->step*(sizeof(float)+sizeof(uchar));
1137 inn_buf.allocate(_buf_size);
1138 uchar* cur_buf_pos = (uchar*)inn_buf;
1140 if ( (params.boost_type == LOGIT) || (params.boost_type == GENTLE) )
1142 step = CV_IS_MAT_CONT(data->responses_copy->type) ?
1143 1 : data->responses_copy->step / CV_ELEM_SIZE(data->responses_copy->type);
1144 fdata = data->responses_copy->data.fl;
1145 sample_idx_buf = (int*)cur_buf_pos;
1146 cur_buf_pos = (uchar*)(sample_idx_buf + data->sample_count);
1147 sample_idx = data->get_sample_indices( data->data_root, sample_idx_buf );
1149 CvMat* dtree_data_buf = data->buf;
1150 if( !tree ) // before training the first tree, initialize weights and other parameters
1152 int* class_labels_buf = (int*)cur_buf_pos;
1153 cur_buf_pos = (uchar*)(class_labels_buf + n);
1154 const int* class_labels = data->get_class_labels(data->data_root, class_labels_buf);
1155 // in case of logitboost and gentle adaboost each weak tree is a regression tree,
1156 // so we need to convert class labels to floating-point values
1159 double p[2] = { 1, 1 };
1161 cvReleaseMat( &orig_response );
1162 cvReleaseMat( &sum_response );
1163 cvReleaseMat( &weak_eval );
1164 cvReleaseMat( &subsample_mask );
1165 cvReleaseMat( &weights );
1166 cvReleaseMat( &subtree_weights );
1168 CV_CALL( orig_response = cvCreateMat( 1, n, CV_32S ));
1169 CV_CALL( weak_eval = cvCreateMat( 1, n, CV_64F ));
1170 CV_CALL( subsample_mask = cvCreateMat( 1, n, CV_8U ));
1171 CV_CALL( weights = cvCreateMat( 1, n, CV_64F ));
1172 CV_CALL( subtree_weights = cvCreateMat( 1, n + 2, CV_64F ));
1174 if( data->have_priors )
1176 // compute weight scale for each class from their prior probabilities
1178 for( i = 0; i < n; i++ )
1179 c1 += class_labels[i];
1180 p[0] = data->priors->data.db[0]*(c1 < n ? 1./(n - c1) : 0.);
1181 p[1] = data->priors->data.db[1]*(c1 > 0 ? 1./c1 : 0.);
1182 p[0] /= p[0] + p[1];
1186 if (data->is_buf_16u)
1188 unsigned short* labels = (unsigned short*)(dtree_data_buf->data.s + data->data_root->buf_idx*dtree_data_buf->cols +
1189 data->data_root->offset + (data->work_var_count-1)*data->sample_count);
1190 for( i = 0; i < n; i++ )
1192 // save original categorical responses {0,1}, convert them to {-1,1}
1193 orig_response->data.i[i] = class_labels[i]*2 - 1;
1194 // make all the samples active at start.
1195 // later, in trim_weights() deactivate/reactive again some, if need
1196 subsample_mask->data.ptr[i] = (uchar)1;
1197 // make all the initial weights the same.
1198 weights->data.db[i] = w0*p[class_labels[i]];
1199 // set the labels to find (from within weak tree learning proc)
1200 // the particular sample weight, and where to store the response.
1201 labels[i] = (unsigned short)i;
1206 int* labels = dtree_data_buf->data.i + data->data_root->buf_idx*dtree_data_buf->cols +
1207 data->data_root->offset + (data->work_var_count-1)*data->sample_count;
1209 for( i = 0; i < n; i++ )
1211 // save original categorical responses {0,1}, convert them to {-1,1}
1212 orig_response->data.i[i] = class_labels[i]*2 - 1;
1213 // make all the samples active at start.
1214 // later, in trim_weights() deactivate/reactive again some, if need
1215 subsample_mask->data.ptr[i] = (uchar)1;
1216 // make all the initial weights the same.
1217 weights->data.db[i] = w0*p[class_labels[i]];
1218 // set the labels to find (from within weak tree learning proc)
1219 // the particular sample weight, and where to store the response.
1224 if( params.boost_type == LOGIT )
1226 CV_CALL( sum_response = cvCreateMat( 1, n, CV_64F ));
1228 for( i = 0; i < n; i++ )
1230 sum_response->data.db[i] = 0;
1231 fdata[sample_idx[i]*step] = orig_response->data.i[i] > 0 ? 2.f : -2.f;
1234 // in case of logitboost each weak tree is a regression tree.
1235 // the target function values are recalculated for each of the trees
1236 data->is_classifier = false;
1238 else if( params.boost_type == GENTLE )
1240 for( i = 0; i < n; i++ )
1241 fdata[sample_idx[i]*step] = (float)orig_response->data.i[i];
1243 data->is_classifier = false;
1248 // at this moment, for all the samples that participated in the training of the most
1249 // recent weak classifier we know the responses. For other samples we need to compute them
1250 if( have_subsample )
1252 float* values = (float*)cur_buf_pos;
1253 cur_buf_pos = (uchar*)(values + data->buf->step);
1254 uchar* missing = cur_buf_pos;
1255 cur_buf_pos = missing + data->buf->step;
1256 CvMat _sample, _mask;
1258 // invert the subsample mask
1259 cvXorS( subsample_mask, cvScalar(1.), subsample_mask );
1260 data->get_vectors( subsample_mask, values, missing, 0 );
1262 _sample = cvMat( 1, data->var_count, CV_32F );
1263 _mask = cvMat( 1, data->var_count, CV_8U );
1265 // run tree through all the non-processed samples
1266 for( i = 0; i < n; i++ )
1267 if( subsample_mask->data.ptr[i] )
1269 _sample.data.fl = values;
1270 _mask.data.ptr = missing;
1271 values += _sample.cols;
1272 missing += _mask.cols;
1273 weak_eval->data.db[i] = tree->predict( &_sample, &_mask, true )->value;
1277 // now update weights and other parameters for each type of boosting
1278 if( params.boost_type == DISCRETE )
1280 // Discrete AdaBoost:
1281 // weak_eval[i] (=f(x_i)) is in {-1,1}
1282 // err = sum(w_i*(f(x_i) != y_i))/sum(w_i)
1283 // C = log((1-err)/err)
1284 // w_i *= exp(C*(f(x_i) != y_i))
1287 double scale[] = { 1., 0. };
1289 for( i = 0; i < n; i++ )
1291 double w = weights->data.db[i];
1293 err += w*(weak_eval->data.db[i] != orig_response->data.i[i]);
1298 C = err = -log_ratio( err );
1299 scale[1] = exp(err);
1302 for( i = 0; i < n; i++ )
1304 double w = weights->data.db[i]*
1305 scale[weak_eval->data.db[i] != orig_response->data.i[i]];
1307 weights->data.db[i] = w;
1312 else if( params.boost_type == REAL )
1315 // weak_eval[i] = f(x_i) = 0.5*log(p(x_i)/(1-p(x_i))), p(x_i)=P(y=1|x_i)
1316 // w_i *= exp(-y_i*f(x_i))
1318 for( i = 0; i < n; i++ )
1319 weak_eval->data.db[i] *= -orig_response->data.i[i];
1321 cvExp( weak_eval, weak_eval );
1323 for( i = 0; i < n; i++ )
1325 double w = weights->data.db[i]*weak_eval->data.db[i];
1327 weights->data.db[i] = w;
1330 else if( params.boost_type == LOGIT )
1333 // weak_eval[i] = f(x_i) in [-z_max,z_max]
1334 // sum_response = F(x_i).
1335 // F(x_i) += 0.5*f(x_i)
1336 // p(x_i) = exp(F(x_i))/(exp(F(x_i)) + exp(-F(x_i))=1/(1+exp(-2*F(x_i)))
1337 // reuse weak_eval: weak_eval[i] <- p(x_i)
1338 // w_i = p(x_i)*1(1 - p(x_i))
1339 // z_i = ((y_i+1)/2 - p(x_i))/(p(x_i)*(1 - p(x_i)))
1340 // store z_i to the data->data_root as the new target responses
1342 const double lb_weight_thresh = FLT_EPSILON;
1343 const double lb_z_max = 10.;
1344 /*float* responses_buf = data->get_resp_float_buf();
1345 const float* responses = 0;
1346 data->get_ord_responses(data->data_root, responses_buf, &responses);*/
1348 /*if( weak->total == 7 )
1351 for( i = 0; i < n; i++ )
1353 double s = sum_response->data.db[i] + 0.5*weak_eval->data.db[i];
1354 sum_response->data.db[i] = s;
1355 weak_eval->data.db[i] = -2*s;
1358 cvExp( weak_eval, weak_eval );
1360 for( i = 0; i < n; i++ )
1362 double p = 1./(1. + weak_eval->data.db[i]);
1363 double w = p*(1 - p), z;
1364 w = MAX( w, lb_weight_thresh );
1365 weights->data.db[i] = w;
1367 if( orig_response->data.i[i] > 0 )
1370 fdata[sample_idx[i]*step] = (float)MIN(z, lb_z_max);
1375 fdata[sample_idx[i]*step] = (float)-MIN(z, lb_z_max);
1382 // weak_eval[i] = f(x_i) in [-1,1]
1383 // w_i *= exp(-y_i*f(x_i))
1384 assert( params.boost_type == GENTLE );
1386 for( i = 0; i < n; i++ )
1387 weak_eval->data.db[i] *= -orig_response->data.i[i];
1389 cvExp( weak_eval, weak_eval );
1391 for( i = 0; i < n; i++ )
1393 double w = weights->data.db[i] * weak_eval->data.db[i];
1394 weights->data.db[i] = w;
1400 // renormalize weights
1401 if( sumw > FLT_EPSILON )
1404 for( i = 0; i < n; ++i )
1405 weights->data.db[i] *= sumw;
1412 static CV_IMPLEMENT_QSORT_EX( icvSort_64f, double, CV_LT, int )
1416 CvBoost::trim_weights()
1418 //CV_FUNCNAME( "CvBoost::trim_weights" );
1422 int i, count = data->sample_count, nz_count = 0;
1423 double sum, threshold;
1425 if( params.weight_trim_rate <= 0. || params.weight_trim_rate >= 1. )
1428 // use weak_eval as temporary buffer for sorted weights
1429 cvCopy( weights, weak_eval );
1431 icvSort_64f( weak_eval->data.db, count, 0 );
1433 // as weight trimming occurs immediately after updating the weights,
1434 // where they are renormalized, we assume that the weight sum = 1.
1435 sum = 1. - params.weight_trim_rate;
1437 for( i = 0; i < count; i++ )
1439 double w = weak_eval->data.db[i];
1445 threshold = i < count ? weak_eval->data.db[i] : DBL_MAX;
1447 for( i = 0; i < count; i++ )
1449 double w = weights->data.db[i];
1450 int f = w >= threshold;
1451 subsample_mask->data.ptr[i] = (uchar)f;
1455 have_subsample = nz_count < count;
1462 CvBoost::get_active_vars( bool absolute_idx )
1468 CV_FUNCNAME( "CvBoost::get_active_vars" );
1473 CV_ERROR( CV_StsError, "The boosted tree ensemble has not been trained yet" );
1475 if( !active_vars || !active_vars_abs )
1478 int i, j, nactive_vars;
1480 const CvDTreeNode* node;
1482 assert(!active_vars && !active_vars_abs);
1483 mask = cvCreateMat( 1, data->var_count, CV_8U );
1484 inv_map = cvCreateMat( 1, data->var_count, CV_32S );
1486 cvSet( inv_map, cvScalar(-1) );
1488 // first pass: compute the mask of used variables
1489 cvStartReadSeq( weak, &reader );
1490 for( i = 0; i < weak->total; i++ )
1492 CV_READ_SEQ_ELEM(wtree, reader);
1494 node = wtree->get_root();
1495 assert( node != 0 );
1498 const CvDTreeNode* parent;
1501 CvDTreeSplit* split = node->split;
1502 for( ; split != 0; split = split->next )
1503 mask->data.ptr[split->var_idx] = 1;
1509 for( parent = node->parent; parent && parent->right == node;
1510 node = parent, parent = parent->parent )
1516 node = parent->right;
1520 nactive_vars = cvCountNonZero(mask);
1522 //if ( nactive_vars > 0 )
1524 active_vars = cvCreateMat( 1, nactive_vars, CV_32S );
1525 active_vars_abs = cvCreateMat( 1, nactive_vars, CV_32S );
1527 have_active_cat_vars = false;
1529 for( i = j = 0; i < data->var_count; i++ )
1531 if( mask->data.ptr[i] )
1533 active_vars->data.i[j] = i;
1534 active_vars_abs->data.i[j] = data->var_idx ? data->var_idx->data.i[i] : i;
1535 inv_map->data.i[i] = j;
1536 if( data->var_type->data.i[i] >= 0 )
1537 have_active_cat_vars = true;
1543 // second pass: now compute the condensed indices
1544 cvStartReadSeq( weak, &reader );
1545 for( i = 0; i < weak->total; i++ )
1547 CV_READ_SEQ_ELEM(wtree, reader);
1548 node = wtree->get_root();
1551 const CvDTreeNode* parent;
1554 CvDTreeSplit* split = node->split;
1555 for( ; split != 0; split = split->next )
1557 split->condensed_idx = inv_map->data.i[split->var_idx];
1558 assert( split->condensed_idx >= 0 );
1566 for( parent = node->parent; parent && parent->right == node;
1567 node = parent, parent = parent->parent )
1573 node = parent->right;
1579 result = absolute_idx ? active_vars_abs : active_vars;
1583 cvReleaseMat( &mask );
1584 cvReleaseMat( &inv_map );
1591 CvBoost::predict( const CvMat* _sample, const CvMat* _missing,
1592 CvMat* weak_responses, CvSlice slice,
1593 bool raw_mode, bool return_sum ) const
1595 float value = -FLT_MAX;
1597 CvMat sample, missing;
1601 const float* sample_data;
1604 CV_Error( CV_StsError, "The boosted tree ensemble has not been trained yet" );
1606 if( !CV_IS_MAT(_sample) || CV_MAT_TYPE(_sample->type) != CV_32FC1 ||
1607 (_sample->cols != 1 && _sample->rows != 1) ||
1608 (_sample->cols + _sample->rows - 1 != data->var_all && !raw_mode) ||
1609 (active_vars && _sample->cols + _sample->rows - 1 != active_vars->cols && raw_mode) )
1610 CV_Error( CV_StsBadArg,
1611 "the input sample must be 1d floating-point vector with the same "
1612 "number of elements as the total number of variables or "
1613 "as the number of variables used for training" );
1617 if( !CV_IS_MAT(_missing) || !CV_IS_MASK_ARR(_missing) ||
1618 !CV_ARE_SIZES_EQ(_missing, _sample) )
1619 CV_Error( CV_StsBadArg,
1620 "the missing data mask must be 8-bit vector of the same size as input sample" );
1623 int i, weak_count = cvSliceLength( slice, weak );
1624 if( weak_count >= weak->total )
1626 weak_count = weak->total;
1627 slice.start_index = 0;
1630 if( weak_responses )
1632 if( !CV_IS_MAT(weak_responses) ||
1633 CV_MAT_TYPE(weak_responses->type) != CV_32FC1 ||
1634 (weak_responses->cols != 1 && weak_responses->rows != 1) ||
1635 weak_responses->cols + weak_responses->rows - 1 != weak_count )
1636 CV_Error( CV_StsBadArg,
1637 "The output matrix of weak classifier responses must be valid "
1638 "floating-point vector of the same number of components as the length of input slice" );
1639 wstep = CV_IS_MAT_CONT(weak_responses->type) ? 1 : weak_responses->step/sizeof(float);
1642 int var_count = active_vars->cols;
1643 const int* vtype = data->var_type->data.i;
1644 const int* cmap = data->cat_map->data.i;
1645 const int* cofs = data->cat_ofs->data.i;
1647 // if need, preprocess the input vector
1650 int step, mstep = 0;
1651 const float* src_sample;
1652 const uchar* src_mask = 0;
1655 const int* vidx = active_vars->data.i;
1656 const int* vidx_abs = active_vars_abs->data.i;
1657 bool have_mask = _missing != 0;
1659 cv::AutoBuffer<float> buf(var_count + (var_count+3)/4);
1660 dst_sample = &buf[0];
1661 dst_mask = (uchar*)&buf[var_count];
1663 src_sample = _sample->data.fl;
1664 step = CV_IS_MAT_CONT(_sample->type) ? 1 : _sample->step/sizeof(src_sample[0]);
1668 src_mask = _missing->data.ptr;
1669 mstep = CV_IS_MAT_CONT(_missing->type) ? 1 : _missing->step;
1672 for( i = 0; i < var_count; i++ )
1674 int idx = vidx[i], idx_abs = vidx_abs[i];
1675 float val = src_sample[idx_abs*step];
1676 int ci = vtype[idx];
1677 uchar m = src_mask ? src_mask[idx_abs*mstep] : (uchar)0;
1681 int a = cofs[ci], b = (ci+1 >= data->cat_ofs->cols) ? data->cat_map->cols : cofs[ci+1],
1683 int ival = cvRound(val);
1684 if ( (ival != val) && (!m) )
1685 CV_Error( CV_StsBadArg,
1686 "one of input categorical variable is not an integer" );
1691 if( ival < cmap[c] )
1693 else if( ival > cmap[c] )
1699 if( c < 0 || ival != cmap[c] )
1706 val = (float)(c - cofs[ci]);
1710 dst_sample[i] = val;
1714 sample = cvMat( 1, var_count, CV_32F, dst_sample );
1719 missing = cvMat( 1, var_count, CV_8UC1, dst_mask );
1720 _missing = &missing;
1725 if( !CV_IS_MAT_CONT(_sample->type & (_missing ? _missing->type : -1)) )
1726 CV_Error( CV_StsBadArg, "In raw mode the input vectors must be continuous" );
1729 cvStartReadSeq( weak, &reader );
1730 cvSetSeqReaderPos( &reader, slice.start_index );
1732 sample_data = _sample->data.fl;
1734 if( !have_active_cat_vars && !_missing && !weak_responses )
1736 for( i = 0; i < weak_count; i++ )
1739 const CvDTreeNode* node;
1740 CV_READ_SEQ_ELEM( wtree, reader );
1742 node = wtree->get_root();
1745 CvDTreeSplit* split = node->split;
1746 int vi = split->condensed_idx;
1747 float val = sample_data[vi];
1748 int dir = val <= split->ord.c ? -1 : 1;
1749 if( split->inversed )
1751 node = dir < 0 ? node->left : node->right;
1758 const int* avars = active_vars->data.i;
1759 const uchar* m = _missing ? _missing->data.ptr : 0;
1761 // full-featured version
1762 for( i = 0; i < weak_count; i++ )
1765 const CvDTreeNode* node;
1766 CV_READ_SEQ_ELEM( wtree, reader );
1768 node = wtree->get_root();
1771 const CvDTreeSplit* split = node->split;
1773 for( ; !dir && split != 0; split = split->next )
1775 int vi = split->condensed_idx;
1776 int ci = vtype[avars[vi]];
1777 float val = sample_data[vi];
1780 if( ci < 0 ) // ordered
1781 dir = val <= split->ord.c ? -1 : 1;
1784 int c = cvRound(val);
1785 dir = CV_DTREE_CAT_DIR(c, split->subset);
1787 if( split->inversed )
1793 int diff = node->right->sample_count - node->left->sample_count;
1794 dir = diff < 0 ? -1 : 1;
1796 node = dir < 0 ? node->left : node->right;
1798 if( weak_responses )
1799 weak_responses->data.fl[i*wstep] = (float)node->value;
1808 int cls_idx = sum >= 0;
1810 value = (float)cls_idx;
1812 value = (float)cmap[cofs[vtype[data->var_count]] + cls_idx];
1818 float CvBoost::calc_error( CvMLData* _data, int type, std::vector<float> *resp )
1821 const CvMat* values = _data->get_values();
1822 const CvMat* response = _data->get_responses();
1823 const CvMat* missing = _data->get_missing();
1824 const CvMat* sample_idx = (type == CV_TEST_ERROR) ? _data->get_test_sample_idx() : _data->get_train_sample_idx();
1825 const CvMat* var_types = _data->get_var_types();
1826 int* sidx = sample_idx ? sample_idx->data.i : 0;
1827 int r_step = CV_IS_MAT_CONT(response->type) ?
1828 1 : response->step / CV_ELEM_SIZE(response->type);
1829 bool is_classifier = var_types->data.ptr[var_types->cols-1] == CV_VAR_CATEGORICAL;
1830 int sample_count = sample_idx ? sample_idx->cols : 0;
1831 sample_count = (type == CV_TRAIN_ERROR && sample_count == 0) ? values->rows : sample_count;
1832 float* pred_resp = 0;
1833 if( resp && (sample_count > 0) )
1835 resp->resize( sample_count );
1836 pred_resp = &((*resp)[0]);
1838 if ( is_classifier )
1840 for( int i = 0; i < sample_count; i++ )
1843 int si = sidx ? sidx[i] : i;
1844 cvGetRow( values, &sample, si );
1846 cvGetRow( missing, &miss, si );
1847 float r = (float)predict( &sample, missing ? &miss : 0 );
1850 int d = fabs((double)r - response->data.fl[si*r_step]) <= FLT_EPSILON ? 0 : 1;
1853 err = sample_count ? err / (float)sample_count * 100 : -FLT_MAX;
1857 for( int i = 0; i < sample_count; i++ )
1860 int si = sidx ? sidx[i] : i;
1861 cvGetRow( values, &sample, si );
1863 cvGetRow( missing, &miss, si );
1864 float r = (float)predict( &sample, missing ? &miss : 0 );
1867 float d = r - response->data.fl[si*r_step];
1870 err = sample_count ? err / (float)sample_count : -FLT_MAX;
1875 void CvBoost::write_params( CvFileStorage* fs ) const
1877 const char* boost_type_str =
1878 params.boost_type == DISCRETE ? "DiscreteAdaboost" :
1879 params.boost_type == REAL ? "RealAdaboost" :
1880 params.boost_type == LOGIT ? "LogitBoost" :
1881 params.boost_type == GENTLE ? "GentleAdaboost" : 0;
1883 const char* split_crit_str =
1884 params.split_criteria == DEFAULT ? "Default" :
1885 params.split_criteria == GINI ? "Gini" :
1886 params.boost_type == MISCLASS ? "Misclassification" :
1887 params.boost_type == SQERR ? "SquaredErr" : 0;
1889 if( boost_type_str )
1890 cvWriteString( fs, "boosting_type", boost_type_str );
1892 cvWriteInt( fs, "boosting_type", params.boost_type );
1894 if( split_crit_str )
1895 cvWriteString( fs, "splitting_criteria", split_crit_str );
1897 cvWriteInt( fs, "splitting_criteria", params.split_criteria );
1899 cvWriteInt( fs, "ntrees", params.weak_count );
1900 cvWriteReal( fs, "weight_trimming_rate", params.weight_trim_rate );
1902 data->write_params( fs );
1906 void CvBoost::read_params( CvFileStorage* fs, CvFileNode* fnode )
1908 CV_FUNCNAME( "CvBoost::read_params" );
1914 if( !fnode || !CV_NODE_IS_MAP(fnode->tag) )
1917 data = new CvDTreeTrainData();
1918 CV_CALL( data->read_params(fs, fnode));
1919 data->shared = true;
1921 params.max_depth = data->params.max_depth;
1922 params.min_sample_count = data->params.min_sample_count;
1923 params.max_categories = data->params.max_categories;
1924 params.priors = data->params.priors;
1925 params.regression_accuracy = data->params.regression_accuracy;
1926 params.use_surrogates = data->params.use_surrogates;
1928 temp = cvGetFileNodeByName( fs, fnode, "boosting_type" );
1932 if( temp && CV_NODE_IS_STRING(temp->tag) )
1934 const char* boost_type_str = cvReadString( temp, "" );
1935 params.boost_type = strcmp( boost_type_str, "DiscreteAdaboost" ) == 0 ? DISCRETE :
1936 strcmp( boost_type_str, "RealAdaboost" ) == 0 ? REAL :
1937 strcmp( boost_type_str, "LogitBoost" ) == 0 ? LOGIT :
1938 strcmp( boost_type_str, "GentleAdaboost" ) == 0 ? GENTLE : -1;
1941 params.boost_type = cvReadInt( temp, -1 );
1943 if( params.boost_type < DISCRETE || params.boost_type > GENTLE )
1944 CV_ERROR( CV_StsBadArg, "Unknown boosting type" );
1946 temp = cvGetFileNodeByName( fs, fnode, "splitting_criteria" );
1947 if( temp && CV_NODE_IS_STRING(temp->tag) )
1949 const char* split_crit_str = cvReadString( temp, "" );
1950 params.split_criteria = strcmp( split_crit_str, "Default" ) == 0 ? DEFAULT :
1951 strcmp( split_crit_str, "Gini" ) == 0 ? GINI :
1952 strcmp( split_crit_str, "Misclassification" ) == 0 ? MISCLASS :
1953 strcmp( split_crit_str, "SquaredErr" ) == 0 ? SQERR : -1;
1956 params.split_criteria = cvReadInt( temp, -1 );
1958 if( params.split_criteria < DEFAULT || params.boost_type > SQERR )
1959 CV_ERROR( CV_StsBadArg, "Unknown boosting type" );
1961 params.weak_count = cvReadIntByName( fs, fnode, "ntrees" );
1962 params.weight_trim_rate = cvReadRealByName( fs, fnode, "weight_trimming_rate", 0. );
1970 CvBoost::read( CvFileStorage* fs, CvFileNode* node )
1972 CV_FUNCNAME( "CvBoost::read" );
1977 CvFileNode* trees_fnode;
1978 CvMemStorage* storage;
1982 read_params( fs, node );
1987 trees_fnode = cvGetFileNodeByName( fs, node, "trees" );
1988 if( !trees_fnode || !CV_NODE_IS_SEQ(trees_fnode->tag) )
1989 CV_ERROR( CV_StsParseError, "<trees> tag is missing" );
1991 cvStartReadSeq( trees_fnode->data.seq, &reader );
1992 ntrees = trees_fnode->data.seq->total;
1994 if( ntrees != params.weak_count )
1995 CV_ERROR( CV_StsUnmatchedSizes,
1996 "The number of trees stored does not match <ntrees> tag value" );
1998 CV_CALL( storage = cvCreateMemStorage() );
1999 weak = cvCreateSeq( 0, sizeof(CvSeq), sizeof(CvBoostTree*), storage );
2001 for( i = 0; i < ntrees; i++ )
2003 CvBoostTree* tree = new CvBoostTree();
2004 CV_CALL(tree->read( fs, (CvFileNode*)reader.ptr, this, data ));
2005 CV_NEXT_SEQ_ELEM( reader.seq->elem_size, reader );
2006 cvSeqPush( weak, &tree );
2015 CvBoost::write( CvFileStorage* fs, const char* name ) const
2017 CV_FUNCNAME( "CvBoost::write" );
2024 cvStartWriteStruct( fs, name, CV_NODE_MAP, CV_TYPE_NAME_ML_BOOSTING );
2027 CV_ERROR( CV_StsBadArg, "The classifier has not been trained yet" );
2030 cvStartWriteStruct( fs, "trees", CV_NODE_SEQ );
2032 cvStartReadSeq( weak, &reader );
2034 for( i = 0; i < weak->total; i++ )
2037 CV_READ_SEQ_ELEM( tree, reader );
2038 cvStartWriteStruct( fs, 0, CV_NODE_MAP );
2040 cvEndWriteStruct( fs );
2043 cvEndWriteStruct( fs );
2044 cvEndWriteStruct( fs );
2051 CvBoost::get_weights()
2058 CvBoost::get_subtree_weights()
2060 return subtree_weights;
2065 CvBoost::get_weak_response()
2071 const CvBoostParams&
2072 CvBoost::get_params() const
2077 CvSeq* CvBoost::get_weak_predictors()
2082 const CvDTreeTrainData* CvBoost::get_data() const
2089 CvBoost::CvBoost( const Mat& _train_data, int _tflag,
2090 const Mat& _responses, const Mat& _var_idx,
2091 const Mat& _sample_idx, const Mat& _var_type,
2092 const Mat& _missing_mask,
2093 CvBoostParams _params )
2097 default_model_name = "my_boost_tree";
2098 active_vars = active_vars_abs = orig_response = sum_response = weak_eval =
2099 subsample_mask = weights = subtree_weights = 0;
2101 train( _train_data, _tflag, _responses, _var_idx, _sample_idx,
2102 _var_type, _missing_mask, _params );
2107 CvBoost::train( const Mat& _train_data, int _tflag,
2108 const Mat& _responses, const Mat& _var_idx,
2109 const Mat& _sample_idx, const Mat& _var_type,
2110 const Mat& _missing_mask,
2111 CvBoostParams _params, bool _update )
2113 CvMat tdata = _train_data, responses = _responses, vidx = _var_idx,
2114 sidx = _sample_idx, vtype = _var_type, mmask = _missing_mask;
2115 return train(&tdata, _tflag, &responses, vidx.data.ptr ? &vidx : 0,
2116 sidx.data.ptr ? &sidx : 0, vtype.data.ptr ? &vtype : 0,
2117 mmask.data.ptr ? &mmask : 0, _params, _update);
2121 CvBoost::predict( const Mat& _sample, const Mat& _missing,
2122 const Range& slice, bool raw_mode, bool return_sum ) const
2124 CvMat sample = _sample, mmask = _missing;
2125 /*if( weak_responses )
2127 int weak_count = cvSliceLength( slice, weak );
2128 if( weak_count >= weak->total )
2130 weak_count = weak->total;
2131 slice.start_index = 0;
2134 if( !(weak_responses->data && weak_responses->type() == CV_32FC1 &&
2135 (weak_responses->cols == 1 || weak_responses->rows == 1) &&
2136 weak_responses->cols + weak_responses->rows - 1 == weak_count) )
2137 weak_responses->create(weak_count, 1, CV_32FC1);
2138 pwr = &(wr = *weak_responses);
2140 return predict(&sample, _missing.empty() ? 0 : &mmask, 0,
2141 slice == Range::all() ? CV_WHOLE_SEQ : cvSlice(slice.start, slice.end),
2142 raw_mode, return_sum);