From e20d570ed1d30943967e03b2614366acd3c3fc9d Mon Sep 17 00:00:00 2001
From: "P. Druzhkov"
Date: Wed, 15 Jun 2011 21:54:25 +0000
Subject: [PATCH] brief gbt documentation added. some sample fixes made. code updated.

---
 modules/ml/doc/gradient_boosted_trees.rst | 371 ++++++++++++++++++++++++++++++
 modules/ml/doc/ml.rst                     |   2 +-
 modules/ml/include/opencv2/ml/ml.hpp      |  42 +++-
 modules/ml/src/gbt.cpp                    | 309 ++++++++++++++++++++-----
 samples/c/tree_engine.cpp                 |   5 +-
 5 files changed, 668 insertions(+), 61 deletions(-)
 create mode 100644 modules/ml/doc/gradient_boosted_trees.rst

diff --git a/modules/ml/doc/gradient_boosted_trees.rst b/modules/ml/doc/gradient_boosted_trees.rst
new file mode 100644
index 0000000..8cef701
--- /dev/null
+++ b/modules/ml/doc/gradient_boosted_trees.rst
@@ -0,0 +1,371 @@
+.. _Gradient Boosted Trees:
+
+Gradient Boosted Trees
+======================
+
+Gradient Boosted Trees (GBT) is a generalized boosting algorithm introduced by
+Jerome Friedman: http://www.salfordsystems.com/doc/GreedyFuncApproxSS.pdf .
+In contrast to the AdaBoost.M1 algorithm, GBT can deal with both multiclass
+classification and regression problems. Moreover, it can use any
+differentiable loss function, and several popular ones are implemented.
+Using decision trees (:ref:`CvDTree`) as base learners makes it possible to
+process both ordered and categorical variables.
+
+
+.. _Training the GBT model:
+
+Training the GBT model
+----------------------
+
+A Gradient Boosted Trees model represents an ensemble of single regression
+trees built in a greedy fashion. The training procedure is an iterative
+process similar to numerical optimization via the gradient descent method.
+The summary loss on the training set depends only on the current model
+predictions for the training samples, in other words
+:math:`\sum^N_{i=1}L(y_i, F(x_i)) \equiv \mathcal{L}(F(x_1), F(x_2), ... , F(x_N))
+\equiv \mathcal{L}(F)`. The :math:`\mathcal{L}(F)`
+gradient can be computed as follows:
+
+.. math::
+    grad(\mathcal{L}(F)) = \left( \dfrac{\partial{L(y_1, F(x_1))}}{\partial{F(x_1)}},
+    \dfrac{\partial{L(y_2, F(x_2))}}{\partial{F(x_2)}}, ... ,
+    \dfrac{\partial{L(y_N, F(x_N))}}{\partial{F(x_N)}} \right) .
+
+On every training step a single regression tree is built to predict the
+antigradient vector components. The step length is computed according to the
+loss function, separately for every region determined by a tree leaf, and can
+be eliminated by changing the values of the tree leaves directly.
+
+The main scheme of the training process is shown below.
+
+#.
+    Find the best constant model.
+#.
+    For :math:`i` in :math:`[1,M]`:
+
+    #.
+        Compute the antigradient.
+    #.
+        Grow a regression tree to predict antigradient components.
+    #.
+        Change values in the tree leaves.
+    #.
+        Add the tree to the model.
+
+
+The following loss functions are implemented:
+
+*for regression problems:*
+
+#.
+    Squared loss (``CvGBTrees::SQUARED_LOSS``):
+    :math:`L(y,f(x))=\dfrac{1}{2}(y-f(x))^2`
+#.
+    Absolute loss (``CvGBTrees::ABSOLUTE_LOSS``):
+    :math:`L(y,f(x))=|y-f(x)|`
+#.
+    Huber loss (``CvGBTrees::HUBER_LOSS``):
+    :math:`L(y,f(x)) = \left\{ \begin{array}{lr}
+    \delta\cdot\left(|y-f(x)|-\dfrac{\delta}{2}\right) & : |y-f(x)|>\delta\\
+    \dfrac{1}{2}\cdot(y-f(x))^2 & : |y-f(x)|\leq\delta \end{array} \right.`,
+    where :math:`\delta` is the :math:`\alpha`-quantile estimation of
+    :math:`|y-f(x)|`. In the current implementation :math:`\alpha=0.2`.
+
+*for classification problems:*
+
+4.
+    Deviance or cross-entropy loss (``CvGBTrees::DEVIANCE_LOSS``):
+    :math:`K` functions are built, one function for each output class, and
+    :math:`L(y,f_1(x),...,f_K(x)) = -\sum^K_{k=1}1(y=k)\ln{p_k(x)}`,
+    where :math:`p_k(x)=\dfrac{\exp{f_k(x)}}{\sum^K_{i=1}\exp{f_i(x)}}`
+    is the estimation of the probability that :math:`y=k`.
+
+In the end we get the model in the following form:
+
+.. math:: f(x) = f_0 + \nu\cdot\sum^M_{i=1}T_i(x) ,
+
+where :math:`f_0` is the initial guess (the best constant model) and :math:`\nu`
+is a regularization parameter from the interval :math:`(0,1]`, further called
+*shrinkage*.
+
+
+.. _Predicting with GBT model:
+
+Predicting with GBT model
+-------------------------
+
+To get the GBT model prediction, one needs to compute the sum of responses of
+all the trees in the ensemble. For regression problems this sum is the answer;
+for classification problems the result is :math:`\arg\max_{i=1..K}(f_i(x))`.
+
+
+.. highlight:: cpp
+
+
+.. index:: CvGBTreesParams
+.. _CvGBTreesParams:
+
+CvGBTreesParams
+---------------
+.. c:type:: CvGBTreesParams
+
+GBT training parameters ::
+
+    struct CvGBTreesParams : public CvDTreeParams
+    {
+        int weak_count;
+        int loss_function_type;
+        float subsample_portion;
+        float shrinkage;
+
+        CvGBTreesParams();
+        CvGBTreesParams( int loss_function_type, int weak_count, float shrinkage,
+            float subsample_portion, int max_depth, bool use_surrogates );
+    };
+
+The structure contains parameters for each single decision tree in the
+ensemble, as well as characteristics of the whole model. The structure is
+derived from :ref:`CvDTreeParams`, but not all of the decision tree parameters
+are supported: cross-validation, pruning, and class priorities are not used.
+The full list of parameters is given below:
+
+``weak_count``
+
+    The number of boosting algorithm iterations. ``weak_count*K`` is the total
+    number of trees in the GBT model, where ``K`` is the number of output
+    classes (equal to one in the case of regression).
+
+``loss_function_type``
+
+    The type of the loss function used for training
+    (see :ref:`Training the GBT model`). It must be one of the
+    following: ``CvGBTrees::SQUARED_LOSS``, ``CvGBTrees::ABSOLUTE_LOSS``,
+    ``CvGBTrees::HUBER_LOSS``, ``CvGBTrees::DEVIANCE_LOSS``. The first three
+    are used for regression problems, and the last one for classification.
+
+``shrinkage``
+
+    Regularization parameter (see :ref:`Training the GBT model`).
+
+``subsample_portion``
+
+    The portion of the whole training set used on each algorithm iteration.
+    The subset is generated randomly. (For more information see
+    http://www.salfordsystems.com/doc/StochasticBoostingSS.pdf.)
+
+``max_depth``
+
+    The maximal depth of each decision tree in the ensemble (see :ref:`CvDTree`).
+
+``use_surrogates``
+
+    If ``true``, surrogate splits are built (see :ref:`CvDTree`).
+
+By default the following constructor is used:
+
+.. code-block:: cpp
+
+    CvGBTreesParams(CvGBTrees::SQUARED_LOSS, 200, 0.8f, 0.01f, 3, false)
+        : CvDTreeParams( 3, 10, 0, false, 10, 0, false, false, 0 )
+
+
+
+.. index:: CvGBTrees
+.. _CvGBTrees:
+
+CvGBTrees
+---------
+.. c:type:: CvGBTrees
+
+GBT model ::
+
+    class CvGBTrees : public CvStatModel
+    {
+    public:
+
+        enum {SQUARED_LOSS=0, ABSOLUTE_LOSS, HUBER_LOSS=3, DEVIANCE_LOSS};
+
+        CvGBTrees();
+        CvGBTrees( const cv::Mat& trainData, int tflag,
+                  const Mat& responses, const Mat& varIdx=Mat(),
+                  const Mat& sampleIdx=Mat(), const cv::Mat& varType=Mat(),
+                  const Mat& missingDataMask=Mat(),
+                  CvGBTreesParams params=CvGBTreesParams() );
+
+        virtual ~CvGBTrees();
+        virtual bool train( const Mat& trainData, int tflag,
+                  const Mat& responses, const Mat& varIdx=Mat(),
+                  const Mat& sampleIdx=Mat(), const Mat& varType=Mat(),
+                  const Mat& missingDataMask=Mat(),
+                  CvGBTreesParams params=CvGBTreesParams(),
+                  bool update=false );
+
+        virtual bool train( CvMLData* data,
+                  CvGBTreesParams params=CvGBTreesParams(),
+                  bool update=false );
+
+        virtual float predict( const Mat& sample, const Mat& missing=Mat(),
+                  const Range& slice = Range::all(),
+                  int k=-1 ) const;
+
+        virtual void clear();
+
+        virtual float calc_error( CvMLData* _data, int type,
+                  std::vector<float> *resp = 0 );
+
+        virtual void write( CvFileStorage* fs, const char* name ) const;
+
+        virtual void read( CvFileStorage* fs, CvFileNode* node );
+
+    protected:
+
+        CvDTreeTrainData* data;
+        CvGBTreesParams params;
+        CvSeq** weak;
+        Mat& orig_response;
+        Mat& sum_response;
+        Mat& sum_response_tmp;
+        Mat& weak_eval;
+        Mat& sample_idx;
+        Mat& subsample_train;
+        Mat& subsample_test;
+        Mat& missing;
+        Mat& class_labels;
+        RNG* rng;
+        int class_count;
+        float delta;
+        float base_value;
+
+        ...
+
+    };
+
+
+
+.. index:: CvGBTrees::train
+
+.. _CvGBTrees::train:
+
+CvGBTrees::train
+----------------
+.. c:function:: bool train(const Mat & trainData, int tflag, const Mat & responses, const Mat & varIdx=Mat(), const Mat & sampleIdx=Mat(), const Mat & varType=Mat(), const Mat & missingDataMask=Mat(), CvGBTreesParams params=CvGBTreesParams(), bool update=false)
+
+.. c:function:: bool train(CvMLData* data, CvGBTreesParams params=CvGBTreesParams(), bool update=false)
+
+    Trains a Gradient Boosted Trees model.
+
+The first train method follows the common template (see :ref:`CvStatModel::train`).
+Both ``tflag`` values (``CV_ROW_SAMPLE``, ``CV_COL_SAMPLE``) are supported.
+``trainData`` must be of the ``CV_32F`` type. ``responses`` must be a matrix of
+type ``CV_32S`` or ``CV_32F``; in both cases it is converted into a ``CV_32F``
+matrix inside the training procedure. ``varIdx`` and ``sampleIdx`` must be a
+list of indices (``CV_32S``) or a mask (``CV_8U`` or ``CV_8S``). ``update`` is
+a dummy parameter.
+
+The second form of the :ref:`CvGBTrees::train` function uses :ref:`CvMLData` as
+a data set container. ``update`` is still a dummy parameter.
+
+All parameters specific to the GBT model are passed into the training function
+as a :ref:`CvGBTreesParams` structure.
+
+
+.. index:: CvGBTrees::predict
+
+.. _CvGBTrees::predict:
+
+CvGBTrees::predict
+------------------
+.. c:function:: float predict(const Mat & sample, const Mat & missing=Mat(), const Range & slice = Range::all(), int k=-1) const
+
+    Predicts a response for an input sample.
+
+The method predicts the response corresponding to the given sample
+(see :ref:`Predicting with GBT model`).
+The result is either the class label or the estimated function value. The
+:c:func:`predict` method makes it possible to use the parallel version of GBT
+model prediction if OpenCV is built with the TBB library. In this case
+predictions of single trees are computed in parallel.
+
+``sample``
+
+    An input feature vector that has the same format as every training set
+    element. Hence, if not all the variables were actually used during
+    training, ``sample`` has to contain dummy values at the appropriate
+    positions.
+
+``missing``
+
+    The missing values mask. A one-dimensional matrix of the same size as
+    ``sample`` having the ``CV_8U`` type. ``1`` corresponds to the missing
+    value in the same position in the ``sample`` vector. If there are no
+    missing values in the feature vector, an empty matrix can be passed
+    instead of the missing mask.
+
+``weak_responses``
+
+    In addition to the prediction of the whole model, the predictions of all
+    the trees can be obtained by passing a ``weak_responses`` matrix with
+    :math:`K` rows, where :math:`K` is the number of output classes (1 in the
+    case of regression), and having as many columns as the ``slice`` length.
+
+``slice``
+
+    Defines the part of the ensemble used for prediction.
+    All trees are used when ``slice = Range::all()``. This parameter is useful
+    for getting predictions of GBT models with different ensemble sizes while
+    training only one model.
+
+``k``
+
+    In the case of a classification problem, not one but :math:`K` tree
+    ensembles are built (see :ref:`Training the GBT model`). By passing this
+    parameter, the output can be changed to the sum of the trees' predictions
+    in the ``k``-th ensemble only. To get the total GBT model prediction, ``k``
+    must be -1. For regression problems, ``k`` also has to be -1.
+
+
+
+.. index:: CvGBTrees::clear
+
+.. _CvGBTrees::clear:
+
+CvGBTrees::clear
+----------------
+.. c:function:: void clear()
+
+    Clears the model.
+
+The function deletes the data set information and all the weak models, and
+sets all internal variables to the initial state. It is called in
+:ref:`CvGBTrees::train` and in the destructor.
+
+
+.. index:: CvGBTrees::calc_error
+
+.. _CvGBTrees::calc_error:
+
+CvGBTrees::calc_error
+---------------------
+.. c:function:: float calc_error( CvMLData* _data, int type, std::vector<float> *resp = 0 )
+
+    Calculates a training or testing error.
+
+If :ref:`CvMLData` is used to store the data set, :c:func:`calc_error` can be
+used to easily get a training or testing error and, optionally, all
+predictions on the training/testing set. If the TBB library is used, the error
+is computed in a parallel way: predictions for different samples are computed
+at the same time. In the case of a regression problem the mean squared error
+is returned; for classification problems the result is the misclassification
+error in percent.
+
+``_data``
+
+    Data set.
+
+``type``
+
+    Defines which error should be computed: train (``CV_TRAIN_ERROR``) or test
+    (``CV_TEST_ERROR``).
+
+``resp``
+
+    If not ``0``, a vector of predictions on the corresponding data set is
+    returned.
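+
+The following minimal sketch shows the typical call sequence for the classes
+above. The data here is randomly generated and merely stands in for a real
+training set, so the learned model itself is meaningless; only the API usage
+is illustrated:
+
+.. code-block:: cpp
+
+    #include "opencv2/ml/ml.hpp"
+    #include <cstdio>
+
+    int main()
+    {
+        // 100 training samples with 5 ordered features each (one sample per row).
+        cv::Mat trainData(100, 5, CV_32F);
+        cv::Mat responses(100, 1, CV_32F);
+        cv::randu(trainData, cv::Scalar(0), cv::Scalar(1));
+        cv::randu(responses, cv::Scalar(0), cv::Scalar(1));
+
+        // Squared loss (regression), 100 boosting iterations, shrinkage 0.1,
+        // 80% of the training set per iteration, depth-3 trees, no surrogates.
+        CvGBTreesParams params(CvGBTrees::SQUARED_LOSS, 100, 0.1f, 0.8f, 3, false);
+
+        CvGBTrees gbt;
+        gbt.train(trainData, CV_ROW_SAMPLE, responses,
+                  cv::Mat(), cv::Mat(), cv::Mat(), cv::Mat(), params);
+
+        // Predict the response for a new sample using the whole ensemble.
+        cv::Mat sample(1, 5, CV_32F, cv::Scalar(0.5));
+        printf("prediction: %f\n", gbt.predict(sample));
+
+        return 0;
+    }
+
+``CV_ROW_SAMPLE`` states that each row of ``trainData`` is one sample; with
+``CV_COL_SAMPLE`` samples are taken from the columns instead.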
+
diff --git a/modules/ml/doc/ml.rst b/modules/ml/doc/ml.rst
index 4f0d6bb..1cc01b9 100644
--- a/modules/ml/doc/ml.rst
+++ b/modules/ml/doc/ml.rst
@@ -15,7 +15,7 @@ Most of the classification and regression algorithms are implemented as C++ clas
     support_vector_machines
     decision_trees
     boosting
+    gradient_boosted_trees
     random_trees
     expectation_maximization
     neural_networks
-
diff --git a/modules/ml/include/opencv2/ml/ml.hpp b/modules/ml/include/opencv2/ml/ml.hpp
index 2263fbe..33782cf 100644
--- a/modules/ml/include/opencv2/ml/ml.hpp
+++ b/modules/ml/include/opencv2/ml/ml.hpp
@@ -1571,7 +1571,7 @@ public:
     // Response value prediction
     //
     // API
-    // virtual float predict( const CvMat* sample, const CvMat* missing=0,
+    // virtual float predict_serial( const CvMat* sample, const CvMat* missing=0,
             CvMat* weak_responses=0, CvSlice slice = CV_WHOLE_SEQ,
             int k=-1 ) const;
 
@@ -1594,12 +1594,44 @@ public:
     // RESULT
     // Predicted value.
     */
+    virtual float predict_serial( const CvMat* sample, const CvMat* missing=0,
+            CvMat* weakResponses=0, CvSlice slice = CV_WHOLE_SEQ,
+            int k=-1 ) const;
+
+    /*
+    // Response value prediction.
+    // Parallel version (in the case of TBB existence)
+    //
+    // API
+    // virtual float predict( const CvMat* sample, const CvMat* missing=0,
+            CvMat* weak_responses=0, CvSlice slice = CV_WHOLE_SEQ,
+            int k=-1 ) const;
+
+    // INPUT
+    // sample - input sample of the same type as in the training set.
+    // missing - missing values mask. missing=0 if there are no
+    //           missing values in sample vector.
+    // weak_responses - predictions of all of the trees.
+    //                  not implemented (!)
+    // slice - part of the ensemble used for prediction.
+    //         slice = CV_WHOLE_SEQ when all trees are used.
+    // k - number of ensemble used.
+    //     k is in {-1,0,1,..,<count of output classes>-1}.
+    //     in the case of classification problem
+    //     <count of output classes> ensembles are built.
+    //     If k = -1 ordinary prediction is the result,
+    //     otherwise function gives the prediction of the
+    //     k-th ensemble only.
+    // OUTPUT
+    // RESULT
+    // Predicted value.
+    */
     virtual float predict( const CvMat* sample, const CvMat* missing=0,
             CvMat* weakResponses=0, CvSlice slice = CV_WHOLE_SEQ,
             int k=-1 ) const;
 
     /*
-    // Delete all temporary data.
+    // Deletes all the data.
     //
     // API
     // virtual void clear();
@@ -1607,7 +1639,7 @@ public:
     // INPUT
     // OUTPUT
     // delete data, weak, orig_response, sum_response,
-    //        weak_eval, ubsample_train, subsample_test,
+    //        weak_eval, subsample_train, subsample_test,
     //        sample_idx, missing, lass_labels
     // delta = 0.0
     // RESULT
@@ -1623,7 +1655,7 @@ public:
     //
     // INPUT
     // data - dataset
-    // type - defines which error is to compute^ train (CV_TRAIN_ERROR) or
+    // type - defines which error is to compute: train (CV_TRAIN_ERROR) or
     //        test (CV_TEST_ERROR).
     // OUTPUT
     // resp - vector of predicitons
     // RESULT
     // Error value.
     */
     virtual float calc_error( CvMLData* _data, int type,
             std::vector<float> *resp = 0 );
 
-
     /*
     //
     // Write parameters of the gtb model and data. Write learned model.
@@ -1852,7 +1883,6 @@ protected:
     CvMat* orig_response;
     CvMat* sum_response;
     CvMat* sum_response_tmp;
-    CvMat* weak_eval;
     CvMat* sample_idx;
     CvMat* subsample_train;
     CvMat* subsample_test;
diff --git a/modules/ml/src/gbt.cpp b/modules/ml/src/gbt.cpp
index d512fea..60b1469 100644
--- a/modules/ml/src/gbt.cpp
+++ b/modules/ml/src/gbt.cpp
@@ -59,7 +59,7 @@ CvGBTrees::CvGBTrees()
     weak = 0;
     default_model_name = "my_boost_tree";
    orig_response = sum_response = sum_response_tmp = 0;
-    weak_eval = subsample_train = subsample_test = 0;
+    subsample_train = subsample_test = 0;
     missing = sample_idx = 0;
     class_labels = 0;
     class_count = 1;
@@ -117,7 +117,6 @@ void CvGBTrees::clear()
         cvReleaseMat( &orig_response );
         cvReleaseMat( &sum_response );
         cvReleaseMat( &sum_response_tmp );
-        cvReleaseMat( &weak_eval );
         cvReleaseMat( &subsample_train );
         cvReleaseMat( &subsample_test );
         cvReleaseMat( &sample_idx );
@@ -143,7 +142,7 @@ CvGBTrees::CvGBTrees( const CvMat* _train_data, int _tflag,
     data = 0;
     default_model_name = "my_boost_tree";
     orig_response = sum_response = sum_response_tmp = 0;
-    weak_eval = subsample_train = subsample_test = 0;
+    subsample_train = subsample_test = 0;
     missing = sample_idx = 0;
     class_labels = 0;
     class_count = 1;
@@ -276,7 +275,7 @@ CvGBTrees::train( const CvMat* _train_data, int _tflag,
     {
         int sample_idx_len = get_len(_sample_idx);
 
-        switch (CV_ELEM_SIZE(_sample_idx->type))
+        switch (CV_MAT_TYPE(_sample_idx->type))
         {
             case CV_32SC1:
             {
@@ -818,20 +817,31 @@ void CvGBTrees::do_subsample()
 
 //===========================================================================
 
-float CvGBTrees::predict( const CvMat* _sample, const CvMat* _missing,
-        CvMat* /*weak_responses*/, CvSlice slice, int k) const
+float CvGBTrees::predict_serial( const CvMat* _sample, const CvMat* _missing,
+        CvMat* weak_responses, CvSlice slice, int k) const
 {
     float result = 0.0f;
     if (!weak) return 0.0f;
-    float* sum = new float[class_count];
-    for (int i=0; i<class_count; ++i)
-        sum[i] = 0.0f;
+    if (weak_responses)
+    {
+        if (CV_MAT_TYPE(weak_responses->type) != CV_32F)
+            return 0.0f;
+        if ((k >= 0) && (k < class_count) && (weak_responses->rows != 1))
+            return 0.0f;
+        if ((k == -1) && (weak_responses->rows != class_count))
+            return 0.0f;
+        if (weak_responses->cols != weak_count)
+            return 0.0f;
+    }
+
+    float* sum = new float[class_count];
+    memset(sum, 0, class_count*sizeof(float));
     for (int i=0; i<class_count; ++i)
     {
         if ((weak[i]) && (weak_count))
         {
             cvStartReadSeq( weak[i], &reader );
             cvSetSeqReaderPos( &reader, slice.start_index );
             for (int j=0; j<weak_count; ++j)
             {
                 CV_READ_SEQ_ELEM( tree, reader );
-                sum[i] += params.shrinkage * (float)(tree->predict(_sample, _missing)->value);
+                float p = (float)(tree->predict(_sample, _missing)->value);
+                sum[i] += params.shrinkage * p;
+                if (weak_responses)
+                    weak_responses->data.fl[i*weak_count+j] = p;
             }
         }
     }
+
+    for (int i=0; i<class_count; ++i)
+        sum[i] += base_value;
+
+
+class Tree_predictor
+{
+private:
+    pCvSeq* weak;
+    float* sum;
+    const int k;
+    const CvMat* sample;
+    const CvMat* missing;
+    const float shrinkage;
+
+#ifdef HAVE_TBB
+    static tbb::spin_mutex SumMutex;
+#endif
+
+public:
+    Tree_predictor() : weak(0), sum(0), k(0), sample(0), missing(0),
+                       shrinkage(1.0f) {}
+
+    Tree_predictor(pCvSeq* _weak, const int _k, const float _shrinkage,
+                   const CvMat* _sample, const CvMat* _missing, float* _sum ) :
+                   weak(_weak), sum(_sum), k(_k), sample(_sample),
+                   missing(_missing), shrinkage(_shrinkage)
+    {}
+
+    Tree_predictor( const Tree_predictor& p, cv::Split ) :
+                   weak(p.weak), sum(p.sum), k(p.k), sample(p.sample),
+                   missing(p.missing), shrinkage(p.shrinkage)
+    {}
+
+    virtual void operator()(const cv::BlockedRange& range) const
+    {
+#ifdef HAVE_TBB
+        tbb::spin_mutex::scoped_lock lock;
+#endif
+        CvSeqReader reader;
+        int begin = range.begin();
+        int end = range.end();
+
+        int weak_count = end - begin;
+        CvDTree* tree;
+
+        for (int i=0; i<k; ++i)
+        {
+            float tmp_sum = 0.0f;
+            if ((weak[i]) && (weak_count))
+            {
+                cvStartReadSeq( weak[i], &reader );
+                cvSetSeqReaderPos( &reader, begin );
+                for (int j=0; j<weak_count; ++j)
+                {
+                    CV_READ_SEQ_ELEM( tree, reader );
+                    tmp_sum += shrinkage *
+                               (float)(tree->predict(sample, missing)->value);
+                }
+            }
+#ifdef HAVE_TBB
+            lock.acquire(SumMutex);
+            sum[i] += tmp_sum;
+            lock.release();
+#else
+            sum[i] += tmp_sum;
+#endif
+        }
+    } // Tree_predictor::operator()
+
+}; // class Tree_predictor
+
+
+#ifdef HAVE_TBB
+tbb::spin_mutex Tree_predictor::SumMutex;
+#endif
+
+
+
+float CvGBTrees::predict( const CvMat* _sample, const CvMat* _missing,
+            CvMat* /*weak_responses*/, CvSlice slice, int k) const
+{
+    float result = 0.0f;
+    if (!weak) return 0.0f;
+    float* sum = new float[class_count];
+    for (int i=0; i<class_count; ++i)
+        sum[i] = 0.0f;
+
+    int begin = slice.start_index;
+    int end = begin + cvSliceLength( slice, weak[0] );
+
+    Tree_predictor predictor = Tree_predictor(weak, class_count,
+                                    params.shrinkage, _sample, _missing, sum);
+
+    cv::parallel_for(cv::BlockedRange(begin, end), predictor);
+
+    for (int i=0; i<class_count; ++i)
+        sum[i] += base_value;
+
+    if (class_count == 1)
+    {
+        result = sum[0];
+        delete[] sum;
+        return result;
+    }
+
+    if ((k>=0) && (k<class_count))
+    {
+        result = sum[k];
+        delete[] sum;
+        return result;
+    }
+
+    float max = sum[0];
+    int class_label = 0;
+    for (int i=1; i<class_count; ++i)
+        if (sum[i] > max)
+        {
+            max = sum[i];
+            class_label = i;
+        }
+
+    delete[] sum;
+    int orig_class_label = class_labels->data.i[class_label];
+
+    return float(orig_class_label);
+}
+
+
 //===========================================================================
 
 void CvGBTrees::write_params( CvFileStorage* fs ) const
@@ -1080,69 +1226,126 @@ void CvGBTrees::read( CvFileStorage* fs, CvFileNode* node )
 
 //===========================================================================
 
+class Sample_predictor
+{
+private:
+
+    const CvGBTrees* gbt;
+    float* predictions;
+    const CvMat* samples;
+    const CvMat* missing;
+    const CvMat* idx;
+    CvSlice slice;
+
+public:
+    Sample_predictor() : gbt(0), predictions(0), samples(0), missing(0),
+                         idx(0), slice(CV_WHOLE_SEQ)
+    {}
+
+    Sample_predictor(const CvGBTrees* _gbt, float* _predictions,
+                     const CvMat* _samples, const CvMat* _missing,
+                     const CvMat* _idx, CvSlice _slice=CV_WHOLE_SEQ) :
+                     gbt(_gbt), predictions(_predictions), samples(_samples),
+                     missing(_missing), idx(_idx), slice(_slice)
+    {}
+
+
+    Sample_predictor( const Sample_predictor& p, cv::Split ) :
+            gbt(p.gbt), predictions(p.predictions),
+            samples(p.samples), missing(p.missing), idx(p.idx),
+            slice(p.slice)
+    {}
+
+
+    virtual void operator()(const cv::BlockedRange& range) const
+    {
+        int begin = range.begin();
+        int end = range.end();
+
+        CvMat x;
+        CvMat miss;
+
+        for (int i=begin; i<end; ++i)
+        {
+            int j = idx ? idx->data.i[i] : i;
+            cvGetRow(samples, &x, j);
+            if (!missing)
+            {
+                predictions[i] = gbt->predict_serial(&x,0,0,slice);
+            }
+            else
+            {
+                cvGetRow(missing, &miss, j);
+                predictions[i] = gbt->predict_serial(&x,&miss,0,slice);
+            }
+        }
+    } // Sample_predictor::operator()
+
+}; // class Sample_predictor
+
+
+
 // type in {CV_TRAIN_ERROR, CV_TEST_ERROR}
 float CvGBTrees::calc_error( CvMLData* _data, int type, std::vector<float> *resp )
 {
-    float err = 0;
-    const CvMat* values = _data->get_values();
+
+    float err = 0.0f;
+    const CvMat* sample_idx = (type == CV_TRAIN_ERROR) ?
+                              _data->get_train_sample_idx() :
+                              _data->get_test_sample_idx();
     const CvMat* response = _data->get_responses();
-    const CvMat* missing = _data->get_missing();
-    const CvMat* sample_idx = (type == CV_TEST_ERROR) ?
-                              _data->get_test_sample_idx() :
-                              _data->get_train_sample_idx();
-    //const CvMat* var_types = _data->get_var_types();
-    int* sidx = sample_idx ? sample_idx->data.i : 0;
-    int r_step = CV_IS_MAT_CONT(response->type) ?
-                1 : response->step / CV_ELEM_SIZE(response->type);
-    //bool is_classifier =
-    //    var_types->data.ptr[var_types->cols-1] == CV_VAR_CATEGORICAL;
-    int sample_count = sample_idx ? sample_idx->cols : 0;
-    sample_count = (type == CV_TRAIN_ERROR && sample_count == 0) ?
-                   values->rows :
-                   sample_count;
-    float* pred_resp = 0;
-    if( resp && (sample_count > 0) )
+
+    int n = sample_idx ? get_len(sample_idx) : 0;
+    n = (type == CV_TRAIN_ERROR && n == 0) ? _data->get_values()->rows : n;
+
+    if (!n)
+        return -FLT_MAX;
+
+    float* pred_resp = 0;
+    if (resp)
     {
-        resp->resize( sample_count );
+        resp->resize(n);
         pred_resp = &((*resp)[0]);
     }
+    else
+        pred_resp = new float[n];
+
+    Sample_predictor predictor = Sample_predictor(this, pred_resp, _data->get_values(),
+            _data->get_missing(), sample_idx);
+
+//#ifdef HAVE_TBB
+//    tbb::parallel_for(cv::BlockedRange(0,n), predictor, tbb::auto_partitioner());
+//#else
+    cv::parallel_for(cv::BlockedRange(0,n), predictor);
+//#endif
+
+    int* sidx = sample_idx ? sample_idx->data.i : 0;
+    int r_step = CV_IS_MAT_CONT(response->type) ?
+                1 : response->step / CV_ELEM_SIZE(response->type);
+
+
     if ( !problem_type() )
     {
-        for( int i = 0; i < sample_count; i++ )
+        for( int i = 0; i < n; i++ )
         {
-            CvMat sample, miss;
             int si = sidx ? sidx[i] : i;
-            cvGetRow( values, &sample, si );
-            if( missing )
-                cvGetRow( missing, &miss, si );
-            float r = (float)predict( &sample, missing ? &miss : 0 );
-            if( pred_resp )
-                pred_resp[i] = r;
-            int d = fabs((double)r - response->data.fl[si*r_step]) <= FLT_EPSILON ? 0 : 1;
+            int d = fabs((double)pred_resp[i] - response->data.fl[si*r_step]) <= FLT_EPSILON ? 0 : 1;
             err += d;
         }
-        err = sample_count ? err / (float)sample_count * 100 : -FLT_MAX;
+        err = err / (float)n * 100.0f;
     }
     else
     {
-        for( int i = 0; i < sample_count; i++ )
+        for( int i = 0; i < n; i++ )
        {
-            CvMat sample, miss;
             int si = sidx ? sidx[i] : i;
-            cvGetRow( values, &sample, si );
-            if( missing )
-                cvGetRow( missing, &miss, si );
-            float r = (float)predict( &sample, missing ? &miss : 0 );
-            if( pred_resp )
-                pred_resp[i] = r;
-            float d = r - response->data.fl[si*r_step];
+            float d = pred_resp[i] - response->data.fl[si*r_step];
             err += d*d;
         }
-        err = sample_count ? err / (float)sample_count : -FLT_MAX;
+        err = err / (float)n;
     }
+
     return err;
-
 }
@@ -1156,7 +1359,7 @@ CvGBTrees::CvGBTrees( const cv::Mat& trainData, int tflag,
     weak = 0;
     default_model_name = "my_boost_tree";
     orig_response = sum_response = sum_response_tmp = 0;
-    weak_eval = subsample_train = subsample_test = 0;
+    subsample_train = subsample_test = 0;
     missing = sample_idx = 0;
     class_labels = 0;
     class_count = 1;
diff --git a/samples/c/tree_engine.cpp b/samples/c/tree_engine.cpp
index 4f41884..2517953 100644
--- a/samples/c/tree_engine.cpp
+++ b/samples/c/tree_engine.cpp
@@ -125,7 +125,10 @@ int main(int argc, char** argv)
         print_result( ertrees.calc_error( &data, CV_TRAIN_ERROR), ertrees.calc_error( &data, CV_TEST_ERROR ), ertrees.get_var_importance() );
 
         printf("======GBTREES=====\n");
-        gbtrees.train( &data, CvGBTreesParams(CvGBTrees::DEVIANCE_LOSS, 100, 0.05f, 0.6f, 10, true));
+        if (categorical_response)
+            gbtrees.train( &data, CvGBTreesParams(CvGBTrees::DEVIANCE_LOSS, 100, 0.1f, 0.8f, 5, false));
+        else
+            gbtrees.train( &data, CvGBTreesParams(CvGBTrees::SQUARED_LOSS, 100, 0.1f, 0.8f, 5, false));
         print_result( gbtrees.calc_error( &data, CV_TRAIN_ERROR), gbtrees.calc_error( &data, CV_TEST_ERROR ), 0 ); //doesn't compute importance
     }
     else
-- 
2.7.4