calcError now uses weighted samples (#10346)
diff --git a/modules/ml/src/ann_mlp.cpp b/modules/ml/src/ann_mlp.cpp
index 69e4448..cd10c0d 100644
 
 #include "precomp.hpp"
 
-CvANN_MLP_TrainParams::CvANN_MLP_TrainParams()
-{
-    term_crit = cvTermCriteria( CV_TERMCRIT_ITER + CV_TERMCRIT_EPS, 1000, 0.01 );
-    train_method = RPROP;
-    bp_dw_scale = bp_moment_scale = 0.1;
-    rp_dw0 = 0.1; rp_dw_plus = 1.2; rp_dw_minus = 0.5;
-    rp_dw_min = FLT_EPSILON; rp_dw_max = 50.;
-}
+namespace cv { namespace ml {
 
-
-CvANN_MLP_TrainParams::CvANN_MLP_TrainParams( CvTermCriteria _term_crit,
-                                              int _train_method,
-                                              double _param1, double _param2 )
+struct SimulatedAnnealingSolver::Impl
 {
-    term_crit = _term_crit;
-    train_method = _train_method;
-    bp_dw_scale = bp_moment_scale = 0.1;
-    rp_dw0 = 1.; rp_dw_plus = 1.2; rp_dw_minus = 0.5;
-    rp_dw_min = FLT_EPSILON; rp_dw_max = 50.;
-
-    if( train_method == RPROP )
+    int refcount;
+
+    const Ptr<SimulatedAnnealingSolverSystem> systemPtr;
+    SimulatedAnnealingSolverSystem& system;
+    RNG rEnergy;
+    double coolingRatio;
+    double initialT;
+    double finalT;
+    int iterPerStep;
+
+    Impl(const Ptr<SimulatedAnnealingSolverSystem>& s) :
+        refcount(1),
+        systemPtr(s),
+        system(*(s.get())),
+        rEnergy(12345)
     {
-        rp_dw0 = _param1;
-        if( rp_dw0 < FLT_EPSILON )
-            rp_dw0 = 1.;
-        rp_dw_min = _param2;
-        rp_dw_min = MAX( rp_dw_min, 0 );
+        CV_Assert(!systemPtr.empty());
+        initialT = 2;
+        finalT = 0.1;
+        coolingRatio = 0.95;
+        iterPerStep = 100;
     }
-    else if( train_method == BACKPROP )
+
+    inline double energy() { return system.energy(); }
+    inline void changeState() { system.changeState(); }
+    inline void reverseState() { system.reverseState(); }
+
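+    // manual atomic reference counting: the last release() deletes the Impl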
+    void addref() { CV_XADD(&refcount, 1); }
+    void release() { if (CV_XADD(&refcount, -1) == 1) delete this; }
+protected:
+    virtual ~Impl() { CV_Assert(refcount==0); }
+};
+
+struct AnnParams
+{
+    AnnParams()
     {
-        bp_dw_scale = _param1;
-        if( bp_dw_scale <= 0 )
-            bp_dw_scale = 0.1;
-        bp_dw_scale = MAX( bp_dw_scale, 1e-3 );
-        bp_dw_scale = MIN( bp_dw_scale, 1 );
-        bp_moment_scale = _param2;
-        if( bp_moment_scale < 0 )
-            bp_moment_scale = 0.1;
-        bp_moment_scale = MIN( bp_moment_scale, 1 );
+        termCrit = TermCriteria( TermCriteria::COUNT + TermCriteria::EPS, 1000, 0.01 );
+        trainMethod = ANN_MLP::RPROP;
+        bpDWScale = bpMomentScale = 0.1;
+        rpDW0 = 0.1; rpDWPlus = 1.2; rpDWMinus = 0.5;
+        rpDWMin = FLT_EPSILON; rpDWMax = 50.;
+        initialT = 10; finalT = 0.1; coolingRatio = 0.95; itePerStep = 10;
+        rEnergy = cv::RNG(12345);
     }
-    else
-        train_method = RPROP;
-}
 
+    TermCriteria termCrit;
+    int trainMethod;
 
-CvANN_MLP_TrainParams::~CvANN_MLP_TrainParams()
-{
-}
+    double bpDWScale;
+    double bpMomentScale;
 
+    double rpDW0;
+    double rpDWPlus;
+    double rpDWMinus;
+    double rpDWMin;
+    double rpDWMax;
 
-CvANN_MLP::CvANN_MLP()
+    double initialT;
+    double finalT;
+    double coolingRatio;
+    int itePerStep;
+    RNG rEnergy;
+};
+
+template <typename T>
+inline T inBounds(T val, T min_val, T max_val)
 {
-    layer_sizes = wbuf = 0;
-    min_val = max_val = min_val1 = max_val1 = 0.;
-    weights = 0;
-    rng = &cv::theRNG();
-    default_model_name = "my_nn";
-    clear();
+    return std::min(std::max(val, min_val), max_val);
 }
 
-
-CvANN_MLP::CvANN_MLP( const CvMat* _layer_sizes,
-                      int _activ_func,
-                      double _f_param1, double _f_param2 )
+SimulatedAnnealingSolver::SimulatedAnnealingSolver(const Ptr<SimulatedAnnealingSolverSystem>& system)
 {
-    layer_sizes = wbuf = 0;
-    min_val = max_val = min_val1 = max_val1 = 0.;
-    weights = 0;
-    rng = &cv::theRNG();
-    default_model_name = "my_nn";
-    create( _layer_sizes, _activ_func, _f_param1, _f_param2 );
+    impl = new Impl(system);
 }
 
-
-CvANN_MLP::~CvANN_MLP()
+SimulatedAnnealingSolver::SimulatedAnnealingSolver(const SimulatedAnnealingSolver& b)
 {
-    clear();
+    // impl is uninitialized inside a constructor, so do not call release()
+    // on it; just share b's impl and bump its reference count
+    impl = b.impl;
+    if (impl) impl->addref();
 }
 
-
-void CvANN_MLP::clear()
+void SimulatedAnnealingSolver::release()
 {
-    cvReleaseMat( &layer_sizes );
-    cvReleaseMat( &wbuf );
-    cvFree( &weights );
-    activ_func = SIGMOID_SYM;
-    f_param1 = f_param2 = 1;
-    max_buf_sz = 1 << 12;
+    if (impl) { impl->release(); impl = NULL; }
 }
 
-
-void CvANN_MLP::set_activ_func( int _activ_func, double _f_param1, double _f_param2 )
+void SimulatedAnnealingSolver::setIterPerStep(int ite)
 {
-    CV_FUNCNAME( "CvANN_MLP::set_activ_func" );
-
-    __BEGIN__;
-
-    if( _activ_func < 0 || _activ_func > GAUSSIAN )
-        CV_ERROR( CV_StsOutOfRange, "Unknown activation function" );
-
-    activ_func = _activ_func;
-
-    switch( activ_func )
-    {
-    case SIGMOID_SYM:
-        max_val = 0.95; min_val = -max_val;
-        max_val1 = 0.98; min_val1 = -max_val1;
-        if( fabs(_f_param1) < FLT_EPSILON )
-            _f_param1 = 2./3;
-        if( fabs(_f_param2) < FLT_EPSILON )
-            _f_param2 = 1.7159;
-        break;
-    case GAUSSIAN:
-        max_val = 1.; min_val = 0.05;
-        max_val1 = 1.; min_val1 = 0.02;
-        if( fabs(_f_param1) < FLT_EPSILON )
-            _f_param1 = 1.;
-        if( fabs(_f_param2) < FLT_EPSILON )
-            _f_param2 = 1.;
-        break;
-    default:
-        min_val = max_val = min_val1 = max_val1 = 0.;
-        _f_param1 = 1.;
-        _f_param2 = 0.;
-    }
-
-    f_param1 = _f_param1;
-    f_param2 = _f_param2;
-
-    __END__;
+    CV_Assert(impl);
+    CV_Assert(ite>0);
+    impl->iterPerStep = ite;
 }
 
-
-void CvANN_MLP::init_weights()
+int SimulatedAnnealingSolver::run()
 {
-    int i, j, k;
-
-    for( i = 1; i < layer_sizes->cols; i++ )
+    CV_Assert(impl);
+    CV_Assert(impl->initialT>impl->finalT);
+    double Ti = impl->initialT;
+    double previousEnergy = impl->energy();
+    int exchange = 0;
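+    // classic simulated annealing: run iterPerStep candidate moves at each
+    // temperature, then cool down until finalT is reached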
+    while (Ti > impl->finalT)
     {
-        int n1 = layer_sizes->data.i[i-1];
-        int n2 = layer_sizes->data.i[i];
-        double val = 0, G = n2 > 2 ? 0.7*pow((double)n1,1./(n2-1)) : 1.;
-        double* w = weights[i];
-
-        // initialize weights using Nguyen-Widrow algorithm
-        for( j = 0; j < n2; j++ )
+        for (int i = 0; i < impl->iterPerStep; i++)
         {
-            double s = 0;
-            for( k = 0; k <= n1; k++ )
+            impl->changeState();
+            double newEnergy = impl->energy();
+            if (newEnergy < previousEnergy)
             {
-                val = rng->uniform(0., 1.)*2-1.;
-                w[k*n2 + j] = val;
-                s += fabs(val);
+                previousEnergy = newEnergy;
+                exchange++;
             }
-
-            if( i < layer_sizes->cols - 1 )
+            else
             {
-                s = 1./(s - fabs(val));
-                for( k = 0; k <= n1; k++ )
-                    w[k*n2 + j] *= s;
-                w[n1*n2 + j] *= G*(-1+j*2./n2);
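+                // Metropolis acceptance: keep a worse state with
+                // probability exp(-(newEnergy - previousEnergy) / Ti)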
+                double r = impl->rEnergy.uniform(0.0, 1.0);
+                if (r < std::exp(-(newEnergy - previousEnergy) / Ti))
+                {
+                    previousEnergy = newEnergy;
+                    exchange++;
+                }
+                else
+                {
+                    impl->reverseState();
+                }
             }
+
         }
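+        // geometric cooling: T <- coolingRatio * T, so roughly
+        // log(finalT / initialT) / log(coolingRatio) temperature steps run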
+        Ti *= impl->coolingRatio;
     }
+    impl->finalT = Ti;
+    return exchange;
 }
 
-
-void CvANN_MLP::create( const CvMat* _layer_sizes, int _activ_func,
-                        double _f_param1, double _f_param2 )
+void SimulatedAnnealingSolver::setEnergyRNG(const RNG& rng)
 {
-    CV_FUNCNAME( "CvANN_MLP::create" );
-
-    __BEGIN__;
-
-    int i, l_step, l_count, buf_sz = 0;
-    int *l_src, *l_dst;
-
-    clear();
-
-    if( !CV_IS_MAT(_layer_sizes) ||
-        (_layer_sizes->cols != 1 && _layer_sizes->rows != 1) ||
-        CV_MAT_TYPE(_layer_sizes->type) != CV_32SC1 )
-        CV_ERROR( CV_StsBadArg,
-        "The array of layer neuron counters must be an integer vector" );
-
-    CV_CALL( set_activ_func( _activ_func, _f_param1, _f_param2 ));
-
-    l_count = _layer_sizes->rows + _layer_sizes->cols - 1;
-    l_src = _layer_sizes->data.i;
-    l_step = CV_IS_MAT_CONT(_layer_sizes->type) ? 1 :
-                _layer_sizes->step / sizeof(l_src[0]);
-    CV_CALL( layer_sizes = cvCreateMat( 1, l_count, CV_32SC1 ));
-    l_dst = layer_sizes->data.i;
-
-    max_count = 0;
-    for( i = 0; i < l_count; i++ )
-    {
-        int n = l_src[i*l_step];
-        if( n < 1 + (0 < i && i < l_count-1))
-            CV_ERROR( CV_StsOutOfRange,
-            "there should be at least one input and one output "
-            "and every hidden layer must have more than 1 neuron" );
-        l_dst[i] = n;
-        max_count = MAX( max_count, n );
-        if( i > 0 )
-            buf_sz += (l_dst[i-1]+1)*n;
-    }
-
-    buf_sz += (l_dst[0] + l_dst[l_count-1]*2)*2;
+    CV_Assert(impl);
+    impl->rEnergy = rng;
+}
 
-    CV_CALL( wbuf = cvCreateMat( 1, buf_sz, CV_64F ));
-    CV_CALL( weights = (double**)cvAlloc( (l_count+2)*sizeof(weights[0]) ));
+void SimulatedAnnealingSolver::setInitialTemperature(double x)
+{
+    CV_Assert(impl);
+    CV_Assert(x>0);
+    impl->initialT = x;
+}
 
-    weights[0] = wbuf->data.db;
-    weights[1] = weights[0] + l_dst[0]*2;
-    for( i = 1; i < l_count; i++ )
-        weights[i+1] = weights[i] + (l_dst[i-1] + 1)*l_dst[i];
-    weights[l_count+1] = weights[l_count] + l_dst[l_count-1]*2;
+void SimulatedAnnealingSolver::setFinalTemperature(double x)
+{
+    CV_Assert(impl);
+    CV_Assert(x>0);
+    impl->finalT = x;
+}
 
-    __END__;
+double SimulatedAnnealingSolver::getFinalTemperature()
+{
+    CV_Assert(impl);
+    return impl->finalT;
 }
 
+void SimulatedAnnealingSolver::setCoolingRatio(double x)
+{
+    CV_Assert(impl);
+    CV_Assert(x>0 && x<1);
+    impl->coolingRatio = x;
+}
 
-float CvANN_MLP::predict( const CvMat* _inputs, CvMat* _outputs ) const
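+// A minimal usage sketch of the solver (MySystem is a hypothetical
+// SimulatedAnnealingSolverSystem implementation, not part of this patch):
+//
+//     Ptr<SimulatedAnnealingSolverSystem> sys = makePtr<MySystem>();
+//     SimulatedAnnealingSolver solver(sys);
+//     solver.setInitialTemperature(2.0);
+//     solver.setFinalTemperature(0.1);
+//     solver.setCoolingRatio(0.95);
+//     solver.setIterPerStep(100);
+//     int accepted = solver.run();  // number of accepted moves
+//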
+class SimulatedAnnealingANN_MLP : public SimulatedAnnealingSolverSystem
 {
-    int i, j, n, dn = 0, l_count, dn0, buf_sz, min_buf_sz;
-
-    if( !layer_sizes )
-        CV_Error( CV_StsError, "The network has not been initialized" );
-
-    if( !CV_IS_MAT(_inputs) || !CV_IS_MAT(_outputs) ||
-        !CV_ARE_TYPES_EQ(_inputs,_outputs) ||
-        (CV_MAT_TYPE(_inputs->type) != CV_32FC1 &&
-        CV_MAT_TYPE(_inputs->type) != CV_64FC1) ||
-        _inputs->rows != _outputs->rows )
-        CV_Error( CV_StsBadArg, "Both input and output must be floating-point matrices "
-                                "of the same type and have the same number of rows" );
-
-    if( _inputs->cols != layer_sizes->data.i[0] )
-        CV_Error( CV_StsBadSize, "input matrix must have the same number of columns as "
-                                 "the number of neurons in the input layer" );
-
-    if( _outputs->cols != layer_sizes->data.i[layer_sizes->cols - 1] )
-        CV_Error( CV_StsBadSize, "output matrix must have the same number of columns as "
-                                 "the number of neurons in the output layer" );
-    n = dn0 = _inputs->rows;
-    min_buf_sz = 2*max_count;
-    buf_sz = n*min_buf_sz;
-
-    if( buf_sz > max_buf_sz )
+protected:
+    ml::ANN_MLP& nn;
+    Ptr<ml::TrainData> data;
+    int nbVariables;
+    std::vector<double*> adrVariables;
+    RNG rVar;
+    RNG rIndex;
+    double varTmp;
+    int index;
+public:
+    SimulatedAnnealingANN_MLP(ml::ANN_MLP& x, const Ptr<ml::TrainData>& d) : nn(x), data(d)
     {
-        dn0 = max_buf_sz/min_buf_sz;
-        dn0 = MAX( dn0, 1 );
-        buf_sz = dn0*min_buf_sz;
+        initVarMap();
     }
-
-    cv::AutoBuffer<double> buf(buf_sz);
-    l_count = layer_sizes->cols;
-
-    for( i = 0; i < n; i += dn )
+    ~SimulatedAnnealingANN_MLP() {}
+protected:
+    void changeState()
     {
-        CvMat hdr[2], _w, *layer_in = &hdr[0], *layer_out = &hdr[1], *temp;
-        dn = MIN( dn0, n - i );
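+        // pick one weight at random and overwrite it with a value in
+        // [-1, 1]; the old value is saved so reverseState() can undo the move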
+        index = rIndex.uniform(0, nbVariables);
+        double dv = rVar.uniform(-1.0, 1.0);
+        varTmp = *adrVariables[index];
+        *adrVariables[index] = dv;
+    }
 
-        cvGetRows( _inputs, layer_in, i, i + dn );
-        cvInitMatHeader( layer_out, dn, layer_in->cols, CV_64F, &buf[0] );
+    void reverseState()
+    {
+        *adrVariables[index] = varTmp;
+    }
 
-        scale_input( layer_in, layer_out );
-        CV_SWAP( layer_in, layer_out, temp );
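+    // the solver's energy is the network error over the training set, as
+    // reported by calcError() (which this commit makes sample-weighted)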
+    double energy() const { return nn.calcError(data, false, noArray()); }
 
-        for( j = 1; j < l_count; j++ )
+protected:
+    void initVarMap()
+    {
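+        // record the address of every coefficient of the inner weight
+        // matrices so changeState() can perturb a single one in place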
+        Mat l = nn.getLayerSizes();
+        nbVariables = 0;
+        adrVariables.clear();
+        for (int i = 1; i < l.rows-1; i++)
         {
-            double* data = buf + (j&1 ? max_count*dn0 : 0);
-            int cols = layer_sizes->data.i[j];
-
-            cvInitMatHeader( layer_out, dn, cols, CV_64F, data );
-            cvInitMatHeader( &_w, layer_in->cols, layer_out->cols, CV_64F, weights[j] );
-            cvGEMM( layer_in, &_w, 1, 0, 0, layer_out );
-            calc_activ_func( layer_out, _w.data.db + _w.rows*_w.cols );
-
-            CV_SWAP( layer_in, layer_out, temp );
+            Mat w = nn.getWeights(i);
+            for (int j = 0; j < w.rows; j++)
+            {
+                for (int k = 0; k < w.cols; k++, nbVariables++)
+                {
+                    if (j == w.rows - 1)
+                    {
+                        adrVariables.push_back(&w.at<double>(w.rows - 1, k));
+                    }
+                    else
+                    {
+                        adrVariables.push_back(&w.at<double>(j, k));
+                    }
+                }
+            }
         }
-
-        cvGetRows( _outputs, layer_out, i, i + dn );
-        scale_output( layer_in, layer_out );
     }
 
-    return 0.f;
-}
-
+};
 
-void CvANN_MLP::scale_input( const CvMat* _src, CvMat* _dst ) const
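+// The anneal accessors on the ANN_MLP base class forward to the
+// ANN_MLP_ANNEAL interface via dynamic_cast and raise StsNotImplemented
+// when the instance does not implement it.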
+double ANN_MLP::getAnnealInitialT() const
 {
-    int i, j, cols = _src->cols;
-    double* dst = _dst->data.db;
-    const double* w = weights[0];
-    int step = _src->step;
-
-    if( CV_MAT_TYPE( _src->type ) == CV_32F )
-    {
-        const float* src = _src->data.fl;
-        step /= sizeof(src[0]);
+    const ANN_MLP_ANNEAL* this_ = dynamic_cast<const ANN_MLP_ANNEAL*>(this);
+    if (!this_)
+        CV_Error(Error::StsNotImplemented, "the class is not ANN_MLP_ANNEAL");
+    return this_->getAnnealInitialT();
+}
 
-        for( i = 0; i < _src->rows; i++, src += step, dst += cols )
-            for( j = 0; j < cols; j++ )
-                dst[j] = src[j]*w[j*2] + w[j*2+1];
-    }
-    else
-    {
-        const double* src = _src->data.db;
-        step /= sizeof(src[0]);
+void ANN_MLP::setAnnealInitialT(double val)
+{
+    ANN_MLP_ANNEAL* this_ = dynamic_cast<ANN_MLP_ANNEAL*>(this);
+    if (!this_)
+        CV_Error(Error::StsNotImplemented, "the class is not ANN_MLP_ANNEAL");
+    this_->setAnnealInitialT(val);
+}
 
-        for( i = 0; i < _src->rows; i++, src += step, dst += cols )
-            for( j = 0; j < cols; j++ )
-                dst[j] = src[j]*w[j*2] + w[j*2+1];
-    }
+double ANN_MLP::getAnnealFinalT() const
+{
+    const ANN_MLP_ANNEAL* this_ = dynamic_cast<const ANN_MLP_ANNEAL*>(this);
+    if (!this_)
+        CV_Error(Error::StsNotImplemented, "the class is not ANN_MLP_ANNEAL");
+    return this_->getAnnealFinalT();
 }
 
+void ANN_MLP::setAnnealFinalT(double val)
+{
+    ANN_MLP_ANNEAL* this_ = dynamic_cast<ANN_MLP_ANNEAL*>(this);
+    if (!this_)
+        CV_Error(Error::StsNotImplemented, "the class is not ANN_MLP_ANNEAL");
+    this_->setAnnealFinalT(val);
+}
 
-void CvANN_MLP::scale_output( const CvMat* _src, CvMat* _dst ) const
+double ANN_MLP::getAnnealCoolingRatio() const
 {
-    int i, j, cols = _src->cols;
-    const double* src = _src->data.db;
-    const double* w = weights[layer_sizes->cols];
-    int step = _dst->step;
+    const ANN_MLP_ANNEAL* this_ = dynamic_cast<const ANN_MLP_ANNEAL*>(this);
+    if (!this_)
+        CV_Error(Error::StsNotImplemented, "the class is not ANN_MLP_ANNEAL");
+    return this_->getAnnealCoolingRatio();
+}
 
-    if( CV_MAT_TYPE( _dst->type ) == CV_32F )
-    {
-        float* dst = _dst->data.fl;
-        step /= sizeof(dst[0]);
+void ANN_MLP::setAnnealCoolingRatio(double val)
+{
+    ANN_MLP_ANNEAL* this_ = dynamic_cast<ANN_MLP_ANNEAL*>(this);
+    if (!this_)
+        CV_Error(Error::StsNotImplemented, "the class is not ANN_MLP_ANNEAL");
+    this_->setAnnealCoolingRatio(val);
+}
 
-        for( i = 0; i < _src->rows; i++, src += cols, dst += step )
-            for( j = 0; j < cols; j++ )
-                dst[j] = (float)(src[j]*w[j*2] + w[j*2+1]);
-    }
-    else
-    {
-        double* dst = _dst->data.db;
-        step /= sizeof(dst[0]);
+int ANN_MLP::getAnnealItePerStep() const
+{
+    const ANN_MLP_ANNEAL* this_ = dynamic_cast<const ANN_MLP_ANNEAL*>(this);
+    if (!this_)
+        CV_Error(Error::StsNotImplemented, "the class is not ANN_MLP_ANNEAL");
+    return this_->getAnnealItePerStep();
+}
 
-        for( i = 0; i < _src->rows; i++, src += cols, dst += step )
-            for( j = 0; j < cols; j++ )
-                dst[j] = src[j]*w[j*2] + w[j*2+1];
-    }
+void ANN_MLP::setAnnealItePerStep(int val)
+{
+    ANN_MLP_ANNEAL* this_ = dynamic_cast<ANN_MLP_ANNEAL*>(this);
+    if (!this_)
+        CV_Error(Error::StsNotImplemented, "the class is not ANN_MLP_ANNEAL");
+    this_->setAnnealItePerStep(val);
 }
 
+void ANN_MLP::setAnnealEnergyRNG(const RNG& rng)
+{
+    ANN_MLP_ANNEAL* this_ = dynamic_cast<ANN_MLP_ANNEAL*>(this);
+    if (!this_)
+        CV_Error(Error::StsNotImplemented, "the class is not ANN_MLP_ANNEAL");
+    this_->setAnnealEnergyRNG(rng);
+}
 
-void CvANN_MLP::calc_activ_func( CvMat* sums, const double* bias ) const
+class ANN_MLPImpl : public ANN_MLP_ANNEAL
 {
-    int i, j, n = sums->rows, cols = sums->cols;
-    double* data = sums->data.db;
-    double scale = 0, scale2 = f_param2;
+public:
+    ANN_MLPImpl()
+    {
+        clear();
+        setActivationFunction( SIGMOID_SYM, 0, 0);
+        setLayerSizes(Mat());
+        setTrainMethod(ANN_MLP::RPROP, 0.1, FLT_EPSILON);
+    }
 
-    switch( activ_func )
+    virtual ~ANN_MLPImpl() {}
+
+    CV_IMPL_PROPERTY(TermCriteria, TermCriteria, params.termCrit)
+    CV_IMPL_PROPERTY(double, BackpropWeightScale, params.bpDWScale)
+    CV_IMPL_PROPERTY(double, BackpropMomentumScale, params.bpMomentScale)
+    CV_IMPL_PROPERTY(double, RpropDW0, params.rpDW0)
+    CV_IMPL_PROPERTY(double, RpropDWPlus, params.rpDWPlus)
+    CV_IMPL_PROPERTY(double, RpropDWMinus, params.rpDWMinus)
+    CV_IMPL_PROPERTY(double, RpropDWMin, params.rpDWMin)
+    CV_IMPL_PROPERTY(double, RpropDWMax, params.rpDWMax)
+    CV_IMPL_PROPERTY(double, AnnealInitialT, params.initialT)
+    CV_IMPL_PROPERTY(double, AnnealFinalT, params.finalT)
+    CV_IMPL_PROPERTY(double, AnnealCoolingRatio, params.coolingRatio)
+    CV_IMPL_PROPERTY(int, AnnealItePerStep, params.itePerStep)
+
+    //CV_IMPL_PROPERTY(RNG, AnnealEnergyRNG, params.rEnergy)
+    inline void setAnnealEnergyRNG(const RNG& val) { params.rEnergy = val; }
+
+    void clear()
     {
-    case IDENTITY:
-        scale = 1.;
-        break;
-    case SIGMOID_SYM:
-        scale = -f_param1;
-        break;
-    case GAUSSIAN:
-        scale = -f_param1*f_param1;
-        break;
-    default:
-        ;
+        min_val = max_val = min_val1 = max_val1 = 0.;
+        rng = RNG((uint64)-1);
+        weights.clear();
+        trained = false;
+        max_buf_sz = 1 << 12;
     }
 
-    assert( CV_IS_MAT_CONT(sums->type) );
+    int layer_count() const { return (int)layer_sizes.size(); }
 
-    if( activ_func != GAUSSIAN )
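+    // param1/param2 carry rp_dw0/rp_dw_min for RPROP and the weight scale /
+    // momentum scale for BACKPROP; ANNEAL is configured via its own setters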
+    void setTrainMethod(int method, double param1, double param2)
     {
-        for( i = 0; i < n; i++, data += cols )
-            for( j = 0; j < cols; j++ )
-                data[j] = (data[j] + bias[j])*scale;
-
-        if( activ_func == IDENTITY )
-            return;
+        if (method != ANN_MLP::RPROP && method != ANN_MLP::BACKPROP && method != ANN_MLP::ANNEAL)
+            method = ANN_MLP::RPROP;
+        params.trainMethod = method;
+        if(method == ANN_MLP::RPROP )
+        {
+            if( param1 < FLT_EPSILON )
+                param1 = 1.;
+            params.rpDW0 = param1;
+            params.rpDWMin = std::max( param2, 0. );
+        }
+        else if (method == ANN_MLP::BACKPROP)
+        {
+            if (param1 <= 0)
+                param1 = 0.1;
+            params.bpDWScale = inBounds<double>(param1, 1e-3, 1.);
+            if (param2 < 0)
+                param2 = 0.1;
+            params.bpMomentScale = std::min(param2, 1.);
+        }
     }
-    else
+
+    int getTrainMethod() const
     {
-        for( i = 0; i < n; i++, data += cols )
-            for( j = 0; j < cols; j++ )
-            {
-                double t = data[j] + bias[j];
-                data[j] = t*t*scale;
-            }
+        return params.trainMethod;
     }
 
-    cvExp( sums, sums );
-
-    n *= cols;
-    data -= n;
-
-    switch( activ_func )
+    void setActivationFunction(int _activ_func, double _f_param1, double _f_param2)
     {
-    case SIGMOID_SYM:
-        for( i = 0; i <= n - 4; i += 4 )
-        {
-            double x0 = 1.+data[i], x1 = 1.+data[i+1], x2 = 1.+data[i+2], x3 = 1.+data[i+3];
-            double a = x0*x1, b = x2*x3, d = scale2/(a*b), t0, t1;
-            a *= d; b *= d;
-            t0 = (2 - x0)*b*x1; t1 = (2 - x1)*b*x0;
-            data[i] = t0; data[i+1] = t1;
-            t0 = (2 - x2)*a*x3; t1 = (2 - x3)*a*x2;
-            data[i+2] = t0; data[i+3] = t1;
-        }
+        if( _activ_func < 0 || _activ_func > LEAKYRELU)
+            CV_Error( CV_StsOutOfRange, "Unknown activation function" );
 
-        for( ; i < n; i++ )
+        activ_func = _activ_func;
+
+        switch( activ_func )
         {
-            double t = scale2*(1. - data[i])/(1. + data[i]);
-            data[i] = t;
+        case SIGMOID_SYM:
+            max_val = 0.95; min_val = -max_val;
+            max_val1 = 0.98; min_val1 = -max_val1;
+            if( fabs(_f_param1) < FLT_EPSILON )
+                _f_param1 = 2./3;
+            if( fabs(_f_param2) < FLT_EPSILON )
+                _f_param2 = 1.7159;
+            break;
+        case GAUSSIAN:
+            max_val = 1.; min_val = 0.05;
+            max_val1 = 1.; min_val1 = 0.02;
+            if (fabs(_f_param1) < FLT_EPSILON)
+                _f_param1 = 1.;
+            if (fabs(_f_param2) < FLT_EPSILON)
+                _f_param2 = 1.;
+            break;
+        case RELU:
+            if (fabs(_f_param1) < FLT_EPSILON)
+                _f_param1 = 1;
+            min_val = max_val = min_val1 = max_val1 = 0.;
+            _f_param2 = 0.;
+            break;
+        case LEAKYRELU:
+            if (fabs(_f_param1) < FLT_EPSILON)
+                _f_param1 = 0.01;
+            min_val = max_val = min_val1 = max_val1 = 0.;
+            _f_param2 = 0.;
+            break;
+        default:
+            min_val = max_val = min_val1 = max_val1 = 0.;
+            _f_param1 = 1.;
+            _f_param2 = 0.;
         }
-        break;
 
-    case GAUSSIAN:
-        for( i = 0; i < n; i++ )
-            data[i] = scale2*data[i];
-        break;
-
-    default:
-        ;
+        f_param1 = _f_param1;
+        f_param2 = _f_param2;
     }
-}
 
 
-void CvANN_MLP::calc_activ_func_deriv( CvMat* _xf, CvMat* _df,
-                                       const double* bias ) const
-{
-    int i, j, n = _xf->rows, cols = _xf->cols;
-    double* xf = _xf->data.db;
-    double* df = _df->data.db;
-    double scale, scale2 = f_param2;
-    assert( CV_IS_MAT_CONT( _xf->type & _df->type ) );
-
-    if( activ_func == IDENTITY )
+    void init_weights()
     {
-        for( i = 0; i < n; i++, xf += cols, df += cols )
-            for( j = 0; j < cols; j++ )
+        int i, j, k, l_count = layer_count();
+
+        for( i = 1; i < l_count; i++ )
+        {
+            int n1 = layer_sizes[i-1];
+            int n2 = layer_sizes[i];
+            double val = 0, G = n2 > 2 ? 0.7*pow((double)n1,1./(n2-1)) : 1.;
+            double* w = weights[i].ptr<double>();
+
+            // initialize weights using Nguyen-Widrow algorithm
+            for( j = 0; j < n2; j++ )
             {
-                xf[j] += bias[j];
-                df[j] = 1;
+                double s = 0;
+                for( k = 0; k <= n1; k++ )
+                {
+                    val = rng.uniform(0., 1.)*2-1.;
+                    w[k*n2 + j] = val;
+                    s += fabs(val);
+                }
+
+                if( i < l_count - 1 )
+                {
+                    s = 1./(s - fabs(val));
+                    for( k = 0; k <= n1; k++ )
+                        w[k*n2 + j] *= s;
+                    w[n1*n2 + j] *= G*(-1+j*2./n2);
+                }
             }
-        return;
+        }
     }
-    else if( activ_func == GAUSSIAN )
-    {
-        scale = -f_param1*f_param1;
-        scale2 *= scale;
-        for( i = 0; i < n; i++, xf += cols, df += cols )
-            for( j = 0; j < cols; j++ )
-            {
-                double t = xf[j] + bias[j];
-                df[j] = t*2*scale2;
-                xf[j] = t*t*scale;
-            }
-        cvExp( _xf, _xf );
 
-        n *= cols;
-        xf -= n; df -= n;
-
-        for( i = 0; i < n; i++ )
-            df[i] *= xf[i];
+    Mat getLayerSizes() const
+    {
+        return Mat_<int>(layer_sizes, true);
     }
-    else
+
+    void setLayerSizes( InputArray _layer_sizes )
     {
-        scale = f_param1;
-        for( i = 0; i < n; i++, xf += cols, df += cols )
-            for( j = 0; j < cols; j++ )
-            {
-                xf[j] = (xf[j] + bias[j])*scale;
-                df[j] = -fabs(xf[j]);
-            }
+        clear();
 
-        cvExp( _df, _df );
+        _layer_sizes.copyTo(layer_sizes);
+        int l_count = layer_count();
 
-        n *= cols;
-        xf -= n; df -= n;
+        weights.resize(l_count + 2);
+        max_lsize = 0;
 
-        // ((1+exp(-ax))^-1)'=a*((1+exp(-ax))^-2)*exp(-ax);
-        // ((1-exp(-ax))/(1+exp(-ax)))'=(a*exp(-ax)*(1+exp(-ax)) + a*exp(-ax)*(1-exp(-ax)))/(1+exp(-ax))^2=
-        // 2*a*exp(-ax)/(1+exp(-ax))^2
-        scale *= 2*f_param2;
-        for( i = 0; i < n; i++ )
+        if( l_count > 0 )
         {
-            int s0 = xf[i] > 0 ? 1 : -1;
-            double t0 = 1./(1. + df[i]);
-            double t1 = scale*df[i]*t0*t0;
-            t0 *= scale2*(1. - df[i])*s0;
-            df[i] = t1;
-            xf[i] = t0;
+            for( int i = 0; i < l_count; i++ )
+            {
+                int n = layer_sizes[i];
+                if( n < 1 + (0 < i && i < l_count-1))
+                    CV_Error( CV_StsOutOfRange,
+                             "there should be at least one input and one output "
+                             "and every hidden layer must have more than 1 neuron" );
+                max_lsize = std::max( max_lsize, n );
+                if( i > 0 )
+                    weights[i].create(layer_sizes[i-1]+1, n, CV_64F);
+            }
+
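+            // weights[0] stores the input scale/shift pairs; the two extra
+            // entries past the last layer store the output scale and its inverse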
+            int ninputs = layer_sizes.front();
+            int noutputs = layer_sizes.back();
+            weights[0].create(1, ninputs*2, CV_64F);
+            weights[l_count].create(1, noutputs*2, CV_64F);
+            weights[l_count+1].create(1, noutputs*2, CV_64F);
         }
     }
-}
-
 
-void CvANN_MLP::calc_input_scale( const CvVectors* vecs, int flags )
-{
-    bool reset_weights = (flags & UPDATE_WEIGHTS) == 0;
-    bool no_scale = (flags & NO_INPUT_SCALE) != 0;
-    double* scale = weights[0];
-    int count = vecs->count;
-
-    if( reset_weights )
+    float predict( InputArray _inputs, OutputArray _outputs, int ) const
     {
-        int i, j, vcount = layer_sizes->data.i[0];
-        int type = vecs->type;
-        double a = no_scale ? 1. : 0.;
+        if( !trained )
+            CV_Error( CV_StsError, "The network has not been trained or loaded" );
 
-        for( j = 0; j < vcount; j++ )
-            scale[2*j] = a, scale[j*2+1] = 0.;
+        Mat inputs = _inputs.getMat();
+        int type = inputs.type(), l_count = layer_count();
+        int n = inputs.rows, dn0 = n;
 
-        if( no_scale )
-            return;
+        CV_Assert( (type == CV_32F || type == CV_64F) && inputs.cols == layer_sizes[0] );
+        int noutputs = layer_sizes[l_count-1];
+        Mat outputs;
 
-        for( i = 0; i < count; i++ )
+        int min_buf_sz = 2*max_lsize;
+        int buf_sz = n*min_buf_sz;
+
+        if( buf_sz > max_buf_sz )
         {
-            const float* f = vecs->data.fl[i];
-            const double* d = vecs->data.db[i];
-            for( j = 0; j < vcount; j++ )
-            {
-                double t = type == CV_32F ? (double)f[j] : d[j];
-                scale[j*2] += t;
-                scale[j*2+1] += t*t;
-            }
+            dn0 = max_buf_sz/min_buf_sz;
+            dn0 = std::max( dn0, 1 );
+            buf_sz = dn0*min_buf_sz;
         }
 
-        for( j = 0; j < vcount; j++ )
+        cv::AutoBuffer<double> _buf(buf_sz+noutputs);
+        double* buf = _buf;
+
+        if( !_outputs.needed() )
         {
-            double s = scale[j*2], s2 = scale[j*2+1];
-            double m = s/count, sigma2 = s2/count - m*m;
-            scale[j*2] = sigma2 < DBL_EPSILON ? 1 : 1./sqrt(sigma2);
-            scale[j*2+1] = -m*scale[j*2];
+            CV_Assert( n == 1 );
+            outputs = Mat(n, noutputs, type, buf + buf_sz);
         }
-    }
-}
+        else
+        {
+            _outputs.create(n, noutputs, type);
+            outputs = _outputs.getMat();
+        }
+
+        int dn = 0;
+        for( int i = 0; i < n; i += dn )
+        {
+            dn = std::min( dn0, n - i );
 
+            Mat layer_in = inputs.rowRange(i, i + dn);
+            Mat layer_out( dn, layer_in.cols, CV_64F, buf);
 
-void CvANN_MLP::calc_output_scale( const CvVectors* vecs, int flags )
-{
-    int i, j, vcount = layer_sizes->data.i[layer_sizes->cols-1];
-    int type = vecs->type;
-    double m = min_val, M = max_val, m1 = min_val1, M1 = max_val1;
-    bool reset_weights = (flags & UPDATE_WEIGHTS) == 0;
-    bool no_scale = (flags & NO_OUTPUT_SCALE) != 0;
-    int l_count = layer_sizes->cols;
-    double* scale = weights[l_count];
-    double* inv_scale = weights[l_count+1];
-    int count = vecs->count;
+            scale_input( layer_in, layer_out );
+            layer_in = layer_out;
 
-    CV_FUNCNAME( "CvANN_MLP::calc_output_scale" );
+            for( int j = 1; j < l_count; j++ )
+            {
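+                // ping-pong between the two halves of the buffer so the
+                // previous layer's activations stay valid during gemm()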
+                double* data = buf + ((j&1) ? max_lsize*dn0 : 0);
+                int cols = layer_sizes[j];
 
-    __BEGIN__;
+                layer_out = Mat(dn, cols, CV_64F, data);
+                Mat w = weights[j].rowRange(0, layer_in.cols);
+                gemm(layer_in, w, 1, noArray(), 0, layer_out);
+                calc_activ_func( layer_out, weights[j] );
 
-    if( reset_weights )
-    {
-        double a0 = no_scale ? 1 : DBL_MAX, b0 = no_scale ? 0 : -DBL_MAX;
+                layer_in = layer_out;
+            }
+
+            layer_out = outputs.rowRange(i, i + dn);
+            scale_output( layer_in, layer_out );
+        }
 
-        for( j = 0; j < vcount; j++ )
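+        // for a single sample, return the index of the strongest output,
+        // i.e. the predicted class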
+        if( n == 1 )
         {
-            scale[2*j] = inv_scale[2*j] = a0;
-            scale[j*2+1] = inv_scale[2*j+1] = b0;
+            int maxIdx[] = {0, 0};
+            minMaxIdx(outputs, 0, 0, 0, maxIdx);
+            return (float)(maxIdx[0] + maxIdx[1]);
         }
 
-        if( no_scale )
-            EXIT;
+        return 0.f;
     }
 
-    for( i = 0; i < count; i++ )
+    void scale_input( const Mat& _src, Mat& _dst ) const
     {
-        const float* f = vecs->data.fl[i];
-        const double* d = vecs->data.db[i];
+        int cols = _src.cols;
+        const double* w = weights[0].ptr<double>();
 
-        for( j = 0; j < vcount; j++ )
+        if( _src.type() == CV_32F )
         {
-            double t = type == CV_32F ? (double)f[j] : d[j];
-
-            if( reset_weights )
+            for( int i = 0; i < _src.rows; i++ )
             {
-                double mj = scale[j*2], Mj = scale[j*2+1];
-                if( mj > t ) mj = t;
-                if( Mj < t ) Mj = t;
-
-                scale[j*2] = mj;
-                scale[j*2+1] = Mj;
+                const float* src = _src.ptr<float>(i);
+                double* dst = _dst.ptr<double>(i);
+                for( int j = 0; j < cols; j++ )
+                    dst[j] = src[j]*w[j*2] + w[j*2+1];
             }
-            else
+        }
+        else
+        {
+            for( int i = 0; i < _src.rows; i++ )
             {
-                t = t*inv_scale[j*2] + inv_scale[2*j+1];
-                if( t < m1 || t > M1 )
-                    CV_ERROR( CV_StsOutOfRange,
-                    "Some of new output training vector components run exceed the original range too much" );
+                const double* src = _src.ptr<double>(i);
+                double* dst = _dst.ptr<double>(i);
+                for( int j = 0; j < cols; j++ )
+                    dst[j] = src[j]*w[j*2] + w[j*2+1];
             }
         }
     }
 
-    if( reset_weights )
-        for( j = 0; j < vcount; j++ )
+    void scale_output( const Mat& _src, Mat& _dst ) const
+    {
+        int cols = _src.cols;
+        const double* w = weights[layer_count()].ptr<double>();
+
+        if( _dst.type() == CV_32F )
         {
-            // map mj..Mj to m..M
-            double mj = scale[j*2], Mj = scale[j*2+1];
-            double a, b;
-            double delta = Mj - mj;
-            if( delta < DBL_EPSILON )
-                a = 1, b = (M + m - Mj - mj)*0.5;
-            else
-                a = (M - m)/delta, b = m - mj*a;
-            inv_scale[j*2] = a; inv_scale[j*2+1] = b;
-            a = 1./a; b = -b*a;
-            scale[j*2] = a; scale[j*2+1] = b;
+            for( int i = 0; i < _src.rows; i++ )
+            {
+                const double* src = _src.ptr<double>(i);
+                float* dst = _dst.ptr<float>(i);
+                for( int j = 0; j < cols; j++ )
+                    dst[j] = (float)(src[j]*w[j*2] + w[j*2+1]);
+            }
+        }
+        else
+        {
+            for( int i = 0; i < _src.rows; i++ )
+            {
+                const double* src = _src.ptr<double>(i);
+                double* dst = _dst.ptr<double>(i);
+                for( int j = 0; j < cols; j++ )
+                    dst[j] = src[j]*w[j*2] + w[j*2+1];
+            }
         }
-
-    __END__;
-}
-
-
-bool CvANN_MLP::prepare_to_train( const CvMat* _inputs, const CvMat* _outputs,
-            const CvMat* _sample_weights, const CvMat* _sample_idx,
-            CvVectors* _ivecs, CvVectors* _ovecs, double** _sw, int _flags )
-{
-    bool ok = false;
-    CvMat* sample_idx = 0;
-    CvVectors ivecs, ovecs;
-    double* sw = 0;
-    int count = 0;
-
-    CV_FUNCNAME( "CvANN_MLP::prepare_to_train" );
-
-    ivecs.data.ptr = ovecs.data.ptr = 0;
-    assert( _ivecs && _ovecs );
-
-    __BEGIN__;
-
-    const int* sidx = 0;
-    int i, sw_type = 0, sw_count = 0;
-    int sw_step = 0;
-    double sw_sum = 0;
-
-    if( !layer_sizes )
-        CV_ERROR( CV_StsError,
-        "The network has not been created. Use method create or the appropriate constructor" );
-
-    if( !CV_IS_MAT(_inputs) || (CV_MAT_TYPE(_inputs->type) != CV_32FC1 &&
-        CV_MAT_TYPE(_inputs->type) != CV_64FC1) || _inputs->cols != layer_sizes->data.i[0] )
-        CV_ERROR( CV_StsBadArg,
-        "input training data should be a floating-point matrix with"
-        "the number of rows equal to the number of training samples and "
-        "the number of columns equal to the size of 0-th (input) layer" );
-
-    if( !CV_IS_MAT(_outputs) || (CV_MAT_TYPE(_outputs->type) != CV_32FC1 &&
-        CV_MAT_TYPE(_outputs->type) != CV_64FC1) ||
-        _outputs->cols != layer_sizes->data.i[layer_sizes->cols - 1] )
-        CV_ERROR( CV_StsBadArg,
-        "output training data should be a floating-point matrix with"
-        "the number of rows equal to the number of training samples and "
-        "the number of columns equal to the size of last (output) layer" );
-
-    if( _inputs->rows != _outputs->rows )
-        CV_ERROR( CV_StsUnmatchedSizes, "The numbers of input and output samples do not match" );
-
-    if( _sample_idx )
-    {
-        CV_CALL( sample_idx = cvPreprocessIndexArray( _sample_idx, _inputs->rows ));
-        sidx = sample_idx->data.i;
-        count = sample_idx->cols + sample_idx->rows - 1;
     }
-    else
-        count = _inputs->rows;
 
-    if( _sample_weights )
+    void calc_activ_func(Mat& sums, const Mat& w) const
     {
-        if( !CV_IS_MAT(_sample_weights) )
-            CV_ERROR( CV_StsBadArg, "sample_weights (if passed) must be a valid matrix" );
-
-        sw_type = CV_MAT_TYPE(_sample_weights->type);
-        sw_count = _sample_weights->cols + _sample_weights->rows - 1;
-
-        if( (sw_type != CV_32FC1 && sw_type != CV_64FC1) ||
-            (_sample_weights->cols != 1 && _sample_weights->rows != 1) ||
-            (sw_count != count && sw_count != _inputs->rows) )
-            CV_ERROR( CV_StsBadArg,
-            "sample_weights must be 1d floating-point vector containing weights "
-            "of all or selected training samples" );
+        const double* bias = w.ptr<double>(w.rows - 1);
+        int i, j, n = sums.rows, cols = sums.cols;
+        double scale = 0, scale2 = f_param2;
 
-        sw_step = CV_IS_MAT_CONT(_sample_weights->type) ? 1 :
-            _sample_weights->step/CV_ELEM_SIZE(sw_type);
-
-        CV_CALL( sw = (double*)cvAlloc( count*sizeof(sw[0]) ));
-    }
+        switch (activ_func)
+        {
+        case IDENTITY:
+            scale = 1.;
+            break;
+        case SIGMOID_SYM:
+            scale = -f_param1;
+            break;
+        case GAUSSIAN:
+            scale = -f_param1*f_param1;
+            break;
+        case RELU:
+            scale = 1;
+            break;
+        case LEAKYRELU:
+            scale = 1;
+            break;
+        default:
+            ;
+        }
 
-    CV_CALL( ivecs.data.ptr = (uchar**)cvAlloc( count*sizeof(ivecs.data.ptr[0]) ));
-    CV_CALL( ovecs.data.ptr = (uchar**)cvAlloc( count*sizeof(ovecs.data.ptr[0]) ));
+        CV_Assert(sums.isContinuous());
 
-    ivecs.type = CV_MAT_TYPE(_inputs->type);
-    ovecs.type = CV_MAT_TYPE(_outputs->type);
-    ivecs.count = ovecs.count = count;
+        if (activ_func != GAUSSIAN)
+        {
+            for (i = 0; i < n; i++)
+            {
+                double* data = sums.ptr<double>(i);
+                for (j = 0; j < cols; j++)
+                {
+                    data[j] = (data[j] + bias[j])*scale;
+                    if (activ_func == RELU)
+                        if (data[j] < 0)
+                            data[j] = 0;
+                    if (activ_func == LEAKYRELU)
+                        if (data[j] < 0)
+                            data[j] *= f_param1;
+                }
+            }
 
-    for( i = 0; i < count; i++ )
-    {
-        int idx = sidx ? sidx[i] : i;
-        ivecs.data.ptr[i] = _inputs->data.ptr + idx*_inputs->step;
-        ovecs.data.ptr[i] = _outputs->data.ptr + idx*_outputs->step;
-        if( sw )
+            if (activ_func == IDENTITY || activ_func == RELU || activ_func == LEAKYRELU)
+                return;
+        }
+        else
         {
-            int si = sw_count == count ? i : idx;
-            double w = sw_type == CV_32FC1 ?
-                (double)_sample_weights->data.fl[si*sw_step] :
-                _sample_weights->data.db[si*sw_step];
-            sw[i] = w;
-            if( w < 0 )
-                CV_ERROR( CV_StsOutOfRange, "some of sample weights are negative" );
-            sw_sum += w;
+            for (i = 0; i < n; i++)
+            {
+                double* data = sums.ptr<double>(i);
+                for (j = 0; j < cols; j++)
+                {
+                    double t = data[j] + bias[j];
+                    data[j] = t*t*scale;
+                }
+            }
         }
-    }
 
-    // normalize weights
-    if( sw )
-    {
-        sw_sum = sw_sum > DBL_EPSILON ? 1./sw_sum : 0;
-        for( i = 0; i < count; i++ )
-            sw[i] *= sw_sum;
-    }
+        exp(sums, sums);
 
-    calc_input_scale( &ivecs, _flags );
-    CV_CALL( calc_output_scale( &ovecs, _flags ));
+        if (sums.isContinuous())
+        {
+            cols *= n;
+            n = 1;
+        }
 
-    ok = true;
+        switch (activ_func)
+        {
+        case SIGMOID_SYM:
+            for (i = 0; i < n; i++)
+            {
+                double* data = sums.ptr<double>(i);
+                for (j = 0; j < cols; j++)
+                {
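+                    // exp() may overflow to +inf for large scaled sums;
+                    // saturate the symmetric sigmoid at -scale2 in that case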
+                    if (!cvIsInf(data[j]))
+                    {
+                        double t = scale2*(1. - data[j]) / (1. + data[j]);
+                        data[j] = t;
+                    }
+                    else
+                    {
+                        data[j] = -scale2;
+                    }
+                }
+            }
+            break;
 
-    __END__;
+        case GAUSSIAN:
+            for (i = 0; i < n; i++)
+            {
+                double* data = sums.ptr<double>(i);
+                for (j = 0; j < cols; j++)
+                    data[j] = scale2*data[j];
+            }
+            break;
 
-    if( !ok )
-    {
-        cvFree( &ivecs.data.ptr );
-        cvFree( &ovecs.data.ptr );
-        cvFree( &sw );
+        default:
+            ;
+        }
     }
 
-    cvReleaseMat( &sample_idx );
-    *_ivecs = ivecs;
-    *_ovecs = ovecs;
-    *_sw = sw;
-
-    return ok;
-}
-
+    void calc_activ_func_deriv(Mat& _xf, Mat& _df, const Mat& w) const
+    {
+        const double* bias = w.ptr<double>(w.rows - 1);
+        int i, j, n = _xf.rows, cols = _xf.cols;
 
-int CvANN_MLP::train( const CvMat* _inputs, const CvMat* _outputs,
-                      const CvMat* _sample_weights, const CvMat* _sample_idx,
-                      CvANN_MLP_TrainParams _params, int flags )
-{
-    const int MAX_ITER = 1000;
-    const double DEFAULT_EPSILON = FLT_EPSILON;
+        if (activ_func == IDENTITY)
+        {
+            for (i = 0; i < n; i++)
+            {
+                double* xf = _xf.ptr<double>(i);
+                double* df = _df.ptr<double>(i);
 
-    double* sw = 0;
-    CvVectors x0, u;
-    int iter = -1;
+                for (j = 0; j < cols; j++)
+                {
+                    xf[j] += bias[j];
+                    df[j] = 1;
+                }
+            }
+        }
+        else if (activ_func == RELU)
+        {
+            for (i = 0; i < n; i++)
+            {
+                double* xf = _xf.ptr<double>(i);
+                double* df = _df.ptr<double>(i);
 
-    x0.data.ptr = u.data.ptr = 0;
-
-    CV_FUNCNAME( "CvANN_MLP::train" );
+                for (j = 0; j < cols; j++)
+                {
+                    xf[j] += bias[j];
+                    if (xf[j] < 0)
+                    {
+                        xf[j] = 0;
+                        df[j] = 0;
+                    }
+                    else
+                        df[j] = 1;
+                }
+            }
+        }
+        else if (activ_func == LEAKYRELU)
+        {
+            for (i = 0; i < n; i++)
+            {
+                double* xf = _xf.ptr<double>(i);
+                double* df = _df.ptr<double>(i);
 
-    __BEGIN__;
+                for (j = 0; j < cols; j++)
+                {
+                    xf[j] += bias[j];
+                    if (xf[j] < 0)
+                    {
+                        xf[j] = f_param1*xf[j];
+                        df[j] = f_param1;
+                    }
+                    else
+                        df[j] = 1;
+                }
+            }
+        }
+        else if (activ_func == GAUSSIAN)
+        {
+            double scale = -f_param1*f_param1;
+            double scale2 = scale*f_param2;
+            for (i = 0; i < n; i++)
+            {
+                double* xf = _xf.ptr<double>(i);
+                double* df = _df.ptr<double>(i);
 
-    int max_iter;
-    double epsilon;
+                for (j = 0; j < cols; j++)
+                {
+                    double t = xf[j] + bias[j];
+                    df[j] = t * 2 * scale2;
+                    xf[j] = t*t*scale;
+                }
+            }
+            exp(_xf, _xf);
 
-    params = _params;
+            for (i = 0; i < n; i++)
+            {
+                double* xf = _xf.ptr<double>(i);
+                double* df = _df.ptr<double>(i);
 
-    // initialize training data
-    CV_CALL( prepare_to_train( _inputs, _outputs, _sample_weights,
-                               _sample_idx, &x0, &u, &sw, flags ));
+                for (j = 0; j < cols; j++)
+                    df[j] *= xf[j];
+            }
+        }
+        else
+        {
+            double scale = f_param1;
+            double scale2 = f_param2;
 
-    // ... and link weights
-    if( !(flags & UPDATE_WEIGHTS) )
-        init_weights();
+            for (i = 0; i < n; i++)
+            {
+                double* xf = _xf.ptr<double>(i);
+                double* df = _df.ptr<double>(i);
 
-    max_iter = params.term_crit.type & CV_TERMCRIT_ITER ? params.term_crit.max_iter : MAX_ITER;
-    max_iter = MAX( max_iter, 1 );
+                for (j = 0; j < cols; j++)
+                {
+                    xf[j] = (xf[j] + bias[j])*scale;
+                    df[j] = -fabs(xf[j]);
+                }
+            }
 
-    epsilon = params.term_crit.type & CV_TERMCRIT_EPS ? params.term_crit.epsilon : DEFAULT_EPSILON;
-    epsilon = MAX(epsilon, DBL_EPSILON);
+            exp(_df, _df);
 
-    params.term_crit.type = CV_TERMCRIT_ITER + CV_TERMCRIT_EPS;
-    params.term_crit.max_iter = max_iter;
-    params.term_crit.epsilon = epsilon;
+            // ((1+exp(-ax))^-1)'=a*((1+exp(-ax))^-2)*exp(-ax);
+            // ((1-exp(-ax))/(1+exp(-ax)))'=(a*exp(-ax)*(1+exp(-ax)) + a*exp(-ax)*(1-exp(-ax)))/(1+exp(-ax))^2=
+            // 2*a*exp(-ax)/(1+exp(-ax))^2
+            scale *= 2 * f_param2;
+            for (i = 0; i < n; i++)
+            {
+                double* xf = _xf.ptr<double>(i);
+                double* df = _df.ptr<double>(i);
 
-    if( params.train_method == CvANN_MLP_TrainParams::BACKPROP )
-    {
-        CV_CALL( iter = train_backprop( x0, u, sw ));
-    }
-    else
-    {
-        CV_CALL( iter = train_rprop( x0, u, sw ));
+                for (j = 0; j < cols; j++)
+                {
+                    int s0 = xf[j] > 0 ? 1 : -1;
+                    double t0 = 1. / (1. + df[j]);
+                    double t1 = scale*df[j] * t0*t0;
+                    t0 *= scale2*(1. - df[j])*s0;
+                    df[j] = t1;
+                    xf[j] = t0;
+                }
+            }
+        }
     }
 
-    __END__;
-
-    cvFree( &x0.data.ptr );
-    cvFree( &u.data.ptr );
-    cvFree( &sw );
-
-    return iter;
-}
-
-
-int CvANN_MLP::train_backprop( CvVectors x0, CvVectors u, const double* sw )
-{
-    CvMat* dw = 0;
-    CvMat* buf = 0;
-    double **x = 0, **df = 0;
-    CvMat* _idx = 0;
-    int iter = -1, count = x0.count;
-
-    CV_FUNCNAME( "CvANN_MLP::train_backprop" );
-
-    __BEGIN__;
-
-    int i, j, k, ivcount, ovcount, l_count, total = 0, max_iter;
-    double *buf_ptr;
-    double prev_E = DBL_MAX*0.5, E = 0, epsilon;
-
-    max_iter = params.term_crit.max_iter*count;
-    epsilon = params.term_crit.epsilon*count;
+    void calc_input_scale( const Mat& inputs, int flags )
+    {
+        bool reset_weights = (flags & UPDATE_WEIGHTS) == 0;
+        bool no_scale = (flags & NO_INPUT_SCALE) != 0;
+        double* scale = weights[0].ptr<double>();
+        int count = inputs.rows;
 
-    l_count = layer_sizes->cols;
-    ivcount = layer_sizes->data.i[0];
-    ovcount = layer_sizes->data.i[l_count-1];
+        if( reset_weights )
+        {
+            int i, j, vcount = layer_sizes[0];
+            int type = inputs.type();
+            double a = no_scale ? 1. : 0.;
 
-    // allocate buffers
-    for( i = 0; i < l_count; i++ )
-        total += layer_sizes->data.i[i] + 1;
+            for( j = 0; j < vcount; j++ )
+                scale[2*j] = a, scale[j*2+1] = 0.;
 
-    CV_CALL( dw = cvCreateMat( wbuf->rows, wbuf->cols, wbuf->type ));
-    cvZero( dw );
-    CV_CALL( buf = cvCreateMat( 1, (total + max_count)*2, CV_64F ));
-    CV_CALL( _idx = cvCreateMat( 1, count, CV_32SC1 ));
-    for( i = 0; i < count; i++ )
-        _idx->data.i[i] = i;
+            if( no_scale )
+                return;
 
-    CV_CALL( x = (double**)cvAlloc( total*2*sizeof(x[0]) ));
-    df = x + total;
-    buf_ptr = buf->data.db;
+            for( i = 0; i < count; i++ )
+            {
+                const uchar* p = inputs.ptr(i);
+                const float* f = (const float*)p;
+                const double* d = (const double*)p;
+                for( j = 0; j < vcount; j++ )
+                {
+                    double t = type == CV_32F ? (double)f[j] : d[j];
+                    scale[j*2] += t;
+                    scale[j*2+1] += t*t;
+                }
+            }
 
-    for( j = 0; j < l_count; j++ )
-    {
-        x[j] = buf_ptr;
-        df[j] = x[j] + layer_sizes->data.i[j];
-        buf_ptr += (df[j] - x[j])*2;
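+            // convert the accumulated sum / sum of squares into the affine
+            // normalization x' = (x - mean) / sigma, stored as scale + shift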
+            for( j = 0; j < vcount; j++ )
+            {
+                double s = scale[j*2], s2 = scale[j*2+1];
+                double m = s/count, sigma2 = s2/count - m*m;
+                scale[j*2] = sigma2 < DBL_EPSILON ? 1 : 1./sqrt(sigma2);
+                scale[j*2+1] = -m*scale[j*2];
+            }
+        }
     }
 
-    // run back-propagation loop
-    /*
-        y_i = w_i*x_{i-1}
-        x_i = f(y_i)
-        E = 1/2*||u - x_N||^2
-        grad_N = (x_N - u)*f'(y_i)
-        dw_i(t) = momentum*dw_i(t-1) + dw_scale*x_{i-1}*grad_i
-        w_i(t+1) = w_i(t) + dw_i(t)
-        grad_{i-1} = w_i^t*grad_i
-    */
-    for( iter = 0; iter < max_iter; iter++ )
+    void calc_output_scale( const Mat& outputs, int flags )
     {
-        int idx = iter % count;
-        double* w = weights[0];
-        double sweight = sw ? count*sw[idx] : 1.;
-        CvMat _w, _dw, hdr1, hdr2, ghdr1, ghdr2, _df;
-        CvMat *x1 = &hdr1, *x2 = &hdr2, *grad1 = &ghdr1, *grad2 = &ghdr2, *temp;
-
-        if( idx == 0 )
+        int i, j, vcount = layer_sizes.back();
+        int type = outputs.type();
+        double m = min_val, M = max_val, m1 = min_val1, M1 = max_val1;
+        bool reset_weights = (flags & UPDATE_WEIGHTS) == 0;
+        bool no_scale = (flags & NO_OUTPUT_SCALE) != 0;
+        int l_count = layer_count();
+        double* scale = weights[l_count].ptr<double>();
+        double* inv_scale = weights[l_count+1].ptr<double>();
+        int count = outputs.rows;
+
+        if( reset_weights )
         {
-            //printf("%d. E = %g\n", iter/count, E);
-            if( fabs(prev_E - E) < epsilon )
-                break;
-            prev_E = E;
-            E = 0;
+            double a0 = no_scale ? 1 : DBL_MAX, b0 = no_scale ? 0 : -DBL_MAX;
 
-            // shuffle indices
-            for( i = 0; i < count; i++ )
+            for( j = 0; j < vcount; j++ )
             {
-                int tt;
-                j = (*rng)(count);
-                k = (*rng)(count);
-                CV_SWAP( _idx->data.i[j], _idx->data.i[k], tt );
+                scale[2*j] = inv_scale[2*j] = a0;
+                scale[j*2+1] = inv_scale[2*j+1] = b0;
             }
-        }
-
-        idx = _idx->data.i[idx];
 
-        if( x0.type == CV_32F )
-        {
-            const float* x0data = x0.data.fl[idx];
-            for( j = 0; j < ivcount; j++ )
-                x[0][j] = x0data[j]*w[j*2] + w[j*2 + 1];
+            if( no_scale )
+                return;
         }
-        else
-        {
-            const double* x0data = x0.data.db[idx];
-            for( j = 0; j < ivcount; j++ )
-                x[0][j] = x0data[j]*w[j*2] + w[j*2 + 1];
-        }
-
-        cvInitMatHeader( x1, 1, ivcount, CV_64F, x[0] );
 
-        // forward pass, compute y[i]=w*x[i-1], x[i]=f(y[i]), df[i]=f'(y[i])
-        for( i = 1; i < l_count; i++ )
+        for( i = 0; i < count; i++ )
         {
-            cvInitMatHeader( x2, 1, layer_sizes->data.i[i], CV_64F, x[i] );
-            cvInitMatHeader( &_w, x1->cols, x2->cols, CV_64F, weights[i] );
-            cvGEMM( x1, &_w, 1, 0, 0, x2 );
-            _df = *x2;
-            _df.data.db = df[i];
-            calc_activ_func_deriv( x2, &_df, _w.data.db + _w.rows*_w.cols );
-            CV_SWAP( x1, x2, temp );
-        }
+            const uchar* p = outputs.ptr(i);
+            const float* f = (const float*)p;
+            const double* d = (const double*)p;
 
-        cvInitMatHeader( grad1, 1, ovcount, CV_64F, buf_ptr );
-        *grad2 = *grad1;
-        grad2->data.db = buf_ptr + max_count;
+            for( j = 0; j < vcount; j++ )
+            {
+                double t = type == CV_32F ? (double)f[j] : d[j];
 
-        w = weights[l_count+1];
+                if( reset_weights )
+                {
+                    double mj = scale[j*2], Mj = scale[j*2+1];
+                    if( mj > t ) mj = t;
+                    if( Mj < t ) Mj = t;
 
-        // calculate error
-        if( u.type == CV_32F )
-        {
-            const float* udata = u.data.fl[idx];
-            for( k = 0; k < ovcount; k++ )
-            {
-                double t = udata[k]*w[k*2] + w[k*2+1] - x[l_count-1][k];
-                grad1->data.db[k] = t*sweight;
-                E += t*t;
-            }
-        }
-        else
-        {
-            const double* udata = u.data.db[idx];
-            for( k = 0; k < ovcount; k++ )
-            {
-                double t = udata[k]*w[k*2] + w[k*2+1] - x[l_count-1][k];
-                grad1->data.db[k] = t*sweight;
-                E += t*t;
+                    scale[j*2] = mj;
+                    scale[j*2+1] = Mj;
+                }
+                else if( !no_scale )
+                {
+                    t = t*inv_scale[j*2] + inv_scale[2*j+1];
+                    if( t < m1 || t > M1 )
+                        CV_Error( CV_StsOutOfRange,
+                                 "Some of the new output training vector components exceed the original range too much" );
+                }
             }
         }
-        E *= sweight;
 
-        // backward pass, update weights
-        for( i = l_count-1; i > 0; i-- )
-        {
-            int n1 = layer_sizes->data.i[i-1], n2 = layer_sizes->data.i[i];
-            cvInitMatHeader( &_df, 1, n2, CV_64F, df[i] );
-            cvMul( grad1, &_df, grad1 );
-            cvInitMatHeader( &_w, n1+1, n2, CV_64F, weights[i] );
-            cvInitMatHeader( &_dw, n1+1, n2, CV_64F, dw->data.db + (weights[i] - weights[0]) );
-            cvInitMatHeader( x1, n1+1, 1, CV_64F, x[i-1] );
-            x[i-1][n1] = 1.;
-            cvGEMM( x1, grad1, params.bp_dw_scale, &_dw, params.bp_moment_scale, &_dw );
-            cvAdd( &_w, &_dw, &_w );
-            if( i > 1 )
+        if( reset_weights )
+            for( j = 0; j < vcount; j++ )
             {
-                grad2->cols = n1;
-                _w.rows = n1;
-                cvGEMM( grad1, &_w, 1, 0, 0, grad2, CV_GEMM_B_T );
+                // map mj..Mj to m..M
+                double mj = scale[j*2], Mj = scale[j*2+1];
+                double a, b;
+                double delta = Mj - mj;
+                if( delta < DBL_EPSILON )
+                    a = 1, b = (M + m - Mj - mj)*0.5;
+                else
+                    a = (M - m)/delta, b = m - mj*a;
+                inv_scale[j*2] = a; inv_scale[j*2+1] = b;
+                a = 1./a; b = -b*a;
+                scale[j*2] = a; scale[j*2+1] = b;
             }
-            CV_SWAP( grad1, grad2, temp );
-        }
     }
 
-    iter /= count;
-
-    __END__;
-
-    cvReleaseMat( &dw );
-    cvReleaseMat( &buf );
-    cvReleaseMat( &_idx );
-    cvFree( &x );
-
-    return iter;
-}
-
-struct rprop_loop : cv::ParallelLoopBody {
-  rprop_loop(const CvANN_MLP* _point, double**& _weights, int& _count, int& _ivcount, CvVectors* _x0,
-     int& _l_count, CvMat*& _layer_sizes, int& _ovcount, int& _max_count,
-     CvVectors* _u, const double*& _sw, double& _inv_count, CvMat*& _dEdw, int& _dcount0, double* _E, int _buf_sz)
-  {
-    point = _point;
-    weights = _weights;
-    count = _count;
-    ivcount = _ivcount;
-    x0 = _x0;
-    l_count = _l_count;
-    layer_sizes = _layer_sizes;
-    ovcount = _ovcount;
-    max_count = _max_count;
-    u = _u;
-    sw = _sw;
-    inv_count = _inv_count;
-    dEdw = _dEdw;
-    dcount0 = _dcount0;
-    E = _E;
-    buf_sz = _buf_sz;
-  }
-
-  const CvANN_MLP* point;
-  double** weights;
-  int count;
-  int ivcount;
-  CvVectors* x0;
-  int l_count;
-  CvMat* layer_sizes;
-  int ovcount;
-  int max_count;
-  CvVectors* u;
-  const double* sw;
-  double inv_count;
-  CvMat* dEdw;
-  int dcount0;
-  double* E;
-  int buf_sz;
-
-
-  void operator()( const cv::Range& range ) const
-  {
-    double* buf_ptr;
-    double** x = 0;
-    double **df = 0;
-    int total = 0;
-
-    for(int i = 0; i < l_count; i++ )
-        total += layer_sizes->data.i[i];
-    CvMat* buf;
-    buf = cvCreateMat( 1, buf_sz, CV_64F );
-    x = (double**)cvAlloc( total*2*sizeof(x[0]) );
-    df = x + total;
-    buf_ptr = buf->data.db;
-    for(int i = 0; i < l_count; i++ )
+    void prepare_to_train( const Mat& inputs, const Mat& outputs,
+                           Mat& sample_weights, int flags )
     {
-        x[i] = buf_ptr;
-        df[i] = x[i] + layer_sizes->data.i[i]*dcount0;
-        buf_ptr += (df[i] - x[i])*2;
+        if( layer_sizes.empty() )
+            CV_Error( CV_StsError,
+                     "The network has not been created. Use method create or the appropriate constructor" );
+
+        if( (inputs.type() != CV_32F && inputs.type() != CV_64F) ||
+            inputs.cols != layer_sizes[0] )
+            CV_Error( CV_StsBadArg,
+                     "input training data should be a floating-point matrix with "
+                     "the number of rows equal to the number of training samples and "
+                     "the number of columns equal to the size of 0-th (input) layer" );
+
+        if( (outputs.type() != CV_32F && outputs.type() != CV_64F) ||
+            outputs.cols != layer_sizes.back() )
+            CV_Error( CV_StsBadArg,
+                     "output training data should be a floating-point matrix with "
+                     "the number of rows equal to the number of training samples and "
+                     "the number of columns equal to the size of last (output) layer" );
+
+        if( inputs.rows != outputs.rows )
+            CV_Error( CV_StsUnmatchedSizes, "The numbers of input and output samples do not match" );
+
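+        // Normalize the per-sample weights so they sum to 1; train_backprop
+        // later rescales each weight by the sample count, so uniform weights
+        // reduce to a factor of 1.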
+        Mat temp;
+        double s = sum(sample_weights)[0];
+        sample_weights.convertTo(temp, CV_64F, 1./s);
+        sample_weights = temp;
+
+        calc_input_scale( inputs, flags );
+        calc_output_scale( outputs, flags );
     }
 
-    for(int si = range.start; si < range.end; si++ )
+    bool train( const Ptr<TrainData>& trainData, int flags )
     {
-        if (si % dcount0 != 0) continue;
-        int n1, n2, k;
-        double* w;
-        CvMat _w, _dEdw, hdr1, hdr2, ghdr1, ghdr2, _df;
-        CvMat *x1, *x2, *grad1, *grad2, *temp;
-        int dcount = 0;
-
-        dcount = MIN(count - si , dcount0 );
-        w = weights[0];
-        grad1 = &ghdr1; grad2 = &ghdr2;
-        x1 = &hdr1; x2 = &hdr2;
-
-        // grab and preprocess input data
-        if( x0->type == CV_32F )
+        const int MAX_ITER = 1000;
+        const double DEFAULT_EPSILON = FLT_EPSILON;
+
+        // initialize training data
+        Mat inputs = trainData->getTrainSamples();
+        Mat outputs = trainData->getTrainResponses();
+        Mat sw = trainData->getTrainSampleWeights();
+        prepare_to_train( inputs, outputs, sw, flags );
+
+        // ... and link weights
+        if( !(flags & UPDATE_WEIGHTS) )
+            init_weights();
+
+        TermCriteria termcrit;
+        termcrit.type = TermCriteria::COUNT + TermCriteria::EPS;
+        termcrit.maxCount = std::max((params.termCrit.type & CV_TERMCRIT_ITER ? params.termCrit.maxCount : MAX_ITER), 1);
+        termcrit.epsilon = std::max((params.termCrit.type & CV_TERMCRIT_EPS ? params.termCrit.epsilon : DEFAULT_EPSILON), DBL_EPSILON);
+
+        int iter = 0;
+        switch( params.trainMethod )
+        {
+        case ANN_MLP::BACKPROP:
+            iter = train_backprop(inputs, outputs, sw, termcrit);
+            break;
+        case ANN_MLP::RPROP:
+            iter = train_rprop(inputs, outputs, sw, termcrit);
+            break;
+        case ANN_MLP::ANNEAL:
+            iter = train_anneal(trainData);
+            break;
+        }
+        trained = iter > 0;
+        return trained;
+    }
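+
+    // A minimal caller-side sketch of training through the public API
+    // (sample/response names are illustrative):
+    //   Ptr<ANN_MLP> nn = ANN_MLP::create();
+    //   nn->setLayerSizes((Mat_<int>(1, 3) << nvars, 8, nout));
+    //   nn->setActivationFunction(ANN_MLP::SIGMOID_SYM, 0, 0);
+    //   nn->setTrainMethod(ANN_MLP::RPROP);
+    //   nn->train(TrainData::create(samples, ROW_SAMPLE, responses));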
+    int train_anneal(const Ptr<TrainData>& trainData)
     {
-            for(int i = 0; i < dcount; i++ )
-            {
-                const float* x0data = x0->data.fl[si+i];
-                double* xdata = x[0]+i*ivcount;
-                for(int j = 0; j < ivcount; j++ )
-                    xdata[j] = x0data[j]*w[j*2] + w[j*2+1];
-            }
+        SimulatedAnnealingSolver t(Ptr<SimulatedAnnealingANN_MLP>(new SimulatedAnnealingANN_MLP(*this, trainData)));
+        t.setEnergyRNG(params.rEnergy);
+        t.setFinalTemperature(params.finalT);
+        t.setInitialTemperature(params.initialT);
+        t.setCoolingRatio(params.coolingRatio);
+        t.setIterPerStep(params.itePerStep);
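+        // The schedule follows standard simulated annealing: perturb the
+        // weights, accept a worse state with probability ~exp(-dE/T), and
+        // multiply T by coolingRatio after every itePerStep iterations
+        // until T falls below finalT.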
+        trained = true; // temporarily mark trained so calcError() can run
+        int iter = t.run();
+        trained = false;
+        return iter;
     }
-        else
-            for(int i = 0; i < dcount; i++ )
-            {
-                const double* x0data = x0->data.db[si+i];
-                double* xdata = x[0]+i*ivcount;
-                for(int j = 0; j < ivcount; j++ )
-                    xdata[j] = x0data[j]*w[j*2] + w[j*2+1];
-            }
-        cvInitMatHeader( x1, dcount, ivcount, CV_64F, x[0] );
 
-        // forward pass, compute y[i]=w*x[i-1], x[i]=f(y[i]), df[i]=f'(y[i])
-        for(int i = 1; i < l_count; i++ )
-        {
-            cvInitMatHeader( x2, dcount, layer_sizes->data.i[i], CV_64F, x[i] );
-            cvInitMatHeader( &_w, x1->cols, x2->cols, CV_64F, weights[i] );
-            cvGEMM( x1, &_w, 1, 0, 0, x2 );
-            _df = *x2;
-            _df.data.db = df[i];
-            point->calc_activ_func_deriv( x2, &_df, _w.data.db + _w.rows*_w.cols );
-            CV_SWAP( x1, x2, temp );
-        }
-        cvInitMatHeader( grad1, dcount, ovcount, CV_64F, buf_ptr );
+    int train_backprop( const Mat& inputs, const Mat& outputs, const Mat& _sw, TermCriteria termCrit )
+    {
+        int i, j, k;
+        double prev_E = DBL_MAX*0.5, E = 0;
+        int itype = inputs.type(), otype = outputs.type();
 
-        w = weights[l_count+1];
-        grad2->data.db = buf_ptr + max_count*dcount;
+        int count = inputs.rows;
 
-        // calculate error
-        if( u->type == CV_32F )
-            for(int i = 0; i < dcount; i++ )
-            {
-                const float* udata = u->data.fl[si+i];
-                const double* xdata = x[l_count-1] + i*ovcount;
-                double* gdata = grad1->data.db + i*ovcount;
-                double sweight = sw ? sw[si+i] : inv_count, E1 = 0;
+        int iter = -1, max_iter = termCrit.maxCount*count;
+        double epsilon = termCrit.epsilon*count;
 
-                for(int j = 0; j < ovcount; j++ )
-                {
-                    double t = udata[j]*w[j*2] + w[j*2+1] - xdata[j];
-                    gdata[j] = t*sweight;
-                    E1 += t*t;
-                }
-                *E += sweight*E1;
-            }
-        else
-            for(int i = 0; i < dcount; i++ )
-            {
-                const double* udata = u->data.db[si+i];
-                const double* xdata = x[l_count-1] + i*ovcount;
-                double* gdata = grad1->data.db + i*ovcount;
-                double sweight = sw ? sw[si+i] : inv_count, E1 = 0;
+        int l_count = layer_count();
+        int ivcount = layer_sizes[0];
+        int ovcount = layer_sizes.back();
 
-                for(int j = 0; j < ovcount; j++ )
-                {
-                    double t = udata[j]*w[j*2] + w[j*2+1] - xdata[j];
-                    gdata[j] = t*sweight;
-                    E1 += t*t;
-                }
-                *E += sweight*E1;
-            }
+        // allocate buffers
+        vector<vector<double> > x(l_count);
+        vector<vector<double> > df(l_count);
+        vector<Mat> dw(l_count);
 
-        // backward pass, update dEdw
-        static cv::Mutex mutex;
+        for( i = 0; i < l_count; i++ )
+        {
+            int n = layer_sizes[i];
+            x[i].resize(n+1);
+            df[i].resize(n);
+            dw[i] = Mat::zeros(weights[i].size(), CV_64F);
+        }
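+        // each x[i] keeps one extra slot (n+1) for the constant bias input,
+        // which is set to 1 during the backward pass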
 
-        for(int i = l_count-1; i > 0; i-- )
+        Mat _idx_m(1, count, CV_32S);
+        int* _idx = _idx_m.ptr<int>();
+        for( i = 0; i < count; i++ )
+            _idx[i] = i;
+
+        AutoBuffer<double> _buf(max_lsize*2);
+        double* buf[] = { _buf, (double*)_buf + max_lsize };
+
+        const double* sw = _sw.empty() ? 0 : _sw.ptr<double>();
+
+        // run back-propagation loop
+        /*
+         y_i = w_i*x_{i-1}
+         x_i = f(y_i)
+         E = 1/2*||u - x_N||^2
+         grad_N = (x_N - u)*f'(y_N)
+         dw_i(t) = momentum*dw_i(t-1) + dw_scale*x_{i-1}*grad_i
+         w_i(t+1) = w_i(t) + dw_i(t)
+         grad_{i-1} = w_i^t*grad_i
+        */
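+        // This is online (per-sample) gradient descent: every 'iter' applies
+        // a single-sample update, so max_iter = termCrit.maxCount*count spans
+        // maxCount epochs, and the index permutation is reshuffled once per
+        // epoch (when idx == 0).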
+        for( iter = 0; iter < max_iter; iter++ )
         {
-            n1 = layer_sizes->data.i[i-1]; n2 = layer_sizes->data.i[i];
-            cvInitMatHeader( &_df, dcount, n2, CV_64F, df[i] );
-            cvMul( grad1, &_df, grad1 );
+            int idx = iter % count;
+            double sweight = sw ? count*sw[idx] : 1.;
 
+            if( idx == 0 )
             {
-                cv::AutoLock lock(mutex);
-                cvInitMatHeader( &_dEdw, n1, n2, CV_64F, dEdw->data.db+(weights[i]-weights[0]) );
-                cvInitMatHeader( x1, dcount, n1, CV_64F, x[i-1] );
-                cvGEMM( x1, grad1, 1, &_dEdw, 1, &_dEdw, CV_GEMM_A_T );
-
-                // update bias part of dEdw
-                for( k = 0; k < dcount; k++ )
+                //printf("%d. E = %g\n", iter/count, E);
+                if( fabs(prev_E - E) < epsilon )
+                    break;
+                prev_E = E;
+                E = 0;
+
+                // shuffle indices
+                for( i = 0; i < count; i++ )
                 {
-                    double* dst = _dEdw.data.db + n1*n2;
-                    const double* src = grad1->data.db + k*n2;
-                    for(int j = 0; j < n2; j++ )
-                        dst[j] += src[j];
+                    j = rng.uniform(0, count);
+                    k = rng.uniform(0, count);
+                    std::swap(_idx[j], _idx[k]);
                 }
+            }
 
-                if (i > 1)
-                    cvInitMatHeader( &_w, n1, n2, CV_64F, weights[i] );
-           }
-
-           cvInitMatHeader( grad2, dcount, n1, CV_64F, grad2->data.db );
-           if( i > 1 )
-               cvGEMM( grad1, &_w, 1, 0, 0, grad2, CV_GEMM_B_T );
-           CV_SWAP( grad1, grad2, temp );
-        }
-    }
-    cvFree(&x);
-    cvReleaseMat( &buf );
-}
+            idx = _idx[idx];
 
-};
+            const uchar* x0data_p = inputs.ptr(idx);
+            const float* x0data_f = (const float*)x0data_p;
+            const double* x0data_d = (const double*)x0data_p;
 
+            double* w = weights[0].ptr<double>();
+            for( j = 0; j < ivcount; j++ )
+                x[0][j] = (itype == CV_32F ? (double)x0data_f[j] : x0data_d[j])*w[j*2] + w[j*2 + 1];
 
-int CvANN_MLP::train_rprop( CvVectors x0, CvVectors u, const double* sw )
-{
-    const int max_buf_size = 1 << 16;
-    CvMat* dw = 0;
-    CvMat* dEdw = 0;
-    CvMat* prev_dEdw_sign = 0;
-    CvMat* buf = 0;
-    double **x = 0, **df = 0;
-    int iter = -1, count = x0.count;
-
-    CV_FUNCNAME( "CvANN_MLP::train" );
-
-    __BEGIN__;
-
-    int i, ivcount, ovcount, l_count, total = 0, max_iter, buf_sz, dcount0;
-    double *buf_ptr;
-    double prev_E = DBL_MAX*0.5, epsilon;
-    double dw_plus, dw_minus, dw_min, dw_max;
-    double inv_count;
-
-    max_iter = params.term_crit.max_iter;
-    epsilon = params.term_crit.epsilon;
-    dw_plus = params.rp_dw_plus;
-    dw_minus = params.rp_dw_minus;
-    dw_min = params.rp_dw_min;
-    dw_max = params.rp_dw_max;
-
-    l_count = layer_sizes->cols;
-    ivcount = layer_sizes->data.i[0];
-    ovcount = layer_sizes->data.i[l_count-1];
-
-    // allocate buffers
-    for( i = 0; i < l_count; i++ )
-        total += layer_sizes->data.i[i];
-
-    CV_CALL( dw = cvCreateMat( wbuf->rows, wbuf->cols, wbuf->type ));
-    cvSet( dw, cvScalarAll(params.rp_dw0) );
-    CV_CALL( dEdw = cvCreateMat( wbuf->rows, wbuf->cols, wbuf->type ));
-    cvZero( dEdw );
-    CV_CALL( prev_dEdw_sign = cvCreateMat( wbuf->rows, wbuf->cols, CV_8SC1 ));
-    cvZero( prev_dEdw_sign );
-
-    inv_count = 1./count;
-    dcount0 = max_buf_size/(2*total);
-    dcount0 = MAX( dcount0, 1 );
-    dcount0 = MIN( dcount0, count );
-    buf_sz = dcount0*(total + max_count)*2;
-
-    CV_CALL( buf = cvCreateMat( 1, buf_sz, CV_64F ));
-
-    CV_CALL( x = (double**)cvAlloc( total*2*sizeof(x[0]) ));
-    df = x + total;
-    buf_ptr = buf->data.db;
-
-    for( i = 0; i < l_count; i++ )
-    {
-        x[i] = buf_ptr;
-        df[i] = x[i] + layer_sizes->data.i[i]*dcount0;
-        buf_ptr += (df[i] - x[i])*2;
-    }
-
-    // run rprop loop
-    /*
-        y_i(t) = w_i(t)*x_{i-1}(t)
-        x_i(t) = f(y_i(t))
-        E = sum_over_all_samples(1/2*||u - x_N||^2)
-        grad_N = (x_N - u)*f'(y_i)
+            Mat x1( 1, ivcount, CV_64F, &x[0][0] );
 
-                      MIN(dw_i{jk}(t)*dw_plus, dw_max), if dE/dw_i{jk}(t)*dE/dw_i{jk}(t-1) > 0
-        dw_i{jk}(t) = MAX(dw_i{jk}(t)*dw_minus, dw_min), if dE/dw_i{jk}(t)*dE/dw_i{jk}(t-1) < 0
-                      dw_i{jk}(t-1) else
+            // forward pass, compute y[i]=w*x[i-1], x[i]=f(y[i]), df[i]=f'(y[i])
+            for( i = 1; i < l_count; i++ )
+            {
+                int n = layer_sizes[i];
+                Mat x2(1, n, CV_64F, &x[i][0] );
+                Mat _w = weights[i].rowRange(0, x1.cols);
+                gemm(x1, _w, 1, noArray(), 0, x2);
+                Mat _df(1, n, CV_64F, &df[i][0] );
+                calc_activ_func_deriv( x2, _df, weights[i] );
+                x1 = x2;
+            }
 
-        if (dE/dw_i{jk}(t)*dE/dw_i{jk}(t-1) < 0)
-           dE/dw_i{jk}(t)<-0
-        else
-           w_i{jk}(t+1) = w_i{jk}(t) + dw_i{jk}(t)
-        grad_{i-1}(t) = w_i^t(t)*grad_i(t)
-    */
-    for( iter = 0; iter < max_iter; iter++ )
-    {
-        int n1, n2, j, k;
-        double E = 0;
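+            // grad1/grad2 ping-pong between the two halves of 'buf': parity
+            // l_count&1 here and i&1 in the backward loop keep the current
+            // gradient and the one being computed in disjoint storage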
+            Mat grad1( 1, ovcount, CV_64F, buf[l_count&1] );
+            w = weights[l_count+1].ptr<double>();
 
-        // first, iterate through all the samples and compute dEdw
-        cv::parallel_for_(cv::Range(0, count),
-            rprop_loop(this, weights, count, ivcount, &x0, l_count, layer_sizes,
-                       ovcount, max_count, &u, sw, inv_count, dEdw, dcount0, &E, buf_sz)
-        );
+            // calculate error
+            const uchar* udata_p = outputs.ptr(idx);
+            const float* udata_f = (const float*)udata_p;
+            const double* udata_d = (const double*)udata_p;
 
-        // now update weights
-        for( i = 1; i < l_count; i++ )
-        {
-            n1 = layer_sizes->data.i[i-1]; n2 = layer_sizes->data.i[i];
-            for( k = 0; k <= n1; k++ )
+            double* gdata = grad1.ptr<double>();
+            double E1 = 0;
+            for( k = 0; k < ovcount; k++ )
             {
-                double* wk = weights[i]+k*n2;
-                size_t delta = wk - weights[0];
-                double* dwk = dw->data.db + delta;
-                double* dEdwk = dEdw->data.db + delta;
-                char* prevEk = (char*)(prev_dEdw_sign->data.ptr + delta);
+                double t = (otype == CV_32F ? (double)udata_f[k] : udata_d[k])*w[k*2] + w[k*2+1] - x[l_count-1][k];
+                gdata[k] = t*sweight;
+                E1 += t*t;
+            }
+            // add only this sample's weighted error; scaling the running
+            // sum E by sweight would corrupt the previously accumulated terms
+            E += sweight*E1;
 
-                for( j = 0; j < n2; j++ )
+            // backward pass, update weights
+            for( i = l_count-1; i > 0; i-- )
+            {
+                int n1 = layer_sizes[i-1], n2 = layer_sizes[i];
+                Mat _df(1, n2, CV_64F, &df[i][0]);
+                multiply( grad1, _df, grad1 );
+                Mat _x(n1+1, 1, CV_64F, &x[i-1][0]);
+                x[i-1][n1] = 1.;
+                gemm( _x, grad1, params.bpDWScale, dw[i], params.bpMomentScale, dw[i] );
+                add( weights[i], dw[i], weights[i] );
+                if( i > 1 )
                 {
-                    double Eval = dEdwk[j];
-                    double dval = dwk[j];
-                    double wval = wk[j];
-                    int s = CV_SIGN(Eval);
-                    int ss = prevEk[j]*s;
-                    if( ss > 0 )
-                    {
-                        dval *= dw_plus;
-                        dval = MIN( dval, dw_max );
-                        dwk[j] = dval;
-                        wk[j] = wval + dval*s;
-                    }
-                    else if( ss < 0 )
-                    {
-                        dval *= dw_minus;
-                        dval = MAX( dval, dw_min );
-                        prevEk[j] = 0;
-                        dwk[j] = dval;
-                        wk[j] = wval + dval*s;
-                    }
-                    else
-                    {
-                        prevEk[j] = (char)s;
-                        wk[j] = wval + dval*s;
-                    }
-                    dEdwk[j] = 0.;
+                    Mat grad2(1, n1, CV_64F, buf[i&1]);
+                    Mat _w = weights[i].rowRange(0, n1);
+                    gemm( grad1, _w, 1, noArray(), 0, grad2, GEMM_2_T );
+                    grad1 = grad2;
                 }
             }
         }
 
-        //printf("%d. E = %g\n", iter, E);
-        if( fabs(prev_E - E) < epsilon )
-            break;
-        prev_E = E;
-        E = 0;
+        iter /= count;
+        return iter;
     }
 
-    __END__;
+    struct RPropLoop : public ParallelLoopBody
+    {
+        RPropLoop(ANN_MLPImpl* _ann,
+                  const Mat& _inputs, const Mat& _outputs, const Mat& _sw,
+                  int _dcount0, vector<Mat>& _dEdw, double* _E)
+        {
+            ann = _ann;
+            inputs = _inputs;
+            outputs = _outputs;
+            sw = _sw.ptr<double>();
+            dcount0 = _dcount0;
+            dEdw = &_dEdw;
+            pE = _E;
+        }
 
-    cvReleaseMat( &dw );
-    cvReleaseMat( &dEdw );
-    cvReleaseMat( &prev_dEdw_sign );
-    cvReleaseMat( &buf );
-    cvFree( &x );
+        ANN_MLPImpl* ann;
+        vector<Mat>* dEdw;
+        Mat inputs, outputs;
+        const double* sw;
+        int dcount0;
+        double* pE;
 
-    return iter;
-}
+        void operator()( const Range& range ) const
+        {
+            double inv_count = 1./inputs.rows;
+            int ivcount = ann->layer_sizes.front();
+            int ovcount = ann->layer_sizes.back();
+            int itype = inputs.type(), otype = outputs.type();
+            int count = inputs.rows;
+            int i, j, k, l_count = ann->layer_count();
+            vector<vector<double> > x(l_count);
+            vector<vector<double> > df(l_count);
+            vector<double> _buf(ann->max_lsize*dcount0*2);
+            double* buf[] = { &_buf[0], &_buf[ann->max_lsize*dcount0] };
+            double E = 0;
+
+            for( i = 0; i < l_count; i++ )
+            {
+                x[i].resize(ann->layer_sizes[i]*dcount0);
+                df[i].resize(ann->layer_sizes[i]*dcount0);
+            }
 
+            for( int si = range.start; si < range.end; si++ )
+            {
+                int i0 = si*dcount0, i1 = std::min((si + 1)*dcount0, count);
+                int dcount = i1 - i0;
+                const double* w = ann->weights[0].ptr<double>();
 
-void CvANN_MLP::write_params( CvFileStorage* fs ) const
-{
-    //CV_FUNCNAME( "CvANN_MLP::write_params" );
+                // grab and preprocess input data
+                for( i = 0; i < dcount; i++ )
+                {
+                    const uchar* x0data_p = inputs.ptr(i0 + i);
+                    const float* x0data_f = (const float*)x0data_p;
+                    const double* x0data_d = (const double*)x0data_p;
 
-    __BEGIN__;
+                    double* xdata = &x[0][i*ivcount];
+                    for( j = 0; j < ivcount; j++ )
+                        xdata[j] = (itype == CV_32F ? (double)x0data_f[j] : x0data_d[j])*w[j*2] + w[j*2+1];
+                }
+                Mat x1(dcount, ivcount, CV_64F, &x[0][0]);
 
-    const char* activ_func_name = activ_func == IDENTITY ? "IDENTITY" :
-                            activ_func == SIGMOID_SYM ? "SIGMOID_SYM" :
-                            activ_func == GAUSSIAN ? "GAUSSIAN" : 0;
+                // forward pass, compute y[i]=w*x[i-1], x[i]=f(y[i]), df[i]=f'(y[i])
+                for( i = 1; i < l_count; i++ )
+                {
+                    Mat x2( dcount, ann->layer_sizes[i], CV_64F, &x[i][0] );
+                    Mat _w = ann->weights[i].rowRange(0, x1.cols);
+                    gemm( x1, _w, 1, noArray(), 0, x2 );
+                    Mat _df( x2.size(), CV_64F, &df[i][0] );
+                    ann->calc_activ_func_deriv( x2, _df, ann->weights[i] );
+                    x1 = x2;
+                }
 
-    if( activ_func_name )
-        cvWriteString( fs, "activation_function", activ_func_name );
-    else
-        cvWriteInt( fs, "activation_function", activ_func );
+                Mat grad1(dcount, ovcount, CV_64F, buf[l_count & 1]);
 
-    if( activ_func != IDENTITY )
-    {
-        cvWriteReal( fs, "f_param1", f_param1 );
-        cvWriteReal( fs, "f_param2", f_param2 );
-    }
+                w = ann->weights[l_count+1].ptr<double>();
 
-    cvWriteReal( fs, "min_val", min_val );
-    cvWriteReal( fs, "max_val", max_val );
-    cvWriteReal( fs, "min_val1", min_val1 );
-    cvWriteReal( fs, "max_val1", max_val1 );
+                // calculate error
+                for( i = 0; i < dcount; i++ )
+                {
+                    const uchar* udata_p = outputs.ptr(i0+i);
+                    const float* udata_f = (const float*)udata_p;
+                    const double* udata_d = (const double*)udata_p;
 
-    cvStartWriteStruct( fs, "training_params", CV_NODE_MAP );
-    if( params.train_method == CvANN_MLP_TrainParams::BACKPROP )
-    {
-        cvWriteString( fs, "train_method", "BACKPROP" );
-        cvWriteReal( fs, "dw_scale", params.bp_dw_scale );
-        cvWriteReal( fs, "moment_scale", params.bp_moment_scale );
-    }
-    else if( params.train_method == CvANN_MLP_TrainParams::RPROP )
-    {
-        cvWriteString( fs, "train_method", "RPROP" );
-        cvWriteReal( fs, "dw0", params.rp_dw0 );
-        cvWriteReal( fs, "dw_plus", params.rp_dw_plus );
-        cvWriteReal( fs, "dw_minus", params.rp_dw_minus );
-        cvWriteReal( fs, "dw_min", params.rp_dw_min );
-        cvWriteReal( fs, "dw_max", params.rp_dw_max );
-    }
+                    const double* xdata = &x[l_count-1][i*ovcount];
+                    double* gdata = grad1.ptr<double>(i);
+                    // weight indexed by the sample (i0+i), matching the
+                    // inputs/outputs indexing above
+                    double sweight = sw ? sw[i0+i] : inv_count, E1 = 0;
 
-    cvStartWriteStruct( fs, "term_criteria", CV_NODE_MAP + CV_NODE_FLOW );
-    if( params.term_crit.type & CV_TERMCRIT_EPS )
-        cvWriteReal( fs, "epsilon", params.term_crit.epsilon );
-    if( params.term_crit.type & CV_TERMCRIT_ITER )
-        cvWriteInt( fs, "iterations", params.term_crit.max_iter );
-    cvEndWriteStruct( fs );
+                    for( j = 0; j < ovcount; j++ )
+                    {
+                        double t = (otype == CV_32F ? (double)udata_f[j] : udata_d[j])*w[j*2] + w[j*2+1] - xdata[j];
+                        gdata[j] = t*sweight;
+                        E1 += t*t;
+                    }
+                    E += sweight*E1;
+                }
 
-    cvEndWriteStruct( fs );
+                for( i = l_count-1; i > 0; i-- )
+                {
+                    int n1 = ann->layer_sizes[i-1], n2 = ann->layer_sizes[i];
+                    Mat _df(dcount, n2, CV_64F, &df[i][0]);
+                    multiply(grad1, _df, grad1);
 
-    __END__;
-}
+                    {
+                        AutoLock lock(ann->mtx);
+                        Mat _dEdw = dEdw->at(i).rowRange(0, n1);
+                        x1 = Mat(dcount, n1, CV_64F, &x[i-1][0]);
+                        gemm(x1, grad1, 1, _dEdw, 1, _dEdw, GEMM_1_T);
+
+                        // update bias part of dEdw
+                        double* dst = dEdw->at(i).ptr<double>(n1);
+                        for( k = 0; k < dcount; k++ )
+                        {
+                            const double* src = grad1.ptr<double>(k);
+                            for( j = 0; j < n2; j++ )
+                                dst[j] += src[j];
+                        }
+                    }
 
+                    Mat grad2( dcount, n1, CV_64F, buf[i&1] );
+                    if( i > 1 )
+                    {
+                        Mat _w = ann->weights[i].rowRange(0, n1);
+                        gemm(grad1, _w, 1, noArray(), 0, grad2, GEMM_2_T);
+                    }
+                    grad1 = grad2;
+                }
+            }
+            {
+                AutoLock lock(ann->mtx);
+                *pE += E;
+            }
+        }
+    };
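+    // Each Range element handed to RPropLoop is one chunk of up to dcount0
+    // samples; partial dE/dw sums and the error E are folded into shared
+    // state under ann->mtx, so chunks can run in parallel.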
 
-void CvANN_MLP::write( CvFileStorage* fs, const char* name ) const
-{
-    CV_FUNCNAME( "CvANN_MLP::write" );
+    int train_rprop( const Mat& inputs, const Mat& outputs, const Mat& _sw, TermCriteria termCrit )
+    {
+        const int max_buf_size = 1 << 16;
+        int i, iter = -1, count = inputs.rows;
 
-    __BEGIN__;
+        double prev_E = DBL_MAX*0.5;
 
-    int i, l_count = layer_sizes->cols;
+        int max_iter = termCrit.maxCount;
+        double epsilon = termCrit.epsilon;
+        double dw_plus = params.rpDWPlus;
+        double dw_minus = params.rpDWMinus;
+        double dw_min = params.rpDWMin;
+        double dw_max = params.rpDWMax;
 
-    if( !layer_sizes )
-        CV_ERROR( CV_StsError, "The network has not been initialized" );
+        int l_count = layer_count();
 
-    cvStartWriteStruct( fs, name, CV_NODE_MAP, CV_TYPE_NAME_ML_ANN_MLP );
+        // allocate buffers
+        vector<Mat> dw(l_count), dEdw(l_count), prev_dEdw_sign(l_count);
 
-    cvWrite( fs, "layer_sizes", layer_sizes );
+        int total = 0;
+        for( i = 0; i < l_count; i++ )
+        {
+            total += layer_sizes[i];
+            dw[i].create(weights[i].size(), CV_64F);
+            dw[i].setTo(Scalar::all(params.rpDW0));
+            prev_dEdw_sign[i] = Mat::zeros(weights[i].size(), CV_8S);
+            dEdw[i] = Mat::zeros(weights[i].size(), CV_64F);
+        }
 
-    write_params( fs );
+        int dcount0 = max_buf_size/(2*total);
+        dcount0 = std::max( dcount0, 1 );
+        dcount0 = std::min( dcount0, count );
+        int chunk_count = (count + dcount0 - 1)/dcount0;
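+        // dcount0 caps the per-chunk batch so each worker's activation and
+        // derivative buffers stay within roughly max_buf_size doubles;
+        // chunk_count is then the number of parallel work items per epoch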
+
+        // run rprop loop
+        /*
+         y_i(t) = w_i(t)*x_{i-1}(t)
+         x_i(t) = f(y_i(t))
+         E = sum_over_all_samples(1/2*||u - x_N||^2)
+         grad_N = (x_N - u)*f'(y_N)
+
+                       { std::min(dw_i{jk}(t)*dw_plus, dw_max),  if dE/dw_i{jk}(t)*dE/dw_i{jk}(t-1) > 0
+         dw_i{jk}(t) = { std::max(dw_i{jk}(t)*dw_minus, dw_min), if dE/dw_i{jk}(t)*dE/dw_i{jk}(t-1) < 0
+                       { dw_i{jk}(t-1),                          otherwise
+
+         if (dE/dw_i{jk}(t)*dE/dw_i{jk}(t-1) < 0)
+         dE/dw_i{jk}(t)<-0
+         else
+         w_i{jk}(t+1) = w_i{jk}(t) + dw_i{jk}(t)
+         grad_{i-1}(t) = w_i^t(t)*grad_i(t)
+         */
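+        // RPROP adapts each weight's step size from the sign of successive
+        // gradients only (Riedmiller & Braun), so it is insensitive to the
+        // gradient magnitude and needs no global learning-rate tuning.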
+        for( iter = 0; iter < max_iter; iter++ )
+        {
+            double E = 0;
 
-    cvStartWriteStruct( fs, "input_scale", CV_NODE_SEQ + CV_NODE_FLOW );
-    cvWriteRawData( fs, weights[0], layer_sizes->data.i[0]*2, "d" );
-    cvEndWriteStruct( fs );
+            for( i = 0; i < l_count; i++ )
+                dEdw[i].setTo(Scalar::all(0));
 
-    cvStartWriteStruct( fs, "output_scale", CV_NODE_SEQ + CV_NODE_FLOW );
-    cvWriteRawData( fs, weights[l_count], layer_sizes->data.i[l_count-1]*2, "d" );
-    cvEndWriteStruct( fs );
+            // first, iterate through all the samples and compute dEdw
+            RPropLoop invoker(this, inputs, outputs, _sw, dcount0, dEdw, &E);
+            parallel_for_(Range(0, chunk_count), invoker);
+            //invoker(Range(0, chunk_count));
 
-    cvStartWriteStruct( fs, "inv_output_scale", CV_NODE_SEQ + CV_NODE_FLOW );
-    cvWriteRawData( fs, weights[l_count+1], layer_sizes->data.i[l_count-1]*2, "d" );
-    cvEndWriteStruct( fs );
+            // now update weights
+            for( i = 1; i < l_count; i++ )
+            {
+                int n1 = layer_sizes[i-1], n2 = layer_sizes[i];
+                for( int k = 0; k <= n1; k++ )
+                {
+                    CV_Assert(weights[i].size() == Size(n2, n1+1));
+                    double* wk = weights[i].ptr<double>(k);
+                    double* dwk = dw[i].ptr<double>(k);
+                    double* dEdwk = dEdw[i].ptr<double>(k);
+                    schar* prevEk = prev_dEdw_sign[i].ptr<schar>(k);
 
-    cvStartWriteStruct( fs, "weights", CV_NODE_SEQ );
-    for( i = 1; i < l_count; i++ )
-    {
-        cvStartWriteStruct( fs, 0, CV_NODE_SEQ + CV_NODE_FLOW );
-        cvWriteRawData( fs, weights[i], (layer_sizes->data.i[i-1]+1)*layer_sizes->data.i[i], "d" );
-        cvEndWriteStruct( fs );
-    }
+                    for( int j = 0; j < n2; j++ )
+                    {
+                        double Eval = dEdwk[j];
+                        double dval = dwk[j];
+                        double wval = wk[j];
+                        int s = CV_SIGN(Eval);
+                        int ss = prevEk[j]*s;
+                        if( ss > 0 )
+                        {
+                            dval *= dw_plus;
+                            dval = std::min( dval, dw_max );
+                            dwk[j] = dval;
+                            wk[j] = wval + dval*s;
+                        }
+                        else if( ss < 0 )
+                        {
+                            dval *= dw_minus;
+                            dval = std::max( dval, dw_min );
+                            prevEk[j] = 0;
+                            dwk[j] = dval;
+                            wk[j] = wval + dval*s;
+                        }
+                        else
+                        {
+                            prevEk[j] = (schar)s;
+                            wk[j] = wval + dval*s;
+                        }
+                        dEdwk[j] = 0.;
+                    }
+                }
+            }
 
-    cvEndWriteStruct( fs );
+            //printf("%d. E = %g\n", iter, E);
+            if( fabs(prev_E - E) < epsilon )
+                break;
+            prev_E = E;
+        }
 
-    __END__;
-}
+        return iter;
+    }
 
+    void write_params( FileStorage& fs ) const
+    {
+        const char* activ_func_name = activ_func == IDENTITY ? "IDENTITY" :
+                                      activ_func == SIGMOID_SYM ? "SIGMOID_SYM" :
+                                      activ_func == GAUSSIAN ? "GAUSSIAN" :
+                                      activ_func == RELU ? "RELU" :
+                                      activ_func == LEAKYRELU ? "LEAKYRELU" : 0;
+
+        if( activ_func_name )
+            fs << "activation_function" << activ_func_name;
+        else
+            fs << "activation_function_id" << activ_func;
 
-void CvANN_MLP::read_params( CvFileStorage* fs, CvFileNode* node )
-{
-    //CV_FUNCNAME( "CvANN_MLP::read_params" );
+        if( activ_func != IDENTITY )
+        {
+            fs << "f_param1" << f_param1;
+            fs << "f_param2" << f_param2;
+        }
 
-    __BEGIN__;
+        fs << "min_val" << min_val << "max_val" << max_val << "min_val1" << min_val1 << "max_val1" << max_val1;
 
-    const char* activ_func_name = cvReadStringByName( fs, node, "activation_function", 0 );
-    CvFileNode* tparams_node;
+        fs << "training_params" << "{";
+        if( params.trainMethod == ANN_MLP::BACKPROP )
+        {
+            fs << "train_method" << "BACKPROP";
+            fs << "dw_scale" << params.bpDWScale;
+            fs << "moment_scale" << params.bpMomentScale;
+        }
+        else if (params.trainMethod == ANN_MLP::RPROP)
+        {
+            fs << "train_method" << "RPROP";
+            fs << "dw0" << params.rpDW0;
+            fs << "dw_plus" << params.rpDWPlus;
+            fs << "dw_minus" << params.rpDWMinus;
+            fs << "dw_min" << params.rpDWMin;
+            fs << "dw_max" << params.rpDWMax;
+        }
+        else if (params.trainMethod == ANN_MLP::ANNEAL)
+        {
+            fs << "train_method" << "ANNEAL";
+            fs << "initialT" << params.initialT;
+            fs << "finalT" << params.finalT;
+            fs << "coolingRatio" << params.coolingRatio;
+            fs << "itePerStep" << params.itePerStep;
+        }
+        else
+            CV_Error(CV_StsError, "Unknown training method");
+
+        fs << "term_criteria" << "{";
+        if( params.termCrit.type & TermCriteria::EPS )
+            fs << "epsilon" << params.termCrit.epsilon;
+        if( params.termCrit.type & TermCriteria::COUNT )
+            fs << "iterations" << params.termCrit.maxCount;
+        fs << "}" << "}";
+    }
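+    // The serialized section looks roughly like this in YAML (RPROP case):
+    //   training_params:
+    //     train_method: RPROP
+    //     dw0: 0.1
+    //     dw_plus: 1.2
+    //     dw_minus: 0.5
+    //     term_criteria: { epsilon: 0.01, iterations: 1000 }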
 
-    if( activ_func_name )
-        activ_func = strcmp( activ_func_name, "SIGMOID_SYM" ) == 0 ? SIGMOID_SYM :
-                     strcmp( activ_func_name, "IDENTITY" ) == 0 ? IDENTITY :
-                     strcmp( activ_func_name, "GAUSSIAN" ) == 0 ? GAUSSIAN : 0;
-    else
-        activ_func = cvReadIntByName( fs, node, "activation_function" );
+    void write( FileStorage& fs ) const
+    {
+        if( layer_sizes.empty() )
+            return;
+        int i, l_count = layer_count();
 
-    f_param1 = cvReadRealByName( fs, node, "f_param1", 0 );
-    f_param2 = cvReadRealByName( fs, node, "f_param2", 0 );
+        writeFormat(fs);
+        fs << "layer_sizes" << layer_sizes;
 
-    set_activ_func( activ_func, f_param1, f_param2 );
+        write_params( fs );
 
-    min_val = cvReadRealByName( fs, node, "min_val", 0. );
-    max_val = cvReadRealByName( fs, node, "max_val", 1. );
-    min_val1 = cvReadRealByName( fs, node, "min_val1", 0. );
-    max_val1 = cvReadRealByName( fs, node, "max_val1", 1. );
+        size_t esz = weights[0].elemSize();
 
-    tparams_node = cvGetFileNodeByName( fs, node, "training_params" );
-    params = CvANN_MLP_TrainParams();
+        fs << "input_scale" << "[";
+        fs.writeRaw("d", weights[0].ptr(), weights[0].total()*esz);
 
-    if( tparams_node )
-    {
-        const char* tmethod_name = cvReadStringByName( fs, tparams_node, "train_method", "" );
-        CvFileNode* tcrit_node;
+        fs << "]" << "output_scale" << "[";
+        fs.writeRaw("d", weights[l_count].ptr(), weights[l_count].total()*esz);
 
-        if( strcmp( tmethod_name, "BACKPROP" ) == 0 )
-        {
-            params.train_method = CvANN_MLP_TrainParams::BACKPROP;
-            params.bp_dw_scale = cvReadRealByName( fs, tparams_node, "dw_scale", 0 );
-            params.bp_moment_scale = cvReadRealByName( fs, tparams_node, "moment_scale", 0 );
-        }
-        else if( strcmp( tmethod_name, "RPROP" ) == 0 )
+        fs << "]" << "inv_output_scale" << "[";
+        fs.writeRaw("d", weights[l_count+1].ptr(), weights[l_count+1].total()*esz);
+
+        fs << "]" << "weights" << "[";
+        for( i = 1; i < l_count; i++ )
         {
-            params.train_method = CvANN_MLP_TrainParams::RPROP;
-            params.rp_dw0 = cvReadRealByName( fs, tparams_node, "dw0", 0 );
-            params.rp_dw_plus = cvReadRealByName( fs, tparams_node, "dw_plus", 0 );
-            params.rp_dw_minus = cvReadRealByName( fs, tparams_node, "dw_minus", 0 );
-            params.rp_dw_min = cvReadRealByName( fs, tparams_node, "dw_min", 0 );
-            params.rp_dw_max = cvReadRealByName( fs, tparams_node, "dw_max", 0 );
+            fs << "[";
+            fs.writeRaw("d", weights[i].ptr(), weights[i].total()*esz);
+            fs << "]";
         }
+        fs << "]";
+    }
 
-        tcrit_node = cvGetFileNodeByName( fs, tparams_node, "term_criteria" );
-        if( tcrit_node )
+    void read_params( const FileNode& fn )
+    {
+        String activ_func_name = (String)fn["activation_function"];
+        if( !activ_func_name.empty() )
         {
-            params.term_crit.epsilon = cvReadRealByName( fs, tcrit_node, "epsilon", -1 );
-            params.term_crit.max_iter = cvReadIntByName( fs, tcrit_node, "iterations", -1 );
-            params.term_crit.type = (params.term_crit.epsilon >= 0 ? CV_TERMCRIT_EPS : 0) +
-                                   (params.term_crit.max_iter >= 0 ? CV_TERMCRIT_ITER : 0);
+            activ_func = activ_func_name == "SIGMOID_SYM" ? SIGMOID_SYM :
+                         activ_func_name == "IDENTITY" ? IDENTITY :
+                         activ_func_name == "RELU" ? RELU :
+                         activ_func_name == "LEAKYRELU" ? LEAKYRELU :
+                         activ_func_name == "GAUSSIAN" ? GAUSSIAN : -1;
+            CV_Assert( activ_func >= 0 );
         }
-    }
+        else
+            activ_func = (int)fn["activation_function_id"];
 
-    __END__;
-}
+        f_param1 = (double)fn["f_param1"];
+        f_param2 = (double)fn["f_param2"];
 
+        setActivationFunction( activ_func, f_param1, f_param2);
 
-void CvANN_MLP::read( CvFileStorage* fs, CvFileNode* node )
-{
-    CvMat* _layer_sizes = 0;
+        min_val = (double)fn["min_val"];
+        max_val = (double)fn["max_val"];
+        min_val1 = (double)fn["min_val1"];
+        max_val1 = (double)fn["max_val1"];
 
-    CV_FUNCNAME( "CvANN_MLP::read" );
+        FileNode tpn = fn["training_params"];
+        params = AnnParams();
 
-    __BEGIN__;
+        if( !tpn.empty() )
+        {
+            String tmethod_name = (String)tpn["train_method"];
 
-    CvFileNode* w;
-    CvSeqReader reader;
-    int i, l_count;
+            if( tmethod_name == "BACKPROP" )
+            {
+                params.trainMethod = ANN_MLP::BACKPROP;
+                params.bpDWScale = (double)tpn["dw_scale"];
+                params.bpMomentScale = (double)tpn["moment_scale"];
+            }
+            else if (tmethod_name == "RPROP")
+            {
+                params.trainMethod = ANN_MLP::RPROP;
+                params.rpDW0 = (double)tpn["dw0"];
+                params.rpDWPlus = (double)tpn["dw_plus"];
+                params.rpDWMinus = (double)tpn["dw_minus"];
+                params.rpDWMin = (double)tpn["dw_min"];
+                params.rpDWMax = (double)tpn["dw_max"];
+            }
+            else if (tmethod_name == "ANNEAL")
+            {
+                params.trainMethod = ANN_MLP::ANNEAL;
+                params.initialT = (double)tpn["initialT"];
+                params.finalT = (double)tpn["finalT"];
+                params.coolingRatio = (double)tpn["coolingRatio"];
+                params.itePerStep = tpn["itePerStep"];
+            }
+            else
+                CV_Error(CV_StsParseError, "Unknown training method (should be BACKPROP or RPROP)");
+
+            FileNode tcn = tpn["term_criteria"];
+            if( !tcn.empty() )
+            {
+                FileNode tcn_e = tcn["epsilon"];
+                FileNode tcn_i = tcn["iterations"];
+                params.termCrit.type = 0;
+                if( !tcn_e.empty() )
+                {
+                    params.termCrit.type |= TermCriteria::EPS;
+                    params.termCrit.epsilon = (double)tcn_e;
+                }
+                if( !tcn_i.empty() )
+                {
+                    params.termCrit.type |= TermCriteria::COUNT;
+                    params.termCrit.maxCount = (int)tcn_i;
+                }
+            }
+        }
+    }
+
+    void read( const FileNode& fn )
+    {
+        clear();
 
-    _layer_sizes = (CvMat*)cvReadByName( fs, node, "layer_sizes" );
-    CV_CALL( create( _layer_sizes, SIGMOID_SYM, 0, 0 ));
+        vector<int> _layer_sizes;
+        readVectorOrMat(fn["layer_sizes"], _layer_sizes);
+        setLayerSizes( _layer_sizes );
 
-    cvReleaseMat( &_layer_sizes );
-    _layer_sizes = NULL;
+        int i, l_count = layer_count();
+        read_params(fn);
 
-    l_count = layer_sizes->cols;
+        size_t esz = weights[0].elemSize();
 
-    CV_CALL( read_params( fs, node ));
+        FileNode w = fn["input_scale"];
+        w.readRaw("d", weights[0].ptr(), weights[0].total()*esz);
 
-    w = cvGetFileNodeByName( fs, node, "input_scale" );
-    if( !w || CV_NODE_TYPE(w->tag) != CV_NODE_SEQ ||
-        w->data.seq->total != layer_sizes->data.i[0]*2 )
-        CV_ERROR( CV_StsParseError, "input_scale tag is not found or is invalid" );
+        w = fn["output_scale"];
+        w.readRaw("d", weights[l_count].ptr(), weights[l_count].total()*esz);
 
-    CV_CALL( cvReadRawData( fs, w, weights[0], "d" ));
+        w = fn["inv_output_scale"];
+        w.readRaw("d", weights[l_count+1].ptr(), weights[l_count+1].total()*esz);
 
-    w = cvGetFileNodeByName( fs, node, "output_scale" );
-    if( !w || CV_NODE_TYPE(w->tag) != CV_NODE_SEQ ||
-        w->data.seq->total != layer_sizes->data.i[l_count-1]*2 )
-        CV_ERROR( CV_StsParseError, "output_scale tag is not found or is invalid" );
+        FileNodeIterator w_it = fn["weights"].begin();
 
-    CV_CALL( cvReadRawData( fs, w, weights[l_count], "d" ));
+        for( i = 1; i < l_count; i++, ++w_it )
+            (*w_it).readRaw("d", weights[i].ptr(), weights[i].total()*esz);
+        trained = true;
+    }
 
-    w = cvGetFileNodeByName( fs, node, "inv_output_scale" );
-    if( !w || CV_NODE_TYPE(w->tag) != CV_NODE_SEQ ||
-        w->data.seq->total != layer_sizes->data.i[l_count-1]*2 )
-        CV_ERROR( CV_StsParseError, "inv_output_scale tag is not found or is invalid" );
+    Mat getWeights(int layerIdx) const
+    {
+        CV_Assert( 0 <= layerIdx && layerIdx < (int)weights.size() );
+        return weights[layerIdx];
+    }
 
-    CV_CALL( cvReadRawData( fs, w, weights[l_count+1], "d" ));
+    bool isTrained() const
+    {
+        return trained;
+    }
 
-    w = cvGetFileNodeByName( fs, node, "weights" );
-    if( !w || CV_NODE_TYPE(w->tag) != CV_NODE_SEQ ||
-        w->data.seq->total != l_count - 1 )
-        CV_ERROR( CV_StsParseError, "weights tag is not found or is invalid" );
+    bool isClassifier() const
+    {
+        return false;
+    }
 
-    cvStartReadSeq( w->data.seq, &reader );
+    int getVarCount() const
+    {
+        return layer_sizes.empty() ? 0 : layer_sizes[0];
+    }
 
-    for( i = 1; i < l_count; i++ )
+    String getDefaultName() const
     {
-        w = (CvFileNode*)reader.ptr;
-        CV_CALL( cvReadRawData( fs, w, weights[i], "d" ));
-        CV_NEXT_SEQ_ELEM( reader.seq->elem_size, reader );
+        return "opencv_ml_ann_mlp";
     }
 
-    __END__;
-}
+    vector<int> layer_sizes;
+    vector<Mat> weights;
+    double f_param1, f_param2;
+    double min_val, max_val, min_val1, max_val1;
+    int activ_func;
+    int max_lsize, max_buf_sz;
+    AnnParams params;
+    RNG rng;
+    Mutex mtx;
+    bool trained;
+};
 
-using namespace cv;
 
-CvANN_MLP::CvANN_MLP( const Mat& _layer_sizes, int _activ_func,
-                      double _f_param1, double _f_param2 )
-{
-    layer_sizes = wbuf = 0;
-    min_val = max_val = min_val1 = max_val1 = 0.;
-    weights = 0;
-    rng = &cv::theRNG();
-    default_model_name = "my_nn";
-    create( _layer_sizes, _activ_func, _f_param1, _f_param2 );
-}
 
-void CvANN_MLP::create( const Mat& _layer_sizes, int _activ_func,
-                       double _f_param1, double _f_param2 )
-{
-    CvMat cvlayer_sizes = _layer_sizes;
-    create( &cvlayer_sizes, _activ_func, _f_param1, _f_param2 );
-}
 
-int CvANN_MLP::train( const Mat& _inputs, const Mat& _outputs,
-                     const Mat& _sample_weights, const Mat& _sample_idx,
-                     CvANN_MLP_TrainParams _params, int flags )
+Ptr<ANN_MLP> ANN_MLP::create()
 {
-    CvMat inputs = _inputs, outputs = _outputs, sweights = _sample_weights, sidx = _sample_idx;
-    return train(&inputs, &outputs, sweights.data.ptr ? &sweights : 0,
-                 sidx.data.ptr ? &sidx : 0, _params, flags);
+    return makePtr<ANN_MLPImpl>();
 }
 
-float CvANN_MLP::predict( const Mat& _inputs, Mat& _outputs ) const
+Ptr<ANN_MLP> ANN_MLP::load(const String& filepath)
 {
-    CV_Assert(layer_sizes != 0);
-    _outputs.create(_inputs.rows, layer_sizes->data.i[layer_sizes->cols-1], _inputs.type());
-    CvMat inputs = _inputs, outputs = _outputs;
-
-    return predict(&inputs, &outputs);
+    FileStorage fs;
+    fs.open(filepath, FileStorage::READ);
+    CV_Assert(fs.isOpened());
+    Ptr<ANN_MLP> ann = makePtr<ANN_MLPImpl>();
+    ((ANN_MLPImpl*)ann.get())->read(fs.getFirstTopLevelNode());
+    return ann;
 }
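+
+// A minimal load-and-predict sketch (file path and sample are illustrative):
+//   Ptr<ANN_MLP> nn = ANN_MLP::load("mlp.yml");
+//   Mat response;
+//   nn->predict(sample, response);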
 
+}}
+
 /* End of file. */