31 namespace neural_nets {
32 namespace optimization {
38 template <
typename eT=
float>
47 AdaDelta(
size_t rows_,
size_t cols_, eT decay_ = 0.9, eT eps_ = 1e-8) :
decay(decay_),
eps(eps_) {
48 EG = MAKE_MATRIX_PTR(eT, rows_, cols_);
51 ED = MAKE_MATRIX_PTR(eT, rows_, cols_);
55 delta = MAKE_MATRIX_PTR(eT, rows_, cols_);
65 mic::types::MatrixPtr<eT>
calculateUpdate(mic::types::MatrixPtr<eT> x_, mic::types::MatrixPtr<eT> dx_, eT learning_rate_) {
66 assert(x_->size() == dx_->size());
67 assert(x_->size() ==
EG->size());
77 for (
size_t i=0; i<(size_t)x_->size(); i++) {
78 (*EG)[i] =
decay *(*EG)[i] + (1.0 -
decay) * (*dx_)[i] * (*dx_)[i];
80 assert(std::isfinite((*
EG)[i]));
84 for (
size_t i=0; i<(size_t)x_->size(); i++)
88 for (
size_t i=0; i<(size_t)x_->size(); i++){
90 (*delta)[i] = (std::sqrt((*
ED)[i] +
eps) / std::sqrt((*
EG)[i] +
eps)) * (*dx_)[i];
92 assert(std::isfinite((*delta)[i]));
107 mic::types::MatrixPtr<eT>
EG;
110 mic::types::MatrixPtr<eT>
ED;
eT eps — smoothing term that avoids division by zero.
mic::types::MatrixPtr< eT > ED — decaying average of the squares of updates up to time t ("diagonal matrix"): \(E[\Delta\theta^2]\).
Update using AdaDelta — adaptive gradient descent with running averages \(E[g^2]\) and \(E[\Delta\theta^2]\).
Abstract class representing interface to optimization function.
AdaDelta(size_t rows_, size_t cols_, eT decay_=0.9, eT eps_=1e-8)
eT decay — decay ratio, similar to momentum.
mic::types::MatrixPtr< eT > delta — the calculated update.
mic::types::MatrixPtr< eT > EG — decaying average of the squares of gradients up to time t ("diagonal matrix"): \(E[g^2]\).
mic::types::MatrixPtr< eT > calculateUpdate(mic::types::MatrixPtr< eT > x_, mic::types::MatrixPtr< eT > dx_, eT learning_rate_)