#include <cassert>   // assert()
#include <cmath>     // std::isfinite(), std::sqrt()

namespace neural_nets {
namespace optimization {

/*!
 * AdamID - ADAM with integral and derivative coefficients.
 */
// Note: the base class is assumed from the accompanying documentation
// ("abstract class representing interface to optimization function").
template <typename eT = float>
class AdamID : public OptimizationFunction<eT> {
public:

    /// Constructor. Sets the decay rates and the smoothing term and allocates the internal buffers.
    AdamID(size_t rows_, size_t cols_, eT beta1_ = 0.9, eT beta2_ = 0.999, eT eps_ = 1e-8)
        : beta1(beta1_), beta2(beta2_), eps(eps_)
    {
        Edx = MAKE_MATRIX_PTR(eT, rows_, cols_);
        Edx2 = MAKE_MATRIX_PTR(eT, rows_, cols_);
        dx_prev = MAKE_MATRIX_PTR(eT, rows_, cols_);
        delta = MAKE_MATRIX_PTR(eT, rows_, cols_);
        // (Zero-initialization of these buffers is assumed to follow in the original source.)
    }

    /// Calculates the update on the basis of the current and previous gradients.
    mic::types::MatrixPtr<eT> calculateUpdate(mic::types::MatrixPtr<eT> x_, mic::types::MatrixPtr<eT> dx_, eT learning_rate_) {
        assert(x_->size() == dx_->size());
        assert(x_->size() == Edx->size());

        // Update the decaying average of gradients E[g].
        for (size_t i = 0; i < (size_t)Edx->size(); i++) {
            (*Edx)[i] = beta1 * (*Edx)[i] + (1.0 - beta1) * (*dx_)[i];
            assert(std::isfinite((*Edx)[i]));
        }

        // Update the decaying average of squared gradients E[g^2].
        for (size_t i = 0; i < (size_t)Edx2->size(); i++) {
            (*Edx2)[i] = beta2 * (*Edx2)[i] + (1.0 - beta2) * (*dx_)[i] * (*dx_)[i];
            assert(std::isfinite((*Edx2)[i]));
        }

        // Calculate the update: an integral term driven by E[g] plus a derivative term driven by (g_t - g_{t-1}).
        for (size_t i = 0; i < (size_t)delta->size(); i++) {
            eT delta_ID = learning_rate_ * (*Edx)[i] + learning_rate_ * learning_rate_ * ((*dx_)[i] - (*dx_prev)[i]);
            // The lines combining delta_ID with E[g^2] and eps into (*delta)[i] are elided in the
            // original listing; the normalization below is an assumption based on the documented members.
            (*delta)[i] = delta_ID / (std::sqrt((*Edx2)[i]) + eps);
            assert(std::isfinite((*delta)[i]));
        }

        // Store the current gradients for the next iteration.
        for (size_t i = 0; i < (size_t)dx_->size(); i++)
            (*dx_prev)[i] = (*dx_)[i];

        // Return the calculated update.
        return delta;
    }

protected:
    /// Decaying average of gradients up to time t - E[g].
    mic::types::MatrixPtr<eT> Edx;

    /// Decaying average of squared gradients up to time t - E[g^2].
    mic::types::MatrixPtr<eT> Edx2;

    /// Previous value of gradients.
    mic::types::MatrixPtr<eT> dx_prev;

    /// Calculated update.
    mic::types::MatrixPtr<eT> delta;

    /// Decay rates and smoothing term.
    eT beta1, beta2, eps;

};//: class AdamID

}//: namespace optimization
}//: namespace neural_nets
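To make the per-element arithmetic in calculateUpdate easier to follow, here is a self-contained sketch that mirrors its loops on plain std::vector<float> buffers. The names State and adamIDStep, the normalization by sqrt(E[g^2]) + eps, and the subtraction of the update from the parameters are illustrative assumptions, not part of the mic library's API.

    #include <cassert>
    #include <cmath>
    #include <cstddef>
    #include <vector>

    // Illustrative, standalone version of the AdamID-style step; only the
    // update arithmetic follows the listing above, all names are hypothetical.
    struct State {
        std::vector<float> Edx, Edx2, dx_prev;   // E[g], E[g^2], previous gradient
        float beta1 = 0.9f, beta2 = 0.999f, eps = 1e-8f;
    };

    // Computes the update for one parameter vector and applies it in place.
    void adamIDStep(std::vector<float>& x, const std::vector<float>& dx,
                    float learning_rate, State& s) {
        assert(x.size() == dx.size() && x.size() == s.Edx.size());
        for (std::size_t i = 0; i < x.size(); ++i) {
            // Decaying averages of gradients and squared gradients.
            s.Edx[i]  = s.beta1 * s.Edx[i]  + (1.0f - s.beta1) * dx[i];
            s.Edx2[i] = s.beta2 * s.Edx2[i] + (1.0f - s.beta2) * dx[i] * dx[i];
            // Integral term (E[g]) plus derivative term (g_t - g_{t-1}).
            float delta_ID = learning_rate * s.Edx[i]
                           + learning_rate * learning_rate * (dx[i] - s.dx_prev[i]);
            // Assumed normalization by the RMS of gradients (cf. the eps description).
            float delta = delta_ID / (std::sqrt(s.Edx2[i]) + s.eps);
            x[i] -= delta;               // assumed sign convention: descend along the gradient
            s.dx_prev[i] = dx[i];        // remember the gradient for the next call
        }
    }

    int main() {
        State s;
        s.Edx.assign(4, 0.0f); s.Edx2.assign(4, 0.0f); s.dx_prev.assign(4, 0.0f);
        std::vector<float> x(4, 1.0f);
        std::vector<float> dx = {0.1f, -0.2f, 0.05f, 0.0f};  // a made-up gradient
        adamIDStep(x, dx, 0.001f, s);
        return 0;
    }

In the library itself the same role is played by calculateUpdate returning the delta matrix, which the caller then applies to the parameters.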
Member reference:

AdamID - ADAM with integral and derivative coefficients. The class implements the interface of the abstract optimization-function base class.

AdamID(size_t rows_, size_t cols_, eT beta1_ = 0.9, eT beta2_ = 0.999, eT eps_ = 1e-8)
mic::types::MatrixPtr<eT> calculateUpdate(mic::types::MatrixPtr<eT> x_, mic::types::MatrixPtr<eT> dx_, eT learning_rate_)

eT beta1
    Decay rate 1 (momentum for past gradients).
eT beta2
    Decay rate 2 (momentum for past squared gradients).
eT beta1_powt
    Decay rate 1 to the power of t - bias correction.
eT beta2_powt
    Decay rate 2 to the power of t - bias correction.
eT eps
    Smoothing term that avoids division by zero.
mic::types::MatrixPtr<eT> Edx
    Decaying average of gradients up to time t - E[g].
mic::types::MatrixPtr<eT> Edx2
    Decaying average of squared gradients up to time t - E[g^2].
mic::types::MatrixPtr<eT> dx_prev
    Previous value of gradients.
mic::types::MatrixPtr<eT> delta
    Calculated update.
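Read together with the listing, the members correspond to the following quantities, written out in LaTeX. The bias-corrected forms using beta1_powt and beta2_powt are implied by the member descriptions rather than visible in the extracted code, so they are an assumption based on the standard ADAM recipe.

    % Decaying averages maintained by calculateUpdate (Edx, Edx2):
    E[g]_t   = \beta_1 \, E[g]_{t-1}   + (1 - \beta_1)\, g_t
    E[g^2]_t = \beta_2 \, E[g^2]_{t-1} + (1 - \beta_2)\, g_t^2

    % Bias correction implied by beta1_powt = \beta_1^t and beta2_powt = \beta_2^t:
    \widehat{E[g]}_t = \frac{E[g]_t}{1 - \beta_1^t}, \qquad
    \widehat{E[g^2]}_t = \frac{E[g^2]_t}{1 - \beta_2^t}

    % Integral-plus-derivative term computed in the listing (delta_ID), with learning rate \eta:
    \delta^{ID}_t = \eta \, E[g]_t + \eta^2 \left( g_t - g_{t-1} \right)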