#include <cassert>
#include <cmath>
#include <cstddef>
// Library headers providing mic::types::MatrixPtr, MAKE_MATRIX_PTR and the OptimizationFunction base class are assumed above.

namespace neural_nets {
namespace optimization {

/*!
 * \brief Adam - adaptive moment estimation.
 */
template <typename eT = float>
class Adam : public OptimizationFunction<eT> {
public:
	Adam(size_t rows_, size_t cols_, eT beta1_ = 0.9, eT beta2_ = 0.999, eT eps_ = 1e-8)
		: beta1(beta1_), beta2(beta2_), eps(eps_)
	{
		// Allocate and zero the internal matrices.
		m = MAKE_MATRIX_PTR(eT, rows_, cols_);
		m->zeros();
		v = MAKE_MATRIX_PTR(eT, rows_, cols_);
		v->zeros();
		delta = MAKE_MATRIX_PTR(eT, rows_, cols_);
		delta->zeros();
		// Decay rates raised to the power of t (t = 1 before the first update).
		beta1_powt = beta1;
		beta2_powt = beta2;
	}

	/// Calculates the update according to the Adam rule.
	mic::types::MatrixPtr<eT> calculateUpdate(mic::types::MatrixPtr<eT> x_,
			mic::types::MatrixPtr<eT> dx_, eT learning_rate_ = 0.001) {
		assert(x_->size() == dx_->size());
		assert(x_->size() == m->size());

		// Update the exponentially decaying average of past gradients.
		for (size_t i = 0; i < (size_t)x_->size(); i++)
			(*m)[i] = beta1 * (*m)[i] + (1 - beta1) * (*dx_)[i];

		// Update the exponentially decaying average of past squared gradients.
		for (size_t i = 0; i < (size_t)x_->size(); i++)
			(*v)[i] = beta2 * (*v)[i] + (1 - beta2) * (*dx_)[i] * (*dx_)[i];

		// Calculate the bias-corrected update.
		for (size_t i = 0; i < (size_t)x_->size(); i++)
			(*delta)[i] = learning_rate_ * ((*m)[i] / (1 - beta1_powt)) /
					(std::sqrt((*v)[i] / (1 - beta2_powt)) + eps);

		// Advance the cached powers of the decay rates.
		beta1_powt *= beta1;
		beta2_powt *= beta2;

		return delta;
	}

protected:
	mic::types::MatrixPtr<eT> m;      ///< Exponentially decaying average of past gradients.
	mic::types::MatrixPtr<eT> v;      ///< Exponentially decaying average of past squared gradients.
	// (remaining members: delta, beta1, beta2, beta1_powt, beta2_powt, eps)
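The listing above depends on the library's matrix types (mic::types::MatrixPtr, MAKE_MATRIX_PTR). As a minimal, self-contained sketch of the same update rule, here is the identical arithmetic over plain std::vector buffers; the names AdamState and adam_step are illustrative stand-ins, not part of the library:

#include <cassert>
#include <cmath>
#include <cstddef>
#include <vector>

// Hypothetical stand-alone sketch; AdamState and adam_step are not library names.
struct AdamState {
	std::vector<double> m, v;            // decaying averages of gradients / squared gradients
	double beta1 = 0.9, beta2 = 0.999, eps = 1e-8;
	double beta1_powt = 0.9, beta2_powt = 0.999;  // beta^t caches; t = 1 before the first step

	explicit AdamState(size_t n) : m(n, 0.0), v(n, 0.0) {}
};

// Applies one Adam step in place: x -= learning_rate * m_hat / (sqrt(v_hat) + eps).
void adam_step(std::vector<double>& x, const std::vector<double>& dx,
               AdamState& s, double learning_rate = 0.001) {
	assert(x.size() == dx.size() && x.size() == s.m.size());
	for (size_t i = 0; i < x.size(); i++) {
		s.m[i] = s.beta1 * s.m[i] + (1 - s.beta1) * dx[i];
		s.v[i] = s.beta2 * s.v[i] + (1 - s.beta2) * dx[i] * dx[i];
		const double m_hat = s.m[i] / (1 - s.beta1_powt);  // bias correction
		const double v_hat = s.v[i] / (1 - s.beta2_powt);
		x[i] -= learning_rate * m_hat / (std::sqrt(v_hat) + s.eps);
	}
	s.beta1_powt *= s.beta1;  // advance the cached powers for the next step
	s.beta2_powt *= s.beta2;
}

Note the same design choice as in the class: the powers beta^t are cached and multiplied once per step instead of being recomputed with std::pow, keeping the per-step cost linear in the parameter count.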
Adam - adaptive moment estimation.

Adam(size_t rows_, size_t cols_, eT beta1_=0.9, eT beta2_=0.999, eT eps_=1e-8)
Constructor: allocates and zeroes m, v and delta.

mic::types::MatrixPtr<eT> calculateUpdate(mic::types::MatrixPtr<eT> x_, mic::types::MatrixPtr<eT> dx_, eT learning_rate_=0.001)
Calculates the update; implements the interface of the abstract optimization function class.

mic::types::MatrixPtr<eT> m
Exponentially decaying average of past gradients.

mic::types::MatrixPtr<eT> v
Exponentially decaying average of past squared gradients.

mic::types::MatrixPtr<eT> delta
Calculated update.

eT beta1
Decay rate 1 (momentum for past gradients).

eT beta2
Decay rate 2 (momentum for past squared gradients).

eT beta1_powt
Decay rate 1 to the power of t.

eT beta2_powt
Decay rate 2 to the power of t.

eT eps
Smoothing term that avoids division by zero.
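For reference, the members listed above combine into the standard Adam equations, with g_t the gradient (dx_), \eta the learning rate (learning_rate_), and \beta_1^t, \beta_2^t cached in beta1_powt and beta2_powt:

\[
\begin{aligned}
m_t &= \beta_1 m_{t-1} + (1-\beta_1)\, g_t, \\
v_t &= \beta_2 v_{t-1} + (1-\beta_2)\, g_t^2, \\
\Delta_t &= \eta \, \frac{m_t / (1-\beta_1^t)}{\sqrt{v_t / (1-\beta_2^t)} + \epsilon}.
\end{aligned}
\]

The division by 1-\beta^t corrects the bias of the zero-initialized averages m and v, which matters most during the first few update steps.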