namespace neural_nets {
namespace optimization {

/*!
 * Update using AdaGrad - adaptive gradient descent.
 * Derives from OptimizationFunction - the abstract class representing the interface to an optimization function.
 */
template <typename eT = float>
class AdaGrad : public OptimizationFunction<eT> {
public:
    /// Constructor. Allocates and zeroes the gradient accumulator and the update matrix.
    AdaGrad(size_t rows_, size_t cols_, eT eps_ = 1e-8) : eps(eps_) {
        G = MAKE_MATRIX_PTR(eT, rows_, cols_);
        G->zeros();

        delta = MAKE_MATRIX_PTR(eT, rows_, cols_);
        delta->zeros();
    }

    /// Calculates the update according to the AdaGrad rule: delta = learning_rate * dx / sqrt(G + eps).
    mic::types::MatrixPtr<eT> calculateUpdate(mic::types::MatrixPtr<eT> x_, mic::types::MatrixPtr<eT> dx_, eT learning_rate_) {
        assert(x_->size() == dx_->size());
        assert(x_->size() == G->size());

        // Accumulate the squares of the gradients.
        for (size_t i = 0; i < (size_t)x_->size(); i++)
            (*G)[i] += (*dx_)[i] * (*dx_)[i];

        // Scale each gradient component by the inverse root of its accumulated history.
        for (size_t i = 0; i < (size_t)x_->size(); i++)
            (*delta)[i] = learning_rate_ * (*dx_)[i] / (std::sqrt((*G)[i] + eps));

        return delta;
    }

protected:
    /// Sum of all of the squares of the gradients up to time t ("diagonal matrix").
    mic::types::MatrixPtr<eT> G;

    /// Smoothing term that avoids division by zero.
    eT eps;

    /// Calculated update.
    mic::types::MatrixPtr<eT> delta;
};

} // namespace optimization
} // namespace neural_nets
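A minimal usage sketch, relying only on the constructs visible above (MAKE_MATRIX_PTR allocation, element access through (*ptr)[i], and calculateUpdate); the dimensions, learning rate, and the final descent loop that applies the returned delta are illustrative assumptions, not part of this class:

    // Hypothetical setup: a 10x1 parameter matrix and its gradient.
    mic::types::MatrixPtr<float> params = MAKE_MATRIX_PTR(float, 10, 1);
    mic::types::MatrixPtr<float> grads = MAKE_MATRIX_PTR(float, 10, 1);
    neural_nets::optimization::AdaGrad<float> opt(10, 1);

    // ... compute gradients into grads ...

    // Get the adaptively scaled update and apply a descent step (assumed caller-side convention).
    mic::types::MatrixPtr<float> update = opt.calculateUpdate(params, grads, 0.01f);
    for (size_t i = 0; i < (size_t)params->size(); i++)
        (*params)[i] -= (*update)[i];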
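For reference, the same update rule in self-contained standard C++, independent of the mic::types wrappers; all names here are illustrative:

    #include <cmath>
    #include <cstddef>
    #include <vector>

    // One AdaGrad step on plain vectors: accumulate squared gradients in G,
    // then scale each component's step by the inverse root of its history.
    void adagrad_step(std::vector<float> &x, const std::vector<float> &dx,
                      std::vector<float> &G, float learning_rate, float eps = 1e-8f) {
        for (std::size_t i = 0; i < x.size(); i++) {
            G[i] += dx[i] * dx[i];
            x[i] -= learning_rate * dx[i] / std::sqrt(G[i] + eps);
        }
    }

Because each parameter divides by the root of its own accumulated squared gradients, frequently updated parameters take smaller steps while rarely updated ones keep larger ones, which is the adaptive behavior the class name refers to.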