#include <cassert>
#include <cmath>
#include <cstddef>

namespace neural_nets {
namespace optimization {

/*!
 * GradPID - adaptive gradient descent with proportional, integral and derivative coefficients.
 */
template <typename eT = float>
class GradPID : public OptimizationFunction<eT> {
public:
    /*!
     * Constructor. Allocates the matrices used by the optimizer and stores the hyperparameters.
     */
    GradPID(size_t rows_, size_t cols_, eT decay_ = 0.9, eT eps_ = 1e-8) : decay(decay_), eps(eps_) {
        Edx = MAKE_MATRIX_PTR(eT, rows_, cols_);
        dx_prev = MAKE_MATRIX_PTR(eT, rows_, cols_);
        deltaP = MAKE_MATRIX_PTR(eT, rows_, cols_);
        deltaI = MAKE_MATRIX_PTR(eT, rows_, cols_);
        deltaD = MAKE_MATRIX_PTR(eT, rows_, cols_);
        delta = MAKE_MATRIX_PTR(eT, rows_, cols_);
    }
    /*!
     * Calculates the update according to the GradPID rule.
     */
    mic::types::MatrixPtr<eT> calculateUpdate(mic::types::MatrixPtr<eT> x_, mic::types::MatrixPtr<eT> dx_, eT learning_rate_ = 0.001) {
        assert(x_->size() == dx_->size());
        assert(x_->size() == Edx->size());

        // Set the adaptive rates from the learning rate
        // (the i_rate assignment is not shown in this listing).
        p_rate = learning_rate_ * learning_rate_ * learning_rate_ * learning_rate_;
        d_rate = learning_rate_ * learning_rate_ * learning_rate_;

        // Update the decaying average of gradients up to time t - E[g].
        for (size_t i = 0; i < (size_t)Edx->size(); i++) {
            (*Edx)[i] = decay * (*Edx)[i] + (1.0 - decay) * (*dx_)[i];
            assert(std::isfinite((*Edx)[i]));
        }

        // Proportional update - based on the current gradient.
        for (size_t i = 0; i < (size_t)delta->size(); i++) {
            (*deltaP)[i] = p_rate * (*dx_)[i];
        }

        // Integral update - based on the decaying average of gradients.
        for (size_t i = 0; i < (size_t)delta->size(); i++) {
            (*deltaI)[i] = i_rate * (*Edx)[i];
        }

        // Derivative update - based on the change of the gradient since the previous step.
        for (size_t i = 0; i < (size_t)delta->size(); i++) {
            (*deltaD)[i] = d_rate * ((*dx_)[i] - (*dx_prev)[i]);
        }

        // Sum the three terms into the final update.
        for (size_t i = 0; i < (size_t)delta->size(); i++) {
            (*delta)[i] = (*deltaP)[i] + (*deltaI)[i] + (*deltaD)[i];
            assert(std::isfinite((*delta)[i]));
        }

        // Store the gradients for the next iteration.
        for (size_t i = 0; i < (size_t)dx_->size(); i++) {
            (*dx_prev)[i] = (*dx_)[i];
        }

        // Return the computed update.
        return delta;
    }

protected:
    /// Decaying average of gradients up to time t - E[g].
    mic::types::MatrixPtr<eT> Edx;

    /// Previous value of gradients.
    mic::types::MatrixPtr<eT> dx_prev;

    /// Proportional update.
    mic::types::MatrixPtr<eT> deltaP;

    /// Integral update.
    mic::types::MatrixPtr<eT> deltaI;

    /// Derivative update.
    mic::types::MatrixPtr<eT> deltaD;

    /// Calculated update.
    mic::types::MatrixPtr<eT> delta;

    /// Decay ratio, similar to momentum.
    eT decay;

    /// Smoothing term that avoids division by zero.
    eT eps;

    /// Adaptive proportional factor (learning rate).
    eT p_rate;

    /// Adaptive integral factor (learning rate).
    eT i_rate;

    /// Adaptive derivative factor (learning rate).
    eT d_rate;
};
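In PID-controller terms, calculateUpdate() above combines three views of the gradient signal: the current gradient (proportional term), its decaying average (integral term), and its change since the previous step (derivative term). With g_t denoting the current gradient dx_, the rule mirrors the code exactly:

    E[g]_t  = decay * E[g]_{t-1} + (1 - decay) * g_t
    delta_t = p_rate * g_t + i_rate * E[g]_t + d_rate * (g_t - g_{t-1})

with p_rate = learning_rate^4 and d_rate = learning_rate^3 as set in the method; the i_rate assignment is not shown in the listing.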
/*!
 * AdaGradPID - adaptive gradient descent with proportional, integral and derivative coefficients,
 * in which the three rates are themselves matrices, i.e. one adaptive rate per parameter.
 */
template <typename eT = float>
class AdaGradPID : public OptimizationFunction<eT> {
public:
    /*!
     * Constructor. Allocates the matrices used by the optimizer and stores the hyperparameters.
     */
    AdaGradPID(size_t rows_, size_t cols_, eT decay_ = 0.9, eT eps_ = 1e-8) : decay(decay_), eps(eps_) {
        Edx = MAKE_MATRIX_PTR(eT, rows_, cols_);
        dx_prev = MAKE_MATRIX_PTR(eT, rows_, cols_);
        deltaP = MAKE_MATRIX_PTR(eT, rows_, cols_);
        deltaI = MAKE_MATRIX_PTR(eT, rows_, cols_);
        deltaD = MAKE_MATRIX_PTR(eT, rows_, cols_);
        delta = MAKE_MATRIX_PTR(eT, rows_, cols_);

        // Unlike in GradPID, the rates are matrices - one adaptive rate per parameter.
        p_rate = MAKE_MATRIX_PTR(eT, rows_, cols_);
        i_rate = MAKE_MATRIX_PTR(eT, rows_, cols_);
        d_rate = MAKE_MATRIX_PTR(eT, rows_, cols_);
    }
    /*!
     * Calculates the update according to the AdaGradPID rule.
     */
    mic::types::MatrixPtr<eT> calculateUpdate(mic::types::MatrixPtr<eT> x_, mic::types::MatrixPtr<eT> dx_, eT learning_rate_ = 0.001) {
        assert(x_->size() == dx_->size());
        assert(x_->size() == Edx->size());

        // Per-element computation of the proportional, integral and derivative
        // terms; the loop bodies are elided in this listing.
        for (size_t i = 0; i < (size_t)delta->size(); i++) {
            // ... (*deltaP)[i] ...
        }
        for (size_t i = 0; i < (size_t)delta->size(); i++) {
            // ... (*deltaI)[i] ...
        }
        for (size_t i = 0; i < (size_t)delta->size(); i++) {
            // ... (*deltaD)[i] ...
        }

        // Sum the three terms into the final update.
        for (size_t i = 0; i < (size_t)delta->size(); i++) {
            (*delta)[i] = (*deltaP)[i] + (*deltaI)[i] + (*deltaD)[i];
            assert(std::isfinite((*delta)[i]));
        }

        // Store the gradients for the next iteration.
        for (size_t i = 0; i < (size_t)dx_->size(); i++) {
            (*dx_prev)[i] = (*dx_)[i];
        }

        // Return the computed update.
        return delta;
    }

protected:
    /// Decaying average of gradients up to time t - E[g].
    mic::types::MatrixPtr<eT> Edx;

    /// Previous value of gradients.
    mic::types::MatrixPtr<eT> dx_prev;

    /// Proportional update.
    mic::types::MatrixPtr<eT> deltaP;

    /// Integral update.
    mic::types::MatrixPtr<eT> deltaI;

    /// Derivative update.
    mic::types::MatrixPtr<eT> deltaD;

    /// Calculated update.
    mic::types::MatrixPtr<eT> delta;

    /// Decay ratio, similar to momentum.
    eT decay;

    /// Smoothing term that avoids division by zero.
    eT eps;

    /// Adaptive proportional factor (learning rate).
    mic::types::MatrixPtr<eT> p_rate;

    /// Adaptive integral factor (learning rate).
    mic::types::MatrixPtr<eT> i_rate;

    /// Adaptive derivative factor (learning rate).
    mic::types::MatrixPtr<eT> d_rate;
};

} // namespace optimization
} // namespace neural_nets
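For orientation, here is a minimal usage sketch (not part of the source): a single GradPID step on a small parameter matrix. The include path, the matrix contents, and the subtract-the-update sign convention are assumptions.

#include <cstddef>

// Assumed header providing mic::types::MatrixPtr and MAKE_MATRIX_PTR.
#include <types/MatrixTypes.hpp>

int main() {
    const size_t rows = 2, cols = 3;

    // Parameters and their gradients (values illustrative).
    mic::types::MatrixPtr<float> x = MAKE_MATRIX_PTR(float, rows, cols);
    mic::types::MatrixPtr<float> dx = MAKE_MATRIX_PTR(float, rows, cols);
    for (size_t i = 0; i < (size_t)x->size(); i++) {
        (*x)[i] = 0.5f;
        (*dx)[i] = 0.1f;
    }

    // One optimizer instance per parameter matrix of matching shape.
    neural_nets::optimization::GradPID<float> opt(rows, cols);

    // Compute the update and apply it (descent: subtract the update).
    mic::types::MatrixPtr<float> update = opt.calculateUpdate(x, dx, 0.001f);
    for (size_t i = 0; i < (size_t)x->size(); i++)
        (*x)[i] -= (*update)[i];

    return 0;
}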