MachineIntelligenceCore:NeuralNets
AdamID.hpp
#ifndef ADAMID_HPP_
#define ADAMID_HPP_

#include <cassert>
#include <cmath>

// OptimizationFunction - abstract class representing the interface to an optimization function.
// The include path is assumed from the project layout.
#include <optimization/OptimizationFunction.hpp>

namespace mic {
namespace neural_nets {
namespace optimization {
/*!
 * \brief AdamID - ADAM with integral and derivative coefficients.
 */
template <typename eT = float>
class AdamID : public OptimizationFunction<eT> {
public:
    /*!
     * Constructor. Sets the decay rates and epsilon, allocates and zeroes all internal matrices.
     * \param rows_ Number of rows of the parameter matrix.
     * \param cols_ Number of columns of the parameter matrix.
     * \param beta1_ Decay rate 1 (default: 0.9).
     * \param beta2_ Decay rate 2 (default: 0.999).
     * \param eps_ Smoothing term (default: 1e-8).
     */
    AdamID(size_t rows_, size_t cols_, eT beta1_ = 0.9, eT beta2_ = 0.999, eT eps_ = 1e-8)
        : beta1(beta1_), beta2(beta2_), eps(eps_)
    {
        // Allocate and reset the decaying average of gradients.
        Edx = MAKE_MATRIX_PTR(eT, rows_, cols_);
        Edx->zeros();

        // Allocate and reset the decaying average of squared gradients.
        Edx2 = MAKE_MATRIX_PTR(eT, rows_, cols_);
        Edx2->zeros();

        // Allocate and reset the matrix storing the previous gradients.
        dx_prev = MAKE_MATRIX_PTR(eT, rows_, cols_);
        dx_prev->zeros();

        // Allocate and reset delta.
        delta = MAKE_MATRIX_PTR(eT, rows_, cols_);
        delta->zeros();

        // Initialize the "powered" decay factors used for bias correction.
        beta1_powt = beta1;
        beta2_powt = beta2;
    }

    /*!
     * Calculates the update according to the AdamID rule.
     * \param x_ Pointer to the matrix of optimized parameters (used only for size checks).
     * \param dx_ Pointer to the matrix of gradients.
     * \param learning_rate_ Learning rate.
     * \return Pointer to the matrix containing the calculated update.
     */
    mic::types::MatrixPtr<eT> calculateUpdate(mic::types::MatrixPtr<eT> x_, mic::types::MatrixPtr<eT> dx_, eT learning_rate_) {
        assert(x_->size() == dx_->size());
        assert(x_->size() == Edx->size());

        // Update the decaying average of gradients up to time t - the INTEGRAL term.
        for (size_t i = 0; i < (size_t)Edx->size(); i++) {
            (*Edx)[i] = beta1 * (*Edx)[i] + (1.0 - beta1) * (*dx_)[i];
            assert(std::isfinite((*Edx)[i]));
        }

        // Update the decaying average of squared gradients up to time t - the NORMALIZER.
        for (size_t i = 0; i < (size_t)Edx2->size(); i++) {
            (*Edx2)[i] = beta2 * (*Edx2)[i] + (1.0 - beta2) * (*dx_)[i] * (*dx_)[i];
            assert(std::isfinite((*Edx2)[i]));
        }

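        // The update computed below combines the integral term (the decaying average of gradients E[g])
        // with a small derivative correction (the change of the gradient since the previous step):
        //   delta_ID = lr * E[g] + lr^2 * (g_t - g_prev)
        //   delta    = delta_ID / ( (sqrt(E[g^2] / (1 - beta2^t)) + eps) * (1 - beta1^t) )
        // where the (1 - beta^t) factors are the standard ADAM bias corrections.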
        // Calculate the update.
        for (size_t i = 0; i < (size_t)delta->size(); i++) {
            // Update = integral + small derivative correction, i.e. lr * I + lr^2 * D.
            eT delta_ID = learning_rate_ * (*Edx)[i] + learning_rate_ * learning_rate_ * ((*dx_)[i] - (*dx_prev)[i]);
            (*delta)[i] = 1.0 / (std::sqrt((*Edx2)[i] / (1 - beta2_powt)) + eps) * delta_ID / (1 - beta1_powt);
            assert(std::isfinite((*delta)[i]));
        }

        // Store the current gradients for the next step.
        for (size_t i = 0; i < (size_t)dx_->size(); i++) {
            (*dx_prev)[i] = (*dx_)[i];
        }

        // Update the "powered" decay factors used for bias correction.
        beta1_powt *= beta1;
        beta2_powt *= beta2;

        // Return the update.
        return delta;
    }

protected:
    /// Decay rate 1 (momentum for past gradients).
    eT beta1;

    /// Decay rate 2 (momentum for past squared gradients).
    eT beta2;

    /// Smoothing term that avoids division by zero.
    eT eps;

    /// Decay rate 1 to the power of t - used in bias correction.
    eT beta1_powt;

    /// Decay rate 2 to the power of t - used in bias correction.
    eT beta2_powt;

    /// Decaying average of gradients up to time t - E[g].
    mic::types::MatrixPtr<eT> Edx;

    /// Decaying average of squared gradients up to time t - E[g^2].
    mic::types::MatrixPtr<eT> Edx2;

    /// Previous value of gradients.
    mic::types::MatrixPtr<eT> dx_prev;

    /// Calculated update.
    mic::types::MatrixPtr<eT> delta;
};

} //: optimization
} //: neural_nets
} //: mic

#endif /* ADAMID_HPP_ */
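
A minimal usage sketch (not part of the original file): it shows one optimization step with AdamID, using the mic::types API visible in the listing above (MAKE_MATRIX_PTR, zeros(), element access via operator[], size()). The include path, the main()-based driver, the illustrative gradient values, and the convention of subtracting the returned delta from the parameters are assumptions, not part of the library documentation.

#include <cstddef>
#include <optimization/AdamID.hpp>   // Assumed include path.

using namespace mic::neural_nets::optimization;

int main() {
    const size_t rows = 2, cols = 3;

    // Parameters and their gradients - illustrative values only.
    mic::types::MatrixPtr<float> params = MAKE_MATRIX_PTR(float, rows, cols);
    params->zeros();
    mic::types::MatrixPtr<float> grads = MAKE_MATRIX_PTR(float, rows, cols);
    grads->zeros();
    (*grads)[0] = 0.5f;

    // Optimizer matching the parameter dimensions.
    AdamID<float> opt(rows, cols);

    // One step: compute the update and apply it by subtraction (assumed convention).
    mic::types::MatrixPtr<float> delta = opt.calculateUpdate(params, grads, 0.001f);
    for (size_t i = 0; i < (size_t)params->size(); i++)
        (*params)[i] -= (*delta)[i];

    return 0;
}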