MachineIntelligenceCore:NeuralNets
GradPID.hpp
/*!
 * \file GradPID.hpp
 * \brief Contains the GradPID and AdaGradPID optimization functions.
 */

#ifndef GRADPID_HPP_
#define GRADPID_HPP_

#include <cassert>   // assert
#include <cmath>     // std::isfinite

#include <optimization/OptimizationFunction.hpp> // OptimizationFunction base class (include path assumed)

namespace mic {
namespace neural_nets {
namespace optimization {

/*!
 * \brief GradPID - adaptive gradient descent with proportional, integral and derivative coefficients.
 *
 * The returned update is an element-wise sum of three terms:
 *  - proportional: p_rate * dx (current gradient),
 *  - integral:     i_rate * E[dx] (decaying average of past gradients),
 *  - derivative:   d_rate * (dx - dx_prev) (change of the gradient since the previous step).
 *
 * \tparam eT Template parameter denoting precision of variables (float/double).
 */
template <typename eT = float>
class GradPID : public OptimizationFunction<eT> {
public:
    /*!
     * Constructor. Allocates all required matrices (of size rows_ x cols_) and initializes them to zero.
     * @param rows_ Number of rows of the updated matrix/vector.
     * @param cols_ Number of columns of the updated matrix/vector.
     * @param decay_ Decay ratio, similar to momentum.
     * @param eps_ Smoothing term that avoids division by zero.
     */
    GradPID(size_t rows_, size_t cols_, eT decay_ = 0.9, eT eps_ = 1e-8) : decay(decay_), eps(eps_) {
        Edx = MAKE_MATRIX_PTR(eT, rows_, cols_);
        Edx->zeros();

        dx_prev = MAKE_MATRIX_PTR(eT, rows_, cols_);
        dx_prev->zeros();

        deltaP = MAKE_MATRIX_PTR(eT, rows_, cols_);
        deltaP->zeros();

        deltaI = MAKE_MATRIX_PTR(eT, rows_, cols_);
        deltaI->zeros();

        deltaD = MAKE_MATRIX_PTR(eT, rows_, cols_);
        deltaD->zeros();

        delta = MAKE_MATRIX_PTR(eT, rows_, cols_);
        delta->zeros();
    }

    /*!
     * Calculates the update according to the GradPID rule.
     * @param x_ Pointer to the current matrix (parameter).
     * @param dx_ Pointer to the current gradient of that matrix (parameter).
     * @param learning_rate_ Learning rate.
     */
    mic::types::MatrixPtr<eT> calculateUpdate(mic::types::MatrixPtr<eT> x_, mic::types::MatrixPtr<eT> dx_, eT learning_rate_ = 0.001) {
        assert(x_->size() == dx_->size());
        assert(x_->size() == Edx->size());

        // Initialize the P, I and D rates from the learning rate.
        p_rate = learning_rate_ * learning_rate_ * learning_rate_ * learning_rate_;
        i_rate = learning_rate_;
        d_rate = learning_rate_ * learning_rate_ * learning_rate_;

        // Update the decaying sum of gradients - up to time t.
        for (size_t i = 0; i < (size_t)Edx->size(); i++) {
            (*Edx)[i] = decay * (*Edx)[i] + (1.0 - decay) * (*dx_)[i];
            assert(std::isfinite((*Edx)[i]));
        }

        // Proportional term: scaled current gradient.
        for (size_t i = 0; i < (size_t)delta->size(); i++)
            (*deltaP)[i] = p_rate * (*dx_)[i];

        // Integral term: scaled decaying average of past gradients.
        for (size_t i = 0; i < (size_t)delta->size(); i++)
            (*deltaI)[i] = i_rate * (*Edx)[i];

        // Derivative term: scaled change of the gradient since the previous step.
        for (size_t i = 0; i < (size_t)delta->size(); i++)
            (*deltaD)[i] = d_rate * ((*dx_)[i] - (*dx_prev)[i]);

        // Calculate the update as the sum of the three terms.
        for (size_t i = 0; i < (size_t)delta->size(); i++) {
            (*delta)[i] = (*deltaP)[i] + (*deltaI)[i] + (*deltaD)[i];
            assert(std::isfinite((*delta)[i]));
        }

        // Store the current gradients for the next step.
        for (size_t i = 0; i < (size_t)dx_->size(); i++)
            (*dx_prev)[i] = (*dx_)[i];

        // Return the update.
        return delta;
    }

protected:
    /// Decay ratio, similar to momentum.
    eT decay;

    /// Smoothing term that avoids division by zero.
    eT eps;

    /// Proportional factor (learning rate).
    eT p_rate;

    /// Integral factor (learning rate).
    eT i_rate;

    /// Derivative factor (learning rate).
    eT d_rate;

    /// Decaying average of gradients up to time t - E[g].
    mic::types::MatrixPtr<eT> Edx;

    /// Previous value of gradients.
    mic::types::MatrixPtr<eT> dx_prev;

    /// Proportional update.
    mic::types::MatrixPtr<eT> deltaP;

    /// Integral update.
    mic::types::MatrixPtr<eT> deltaI;

    /// Derivative update.
    mic::types::MatrixPtr<eT> deltaD;

    /// Calculated update.
    mic::types::MatrixPtr<eT> delta;
};


/*!
 * \brief AdaGradPID - adaptive gradient descent with proportional, integral and derivative coefficients.
 *
 * In contrast to GradPID, the P, I and D rates are stored as matrices, so every parameter can get
 * its own adaptive coefficient. The adaptive (surprisal-based) update rules are currently disabled
 * (commented out), so this class is a work in progress.
 * \tparam eT Template parameter denoting precision of variables (float/double).
 */
template <typename eT = float>
class AdaGradPID : public OptimizationFunction<eT> {
public:
    /*!
     * Constructor. Allocates all required matrices (of size rows_ x cols_) and initializes them to zero.
     * @param rows_ Number of rows of the updated matrix/vector.
     * @param cols_ Number of columns of the updated matrix/vector.
     * @param decay_ Decay ratio, similar to momentum.
     * @param eps_ Smoothing term that avoids division by zero.
     */
    AdaGradPID(size_t rows_, size_t cols_, eT decay_ = 0.9, eT eps_ = 1e-8) : decay(decay_), eps(eps_) {
        Edx = MAKE_MATRIX_PTR(eT, rows_, cols_);
        Edx->zeros();

        dx_prev = MAKE_MATRIX_PTR(eT, rows_, cols_);
        dx_prev->zeros();

        deltaP = MAKE_MATRIX_PTR(eT, rows_, cols_);
        deltaP->zeros();

        deltaI = MAKE_MATRIX_PTR(eT, rows_, cols_);
        deltaI->zeros();

        deltaD = MAKE_MATRIX_PTR(eT, rows_, cols_);
        deltaD->zeros();

        delta = MAKE_MATRIX_PTR(eT, rows_, cols_);
        delta->zeros();

        // Initialize the adaptive P, I and D rates.
        p_rate = MAKE_MATRIX_PTR(eT, rows_, cols_);
        p_rate->zeros();

        i_rate = MAKE_MATRIX_PTR(eT, rows_, cols_);
        i_rate->zeros();

        d_rate = MAKE_MATRIX_PTR(eT, rows_, cols_);
        d_rate->zeros();
    }

    /*!
     * Calculates the update. Note: the adaptive (surprisal-based) P, I and D updates below are
     * currently disabled, so the returned update stays zero until they are re-enabled.
     * @param x_ Pointer to the current matrix (parameter).
     * @param dx_ Pointer to the current gradient of that matrix (parameter).
     * @param learning_rate_ Learning rate (currently unused).
     */
    mic::types::MatrixPtr<eT> calculateUpdate(mic::types::MatrixPtr<eT> x_, mic::types::MatrixPtr<eT> dx_, eT learning_rate_ = 0.001) {
        assert(x_->size() == dx_->size());
        assert(x_->size() == Edx->size());

        // Update of the decaying sum of gradients - up to time t (currently disabled).
        /*
        for (size_t i = 0; i < (size_t)Edx->size(); i++) {
            (*Edx)[i] = decay * (*Edx)[i] + (1.0 - decay) * (*dx_)[i];
            assert(std::isfinite((*Edx)[i]));
        }
        */

        // Proportional term - surprisal-weighted update (currently disabled, deltaP stays zero).
        for (size_t i = 0; i < (size_t)delta->size(); i++) {
            // double surp_p = logistic(calculateSigmoidSurprisalMod((*deltaP)[i], (*dx_)[i]), 1.0);
            // (*deltaP)[i] = (1 - surp_p) * (*deltaP)[i] + (*p_rate)[i] * surp_p * (*dx_)[i];
        }

        // Integral term - surprisal-weighted update of the decaying average (currently disabled, deltaI stays zero).
        for (size_t i = 0; i < (size_t)delta->size(); i++) {
            // double surp_i = logistic(calculateSigmoidSurprisalMod((*deltaI)[i], (decay * (*deltaI)[i] + (1.0 - decay) * (*dx_)[i])), 1.0);
            // (*deltaI)[i] = (1 - surp_i) * (*deltaI)[i] + (*i_rate)[i] * surp_i * (decay * (*deltaI)[i] + (1.0 - decay) * (*dx_)[i]);
        }

        // Derivative term - surprisal-weighted update of the gradient change (currently disabled, deltaD stays zero).
        for (size_t i = 0; i < (size_t)delta->size(); i++) {
            // double surp_d = logistic(calculateSigmoidSurprisalMod((*deltaD)[i], ((*dx_)[i] - (*dx_prev)[i])), 1.0);
            // (*deltaD)[i] = (1 - surp_d) * (*deltaD)[i] + (*d_rate)[i] * surp_d * ((*dx_)[i] - (*dx_prev)[i]);
        }

        // Adaptive update of the P, I and D rates (currently disabled).
        /*
        // Update P rate.
        eT up;
        for (size_t i = 0; i < (size_t)delta->size(); i++) {
            up = std::abs((*deltaP)[i] / ((*dx_)[i] + eps)); // softsign
            (*p_rate)[i] = decay * (*p_rate)[i] + (1 - decay) * tanh(up);
        }

        // Update I rate.
        eT ui;
        for (size_t i = 0; i < (size_t)delta->size(); i++) {
            ui = std::abs((*deltaI)[i] / ((*dx_)[i] + eps)); // softsign
            (*i_rate)[i] = decay * (*i_rate)[i] + (1 - decay) * calculateSigmoidSurprisal((*i_rate)[i], ui);
        }

        // Update D rate.
        eT ud;
        for (size_t i = 0; i < (size_t)delta->size(); i++) {
            ud = std::abs((*deltaD)[i] / ((*dx_)[i] + eps)); // softsign
            (*d_rate)[i] = decay * (*d_rate)[i] + (1 - decay) * calculateSigmoidSurprisal((*d_rate)[i], ud);
        }
        */

        // Calculate the update as the sum of the three terms.
        for (size_t i = 0; i < (size_t)delta->size(); i++) {
            (*delta)[i] = (*deltaP)[i] + (*deltaI)[i] + (*deltaD)[i];
            assert(std::isfinite((*delta)[i]));
        }

        // Store the current gradients for the next step.
        for (size_t i = 0; i < (size_t)dx_->size(); i++)
            (*dx_prev)[i] = (*dx_)[i];

        // Return the update.
        return delta;
    }

protected:
    /// Decay ratio, similar to momentum.
    eT decay;

    /// Smoothing term that avoids division by zero.
    eT eps;

    /// Adaptive proportional factor (learning rate).
    mic::types::MatrixPtr<eT> p_rate;

    /// Adaptive integral factor (learning rate).
    mic::types::MatrixPtr<eT> i_rate;

    /// Adaptive derivative factor (learning rate).
    mic::types::MatrixPtr<eT> d_rate;

    // eT surprisal; ///< Surprisal - for feed forward nets it is based on the difference between the prediction and target (currently unused).

    /// Decaying average of gradients up to time t - E[g].
    mic::types::MatrixPtr<eT> Edx;

    /// Previous value of gradients.
    mic::types::MatrixPtr<eT> dx_prev;

    /// Proportional update.
    mic::types::MatrixPtr<eT> deltaP;

    /// Integral update.
    mic::types::MatrixPtr<eT> deltaI;

    /// Derivative update.
    mic::types::MatrixPtr<eT> deltaD;

    /// Calculated update.
    mic::types::MatrixPtr<eT> delta;
};
373 
374 } //: optimization
375 } //: neural_nets
376 } //: mic
377 
378 #endif /* GRADPID_HPP_ */
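For reference, a minimal usage sketch of GradPID follows. It only relies on the calls visible in the listing above (MAKE_MATRIX_PTR, zeros(), operator[], calculateUpdate) and applies the returned update by plain subtraction; the gradient values are placeholders, the include path is assumed, and the actual application step provided by the OptimizationFunction base class may differ.

#include <optimization/GradPID.hpp> // this header (path assumed)

void gradPIDUsageSketch() {
    using namespace mic::neural_nets::optimization;

    const size_t rows = 3, cols = 1;

    // Parameters and their gradient - filled with placeholder values.
    mic::types::MatrixPtr<float> x  = MAKE_MATRIX_PTR(float, rows, cols);
    mic::types::MatrixPtr<float> dx = MAKE_MATRIX_PTR(float, rows, cols);
    x->zeros();
    dx->zeros();
    for (size_t i = 0; i < (size_t)dx->size(); i++)
        (*dx)[i] = 0.1f; // placeholder gradient

    // Optimizer allocated for matrices of the same dimensions as the parameters.
    GradPID<float> opt(rows, cols);

    // One optimization step: compute the PID-style update and apply it (gradient-descent convention assumed).
    mic::types::MatrixPtr<float> delta = opt.calculateUpdate(x, dx, 0.001f);
    for (size_t i = 0; i < (size_t)x->size(); i++)
        (*x)[i] -= (*delta)[i];
}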