MachineIntelligenceCore:ReinforcementLearning
GridworldDRLExperienceReplay.cpp
// Application header (include path assumed from the project layout).
#include <application/GridworldDRLExperienceReplay.hpp>

#include <limits>
#include <utils/RandomGenerator.hpp>

namespace mic {
namespace application {


void RegisterApplication (void) {
    // Register the application in the application factory (assumed standard MIC registration macro).
    REGISTER_APPLICATION(mic::application::GridworldDRLExperienceReplay);
}

GridworldDRLExperienceReplay::GridworldDRLExperienceReplay(std::string node_name_) : OpenGLEpisodicApplication(node_name_),
        step_reward("step_reward", 0.0),
        discount_rate("discount_rate", 0.9),
        learning_rate("learning_rate", 0.005),
        epsilon("epsilon", 0.1),
        statistics_filename("statistics_filename", "drl_er_statistics.csv"),
        mlnn_filename("mlnn_filename", "drl_er_mlnn.txt"),
        mlnn_save("mlnn_save", false),
        mlnn_load("mlnn_load", false),
        experiences(10000, 1)
{
    // Register properties - so their values can be overridden (read from the configuration file).
    registerProperty(step_reward);
    registerProperty(discount_rate);
    registerProperty(learning_rate);
    registerProperty(epsilon);
    registerProperty(statistics_filename);
    registerProperty(mlnn_filename);
    registerProperty(mlnn_save);
    registerProperty(mlnn_load);

    LOG(LINFO) << "Properties registered";
}
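
// Note on the properties above: step_reward, discount_rate, learning_rate and epsilon parametrize the
// Q-learning update performed in performSingleStep(). Epsilon is the exploration probability; a negative
// value in the configuration switches to a decaying schedule eps = 1/(1 + sqrt(episode)), and in both
// cases the effective value is floored at 0.1 (see performSingleStep()). The experience replay memory is
// constructed with arguments (10000, 1) - presumably its capacity and an initial batch size - and its
// batch size is reset later in initializePropertyDependentVariables().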

GridworldDRLExperienceReplay::~GridworldDRLExperienceReplay() {
    delete(w_chart);
}

void GridworldDRLExperienceReplay::initialize(int argc, char* argv[]) {
    // Initialize GLUT! :]
    VGL_MANAGER->initializeGLUT(argc, argv);

    collector_ptr = std::make_shared < mic::utils::DataCollector<std::string, float> >( );
    // Add containers to collector.
    collector_ptr->createContainer("number_of_steps", mic::types::color_rgba(255, 0, 0, 180));
    collector_ptr->createContainer("number_of_steps_average", mic::types::color_rgba(255, 255, 0, 180));
    collector_ptr->createContainer("collected_reward", mic::types::color_rgba(0, 255, 0, 180));
    collector_ptr->createContainer("collected_reward_average", mic::types::color_rgba(0, 255, 255, 180));
    collector_ptr->createContainer("success_ratio", mic::types::color_rgba(255, 255, 255, 180));

    // Reset the episode statistics.
    sum_of_iterations = 0;
    sum_of_rewards = 0;
    number_of_successes = 0;

    // Create the visualization windows - must be created in the same, main thread :]
    w_chart = new WindowCollectorChart<float>("GridworldDRLExperienceReplay", 256, 256, 0, 0);
    w_chart->setDataCollectorPtr(collector_ptr);

}


void GridworldDRLExperienceReplay::initializePropertyDependentVariables() {
    // Initialize the gridworld.
    grid_env.initializeEnvironment();

    // Hardcode the batch size to the size of the maze - to speed up the display.
    batch_size = grid_env.getEnvironmentWidth() * grid_env.getEnvironmentHeight();

    // Try to load the neural network from file.
    if ((mlnn_load) && (neural_net.load(mlnn_filename))) {
        // Do nothing ;)
    } else {
        // Create a simple neural network for regression:
        // encoded gridworld (w x h x channels) -> 250 -> 100 -> 4 Q-values.
        neural_net.pushLayer(new Linear<float>((size_t) grid_env.getEnvironmentSize(), 250));
        neural_net.pushLayer(new ReLU<float>(250));
        neural_net.pushLayer(new Linear<float>(250, 100));
        neural_net.pushLayer(new ReLU<float>(100));
        neural_net.pushLayer(new Linear<float>(100, 4));

        // Set batch size.
        neural_net.resizeBatch(batch_size);
        // Change optimization function from default GradientDescent to Adam.
        neural_net.setOptimization<mic::neural_nets::optimization::Adam<float> >();
        // Set loss function -> regression!
        neural_net.setLoss <mic::neural_nets::loss::SquaredErrorLoss<float> >();

        LOG(LINFO) << "Generated new neural network";
    }//: else

    // Set batch size in experience replay memory.
    experiences.setBatchSize(batch_size);
}
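
// Resulting approximator (a sketch of the data flow, based on the layers created above):
//   input:  the encoded environment state, a column vector of grid_env.getEnvironmentSize() floats,
//   output: 4 values interpreted as Q(s, a) for the actions indexed 0..3 (N, E, S, W - see the
//           action decoding in streamNetworkResponseTable()).
// Training uses the squared-error loss with the Adam optimizer, i.e. plain regression on Q-value targets.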


void GridworldDRLExperienceReplay::startNewEpisode() {
    LOG(LSTATUS) << "Starting new episode " << episode;

    // Generate the gridworld (and move the player to the initial position).
    grid_env.initializeEnvironment();

    LOG(LSTATUS) << "Network responses: \n" << streamNetworkResponseTable();
    LOG(LSTATUS) << "Environment: \n" << grid_env.environmentToString();
}


void GridworldDRLExperienceReplay::finishCurrentEpisode() {
    LOG(LTRACE) << "End current episode";

    mic::types::Position2D current_position = grid_env.getAgentPosition();
    float reward = grid_env.getStateReward(current_position);
    sum_of_iterations += iteration;
    sum_of_rewards += reward;
    if (reward > 0)
        number_of_successes++;

    // Add variables to container.
    collector_ptr->addDataToContainer("number_of_steps", iteration);
    collector_ptr->addDataToContainer("number_of_steps_average", (float)sum_of_iterations/episode);
    collector_ptr->addDataToContainer("collected_reward", reward);
    collector_ptr->addDataToContainer("collected_reward_average", (float)sum_of_rewards/episode);
    collector_ptr->addDataToContainer("success_ratio", (float)number_of_successes/episode);

    // Export the reward "convergence" diagram to CSV.
    collector_ptr->exportDataToCsv(statistics_filename);

    // Save the neural network to file.
    if (mlnn_save)
        neural_net.save(mlnn_filename);
}


std::string GridworldDRLExperienceReplay::streamNetworkResponseTable() {
    LOG(LTRACE) << "streamNetworkResponseTable()";
    std::string rewards_table;
    std::string actions_table;

    // Remember the current state, i.e. the player position.
    mic::types::Position2D current_player_pos_t = grid_env.getAgentPosition();

    // Create a new matrix for the batch of inputs.
    MatrixXfPtr inputs_batch(new MatrixXf(grid_env.getEnvironmentSize(), batch_size));

    // Assume that batch_size = grid_env.getEnvironmentWidth() * grid_env.getEnvironmentHeight().
    for (size_t y=0; y<grid_env.getEnvironmentHeight(); y++){
        for (size_t x=0; x<grid_env.getEnvironmentWidth(); x++) {
            // Move the player to the given state - disregarding whether it is valid or not.
            grid_env.moveAgentToPosition(Position2D(x,y));
            // Encode the current state.
            mic::types::MatrixXfPtr encoded_state = grid_env.encodeEnvironment();
            // Add to batch.
            inputs_batch->col(y*grid_env.getEnvironmentWidth()+x) = encoded_state->col(0);
        }//: for x
    }//: for y

    // Get rewards for the whole batch.
    neural_net.forward(inputs_batch);
    // Get predictions for all those states - there is no need to create a copy.
    MatrixXfPtr predicted_batch = neural_net.getPredictions();


    rewards_table += "Action values:\n";
    actions_table += "Best actions:\n";
    // Generate all possible states and all possible rewards.
    for (size_t y=0; y<grid_env.getEnvironmentHeight(); y++){
        rewards_table += "| ";
        actions_table += "| ";
        for (size_t x=0; x<grid_env.getEnvironmentWidth(); x++) {
            float bestqval = -std::numeric_limits<float>::infinity();
            size_t best_action = -1;
            for (size_t a=0; a<4; a++) {
                float qval = (*predicted_batch)(a, y*grid_env.getEnvironmentWidth()+x);

                rewards_table += std::to_string(qval);
                if (a==3)
                    rewards_table += " | ";
                else
                    rewards_table += " , ";

                // Remember the best value.
                if (grid_env.isStateAllowed(x,y) && (!grid_env.isStateTerminal(x,y)) && grid_env.isActionAllowed(x,y,a) && (qval > bestqval)){
                    bestqval = qval;
                    best_action = a;
                }//: if

            }//: for a(ctions)
            switch(best_action){
                case 0 : actions_table += "N | "; break;
                case 1 : actions_table += "E | "; break;
                case 2 : actions_table += "S | "; break;
                case 3 : actions_table += "W | "; break;
                default: actions_table += "- | ";
            }//: switch

        }//: for x
        rewards_table += "\n";
        actions_table += "\n";
    }//: for y

    // Move the player back to the previous position.
    grid_env.moveAgentToPosition(current_player_pos_t);

    return rewards_table + actions_table;
}



float GridworldDRLExperienceReplay::computeBestValueForGivenStateAndPredictions(mic::types::Position2D player_position_, float* predictions_){
    LOG(LTRACE) << "computeBestValueForGivenStateAndPredictions()";
    float best_qvalue = -std::numeric_limits<float>::infinity();

    // Create a list of possible actions.
    std::vector<mic::types::NESWAction> actions;
    actions.push_back(A_NORTH);
    actions.push_back(A_EAST);
    actions.push_back(A_SOUTH);
    actions.push_back(A_WEST);

    for(mic::types::NESWAction action : actions) {
        // ... and find the value of the best allowed action.
        if(grid_env.isActionAllowed(player_position_, action)) {
            float qvalue = predictions_[(size_t)action.getType()];
            if (qvalue > best_qvalue)
                best_qvalue = qvalue;
        }//: if is allowed
    }//: for

    return best_qvalue;
}
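
// This maximum over the allowed actions is the max_a' Q(s_{t+1}, a') term of the Q-learning target used in
// performSingleStep():  target = step_reward + discount_rate * max_a' Q(s_{t+1}, a').
// It returns -infinity when no action is allowed in the given state, which the caller checks with std::isfinite().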


mic::types::MatrixXfPtr GridworldDRLExperienceReplay::getPredictedRewardsForGivenState(mic::types::Position2D player_position_) {
    LOG(LTRACE) << "getPredictedRewardsForGivenState()";
    // Remember the current state i.e. player position.
    mic::types::Position2D current_player_pos_t = grid_env.getAgentPosition();

    // Move the player to given state.
    grid_env.moveAgentToPosition(player_position_);

    // Encode the current state.
    mic::types::MatrixXfPtr encoded_state = grid_env.encodeEnvironment();

    // Create NEW matrix for the inputs batch.
    MatrixXfPtr inputs_batch(new MatrixXf(grid_env.getEnvironmentSize(), batch_size));
    inputs_batch->setZero();

    // Set the first input - only this one interests us.
    inputs_batch->col(0) = encoded_state->col(0);

    //LOG(LERROR) << "Getting predictions for input batch:\n" << inputs_batch->transpose();

    // Pass the data and get predictions.
    neural_net.forward(inputs_batch);

    MatrixXfPtr predictions_batch = neural_net.getPredictions();

    //LOG(LERROR) << "Resulting predictions batch:\n" << predictions_batch->transpose();

    // Get the first prediction only.
    MatrixXfPtr predictions_sample(new MatrixXf(4, 1));
    predictions_sample->col(0) = predictions_batch->col(0);

    //LOG(LERROR) << "Returned predictions sample:\n" << predictions_sample->transpose();

    // Move player to previous position.
    grid_env.moveAgentToPosition(current_player_pos_t);

    // Return the predictions.
    return predictions_sample;
}

mic::types::NESWAction GridworldDRLExperienceReplay::selectBestActionForGivenState(mic::types::Position2D player_position_){
    LOG(LTRACE) << "selectBestAction";

    // Greedy selection - return the allowed action with the greatest predicted Q-value
    // (falls back to a random action if no allowed action is found).
    mic::types::NESWAction best_action = A_RANDOM;
    float best_qvalue = -std::numeric_limits<float>::infinity();

    // Create a list of possible actions.
    std::vector<mic::types::NESWAction> actions;
    actions.push_back(A_NORTH);
    actions.push_back(A_EAST);
    actions.push_back(A_SOUTH);
    actions.push_back(A_WEST);

    // Check the results of the actions one by one (there is no need to create a separate copy of the predictions).
    MatrixXfPtr predictions_sample = getPredictedRewardsForGivenState(player_position_);
    //LOG(LERROR) << "Selecting action from predictions:\n" << predictions_sample->transpose();
    float* pred = predictions_sample->data();

    for(size_t a=0; a<4; a++) {
        // Find the best action allowed.
        if(grid_env.isActionAllowed(player_position_, mic::types::NESWAction((mic::types::NESW)a))) {
            float qvalue = pred[a];
            if (qvalue > best_qvalue){
                best_qvalue = qvalue;
                best_action.setAction((mic::types::NESW)a);
            }
        }//: if is allowed
    }//: for

    return best_action;
}
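
// performSingleStep() below implements one step of Deep Q-learning with experience replay:
//   1. read the current agent position s_t and pick an action a_t (epsilon-greedy over the network's Q-values),
//   2. execute a_t in the gridworld and observe the new position s_{t+1},
//   3. store the triplet (s_t, a_t, s_{t+1}) in the experience replay memory,
//   4. once at least 2*batch_size experiences are collected, sample a random batch, recompute the Q-value
//      targets from the current network (target = step_reward + discount_rate * max_a' Q(s_{t+1}, a'), with
//      special cases for blocked moves and terminal states) and train the network on that batch,
//   5. return false (ending the episode) when s_{t+1} is terminal or after 100 iterations, true otherwise.
// Rewards are not stored with the experiences; they are recalculated from the environment at replay time.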


bool GridworldDRLExperienceReplay::performSingleStep() {
    LOG(LSTATUS) << "Episode "<< episode << ": step " << iteration << "";

    // TMP!
    double nn_weight_decay = 0;

    // Get the agent position at time t.
    mic::types::Position2D player_pos_t = grid_env.getAgentPosition();
    LOG(LINFO) << "Agent position at state t: " << player_pos_t;

    // Select the action.
    mic::types::NESWAction action;
    //action = A_NORTH;
    double eps = (double)epsilon;
    if ((double)epsilon < 0)
        eps = 1.0/(1.0+sqrt(episode));
    if (eps < 0.1)
        eps = 0.1;
    LOG(LDEBUG) << "eps = " << eps;
    bool random = false;

    // Epsilon-greedy action selection.
    if (RAN_GEN->uniRandReal() > eps){
        // Select the best action.
        action = selectBestActionForGivenState(player_pos_t);
    } else {
        // Random action.
        action = A_RANDOM;
        random = true;
    }//: if

    // Execute the action - do not monitor the success.
    grid_env.moveAgent(action);

    // Get the new state s(t+1).
    mic::types::Position2D player_pos_t_prim = grid_env.getAgentPosition();
    LOG(LINFO) << "Agent position at t+1: " << player_pos_t_prim << " after performing the action = " << action << ((random) ? " [Random]" : "");

    // Collect the experience.
    SpatialExperiencePtr exp(new SpatialExperience(player_pos_t, action, player_pos_t_prim));
    // Create an empty matrix for rewards - it will be recalculated each time the experience is replayed anyway.
    MatrixXfPtr rewards (new MatrixXf(4, batch_size));
    // Add the experience to the experience table.
    experiences.add(exp, rewards);


    // Deep Q-learning - train the network with a random sample from the experience memory.
    if (experiences.size() >= 2*batch_size) {
        // Create new matrices for batches of inputs and targets.
        MatrixXfPtr inputs_t_batch(new MatrixXf(grid_env.getEnvironmentSize(), batch_size));
        MatrixXfPtr inputs_t_prim_batch(new MatrixXf(grid_env.getEnvironmentSize(), batch_size));
        MatrixXfPtr targets_t_batch(new MatrixXf(4, batch_size));

        // Get the random batch.
        SpatialExperienceBatch geb = experiences.getRandomBatch();

        // Debug purposes.
        geb.setNextSampleIndex(0);
        for (size_t i=0; i<batch_size; i++) {
            SpatialExperienceSample ges = geb.getNextSample();
            SpatialExperiencePtr ge_ptr = ges.data();
            LOG(LDEBUG) << "Training sample : " << ge_ptr->s_t << " -> " << ge_ptr->a_t << " -> " << ge_ptr->s_t_prim;
        }//: for

        // Iterate through samples and create inputs_t_batch.
        for (size_t i=0; i<batch_size; i++) {
            SpatialExperienceSample ges = geb.getNextSample();
            SpatialExperiencePtr ge_ptr = ges.data();

            // Replay the experience.
            // "Simulate" moving the player to the position from state/time (t).
            grid_env.moveAgentToPosition(ge_ptr->s_t);
            // Encode the state at time (t).
            mic::types::MatrixXfPtr encoded_state_t = grid_env.encodeEnvironment();
            //float* state = encoded_state_t->data();

            // Copy the encoded state to the inputs batch.
            inputs_t_batch->col(i) = encoded_state_t->col(0);
        }//: for samples

        // Get network responses.
        neural_net.forward(inputs_t_batch);
        // Get predictions for all those states...
        MatrixXfPtr predictions_t_batch = neural_net.getPredictions();
        // ... and copy them to the targets matrix - the container which we will modify.
        (*targets_t_batch) = (*predictions_t_batch);

        // Iterate through samples and create inputs_t_prim_batch.
        geb.setNextSampleIndex(0);
        for (size_t i=0; i<batch_size; i++) {
            SpatialExperienceSample ges = geb.getNextSample();
            SpatialExperiencePtr ge_ptr = ges.data();

            // Replay the experience.
            // "Simulate" moving the player to the position from state/time (t+1).
            grid_env.moveAgentToPosition(ge_ptr->s_t_prim);
            // Encode the state at time (t+1).
            mic::types::MatrixXfPtr encoded_state_t = grid_env.encodeEnvironment();
            //float* state = encoded_state_t->data();

            // Copy the encoded state to the inputs batch.
            inputs_t_prim_batch->col(i) = encoded_state_t->col(0);
        }//: for samples

        // Get network responses.
        neural_net.forward(inputs_t_prim_batch);
        // Get predictions for all those states...
        MatrixXfPtr predictions_t_prim_batch = neural_net.getPredictions();

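        // Q-value targets for the sampled batch - a summary of the three cases handled below:
        //   - the move was blocked (s_t == s_{t+1}):  target(a_t) = 3 * step_reward,
        //   - s_{t+1} is terminal:                    target(a_t) = reward of the terminal state,
        //   - otherwise (standard Q-learning update): target(a_t) = step_reward + discount_rate * max_a' Q(s_{t+1}, a').
        // All other entries of each target column keep the network's current predictions, so only the taken action is updated.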
        // Calculate the targets, one by one.
        // Iterate through the samples and fill the targets batch.
        geb.setNextSampleIndex(0);
        for (size_t i=0; i<batch_size; i++) {
            SpatialExperienceSample ges = geb.getNextSample();
            SpatialExperiencePtr ge_ptr = ges.data();

            if (ge_ptr->s_t == ge_ptr->s_t_prim) {
                // The move was not possible! Learn that as well.
                (*targets_t_batch)((size_t)ge_ptr->a_t.getType(), i) = 3*step_reward;
            } else if(grid_env.isStateTerminal(ge_ptr->s_t_prim)) {
                // The position at the (t+1) state appears to be terminal - learn the reward.
                (*targets_t_batch)((size_t)ge_ptr->a_t.getType(), i) = grid_env.getStateReward(ge_ptr->s_t_prim);
            } else {
                MatrixXfPtr preds_t_prim (new MatrixXf(4, 1));
                preds_t_prim->col(0) = predictions_t_prim_batch->col(i);
                // Get the best value for the NEXT state - the position from the (t+1) state.
                float max_q_st_prim_at_prim = computeBestValueForGivenStateAndPredictions(ge_ptr->s_t_prim, preds_t_prim->data());
                // If the next state's best value is finite - update the value for the given action (Deep Q-learning!).
                if (std::isfinite(max_q_st_prim_at_prim))
                    (*targets_t_batch)((size_t)ge_ptr->a_t.getType(), i) = step_reward + discount_rate*max_q_st_prim_at_prim;
            }//: else

        }//: for

        LOG(LDEBUG) << "Inputs batch:\n" << inputs_t_batch->transpose();
        LOG(LDEBUG) << "Targets batch:\n" << targets_t_batch->transpose();

        // Perform the Deep-Q-learning.
        LOG(LDEBUG) << "Network responses before training:" << std::endl << streamNetworkResponseTable();

        // Train the network with the computed targets.
        float loss = neural_net.train (inputs_t_batch, targets_t_batch, learning_rate, nn_weight_decay);
        LOG(LDEBUG) << "Training loss:" << loss;

        //LOG(LDEBUG) << "Network responses after training:" << std::endl << streamNetworkResponseTable();

        // Finish the replay: move the player back to the REAL, CURRENT POSITION.
        grid_env.moveAgentToPosition(player_pos_t_prim);
    }//: if enough experiences
    else
        LOG(LWARNING) << "Not enough samples in the experience replay memory!";

    LOG(LSTATUS) << "Network responses:" << std::endl << streamNetworkResponseTable();
    LOG(LSTATUS) << "Environment: \n" << grid_env.environmentToString();

    // Check whether state t+1 is terminal - if so, finish the episode.
    if (grid_env.isStateTerminal(player_pos_t_prim))
        return false;

    // Check whether we reached the maximum number of iterations.
    if (iteration >= 100)
        return false;

    return true;
}


} /* namespace application */
} /* namespace mic */