MachineIntelligenceCore:ReinforcementLearning
GridworldDRLExperienceReplayPOMDP.cpp

#include <limits>
#include <utils/RandomGenerator.hpp>

namespace mic {
namespace application {

void RegisterApplication (void) {
}

GridworldDRLExperienceReplayPOMDP::GridworldDRLExperienceReplayPOMDP(std::string node_name_) : OpenGLEpisodicApplication(node_name_),
	step_reward("step_reward", 0.0),
	discount_rate("discount_rate", 0.9),
	learning_rate("learning_rate", 0.005),
	epsilon("epsilon", 0.1),
	step_limit("step_limit", 0),
	statistics_filename("statistics_filename", "drl_er_statistics.csv"),
	mlnn_filename("mlnn_filename", "drl_er_mlnn.txt"),
	mlnn_save("mlnn_save", false),
	mlnn_load("mlnn_load", false),
	experiences(10000, 1)
{
	// Register properties - so their values can be overridden (read from the configuration file).
	registerProperty(step_reward);
	registerProperty(discount_rate);
	registerProperty(learning_rate);
	registerProperty(epsilon);
	registerProperty(step_limit);
	registerProperty(statistics_filename);
	registerProperty(mlnn_filename);
	registerProperty(mlnn_save);
	registerProperty(mlnn_load);

	LOG(LINFO) << "Properties registered";
}
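
// Note: epsilon also acts as a mode switch - a negative value in the configuration activates the
// decaying 1/(1+sqrt(episode)) schedule used in performSingleStep(), while a non-negative value is
// used directly as the exploration probability (floored at 0.1). The experiences table is
// constructed with, presumably, a capacity of 10000 entries and an initial batch size of 1; the
// batch size is overridden later in initializePropertyDependentVariables().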

GridworldDRLExperienceReplayPOMDP::~GridworldDRLExperienceReplayPOMDP() {
	delete w_chart;
}


void GridworldDRLExperienceReplayPOMDP::initialize(int argc, char* argv[]) {
	// Initialize GLUT! :]
	VGL_MANAGER->initializeGLUT(argc, argv);

	collector_ptr = std::make_shared< mic::utils::DataCollector<std::string, float> >();
	// Add containers to the collector.
	collector_ptr->createContainer("number_of_steps", mic::types::color_rgba(255, 0, 0, 180));
	collector_ptr->createContainer("number_of_steps_average", mic::types::color_rgba(255, 255, 0, 180));
	collector_ptr->createContainer("collected_reward", mic::types::color_rgba(0, 255, 0, 180));
	collector_ptr->createContainer("collected_reward_average", mic::types::color_rgba(0, 255, 255, 180));
	collector_ptr->createContainer("success_ratio", mic::types::color_rgba(255, 255, 255, 180));

	sum_of_iterations = 0;
	sum_of_rewards = 0;
	number_of_successes = 0;

	// Create the visualization windows - they must be created in the same (main) thread :]
	w_chart = new WindowCollectorChart<float>("GridworldDRLExperienceReplayPOMDP", 256, 256, 0, 0);
	w_chart->setDataCollectorPtr(collector_ptr);

}

void GridworldDRLExperienceReplayPOMDP::initializePropertyDependentVariables() {
	// Initialize the gridworld.
	grid_env.initializeEnvironment();

	// Hardcode the batch size - set to the size of the maze (width*height), for speeding up the display!
	batch_size = grid_env.getWidth() * grid_env.getHeight();

	// Try to load the neural network from file.
	if ((mlnn_load) && (neural_net.load(mlnn_filename))) {
		// Do nothing ;)
	} else {
		// Create a simple neural network for Q-value regression:
		// encoded observation (observation width x height x 4 channels) -> 250 -> 100 -> 4.
		neural_net.pushLayer(new Linear<float>((size_t) grid_env.getObservationSize(), 250));
		neural_net.pushLayer(new ReLU<float>(250));
		neural_net.pushLayer(new Linear<float>(250, 100));
		neural_net.pushLayer(new ReLU<float>(100));
		neural_net.pushLayer(new Linear<float>(100, 4));

		// Set batch size.
		neural_net.resizeBatch(batch_size);
		// Change optimization function from the default GradientDescent to Adam.
		neural_net.setOptimization<mic::neural_nets::optimization::Adam<float> >();
		// Set the loss function - squared error, as this is a regression task.
		neural_net.setLoss<mic::neural_nets::loss::SquaredErrorLoss<float> >();

		LOG(LINFO) << "Generated new neural network";
	}//: else

	// Set batch size in the experience replay memory.
	experiences.setBatchSize(batch_size);
}
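
// Note: the same batch_size is shared by the network (resizeBatch) and the experience replay
// memory (setBatchSize); training in performSingleStep() only starts once the memory holds at
// least 2*batch_size experiences.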

void GridworldDRLExperienceReplayPOMDP::startNewEpisode() {
	LOG(LSTATUS) << "Starting new episode " << episode;

	// Generate the gridworld (and move player to initial position).
	grid_env.initializeEnvironment();

	LOG(LSTATUS) << "Network responses: \n" << streamNetworkResponseTable();
	LOG(LSTATUS) << "Observation: \n" << grid_env.observationToString();
	LOG(LSTATUS) << "Environment: \n" << grid_env.environmentToString();
}


void GridworldDRLExperienceReplayPOMDP::finishCurrentEpisode() {
	LOG(LTRACE) << "End current episode";

	mic::types::Position2D current_position = grid_env.getAgentPosition();
	float reward = grid_env.getStateReward(current_position);
	sum_of_iterations += iteration;
	sum_of_rewards += reward;
	if (reward > 0)
		number_of_successes++;

	// Add variables to containers.
	collector_ptr->addDataToContainer("number_of_steps", iteration);
	collector_ptr->addDataToContainer("number_of_steps_average", (float)sum_of_iterations/episode);
	collector_ptr->addDataToContainer("collected_reward", reward);
	collector_ptr->addDataToContainer("collected_reward_average", (float)sum_of_rewards/episode);
	collector_ptr->addDataToContainer("success_ratio", (float)number_of_successes/episode);

	// Export the reward "convergence" diagram.
	collector_ptr->exportDataToCsv(statistics_filename);

	// Save the neural network to file.
	if (mlnn_save)
		neural_net.save(mlnn_filename);
}
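
// The exported CSV thus tracks, per episode: the number of steps, the running average of steps,
// the reward collected at the final position, the running average of collected rewards, and the
// ratio of successful episodes (episodes that ended with a positive reward).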


std::string GridworldDRLExperienceReplayPOMDP::streamNetworkResponseTable() {
	LOG(LTRACE) << "streamNetworkResponseTable()";
	std::string rewards_table;
	std::string actions_table;

	// Remember the current state i.e. player position.
	mic::types::Position2D current_player_pos_t = grid_env.getAgentPosition();

	// Create a new matrix for the batch of inputs.
	MatrixXfPtr inputs_batch(new MatrixXf(grid_env.getObservationSize(), batch_size));

	// Assume that the batch_size = grid_env.getWidth() * grid_env.getHeight().

	size_t dx = (grid_env.getObservationWidth()-1)/2;
	size_t dy = (grid_env.getObservationHeight()-1)/2;
	mic::types::Position2D p = grid_env.getAgentPosition();

	// Copy data.
	for (long oy=0, ey=(p.y-dy); oy<(long)grid_env.getObservationHeight(); oy++, ey++){
		for (long ox=0, ex=(p.x-dx); ox<(long)grid_env.getObservationWidth(); ox++, ex++) {
			// Move the player to the given state - disregarding whether it was successful or not;
			// responses for walls/positions outside of the gridworld do not interest us anyway.
			if (!grid_env.moveAgentToPosition(Position2D(ex,ey)))
				LOG(LDEBUG) << "Failed!";
			// Encode the current state.
			mic::types::MatrixXfPtr encoded_state = grid_env.encodeObservation();
			// Add to batch.
			inputs_batch->col(oy*grid_env.getObservationWidth()+ox) = encoded_state->col(0);
		}//: for x
	}//: for y
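
	// Note: each column of inputs_batch corresponds to one cell (ox, oy) of the observation window
	// centred on the agent, stored at index oy*observation_width + ox, so a single forward pass
	// yields Q-values for the agent placed in every cell of that window.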

	// Get rewards for the whole batch.
	neural_net.forward(inputs_batch);
	// Get predictions for all those states - there is no need to create a copy.
	MatrixXfPtr predicted_batch = neural_net.getPredictions();

	rewards_table += "Action values:\n";
	actions_table += "Best actions:\n";
	// Generate all possible states and all possible rewards.
	for (long oy=0, ey=(p.y-dy); oy<(long)grid_env.getObservationHeight(); oy++, ey++){
		rewards_table += "| ";
		actions_table += "| ";
		for (long ox=0, ex=(p.x-dx); ox<(long)grid_env.getObservationWidth(); ox++, ex++) {
			float bestqval = -std::numeric_limits<float>::infinity();
			size_t best_action = -1;
			for (size_t a=0; a<4; a++) {
				float qval = (*predicted_batch)(a, oy*grid_env.getObservationWidth()+ox);

				rewards_table += std::to_string(qval);
				if (a==3)
					rewards_table += " | ";
				else
					rewards_table += " , ";

				// Remember the best value.
				if (grid_env.isStateAllowed(ex,ey) && (!grid_env.isStateTerminal(ex,ey)) && grid_env.isActionAllowed(ex,ey,a) && (qval > bestqval)){
					bestqval = qval;
					best_action = a;
				}//: if

			}//: for a(ctions)
			switch(best_action){
				case 0 : actions_table += "N | "; break;
				case 1 : actions_table += "E | "; break;
				case 2 : actions_table += "S | "; break;
				case 3 : actions_table += "W | "; break;
				default: actions_table += "- | ";
			}//: switch

		}//: for x
		rewards_table += "\n";
		actions_table += "\n";
	}//: for y

	// Move player to previous position.
	grid_env.moveAgentToPosition(current_player_pos_t);

	return rewards_table + actions_table;
}

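// Helper used when constructing the Bellman targets: returns max_a Q(s,a) over the actions allowed
// in the given state, or -infinity when no action is allowed (callers check std::isfinite).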
float GridworldDRLExperienceReplayPOMDP::computeBestValueForGivenStateAndPredictions(mic::types::Position2D player_position_, float* predictions_){
	LOG(LTRACE) << "computeBestValueForGivenStateAndPredictions()";
	float best_qvalue = -std::numeric_limits<float>::infinity();

	// Create a list of possible actions.
	std::vector<mic::types::NESWAction> actions;
	actions.push_back(A_NORTH);
	actions.push_back(A_EAST);
	actions.push_back(A_SOUTH);
	actions.push_back(A_WEST);

	for(mic::types::NESWAction action : actions) {
		// ... and find the value of the best allowed action.
		if(grid_env.isActionAllowed(player_position_, action)) {
			float qvalue = predictions_[(size_t)action.getType()];
			if (qvalue > best_qvalue)
				best_qvalue = qvalue;
		}//: if is allowed
	}//: for

	return best_qvalue;
}

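// Runs a single observation through the network: since the network operates on fixed-size batches,
// a zeroed batch is allocated, the encoded observation is placed in column 0, and only the first
// column of the resulting predictions is returned.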
mic::types::MatrixXfPtr GridworldDRLExperienceReplayPOMDP::getPredictedRewardsForGivenState(mic::types::Position2D player_position_) {
	LOG(LTRACE) << "getPredictedRewardsForGivenState()";
	// Remember the current state i.e. player position.
	mic::types::Position2D current_player_pos_t = grid_env.getAgentPosition();

	// Move the player to given state.
	grid_env.moveAgentToPosition(player_position_);

	// Encode the current state.
	mic::types::MatrixXfPtr encoded_state = grid_env.encodeObservation();

	// Create a NEW matrix for the inputs batch.
	MatrixXfPtr inputs_batch(new MatrixXf(grid_env.getObservationSize(), batch_size));
	inputs_batch->setZero();

	// Set the first input - only this one interests us.
	inputs_batch->col(0) = encoded_state->col(0);

	//LOG(LERROR) << "Getting predictions for input batch:\n" << inputs_batch->transpose();

	// Pass the data and get predictions.
	neural_net.forward(inputs_batch);

	MatrixXfPtr predictions_batch = neural_net.getPredictions();

	//LOG(LERROR) << "Resulting predictions batch:\n" << predictions_batch->transpose();

	// Get the first prediction only.
	MatrixXfPtr predictions_sample(new MatrixXf(4, 1));
	predictions_sample->col(0) = predictions_batch->col(0);

	//LOG(LERROR) << "Returned predictions sample:\n" << predictions_sample->transpose();

	// Move player to previous position.
	grid_env.moveAgentToPosition(current_player_pos_t);

	// Return the predictions.
	return predictions_sample;
}

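// Greedy policy: returns the allowed action with the highest predicted Q-value for the given state.
// If no allowed action beats the initial -infinity (i.e. nothing is allowed), A_RANDOM is returned.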
mic::types::NESWAction GridworldDRLExperienceReplayPOMDP::selectBestActionForGivenState(mic::types::Position2D player_position_){
	LOG(LTRACE) << "selectBestActionForGivenState()";

	// Greedy method - returns the action with the greatest predicted value.
	mic::types::NESWAction best_action = A_RANDOM;
	float best_qvalue = -std::numeric_limits<float>::infinity();

	// Create a list of possible actions.
	std::vector<mic::types::NESWAction> actions;
	actions.push_back(A_NORTH);
	actions.push_back(A_EAST);
	actions.push_back(A_SOUTH);
	actions.push_back(A_WEST);

	// Check the results of the actions one by one (there is no need to create a separate copy of the predictions).
	MatrixXfPtr predictions_sample = getPredictedRewardsForGivenState(player_position_);
	//LOG(LERROR) << "Selecting action from predictions:\n" << predictions_sample->transpose();
	float* pred = predictions_sample->data();

	for(size_t a=0; a<4; a++) {
		// Find the best action allowed.
		if(grid_env.isActionAllowed(player_position_, mic::types::NESWAction((mic::types::NESW)a))) {
			float qvalue = pred[a];
			if (qvalue > best_qvalue){
				best_qvalue = qvalue;
				best_action.setAction((mic::types::NESW)a);
			}
		}//: if is allowed
	}//: for

	return best_action;
}

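// Performs a single step of the agent along with one round of deep Q-learning with experience replay:
//  1. select an action epsilon-greedily (eps decays as 1/(1+sqrt(episode)) when the epsilon
//     property is negative, and is never allowed to drop below 0.1),
//  2. execute it and store the transition (s_t, a_t, s_t+1) in the experience replay memory,
//  3. once the memory holds at least 2*batch_size experiences, sample a random batch, re-encode
//     its observations by replaying the stored positions, and train the network towards the
//     Q-learning targets  Q(s_t, a_t) := step_reward + discount_rate * max_a Q(s_t+1, a).
// Returns false when the episode should finish (terminal state reached or step limit exceeded).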
bool GridworldDRLExperienceReplayPOMDP::performSingleStep() {
	LOG(LSTATUS) << "Episode " << episode << ": step " << iteration;

	// TMP!
	double nn_weight_decay = 0;

	// Get player position at time t.
	mic::types::Position2D player_pos_t = grid_env.getAgentPosition();
	LOG(LINFO) << "Agent position at state t: " << player_pos_t;

	// Select the action.
	mic::types::NESWAction action;
	//action = A_NORTH;
	double eps = (double)epsilon;
	if ((double)epsilon < 0)
		eps = 1.0/(1.0+sqrt(episode));
	if (eps < 0.1)
		eps = 0.1;
	LOG(LDEBUG) << "eps = " << eps;
	bool random = false;

	// Epsilon-greedy action selection.
	if (RAN_GEN->uniRandReal() > eps){
		// Select the best action.
		action = selectBestActionForGivenState(player_pos_t);
	} else {
		// Random action.
		action = A_RANDOM;
		random = true;
	}//: if

	// Execute the action - do not monitor the success.
	grid_env.moveAgent(action);

	// Get the new state s(t+1).
	mic::types::Position2D player_pos_t_prim = grid_env.getAgentPosition();
	LOG(LINFO) << "Agent position at t+1: " << player_pos_t_prim << " after performing the action = " << action << ((random) ? " [Random]" : "");

	// Collect the experience.
	SpatialExperiencePtr exp(new SpatialExperience(player_pos_t, action, player_pos_t_prim));
	// Create an empty matrix for rewards - it will be recalculated each time the experience is replayed anyway.
	MatrixXfPtr rewards(new MatrixXf(4, batch_size));
	// Add the experience to the experience table.
	experiences.add(exp, rewards);


	// Deep Q-learning - train the network on a random sample from the experience memory.
	if (experiences.size() >= 2*batch_size) {
		// Create new matrices for batches of inputs and targets.
		MatrixXfPtr inputs_t_batch(new MatrixXf(grid_env.getObservationSize(), batch_size));
		MatrixXfPtr inputs_t_prim_batch(new MatrixXf(grid_env.getObservationSize(), batch_size));
		MatrixXfPtr targets_t_batch(new MatrixXf(4, batch_size));

		// Get a random batch.
		SpatialExperienceBatch geb = experiences.getRandomBatch();

		// Debug purposes.
		geb.setNextSampleIndex(0);
		for (size_t i=0; i<batch_size; i++) {
			SpatialExperienceSample ges = geb.getNextSample();
			SpatialExperiencePtr ge_ptr = ges.data();
			LOG(LDEBUG) << "Training sample : " << ge_ptr->s_t << " -> " << ge_ptr->a_t << " -> " << ge_ptr->s_t_prim;
		}//: for

		// Iterate through samples and create inputs_t_batch.
		geb.setNextSampleIndex(0);
		for (size_t i=0; i<batch_size; i++) {
			SpatialExperienceSample ges = geb.getNextSample();
			SpatialExperiencePtr ge_ptr = ges.data();

			// Replay the experience.
			// "Simulate" moving the player to the position from state/time (t).
			grid_env.moveAgentToPosition(ge_ptr->s_t);
			// Encode the state at time (t).
			mic::types::MatrixXfPtr encoded_state_t = grid_env.encodeObservation();
			//float* state = encoded_state_t->data();

			// Copy the encoded state to the inputs batch.
			inputs_t_batch->col(i) = encoded_state_t->col(0);
		}// for samples.

		// Get network responses.
		neural_net.forward(inputs_t_batch);
		// Get predictions for all those states...
		MatrixXfPtr predictions_t_batch = neural_net.getPredictions();
		// ... and copy them to the targets container, which we will modify.
		(*targets_t_batch) = (*predictions_t_batch);
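
		// Note: because the targets start as a copy of the network's current predictions, only the
		// entries overwritten below (one action per sample) contribute a non-zero error during
		// training; the outputs for the remaining three actions of each sample are left as they are.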

		// Iterate through samples and create inputs_t_prim_batch.
		geb.setNextSampleIndex(0);
		for (size_t i=0; i<batch_size; i++) {
			SpatialExperienceSample ges = geb.getNextSample();
			SpatialExperiencePtr ge_ptr = ges.data();

			// Replay the experience.
			// "Simulate" moving the player to the position from state/time (t+1).
			grid_env.moveAgentToPosition(ge_ptr->s_t_prim);
			// Encode the state at time (t+1).
			mic::types::MatrixXfPtr encoded_state_t = grid_env.encodeObservation();
			//float* state = encoded_state_t->data();

			// Copy the encoded state to the inputs batch.
			inputs_t_prim_batch->col(i) = encoded_state_t->col(0);
		}// for samples.

		// Get network responses.
		neural_net.forward(inputs_t_prim_batch);
		// Get predictions for all those states...
		MatrixXfPtr predictions_t_prim_batch = neural_net.getPredictions();

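		// Target construction for the executed action a_t of each replayed sample:
		//  - agent did not move (s_t == s_t+1): the move hit a wall; the target is 3*step_reward
		//    (a triple step penalty when step_reward is configured as negative);
		//  - s_t+1 is terminal: the target is the reward of that terminal state;
		//  - otherwise: the Bellman backup  target = step_reward + discount_rate * max_a Q(s_t+1, a),
		//    skipped when no action is allowed in s_t+1 (the max is -infinity, i.e. not finite).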
		// Calculate the targets, one sample at a time.
		geb.setNextSampleIndex(0);
		for (size_t i=0; i<batch_size; i++) {
			SpatialExperienceSample ges = geb.getNextSample();
			SpatialExperiencePtr ge_ptr = ges.data();

			if (ge_ptr->s_t == ge_ptr->s_t_prim) {
				// The move was not possible! Learn that as well.
				(*targets_t_batch)((size_t)ge_ptr->a_t.getType(), i) = 3*step_reward;
			} else if (grid_env.isStateTerminal(ge_ptr->s_t_prim)) {
				// The position at state (t+1) appears to be terminal - learn the reward.
				(*targets_t_batch)((size_t)ge_ptr->a_t.getType(), i) = grid_env.getStateReward(ge_ptr->s_t_prim);
			} else {
				MatrixXfPtr preds_t_prim(new MatrixXf(4, 1));
				preds_t_prim->col(0) = predictions_t_prim_batch->col(i);
				// Get the best value for the NEXT state - the position from state (t+1).
				float max_q_st_prim_at_prim = computeBestValueForGivenStateAndPredictions(ge_ptr->s_t_prim, preds_t_prim->data());
				// If the best value of the next state is finite, update the value of the executed action - deep Q-learning!
				if (std::isfinite(max_q_st_prim_at_prim))
					(*targets_t_batch)((size_t)ge_ptr->a_t.getType(), i) = step_reward + discount_rate*max_q_st_prim_at_prim;
			}//: else

		}//: for

		LOG(LDEBUG) << "Inputs batch:\n" << inputs_t_batch->transpose();
		LOG(LDEBUG) << "Targets batch:\n" << targets_t_batch->transpose();

		// Perform the deep Q-learning.
		LOG(LDEBUG) << "Network responses before training:" << std::endl << streamNetworkResponseTable();

		// Train the network with the computed targets.
		float loss = neural_net.train(inputs_t_batch, targets_t_batch, learning_rate, nn_weight_decay);
		LOG(LDEBUG) << "Training loss: " << loss;

		//LOG(LDEBUG) << "Network responses after training:" << std::endl << streamNetworkResponseTable();

		// Finish the replay: move the player back to the REAL, CURRENT position.
		grid_env.moveAgentToPosition(player_pos_t_prim);
	}//: if enough experiences
	else
		LOG(LWARNING) << "Not enough samples in the experience replay memory!";

	LOG(LSTATUS) << "Network responses: \n" << streamNetworkResponseTable();
	LOG(LSTATUS) << "Observation: \n" << grid_env.observationToString();
	LOG(LSTATUS) << "Environment: \n" << grid_env.environmentToString();

	// Check whether state t+1 is terminal - if so, finish the episode.
	if (grid_env.isStateTerminal(player_pos_t_prim))
		return false;

	// Check whether we have reached the maximum number of iterations.
	if ((step_limit > 0) && (iteration >= (size_t)step_limit))
		return false;

	return true;
}

} /* namespace application */
} /* namespace mic */