MachineIntelligenceCore:ReinforcementLearning
MazeOfDigitsDLRERPOMPD.cpp
#include <limits>
#include <utils/RandomGenerator.hpp>

namespace mic {
namespace application {

void RegisterApplication (void) {
	REGISTER_APPLICATION(mic::application::MazeOfDigitsDLRERPOMPD);
}


MazeOfDigitsDLRERPOMPD::MazeOfDigitsDLRERPOMPD(std::string node_name_) : OpenGLEpisodicApplication(node_name_),
	saccadic_path(new std::vector <mic::types::Position2D>()),
	step_reward("step_reward", 0.0),
	discount_rate("discount_rate", 0.9),
	learning_rate("learning_rate", 0.005),
	epsilon("epsilon", 0.1),
	step_limit("step_limit", 0),
	statistics_filename("statistics_filename", "maze_of_digits_drl_er_statistics.csv"),
	mlnn_filename("mlnn_filename", "maze_of_digits_drl_er_mlnn.txt"),
	mlnn_save("mlnn_save", false),
	mlnn_load("mlnn_load", false),
	experiences(10000, 1)
{
	// Register properties - so their values can be overridden (read from the configuration file).
	registerProperty(step_reward);
	registerProperty(discount_rate);
	registerProperty(learning_rate);
	registerProperty(epsilon);
	registerProperty(step_limit);
	registerProperty(statistics_filename);
	registerProperty(mlnn_filename);
	registerProperty(mlnn_save);
	registerProperty(mlnn_load);

	LOG(LINFO) << "Properties registered";
}


MazeOfDigitsDLRERPOMPD::~MazeOfDigitsDLRERPOMPD() {
	delete(w_chart);
	delete(wmd_environment);
	delete(wmd_observation);
}


void MazeOfDigitsDLRERPOMPD::initialize(int argc, char* argv[]) {
	// Initialize GLUT! :]
	VGL_MANAGER->initializeGLUT(argc, argv);

	collector_ptr = std::make_shared < mic::utils::DataCollector<std::string, float> >( );
	// Add containers to collector.
	collector_ptr->createContainer("path_length_episode", mic::types::color_rgba(0, 255, 0, 180));
	collector_ptr->createContainer("path_length_average", mic::types::color_rgba(180, 255, 0, 180));
	collector_ptr->createContainer("path_length_optimal", mic::types::color_rgba(0, 255, 180, 180));
	collector_ptr->createContainer("path_opt_to_episodic", mic::types::color_rgba(255, 180, 180, 180));
	collector_ptr->createContainer("path_opt_to_episodic_average", mic::types::color_rgba(255, 255, 255, 180));

	// Create the visualization windows - they must be created in the same (main) thread :]
	w_chart = new WindowCollectorChart<float>("MazeOfDigitsDLRERPOMPD", 256, 512, 0, 0);
	w_chart->setDataCollectorPtr(collector_ptr);
}

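// Initialization of the property-dependent objects: the visualization windows for the environment
// and the observation, the neural network approximating the Q-values (loaded from file or built
// from scratch) and the batch size used by both the network and the experience replay memory.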
void MazeOfDigitsDLRERPOMPD::initializePropertyDependentVariables() {
	// Initialize the maze.

	// Create windows for the visualization of the whole environment and a single observation.
	wmd_environment = new WindowMazeOfDigits("Environment", env.getEnvironmentHeight()*20, env.getEnvironmentWidth()*20, 0, 316);
	wmd_observation = new WindowMazeOfDigits("Observation", env.getObservationHeight()*20, env.getObservationWidth()*20, env.getEnvironmentWidth()*20, 316);

	// Hardcode the batch size - to speed up the display!

	// Try to load the neural network from a file.
	if ((mlnn_load) && (neural_net.load(mlnn_filename))) {
		// Do nothing ;)
	} else {
		// Create a simple neural network: observation (w x h x channels) -> 250 -> 100 -> 4 -> regression.
		neural_net.pushLayer(new Linear<float>((size_t) env.getObservationSize(), 250));
		neural_net.pushLayer(new ReLU<float>(250));
		neural_net.pushLayer(new Linear<float>(250, 100));
		neural_net.pushLayer(new ReLU<float>(100));
		neural_net.pushLayer(new Linear<float>(100, 4));

		// Set batch size.
		neural_net.resizeBatch(batch_size);
		// Change the optimization function from the default GradientDescent to Adam.
		neural_net.setOptimization<mic::neural_nets::optimization::Adam<float> >();
		// Set the loss function -> regression!
		neural_net.setLoss <mic::neural_nets::loss::SquaredErrorLoss<float> >();

		LOG(LINFO) << "Generated new neural network";
	}//: else

	// Set batch size in the experience replay memory.
	experiences.setBatchSize(batch_size);

	// Set displayed matrix pointers.
	wmd_environment->setMazePointer(env.getEnvironment());
	wmd_environment->setPathPointer(saccadic_path);
	wmd_observation->setMazePointer(env.getObservation());
}


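// Episode initialization: generates a new maze, clears the saccadic path and stores the initial
// agent position as its first element.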
void MazeOfDigitsDLRERPOMPD::startNewEpisode() {
	LOG(LSTATUS) << "Starting new episode " << episode;

	// Generate a new maze (and move the agent to its initial position).
	saccadic_path->clear();
	// Add the first, initial position to the saccadic path.
	saccadic_path->push_back(env.getAgentPosition());

	/*LOG(LNOTICE) << "Network responses: \n" << streamNetworkResponseTable();
	LOG(LNOTICE) << "Observation: \n" << env.observationToString();
	LOG(LNOTICE) << "Environment: \n" << env.environmentToString();*/
	// Do not forget to get the current observation!
}


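// Episode finalization: updates the path-length statistics (episodic, average, optimal and the
// optimal-to-episodic ratio), adds them to the data collector, exports the data to CSV and
// (depending on the mlnn_save property) serializes the neural network to a file.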
void MazeOfDigitsDLRERPOMPD::finishCurrentEpisode() {
	LOG(LTRACE) << "End current episode";

	sum_of_iterations += iteration -1; // -1 is the fix related to moving the terminal condition to the front of step!
	float opt_to_episodic = (float)env.optimalPathLength() / (iteration -1);
	sum_of_opt_to_episodic_lenghts += opt_to_episodic;

	// Add variables to containers.
	collector_ptr->addDataToContainer("path_length_episode", (iteration -1));
	collector_ptr->addDataToContainer("path_length_average", (float)sum_of_iterations/episode);
	collector_ptr->addDataToContainer("path_length_optimal", (float)env.optimalPathLength());
	collector_ptr->addDataToContainer("path_opt_to_episodic", opt_to_episodic);
	collector_ptr->addDataToContainer("path_opt_to_episodic_average", sum_of_opt_to_episodic_lenghts/episode);

	// Export the reward "convergence" diagram.
	collector_ptr->exportDataToCsv(statistics_filename);

	// Save the neural network to a file.
	if (mlnn_save && (episode % 10))
		neural_net.save(mlnn_filename);
}


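// Builds a human-readable table of Q-values (and the resulting greedy actions) for every cell of
// the current observation window. The agent is temporarily moved to each cell, the resulting
// observation is encoded and passed through the network in a single batch; afterwards the agent
// is moved back to its original position.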
std::string MazeOfDigitsDLRERPOMPD::streamNetworkResponseTable() {
	LOG(LTRACE) << "streamNetworkResponseTable()";
	std::string rewards_table;
	std::string actions_table;

	// Remember the current state, i.e. the player position.
	mic::types::Position2D current_player_pos_t = env.getAgentPosition();

	// Create a new matrix for the batch of inputs.
	MatrixXfPtr inputs_batch(new MatrixXf(env.getObservationSize(), batch_size));

	// Assume that batch_size = observation width * observation height.

	size_t dx = (env.getObservationWidth()-1)/2;
	size_t dy = (env.getObservationHeight()-1)/2;
	mic::types::Position2D p = env.getAgentPosition();

	// Copy data.
	for (long oy=0, ey=(p.y-dy); oy<(long)env.getObservationHeight(); oy++, ey++){
		for (long ox=0, ex=(p.x-dx); ox<(long)env.getObservationWidth(); ox++, ex++) {

			// Move the player to the given state - disregarding whether it was successful or not; answers for walls/positions outside of the environment do not interest us anyway...
			if (!env.moveAgentToPosition(Position2D(ex,ey)))
				LOG(LDEBUG) << "Failed!"; //... but still we can live with that... ;)
			// Encode the current state.
			mic::types::MatrixXfPtr encoded_state = env.encodeObservation();
			// Add to batch.
			inputs_batch->col(oy*env.getObservationWidth()+ox) = encoded_state->col(0);
		}//: for x
	}//: for y

	// Get rewards for the whole batch.
	neural_net.forward(inputs_batch);
	// Get predictions for all those states - there is no need to create a copy.
	MatrixXfPtr predicted_batch = neural_net.getPredictions();

	rewards_table += "Action values:\n";
	actions_table += "Best actions:\n";
	// Generate all possible states and all possible rewards.
	for (long oy=0, ey=(p.y-dy); oy<(long)env.getObservationHeight(); oy++, ey++){
		rewards_table += "| ";
		actions_table += "| ";
		for (long ox=0, ex=(p.x-dx); ox<(long)env.getObservationWidth(); ox++, ex++) {
			float bestqval = -std::numeric_limits<float>::infinity();
			size_t best_action = -1;
			for (size_t a=0; a<4; a++) {
				float qval = (*predicted_batch)(a, oy*env.getObservationWidth()+ox);

				rewards_table += std::to_string(qval);
				if (a==3)
					rewards_table += " | ";
				else
					rewards_table += " , ";

				// Remember the best value.
				if (env.isStateAllowed(ex,ey) && (!env.isStateTerminal(ex,ey)) && env.isActionAllowed(ex,ey,a) && (qval > bestqval)){
					bestqval = qval;
					best_action = a;
				}//: if

			}//: for a(ctions)
			switch(best_action){
				case 0 : actions_table += "N | "; break;
				case 1 : actions_table += "E | "; break;
				case 2 : actions_table += "S | "; break;
				case 3 : actions_table += "W | "; break;
				default: actions_table += "- | ";
			}//: switch

		}//: for x
		rewards_table += "\n";
		actions_table += "\n";
	}//: for y

	// Move the player back to the previous position.
	env.moveAgentToPosition(current_player_pos_t);

	return rewards_table + actions_table;
}


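// Returns the maximum predicted Q-value over all actions that are allowed in the given state;
// if no action is allowed, the returned value remains -infinity.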
float MazeOfDigitsDLRERPOMPD::computeBestValueForGivenStateAndPredictions(mic::types::Position2D player_position_, float* predictions_){
	LOG(LTRACE) << "computeBestValueForGivenStateAndPredictions()";
	float best_qvalue = -std::numeric_limits<float>::infinity();

	// Create a list of possible actions.
	std::vector<mic::types::NESWAction> actions;
	actions.push_back(A_NORTH);
	actions.push_back(A_EAST);
	actions.push_back(A_SOUTH);
	actions.push_back(A_WEST);

	for(mic::types::NESWAction action : actions) {
		// ... and find the value of the best allowed action.
		if(env.isActionAllowed(player_position_, action)) {
			float qvalue = predictions_[(size_t)action.getType()];
			if (qvalue > best_qvalue)
				best_qvalue = qvalue;
		}//: if is allowed
	}//: for

	return best_qvalue;
}


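// Returns a 4x1 matrix with the Q-values predicted for the given agent position: the agent is
// temporarily moved to that position, the observation is encoded and forwarded through the
// network, and the agent is moved back afterwards.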
mic::types::MatrixXfPtr MazeOfDigitsDLRERPOMPD::getPredictedRewardsForGivenState(mic::types::Position2D player_position_) {
	LOG(LTRACE) << "getPredictedRewardsForGivenState()";
	// Remember the current state, i.e. the player position.
	mic::types::Position2D current_player_pos_t = env.getAgentPosition();

	// Move the player to the given state.
	env.moveAgentToPosition(player_position_);

	// Encode the current state.
	mic::types::MatrixXfPtr encoded_state = env.encodeObservation();

	// Create a NEW matrix for the inputs batch.
	MatrixXfPtr inputs_batch(new MatrixXf(env.getObservationSize(), batch_size));
	inputs_batch->setZero();

	// Set the first input - only this one interests us.
	inputs_batch->col(0) = encoded_state->col(0);

	//LOG(LERROR) << "Getting predictions for input batch:\n" << inputs_batch->transpose();

	// Pass the data and get predictions.
	neural_net.forward(inputs_batch);

	MatrixXfPtr predictions_batch = neural_net.getPredictions();

	//LOG(LERROR) << "Resulting predictions batch:\n" << predictions_batch->transpose();

	// Get the first prediction only.
	MatrixXfPtr predictions_sample(new MatrixXf(4, 1));
	predictions_sample->col(0) = predictions_batch->col(0);

	//LOG(LERROR) << "Returned predictions sample:\n" << predictions_sample->transpose();

	// Move the player back to the previous position.
	env.moveAgentToPosition(current_player_pos_t);

	// Return the predictions.
	return predictions_sample;
}

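// Greedy action selection: returns the allowed action with the highest predicted Q-value for the
// given state, or a random action if none of the four actions is allowed.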
mic::types::NESWAction MazeOfDigitsDLRERPOMPD::selectBestActionForGivenState(mic::types::Position2D player_position_){
	LOG(LTRACE) << "selectBestAction";

	// Greedy method - the action with the greatest predicted value will be selected.
	mic::types::NESWAction best_action = A_RANDOM;
	float best_qvalue = -std::numeric_limits<float>::infinity();

	// Create a list of possible actions.
	std::vector<mic::types::NESWAction> actions;
	actions.push_back(A_NORTH);
	actions.push_back(A_EAST);
	actions.push_back(A_SOUTH);
	actions.push_back(A_WEST);

	// Check the results of actions one by one... (there is no need to create a separate copy of predictions).
	MatrixXfPtr predictions_sample = getPredictedRewardsForGivenState(player_position_);
	//LOG(LERROR) << "Selecting action from predictions:\n" << predictions_sample->transpose();
	float* pred = predictions_sample->data();

	for(size_t a=0; a<4; a++) {
		// Find the best allowed action.
		if(env.isActionAllowed(player_position_, mic::types::NESWAction((mic::types::NESW)a))) {
			float qvalue = pred[a];
			if (qvalue > best_qvalue){
				best_qvalue = qvalue;
				best_action.setAction((mic::types::NESW)a);
			}
		}//: if is allowed
	}//: for

	return best_action;
}

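// A single step of the agent: checks the terminal condition, selects an action (epsilon-greedy),
// executes it in the environment, stores the resulting experience in the replay memory and, once
// enough experiences are collected, trains the network on a random batch (Deep Q-learning).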
bool MazeOfDigitsDLRERPOMPD::performSingleStep() {
	LOG(LSTATUS) << "Episode "<< episode << ": step " << iteration << "";

	// Check whether state t is terminal - if so, finish the episode.
	if (env.isStateTerminal(env.getAgentPosition()))
		return false;

	// TMP!
	double nn_weight_decay = 0;

	// Get the player position at time t.
	mic::types::Position2D player_pos_t = env.getAgentPosition();
	LOG(LINFO) << "Agent position at state t: " << player_pos_t;

	// Select the action.
	mic::types::NESWAction action;
	//action = A_NORTH;
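	// Exploration schedule: a negative epsilon property switches to a decaying exploration
	// rate of 1.0/(1.0+sqrt(episode)); the resulting value is clamped from below at 0.1.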
	double eps = (double)epsilon;
	if ((double)epsilon < 0)
		eps = 1.0/(1.0+sqrt(episode));
	if (eps < 0.1)
		eps = 0.1;
	LOG(LDEBUG) << "eps = " << eps;
	bool random = false;

	// Epsilon-greedy action selection.
	if (RAN_GEN->uniRandReal() > eps){
		// Select the best action.
		action = selectBestActionForGivenState(player_pos_t);
	} else {
		// Random action.
		action = A_RANDOM;
		random = true;
	}//: if

	// Execute the action - do not monitor the success.
	env.moveAgent(action);

	// Get the new state s(t+1).
	mic::types::Position2D player_pos_t_prim = env.getAgentPosition();
	LOG(LINFO) << "Agent position at t+1: " << player_pos_t_prim << " after performing the action = " << action << ((random) ? " [Random]" : "");

	// Add this position to the saccadic path.
	saccadic_path->push_back(player_pos_t_prim);

	// Collect the experience.
	SpatialExperiencePtr exp(new SpatialExperience(player_pos_t, action, player_pos_t_prim));
	// Create an empty matrix for rewards - it will be recalculated each time the experience is replayed anyway.
	MatrixXfPtr rewards (new MatrixXf(4, batch_size));
	// Add the experience to the experience table.
	experiences.add(exp, rewards);

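	// The replay below works in three passes over the sampled batch: (1) re-encode the observations
	// at time t and forward them to obtain the current Q-predictions, which initialize the targets;
	// (2) encode the observations at time t+1 and forward them to obtain Q(s_{t+1}, .);
	// (3) overwrite the target of the executed action with the Q-learning target and train.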
	// Deep Q-learning - train the network with a random sample from the experience memory.
	if (experiences.size() >= 2*batch_size) {
		// Create new matrices for the batches of inputs and targets.
		MatrixXfPtr inputs_t_batch(new MatrixXf(env.getObservationSize(), batch_size));
		MatrixXfPtr inputs_t_prim_batch(new MatrixXf(env.getObservationSize(), batch_size));
		MatrixXfPtr targets_t_batch(new MatrixXf(4, batch_size));

		// Get a random batch.
		SpatialExperienceBatch geb = experiences.getRandomBatch();

		// Debug purposes.
		geb.setNextSampleIndex(0);
		for (size_t i=0; i<batch_size; i++) {
			SpatialExperienceSample ges = geb.getNextSample();
			SpatialExperiencePtr ge_ptr = ges.data();
			LOG(LDEBUG) << "Training sample : " << ge_ptr->s_t << " -> " << ge_ptr->a_t << " -> " << ge_ptr->s_t_prim;
		}//: for

		// Iterate through the samples and create inputs_t_batch.
		for (size_t i=0; i<batch_size; i++) {
			SpatialExperienceSample ges = geb.getNextSample();
			SpatialExperiencePtr ge_ptr = ges.data();

			// Replay the experience:
			// "simulate" moving the player to the position from state/time (t).
			env.moveAgentToPosition(ge_ptr->s_t);
			// Encode the state at time (t).
			mic::types::MatrixXfPtr encoded_state_t = env.encodeObservation();
			//float* state = encoded_state_t->data();

			// Copy the encoded state to the inputs batch.
			inputs_t_batch->col(i) = encoded_state_t->col(0);
		}//: for samples

		// Get the network responses.
		neural_net.forward(inputs_t_batch);
		// Get predictions for all those states...
		MatrixXfPtr predictions_t_batch = neural_net.getPredictions();
		// ... and copy them to the targets matrix - a container which we will modify.
		(*targets_t_batch) = (*predictions_t_batch);

		// Iterate through the samples and create inputs_t_prim_batch.
		geb.setNextSampleIndex(0);
		for (size_t i=0; i<batch_size; i++) {
			SpatialExperienceSample ges = geb.getNextSample();
			SpatialExperiencePtr ge_ptr = ges.data();

			// Replay the experience:
			// "simulate" moving the player to the position from state/time (t+1).
			env.moveAgentToPosition(ge_ptr->s_t_prim);
			// Encode the state at time (t+1).
			mic::types::MatrixXfPtr encoded_state_t = env.encodeObservation();
			//float* state = encoded_state_t->data();

			// Copy the encoded state to the inputs batch.
			inputs_t_prim_batch->col(i) = encoded_state_t->col(0);
		}//: for samples

		// Get the network responses.
		neural_net.forward(inputs_t_prim_batch);
		// Get predictions for all those states...
		MatrixXfPtr predictions_t_prim_batch = neural_net.getPredictions();

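		// Q-learning target for the executed action a_t:
		//   target(a_t) = getStateReward(s_{t+1})                              if s_{t+1} is terminal,
		//   target(a_t) = step_reward + discount_rate * max_a' Q(s_{t+1}, a')  otherwise (if finite).
		// The targets of the remaining actions stay equal to the network's current predictions.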
		// Calculate the targets, one by one.
		// Iterate through the samples once more and update the targets for the executed actions.
		geb.setNextSampleIndex(0);
		for (size_t i=0; i<batch_size; i++) {
			SpatialExperienceSample ges = geb.getNextSample();
			SpatialExperiencePtr ge_ptr = ges.data();

			/*if (ge_ptr->s_t == ge_ptr->s_t_prim) {
				// The move was not possible! Learn that as well.
				(*targets_t_batch)((size_t)ge_ptr->a_t.getType(), i) = 3*step_reward;
			} else*/
			if(env.isStateTerminal(ge_ptr->s_t_prim)) {
				// The state at (t+1) appears to be terminal - learn its reward.
				(*targets_t_batch)((size_t)ge_ptr->a_t.getType(), i) = env.getStateReward(ge_ptr->s_t_prim);
			} else {
				MatrixXfPtr preds_t_prim (new MatrixXf(4, 1));
				preds_t_prim->col(0) = predictions_t_prim_batch->col(i);
				// Get the best value for the NEXT state - the position from state (t+1).
				float max_q_st_prim_at_prim = computeBestValueForGivenStateAndPredictions(ge_ptr->s_t_prim, preds_t_prim->data());
				// If the best value of the next state is finite -
				// update the target for the given action - Deep Q-learning!
				if (std::isfinite(max_q_st_prim_at_prim))
					(*targets_t_batch)((size_t)ge_ptr->a_t.getType(), i) = step_reward + discount_rate*max_q_st_prim_at_prim;
			}//: else

		}//: for

		LOG(LDEBUG) << "Inputs batch:\n" << inputs_t_batch->transpose();
		LOG(LDEBUG) << "Targets batch:\n" << targets_t_batch->transpose();

		// Perform the Deep Q-learning.
		//LOG(LDEBUG) << "Network responses before training:" << std::endl << streamNetworkResponseTable();

		// Train the network with the computed targets.
		float loss = neural_net.train (inputs_t_batch, targets_t_batch, learning_rate, nn_weight_decay);
		LOG(LDEBUG) << "Training loss:" << loss;

		//LOG(LDEBUG) << "Network responses after training:" << std::endl << streamNetworkResponseTable();

		// Finish the replay: move the player back to the real, current position.
		env.moveAgentToPosition(player_pos_t_prim);
	}//: if enough experiences
	else
		LOG(LWARNING) << "Not enough samples in the experience replay memory!";

	LOG(LNOTICE) << "Network responses: \n" << streamNetworkResponseTable();
	LOG(LNOTICE) << "Observation: \n" << env.observationToString();
	LOG(LNOTICE) << "Environment: \n" << env.environmentToString();
	// Do not forget to get the current observation!

	// Check whether we have reached the maximum number of iterations.
	if ((step_limit>0) && (iteration >= (size_t)step_limit))
		return false;

	return true;
}

} /* namespace application */
} /* namespace mic */