MachineIntelligenceCore:ReinforcementLearning
GridworldDeepQLearning.cpp
#include <application/GridworldDeepQLearning.hpp>

#include <limits>
#include <utils/RandomGenerator.hpp>

namespace mic {
namespace application {

void RegisterApplication (void) {
    REGISTER_APPLICATION(mic::application::GridworldDeepQLearning);
}

GridworldDeepQLearning::GridworldDeepQLearning(std::string node_name_) : OpenGLEpisodicApplication(node_name_),
    step_reward("step_reward", 0.0),
    discount_rate("discount_rate", 0.9),
    learning_rate("learning_rate", 0.1),
    epsilon("epsilon", 0.1),
    statistics_filename("statistics_filename", "dql_statistics.csv"),
    mlnn_filename("mlnn_filename", "dql_mlnn.txt"),
    mlnn_save("mlnn_save", false),
    mlnn_load("mlnn_load", false)
    {
    // Register properties - so their values can be overridden (read from the configuration file).
    registerProperty(step_reward);
    registerProperty(discount_rate);
    registerProperty(learning_rate);
    registerProperty(epsilon);
    registerProperty(statistics_filename);
    registerProperty(mlnn_filename);
    registerProperty(mlnn_save);
    registerProperty(mlnn_load);

    LOG(LINFO) << "Properties registered";
}


GridworldDeepQLearning::~GridworldDeepQLearning() {
    delete(w_chart);
}


void GridworldDeepQLearning::initialize(int argc, char* argv[]) {
    // Initialize GLUT! :]
    VGL_MANAGER->initializeGLUT(argc, argv);

    collector_ptr = std::make_shared< mic::utils::DataCollector<std::string, float> >();
    // Add containers to collector.
    collector_ptr->createContainer("number_of_steps", mic::types::color_rgba(255, 0, 0, 180));
    collector_ptr->createContainer("average_number_of_steps", mic::types::color_rgba(255, 255, 0, 180));
    collector_ptr->createContainer("collected_reward", mic::types::color_rgba(0, 255, 0, 180));
    collector_ptr->createContainer("average_collected_reward", mic::types::color_rgba(0, 255, 255, 180));

    sum_of_iterations = 0;
    sum_of_rewards = 0;

    // Create the visualization windows - must be created in the same, main thread :]
    w_chart = new WindowCollectorChart<float>("GridworldDeepQLearning", 256, 256, 0, 0);
    w_chart->setDataCollectorPtr(collector_ptr);

}

void GridworldDeepQLearning::initializePropertyDependentVariables() {
    // Initialize the gridworld.
    grid_env.initializeEnvironment();

    // Try to load neural network from file.
    if ((mlnn_load) && (neural_net.load(mlnn_filename))) {
        // Do nothing ;)
    } else {
        // Create a simple neural network.
        // gridworld (w*h) -> 250 -> 100 -> 4 -> regression; batch size is set to one.
        neural_net.pushLayer(new Linear<float>((size_t) grid_env.getEnvironmentWidth() * grid_env.getEnvironmentHeight(), 250));
        neural_net.pushLayer(new ReLU<float>(250));
        neural_net.pushLayer(new Linear<float>(250, 100));
        neural_net.pushLayer(new ReLU<float>(100));
        neural_net.pushLayer(new Linear<float>(100, 4));

        // Set batch size to 1.
        //neural_net.resizeBatch(1);
        // Change optimization function from default GradientDescent to Adam.
        neural_net.setOptimization<mic::neural_nets::optimization::Adam<float> >();
        // Set loss function -> regression!
        neural_net.setLoss<mic::neural_nets::loss::SquaredErrorLoss<float> >();
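        // With the squared-error loss the network regresses the four Q(s,a) values directly;
        // only the entry of the performed action is changed in the training target (see performSingleStep()).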

        LOG(LINFO) << "Generated new neural network";
    }//: else
}

void GridworldDeepQLearning::startNewEpisode() {
    LOG(LSTATUS) << "Starting new episode " << episode;

    // Generate the gridworld (and move player to initial position).
    grid_env.initializeEnvironment();

    LOG(LSTATUS) << "Network responses: \n" << streamNetworkResponseTable();
    LOG(LSTATUS) << "Environment: \n" << grid_env.environmentToString();

}

void GridworldDeepQLearning::finishCurrentEpisode() {
    LOG(LTRACE) << "End of the episode " << episode;

    float reward = grid_env.getStateReward(grid_env.getAgentPosition());
    sum_of_iterations += iteration;
    sum_of_rewards += reward;

    // Add variables to containers.
    collector_ptr->addDataToContainer("number_of_steps", iteration);
    collector_ptr->addDataToContainer("average_number_of_steps", (float)sum_of_iterations/episode);
    collector_ptr->addDataToContainer("collected_reward", reward);
    collector_ptr->addDataToContainer("average_collected_reward", (float)sum_of_rewards/episode);

    // Export the reward "convergence" diagram.
    collector_ptr->exportDataToCsv(statistics_filename);

    // Save the neural network to file.
    if (mlnn_save)
        neural_net.save(mlnn_filename);
}


std::string GridworldDeepQLearning::streamNetworkResponseTable() {
    LOG(LTRACE) << "streamNetworkResponseTable()";
    std::string rewards_table;
    std::string actions_table;

    // Remember the current state, i.e. the player position.
    mic::types::Position2D current_player_pos_t = grid_env.getAgentPosition();

    rewards_table += "Action values:\n";
    actions_table += "Best actions:\n";
    // Generate all possible states and all possible rewards.
    for (size_t y=0; y<grid_env.getEnvironmentHeight(); y++){
        rewards_table += "| ";
        actions_table += "| ";
        for (size_t x=0; x<grid_env.getEnvironmentWidth(); x++) {
            float bestqval = -std::numeric_limits<float>::infinity();
            size_t best_action = -1;

            // Check the network response for the given state.
            grid_env.moveAgentToPosition(Position2D(x,y));
            mic::types::MatrixXfPtr tmp_state = grid_env.encodeAgentGrid();
            //std::cout<< "tmp_state = " << tmp_state->transpose() << std::endl;
            // Pass the data and get predictions.
            neural_net.forward(tmp_state);
            mic::types::MatrixXfPtr tmp_predicted_rewards = neural_net.getPredictions();
            float* qstate = tmp_predicted_rewards->data();

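            // The four network outputs are the Q-values of the N, E, S and W actions for this state.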
            for (size_t a=0; a<4; a++) {
                float qval = qstate[a];

                rewards_table += std::to_string(qval);
                if (a==3)
                    rewards_table += " | ";
                else
                    rewards_table += " , ";

                // Remember the best value.
                if (grid_env.isStateAllowed(x,y) && (!grid_env.isStateTerminal(x,y)) && grid_env.isActionAllowed(x,y,a) && (qval > bestqval)){
                    bestqval = qval;
                    best_action = a;
                }//: if

            }//: for a(ctions)
            switch(best_action){
                case 0 : actions_table += "N | "; break;
                case 1 : actions_table += "E | "; break;
                case 2 : actions_table += "S | "; break;
                case 3 : actions_table += "W | "; break;
                default: actions_table += "- | ";
            }//: switch

        }//: for x
        rewards_table += "\n";
        actions_table += "\n";
    }//: for y


    // Move player to previous position.
    grid_env.moveAgentToPosition(current_player_pos_t);

    return rewards_table + actions_table;
}


float GridworldDeepQLearning::computeBestValueForCurrentState() {
    LOG(LTRACE) << "computeBestValue";
    float best_qvalue = -std::numeric_limits<float>::infinity();

    // Create a list of possible actions.
    std::vector<mic::types::NESWAction> actions;
    actions.push_back(A_NORTH);
    actions.push_back(A_EAST);
    actions.push_back(A_SOUTH);
    actions.push_back(A_WEST);

    // Check the results of actions one by one... (there is no need to create a separate copy of predictions)
    MatrixXfPtr predictions_sample = getPredictedRewardsForCurrentState();
    //LOG(LERROR) << "Selecting action from predictions:\n" << predictions_sample->transpose();
    float* pred = predictions_sample->data();

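    // Predictions are indexed by the NESW action type: 0 = N, 1 = E, 2 = S, 3 = W.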
    for(mic::types::NESWAction action : actions) {
        // ... and find the value of the best allowed action.
        if(grid_env.isActionAllowed(action)) {
            float qvalue = pred[(size_t)action.getType()];
            if (qvalue > best_qvalue){
                best_qvalue = qvalue;
            }
        }//: if is allowed
    }//: for

    return best_qvalue;
}

mic::types::MatrixXfPtr GridworldDeepQLearning::getPredictedRewardsForCurrentState() {
    // Encode the current state.
    mic::types::MatrixXfPtr encoded_state = grid_env.encodeAgentGrid();
    // Pass the data and get predictions.
    neural_net.forward(encoded_state);
    // Return the predictions.
    return neural_net.getPredictions();
}

mic::types::NESWAction GridworldDeepQLearning::selectBestActionForCurrentState() {
    LOG(LTRACE) << "selectBestAction";

    // Greedy method - returns the action with the greatest predicted value.
    mic::types::NESWAction best_action = A_RANDOM;
    float best_qvalue = -std::numeric_limits<float>::infinity();

    // Create a list of possible actions.
    std::vector<mic::types::NESWAction> actions;
    actions.push_back(A_NORTH);
    actions.push_back(A_EAST);
    actions.push_back(A_SOUTH);
    actions.push_back(A_WEST);

    // Check the results of actions one by one... (there is no need to create a separate copy of predictions)
    MatrixXfPtr predictions_sample = getPredictedRewardsForCurrentState();
    //LOG(LERROR) << "Selecting action from predictions:\n" << predictions_sample->transpose();
    float* pred = predictions_sample->data();

    for(size_t a=0; a<4; a++) {
        // Find the best allowed action.
        if(grid_env.isActionAllowed(mic::types::NESWAction((mic::types::NESW)a))) {
            float qvalue = pred[a];
            if (qvalue > best_qvalue){
                best_qvalue = qvalue;
                best_action.setAction((mic::types::NESW)a);
            }
        }//: if is allowed
    }//: for

    return best_action;
}

bool GridworldDeepQLearning::performSingleStep() {
    LOG(LSTATUS) << "Episode " << episode << ": step " << iteration;

    // TMP!
    double nn_weight_decay = 0;

    // Get the player position at time t.
    mic::types::Position2D player_pos_t = grid_env.getAgentPosition();

    // Encode the current state at time t.
    mic::types::MatrixXfPtr encoded_state_t = grid_env.encodeAgentGrid();

    // Get the predicted rewards at time t...
    MatrixXfPtr tmp_rewards_t = getPredictedRewardsForCurrentState();
    // ... but make a local copy!
    MatrixXfPtr predicted_rewards_t (new MatrixXf(*tmp_rewards_t));
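    // The copy is needed because subsequent forward() passes (e.g. in computeBestValueForCurrentState())
    // overwrite the matrix returned by getPredictions(); the copy also serves as the training target below.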
    LOG(LINFO) << "Agent position at state t: " << player_pos_t;
    LOG(LSTATUS) << "Predicted rewards for state t: " << predicted_rewards_t->transpose();

    // Select the action.
    mic::types::NESWAction action;
    //action = A_NORTH;
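    // A negative epsilon property switches to a decaying exploration schedule: eps = 1/(1+sqrt(episode));
    // in both cases eps is clipped from below at 0.1.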
    double eps = (double)epsilon;
    if ((double)epsilon < 0)
        eps = 1.0/(1.0+sqrt(episode));
    if (eps < 0.1)
        eps = 0.1;
    LOG(LDEBUG) << "eps = " << eps;
    bool random = false;

    // Epsilon-greedy action selection.
    if (RAN_GEN->uniRandReal() > eps){
        // Select the best action.
        action = selectBestActionForCurrentState();
    } else {
        // Random action.
        action = A_RANDOM;
        random = true;
    }//: if

    // Execute the action and check whether the move was possible.
    if (!grid_env.moveAgent(action)) {
        // The move was not possible! Learn that as well.
        (*predicted_rewards_t)((size_t)action.getType(), 0) = step_reward;

    } else {
        // Ok, move performed, get rewards.

        // Get the new state s(t+1).
        mic::types::Position2D player_pos_t_prim = grid_env.getAgentPosition();

        LOG(LINFO) << "Agent position at t+1: " << player_pos_t_prim << " after performing the action = " << action << ((random) ? " [Random]" : "");

        // Check whether state t+1 is terminal.
        if(grid_env.isStateTerminal(player_pos_t_prim))
            (*predicted_rewards_t)((size_t)action.getType(), 0) = grid_env.getStateReward(player_pos_t_prim);
        else {
            // Update the running average for the given action - Deep Q-learning!
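            // Q-learning target for the performed action: Q(s_t, a_t) = r + discount_rate * max_a Q(s_{t+1}, a).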
            float r = step_reward;
            // Get the best value for the NEXT state (!).
            float max_q_st_prim_at_prim = computeBestValueForCurrentState();

            LOG(LWARNING) << "step_reward = " << step_reward;
            LOG(LWARNING) << "max_q_st_prim_at_prim = " << max_q_st_prim_at_prim;

            // If the best value of the next state is finite.
            if (std::isfinite(max_q_st_prim_at_prim))
                (*predicted_rewards_t)((size_t)action.getType(), 0) = r + discount_rate*max_q_st_prim_at_prim;
            else
                (*predicted_rewards_t)((size_t)action.getType(), 0) = r;

            // Special case - punish going back!
            if (player_pos_t_minus_prim == player_pos_t_prim)
                (*predicted_rewards_t)((size_t)action.getType(), 0) = 5*r + discount_rate*max_q_st_prim_at_prim;

        }//: else is terminal state
    }//: else !move


    // Deep Q learning - train network with the desired values.
    LOG(LERROR) << "Training with state: " << encoded_state_t->transpose();
    LOG(LERROR) << "Training with desired rewards: " << predicted_rewards_t->transpose();
    LOG(LSTATUS) << "Network responses before training:" << std::endl << streamNetworkResponseTable();

    // Train network with rewards.
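    // Note: the update uses only the current transition (effectively a batch of one); no experience replay is performed here.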
    float loss = neural_net.train(encoded_state_t, predicted_rewards_t, learning_rate, nn_weight_decay);
    LOG(LSTATUS) << "Training loss: " << loss;

    LOG(LSTATUS) << "Network responses after training:" << std::endl << streamNetworkResponseTable();
    LOG(LSTATUS) << "Current environment: \n" << grid_env.environmentToString();

    // Remember the previous position.
    player_pos_t_minus_prim = player_pos_t;
    // Check whether state t+1 is terminal - if so, finish the episode.
    if (grid_env.isStateTerminal(grid_env.getAgentPosition()))
        return false;

    return true;
}


} /* namespace application */
} /* namespace mic */