#include <utils/RandomGenerator.hpp>

namespace application {

// Constructor: registration of the configuration properties (with their default values).
	saccadic_path(new std::vector<mic::types::Position2D>()),
	step_reward("step_reward", 0.0),
	discount_rate("discount_rate", 0.9),
	learning_rate("learning_rate", 0.005),
	epsilon("epsilon", 0.1),
	step_limit("step_limit", 0),
	statistics_filename("statistics_filename", "maze_of_digits_drl_er_statistics.csv"),
	mlnn_filename("mlnn_filename", "maze_of_digits_drl_er_mlnn.txt"),
	mlnn_save("mlnn_save", false),
	mlnn_load("mlnn_load", false),

	LOG(LINFO) << "Properties registered";
// initialize(): initialization of GLUT, the data collector containers and the statistics chart window.
	VGL_MANAGER->initializeGLUT(argc, argv);

	collector_ptr = std::make_shared<mic::utils::DataCollector<std::string, float> >();

	collector_ptr->createContainer("path_length_episode", mic::types::color_rgba(0, 255, 0, 180));
	collector_ptr->createContainer("path_length_average", mic::types::color_rgba(180, 255, 0, 180));
	collector_ptr->createContainer("path_length_optimal", mic::types::color_rgba(0, 255, 180, 180));
	collector_ptr->createContainer("path_opt_to_episodic", mic::types::color_rgba(255, 180, 180, 180));
	collector_ptr->createContainer("path_opt_to_episodic_average", mic::types::color_rgba(255, 255, 255, 180));

	w_chart = new WindowCollectorChart<float>("MazeOfDigitsDLRERPOMPD", 256, 512, 0, 0);
// Construction of the multi-layer neural network approximating the Q-values (250 inputs, 4 outputs).
	neural_net.pushLayer(new Linear<float>(250, 100));
	neural_net.pushLayer(new Linear<float>(100, 4));

	neural_net.setOptimization<mic::neural_nets::optimization::Adam<float> >();
	neural_net.setLoss<mic::neural_nets::loss::SquaredErrorLoss<float> >();

	LOG(LINFO) << "Generated new neural network";
// Episode bookkeeping: logging of episode start/end and collection of path-length statistics.
	LOG(LSTATUS) << "Starting new episode " << episode;

	LOG(LTRACE) << "End current episode";

	collector_ptr->addDataToContainer("path_length_episode", (iteration - 1));
	collector_ptr->addDataToContainer("path_opt_to_episodic", opt_to_episodic);
// streamNetworkResponseTable(): builds two text tables with the predicted action values and the best (greedy) actions.
	LOG(LTRACE) << "streamNetworkResponseTable()";
	std::string rewards_table;
	std::string actions_table;

	LOG(LDEBUG) << "Failed!";

	MatrixXfPtr predicted_batch = neural_net.getPredictions();

	rewards_table += "Action values:\n";
	actions_table += "Best actions:\n";

	rewards_table += "| ";
	actions_table += "| ";

	float bestqval = -std::numeric_limits<float>::infinity();
	size_t best_action = -1;
	for (size_t a = 0; a < 4; a++) {

	rewards_table += std::to_string(qval);

	rewards_table += " | ";

	rewards_table += " , ";

	case 0: actions_table += "N | "; break;
	case 1: actions_table += "E | "; break;
	case 2: actions_table += "S | "; break;
	case 3: actions_table += "W | "; break;
	default: actions_table += "- | ";

	rewards_table += "\n";
	actions_table += "\n";

	return rewards_table + actions_table;
// computeBestValueForGivenState(): the value of a state is the maximum predicted Q-value over the four NESW actions.
	LOG(LTRACE) << "computeBestValueForGivenState()";
	float best_qvalue = -std::numeric_limits<float>::infinity();

	std::vector<mic::types::NESWAction> actions;
	actions.push_back(A_NORTH);
	actions.push_back(A_EAST);
	actions.push_back(A_SOUTH);
	actions.push_back(A_WEST);

	for (mic::types::NESWAction action : actions) {

	float qvalue = predictions_[(size_t)action.getType()];
	if (qvalue > best_qvalue)
		best_qvalue = qvalue;
// getPredictedRewardsForGivenState(): queries the network with a single encoded state placed in column 0 of a zeroed batch.
	LOG(LTRACE) << "getPredictedRewardsForGivenState()";

	inputs_batch->setZero();

	inputs_batch->col(0) = encoded_state->col(0);

	MatrixXfPtr predictions_batch = neural_net.getPredictions();

	MatrixXfPtr predictions_sample(new MatrixXf(4, 1));
	predictions_sample->col(0) = predictions_batch->col(0);

	return predictions_sample;
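// A standalone sketch of the single-state query pattern used above, assuming the network always
// consumes a full batch: the encoded state is written into column 0 of a zeroed input batch and
// only column 0 of the returned predictions is kept. The forward_ callable is a hypothetical
// stand-in for the network's forward pass followed by getPredictions().
#include <Eigen/Dense>
#include <functional>
#include <memory>

std::shared_ptr<Eigen::MatrixXf> sketchQuerySingleState(const Eigen::MatrixXf &encoded_state,
		Eigen::MatrixXf &inputs_batch,
		const std::function<Eigen::MatrixXf(const Eigen::MatrixXf &)> &forward_) {
	inputs_batch.setZero();                       // clear the whole batch
	inputs_batch.col(0) = encoded_state.col(0);   // place the single encoded state in column 0
	Eigen::MatrixXf predictions_batch = forward_(inputs_batch);
	auto sample = std::make_shared<Eigen::MatrixXf>(4, 1);
	sample->col(0) = predictions_batch.col(0);    // keep only the four Q-values of that state
	return sample;
}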
// selectBestAction(): greedy choice of the highest-valued action among those allowed in the given position.
	LOG(LTRACE) << "selectBestAction";

	mic::types::NESWAction best_action = A_RANDOM;
	float best_qvalue = -std::numeric_limits<float>::infinity();

	std::vector<mic::types::NESWAction> actions;
	actions.push_back(A_NORTH);
	actions.push_back(A_EAST);
	actions.push_back(A_SOUTH);
	actions.push_back(A_WEST);

	float* pred = predictions_sample->data();

	for (size_t a = 0; a < 4; a++) {

	if (env.isActionAllowed(player_position_, mic::types::NESWAction((mic::types::NESW)a))) {
		float qvalue = pred[a];
		if (qvalue > best_qvalue) {
			best_qvalue = qvalue;
			best_action.setAction((mic::types::NESW)a);
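// A standalone sketch of the greedy choice above: among the four NESW actions, keep the
// highest-valued one that the environment allows from the current position. isAllowed_ is a
// hypothetical stand-in for env.isActionAllowed(); -1 signals that no action was allowed.
#include <functional>
#include <limits>

int sketchSelectBestAllowedAction(const float q_values[4],
		const std::function<bool(size_t)> &isAllowed_) {
	int best = -1;
	float best_q = -std::numeric_limits<float>::infinity();
	for (size_t a = 0; a < 4; a++) {
		if (!isAllowed_(a))
			continue;                             // skip moves leading outside the maze
		if (q_values[a] > best_q) {
			best_q = q_values[a];
			best = static_cast<int>(a);
		}
	}
	return best;
}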
// performSingleStep(): epsilon-greedy action selection with an exploration rate decaying over episodes.
	LOG(LSTATUS) << "Episode " << episode << ": step " << iteration;

	double nn_weight_decay = 0;

	LOG(LINFO) << "Agent position at state t: " << player_pos_t;

	mic::types::NESWAction action;

	eps = 1.0 / (1.0 + sqrt(episode));

	LOG(LDEBUG) << "eps = " << eps;

	if (RAN_GEN->uniRandReal() > eps) {

	LOG(LINFO) << "Agent position at t+1: " << player_pos_t_prim
		<< " after performing the action = " << action << ((random) ? " [Random]" : "");
// Experience replay: a batch of stored experiences (s_t, a_t, s_t') is replayed and Q-learning targets are assembled.
	MatrixXfPtr rewards(new MatrixXf(4, batch_size));

	MatrixXfPtr targets_t_batch(new MatrixXf(4, batch_size));

	geb.setNextSampleIndex(0);

	LOG(LDEBUG) << "Training sample : " << ge_ptr->s_t << " -> " << ge_ptr->a_t << " -> " << ge_ptr->s_t_prim;

	inputs_t_batch->col(i) = encoded_state_t->col(0);

	MatrixXfPtr predictions_t_batch = neural_net.getPredictions();

	(*targets_t_batch) = (*predictions_t_batch);

	geb.setNextSampleIndex(0);

	inputs_t_prim_batch->col(i) = encoded_state_t->col(0);

	MatrixXfPtr predictions_t_prim_batch = neural_net.getPredictions();

	geb.setNextSampleIndex(0);

	(*targets_t_batch)((size_t)ge_ptr->a_t.getType(), i) = env.getStateReward(ge_ptr->s_t_prim);

	MatrixXfPtr preds_t_prim(new MatrixXf(4, 1));
	preds_t_prim->col(0) = predictions_t_prim_batch->col(i);

	if (std::isfinite(max_q_st_prim_at_prim))

	LOG(LDEBUG) << "Inputs batch:\n" << inputs_t_batch->transpose();
	LOG(LDEBUG) << "Targets batch:\n" << targets_t_batch->transpose();
	LOG(LDEBUG) << "Training loss:" << loss;

	LOG(LWARNING) << "Not enough samples in the experience replay memory!";
virtual void startNewEpisode()
mic::utils::DataCollectorPtr< std::string, float > collector_ptr
Data collector.
virtual float getStateReward(mic::types::Position2D pos_)
virtual std::string observationToString()
mic::configuration::Property< std::string > statistics_filename
Property: name of the file to which the statistics will be exported.
mic::configuration::Property< float > discount_rate
virtual void add(std::shared_ptr< mic::types::SpatialExperience > input_, std::shared_ptr< mic::types::MatrixXf > target_)
long long sum_of_iterations
virtual void initialize(int argc, char *argv[])
virtual void initializeEnvironment()
std::shared_ptr< std::vector< mic::types::Position2D > > saccadic_path
Saccadic path - a sequence of consecutive agent positions.
WindowMazeOfDigits * wmd_observation
Window displaying the observation.
virtual size_t getObservationSize()
virtual bool isStateTerminal(mic::types::Position2D pos_)
mic::configuration::Property< float > learning_rate
std::shared_ptr< mic::types::SpatialExperience > SpatialExperiencePtr
Shared pointer to spatial experience object.
double sum_of_opt_to_episodic_lenghts
mic::configuration::Property< std::string > mlnn_filename
Property: name of the file to which the neural network will be serialized (or deserialized from)...
virtual mic::types::Position2D getAgentPosition()
mic::types::MatrixXfPtr getPredictedRewardsForGivenState(mic::types::Position2D player_position_)
virtual bool isStateAllowed(mic::types::Position2D pos_)
Structure storing a spatial experience - a triplet of position in time t, executed action and positio...
mic::types::TensorXfPtr getObservation()
WindowCollectorChart< float > * w_chart
Window for displaying statistics.
virtual size_t getObservationWidth()
unsigned int optimalPathLength()
virtual ~MazeOfDigitsDLRERPOMPD()
SpatialExperienceMemory experiences
virtual bool isActionAllowed(long x_, long y_, size_t action_)
mic::configuration::Property< bool > mlnn_load
Property: flag denoting whether the nn should be loaded from a file (at the initialization of the tas...
virtual size_t getEnvironmentWidth()
virtual void finishCurrentEpisode()
mic::environments::MazeOfDigits env
The maze of digits environment.
mic::configuration::Property< double > epsilon
Application of Partially Observable Deep Q-learning with Experience Replay to the maze of digits probl...
BackpropagationNeuralNetwork< float > neural_net
Multi-layer neural network used for approximation of the Q-state rewards.
bool moveAgent(mic::types::Action2DInterface ac_)
WindowMazeOfDigits * wmd_environment
Window displaying the whole environment.
mic::configuration::Property< int > step_limit
virtual bool moveAgentToPosition(mic::types::Position2D pos_)
size_t batch_size
Size of the batch in experience replay - set to the size of the maze (width*height).
float computeBestValueForGivenStateAndPredictions(mic::types::Position2D player_position_, float *predictions_)
virtual size_t getEnvironmentHeight()
virtual mic::types::MatrixXfPtr encodeObservation()
mic::types::Batch< mic::types::SpatialExperience, mic::types::MatrixXf > SpatialExperienceBatch
Spatial experience replay batch.
mic::types::NESWAction selectBestActionForGivenState(mic::types::Position2D player_position_)
mic::types::TensorXfPtr & getEnvironment()
void RegisterApplication(void)
Registers application.
mic::types::Sample< mic::types::SpatialExperience, mic::types::MatrixXf > SpatialExperienceSample
Spatial experience replay sample.
std::string streamNetworkResponseTable()
virtual void initializePropertyDependentVariables()
mic::configuration::Property< float > step_reward
virtual size_t getObservationHeight()
MazeOfDigitsDLRERPOMPD(std::string node_name_="application")
virtual bool performSingleStep()
mic::configuration::Property< bool > mlnn_save
Property: flag denoting whether the nn should be saved to a file (after every episode end)...
virtual std::string environmentToString()