#include <utils/RandomGenerator.hpp>

namespace application {
saccadic_path(new std::vector<mic::types::Position2D>()),
step_reward("step_reward", 0.0),
discount_rate("discount_rate", 0.9),
learning_rate("learning_rate", 0.005),
epsilon("epsilon", 0.1),
step_limit("step_limit", 0),
statistics_filename("statistics_filename", "mnist_digit_drl_er_statistics.csv"),
mlnn_filename("mlnn_filename", "mnist_digit_drl_er_mlnn.txt"),
mlnn_save("mlnn_save", false),
mlnn_load("mlnn_load", false),

LOG(LINFO) << "Properties registered";
VGL_MANAGER->initializeGLUT(argc, argv);

collector_ptr = std::make_shared<mic::utils::DataCollector<std::string, float> >();

collector_ptr->createContainer("path_length_episode", mic::types::color_rgba(0, 255, 0, 180));
collector_ptr->createContainer("path_length_average", mic::types::color_rgba(255, 255, 0, 180));
collector_ptr->createContainer("path_length_optimal", mic::types::color_rgba(255, 255, 255, 180));
collector_ptr->createContainer("path_length_diff", mic::types::color_rgba(255, 0, 0, 180));

w_chart = new WindowCollectorChart<float>("MNISTDigitDLRERPOMDP", 256, 512, 0, 0);
neural_net.pushLayer(new Linear<float>(250, 100));
neural_net.pushLayer(new Linear<float>(100, 4));

neural_net.setOptimization<mic::neural_nets::optimization::Adam<float> >();
neural_net.setLoss<mic::neural_nets::loss::SquaredErrorLoss<float> >();

LOG(LINFO) << "Generated new neural network";
LOG(LSTATUS) << "Starting new episode " << episode;

LOG(LTRACE) << "End current episode";

collector_ptr->addDataToContainer("path_length_episode", (iteration - 1));
LOG(LTRACE) << "streamNetworkResponseTable()";
std::string rewards_table;
std::string actions_table;

LOG(LDEBUG) << "Failed!";

MatrixXfPtr predicted_batch = neural_net.getPredictions();

rewards_table += "Action values:\n";
actions_table += "Best actions:\n";

rewards_table += "| ";
actions_table += "| ";

float bestqval = -std::numeric_limits<float>::infinity();
size_t best_action = -1;
for (size_t a = 0; a < 4; a++) {

rewards_table += std::to_string(qval);

rewards_table += " | ";

rewards_table += " , ";

case 0: actions_table += "N | "; break;
case 1: actions_table += "E | "; break;
case 2: actions_table += "S | "; break;
case 3: actions_table += "W | "; break;
default: actions_table += "- | ";

rewards_table += "\n";
actions_table += "\n";

return rewards_table + actions_table;
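streamNetworkResponseTable() turns each grid cell's four predicted Q-values into a printable value row plus a best-action letter (N/E/S/W). The argmax that the loop above tracks by hand in bestqval/best_action can be written compactly as below; this is a standalone illustration, assuming the output ordering N, E, S, W implied by the case labels, and is not part of the application code.

// Illustrative only: index of the best of four Q-values, assuming the order N, E, S, W.
#include <algorithm>
#include <array>
#include <cstddef>
#include <iterator>

std::size_t bestActionIndex(const std::array<float, 4>& q_values) {
    return static_cast<std::size_t>(
        std::distance(q_values.begin(), std::max_element(q_values.begin(), q_values.end())));
}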
LOG(LTRACE) << "computeBestValueForGivenState()";
float best_qvalue = -std::numeric_limits<float>::infinity();

std::vector<mic::types::NESWAction> actions;
actions.push_back(A_NORTH);
actions.push_back(A_EAST);
actions.push_back(A_SOUTH);
actions.push_back(A_WEST);

for (mic::types::NESWAction action : actions) {

float qvalue = predictions_[(size_t)action.getType()];
if (qvalue > best_qvalue)
    best_qvalue = qvalue;
LOG(LTRACE) << "getPredictedRewardsForGivenState()";

inputs_batch->setZero();

inputs_batch->col(0) = encoded_state->col(0);

MatrixXfPtr predictions_batch = neural_net.getPredictions();

MatrixXfPtr predictions_sample(new MatrixXf(4, 1));
predictions_sample->col(0) = predictions_batch->col(0);

return predictions_sample;
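getPredictedRewardsForGivenState() pushes a single encoded state through a network that expects a full batch: the input batch is zeroed, the encoded state is written into column 0, and after the forward pass only column 0 of the predictions is copied out. A minimal, self-contained sketch of that pattern in plain Eigen; the names below are local to this example, not part of the mic API.

// Illustrative only: run one sample through a forward pass that expects a fixed batch width.
#include <Eigen/Dense>
#include <cstddef>
#include <functional>
#include <memory>

std::shared_ptr<Eigen::MatrixXf> predictSingle(
        const Eigen::VectorXf& encoded_state, std::ptrdiff_t batch_size,
        const std::function<Eigen::MatrixXf(const Eigen::MatrixXf&)>& forward) {
    Eigen::MatrixXf inputs = Eigen::MatrixXf::Zero(encoded_state.size(), batch_size);
    inputs.col(0) = encoded_state;              // only column 0 carries real data
    Eigen::MatrixXf outputs = forward(inputs);  // the remaining (zero) columns are ignored
    return std::make_shared<Eigen::MatrixXf>(outputs.col(0));
}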
LOG(LTRACE) << "selectBestAction";

mic::types::NESWAction best_action = A_RANDOM;
float best_qvalue = -std::numeric_limits<float>::infinity();

std::vector<mic::types::NESWAction> actions;
actions.push_back(A_NORTH);
actions.push_back(A_EAST);
actions.push_back(A_SOUTH);
actions.push_back(A_WEST);

float* pred = predictions_sample->data();

for (size_t a = 0; a < 4; a++) {

if (env.isActionAllowed(player_position_, mic::types::NESWAction((mic::types::NESW)a))) {
    float qvalue = pred[a];
    if (qvalue > best_qvalue) {
        best_qvalue = qvalue;
        best_action.setAction((mic::types::NESW)a);
LOG(LSTATUS) << "Episode " << episode << ": step " << iteration;

LOG(LINFO) << "Agent position at state t: " << player_pos_t;

double nn_weight_decay = 0;

mic::types::NESWAction action;

eps = 1.0 / (1.0 + sqrt(episode));

LOG(LDEBUG) << "eps = " << eps;

if (RAN_GEN->uniRandReal() > eps) {

LOG(LINFO) << "Agent position at t+1: " << player_pos_t_prim
    << " after performing the action = " << action << ((random) ? " [Random]" : "");
MatrixXfPtr rewards(new MatrixXf(4, batch_size));

MatrixXfPtr targets_t_batch(new MatrixXf(4, batch_size));

geb.setNextSampleIndex(0);

LOG(LDEBUG) << "Training sample : " << ge_ptr->s_t << " -> " << ge_ptr->a_t << " -> " << ge_ptr->s_t_prim;

inputs_t_batch->col(i) = encoded_state_t->col(0);

MatrixXfPtr predictions_t_batch = neural_net.getPredictions();

(*targets_t_batch) = (*predictions_t_batch);

geb.setNextSampleIndex(0);

inputs_t_prim_batch->col(i) = encoded_state_t->col(0);

MatrixXfPtr predictions_t_prim_batch = neural_net.getPredictions();

geb.setNextSampleIndex(0);

if (ge_ptr->s_t == ge_ptr->s_t_prim) {

(*targets_t_batch)((size_t)ge_ptr->a_t.getType(), i) = 3 * step_reward;

(*targets_t_batch)((size_t)ge_ptr->a_t.getType(), i) = env.getStateReward(ge_ptr->s_t_prim);

MatrixXfPtr preds_t_prim(new MatrixXf(4, 1));
preds_t_prim->col(0) = predictions_t_prim_batch->col(i);

if (std::isfinite(max_q_st_prim_at_prim))

LOG(LDEBUG) << "Inputs batch:\n" << inputs_t_batch->transpose();
LOG(LDEBUG) << "Targets batch:\n" << targets_t_batch->transpose();

LOG(LDEBUG) << "Training loss: " << loss;

LOG(LWARNING) << "Not enough samples in the experience replay memory!";
mic::types::TensorXfPtr getObservation()
std::string streamNetworkResponseTable()
mic::utils::DataCollectorPtr< std::string, float > collector_ptr
Data collector.
virtual bool moveAgentToPosition(mic::types::Position2D pos_)
WindowMNISTDigit * wmd_observation
Window displaying the observation.
virtual bool isStateTerminal(mic::types::Position2D pos_)
virtual std::string observationToString()
WindowCollectorChart< float > * w_chart
Window for displaying statistics.
SpatialExperienceMemory experiences
virtual void add(std::shared_ptr< mic::types::SpatialExperience > input_, std::shared_ptr< mic::types::MatrixXf > target_)
Application of Partially Observable Deep Q-learning with Experience Replay to the MNIST digits problem.
std::shared_ptr< mic::types::SpatialExperience > SpatialExperiencePtr
Shared pointer to spatial experience object.
virtual bool performSingleStep()
virtual void initializeEnvironment()
mic::configuration::Property< std::string > mlnn_filename
Property: name of the file to which the neural network will be serialized (or from which it will be deserialized).
mic::configuration::Property< double > epsilon
virtual mic::types::MatrixXfPtr encodeObservation()
Structure storing a spatial experience - a triplet of the position at time t, the executed action and the position at time t+1.
WindowMNISTDigit * wmd_environment
Window displaying the whole environment.
mic::configuration::Property< float > discount_rate
std::shared_ptr< std::vector< mic::types::Position2D > > saccadic_path
Saccadic path - a sequence of consecutive agent positions.
virtual size_t getObservationWidth()
virtual void finishCurrentEpisode()
virtual bool isActionAllowed(long x_, long y_, size_t action_)
MNISTDigitDLRERPOMDP(std::string node_name_="application")
mic::configuration::Property< bool > mlnn_save
Property: flag denoting whether the nn should be saved to a file (after every episode end).
mic::configuration::Property< std::string > statistics_filename
Property: name of the file to which the statistics will be exported.
virtual size_t getEnvironmentWidth()
virtual size_t getObservationSize()
size_t batch_size
Size of the batch in experience replay - set to the size of the maze (width*height).
virtual mic::types::Position2D getAgentPosition()
float computeBestValueForGivenStateAndPredictions(mic::types::Position2D player_position_, float *predictions_)
virtual std::string environmentToString()
virtual void startNewEpisode()
unsigned int optimalPathLength()
mic::configuration::Property< float > learning_rate
bool moveAgent(mic::types::Action2DInterface ac_)
mic::configuration::Property< float > step_reward
virtual void initializePropertyDependentVariables()
virtual float getStateReward(mic::types::Position2D pos_)
BackpropagationNeuralNetwork< float > neural_net
Multi-layer neural network used for approximation of the Q-state rewards.
virtual bool isStateAllowed(mic::types::Position2D pos_)
virtual size_t getEnvironmentHeight()
mic::types::Batch< mic::types::SpatialExperience, mic::types::MatrixXf > SpatialExperienceBatch
Spatial experience replay batch.
long long sum_of_iterations
virtual ~MNISTDigitDLRERPOMDP()
mic::types::TensorXfPtr & getEnvironment()
void RegisterApplication(void)
Registers application.
virtual void initialize(int argc, char *argv[])
mic::types::NESWAction selectBestActionForGivenState(mic::types::Position2D player_position_)
mic::types::Sample< mic::types::SpatialExperience, mic::types::MatrixXf > SpatialExperienceSample
Spatial experience replay sample.
mic::configuration::Property< int > step_limit
mic::configuration::Property< bool > mlnn_load
Property: flag denoting whether the nn should be loaded from a file (at the initialization of the task).
mic::environments::MNISTDigit env
The maze of digits environment.
mic::types::MatrixXfPtr getPredictedRewardsForGivenState(mic::types::Position2D player_position_)
virtual size_t getObservationHeight()