#include <utils/RandomGenerator.hpp>
namespace application {
step_reward("step_reward", 0.0),
discount_rate("discount_rate", 0.9),
learning_rate("learning_rate", 0.005),
epsilon("epsilon", 0.1),
step_limit("step_limit", 0),
statistics_filename("statistics_filename", "drl_er_statistics.csv"),
mlnn_filename("mlnn_filename", "drl_er_mlnn.txt"),
mlnn_save("mlnn_save", false),
mlnn_load("mlnn_load", false),
LOG(LINFO) << "Properties registered";
VGL_MANAGER->initializeGLUT(argc, argv);
collector_ptr = std::make_shared<mic::utils::DataCollector<std::string, float> >();
collector_ptr->createContainer("number_of_steps", mic::types::color_rgba(255, 0, 0, 180));
collector_ptr->createContainer("number_of_steps_average", mic::types::color_rgba(255, 255, 0, 180));
collector_ptr->createContainer("collected_reward", mic::types::color_rgba(0, 255, 0, 180));
collector_ptr->createContainer("collected_reward_average", mic::types::color_rgba(0, 255, 255, 180));
collector_ptr->createContainer("success_ratio", mic::types::color_rgba(255, 255, 255, 180));
w_chart = new WindowCollectorChart<float>("GridworldDRLExperienceReplayPOMDP", 256, 256, 0, 0);
neural_net.pushLayer(new Linear<float>(250, 100));
neural_net.pushLayer(new Linear<float>(100, 4));
neural_net.setOptimization<mic::neural_nets::optimization::Adam<float> >();
neural_net.setLoss<mic::neural_nets::loss::SquaredErrorLoss<float> >();
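// The fragments above configure the Q-value approximator: two Linear layers are pushed
// (250 -> 100 -> 4), the optimizer is set to Adam and the loss to squared error. The four
// outputs of the final layer correspond to the N/E/S/W action values; the 250-unit input
// presumably matches the size of the encoded observation (see getObservationSize() below).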
LOG(LINFO) << "Generated new neural network";
LOG(LSTATUS) << "Starting new episode " << episode;
LOG(LTRACE) << "End current episode";
collector_ptr->addDataToContainer("number_of_steps", iteration);
collector_ptr->addDataToContainer("collected_reward", reward);
LOG(LTRACE) << "streamNetworkResponseTable()";
std::string rewards_table;
std::string actions_table;
LOG(LDEBUG) << "Failed!";
MatrixXfPtr predicted_batch = neural_net.getPredictions();
rewards_table += "Action values:\n";
actions_table += "Best actions:\n";
rewards_table += "| ";
actions_table += "| ";
float bestqval = -std::numeric_limits<float>::infinity();
size_t best_action = -1; // sentinel: wraps to SIZE_MAX, i.e. no best action selected yet
for (size_t a = 0; a < 4; a++) {
rewards_table += std::to_string(qval);
rewards_table += " | ";
rewards_table += " , ";
case 0: actions_table += "N | "; break;
case 1: actions_table += "E | "; break;
case 2: actions_table += "S | "; break;
case 3: actions_table += "W | "; break;
default: actions_table += "- | ";
rewards_table += "\n";
actions_table += "\n";
return rewards_table + actions_table;
LOG(LTRACE) << "computeBestValueForGivenState()";
float best_qvalue = -std::numeric_limits<float>::infinity();
std::vector<mic::types::NESWAction> actions;
actions.push_back(A_NORTH);
actions.push_back(A_EAST);
actions.push_back(A_SOUTH);
actions.push_back(A_WEST);
for (mic::types::NESWAction action : actions) {
float qvalue = predictions_[(size_t)action.getType()];
if (qvalue > best_qvalue)
best_qvalue = qvalue;
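// Illustrative sketch only: the same max-over-actions reduction written with the standard
// library, assuming the four predicted Q-values are stored contiguously (in N, E, S, W
// order) as in the predictions_ array used above.
#include <algorithm>

static float maxPredictedQ(const float* predictions_) {
	// Returns the largest of the four action values.
	return *std::max_element(predictions_, predictions_ + 4);
}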
LOG(LTRACE) << "getPredictedRewardsForGivenState()";
inputs_batch->setZero();
inputs_batch->col(0) = encoded_state->col(0);
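// Only column 0 of the zeroed input batch carries the encoded state being queried; the
// remaining columns stay at zero, and only the first column of the resulting predictions
// is copied out below.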
MatrixXfPtr predictions_batch = neural_net.getPredictions();
MatrixXfPtr predictions_sample(new MatrixXf(4, 1));
predictions_sample->col(0) = predictions_batch->col(0);
return predictions_sample;
LOG(LTRACE) << "selectBestAction";
mic::types::NESWAction best_action = A_RANDOM;
float best_qvalue = -std::numeric_limits<float>::infinity();
std::vector<mic::types::NESWAction> actions;
actions.push_back(A_NORTH);
actions.push_back(A_EAST);
actions.push_back(A_SOUTH);
actions.push_back(A_WEST);
float* pred = predictions_sample->data();
for (size_t a = 0; a < 4; a++) {
float qvalue = pred[a];
if (qvalue > best_qvalue) {
best_qvalue = qvalue;
best_action.setAction((mic::types::NESW)a);
LOG(LSTATUS) << "Episode " << episode << ": step " << iteration;
double nn_weight_decay = 0;
LOG(LINFO) << "Agent position at state t: " << player_pos_t;
mic::types::NESWAction action;
eps = 1.0 / (1.0 + sqrt(episode));
LOG(LDEBUG) << "eps = " << eps;
if (RAN_GEN->uniRandReal() > eps) {
LOG(LINFO) << "Agent position at t+1: " << player_pos_t_prim << " after performing the action = " << action << ((random) ? " [Random]" : "");
MatrixXfPtr rewards(new MatrixXf(4, batch_size));
MatrixXfPtr targets_t_batch(new MatrixXf(4, batch_size));
geb.setNextSampleIndex(0);
LOG(LDEBUG) << "Training sample : " << ge_ptr->s_t << " -> " << ge_ptr->a_t << " -> " << ge_ptr->s_t_prim;
inputs_t_batch->col(i) = encoded_state_t->col(0);
MatrixXfPtr predictions_t_batch = neural_net.getPredictions();
(*targets_t_batch) = (*predictions_t_batch);
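// Starting the targets from a copy of the network's own predictions means that, once the
// per-sample update below overwrites only the entry of the action actually taken, the
// squared-error loss is zero for every other action.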
geb.setNextSampleIndex(0);
inputs_t_prim_batch->col(i) = encoded_state_t->col(0);
MatrixXfPtr predictions_t_prim_batch = neural_net.getPredictions();
geb.setNextSampleIndex(0);
if (ge_ptr->s_t == ge_ptr->s_t_prim) {
(*targets_t_batch)((size_t)ge_ptr->a_t.getType(), i) = 3 * step_reward;
MatrixXfPtr preds_t_prim(new MatrixXf(4, 1));
preds_t_prim->col(0) = predictions_t_prim_batch->col(i);
if (std::isfinite(max_q_st_prim_at_prim))
LOG(LDEBUG) << "Inputs batch:\n" << inputs_t_batch->transpose();
LOG(LDEBUG) << "Targets batch:\n" << targets_t_batch->transpose();
LOG(LDEBUG) << "Training loss:" << loss;
LOG(LWARNING) << "Not enough samples in the experience replay memory!";
virtual void finishCurrentEpisode()
mic::configuration::Property< std::string > statistics_filename
Property: name of the file to which the statistics will be exported.
virtual bool moveAgentToPosition(mic::types::Position2D pos_)
virtual void add(std::shared_ptr< mic::types::SpatialExperience > input_, std::shared_ptr< mic::types::MatrixXf > target_)
long long number_of_successes
virtual float getStateReward(mic::types::Position2D pos_)
virtual ~GridworldDRLExperienceReplayPOMDP()
virtual void startNewEpisode()
mic::configuration::Property< int > step_limit
std::shared_ptr< mic::types::SpatialExperience > SpatialExperiencePtr
Shared pointer to spatial experience object.
mic::configuration::Property< std::string > mlnn_filename
Property: name of the file to which the neural network will be serialized (or deserialized from).
virtual bool performSingleStep()
mic::types::MatrixXfPtr getPredictedRewardsForGivenState(mic::types::Position2D player_position_)
virtual mic::types::MatrixXfPtr encodeObservation()
virtual void initializePropertyDependentVariables()
mic::utils::DataCollectorPtr< std::string, float > collector_ptr
Data collector.
virtual mic::types::Position2D getAgentPosition()
Structure storing a spatial experience - a triplet: the position at time t, the executed action and the position at time t+1.
mic::configuration::Property< float > step_reward
Class responsible for solving the gridworld problem with Q-learning, with a neural network used for approximation of the Q-state rewards.
mic::configuration::Property< float > learning_rate
long long sum_of_iterations
virtual size_t getObservationWidth()
mic::configuration::Property< double > epsilon
virtual bool isActionAllowed(long x_, long y_, size_t action_)
virtual bool isStateTerminal(mic::types::Position2D pos_)
mic::configuration::Property< bool > mlnn_save
Property: flag denoting whether the nn should be saved to a file (after every episode end).
BackpropagationNeuralNetwork< float > neural_net
Multi-layer neural network used for approximation of the Q-state rewards.
virtual size_t getObservationSize()
mic::configuration::Property< float > discount_rate
mic::types::NESWAction selectBestActionForGivenState(mic::types::Position2D player_position_)
float computeBestValueForGivenStateAndPredictions(mic::types::Position2D player_position_, float *predictions_)
virtual void initialize(int argc, char *argv[])
WindowCollectorChart< float > * w_chart
Window for displaying statistics.
SpatialExperienceMemory experiences
GridworldDRLExperienceReplayPOMDP(std::string node_name_="application")
bool moveAgent(mic::types::Action2DInterface ac_)
mic::environments::Gridworld grid_env
The gridworld environment.
virtual std::string environmentToString()
mic::configuration::Property< bool > mlnn_load
Property: flag denoting whether the nn should be loaded from a file (at the initialization of the task).
mic::types::Batch< mic::types::SpatialExperience, mic::types::MatrixXf > SpatialExperienceBatch
Spatial experience replay batch.
virtual bool isStateAllowed(mic::types::Position2D pos_)
void RegisterApplication(void)
Registers the application.
mic::types::Sample< mic::types::SpatialExperience, mic::types::MatrixXf > SpatialExperienceSample
Spatial experience replay sample.
virtual void initializeEnvironment()
size_t batch_size
Size of the batch in experience replay - set to the size of the maze (width*height).
std::string streamNetworkResponseTable()
virtual size_t getObservationHeight()
virtual std::string observationToString()