#include <utils/RandomGenerator.hpp>

namespace application {
    // Constructor initializer list (excerpt): properties registered with their default values.
    step_reward("step_reward", 0.0),
    discount_rate("discount_rate", 0.9),
    learning_rate("learning_rate", 0.005),
    epsilon("epsilon", 0.1),
    statistics_filename("statistics_filename", "drl_er_statistics.csv"),
    mlnn_filename("mlnn_filename", "drl_er_mlnn.txt"),
    mlnn_save("mlnn_save", false),
    mlnn_load("mlnn_load", false),
    LOG(LINFO) << "Properties registered";

    VGL_MANAGER->initializeGLUT(argc, argv);

    collector_ptr = std::make_shared< mic::utils::DataCollector<std::string, float> >();
    collector_ptr->createContainer("number_of_steps", mic::types::color_rgba(255, 0, 0, 180));
    collector_ptr->createContainer("number_of_steps_average", mic::types::color_rgba(255, 255, 0, 180));
    collector_ptr->createContainer("collected_reward", mic::types::color_rgba(0, 255, 0, 180));
    collector_ptr->createContainer("collected_reward_average", mic::types::color_rgba(0, 255, 255, 180));
    collector_ptr->createContainer("success_ratio", mic::types::color_rgba(255, 255, 255, 180));
    w_chart = new WindowCollectorChart<float>("GridworldDRLExperienceReplay", 256, 256, 0, 0);

    neural_net.pushLayer(new Linear<float>(250, 100));
    neural_net.pushLayer(new Linear<float>(100, 4));

    neural_net.setOptimization<mic::neural_nets::optimization::Adam<float> >();
    neural_net.setLoss<mic::neural_nets::loss::SquaredErrorLoss<float> >();

    LOG(LINFO) << "Generated new neural network";
    LOG(LSTATUS) << "Starting new episode " << episode;

    LOG(LTRACE) << "End current episode";

    collector_ptr->addDataToContainer("number_of_steps", iteration);
    collector_ptr->addDataToContainer("collected_reward", reward);
    LOG(LTRACE) << "streamNetworkResponseTable()";
    std::string rewards_table;
    std::string actions_table;

    MatrixXfPtr predicted_batch = neural_net.getPredictions();

    rewards_table += "Action values:\n";
    actions_table += "Best actions:\n";

    rewards_table += "| ";
    actions_table += "| ";
    float bestqval = -std::numeric_limits<float>::infinity();
    size_t best_action = -1;
    for (size_t a = 0; a < 4; a++) {

        rewards_table += std::to_string(qval);
        // One of the two separators below is appended; the selecting condition is not part of this excerpt.
        rewards_table += " | ";
        rewards_table += " , ";

    switch (best_action) {
        case 0:  actions_table += "N | "; break;
        case 1:  actions_table += "E | "; break;
        case 2:  actions_table += "S | "; break;
        case 3:  actions_table += "W | "; break;
        default: actions_table += "- | ";
    }
    rewards_table += "\n";
    actions_table += "\n";

    return rewards_table + actions_table;
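    // Illustrative example of the returned string (hypothetical values, assuming one grid row is
    // rendered per line with four comma-separated action values per cell):
    //
    //   Action values:
    //   | 0.01 , 0.02 , -0.05 , 0.03 | 0.04 , 0.01 , 0.00 , -0.02 | ...
    //   Best actions:
    //   | W | N | ...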
    LOG(LTRACE) << "computeBestValueForGivenState()";
    float best_qvalue = -std::numeric_limits<float>::infinity();

    std::vector<mic::types::NESWAction> actions;
    actions.push_back(A_NORTH);
    actions.push_back(A_EAST);
    actions.push_back(A_SOUTH);
    actions.push_back(A_WEST);

    for (mic::types::NESWAction action : actions) {

        float qvalue = predictions_[(size_t)action.getType()];
        if (qvalue > best_qvalue)
            best_qvalue = qvalue;
    LOG(LTRACE) << "getPredictedRewardsForGivenState()";

    inputs_batch->setZero();
    inputs_batch->col(0) = encoded_state->col(0);

    MatrixXfPtr predictions_batch = neural_net.getPredictions();

    MatrixXfPtr predictions_sample(new MatrixXf(4, 1));
    predictions_sample->col(0) = predictions_batch->col(0);

    return predictions_sample;
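    // Hypothetical usage sketch (names taken from this file): query the approximator for the
    // agent's current state and read out a single action value.
    //
    //   mic::types::MatrixXfPtr q = getPredictedRewardsForGivenState(player_pos_t);
    //   float q_north = (*q)(0, 0);   // row 0 corresponds to the N(orth) action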
    LOG(LTRACE) << "selectBestAction";

    mic::types::NESWAction best_action = A_RANDOM;
    float best_qvalue = -std::numeric_limits<float>::infinity();

    std::vector<mic::types::NESWAction> actions;
    actions.push_back(A_NORTH);
    actions.push_back(A_EAST);
    actions.push_back(A_SOUTH);
    actions.push_back(A_WEST);

    float* pred = predictions_sample->data();

    for (size_t a = 0; a < 4; a++) {

        float qvalue = pred[a];
        if (qvalue > best_qvalue) {
            best_qvalue = qvalue;
            best_action.setAction((mic::types::NESW)a);
    LOG(LSTATUS) << "Episode " << episode << ": step " << iteration;

    double nn_weight_decay = 0;

    LOG(LINFO) << "Agent position at state t: " << player_pos_t;

    mic::types::NESWAction action;

    eps = 1.0 / (1.0 + sqrt(episode));

    LOG(LDEBUG) << "eps = " << eps;

    if (RAN_GEN->uniRandReal() > eps) {
        // Exploitation vs. exploration branches follow (see the sketch below).

    LOG(LINFO) << "Agent position at t+1: " << player_pos_t_prim
        << " after performing the action = " << action << ((random) ? " [Random]" : "");
    MatrixXfPtr rewards(new MatrixXf(4, batch_size));

    MatrixXfPtr targets_t_batch(new MatrixXf(4, batch_size));

    geb.setNextSampleIndex(0);

    LOG(LDEBUG) << "Training sample : " << ge_ptr->s_t << " -> " << ge_ptr->a_t << " -> " << ge_ptr->s_t_prim;

    inputs_t_batch->col(i) = encoded_state_t->col(0);

    MatrixXfPtr predictions_t_batch = neural_net.getPredictions();

    (*targets_t_batch) = (*predictions_t_batch);
    geb.setNextSampleIndex(0);

    inputs_t_prim_batch->col(i) = encoded_state_t->col(0);

    MatrixXfPtr predictions_t_prim_batch = neural_net.getPredictions();

    geb.setNextSampleIndex(0);

    if (ge_ptr->s_t == ge_ptr->s_t_prim) {

        (*targets_t_batch)((size_t)ge_ptr->a_t.getType(), i) = 3 * step_reward;

    MatrixXfPtr preds_t_prim(new MatrixXf(4, 1));
    preds_t_prim->col(0) = predictions_t_prim_batch->col(i);

    if (std::isfinite(max_q_st_prim_at_prim))
    LOG(LDEBUG) << "Inputs batch:\n" << inputs_t_batch->transpose();
    LOG(LDEBUG) << "Targets batch:\n" << targets_t_batch->transpose();
    LOG(LDEBUG) << "Training loss:" << loss;

    LOG(LWARNING) << "Not enough samples in the experience replay memory!";

    if (iteration >= 100)
long long sum_of_iterations
virtual void finishCurrentEpisode()
virtual bool moveAgentToPosition(mic::types::Position2D pos_)
mic::configuration::Property< float > discount_rate
virtual ~GridworldDRLExperienceReplay()
float computeBestValueForGivenStateAndPredictions(mic::types::Position2D player_position_, float *predictions_)
mic::environments::Gridworld grid_env
The gridworld environment.
virtual void add(std::shared_ptr< mic::types::SpatialExperience > input_, std::shared_ptr< mic::types::MatrixXf > target_)
virtual float getStateReward(mic::types::Position2D pos_)
mic::types::MatrixXfPtr getPredictedRewardsForGivenState(mic::types::Position2D player_position_)
mic::configuration::Property< double > epsilon
virtual void initialize(int argc, char *argv[])
std::shared_ptr< mic::types::SpatialExperience > SpatialExperiencePtr
Shared pointer to spatial experience object.
long long number_of_successes
virtual mic::types::MatrixXfPtr encodeEnvironment()
virtual mic::types::Position2D getAgentPosition()
Structure storing a spatial experience - a triplet of position in time t, executed action and position in time t+1.
virtual size_t getEnvironmentSize()
size_t batch_size
Size of the batch in experience replay - set to the size of the maze (width*height).
virtual bool isActionAllowed(long x_, long y_, size_t action_)
virtual bool isStateTerminal(mic::types::Position2D pos_)
mic::configuration::Property< bool > mlnn_save
Property: flag denoting whether the nn should be saved to a file (after every episode end).
virtual void initializePropertyDependentVariables()
virtual size_t getEnvironmentWidth()
Class responsible for solving the gridworld problem with Q-learning, with a neural network used for approximation of the Q-state rewards.
mic::configuration::Property< float > learning_rate
mic::configuration::Property< std::string > statistics_filename
Property: name of the file to which the statistics will be exported.
GridworldDRLExperienceReplay(std::string node_name_="application")
SpatialExperienceMemory experiences
bool moveAgent(mic::types::Action2DInterface ac_)
BackpropagationNeuralNetwork< float > neural_net
Multi-layer neural network used for approximation of the Q-state rewards.
mic::utils::DataCollectorPtr< std::string, float > collector_ptr
Data collector.
WindowCollectorChart< float > * w_chart
Window for displaying statistics.
virtual std::string environmentToString()
virtual size_t getEnvironmentHeight()
mic::types::Batch< mic::types::SpatialExperience, mic::types::MatrixXf > SpatialExperienceBatch
Spatial experience replay batch.
mic::types::NESWAction selectBestActionForGivenState(mic::types::Position2D player_position_)
virtual bool isStateAllowed(mic::types::Position2D pos_)
virtual bool performSingleStep()
void RegisterApplication(void)
Registers application.
mic::types::Sample< mic::types::SpatialExperience, mic::types::MatrixXf > SpatialExperienceSample
Spatial experience replay sample.
std::string streamNetworkResponseTable()
mic::configuration::Property< float > step_reward
mic::configuration::Property< bool > mlnn_load
Property: flag denoting whether the nn should be loaded from a file (at the initialization of the task).
mic::configuration::Property< std::string > mlnn_filename
Property: name of the file to which the neural network will be serialized (or deserialized from).
virtual void initializeEnvironment()
virtual void startNewEpisode()