#include <utils/RandomGenerator.hpp>

namespace application {
    step_reward("step_reward", 0.0),
    discount_rate("discount_rate", 0.9),
    learning_rate("learning_rate", 0.1),
    epsilon("epsilon", 0.1),
    statistics_filename("statistics_filename", "dql_statistics.csv"),
    mlnn_filename("mlnn_filename", "dql_mlnn.txt"),
    mlnn_save("mlnn_save", false),
    mlnn_load("mlnn_load", false)
    LOG(LINFO) << "Properties registered";
    VGL_MANAGER->initializeGLUT(argc, argv);

    // Create the data collector and containers for the per-episode statistics.
    collector_ptr = std::make_shared<mic::utils::DataCollector<std::string, float> >();
    collector_ptr->createContainer("number_of_steps", mic::types::color_rgba(255, 0, 0, 180));
    collector_ptr->createContainer("average_number_of_steps", mic::types::color_rgba(255, 255, 0, 180));
    collector_ptr->createContainer("collected_reward", mic::types::color_rgba(0, 255, 0, 180));
    collector_ptr->createContainer("average_collected_reward", mic::types::color_rgba(0, 255, 255, 180));

    // Window displaying the collected statistics.
    w_chart = new WindowCollectorChart<float>("GridworldDeepQLearning", 256, 256, 0, 0);
    // Two linear layers: 250 inputs -> 100 hidden units -> 4 outputs (one Q-value per action).
    neural_net.pushLayer(new Linear<float>(250, 100));
    neural_net.pushLayer(new Linear<float>(100, 4));

    neural_net.setOptimization<mic::neural_nets::optimization::Adam<float> >();
    neural_net.setLoss<mic::neural_nets::loss::SquaredErrorLoss<float> >();

    LOG(LINFO) << "Generated new neural network";
    LOG(LSTATUS) << "Starting new episode " << episode;

    LOG(LTRACE) << "End of the episode " << episode;

    // Add the episode statistics to the collector.
    collector_ptr->addDataToContainer("number_of_steps", iteration);
    collector_ptr->addDataToContainer("collected_reward", reward);
    LOG(LTRACE) << "streamNetworkResponseTable()";
    std::string rewards_table;
    std::string actions_table;

    rewards_table += "Action values:\n";
    actions_table += "Best actions:\n";

    rewards_table += "| ";
    actions_table += "| ";

    // Track the best (highest-valued) action for the given state.
    float bestqval = -std::numeric_limits<float>::infinity();
    size_t best_action = -1;

    mic::types::MatrixXfPtr tmp_predicted_rewards = neural_net.getPredictions();
    float* qstate = tmp_predicted_rewards->data();

    for (size_t a = 0; a < 4; a++) {
        float qval = qstate[a];

        rewards_table += std::to_string(qval);
        // Separator between the Q-values (the condition choosing " | " vs " , " is elided in this listing).
        rewards_table += " | ";
        rewards_table += " , ";

    // Append the symbol of the best action (the enclosing switch statement is elided in this listing).
    case 0: actions_table += "N | "; break;
    case 1: actions_table += "E | "; break;
    case 2: actions_table += "S | "; break;
    case 3: actions_table += "W | "; break;
    default: actions_table += "- | ";

    rewards_table += "\n";
    actions_table += "\n";

    return rewards_table + actions_table;
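    // Illustrative output for a single row of a small grid (the numbers are made up; the layout
    // follows the string building above - four Q-values per cell, one best-action letter per cell):
    //
    //   Action values:
    //   | 0.12 , -0.05 , 0.31 , 0.02 | 0.08 , 0.14 , -0.02 , 0.05 |
    //   Best actions:
    //   | S | E |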
    LOG(LTRACE) << "computeBestValue";
    float best_qvalue = -std::numeric_limits<float>::infinity();

    // Create a list of the four possible actions.
    std::vector<mic::types::NESWAction> actions;
    actions.push_back(A_NORTH);
    actions.push_back(A_EAST);
    actions.push_back(A_SOUTH);
    actions.push_back(A_WEST);

    float* pred = predictions_sample->data();

    // Find the highest predicted Q-value among the four actions.
    for (mic::types::NESWAction action : actions) {
        float qvalue = pred[(size_t)action.getType()];
        if (qvalue > best_qvalue) {
            best_qvalue = qvalue;
    LOG(LTRACE) << "selectBestAction";
    // Greedy action selection: start from a random action and the lowest possible value.
    mic::types::NESWAction best_action = A_RANDOM;
    float best_qvalue = -std::numeric_limits<float>::infinity();

    // Create a list of the four possible actions.
    std::vector<mic::types::NESWAction> actions;
    actions.push_back(A_NORTH);
    actions.push_back(A_EAST);
    actions.push_back(A_SOUTH);
    actions.push_back(A_WEST);

    float* pred = predictions_sample->data();

    // Pick the action with the highest predicted Q-value.
    for (size_t a = 0; a < 4; a++) {
        float qvalue = pred[a];
        if (qvalue > best_qvalue) {
            best_qvalue = qvalue;
            best_action.setAction((mic::types::NESW)a);
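    // Note the division of labour between the two helpers above: computeBestValueForCurrentState()
    // returns only the maximal Q-value (presumably providing max_q_st_prim_at_prim for the next
    // state in performSingleStep() below), while selectBestActionForCurrentState() returns the
    // corresponding action for the greedy branch of the epsilon-greedy step.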
    LOG(LSTATUS) << "Episode " << episode << ": step " << iteration;

    double nn_weight_decay = 0;

    // Copy the predictions for state t - they will be used as the training targets below.
    MatrixXfPtr predicted_rewards_t(new MatrixXf(*tmp_rewards_t));
    LOG(LINFO) << "Agent position at state t: " << player_pos_t;
    LOG(LSTATUS) << "Predicted rewards for state t: " << predicted_rewards_t->transpose();
    mic::types::NESWAction action;

    // Decay the exploration rate with the number of episodes.
    eps = 1.0/(1.0 + sqrt(episode));
    LOG(LDEBUG) << "eps = " << eps;
    // Epsilon-greedy choice between the greedy action and a random one (the branch bodies are
    // elided in this listing; the `random` flag logged below marks the exploratory case).
    if (RAN_GEN->uniRandReal() > eps) {

    (*predicted_rewards_t)((size_t)action.getType(), 0) = step_reward;

    LOG(LINFO) << "Agent position at t+1: " << player_pos_t_prim
            << " after performing the action = " << action << ((random) ? " [Random]" : "");
    LOG(LWARNING) << "max_q_st_prim_at_prim = " << max_q_st_prim_at_prim;

    // Update the target for the executed action using the value of the best action in the next
    // state; fall back to the plain reward if that value is not finite.
    if (std::isfinite(max_q_st_prim_at_prim))
        (*predicted_rewards_t)((size_t)action.getType(), 0) = r + discount_rate*max_q_st_prim_at_prim;
    else
        (*predicted_rewards_t)((size_t)action.getType(), 0) = r;

    // A separate branch of the update with an amplified reward (its surrounding condition is
    // elided in this listing).
    (*predicted_rewards_t)((size_t)action.getType(), 0) = 5*r + discount_rate*max_q_st_prim_at_prim;
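    // This is the standard Q-learning target:
    //     Q(s_t, a_t) <- r + discount_rate * max_a Q(s_{t+1}, a)
    // e.g. with the default discount_rate = 0.9, a reward r = 0 and a best next-state value of
    // 0.5, the target for the executed action becomes 0 + 0.9 * 0.5 = 0.45. Only that single
    // entry of predicted_rewards_t is overwritten; the other three actions keep the network's
    // current predictions as their targets, so training mainly adjusts the chosen action's Q-value.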
    LOG(LERROR) << "Training with state: " << encoded_state_t->transpose();
    LOG(LERROR) << "Training with desired rewards: " << predicted_rewards_t->transpose();

    LOG(LSTATUS) << "Training loss: " << loss;
Declarations referenced in the listing above, with their brief descriptions:

mic::configuration::Property<std::string> mlnn_filename
    Property: name of the file to which the neural network will be serialized (or deserialized from)...
virtual bool performSingleStep()
virtual bool moveAgentToPosition(mic::types::Position2D pos_)
virtual float getStateReward(mic::types::Position2D pos_)
long long sum_of_iterations
mic::types::NESWAction selectBestActionForCurrentState()
mic::configuration::Property<float> discount_rate
mic::configuration::Property<std::string> statistics_filename
    Property: name of the file to which the statistics will be exported.
mic::configuration::Property<double> epsilon
virtual mic::types::Position2D getAgentPosition()
mic::configuration::Property<bool> mlnn_load
    Property: flag denoting whether the nn should be loaded from a file (at the initialization of the tas...
mic::environments::Gridworld grid_env
    The gridworld environment.
virtual void initialize(int argc, char *argv[])
GridworldDeepQLearning(std::string node_name_="application")
mic::configuration::Property<float> learning_rate
mic::types::Position2D player_pos_t_minus_prim
virtual void initializePropertyDependentVariables()
mic::types::MatrixXfPtr getPredictedRewardsForCurrentState()
virtual bool isActionAllowed(long x_, long y_, size_t action_)
virtual bool isStateTerminal(mic::types::Position2D pos_)
virtual size_t getEnvironmentWidth()
virtual mic::types::MatrixXfPtr encodeAgentGrid()
    Encode the current state of the reduced grid (only the agent position) as a matrix of size [1...
bool moveAgent(mic::types::Action2DInterface ac_)
virtual void startNewEpisode()
BackpropagationNeuralNetwork<float> neural_net
    Multi-layer neural network used for approximation of the Q-state rewards.
virtual std::string environmentToString()
virtual void finishCurrentEpisode()
virtual size_t getEnvironmentHeight()
WindowCollectorChart<float>* w_chart
    Window for displaying statistics.
float computeBestValueForCurrentState()
virtual bool isStateAllowed(mic::types::Position2D pos_)
void RegisterApplication(void)
    Registers the application.
mic::configuration::Property<bool> mlnn_save
    Property: flag denoting whether the nn should be saved to a file (after every episode end)...
mic::configuration::Property<float> step_reward
GridworldDeepQLearning (class)
    Class responsible for solving the gridworld problem with Q-learning and a (not that) deep neural networ...
virtual ~GridworldDeepQLearning()
virtual void initializeEnvironment()
std::string streamNetworkResponseTable()
mic::utils::DataCollectorPtr<std::string, float> collector_ptr
    Data collector.