#include <utils/RandomGenerator.hpp>

namespace application {
// GridworldQLearning constructor: initialize the configuration properties with their default values.
	step_reward("step_reward", 0.0),
	discount_rate("discount_rate", 0.9),
	learning_rate("learning_rate", 0.1),
	move_noise("move_noise", 0.2),
	epsilon("epsilon", 0.1),
	statistics_filename("statistics_filename", "statistics_filename.csv")

	// ... (property registration elided in this excerpt)
	LOG(LINFO) << "Properties registered";
// initialize(): set up the GLUT-based visualization, the data collector and the chart window.
	VGL_MANAGER->initializeGLUT(argc, argv);

	collector_ptr = std::make_shared< mic::utils::DataCollector<std::string, float> >();
	// Add containers to the collector.
	collector_ptr->createContainer("number_of_steps", mic::types::color_rgba(255, 0, 0, 180));
	collector_ptr->createContainer("average_number_of_steps", mic::types::color_rgba(255, 255, 0, 180));
	collector_ptr->createContainer("collected_reward", mic::types::color_rgba(0, 255, 0, 180));
	collector_ptr->createContainer("average_collected_reward", mic::types::color_rgba(0, 255, 255, 180));

	// Create the window for displaying the collected statistics.
	w_chart = new WindowCollectorChart<float>("GridworldQLearning", 256, 256, 0, 0);
// startNewEpisode():
	LOG(LSTATUS) << "Starting new episode " << episode;
// finishCurrentEpisode(): store the per-episode statistics in the collector.
	LOG(LTRACE) << "End current episode";

	collector_ptr->addDataToContainer("number_of_steps", iteration);
	collector_ptr->addDataToContainer("collected_reward", reward);
// streamQStateTable(): render the Q-value table and the table of best actions as text.
	std::string rewards_table;
	std::string actions_table;

	rewards_table += "Action values:\n";
	actions_table += "Best actions:\n";
	// Per grid row / cell (loop headers elided in this excerpt):
	rewards_table += "| ";
	actions_table += "| ";

	float bestqval = -std::numeric_limits<float>::infinity();
	size_t best_action = -1;
	for (size_t a = 0; a < 4; a++) {
		// qval is the Q-value read from qstate_table({x, y, a}) (read elided in this excerpt).
		if (qstate_table({x, y, a}) == -std::numeric_limits<float>::infinity())
			rewards_table += "-INF";
		else
			rewards_table += std::to_string(qval);
		// Separator: " | " after the last action of a cell, " , " between action values
		// (the selecting condition and the best-action bookkeeping are elided in this excerpt).
		rewards_table += " | ";
		rewards_table += " , ";
	}
	// Append the symbol of the best action found for this cell.
	switch (best_action) {
		case 0: actions_table += "N | "; break;
		case 1: actions_table += "E | "; break;
		case 2: actions_table += "S | "; break;
		case 3: actions_table += "W | "; break;
		default: actions_table += "- | ";
	}
	rewards_table += "\n";
	actions_table += "\n";

	return rewards_table + actions_table;
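For a small 2 x 1 grid the returned string has roughly the following shape (the numbers are made up and shown only to illustrate the format):

	Action values:
	| 0.000000 , 0.010000 , -INF , 0.000000 | 0.090000 , 0.000000 , -INF , 0.000000 |
	Best actions:
	| E | N |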
// computeBestValue(): return the highest Q-value over the four moves for the given state.
	float qbest_value = -std::numeric_limits<float>::infinity();

	std::vector<mic::types::NESWAction> actions;
	actions.push_back(A_NORTH);
	actions.push_back(A_EAST);
	actions.push_back(A_SOUTH);
	actions.push_back(A_WEST);

	for (mic::types::NESWAction action : actions) {
		float qvalue = qstate_table({(size_t)pos_.x, (size_t)pos_.y, (size_t)action.getType()});
		if (qvalue > qbest_value)
			qbest_value = qvalue;
	}
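In other words, computeBestValue() evaluates the greedy state value V(pos_) = max_a Q(pos_, a); this is the quantity later used as max_q_st_prim_at_prim, the bootstrap target of the Q-learning update in performSingleStep().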
// selectBestAction(): greedily pick the action with the highest Q-value for the given state.
	LOG(LTRACE) << "Select best action for state " << pos_;

	mic::types::NESWAction best_action = A_NONE;
	float best_qvalue = -std::numeric_limits<float>::infinity();

	std::vector<mic::types::NESWAction> actions;
	actions.push_back(A_NORTH);
	actions.push_back(A_EAST);
	actions.push_back(A_SOUTH);
	actions.push_back(A_WEST);

	for (mic::types::NESWAction action : actions) {
		float qvalue = qstate_table({(size_t)pos_.x, (size_t)pos_.y, (size_t)action.getType()});
		std::cout << " qvalue = " << qvalue << std::endl;
		if (qvalue > best_qvalue) {
			best_qvalue = qvalue;
			best_action = action;
			std::cout << " best_qvalue = " << best_qvalue << std::endl;
		}
	}
// performSingleStep():
	LOG(LSTATUS) << "Episode " << episode << ": step " << iteration;
	// If the current state is terminal: store the final reward as the value of every action in that state.
	for (size_t a = 0; a < 4; a++)
		qstate_table({(size_t)agent_pos_t.x, (size_t)agent_pos_t.y, a}) = final_reward;

	LOG(LINFO) << "Agent action = " << A_EXIT;
	LOG(LDEBUG) << "Agent position = " << agent_pos_t;
	mic::types::NESWAction action;
	// Decay the exploration rate with the episode number.
	eps = 1.0 / (1.0 + episode);
	LOG(LDEBUG) << "eps = " << eps;

	// Epsilon-greedy action selection.
	if (RAN_GEN->uniRandReal() > eps) {
		// Greedy choice (body elided in this excerpt).
	}
	if (action.getType() == mic::types::NESW::None) {
		// Fall back to a random action (body elided in this excerpt).
	}

	LOG(LINFO) << action << ((random) ? " [Random]" : "");
	LOG(LINFO) << "Agent position at t+1: " << agent_pos_t_prim << " after performing the action = " << action << ((random) ? " [Random]" : "");
	// Gather the ingredients of the temporal-difference update.
	float q_st_at = qstate_table({(size_t)agent_pos_t.x, (size_t)agent_pos_t.y, (size_t)action.getType()});

	LOG(LDEBUG) << "q_st_at = " << q_st_at;
	LOG(LDEBUG) << "agent_t_prim = " << agent_pos_t_prim;
	LOG(LDEBUG) << "max_q_st_prim_at_prim = " << max_q_st_prim_at_prim;

	if (agent_pos_t == agent_pos_t_prim) {
		// The agent did not move (handling elided in this excerpt).
	}
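The excerpt stops just before the write-back to the table. A minimal sketch of the standard Q-learning update these quantities feed into, using only names that appear above (it assumes the Property objects convert to their value type; the original may treat the blocked-move and terminal cases differently):

	float r = (float)step_reward;  // immediate reward of a regular, non-terminal move
	float td_target = r + (float)discount_rate * max_q_st_prim_at_prim;
	qstate_table({(size_t)agent_pos_t.x, (size_t)agent_pos_t.y, (size_t)action.getType()}) =
			q_st_at + (float)learning_rate * (td_target - q_st_at);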
virtual void finishCurrentEpisode()
mic::types::NESWAction selectBestAction(mic::types::Position2D pos_)
WindowCollectorChart< float > * w_chart
Window for displaying the collected statistics (chart).
virtual float getStateReward(mic::types::Position2D pos_)
std::string streamQStateTable()
virtual mic::types::Position2D getAgentPosition()
float computeBestValue(mic::types::Position2D pos_)
virtual bool performSingleStep()
mic::configuration::Property< double > epsilon
virtual bool isActionAllowed(long x_, long y_, size_t action_)
virtual bool isStateTerminal(mic::types::Position2D pos_)
virtual size_t getEnvironmentWidth()
virtual void initialize(int argc, char *argv[])
virtual void startNewEpisode()
long long sum_of_iterations
mic::types::TensorXf qstate_table
Tensor storing the Q-values of all state-action pairs (gridworld width x height x 4 actions). Stored in column-major order(!); see the indexing sketch after this list.
bool moveAgent(mic::types::Action2DInterface ac_)
mic::configuration::Property< float > move_noise
GridworldQLearning(std::string node_name_="application")
virtual std::string environmentToString()
virtual size_t getEnvironmentHeight()
mic::configuration::Property< float > discount_rate
mic::configuration::Property< float > learning_rate
virtual void initializePropertyDependentVariables()
virtual bool isStateAllowed(mic::types::Position2D pos_)
mic::utils::DataCollectorPtr< std::string, float > collector_ptr
Data collector.
mic::environments::Gridworld grid_env
The gridworld object.
void RegisterApplication(void)
Registers application.
virtual ~GridworldQLearning()
mic::configuration::Property< float > step_reward
virtual void initializeEnvironment()
Class responsible for solving the gridworld problem with Q-learning.
mic::configuration::Property< std::string > statistics_filename
Property: name of the file to which the statistics will be exported.
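A quick illustration of how the Q-table above is addressed throughout the code: indices follow the {x, y, action} convention used in the excerpts, with action indices 0 = N, 1 = E, 2 = S, 3 = W as in the switch of streamQStateTable(); the concrete coordinates below are made up for illustration only.

	// Read the current estimate for moving north (action index 0) from cell (x = 2, y = 3) ...
	float q_north = qstate_table({2, 3, 0});
	// ... and write an updated estimate back to the same slot.
	qstate_table({2, 3, 0}) = q_north + 0.05f;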