MachineIntelligenceCore:ReinforcementLearning
nArmedBanditsSimpleQlearning.cpp
// (The file header block - lines 1-24 of the original source - is collapsed in this generated listing.)
#include <utils/RandomGenerator.hpp>

namespace mic {
namespace application {

void RegisterApplication (void) {
	REGISTER_APPLICATION(mic::application::TestApp);
}

TestApp::TestApp(std::string node_name_) : OpenGLApplication(node_name_),
	number_of_bandits("number_of_bandits", 10),
	epsilon("epsilon", 0.1),
	statistics_filename("statistics_filename", "statistics_filename.csv")
{
	// Register properties - so their values can be overridden (read from the configuration file).
	registerProperty(number_of_bandits);
	registerProperty(epsilon);
	registerProperty(statistics_filename);

	LOG(LINFO) << "Properties registered";
}


// Destructor (signature collapsed in the generated listing) - frees the chart window.
TestApp::~TestApp() {
	delete(w_reward);
}


void TestApp::initialize(int argc, char* argv[]) {
	// Initialize GLUT! :]
	VGL_MANAGER->initializeGLUT(argc, argv);

	reward_collector_ptr = std::make_shared<mic::utils::DataCollector<std::string, float> >();
	// Add containers to collector.
	reward_collector_ptr->createContainer("average_reward", 0, 10, mic::types::color_rgba(255, 0, 0, 180));
	reward_collector_ptr->createContainer("correct_arms_percentage", 0, 100, mic::types::color_rgba(0, 255, 0, 180));
	reward_collector_ptr->createContainer("best_possible_reward", 0, 10, mic::types::color_rgba(0, 0, 255, 180));

	// Create the visualization windows - must be created in the same, main thread :]
	w_reward = new WindowCollectorChart<float>("nBandits", 256, 256, 0, 0);
	w_reward->setDataCollectorPtr(reward_collector_ptr);
}

// Initialization of property-dependent variables. (The method signature is collapsed
// in the generated listing; it is presumably the application framework's
// initializePropertyDependentVariables() override.)
void TestApp::initializePropertyDependentVariables() {
	// Initialize random "arm" thresholds.
	arms.resize(number_of_bandits);
	for(size_t i=0; i<number_of_bandits; i++)
		arms[i] = RAN_GEN->uniRandReal();
	//std::cout << arms << std::endl;

	// Find the best arm.
	best_arm = -1;
	best_arm_prob = -1;
	for (size_t i=0; i<number_of_bandits; i++) {
		if (arms[i] > best_arm_prob){
			best_arm_prob = arms[i];
			best_arm = i;
		}//: if
	}//: for

	// Initialize action values and counts.
	action_values.resize(number_of_bandits);
	action_counts.resize(number_of_bandits);

	action_values.setOnes();
	action_counts.setZero();
}
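
// Note: each arm i is parameterized by a hidden success probability arms[i], drawn uniformly
// at random (RAN_GEN->uniRandReal()). best_arm and best_arm_prob are stored only for reporting;
// the learning agent never reads them. Action-value estimates start at 1, pull counters at 0.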

short TestApp::calculateReward(float prob_) {
	short reward = 0;
	for(size_t i=0; i<number_of_bandits; i++) {
		if (RAN_GEN->uniRandReal() < prob_)
			reward += 1;
	}//: for
	return reward;
}
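
// Note: calculateReward() is effectively a draw from a Binomial(number_of_bandits, prob_)
// distribution, so the expected reward for pulling arm i is number_of_bandits * arms[i].
// With the default of 10 bandits, the best achievable expected reward per step is
// 10 * best_arm_prob - the "best_possible_reward" baseline added to the chart below.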


// Greedy method - returns the index of the arm with the greatest estimated value.
// (The method signature is collapsed in the generated listing; a size_t selectBestArm()
// signature is assumed here, matching the call site below.)
size_t TestApp::selectBestArm() {
	size_t current_best_arm = 0;
	float current_best_value = -1;
	// For all possible arms.
	for(size_t i=0; i<number_of_bandits; i++) {
		// Check if this one is better than the others.
		if (action_values(i) > current_best_value) {
			current_best_value = action_values(i);
			current_best_arm = i;
		}//: if
	}//: for
	return current_best_arm;
}
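
// Note: selectBestArm() is a plain argmax over the current action-value estimates; with the
// strict ">" comparison, ties are broken in favour of the lowest arm index.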


// Performs a single step of the epsilon-greedy bandit algorithm. (The method signature is
// collapsed in the generated listing; it is presumably the application framework's
// bool performSingleStep() override, called once per iteration.)
bool TestApp::performSingleStep() {
	LOG(LTRACE) << "Performing a single step (" << iteration << ")";

	std::cout << "hidden state (arms)=";
	for (size_t i=0; i<number_of_bandits; i++)
		std::cout << arms[i] << ", ";
	std::cout << std::endl;

	std::cout << "action_counts=";
	for (size_t i=0; i<number_of_bandits; i++)
		std::cout << action_counts[i] << ", ";
	std::cout << std::endl;

	std::cout << "action_values=";
	for (size_t i=0; i<number_of_bandits; i++)
		std::cout << action_values[i] << ", ";
	std::cout << std::endl;

	short choice;
	// Epsilon-greedy action selection.
	if (RAN_GEN->uniRandReal() > (double)epsilon){
		// Exploitation: select the currently best action.
		choice = selectBestArm();
		std::cout << "best choice=" << choice << std::endl;
	} else {
		// Exploration: select a random arm.
		choice = RAN_GEN->uniRandInt(0, number_of_bandits-1);
		std::cout << "random choice=" << choice << std::endl;
	}//: if
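
	// Note: this is the classic epsilon-greedy rule - with probability epsilon (0.1 by default)
	// a random arm is explored, otherwise the arm with the highest current estimate is exploited.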

	// Calculate the reward obtained for pulling the chosen arm.
	float reward = calculateReward(arms[choice]);
	std::cout << "reward=" << reward << std::endl;

	// Update the running average for the given action - Q-learning ;)
	action_counts[choice] += 1;
	std::cout << "action_values[choice]=" << action_values[choice]
		<< " (1.0/action_counts[choice])=" << (1.0/action_counts[choice])
		<< " (reward - action_values[choice])=" << (reward - action_values[choice]) << std::endl;

	action_values[choice] = action_values[choice] + (1.0/action_counts[choice]) * (reward - action_values[choice]);
	std::cout << "action_values[choice] after update=" << action_values[choice] << std::endl;

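	// Note: with step size 1/k, where k = action_counts[choice], the update
	//   Q_k = Q_{k-1} + (1/k) * (reward - Q_{k-1})
	// is the incremental form of the sample average. Since the first update of an arm uses
	// step size 1, the initial value of 1 is simply overwritten, so action_values[choice]
	// equals the mean reward observed so far for that arm.
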
	// Calculate the percentage of steps in which the correct (best) arm was chosen.
	float correct_arms_percentage = 100.0*(action_counts[best_arm])/((float)iteration);
	std::cout << "best arm=" << best_arm << std::endl;

	// Calculate the running mean reward (total estimated reward divided by the number of iterations).
	float running_mean_reward = 0;
	for (size_t i=0; i<number_of_bandits; i++) {
		running_mean_reward += (float)action_values[i] * (float)action_counts[i];
	}//: for all action values
	running_mean_reward /= (float)iteration;

	// Add variables to containers.
	reward_collector_ptr->addDataToContainer("average_reward", running_mean_reward);
	reward_collector_ptr->addDataToContainer("correct_arms_percentage", correct_arms_percentage);
	reward_collector_ptr->addDataToContainer("best_possible_reward", 10.0*best_arm_prob);

	// Export the reward "convergence" data to the CSV file.
	reward_collector_ptr->exportDataToCsv(statistics_filename);

	return true;
}


} /* namespace application */
} /* namespace mic */
mic::configuration::Property< std::string > statistics_filename
Property: name of the file to which the statistics will be exported.
mic::utils::DataCollectorPtr< std::string, float > reward_collector_ptr
Reward collector.
mic::configuration::Property< double > epsilon
Property: variable denoting epsilon in action selection (the probability "below" which a random action will be selected).
mic::application::TestApp
Class solving the n-armed bandits problem using simple Q-learning.
virtual void initialize(int argc, char *argv[])
Initializes the application: GLUT, the reward data collector and the chart window.
mic::types::VectorXf arms
n bandit arms.
TestApp(std::string node_name_="application")
Constructor.
void RegisterApplication(void)
Registers the application.
WindowCollectorChart< float > * w_reward
Window for displaying the average reward.
mic::types::VectorXf action_values
Action values.
mic::types::VectorXi action_counts
Counters storing how many times a particular action has been taken.
mic::configuration::Property< size_t > number_of_bandits
Property: number of bandits.