MachineIntelligenceCore:ReinforcementLearning
nArmedBanditsSofmax.cpp
#include <application/nArmedBanditsSofmax.hpp> // Class declaration (header path assumed - the original include line was lost in extraction).

#include <utils/RandomGenerator.hpp>

#include <math.h>
namespace mic {
namespace application {

// Registers application.
void RegisterApplication (void) {
	REGISTER_APPLICATION(mic::application::nArmedBanditsSofmax);
}

nArmedBanditsSofmax::nArmedBanditsSofmax(std::string node_name_) : OpenGLApplication(node_name_),
		number_of_bandits("number_of_bandits", 10),
		tau("tau", 1.0),
		statistics_filename("statistics_filename", "statistics_filename.csv")
{
	// Register properties - so their values can be overridden (read from the configuration file).
	registerProperty(number_of_bandits);
	registerProperty(tau);
	registerProperty(statistics_filename);

	LOG(LINFO) << "Properties registered";
}

nArmedBanditsSofmax::~nArmedBanditsSofmax() {
	delete(w_reward);
}

void nArmedBanditsSofmax::initialize(int argc, char* argv[]) {
	// Initialize GLUT! :]
	VGL_MANAGER->initializeGLUT(argc, argv);

	reward_collector_ptr = std::make_shared < mic::utils::DataCollector<std::string, float> >( );
	// Add containers to collector.
	reward_collector_ptr->createContainer("average_reward", 0, 10, mic::types::color_rgba(255, 0, 0, 180));
	reward_collector_ptr->createContainer("correct_arms_percentage", 0, 100, mic::types::color_rgba(0, 255, 0, 180));
	reward_collector_ptr->createContainer("best_possible_reward", 0, 10, mic::types::color_rgba(0, 0, 255, 180));

	// Create the visualization windows - must be created in the same, main thread :]
	w_reward = new WindowCollectorChart<float>("nBandits", 256, 256, 0, 0);
	w_reward->setDataCollectorPtr(reward_collector_ptr);
}

void nArmedBanditsSofmax::initializePropertyDependentVariables() {
	// Initialize random "arm" thresholds.
	arms.resize(number_of_bandits);
	for(size_t i=0; i<number_of_bandits; i++)
		arms[i] = RAN_GEN->uniRandReal();
	//std::cout << arms << std::endl;

	// Find the best arm.
	best_arm = -1;
	best_arm_prob = -1;
	for (size_t i=0; i<number_of_bandits; i++) {
		if (arms[i] > best_arm_prob){
			best_arm_prob = arms[i];
			best_arm = i;
		}//: if
	}//: for

	// Initialize action values and counts.
	action_values.resize(number_of_bandits);
	action_counts.resize(number_of_bandits);
	action_values_softmax.resize(number_of_bandits);

	action_values.setZero();
	action_counts.setZero();

	// Initialize softmax to a uniform distribution.
	// (The original hard-coded 0.1 summed to 1 only for the default of 10 bandits.)
	for(size_t i=0; i<number_of_bandits; i++)
		action_values_softmax[i] = 1.0/(float)number_of_bandits;
}

short nArmedBanditsSofmax::calculateReward(float prob_) {
	short reward = 0;
	for(size_t i=0; i<number_of_bandits; i++) {
		if (RAN_GEN->uniRandReal() < prob_)
			reward += 1;
	}//: for
	return reward;
}

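// Note: the reward is a sum of number_of_bandits independent Bernoulli trials, each succeeding
// with probability prob_, i.e. a Binomial(number_of_bandits, prob_) sample with expected value
// number_of_bandits * prob_. For the default of 10 bandits the best achievable expected reward
// is therefore 10 * best_arm_prob, which is what the "best_possible_reward" chart tracks.
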
// Recomputes the softmax distribution over the current action values.
// (Method name assumed - the original signature line was lost in extraction.)
void nArmedBanditsSofmax::updateSoftmaxValues() {
	// For all possible arms - compute the probability distribution.
	float sum = 0;
	for(size_t i=0; i<number_of_bandits; i++) {
		action_values_softmax(i) = exp(action_values(i) / (float)tau);
		sum += action_values_softmax(i);
	}//: for

	// Normalize the distribution.
	for(size_t i=0; i<number_of_bandits; i++) {
		action_values_softmax(i) /= sum;
	}//: for
}

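// The loops above implement the Boltzmann (softmax) distribution over action values:
//   P(a) = exp(Q(a)/tau) / sum_b exp(Q(b)/tau)
// The temperature tau controls exploration: large tau gives a near-uniform (fully exploratory)
// policy, while tau -> 0 approaches greedy selection of the highest-valued arm.
// Caveat: for large Q/tau ratios exp() can overflow; a common remedy (not used in this file)
// is to subtract max_a Q(a) from every value before exponentiating, which leaves P unchanged.
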
bool nArmedBanditsSofmax::performSingleStep() {
	LOG(LTRACE) << "Performing a single step (" << iteration << ")";

	std::cout << "hidden state (arms)=";
	for (size_t i=0; i<number_of_bandits; i++)
		std::cout << arms[i] << ", ";
	std::cout << std::endl;

	std::cout << "action_counts=";
	for (size_t i=0; i<number_of_bandits; i++)
		std::cout << action_counts[i] << ", ";
	std::cout << std::endl;

	std::cout << "action_values=";
	for (size_t i=0; i<number_of_bandits; i++)
		std::cout << action_values[i] << ", ";
	std::cout << std::endl;

	short choice = 0;
	// Select a random arm using the weighted probability distribution from softmax:
	// draw r uniformly from [0,1) and subtract successive probabilities until r falls
	// into an arm's bucket - "spin the roulette wheel" ;)
	float r = RAN_GEN->uniRandReal();
	for (size_t i=0; i< number_of_bandits; i++){
		if (r < action_values_softmax(i)) {
			choice = i;
			std::cout << "choice=" << choice << std::endl;
			break;
		}//: if
		r -= action_values_softmax(i);
	}//: for

	// Calculate reward.
	float reward = calculateReward(arms[choice]);
	std::cout << "reward=" << reward << std::endl;

	// Update the running average for the given action (incremental sample-average estimate).
	action_counts[choice] += 1;
	std::cout << "action_values[choice]=" << action_values[choice] << " (1.0/action_counts[choice])=" << (1.0/action_counts[choice]) << " (reward - action_values[choice])=" << (reward - action_values[choice]) << std::endl;

	action_values[choice] = action_values[choice] + (1.0/action_counts[choice]) * (reward - action_values[choice]);
	std::cout << "action_values[choice] after = " << action_values[choice] << std::endl;

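	// The update above is the standard incremental sample-average rule:
	//   Q_{k+1}(a) = Q_k(a) + (1/k) * (r - Q_k(a)),   with k = action_counts[a],
	// which is algebraically identical to the plain average of all rewards received for arm a.
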
	// Update the softmax distribution over the new action values.
	updateSoftmaxValues();

	// Calculate the percentage of iterations in which the best arm was chosen.
	float correct_arms_percentage = 100.0*(action_counts[best_arm])/((float)iteration);
	std::cout << "correct arm/choice=" << best_arm << std::endl;

	// Calculate the running mean reward (each action-value estimate weighted by its action count).
	float running_mean_reward = 0;
	for (size_t i=0; i<number_of_bandits; i++) {
		running_mean_reward += (float)action_values[i] * (float)action_counts[i];
	}//: for all action values
	running_mean_reward /= (float)iteration;

	// Add variables to containers.
	reward_collector_ptr->addDataToContainer("average_reward", running_mean_reward);
	reward_collector_ptr->addDataToContainer("correct_arms_percentage", correct_arms_percentage);
	// Best achievable expected reward: number_of_bandits Bernoulli trials at best_arm_prob.
	// (The original hard-coded 10.0 was correct only for the default of 10 bandits.)
	reward_collector_ptr->addDataToContainer("best_possible_reward", ((float)number_of_bandits)*best_arm_prob);

	// Export the reward "convergence" data to CSV.
	reward_collector_ptr->exportDataToCsv(statistics_filename);

	return true;
}

} /* namespace application */
} /* namespace mic */
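
For readers without the MIC framework installed, the following is a minimal, self-contained sketch of the same softmax action-selection loop in plain C++11 (no MIC dependencies). It is illustrative only: all identifiers are made up for the sketch, and it uses a simple 0/1 Bernoulli reward per pull instead of the binomial reward used by calculateReward() above.

#include <cmath>
#include <cstdio>
#include <random>
#include <vector>

int main() {
	const size_t n = 10;       // Number of arms.
	const double tau = 1.0;    // Softmax temperature.
	std::mt19937 gen(42);
	std::uniform_real_distribution<double> uni(0.0, 1.0);

	// Hidden arm probabilities and the agent's value estimates / pull counts.
	std::vector<double> arms(n), q(n, 0.0);
	std::vector<int> counts(n, 0);
	for (auto& a : arms) a = uni(gen);

	for (int step = 1; step <= 10000; ++step) {
		// Softmax (Boltzmann) distribution over the current estimates.
		std::vector<double> p(n);
		double sum = 0.0;
		for (size_t i = 0; i < n; ++i) { p[i] = std::exp(q[i] / tau); sum += p[i]; }
		for (auto& pi : p) pi /= sum;

		// Roulette-wheel sampling from p.
		double r = uni(gen);
		size_t choice = n - 1;
		for (size_t i = 0; i < n; ++i) {
			if (r < p[i]) { choice = i; break; }
			r -= p[i];
		}

		// Bernoulli reward and incremental sample-average update.
		double reward = (uni(gen) < arms[choice]) ? 1.0 : 0.0;
		counts[choice] += 1;
		q[choice] += (reward - q[choice]) / counts[choice];
	}

	// With tau = 1.0 the estimates converge toward the hidden probabilities,
	// and the best arm accumulates the most pulls.
	for (size_t i = 0; i < n; ++i)
		printf("arm %zu: p=%.3f estimate=%.3f pulls=%d\n", i, arms[i], q[i], counts[i]);
	return 0;
}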