00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032 #ifndef C_POLICYGRADIENT__H
00033 #define C_POLICYGRADIENT__H
00034
00035 #include "cagentlistener.h"
00036 #include "csupervisedlearner.h"
00037
// Forward declarations (alphabetical). This header refers to each of these
// types only through pointers in member and parameter declarations, so the
// full definitions are not required here.
class CAgent;
class CAgentController;
class CContinuousActionGradientPolicy;
class CFeatureList;
class CGradientUpdateFunction;
class CPolicyEvaluator;
class CReinforcementBaseLineCalculator;
class CRewardFunction;
class CStochasticPolicy;
class CTransitionFunctionEnvironment;
00049
00050 class CPolicyGradientCalculator : public CGradientCalculator
00051 {
00052 protected:
00053 CAgentController *policy;
00054 CPolicyEvaluator *evaluator;
00055 public:
00056 CPolicyGradientCalculator(CAgentController *policy, CPolicyEvaluator *evaluator);
00057 virtual ~CPolicyGradientCalculator() {};
00058
00059 virtual void getGradient(CFeatureList *gradient) = 0;
00060 virtual double getFunctionValue();
00061 };
00062
00063 class CGPOMDPGradientCalculator : public CPolicyGradientCalculator, public CSemiMDPRewardListener
00064 {
00065 protected:
00066 CFeatureList *localGradient;
00067 CFeatureList *localZTrace;
00068
00069 CFeatureList *globalGradient;
00070
00071 CAgent *agent;
00072 CReinforcementBaseLineCalculator *baseLine;
00073
00074 CStochasticPolicy *stochPolicy;
00075
00076 public:
00077 CGPOMDPGradientCalculator(CRewardFunction *reward, CStochasticPolicy *policy, CPolicyEvaluator *evaluator, CAgent *agent, CReinforcementBaseLineCalculator *baseLine, int TSteps, int nEpisodes, double beta);
00078 virtual ~CGPOMDPGradientCalculator();
00079
00080 virtual void nextStep(CStateCollection *oldState, CAction *action, double reward, CStateCollection *newState);
00081 virtual void newEpisode();
00082
00083 virtual void getGradient(CFeatureList *gradient);
00084
00085 virtual CFeatureList* getGlobalGradient();
00086 virtual void setGlobalGradient(CFeatureList *globalGradient);
00087 };
00088
00089 class CContinuousActionGradientPolicy;
00090
00091 class CNumericPolicyGradientCalculator : public CPolicyGradientCalculator
00092 {
00093 protected:
00094 CFeatureList *gradientFeatures;
00095 double *weights;
00096
00097 CRewardFunction *rewardFunction;
00098 CAgent *agent;
00099 CTransitionFunctionEnvironment *dynModel;
00100 CContinuousActionGradientPolicy *gradientPolicy;
00101 public:
00102 CNumericPolicyGradientCalculator(CAgent *agent, CContinuousActionGradientPolicy *policy, CTransitionFunctionEnvironment *dynModel, CRewardFunction *reward, double stepSize, CPolicyEvaluator *evaluator);
00103 ~CNumericPolicyGradientCalculator();
00104
00105 virtual void getGradient(CFeatureList *gradientFeatures);
00106 };
00107
00108 class CRandomPolicyGradientCalculator : public CPolicyGradientCalculator
00109 {
00110 protected:
00111 double *stepSizes;
00112 double *minWeights;
00113 double *nullWeights;
00114 double *plusWeights;
00115
00116 int *numMinWeights;
00117 int *numMaxWeights;
00118 int *numNullWeights;
00119
00120 CContinuousActionGradientPolicy *gradientPolicy;
00121 public:
00122 CRandomPolicyGradientCalculator(CContinuousActionGradientPolicy *policy, CPolicyEvaluator *evaluator, int numEvaluations, double stepSize);
00123 virtual ~CRandomPolicyGradientCalculator();
00124
00125 virtual void getGradient(CFeatureList *gradient);
00126 virtual void setStepSize(int index, double stepSize);
00127
00128 virtual void resetGradientCalculator() {};
00129 };
00130
00131 class CRandomMaxPolicyGradientCalculator : public CPolicyGradientCalculator
00132 {
00133 protected:
00134 double *stepSizes;
00135 double *workStepSizes;
00136
00137 CContinuousActionGradientPolicy *gradientPolicy;
00138 public:
00139 CRandomMaxPolicyGradientCalculator(CContinuousActionGradientPolicy *policy, CPolicyEvaluator *evaluator, int numEvaluations, double stepSize);
00140 virtual ~CRandomMaxPolicyGradientCalculator();
00141
00142 virtual void getGradient(CFeatureList *gradient);
00143 virtual void setStepSize(int index, double stepSize);
00144
00145 virtual void resetGradientCalculator();
00146 };
00147
00148
00149 class CGSearchPolicyGradientUpdater : public CGradientFunctionUpdater
00150 {
00151 protected:
00152 CPolicyGradientCalculator *gradientCalculator;
00153
00154 double *startParameters;
00155 double *workParameters;
00156
00157 double lastStepSize;
00158
00159
00160 void setWorkingParamters(CFeatureList *gradient, double stepSize, double *startParameters, double *workParameters);
00161 public:
00162
00163 CGSearchPolicyGradientUpdater(CGradientUpdateFunction *updateFunction, CPolicyGradientCalculator *gradientCalculator, double s0, double epsilon);
00164 virtual ~CGSearchPolicyGradientUpdater();
00165
00166 virtual void updateWeights(CFeatureList *gradient);
00167 };
00168
00169
00170
00171 class CPolicyGradientWeightDecayListener : public CSemiMDPListener
00172 {
00173 protected:
00174 CGradientUpdateFunction *updateFunction;
00175 double *parameters;
00176 public:
00177 CPolicyGradientWeightDecayListener(CGradientUpdateFunction *updateFunction, double weightdecay);
00178 ~CPolicyGradientWeightDecayListener();
00179
00180 virtual void newEpisode();
00181 };
00182
00183
00184
00185 #endif
00186