Reinforcement Learning Toolbox 2.0
last updated:
General
Documentation
Manual
Tutorial
Class Reference
Master Thesis
Examples
Related Papers
Downloads
Links
News
mailto:webmaster
Main Page     Class Hierarchy   Compound List   File List   Compound Members   File Members

cpolicygradient.h

Go to the documentation of this file.
00001 // Copyright (C) 2003
00002 // Gerhard Neumann (gneumann@gmx.net)
00003 // Stephan Neumann (sneumann@gmx.net) 
00004 //                
00005 // This file is part of RL Toolbox.
00006 // http://www.igi.tugraz.at/ril_toolbox
00007 //
00008 // All rights reserved.
00009 // 
00010 // Redistribution and use in source and binary forms, with or without
00011 // modification, are permitted provided that the following conditions
00012 // are met:
00013 // 1. Redistributions of source code must retain the above copyright
00014 //    notice, this list of conditions and the following disclaimer.
00015 // 2. Redistributions in binary form must reproduce the above copyright
00016 //    notice, this list of conditions and the following disclaimer in the
00017 //    documentation and/or other materials provided with the distribution.
00018 // 3. The name of the author may not be used to endorse or promote products
00019 //    derived from this software without specific prior written permission.
00020 // 
00021 // THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
00022 // IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
00023 // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
00024 // IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
00025 // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
00026 // NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
00027 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
00028 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
00029 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
00030 // THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00031 
00032 #ifndef C_POLICYGRADIENT__H
00033 #define C_POLICYGRADIENT__H
00034 
00035 #include "cagentlistener.h"
00036 #include "csupervisedlearner.h"
00037 
// Forward declarations. Every one of these types is used only through
// pointers in this header, so the full class definitions are not required here.
00038 class CAgentController;
00039 class CPolicyEvaluator;
00040 class CFeatureList;
00041 class CReinforcementBaseLineCalculator;
00042 
00043 class CRewardFunction;
00044 class CAgent;
00045 class CTransitionFunctionEnvironment;
00046 class CContinuousActionGradientPolicy;
00047 class CGradientUpdateFunction;
00048 class CStochasticPolicy;
00049 
/// Abstract base class for gradient calculators that work with a policy.
///
/// Extends CGradientCalculator and keeps a pointer to a policy
/// (CAgentController) and to a CPolicyEvaluator. getGradient() is pure
/// virtual here, so only subclasses that implement it can be instantiated.
00050 class CPolicyGradientCalculator : public CGradientCalculator
00051 {
00052 protected:
              /// Policy pointer.
              /// NOTE(review): presumably set from the constructor argument of the
              /// same name; ownership is not visible in this header.
00053         CAgentController *policy;
              /// Policy evaluator pointer.
              /// NOTE(review): presumably used by getFunctionValue() -- confirm in
              /// the implementation file.
00054         CPolicyEvaluator *evaluator;
00055 public:
              /// @param policy    policy (agent controller) handed to the calculator
              /// @param evaluator policy evaluator handed to the calculator
00056         CPolicyGradientCalculator(CAgentController *policy, CPolicyEvaluator *evaluator);
              /// Empty inline destructor, declared virtual.
00057         virtual ~CPolicyGradientCalculator() {};
00058 
              /// Pure virtual; every concrete subclass has to implement it.
              /// @param gradient feature list for the gradient (assumed to be an
              ///        output parameter, given the void return -- TODO confirm)
00059         virtual void getGradient(CFeatureList *gradient) = 0;
              /// Returns a scalar value; defined outside this header.
              /// NOTE(review): presumably the value delivered by the evaluator --
              /// confirm in the implementation file.
00060         virtual double getFunctionValue();
00061 };
00062 
/// Policy-gradient calculator that is also a CSemiMDPRewardListener.
///
/// Besides the CPolicyGradientCalculator interface it declares the
/// nextStep()/newEpisode() callbacks and keeps three CFeatureList pointers
/// (local gradient, local z-trace, global gradient).
/// NOTE(review): the class name suggests the GPOMDP algorithm; the actual
/// computation is in the implementation file, which is not visible here.
00063 class CGPOMDPGradientCalculator : public CPolicyGradientCalculator, public CSemiMDPRewardListener
00064 {
00065 protected:
              /// NOTE(review): presumably the gradient accumulated internally while
              /// stepping -- confirm in the implementation file.
00066         CFeatureList *localGradient;
              /// NOTE(review): presumably the trace ("z") used by the algorithm --
              /// confirm in the implementation file.
00067         CFeatureList *localZTrace;
00068 
              /// Exposed through getGlobalGradient()/setGlobalGradient()
              /// (assumed from the matching names -- TODO confirm).
00069         CFeatureList *globalGradient;
00070 
              /// Agent pointer; how it is used is not visible in this header.
00071         CAgent *agent; 
              /// Reinforcement base-line calculator pointer.
00072         CReinforcementBaseLineCalculator *baseLine;
00073 
              /// Stochastic policy pointer.
              /// NOTE(review): presumably the constructor's `policy` argument kept
              /// with its concrete type -- confirm.
00074         CStochasticPolicy *stochPolicy;
00075 
00076 public:
              /// @param reward    reward function
              /// @param policy    stochastic policy
              /// @param evaluator policy evaluator
              /// @param agent     agent
              /// @param baseLine  reinforcement base-line calculator
              /// @param TSteps    integer step count (exact meaning not visible here)
              /// @param nEpisodes integer episode count (exact meaning not visible here)
              /// @param beta      floating-point parameter (exact meaning not visible here)
              /// NOTE(review): TSteps, nEpisodes and beta have no matching data
              /// members in this class; where they are stored is not visible here.
00077         CGPOMDPGradientCalculator(CRewardFunction *reward, CStochasticPolicy *policy, CPolicyEvaluator *evaluator, CAgent *agent, CReinforcementBaseLineCalculator *baseLine, int TSteps, int nEpisodes, double beta);
00078         virtual ~CGPOMDPGradientCalculator();
00079 
              /// Step callback receiving the old state, the action, the reward and
              /// the new state.
              /// NOTE(review): presumably implements the CSemiMDPRewardListener
              /// interface -- that class is declared elsewhere.
00080         virtual void nextStep(CStateCollection *oldState, CAction *action, double reward, CStateCollection *newState);
              /// Episode-start callback (assumed from the name -- TODO confirm).
00081         virtual void newEpisode();
00082 
              /// Implements the pure virtual CPolicyGradientCalculator::getGradient().
00083         virtual void getGradient(CFeatureList *gradient);
00084 
              /// Returns a CFeatureList pointer (presumably globalGradient).
00085         virtual CFeatureList* getGlobalGradient();
              /// Takes a CFeatureList pointer (presumably replaces globalGradient;
              /// ownership handling is not visible here).
00086         virtual void setGlobalGradient(CFeatureList *globalGradient);
00087 };
00088 
// Repeated forward declaration: the same class is already forward-declared
// with the other forward declarations near the top of this header. Harmless.
00089 class CContinuousActionGradientPolicy;
00090 
/// Policy-gradient calculator for a CContinuousActionGradientPolicy.
///
/// Keeps pointers to an agent, a transition-function environment, a reward
/// function and the gradient policy, plus a CFeatureList and a raw double
/// array.
/// NOTE(review): the class name suggests a numeric (finite-difference style)
/// gradient estimate; the computation is in the implementation file, which is
/// not visible here.
00091 class CNumericPolicyGradientCalculator : public CPolicyGradientCalculator
00092 {
00093 protected:
              /// Feature-list member. Note that getGradient() below has a parameter
              /// with the same name, which hides this member inside that function
              /// if the definition uses the same parameter name.
00094         CFeatureList *gradientFeatures;
              /// Raw array of doubles.
              /// NOTE(review): presumably a buffer for the policy weights; size and
              /// ownership are decided in the implementation file.
00095         double *weights;
00096 
00097         CRewardFunction *rewardFunction;
00098         CAgent *agent;
00099         CTransitionFunctionEnvironment *dynModel;
00100         CContinuousActionGradientPolicy *gradientPolicy;
00101 public:
              /// @param agent     agent
              /// @param policy    continuous-action gradient policy
              /// @param dynModel  transition-function environment
              /// @param reward    reward function
              /// @param stepSize  step size (no matching data member is declared in
              ///                  this class; where it is stored is not visible here)
              /// @param evaluator policy evaluator
00102         CNumericPolicyGradientCalculator(CAgent *agent, CContinuousActionGradientPolicy *policy, CTransitionFunctionEnvironment *dynModel, CRewardFunction *reward, double stepSize, CPolicyEvaluator *evaluator);
              /// Spelled `virtual` for consistency with the other calculators in
              /// this header. The destructor was already virtual implicitly because
              /// the base-class destructor is virtual, so behaviour is unchanged.
00103         virtual ~CNumericPolicyGradientCalculator();
00104 
              /// Implements the pure virtual CPolicyGradientCalculator::getGradient().
00105         virtual void getGradient(CFeatureList *gradientFeatures);
00106 };
00107 
/// Policy-gradient calculator for a CContinuousActionGradientPolicy that is
/// configured with a number of evaluations and a step size.
///
/// NOTE(review): the class name suggests a gradient estimate obtained from
/// random perturbations; the computation is in the implementation file, which
/// is not visible here.
00108 class CRandomPolicyGradientCalculator : public CPolicyGradientCalculator
00109 {
00110 protected:
              /// Raw arrays of doubles. setStepSize() takes an index, so stepSizes
              /// is presumably indexed per weight -- TODO confirm. Array sizes and
              /// ownership are decided in the implementation file.
00111         double *stepSizes;
00112         double *minWeights;
00113         double *nullWeights;
00114         double *plusWeights;
00115         
              /// Raw arrays of ints.
              /// NOTE(review): presumably counters matching the three double arrays
              /// above; note the naming mismatch (`plusWeights` vs `numMaxWeights`).
00116         int *numMinWeights;
00117         int *numMaxWeights;
00118         int *numNullWeights;
00119         
00120         CContinuousActionGradientPolicy *gradientPolicy;
00121 public:
              /// @param policy         continuous-action gradient policy
              /// @param evaluator      policy evaluator
              /// @param numEvaluations number of evaluations (no matching data member
              ///                       is declared in this class)
              /// @param stepSize       step size (presumably the initial value for
              ///                       stepSizes -- TODO confirm)
00122         CRandomPolicyGradientCalculator(CContinuousActionGradientPolicy *policy, CPolicyEvaluator *evaluator, int numEvaluations, double stepSize);
00123         virtual ~CRandomPolicyGradientCalculator();
00124 
              /// Implements the pure virtual CPolicyGradientCalculator::getGradient().
00125         virtual void getGradient(CFeatureList *gradient);
              /// @param index    index of the entry (presumably into stepSizes)
              /// @param stepSize new step size
00126         virtual void setStepSize(int index, double stepSize);
00127         
              /// Empty inline implementation: does nothing in this class.
00128         virtual void resetGradientCalculator() {};
00129 };
00130 
/// Policy-gradient calculator for a CContinuousActionGradientPolicy with the
/// same constructor and public method signatures as
/// CRandomPolicyGradientCalculator, but with its own set of data members and a
/// resetGradientCalculator() that is defined out of line.
///
/// NOTE(review): how it differs algorithmically is only visible in the
/// implementation file.
00131 class CRandomMaxPolicyGradientCalculator : public CPolicyGradientCalculator
00132 {
00133 protected:
              /// Raw arrays of doubles. setStepSize() takes an index, so stepSizes
              /// is presumably indexed per weight -- TODO confirm.
00134         double *stepSizes;
              /// NOTE(review): presumably a working copy of stepSizes -- confirm.
00135         double *workStepSizes;
00136         
00137         CContinuousActionGradientPolicy *gradientPolicy;
00138 public:
              /// @param policy         continuous-action gradient policy
              /// @param evaluator      policy evaluator
              /// @param numEvaluations number of evaluations (no matching data member
              ///                       is declared in this class)
              /// @param stepSize       step size (presumably the initial value for
              ///                       stepSizes -- TODO confirm)
00139         CRandomMaxPolicyGradientCalculator(CContinuousActionGradientPolicy *policy, CPolicyEvaluator *evaluator, int numEvaluations, double stepSize);
00140         virtual ~CRandomMaxPolicyGradientCalculator();
00141 
              /// Implements the pure virtual CPolicyGradientCalculator::getGradient().
00142         virtual void getGradient(CFeatureList *gradient);
              /// @param index    index of the entry (presumably into stepSizes)
              /// @param stepSize new step size
00143         virtual void setStepSize(int index, double stepSize);
00144         
              /// Declared here, defined in the implementation file.
00145         virtual void resetGradientCalculator();
00146 };
00147 
00148 
/// Gradient-function updater that holds a CPolicyGradientCalculator.
///
/// Extends CGradientFunctionUpdater and keeps two raw parameter arrays
/// (start and work) and the last step size.
/// NOTE(review): the class name and the s0/epsilon constructor arguments
/// suggest a step-size (line) search along the gradient; the procedure is in
/// the implementation file, which is not visible here.
00149 class CGSearchPolicyGradientUpdater : public CGradientFunctionUpdater
00150 {
00151 protected:
00152         CPolicyGradientCalculator *gradientCalculator;
00153 
              /// Raw arrays of doubles; sizes and ownership are decided in the
              /// implementation file.
00154         double *startParameters;
00155         double *workParameters;
00156 
              /// NOTE(review): presumably the step size chosen by the previous
              /// updateWeights() call -- confirm.
00157         double lastStepSize;
00158 
00159 
              /// Non-virtual protected helper. The name is spelled "Paramters"
              /// (sic); it is kept as is because the definition lives outside this
              /// header.
              /// @param gradient        gradient feature list
              /// @param stepSize        step size
              /// @param startParameters array of start parameters
              /// @param workParameters  array of working parameters (presumably the
              ///                        output -- TODO confirm)
00160         void setWorkingParamters(CFeatureList *gradient, double stepSize, double *startParameters, double *workParameters);
00161 public:
00162 
              /// @param updateFunction     gradient update function
              /// @param gradientCalculator policy-gradient calculator
              /// @param s0                 floating-point parameter (presumably the
              ///                           initial step size -- TODO confirm)
              /// @param epsilon            floating-point parameter (presumably a
              ///                           tolerance -- TODO confirm)
00163         CGSearchPolicyGradientUpdater(CGradientUpdateFunction *updateFunction, CPolicyGradientCalculator *gradientCalculator, double s0, double epsilon);
00164         virtual ~CGSearchPolicyGradientUpdater();
00165 
              /// @param gradient gradient feature list used for the update
00166         virtual void updateWeights(CFeatureList *gradient);
00167 };
00168 
00169 
00170 
/// Listener (CSemiMDPListener) that is configured with a
/// CGradientUpdateFunction and a weight-decay factor and reacts to
/// newEpisode().
///
/// NOTE(review): presumably applies weight decay to the update function's
/// parameters at the start of each episode; the logic is in the implementation
/// file, which is not visible here.
00171 class CPolicyGradientWeightDecayListener : public CSemiMDPListener
00172 {
00173 protected:
00174         CGradientUpdateFunction *updateFunction;
              /// Raw array of doubles; size and ownership are decided in the
              /// implementation file.
00175         double *parameters;
00176 public:
              /// @param updateFunction gradient update function
              /// @param weightdecay    weight-decay factor (no matching data member is
              ///                       declared in this class; where it is stored is
              ///                       not visible here)
00177         CPolicyGradientWeightDecayListener(CGradientUpdateFunction *updateFunction, double weightdecay);
              /// Declared `virtual`: the class is polymorphic (it has a virtual
              /// member function) and the base-class declaration is not visible
              /// here, so this guarantees correct destruction through a pointer to
              /// this class and matches the other destructors in this header.
00178         virtual ~CPolicyGradientWeightDecayListener();
00179 
              /// Episode-start callback (assumed from the name -- TODO confirm).
00180         virtual void newEpisode();
00181 };
00182 
00183 
00184 
00185 #endif
00186