"""A more stable successor to TD3.
By default, this uses a near-identical configuration to that reported in the
TD3 paper.
"""
from ray.rllib.algorithms.algorithm_config import AlgorithmConfig
from ray.rllib.algorithms.ddpg.ddpg import DDPG, DDPGConfig
from ray.rllib.utils.annotations import override
from ray.rllib.utils.deprecation import (
DEPRECATED_VALUE,
Deprecated,
ALGO_DEPRECATION_WARNING,
)
class TD3Config(DDPGConfig):
    """Defines a configuration class from which a TD3 Algorithm can be built.

    The constructor starts from the ``DDPGConfig`` defaults and then overrides
    a handful of them with TD3-specific values (twin Q functions, delayed
    policy updates, target policy smoothing, no l2-regularization, Gaussian
    exploration noise).

    Example:
        >>> from ray.rllib.algorithms.td3 import TD3Config
        >>> config = TD3Config().training(lr=0.01).resources(num_gpus=1)
        >>> print(config.to_dict())  # doctest: +SKIP
        >>> # Build an Algorithm object from the config and run one training
        >>> # iteration.
        >>> algo = config.build(env="Pendulum-v1")  # doctest: +SKIP
        >>> algo.train()  # doctest: +SKIP

    Example:
        >>> from ray.rllib.algorithms.td3 import TD3Config
        >>> from ray import air
        >>> from ray import tune
        >>> config = TD3Config()
        >>> # Print out some default values.
        >>> print(config.lr)  # doctest: +SKIP
        >>> # Update the config object.
        >>> config = config.training(lr=tune.grid_search(  # doctest: +SKIP
        ...     [0.001, 0.0001]))  # doctest: +SKIP
        >>> # Set the config object's env.
        >>> config.environment(env="Pendulum-v1")  # doctest: +SKIP
        >>> # Use to_dict() to get the old-style python config dict
        >>> # when running with tune.
        >>> tune.Tuner(  # doctest: +SKIP
        ...     "TD3",
        ...     run_config=air.RunConfig(stop={"episode_reward_mean": 200}),
        ...     param_space=config.to_dict(),
        ... ).fit()
    """

    def __init__(self, algo_class=None):
        """Initializes a TD3Config instance.

        Args:
            algo_class: The Algorithm class this config is meant for. Falls
                back to ``TD3`` when omitted (or otherwise falsy).
        """
        super().__init__(algo_class=algo_class or TD3)

        # fmt: off
        # __sphinx_doc_begin__

        # Override some of DDPG/SimpleQ/Algorithm's default values with
        # TD3-specific values.

        # .training()

        # largest changes: twin Q functions, delayed policy updates, target
        # smoothing, no l2-regularization.
        self.twin_q = True
        self.policy_delay = 2
        # Plain bool on purpose: a trailing comma after `True` would silently
        # turn this flag into the 1-tuple `(True,)`.
        self.smooth_target_policy = True
        self.l2_reg = 0.0
        # Different tau (affecting target network update).
        self.tau = 5e-3
        # Different batch size.
        self.train_batch_size = 100
        # No prioritized replay by default (we may want to change this at some
        # point).
        self.replay_buffer_config = {
            "type": "MultiAgentReplayBuffer",
            # Specify prioritized replay by supplying a buffer type that
            # supports prioritization, for example:
            # MultiAgentPrioritizedReplayBuffer.
            "prioritized_replay": DEPRECATED_VALUE,
            "capacity": 1000000,
            "worker_side_prioritization": False,
        }
        # Number of timesteps to collect from rollout workers before we start
        # sampling from replay buffers for learning. Whether we count this in
        # agent steps or environment steps depends on
        # config.multi_agent(count_steps_by=..).
        self.num_steps_sampled_before_learning_starts = 10000

        # .exploration()
        # TD3 uses Gaussian Noise by default.
        self.exploration_config = {
            # TD3 uses simple Gaussian noise on top of deterministic NN-output
            # actions (after a possible pure random phase of n timesteps).
            "type": "GaussianNoise",
            # For how many timesteps should we return completely random
            # actions, before we start adding (scaled) noise?
            "random_timesteps": 10000,
            # Gaussian stddev of action noise for exploration.
            "stddev": 0.1,
            # Scaling settings by which the Gaussian noise is scaled before
            # being added to the actions. NOTE: The scale timesteps start only
            # after(!) any random steps have been finished.
            # By default, do not anneal over time (fixed 1.0).
            "initial_scale": 1.0,
            "final_scale": 1.0,
            "scale_timesteps": 1,
        }

        # __sphinx_doc_end__
        # fmt: on
# Flagged as deprecated (non-fatal, since error=False): the decorator points
# users from the old in-tree location to the rllib_contrib one.
@Deprecated(
    old="rllib/algorithms/td3/",
    new="rllib_contrib/td3/",
    help=ALGO_DEPRECATION_WARNING,
    error=False,
)
class TD3(DDPG):
    """DDPG subclass whose default configuration is a ``TD3Config``."""

    @classmethod
    @override(DDPG)
    def get_default_config(cls) -> AlgorithmConfig:
        """Return a freshly constructed ``TD3Config`` holding the TD3 defaults."""
        default_config = TD3Config()
        return default_config