Ubuntu 2020-07-22 11:19:45 +00:00
Parent c9dd8e01bd
Commit a8403a7462
52 changed files with 8069 additions and 8053 deletions

.gitignore vendored
View File

@@ -1,129 +1,129 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/

View File

@@ -1,14 +1,14 @@
# Contributing
This project welcomes contributions and suggestions. Most contributions require you to
agree to a Contributor License Agreement (CLA) declaring that you have the right to,
and actually do, grant us the rights to use your contribution. For details, visit
https://cla.microsoft.com.
When you submit a pull request, a CLA-bot will automatically determine whether you need
to provide a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the
instructions provided by the bot. You will only need to do this once across all repositories using our CLA.
This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.

View File

@@ -1,2 +1,2 @@
# coding=utf-8
name = 'dopamine'

View File

@@ -1,15 +1,15 @@
# coding=utf-8
# Copyright 2018 The Dopamine Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@@ -1,15 +1,15 @@
# coding=utf-8
# Copyright 2018 The Dopamine Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@@ -1,37 +1,37 @@
# Hyperparameters follow the classic Nature DQN, but we modify as necessary to
# match those used in Rainbow (Hessel et al., 2018), to ensure apples-to-apples
# comparison.
import dopamine.discrete_domains.atari_lib
import dopamine.discrete_domains.run_experiment
import dopamine.agents.dqn.dqn_agent
import dopamine.replay_memory.circular_replay_buffer
import gin.tf.external_configurables
DQNAgent.gamma = 0.99
DQNAgent.update_horizon = 1
DQNAgent.min_replay_history = 20000 # agent steps
DQNAgent.update_period = 4
DQNAgent.target_update_period = 8000 # agent steps
DQNAgent.epsilon_train = 0.01
DQNAgent.epsilon_eval = 0.001
DQNAgent.epsilon_decay_period = 250000 # agent steps
DQNAgent.tf_device = '/gpu:0' # use '/cpu:*' for non-GPU version
DQNAgent.optimizer = @tf.train.RMSPropOptimizer()
tf.train.RMSPropOptimizer.learning_rate = 0.00025
tf.train.RMSPropOptimizer.decay = 0.95
tf.train.RMSPropOptimizer.momentum = 0.0
tf.train.RMSPropOptimizer.epsilon = 0.00001
tf.train.RMSPropOptimizer.centered = True
atari_lib.create_atari_environment.game_name = 'Pong'
# Sticky actions with probability 0.25, as suggested by (Machado et al., 2017).
atari_lib.create_atari_environment.sticky_actions = True
create_agent.agent_name = 'dqn'
Runner.num_iterations = 200
Runner.training_steps = 250000 # agent steps
Runner.evaluation_steps = 125000 # agent steps
Runner.max_steps_per_episode = 27000 # agent steps
WrappedReplayBuffer.replay_capacity = 1000000
WrappedReplayBuffer.batch_size = 32
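The config above is a gin file consumed by Dopamine's experiment runner. As a rough guide, a minimal launcher sketch follows, assuming the upstream Dopamine API; the config path and base_dir below are illustrative placeholders, not paths from this repository.

# Minimal launcher sketch (assumed upstream Dopamine API; paths are hypothetical).
from dopamine.discrete_domains import run_experiment

gin_files = ['dopamine/agents/dqn/configs/dqn.gin']  # hypothetical location of the config above
run_experiment.load_gin_configs(gin_files, gin_bindings=[])
runner = run_experiment.create_runner('/tmp/dqn_pong', schedule='continuous_train_and_eval')
runner.run_experiment()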

View File

@@ -1,35 +1,35 @@
# Hyperparameters for a simple DQN-style Acrobot agent. The hyperparameters
# chosen achieve reasonable performance.
import dopamine.discrete_domains.gym_lib
import dopamine.discrete_domains.run_experiment
import dopamine.agents.dqn.dqn_agent
import dopamine.replay_memory.circular_replay_buffer
import gin.tf.external_configurables
DQNAgent.observation_shape = %gym_lib.ACROBOT_OBSERVATION_SHAPE
DQNAgent.observation_dtype = %gym_lib.ACROBOT_OBSERVATION_DTYPE
DQNAgent.stack_size = %gym_lib.ACROBOT_STACK_SIZE
DQNAgent.network = @gym_lib.acrobot_dqn_network
DQNAgent.gamma = 0.99
DQNAgent.update_horizon = 1
DQNAgent.min_replay_history = 500
DQNAgent.update_period = 4
DQNAgent.target_update_period = 100
DQNAgent.epsilon_fn = @dqn_agent.identity_epsilon
DQNAgent.tf_device = '/gpu:0' # use '/cpu:*' for non-GPU version
DQNAgent.optimizer = @tf.train.AdamOptimizer()
tf.train.AdamOptimizer.learning_rate = 0.001
tf.train.AdamOptimizer.epsilon = 0.0003125
create_gym_environment.environment_name = 'Acrobot'
create_gym_environment.version = 'v1'
create_agent.agent_name = 'dqn'
Runner.create_environment_fn = @gym_lib.create_gym_environment
Runner.num_iterations = 500
Runner.training_steps = 1000
Runner.evaluation_steps = 1000
Runner.max_steps_per_episode = 500
WrappedReplayBuffer.replay_capacity = 50000
WrappedReplayBuffer.batch_size = 128

View File

@@ -1,35 +1,35 @@
# Hyperparameters for a simple DQN-style Cartpole agent. The hyperparameters
# chosen achieve reasonable performance.
import dopamine.discrete_domains.gym_lib
import dopamine.discrete_domains.run_experiment
import dopamine.agents.dqn.dqn_agent
import dopamine.replay_memory.circular_replay_buffer
import gin.tf.external_configurables
DQNAgent.observation_shape = %gym_lib.CARTPOLE_OBSERVATION_SHAPE
DQNAgent.observation_dtype = %gym_lib.CARTPOLE_OBSERVATION_DTYPE
DQNAgent.stack_size = %gym_lib.CARTPOLE_STACK_SIZE
DQNAgent.network = @gym_lib.cartpole_dqn_network
DQNAgent.gamma = 0.99
DQNAgent.update_horizon = 1
DQNAgent.min_replay_history = 500
DQNAgent.update_period = 4
DQNAgent.target_update_period = 100
DQNAgent.epsilon_fn = @dqn_agent.identity_epsilon
DQNAgent.tf_device = '/gpu:0' # use '/cpu:*' for non-GPU version
DQNAgent.optimizer = @tf.train.AdamOptimizer()
tf.train.AdamOptimizer.learning_rate = 0.001
tf.train.AdamOptimizer.epsilon = 0.0003125
create_gym_environment.environment_name = 'CartPole'
create_gym_environment.version = 'v0'
create_agent.agent_name = 'dqn'
Runner.create_environment_fn = @gym_lib.create_gym_environment
Runner.num_iterations = 500
Runner.training_steps = 1000
Runner.evaluation_steps = 1000
Runner.max_steps_per_episode = 200 # Default max episode length.
WrappedReplayBuffer.replay_capacity = 50000
WrappedReplayBuffer.batch_size = 128
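The Cartpole and Acrobot configs keep exploration fixed via dqn_agent.identity_epsilon, while the Atari configs anneal epsilon linearly over epsilon_decay_period steps. Below is a hedged sketch of both schedules; these are illustrative re-implementations, not the exact functions shipped in Dopamine.

import numpy as np

def identity_epsilon(decay_period, step, warmup_steps, epsilon):
    # Constant exploration rate, as used by the Cartpole/Acrobot configs.
    return epsilon

def linearly_decaying_epsilon(decay_period, step, warmup_steps, epsilon):
    # Linearly anneal from 1.0 down to `epsilon` over `decay_period` steps,
    # after `warmup_steps` of pure exploration (Atari-style schedule).
    steps_left = decay_period + warmup_steps - step
    bonus = (1.0 - epsilon) * steps_left / decay_period
    bonus = np.clip(bonus, 0.0, 1.0 - epsilon)
    return epsilon + bonus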

View File

@@ -1,37 +1,37 @@
# Hyperparameters used for reporting DQN results in Bellemare et al. (2017).
import dopamine.discrete_domains.atari_lib
import dopamine.discrete_domains.run_experiment
import dopamine.agents.dqn.dqn_agent
import dopamine.replay_memory.circular_replay_buffer
import gin.tf.external_configurables
DQNAgent.gamma = 0.99
DQNAgent.update_horizon = 1
DQNAgent.min_replay_history = 50000 # agent steps
DQNAgent.update_period = 4
DQNAgent.target_update_period = 10000 # agent steps
DQNAgent.epsilon_train = 0.01
DQNAgent.epsilon_eval = 0.001
DQNAgent.epsilon_decay_period = 1000000 # agent steps
DQNAgent.tf_device = '/gpu:0' # use '/cpu:*' for non-GPU version
DQNAgent.optimizer = @tf.train.RMSPropOptimizer()
tf.train.RMSPropOptimizer.learning_rate = 0.00025
tf.train.RMSPropOptimizer.decay = 0.95
tf.train.RMSPropOptimizer.momentum = 0.0
tf.train.RMSPropOptimizer.epsilon = 0.00001
tf.train.RMSPropOptimizer.centered = True
atari_lib.create_atari_environment.game_name = 'Pong'
# Deterministic ALE version used in the DQN Nature paper (Mnih et al., 2015).
atari_lib.create_atari_environment.sticky_actions = False
create_agent.agent_name = 'dqn'
Runner.num_iterations = 200
Runner.training_steps = 250000 # agent steps
Runner.evaluation_steps = 125000 # agent steps
Runner.max_steps_per_episode = 27000 # agent steps
AtariPreprocessing.terminal_on_life_loss = True
WrappedReplayBuffer.replay_capacity = 1000000
WrappedReplayBuffer.batch_size = 32

View File

@@ -1,41 +1,41 @@
# Hyperparameters used in Mnih et al. (2015).
import dopamine.discrete_domains.atari_lib
import dopamine.discrete_domains.run_experiment
import dopamine.agents.dqn.dqn_agent
import dopamine.replay_memory.circular_replay_buffer
import gin.tf.external_configurables
DQNAgent.gamma = 0.99
DQNAgent.update_horizon = 1
DQNAgent.runtype = 'RUNTYPE'
DQNAgent.game = 'GAME'
DQNAgent.min_replay_history = 50000 # agent steps
DQNAgent.update_period = 4
DQNAgent.target_update_period = 10000 # agent steps
DQNAgent.epsilon_train = 0.1
DQNAgent.epsilon_eval = 0.05
DQNAgent.epsilon_decay_period = 1000000 # agent steps
DQNAgent.tf_device = '/gpu:0' # use '/cpu:*' for non-GPU version
DQNAgent.optimizer = @tf.train.RMSPropOptimizer()
tf.train.RMSPropOptimizer.learning_rate = 0.00025
tf.train.RMSPropOptimizer.decay = 0.95
tf.train.RMSPropOptimizer.momentum = 0.0
tf.train.RMSPropOptimizer.epsilon = 0.00001
tf.train.RMSPropOptimizer.centered = True
atari_lib.create_atari_environment.game_name = 'GAME'
# Deterministic ALE version used in the DQN Nature paper (Mnih et al., 2015).
atari_lib.create_atari_environment.sticky_actions = False
create_agent.agent_name = 'dqn'
Runner.game = 'GAME'
Runner.runtype = 'RUNTYPE'
Runner.num_iterations = 200
Runner.training_steps = 250000 # agent steps
Runner.evaluation_steps = 125000 # agent steps
Runner.max_steps_per_episode = 27000 # agent steps
AtariPreprocessing.terminal_on_life_loss = True
WrappedReplayBuffer.replay_capacity = 1000000
WrappedReplayBuffer.batch_size = 32
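The two configs above run the deterministic ALE (sticky_actions = False), whereas dqn.gin enables sticky actions with repeat probability 0.25 (Machado et al., 2017). The wrapper below is only a conceptual sketch of that behaviour; in practice the stickiness is applied inside the ALE itself when sticky_actions is enabled.

import random

class StickyActionEnv(object):
    # Illustrative sketch: with probability `repeat_prob` the previously
    # executed action is repeated instead of the requested one.
    def __init__(self, env, repeat_prob=0.25):
        self.env = env
        self.repeat_prob = repeat_prob
        self.last_action = 0

    def step(self, action):
        if random.random() < self.repeat_prob:
            action = self.last_action
        self.last_action = action
        return self.env.step(action)

    def reset(self):
        self.last_action = 0
        return self.env.reset()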

File diff suppressed because it is too large.

View File

@@ -1,46 +1,46 @@
# Hyperparameters follow Dabney et al. (2018).
import dopamine.agents.fqf.fqf_agent
import dopamine.agents.rainbow.rainbow_agent
import dopamine.discrete_domains.atari_lib
import dopamine.discrete_domains.run_experiment
import dopamine.replay_memory.prioritized_replay_buffer
import gin.tf.external_configurables
FQFAgent.kappa = 1.0
FQFAgent.num_tau_samples = 32
FQFAgent.num_tau_prime_samples = 32
FQFAgent.num_quantile_samples = 32
FQFAgent.runtype = 'RUNTYPE'
FQFAgent.fqf_factor = 'FQFFACTOR'
FQFAgent.fqf_ent = 'FQFENT'
RainbowAgent.gamma = 0.99
RainbowAgent.game = 'GAME'
RainbowAgent.runtype = 'RUNTYPE'
RainbowAgent.update_horizon = 1
RainbowAgent.min_replay_history = 50000 # agent steps
RainbowAgent.update_period = 4
RainbowAgent.target_update_period = 10000 # agent steps
RainbowAgent.epsilon_train = 0.01
RainbowAgent.epsilon_eval = 0.001
RainbowAgent.epsilon_decay_period = 1000000 # agent steps
RainbowAgent.replay_scheme = 'uniform'
RainbowAgent.tf_device = '/gpu:0' # use '/cpu:*' for non-GPU version
RainbowAgent.optimizer = @tf.train.AdamOptimizer()
tf.train.AdamOptimizer.learning_rate = 0.00005
tf.train.AdamOptimizer.epsilon = 0.0003125
atari_lib.create_atari_environment.game_name = 'GAME'
atari_lib.create_atari_environment.sticky_actions = False
create_agent.agent_name = 'implicit_quantile'
Runner.num_iterations = 200
Runner.game = 'GAME'
Runner.runtype = 'RUNTYPE'
Runner.training_steps = 250000
Runner.evaluation_steps = 125000
Runner.max_steps_per_episode = 27000
AtariPreprocessing.terminal_on_life_loss = True
WrappedPrioritizedReplayBuffer.replay_capacity = 1000000
WrappedPrioritizedReplayBuffer.batch_size = 32
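This config drives the FQFAgent defined later in this diff, which trains its quantile outputs with the quantile Huber loss (Section 2.3 of the IQN paper). The NumPy sketch below shows the same computation for a single transition; the function and argument names are illustrative and do not come from this repository, whose actual implementation is the TensorFlow code in the FQFAgent class below.

import numpy as np

def quantile_huber_loss(target_q, pred_q, taus, kappa=1.0):
    # target_q: [num_tau_prime] target quantile values for one transition.
    # pred_q:   [num_tau] predicted quantile values for the chosen action.
    # taus:     [num_tau] quantile fractions corresponding to pred_q.
    errors = target_q[:, None] - pred_q[None, :]   # pairwise Bellman errors
    abs_err = np.abs(errors)
    huber = np.where(abs_err <= kappa,
                     0.5 * errors ** 2,
                     kappa * (abs_err - 0.5 * kappa))
    # Asymmetric weight |tau - 1{error < 0}| turns the Huber loss into quantile regression.
    weight = np.abs(taus[None, :] - (errors < 0).astype(np.float64))
    # Sum over predicted quantiles, average over target quantiles.
    return (weight * huber / kappa).sum(axis=1).mean()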

View File

@@ -1,410 +1,420 @@
# coding=utf-8
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import numpy as np
from dopamine.agents.rainbow import rainbow_agent
from dopamine.discrete_domains import atari_lib
import tensorflow as tf
import gin.tf
slim = tf.contrib.slim
@gin.configurable
class FQFAgent(rainbow_agent.RainbowAgent):
def __init__(self,
sess,
num_actions,
network=atari_lib.fqf_network,
kappa=1.0,
runtype=None,
fqf_factor=0.000001,
fqf_ent=0.001,
num_tau_samples=32,
num_tau_prime_samples=32,
num_quantile_samples=32,
quantile_embedding_dim=64,
double_dqn=False,
summary_writer=None,
summary_writing_frequency=500):
"""Initializes the agent and constructs the Graph.
Most of this constructor's parameters are IQN-specific hyperparameters whose
values are taken from Dabney et al. (2018).
Args:
sess: `tf.Session` object for running associated ops.
num_actions: int, number of actions the agent can take at any state.
network: function expecting three parameters:
(num_actions, network_type, state). This function will return the
network_type object containing the tensors output by the network.
See dopamine.discrete_domains.atari_lib.nature_dqn_network as
an example.
kappa: float, Huber loss cutoff.
num_tau_samples: int, number of online quantile samples for loss
estimation.
num_tau_prime_samples: int, number of target quantile samples for loss
estimation.
num_quantile_samples: int, number of quantile samples for computing
Q-values.
quantile_embedding_dim: int, embedding dimension for the quantile input.
double_dqn: boolean, whether to perform double DQN style learning
as described in Van Hasselt et al.: https://arxiv.org/abs/1509.06461.
summary_writer: SummaryWriter object for outputting training statistics.
Summary writing disabled if set to None.
summary_writing_frequency: int, frequency with which summaries will be
written. Lower values will result in slower training.
"""
print ('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>')
self._runtype = runtype
print (self._runtype)
self.fqf_factor = float(fqf_factor)
self.ent = float(fqf_ent)
self.kappa = kappa
print ('fqf factor:', self.fqf_factor)
# num_tau_samples = N below equation (3) in the paper.
self.num_tau_samples = num_tau_samples
# num_tau_prime_samples = N' below equation (3) in the paper.
self.num_tau_prime_samples = num_tau_prime_samples
# num_quantile_samples = k below equation (3) in the paper.
self.num_quantile_samples = num_quantile_samples
# quantile_embedding_dim = n above equation (4) in the paper.
self.quantile_embedding_dim = quantile_embedding_dim
# option to perform double dqn.
self.double_dqn = double_dqn
if 'adam' in self._runtype:
self.optimizer1 = tf.train.AdamOptimizer(
learning_rate=0.00005 * self.fqf_factor,
epsilon=0.0003125)
else:
self.optimizer1 = tf.train.RMSPropOptimizer(
learning_rate=0.00005 * self.fqf_factor,
decay=0.95,
momentum=0.0,
epsilon=0.00001,
centered=True)
super(FQFAgent, self).__init__(
sess=sess,
num_actions=num_actions,
network=network,
summary_writer=summary_writer,
summary_writing_frequency=summary_writing_frequency)
def _get_network_type(self):
return collections.namedtuple(
'iqn_network', ['quantile_values', 'quantiles', 'quantile_values_origin', 'quantiles_origin', 'Fv_diff', 'v_diff', 'quantile_values_mid', 'quantiles_mid', 'L_tau', 'gradient_tau', 'quantile_tau'])
def _network_template(self, state, num_quantiles):
return self.network(self.num_actions, self.quantile_embedding_dim,
self._get_network_type(), state, num_quantiles, self._runtype)
def _train_step(self):
"""Runs a single training step.
Runs a training op if both:
(1) A minimum number of frames have been added to the replay buffer.
(2) `training_steps` is a multiple of `update_period`.
Also, syncs weights from online to target network if training steps is a
multiple of target update period.
"""
# Run a train op at the rate of self.update_period if enough training steps
# have been run. This matches the Nature DQN behaviour.
if self._replay.memory.add_count > self.min_replay_history:
if self.training_steps % self.update_period == 0:
_, _, _, loss, loss1, quan_value, quan, vdiff = self._sess.run(self._train_op)
if self.training_steps % 50000 == 0:
batchsize = 32
quan_value = np.reshape(quan_value, [batchsize, self.num_tau_samples])
quan = np.reshape(quan, [batchsize, self.num_tau_samples])
quan_value = quan_value[0].tolist()
quan = quan[0].tolist()
vdiff = vdiff[:, 0].tolist()
print (">>> loss:", loss)
print (">>> loss1:", loss1)
print (">>> value:", quan_value)
print (">>> quans:", quan)
print (">>> vdiff:", vdiff)
print (">>> vdiff_sum:", np.sum(vdiff))
if (self.summary_writer is not None and
self.training_steps > 0 and
self.training_steps % self.summary_writing_frequency == 0):
summary = self._sess.run(self._merged_summaries)
self.summary_writer.add_summary(summary, self.training_steps)
if self.training_steps % self.target_update_period == 0:
self._sess.run(self._sync_qt_ops)
self.training_steps += 1
def _build_networks(self):
"""Builds the FQF computations needed for acting and training.
These are:
self.online_convnet: For computing the current state's quantile values.
self.target_convnet: For computing the next state's target quantile
values.
self._net_outputs: The actual quantile values.
self._q_argmax: The action maximizing the current state's Q-values.
self._replay_net_outputs: The replayed states' quantile values.
self._replay_next_target_net_outputs: The replayed next states' target
quantile values.
"""
# Calling online_convnet will generate a new graph as defined in
# self._get_network_template using whatever input is passed, but will always
# share the same weights.
self.online_convnet = tf.make_template('Online', self._network_template)
self.target_convnet = tf.make_template('Target', self._network_template)
# Compute the Q-values which are used for action selection in the current
# state.
self._net_outputs = self.online_convnet(self.state_ph,
self.num_quantile_samples)
# Shape of self._net_outputs.quantile_values:
# num_quantile_samples x num_actions.
# e.g. if num_actions is 2, it might look something like this:
# Vals for Quantile .2 Vals for Quantile .4 Vals for Quantile .6
# [[0.1, 0.5], [0.15, -0.3], [0.15, -0.2]]
# Q-values = [(0.1 + 0.15 + 0.15)/3, (0.5 + 0.15 + -0.2)/3].
if 'ws' in self._runtype:
self._q_values = tf.reduce_sum(self._net_outputs.quantile_values * self._net_outputs.v_diff, axis=0) #NOTE: quantile_values = quantile_values_mid
else:
self._q_values = tf.reduce_mean(self._net_outputs.quantile_values, axis=0)
self._q_argmax = tf.argmax(self._q_values, axis=0)
self._replay_net_outputs = self.online_convnet(self._replay.states,
self.num_tau_samples)
# Shape: (num_tau_samples x batch_size) x num_actions.
self._replay_net_quantile_values = self._replay_net_outputs.quantile_values
self._replay_net_quantiles = self._replay_net_outputs.quantiles
# Do the same for next states in the replay buffer.
self._replay_net_target_outputs = self.target_convnet(
self._replay.next_states, self.num_tau_prime_samples)
# Shape: (num_tau_prime_samples x batch_size) x num_actions.
vals = self._replay_net_target_outputs.quantile_values
self._replay_net_target_quantile_values = vals
# Compute Q-values which are used for action selection for the next states
# in the replay buffer. Compute the argmax over the Q-values.
if self.double_dqn:
outputs_action = self.online_convnet(self._replay.next_states,
self.num_quantile_samples)
else:
outputs_action = self.target_convnet(self._replay.next_states,
self.num_quantile_samples)
# Shape: (num_quantile_samples x batch_size) x num_actions.
target_quantile_values_action = outputs_action.quantile_values #NOTE: quantile_values = quantile_values_mid
# Shape: num_quantile_samples x batch_size x num_actions.
target_quantile_values_action = tf.reshape(target_quantile_values_action,
[self.num_quantile_samples,
self._replay.batch_size,
self.num_actions])
# Shape: batch_size x num_actions.
if 'ws' in self._runtype:
v_diff = tf.reshape(outputs_action.v_diff, [self.num_quantile_samples, self._replay.batch_size, 1])
self._replay_net_target_q_values = tf.squeeze(tf.reduce_sum(
target_quantile_values_action * v_diff, axis=0))
else:
self._replay_net_target_q_values = tf.squeeze(tf.reduce_mean(
target_quantile_values_action, axis=0))
self._replay_next_qt_argmax = tf.argmax(
self._replay_net_target_q_values, axis=1)
def _build_target_quantile_values_op(self):
"""Build an op used as a target for return values at given quantiles.
Returns:
An op calculating the target quantile return.
"""
batch_size = tf.shape(self._replay.rewards)[0]
# Shape of rewards: (num_tau_prime_samples x batch_size) x 1.
rewards = self._replay.rewards[:, None]
rewards = tf.tile(rewards, [self.num_tau_prime_samples, 1])
is_terminal_multiplier = 1. - tf.to_float(self._replay.terminals)
# Incorporate terminal state to discount factor.
# size of gamma_with_terminal: (num_tau_prime_samples x batch_size) x 1.
gamma_with_terminal = self.cumulative_gamma * is_terminal_multiplier
gamma_with_terminal = tf.tile(gamma_with_terminal[:, None],
[self.num_tau_prime_samples, 1])
# Get the indices of the maximum Q-value across the action dimension.
# Shape of replay_next_qt_argmax: (num_tau_prime_samples x batch_size) x 1.
replay_next_qt_argmax = tf.tile(
self._replay_next_qt_argmax[:, None], [self.num_tau_prime_samples, 1])
# Shape of batch_indices: (num_tau_prime_samples x batch_size) x 1.
batch_indices = tf.cast(tf.range(
self.num_tau_prime_samples * batch_size)[:, None], tf.int64)
# Shape of batch_indexed_target_values:
# (num_tau_prime_samples x batch_size) x 2.
batch_indexed_target_values = tf.concat(
[batch_indices, replay_next_qt_argmax], axis=1)
# Shape of next_target_values: (num_tau_prime_samples x batch_size) x 1.
target_quantile_values = tf.gather_nd(
self._replay_net_target_quantile_values,
batch_indexed_target_values)[:, None]
return rewards + gamma_with_terminal * target_quantile_values
def _build_train_op(self):
"""Builds a training op.
Returns:
train_op: An op performing one step of training from replay data.
"""
batch_size = tf.shape(self._replay.rewards)[0]
target_quantile_values = tf.stop_gradient(
self._build_target_quantile_values_op())
# Reshape to self.num_tau_prime_samples x batch_size x 1 since this is
# the manner in which the target_quantile_values are tiled.
target_quantile_values = tf.reshape(target_quantile_values,
[self.num_tau_prime_samples,
batch_size, 1])
# Transpose dimensions so that the dimensionality is batch_size x
# self.num_tau_prime_samples x 1 to prepare for computation of
# Bellman errors.
# Final shape of target_quantile_values:
# batch_size x num_tau_prime_samples x 1.
target_quantile_values = tf.transpose(target_quantile_values, [1, 0, 2])
# Shape of indices: (num_tau_samples x batch_size) x 1.
# Expand dimension by one so that it can be used to index into all the
# quantiles when using the tf.gather_nd function (see below).
indices = tf.range(self.num_tau_samples * batch_size)[:, None]
# Expand the dimension by one so that it can be used to index into all the
# quantiles when using the tf.gather_nd function (see below).
reshaped_actions = self._replay.actions[:, None]
reshaped_actions = tf.tile(reshaped_actions, [self.num_tau_samples, 1])
# Shape of reshaped_actions: (num_tau_samples x batch_size) x 2.
reshaped_actions = tf.concat([indices, reshaped_actions], axis=1)
chosen_action_quantile_values = tf.gather_nd(
self._replay_net_quantile_values, reshaped_actions)
print ('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>', self._replay_net_quantile_values)
# Transpose dimensions so that the dimensionality is batch_size x
# self.num_tau_samples x 1 to prepare for computation of
# Bellman errors.
# Reshape to self.num_tau_samples x batch_size x 1 since this is the manner
# in which the quantile values are tiled.
chosen_action_quantile_values = tf.reshape(chosen_action_quantile_values,
[self.num_tau_samples,
batch_size, 1])
# Final shape of chosen_action_quantile_values:
# batch_size x num_tau_samples x 1.
chosen_action_quantile_values = tf.transpose(
chosen_action_quantile_values, [1, 0, 2]) #batchsize x quan x 1
##########################################################################################
reshaped_actions1 = self._replay.actions[:, None]
reshaped_actions1 = tf.tile(reshaped_actions1, [self.num_tau_samples-1, 1])
# Shape of reshaped_actions1: (num_tau_samples-1 x batch_size) x 2.
indices1 = tf.range((self.num_tau_samples-1) * batch_size)[:, None]
reshaped_actions1 = tf.concat([indices1, reshaped_actions1], axis=1)
gradient_tau = tf.reshape(self._replay_net_outputs.gradient_tau, (-1, self.num_actions)) #31 x 32 x 18
print ('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>', gradient_tau)
gradient_tau = tf.gather_nd(
gradient_tau, reshaped_actions1)
print ('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>', gradient_tau)
chosen_action_gradient_tau = tf.reshape(gradient_tau,
[self.num_tau_samples-1,
batch_size, 1])
self.chosen_action_gradient_tau = tf.transpose(
chosen_action_gradient_tau, [1, 0, 2]) #batchsize x quan x 1 (32 x 31 x 18)
self.chosen_action_gradient_tau = self.chosen_action_gradient_tau[:,:,0] #(32 x 31)
##########################################################################################
# Shape of bellman_errors and huber_loss:
# batch_size x num_tau_prime_samples x num_tau_samples x 1.
bellman_errors = target_quantile_values[:, :, None, :] - chosen_action_quantile_values[:, None, :, :]
#if 'fqf12' in self._runtype and 'fixbugtarg' in self._runtype:
# print ("============================================================= fixbug")
# print (bellman_errors.shape, self._replay_net_outputs.v_diff.shape, self.num_tau_samples)
# bellman_errors = bellman_errors * self._replay_net_outputs.v_diff[:,:,None,None] * self.num_tau_samples
# The huber loss (see Section 2.3 of the paper) is defined via two cases:
# case_one: |bellman_errors| <= kappa
# case_two: |bellman_errors| > kappa
huber_loss_case_one = tf.to_float(
tf.abs(bellman_errors) <= self.kappa) * 0.5 * bellman_errors ** 2
huber_loss_case_two = tf.to_float(
tf.abs(bellman_errors) > self.kappa) * self.kappa * (
tf.abs(bellman_errors) - 0.5 * self.kappa)
huber_loss = huber_loss_case_one + huber_loss_case_two
# Reshape replay_quantiles to batch_size x num_tau_samples x 1
replay_quantiles = tf.reshape(
self._replay_net_quantiles, [self.num_tau_samples, batch_size, 1])
replay_quantiles = tf.transpose(replay_quantiles, [1, 0, 2]) #batchsize x quan x 1
# Tile by num_tau_prime_samples along a new dimension. Shape is now
# batch_size x num_tau_prime_samples x num_tau_samples x 1.
# These quantiles will be used for computation of the quantile huber loss
# below (see section 2.3 of the paper).
replay_quantiles = tf.to_float(tf.tile(
replay_quantiles[:, None, :, :], [1, self.num_tau_prime_samples, 1, 1]))
# Shape: batch_size x num_tau_prime_samples x num_tau_samples x 1.
quantile_huber_loss = (tf.abs(tf.stop_gradient(replay_quantiles) - tf.stop_gradient(
tf.to_float(bellman_errors < 0))) * huber_loss) / self.kappa
# Sum over current quantile value (num_tau_samples) dimension,
# average over target quantile value (num_tau_prime_samples) dimension.
# Shape: batch_size x num_tau_prime_samples x 1.
loss = tf.reduce_sum(quantile_huber_loss, axis=2)
# Shape: batch_size x 1.
loss = tf.reduce_mean(loss, axis=1)
chosen_action_L_tau = tf.gather_nd(self._replay_net_outputs.L_tau, reshaped_actions)
print (">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>", chosen_action_L_tau.shape)
loss1 = tf.reduce_mean(chosen_action_L_tau, axis=0)
print (loss1.shape)
update_priorities_op = tf.no_op()
with tf.control_dependencies([update_priorities_op]):
if self.summary_writer is not None:
with tf.variable_scope('Losses'):
tf.summary.scalar('QuantileLoss', tf.reduce_mean(loss))
iqn_params, fqf_params = [], []
params = tf.trainable_variables()
for p in params:
if 'fqf' in p.name and 'Target' not in p.name: fqf_params.append(p)
else: iqn_params.append(p)
print ("fqf_params:>>>>>>", fqf_params)
print ("iqn_params:>>>>>>", iqn_params)
#batchsize x quan
#batchsize x quan
#quan x batchsize
print ('================================================')
quantile_tau = tf.transpose(self._replay_net_outputs.quantile_tau, (1,0))
q_entropy = tf.reduce_sum(-quantile_tau * tf.log(quantile_tau), axis=1) * 0.001
#print (quantile_tau) #32x31
print ("q_entropy:", q_entropy)
print (self.chosen_action_gradient_tau) #32x31
print (fqf_params)
grads = tf.gradients(quantile_tau, fqf_params, grad_ys=self.chosen_action_gradient_tau)
print (grads)
grads_and_vars = [(grads[i], fqf_params[i]) for i in range(len(grads))]
return self.optimizer.minimize(tf.reduce_mean(loss), var_list=iqn_params), \
self.optimizer1.apply_gradients(grads_and_vars), \
self.optimizer1.minimize(self.ent * tf.reduce_mean(-q_entropy), var_list=fqf_params), \
tf.reduce_mean(loss), tf.reduce_mean(loss1), \
tf.squeeze(chosen_action_quantile_values), \
tf.squeeze(replay_quantiles[:,0,:,:]), \
self._replay_net_outputs.v_diff
# coding=utf-8
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import numpy as np
from dopamine.agents.rainbow import rainbow_agent
from dopamine.discrete_domains import atari_lib
import tensorflow as tf
import gin.tf
slim = tf.contrib.slim
@gin.configurable
class FQFAgent(rainbow_agent.RainbowAgent):
def __init__(self,
sess,
num_actions,
network=atari_lib.fqf_network,
kappa=1.0,
runtype=None,
fqf_factor=0.000001,
fqf_ent=0.001,
num_tau_samples=32,
num_tau_prime_samples=32,
num_quantile_samples=32,
quantile_embedding_dim=64,
double_dqn=False,
summary_writer=None,
summary_writing_frequency=500):
"""Initializes the agent and constructs the Graph.
Most of this constructor's parameters are IQN-specific hyperparameters whose
values are taken from Dabney et al. (2018).
Args:
sess: `tf.Session` object for running associated ops.
num_actions: int, number of actions the agent can take at any state.
network: function expecting three parameters:
(num_actions, network_type, state). This function will return the
network_type object containing the tensors output by the network.
See dopamine.discrete_domains.atari_lib.nature_dqn_network as
an example.
kappa: float, Huber loss cutoff.
num_tau_samples: int, number of online quantile samples for loss
estimation.
num_tau_prime_samples: int, number of target quantile samples for loss
estimation.
num_quantile_samples: int, number of quantile samples for computing
Q-values.
quantile_embedding_dim: int, embedding dimension for the quantile input.
double_dqn: boolean, whether to perform double DQN style learning
as described in Van Hasselt et al.: https://arxiv.org/abs/1509.06461.
summary_writer: SummaryWriter object for outputting training statistics.
Summary writing disabled if set to None.
summary_writing_frequency: int, frequency with which summaries will be
written. Lower values will result in slower training.
"""
print ('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>')
self._runtype = runtype
print (self._runtype)
self.fqf_factor = float(fqf_factor)
self.ent = float(fqf_ent)
self.kappa = kappa
print ('fqf factor:', self.fqf_factor)
# num_tau_samples = N below equation (3) in the paper.
self.num_tau_samples = num_tau_samples
# num_tau_prime_samples = N' below equation (3) in the paper.
self.num_tau_prime_samples = num_tau_prime_samples
# num_quantile_samples = k below equation (3) in the paper.
self.num_quantile_samples = num_quantile_samples
# quantile_embedding_dim = n above equation (4) in the paper.
self.quantile_embedding_dim = quantile_embedding_dim
# option to perform double dqn.
self.double_dqn = double_dqn
if 'adam' in self._runtype:
self.optimizer1 = tf.train.AdamOptimizer(
learning_rate=0.00005 * self.fqf_factor,
epsilon=0.0003125)
else:
self.optimizer1 = tf.train.RMSPropOptimizer(
learning_rate=0.00005 * self.fqf_factor,
decay=0.95,
momentum=0.0,
epsilon=0.00001,
centered=True)
super(FQFAgent, self).__init__(
sess=sess,
num_actions=num_actions,
network=network,
summary_writer=summary_writer,
summary_writing_frequency=summary_writing_frequency)
def _get_network_type(self):
return collections.namedtuple(
'iqn_network', ['quantile_values', 'quantiles', 'quantile_values_origin', 'quantiles_origin', 'Fv_diff', 'v_diff', 'quantile_values_mid', 'quantiles_mid', 'L_tau', 'gradient_tau', 'quantile_tau'])
def _network_template(self, state, num_quantiles):
return self.network(self.num_actions, self.quantile_embedding_dim,
self._get_network_type(), state, num_quantiles, self._runtype)
def _train_step(self):
"""Runs a single training step.
Runs a training op if both:
(1) A minimum number of frames have been added to the replay buffer.
(2) `training_steps` is a multiple of `update_period`.
Also, syncs weights from online to target network if training steps is a
multiple of target update period.
"""
# Run a train op at the rate of self.update_period if enough training steps
# have been run. This matches the Nature DQN behaviour.
if self._replay.memory.add_count > self.min_replay_history:
if self.training_steps % self.update_period == 0:
_, _, _, loss, loss1, quan_value, quan, vdiff = self._sess.run(self._train_op)
if self.training_steps % 50000 == 0:
batchsize = 32
quan_value = np.reshape(quan_value, [batchsize, self.num_tau_samples])
quan = np.reshape(quan, [batchsize, self.num_tau_samples])
quan_value = quan_value[0].tolist()
quan = quan[0].tolist()
vdiff = vdiff[:, 0].tolist()
print (">>> loss:", loss)
print (">>> loss1:", loss1)
print (">>> value:", quan_value)
print (">>> quans:", quan)
print (">>> vdiff:", vdiff)
print (">>> vdiff_sum:", np.sum(vdiff))
if (self.summary_writer is not None and
self.training_steps > 0 and
self.training_steps % self.summary_writing_frequency == 0):
summary = self._sess.run(self._merged_summaries)
self.summary_writer.add_summary(summary, self.training_steps)
if self.training_steps % self.target_update_period == 0:
self._sess.run(self._sync_qt_ops)
self.training_steps += 1
def _build_networks(self):
"""Builds the FQF computations needed for acting and training.
These are:
self.online_convnet: For computing the current state's quantile values.
self.target_convnet: For computing the next state's target quantile
values.
self._net_outputs: The actual quantile values.
self._q_argmax: The action maximizing the current state's Q-values.
self._replay_net_outputs: The replayed states' quantile values.
self._replay_next_target_net_outputs: The replayed next states' target
quantile values.
"""
# Calling online_convnet will generate a new graph as defined in
# self._get_network_template using whatever input is passed, but will always
# share the same weights.
self.online_convnet = tf.make_template('Online', self._network_template)
self.target_convnet = tf.make_template('Target', self._network_template)
# Compute the Q-values which are used for action selection in the current
# state.
self._net_outputs = self.online_convnet(self.state_ph,
self.num_quantile_samples)
# Shape of self._net_outputs.quantile_values:
# num_quantile_samples x num_actions.
# e.g. if num_actions is 2, it might look something like this:
# Vals for Quantile .2 Vals for Quantile .4 Vals for Quantile .6
# [[0.1, 0.5], [0.15, -0.3], [0.15, -0.2]]
# Q-values = [(0.1 + 0.15 + 0.15)/3, (0.5 + 0.15 + -0.2)/3].
if 'ws' in self._runtype:
self._q_values = tf.reduce_sum(self._net_outputs.quantile_values * self._net_outputs.v_diff, axis=0) #NOTE: quantile_values = quantile_values_mid
else:
self._q_values = tf.reduce_mean(self._net_outputs.quantile_values, axis=0)
self._q_argmax = tf.argmax(self._q_values, axis=0)
self._replay_net_outputs = self.online_convnet(self._replay.states,
self.num_tau_samples)
# Shape: (num_tau_samples x batch_size) x num_actions.
self._replay_net_quantile_values = self._replay_net_outputs.quantile_values
self._replay_net_quantiles = self._replay_net_outputs.quantiles
# Do the same for next states in the replay buffer.
self._replay_net_target_outputs = self.target_convnet(
self._replay.next_states, self.num_tau_prime_samples)
# Shape: (num_tau_prime_samples x batch_size) x num_actions.
vals = self._replay_net_target_outputs.quantile_values
self._replay_net_target_quantile_values = vals
# Compute Q-values which are used for action selection for the next states
# in the replay buffer. Compute the argmax over the Q-values.
if self.double_dqn:
outputs_action = self.online_convnet(self._replay.next_states,
self.num_quantile_samples)
else:
outputs_action = self.target_convnet(self._replay.next_states,
self.num_quantile_samples)
# Shape: (num_quantile_samples x batch_size) x num_actions.
target_quantile_values_action = outputs_action.quantile_values #NOTE: quantile_values = quantile_values_mid
# Shape: num_quantile_samples x batch_size x num_actions.
target_quantile_values_action = tf.reshape(target_quantile_values_action,
[self.num_quantile_samples,
self._replay.batch_size,
self.num_actions])
# Shape: batch_size x num_actions.
if 'ws' in self._runtype:
v_diff = tf.reshape(outputs_action.v_diff, [self.num_quantile_samples, self._replay.batch_size, 1])
self._replay_net_target_q_values = tf.squeeze(tf.reduce_sum(
target_quantile_values_action * v_diff, axis=0))
else:
self._replay_net_target_q_values = tf.squeeze(tf.reduce_mean(
target_quantile_values_action, axis=0))
self._replay_next_qt_argmax = tf.argmax(
self._replay_net_target_q_values, axis=1)
def _build_target_quantile_values_op(self):
"""Build an op used as a target for return values at given quantiles.
Returns:
An op calculating the target quantile return.
"""
batch_size = tf.shape(self._replay.rewards)[0]
# Shape of rewards: (num_tau_prime_samples x batch_size) x 1.
rewards = self._replay.rewards[:, None]
rewards = tf.tile(rewards, [self.num_tau_prime_samples, 1])
is_terminal_multiplier = 1. - tf.to_float(self._replay.terminals)
# Incorporate terminal state to discount factor.
# size of gamma_with_terminal: (num_tau_prime_samples x batch_size) x 1.
gamma_with_terminal = self.cumulative_gamma * is_terminal_multiplier
gamma_with_terminal = tf.tile(gamma_with_terminal[:, None],
[self.num_tau_prime_samples, 1])
# Get the indices of the maximum Q-value across the action dimension.
# Shape of replay_next_qt_argmax: (num_tau_prime_samples x batch_size) x 1.
replay_next_qt_argmax = tf.tile(
self._replay_next_qt_argmax[:, None], [self.num_tau_prime_samples, 1])
# Shape of batch_indices: (num_tau_prime_samples x batch_size) x 1.
batch_indices = tf.cast(tf.range(
self.num_tau_prime_samples * batch_size)[:, None], tf.int64)
# Shape of batch_indexed_target_values:
# (num_tau_prime_samples x batch_size) x 2.
batch_indexed_target_values = tf.concat(
[batch_indices, replay_next_qt_argmax], axis=1)
# Shape of next_target_values: (num_tau_prime_samples x batch_size) x 1.
target_quantile_values = tf.gather_nd(
self._replay_net_target_quantile_values,
batch_indexed_target_values)[:, None]
return rewards + gamma_with_terminal * target_quantile_values
def _build_train_op(self):
"""Builds a training op.
Returns:
train_op: An op performing one step of training from replay data.
"""
batch_size = tf.shape(self._replay.rewards)[0]
target_quantile_values = tf.stop_gradient(
self._build_target_quantile_values_op())
# Reshape to self.num_tau_prime_samples x batch_size x 1 since this is
# the manner in which the target_quantile_values are tiled.
target_quantile_values = tf.reshape(target_quantile_values,
[self.num_tau_prime_samples,
batch_size, 1])
# Transpose dimensions so that the dimensionality is batch_size x
# self.num_tau_prime_samples x 1 to prepare for computation of
# Bellman errors.
# Final shape of target_quantile_values:
# batch_size x num_tau_prime_samples x 1.
target_quantile_values = tf.transpose(target_quantile_values, [1, 0, 2])
# Shape of indices: (num_tau_samples x batch_size) x 1.
# Expand dimension by one so that it can be used to index into all the
# quantiles when using the tf.gather_nd function (see below).
indices = tf.range(self.num_tau_samples * batch_size)[:, None]
# Expand the dimension by one so that it can be used to index into all the
# quantiles when using the tf.gather_nd function (see below).
reshaped_actions = self._replay.actions[:, None]
reshaped_actions = tf.tile(reshaped_actions, [self.num_tau_samples, 1])
# Shape of reshaped_actions: (num_tau_samples x batch_size) x 2.
reshaped_actions = tf.concat([indices, reshaped_actions], axis=1)
chosen_action_quantile_values = tf.gather_nd(
self._replay_net_quantile_values, reshaped_actions)
print ('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>', self._replay_net_quantile_values)
# Transpose dimensions so that the dimensionality is batch_size x
# self.num_tau_samples x 1 to prepare for computation of
# Bellman errors.
# Reshape to self.num_tau_samples x batch_size x 1 since this is the manner
# in which the quantile values are tiled.
chosen_action_quantile_values = tf.reshape(chosen_action_quantile_values,
[self.num_tau_samples,
batch_size, 1])
# Final shape of chosen_action_quantile_values:
# batch_size x num_tau_samples x 1.
chosen_action_quantile_values = tf.transpose(
chosen_action_quantile_values, [1, 0, 2]) #batchsize x quan x 1
##########################################################################################
reshaped_actions1 = self._replay.actions[:, None]
reshaped_actions1 = tf.tile(reshaped_actions1, [self.num_tau_samples-1, 1])
# Shape of reshaped_actions1: (num_tau_samples-1 x batch_size) x 2.
indices1 = tf.range((self.num_tau_samples-1) * batch_size)[:, None]
reshaped_actions1 = tf.concat([indices1, reshaped_actions1], axis=1)
gradient_tau = tf.reshape(self._replay_net_outputs.gradient_tau, (-1, self.num_actions)) #31 x 32 x 18
print ('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>', gradient_tau)
gradient_tau = tf.gather_nd(
gradient_tau, reshaped_actions1)
print ('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>', gradient_tau)
chosen_action_gradient_tau = tf.reshape(gradient_tau,
[self.num_tau_samples-1,
batch_size, 1])
self.chosen_action_gradient_tau = tf.transpose(
chosen_action_gradient_tau, [1, 0, 2]) #batchsize x quan x 1 (32 x 31 x 18)
self.chosen_action_gradient_tau = self.chosen_action_gradient_tau[:,:,0] #(32 x 31)
##########################################################################################
# Shape of bellman_errors and huber_loss:
# batch_size x num_tau_prime_samples x num_tau_samples x 1.
bellman_errors = target_quantile_values[:, :, None, :] - chosen_action_quantile_values[:, None, :, :]
#if 'fqf12' in self._runtype and 'fixbugtarg' in self._runtype:
# print ("============================================================= fixbug")
# print (bellman_errors.shape, self._replay_net_outputs.v_diff.shape, self.num_tau_samples)
# bellman_errors = bellman_errors * self._replay_net_outputs.v_diff[:,:,None,None] * self.num_tau_samples
# The huber loss (see Section 2.3 of the paper) is defined via two cases:
# case_one: |bellman_errors| <= kappa
# case_two: |bellman_errors| > kappa
huber_loss_case_one = tf.to_float(
tf.abs(bellman_errors) <= self.kappa) * 0.5 * bellman_errors ** 2
huber_loss_case_two = tf.to_float(
tf.abs(bellman_errors) > self.kappa) * self.kappa * (
tf.abs(bellman_errors) - 0.5 * self.kappa)
huber_loss = huber_loss_case_one + huber_loss_case_two
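# Equivalently: L_kappa(u) = 0.5 * u**2 if |u| <= kappa,
#               else kappa * (|u| - 0.5 * kappa).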
# Reshape replay_quantiles to batch_size x num_tau_samples x 1
replay_quantiles = tf.reshape(
self._replay_net_quantiles, [self.num_tau_samples, batch_size, 1])
replay_quantiles = tf.transpose(replay_quantiles, [1, 0, 2])  # batch_size x num_tau_samples x 1
# Tile by num_tau_prime_samples along a new dimension. Shape is now
# batch_size x num_tau_prime_samples x num_tau_samples x 1.
# These quantiles will be used for computation of the quantile huber loss
# below (see section 2.3 of the paper).
replay_quantiles = tf.to_float(tf.tile(
replay_quantiles[:, None, :, :], [1, self.num_tau_prime_samples, 1, 1]))
# Shape: batch_size x num_tau_prime_samples x num_tau_samples x 1.
quantile_huber_loss = (tf.abs(tf.stop_gradient(replay_quantiles) - tf.stop_gradient(
tf.to_float(bellman_errors < 0))) * huber_loss) / self.kappa
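# This is the quantile Huber loss of Section 2.3,
# rho_tau^kappa(u) = |tau - 1{u < 0}| * L_kappa(u) / kappa.
# Note that both the quantiles and the indicator are wrapped in stop_gradient
# above, so this loss does not backpropagate into the proposed fractions.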
# Sum over current quantile value (num_tau_samples) dimension,
# average over target quantile value (num_tau_prime_samples) dimension.
# Shape: batch_size x num_tau_prime_samples x 1.
loss = tf.reduce_sum(quantile_huber_loss, axis=2)
# Shape: batch_size x 1.
loss = tf.reduce_mean(loss, axis=1)
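# The scalar minimized for the quantile network below is tf.reduce_mean(loss),
# i.e. the quantile Huber loss averaged over the batch, restricted to
# iqn_params.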
chosen_action_L_tau = tf.gather_nd(self._replay_net_outputs.L_tau, reshaped_actions)
print('chosen_action_L_tau shape:', chosen_action_L_tau.shape)
loss1 = tf.reduce_mean(chosen_action_L_tau, axis=0)
print('loss1 shape:', loss1.shape)
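# loss1 is the mean of the network's L_tau head for the chosen actions; it is
# used as the fraction-proposal loss when the 'sqloss' runtype is selected
# below.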
update_priorities_op = tf.no_op()
with tf.control_dependencies([update_priorities_op]):
if self.summary_writer is not None:
with tf.variable_scope('Losses'):
tf.summary.scalar('QuantileLoss', tf.reduce_mean(loss))
iqn_params, fqf_params = [], []
params = tf.trainable_variables()
for p in params:
if 'fqf' in p.name and 'Target' not in p.name: fqf_params.append(p)
else: iqn_params.append(p)
print ("fqf_params:>>>>>>", fqf_params)
print ("iqn_params:>>>>>>", iqn_params)
# self._replay_net_outputs.quantile_tau has shape (num_tau_samples-1) x
# batch_size; it is transposed below to batch_size x (num_tau_samples-1).
quantile_tau = tf.transpose(self._replay_net_outputs.quantile_tau, (1, 0))
q_entropy = tf.reduce_sum(-quantile_tau * tf.log(quantile_tau), axis=1) * 0.001
print('q_entropy:', q_entropy)
print('chosen_action_gradient_tau:', self.chosen_action_gradient_tau)
grads = tf.gradients(quantile_tau, fqf_params, grad_ys=self.chosen_action_gradient_tau)
print('fqf gradients:', grads)
grads_and_vars = [(grads[i], fqf_params[i]) for i in range(len(grads))]
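# Chain rule via grad_ys: each (grad, var) pair combines
# chosen_action_gradient_tau with d(quantile_tau)/d(var). Under the 'directBP'
# runtype these gradients are applied with optimizer1.apply_gradients, together
# with an entropy term on the proposed fractions; under 'sqloss' the fqf
# parameters are trained on loss1 instead.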
if 'sqloss' in self._runtype:
print ('use sqloss')
return self.optimizer.minimize(tf.reduce_mean(loss), var_list=iqn_params), \
self.optimizer1.minimize(tf.reduce_mean(loss1), var_list=fqf_params), \
tf.reduce_mean(loss), tf.reduce_mean(loss1), \
tf.squeeze(chosen_action_quantile_values), \
tf.squeeze(replay_quantiles[:,0,:,:]), \
self._replay_net_outputs.v_diff
else:
print ('use directBP')
return self.optimizer.minimize(tf.reduce_mean(loss), var_list=iqn_params), \
self.optimizer1.apply_gradients(grads_and_vars), \
self.optimizer1.minimize(self.ent * tf.reduce_mean(-q_entropy), var_list=fqf_params), \
tf.reduce_mean(loss), tf.reduce_mean(loss1), \
tf.squeeze(chosen_action_quantile_values), \
tf.squeeze(replay_quantiles[:,0,:,:]), \
self._replay_net_outputs.v_diff


@@ -1,15 +1,15 @@
# coding=utf-8
# Copyright 2018 The Dopamine Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


@@ -1,46 +1,46 @@
# Hyperparameters follow Dabney et al. (2018), but we modify as necessary to
# match those used in Rainbow (Hessel et al., 2018), to ensure apples-to-apples
# comparison.
import dopamine.agents.implicit_quantile.implicit_quantile_agent
import dopamine.agents.rainbow.rainbow_agent
import dopamine.discrete_domains.atari_lib
import dopamine.discrete_domains.run_experiment
import dopamine.replay_memory.prioritized_replay_buffer
import gin.tf.external_configurables
ImplicitQuantileAgent.kappa = 1.0
ImplicitQuantileAgent.num_tau_samples = 32
ImplicitQuantileAgent.num_tau_prime_samples = 32
ImplicitQuantileAgent.num_quantile_samples = 32
ImplicitQuantileAgent.runtype = 'RUNTYPE'
RainbowAgent.gamma = 0.99
RainbowAgent.game = 'GAME'
RainbowAgent.runtype = 'RUNTYPE'
RainbowAgent.update_horizon = 1
RainbowAgent.min_replay_history = 20000 # agent steps
RainbowAgent.update_period = 4
RainbowAgent.target_update_period = 8000 # agent steps
RainbowAgent.epsilon_train = 0.01
RainbowAgent.epsilon_eval = 0.001
RainbowAgent.epsilon_decay_period = 250000 # agent steps
# IQN currently does not support prioritized replay.
RainbowAgent.replay_scheme = 'uniform'
RainbowAgent.tf_device = '/gpu:0' # '/cpu:*' use for non-GPU version
RainbowAgent.optimizer = @tf.train.AdamOptimizer()
tf.train.AdamOptimizer.learning_rate = 0.00005
tf.train.AdamOptimizer.epsilon = 0.0003125
atari_lib.create_atari_environment.game_name = 'GAME'
# Sticky actions with probability 0.25, as suggested by (Machado et al., 2017).
atari_lib.create_atari_environment.sticky_actions = True
create_agent.agent_name = 'implicit_quantile'
Runner.num_iterations = 200
Runner.game = 'GAME'
Runner.runtype = 'RUNTYPE'
Runner.training_steps = 250000
Runner.evaluation_steps = 125000
Runner.max_steps_per_episode = 27000
WrappedPrioritizedReplayBuffer.replay_capacity = 1000000
WrappedPrioritizedReplayBuffer.batch_size = 32
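# Note: 'GAME' and 'RUNTYPE' above are placeholders, presumably substituted by
# the experiment launch script before this gin file is parsed.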


@@ -1,43 +1,43 @@
# Hyperparameters follow Dabney et al. (2018).
import dopamine.agents.implicit_quantile.implicit_quantile_agent
import dopamine.agents.rainbow.rainbow_agent
import dopamine.discrete_domains.atari_lib
import dopamine.discrete_domains.run_experiment
import dopamine.replay_memory.prioritized_replay_buffer
import gin.tf.external_configurables
ImplicitQuantileAgent.kappa = 1.0
ImplicitQuantileAgent.num_tau_samples = 32
ImplicitQuantileAgent.num_tau_prime_samples = 32
ImplicitQuantileAgent.num_quantile_samples = 32
RainbowAgent.gamma = 0.99
RainbowAgent.game = 'GAME'
RainbowAgent.runtype = 'RUNTYPE'
RainbowAgent.update_horizon = 1
RainbowAgent.min_replay_history = 50000 # agent steps
RainbowAgent.update_period = 4
RainbowAgent.target_update_period = 10000 # agent steps
RainbowAgent.epsilon_train = 0.01
RainbowAgent.epsilon_eval = 0.001
RainbowAgent.epsilon_decay_period = 1000000 # agent steps
RainbowAgent.replay_scheme = 'uniform'
RainbowAgent.tf_device = '/gpu:0' # '/cpu:*' use for non-GPU version
RainbowAgent.optimizer = @tf.train.AdamOptimizer()
tf.train.AdamOptimizer.learning_rate = 0.00005
tf.train.AdamOptimizer.epsilon = 0.0003125
atari_lib.create_atari_environment.game_name = 'GAME'
atari_lib.create_atari_environment.sticky_actions = False
create_agent.agent_name = 'implicit_quantile'
Runner.num_iterations = 200
Runner.game = 'GAME'
Runner.runtype = 'RUNTYPE'
Runner.training_steps = 250000
Runner.evaluation_steps = 125000
Runner.max_steps_per_episode = 27000
AtariPreprocessing.terminal_on_life_loss = True
WrappedPrioritizedReplayBuffer.replay_capacity = 1000000
WrappedPrioritizedReplayBuffer.batch_size = 32
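# Note: unlike the config above, this one follows the deterministic ALE
# evaluation protocol (sticky_actions = False, terminal_on_life_loss = True)
# and uses the original IQN replay and target-update settings
# (min_replay_history = 50000, target_update_period = 10000).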


@@ -1,348 +1,348 @@
# coding=utf-8
# Copyright 2018 The Dopamine Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The implicit quantile networks (IQN) agent.
The agent follows the description given in "Implicit Quantile Networks for
Distributional RL" (Dabney et. al, 2018).
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import numpy as np
from dopamine.agents.rainbow import rainbow_agent
from dopamine.discrete_domains import atari_lib
import tensorflow as tf
import gin.tf
slim = tf.contrib.slim
@gin.configurable
class ImplicitQuantileAgent(rainbow_agent.RainbowAgent):
"""An extension of Rainbow to perform implicit quantile regression."""
def __init__(self,
sess,
num_actions,
network=atari_lib.implicit_quantile_network,
kappa=1.0,
num_tau_samples=32,
num_tau_prime_samples=32,
num_quantile_samples=32,
quantile_embedding_dim=64,
double_dqn=False,
summary_writer=None,
summary_writing_frequency=500):
"""Initializes the agent and constructs the Graph.
Most of this constructor's parameters are IQN-specific hyperparameters whose
values are taken from Dabney et al. (2018).
Args:
sess: `tf.Session` object for running associated ops.
num_actions: int, number of actions the agent can take at any state.
network: function expecting three parameters:
(num_actions, network_type, state). This function will return the
network_type object containing the tensors output by the network.
See dopamine.discrete_domains.atari_lib.nature_dqn_network as
an example.
kappa: float, Huber loss cutoff.
num_tau_samples: int, number of online quantile samples for loss
estimation.
num_tau_prime_samples: int, number of target quantile samples for loss
estimation.
num_quantile_samples: int, number of quantile samples for computing
Q-values.
quantile_embedding_dim: int, embedding dimension for the quantile input.
double_dqn: boolean, whether to perform double DQN style learning
as described in Van Hasselt et al.: https://arxiv.org/abs/1509.06461.
summary_writer: SummaryWriter object for outputting training statistics.
Summary writing disabled if set to None.
summary_writing_frequency: int, frequency with which summaries will be
written. Lower values will result in slower training.
"""
self.kappa = kappa
# num_tau_samples = N below equation (3) in the paper.
self.num_tau_samples = num_tau_samples
# num_tau_prime_samples = N' below equation (3) in the paper.
self.num_tau_prime_samples = num_tau_prime_samples
# num_quantile_samples = k below equation (3) in the paper.
self.num_quantile_samples = num_quantile_samples
# quantile_embedding_dim = n above equation (4) in the paper.
self.quantile_embedding_dim = quantile_embedding_dim
# option to perform double dqn.
self.double_dqn = double_dqn
super(ImplicitQuantileAgent, self).__init__(
sess=sess,
num_actions=num_actions,
network=network,
summary_writer=summary_writer,
summary_writing_frequency=summary_writing_frequency)
def _get_network_type(self):
"""Returns the type of the outputs of the implicit quantile network.
Returns:
_network_type object defining the outputs of the network.
"""
return collections.namedtuple(
'iqn_network', ['quantile_values', 'quantiles'])
def _network_template(self, state, num_quantiles):
r"""Builds an Implicit Quantile ConvNet.
Takes state and quantile as inputs and outputs state-action quantile values.
Args:
state: A `tf.placeholder` for the RL state.
num_quantiles: int, number of quantile inputs.
Returns:
_network_type object containing quantile value outputs of the network.
"""
return self.network(self.num_actions, self.quantile_embedding_dim,
self._get_network_type(), state, num_quantiles)
def _train_step(self):
"""Runs a single training step.
Runs a training op if both:
(1) A minimum number of frames have been added to the replay buffer.
(2) `training_steps` is a multiple of `update_period`.
Also, syncs weights from online to target network if training steps is a
multiple of target update period.
"""
# Run a train op at the rate of self.update_period if enough training steps
# have been run. This matches the Nature DQN behaviour.
if self._replay.memory.add_count > self.min_replay_history:
if self.training_steps % self.update_period == 0:
self._sess.run(self._train_op)
if (self.summary_writer is not None and
self.training_steps > 0 and
self.training_steps % self.summary_writing_frequency == 0):
summary = self._sess.run(self._merged_summaries)
self.summary_writer.add_summary(summary, self.training_steps)
if self.training_steps % self.target_update_period == 0:
self._sess.run(self._sync_qt_ops)
self.training_steps += 1
def _build_networks(self):
"""Builds the IQN computations needed for acting and training.
These are:
self.online_convnet: For computing the current state's quantile values.
self.target_convnet: For computing the next state's target quantile
values.
self._net_outputs: The actual quantile values.
self._q_argmax: The action maximizing the current state's Q-values.
self._replay_net_outputs: The replayed states' quantile values.
self._replay_next_target_net_outputs: The replayed next states' target
quantile values.
"""
# Calling online_convnet will generate a new graph as defined in
# self._get_network_template using whatever input is passed, but will always
# share the same weights.
self.online_convnet = tf.make_template('Online', self._network_template)
self.target_convnet = tf.make_template('Target', self._network_template)
# Compute the Q-values which are used for action selection in the current
# state.
self._net_outputs = self.online_convnet(self.state_ph,
self.num_quantile_samples)
# Shape of self._net_outputs.quantile_values:
# num_quantile_samples x num_actions.
# e.g. if num_actions is 2, it might look something like this:
# Vals for Quantile .2 Vals for Quantile .4 Vals for Quantile .6
# [[0.1, 0.5], [0.15, -0.3], [0.15, -0.2]]
# Q-values = [(0.1 + 0.15 + 0.15)/3, (0.5 + 0.15 + -0.2)/3].
self._q_values = tf.reduce_mean(self._net_outputs.quantile_values, axis=0)
self._q_argmax = tf.argmax(self._q_values, axis=0)
self._replay_net_outputs = self.online_convnet(self._replay.states,
self.num_tau_samples)
# Shape: (num_tau_samples x batch_size) x num_actions.
self._replay_net_quantile_values = self._replay_net_outputs.quantile_values
self._replay_net_quantiles = self._replay_net_outputs.quantiles
# Do the same for next states in the replay buffer.
self._replay_net_target_outputs = self.target_convnet(
self._replay.next_states, self.num_tau_prime_samples)
# Shape: (num_tau_prime_samples x batch_size) x num_actions.
vals = self._replay_net_target_outputs.quantile_values
self._replay_net_target_quantile_values = vals
# Compute Q-values which are used for action selection for the next states
# in the replay buffer. Compute the argmax over the Q-values.
if self.double_dqn:
outputs_action = self.online_convnet(self._replay.next_states,
self.num_quantile_samples)
else:
outputs_action = self.target_convnet(self._replay.next_states,
self.num_quantile_samples)
# Shape: (num_quantile_samples x batch_size) x num_actions.
target_quantile_values_action = outputs_action.quantile_values
# Shape: num_quantile_samples x batch_size x num_actions.
target_quantile_values_action = tf.reshape(target_quantile_values_action,
[self.num_quantile_samples,
self._replay.batch_size,
self.num_actions])
# Shape: batch_size x num_actions.
self._replay_net_target_q_values = tf.squeeze(tf.reduce_mean(
target_quantile_values_action, axis=0))
self._replay_next_qt_argmax = tf.argmax(
self._replay_net_target_q_values, axis=1)
def _build_target_quantile_values_op(self):
"""Build an op used as a target for return values at given quantiles.
Returns:
An op calculating the target quantile return.
"""
batch_size = tf.shape(self._replay.rewards)[0]
# Shape of rewards: (num_tau_prime_samples x batch_size) x 1.
rewards = self._replay.rewards[:, None]
rewards = tf.tile(rewards, [self.num_tau_prime_samples, 1])
is_terminal_multiplier = 1. - tf.to_float(self._replay.terminals)
# Incorporate terminal state to discount factor.
# size of gamma_with_terminal: (num_tau_prime_samples x batch_size) x 1.
gamma_with_terminal = self.cumulative_gamma * is_terminal_multiplier
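# self.cumulative_gamma is the n-step discount gamma**update_horizon, so
# terminal transitions contribute no bootstrap term.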
gamma_with_terminal = tf.tile(gamma_with_terminal[:, None],
[self.num_tau_prime_samples, 1])
# Get the indices of the maximum Q-value across the action dimension.
# Shape of replay_next_qt_argmax: (num_tau_prime_samples x batch_size) x 1.
replay_next_qt_argmax = tf.tile(
self._replay_next_qt_argmax[:, None], [self.num_tau_prime_samples, 1])
# Shape of batch_indices: (num_tau_prime_samples x batch_size) x 1.
batch_indices = tf.cast(tf.range(
self.num_tau_prime_samples * batch_size)[:, None], tf.int64)
# Shape of batch_indexed_target_values:
# (num_tau_prime_samples x batch_size) x 2.
batch_indexed_target_values = tf.concat(
[batch_indices, replay_next_qt_argmax], axis=1)
# Shape of next_target_values: (num_tau_prime_samples x batch_size) x 1.
target_quantile_values = tf.gather_nd(
self._replay_net_target_quantile_values,
batch_indexed_target_values)[:, None]
return rewards + gamma_with_terminal * target_quantile_values
def _build_train_op(self):
"""Builds a training op.
Returns:
train_op: An op performing one step of training from replay data.
"""
batch_size = tf.shape(self._replay.rewards)[0]
target_quantile_values = tf.stop_gradient(
self._build_target_quantile_values_op())
# Reshape to self.num_tau_prime_samples x batch_size x 1 since this is
# the manner in which the target_quantile_values are tiled.
target_quantile_values = tf.reshape(target_quantile_values,
[self.num_tau_prime_samples,
batch_size, 1])
# Transpose dimensions so that the dimensionality is batch_size x
# self.num_tau_prime_samples x 1 to prepare for computation of
# Bellman errors.
# Final shape of target_quantile_values:
# batch_size x num_tau_prime_samples x 1.
target_quantile_values = tf.transpose(target_quantile_values, [1, 0, 2])
# Shape of indices: (num_tau_samples x batch_size) x 1.
# Expand dimension by one so that it can be used to index into all the
# quantiles when using the tf.gather_nd function (see below).
indices = tf.range(self.num_tau_samples * batch_size)[:, None]
# Expand the dimension by one so that it can be used to index into all the
# quantiles when using the tf.gather_nd function (see below).
reshaped_actions = self._replay.actions[:, None]
reshaped_actions = tf.tile(reshaped_actions, [self.num_tau_samples, 1])
# Shape of reshaped_actions: (num_tau_samples x batch_size) x 2.
reshaped_actions = tf.concat([indices, reshaped_actions], axis=1)
chosen_action_quantile_values = tf.gather_nd(
self._replay_net_quantile_values, reshaped_actions)
# Transpose dimensions so that the dimensionality is batch_size x
# self.num_tau_samples x 1 to prepare for computation of
# Bellman errors.
# Reshape to self.num_tau_samples x batch_size x 1 since this is the manner
# in which the quantile values are tiled.
chosen_action_quantile_values = tf.reshape(chosen_action_quantile_values,
[self.num_tau_samples,
batch_size, 1])
# Final shape of chosen_action_quantile_values:
# batch_size x num_tau_samples x 1.
chosen_action_quantile_values = tf.transpose(
chosen_action_quantile_values, [1, 0, 2])  # batch_size x num_tau_samples x 1
# Shape of bellman_errors and huber_loss:
# batch_size x num_tau_prime_samples x num_tau_samples x 1.
bellman_errors = target_quantile_values[:, :, None, :] - chosen_action_quantile_values[:, None, :, :]
# The huber loss (see Section 2.3 of the paper) is defined via two cases:
# case_one: |bellman_errors| <= kappa
# case_two: |bellman_errors| > kappa
huber_loss_case_one = tf.to_float(
tf.abs(bellman_errors) <= self.kappa) * 0.5 * bellman_errors ** 2
huber_loss_case_two = tf.to_float(
tf.abs(bellman_errors) > self.kappa) * self.kappa * (
tf.abs(bellman_errors) - 0.5 * self.kappa)
huber_loss = huber_loss_case_one + huber_loss_case_two
# Reshape replay_quantiles to batch_size x num_tau_samples x 1
replay_quantiles = tf.reshape(
self._replay_net_quantiles, [self.num_tau_samples, batch_size, 1])
replay_quantiles = tf.transpose(replay_quantiles, [1, 0, 2])  # batch_size x num_tau_samples x 1
# Tile by num_tau_prime_samples along a new dimension. Shape is now
# batch_size x num_tau_prime_samples x num_tau_samples x 1.
# These quantiles will be used for computation of the quantile huber loss
# below (see section 2.3 of the paper).
replay_quantiles = tf.to_float(tf.tile(
replay_quantiles[:, None, :, :], [1, self.num_tau_prime_samples, 1, 1]))
# Shape: batch_size x num_tau_prime_samples x num_tau_samples x 1.
quantile_huber_loss = (tf.abs(tf.stop_gradient(replay_quantiles) - tf.stop_gradient(
tf.to_float(bellman_errors < 0))) * huber_loss) / self.kappa
# Sum over current quantile value (num_tau_samples) dimension,
# average over target quantile value (num_tau_prime_samples) dimension.
# Shape: batch_size x num_tau_prime_samples x 1.
loss = tf.reduce_sum(quantile_huber_loss, axis=2)
# Shape: batch_size x 1.
loss = tf.reduce_mean(loss, axis=1)
# TODO(kumasaurabh): Add prioritized replay functionality here.
update_priorities_op = tf.no_op()
with tf.control_dependencies([update_priorities_op]):
if self.summary_writer is not None:
with tf.variable_scope('Losses'):
tf.summary.scalar('QuantileLoss', tf.reduce_mean(loss))
return self.optimizer.minimize(tf.reduce_mean(loss))


@@ -1,15 +1,15 @@
# coding=utf-8
# Copyright 2018 The Dopamine Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


@@ -1,42 +1,42 @@
# Hyperparameters follow the settings from Bellemare et al. (2017), but we
# modify as necessary to match those used in Rainbow (Hessel et al., 2018), to
# ensure apples-to-apples comparison.
import dopamine.agents.rainbow.rainbow_agent
import dopamine.discrete_domains.atari_lib
import dopamine.discrete_domains.run_experiment
import dopamine.replay_memory.prioritized_replay_buffer
import gin.tf.external_configurables
RainbowAgent.num_atoms = 51
RainbowAgent.dueltype = 'DUELTYPE'
RainbowAgent.game = 'GAME'
RainbowAgent.vmax = 10.
RainbowAgent.gamma = 0.99
RainbowAgent.update_horizon = 1
RainbowAgent.min_replay_history = 20000 # agent steps
RainbowAgent.update_period = 4
RainbowAgent.target_update_period = 8000 # agent steps
RainbowAgent.epsilon_train = 0.01
RainbowAgent.epsilon_eval = 0.001
RainbowAgent.epsilon_decay_period = 250000 # agent steps
RainbowAgent.replay_scheme = 'uniform'
RainbowAgent.tf_device = '/gpu:0' # use '/cpu:*' for non-GPU version
RainbowAgent.optimizer = @tf.train.AdamOptimizer()
tf.train.AdamOptimizer.learning_rate = 0.00025
tf.train.AdamOptimizer.epsilon = 0.0003125
atari_lib.create_atari_environment.game_name = 'GAME' #'Pong'
# Sticky actions with probability 0.25, as suggested by (Machado et al., 2017).
#atari_lib.create_atari_environment.sticky_actions = True
atari_lib.create_atari_environment.sticky_actions = False
create_agent.agent_name = 'rainbow'
Runner.num_iterations = 200
Runner.dueltype = 'DUELTYPE'
Runner.game = 'GAME'
Runner.training_steps = 250000 # agent steps
Runner.evaluation_steps = 125000 # agent steps
Runner.max_steps_per_episode = 27000 # agent steps
WrappedPrioritizedReplayBuffer.replay_capacity = 1000000
WrappedPrioritizedReplayBuffer.batch_size = 32


@@ -1,39 +1,39 @@
# Hyperparameters for a simple C51-style Acrobot agent. The hyperparameters
# chosen achieve reasonable performance.
import dopamine.agents.dqn.dqn_agent
import dopamine.agents.rainbow.rainbow_agent
import dopamine.discrete_domains.gym_lib
import dopamine.discrete_domains.run_experiment
import dopamine.replay_memory.prioritized_replay_buffer
import gin.tf.external_configurables
RainbowAgent.observation_shape = %gym_lib.ACROBOT_OBSERVATION_SHAPE
RainbowAgent.observation_dtype = %gym_lib.ACROBOT_OBSERVATION_DTYPE
RainbowAgent.stack_size = %gym_lib.ACROBOT_STACK_SIZE
RainbowAgent.network = @gym_lib.acrobot_rainbow_network
RainbowAgent.num_atoms = 51
RainbowAgent.vmax = 10.
RainbowAgent.gamma = 0.99
RainbowAgent.update_horizon = 1
RainbowAgent.min_replay_history = 500
RainbowAgent.update_period = 4
RainbowAgent.target_update_period = 100
RainbowAgent.epsilon_fn = @dqn_agent.identity_epsilon
RainbowAgent.replay_scheme = 'uniform'
RainbowAgent.tf_device = '/gpu:0' # use '/cpu:*' for non-GPU version
RainbowAgent.optimizer = @tf.train.AdamOptimizer()
tf.train.AdamOptimizer.learning_rate = 0.1
tf.train.AdamOptimizer.epsilon = 0.0003125
create_gym_environment.environment_name = 'Acrobot'
create_gym_environment.version = 'v1'
create_agent.agent_name = 'rainbow'
Runner.create_environment_fn = @gym_lib.create_gym_environment
Runner.num_iterations = 500
Runner.training_steps = 1000
Runner.evaluation_steps = 1000
Runner.max_steps_per_episode = 500
WrappedPrioritizedReplayBuffer.replay_capacity = 50000
WrappedPrioritizedReplayBuffer.batch_size = 128


@@ -1,39 +1,39 @@
# Hyperparameters for a simple C51-style Cartpole agent. The hyperparameters
# chosen achieve reasonable performance.
import dopamine.agents.dqn.dqn_agent
import dopamine.agents.rainbow.rainbow_agent
import dopamine.discrete_domains.gym_lib
import dopamine.discrete_domains.run_experiment
import dopamine.replay_memory.prioritized_replay_buffer
import gin.tf.external_configurables
RainbowAgent.observation_shape = %gym_lib.CARTPOLE_OBSERVATION_SHAPE
RainbowAgent.observation_dtype = %gym_lib.CARTPOLE_OBSERVATION_DTYPE
RainbowAgent.stack_size = %gym_lib.CARTPOLE_STACK_SIZE
RainbowAgent.network = @gym_lib.cartpole_rainbow_network
RainbowAgent.num_atoms = 51
RainbowAgent.vmax = 10.
RainbowAgent.gamma = 0.99
RainbowAgent.update_horizon = 1
RainbowAgent.min_replay_history = 500
RainbowAgent.update_period = 4
RainbowAgent.target_update_period = 100
RainbowAgent.epsilon_fn = @dqn_agent.identity_epsilon
RainbowAgent.replay_scheme = 'uniform'
RainbowAgent.tf_device = '/gpu:0' # use '/cpu:*' for non-GPU version
RainbowAgent.optimizer = @tf.train.AdamOptimizer()
tf.train.AdamOptimizer.learning_rate = 0.001
tf.train.AdamOptimizer.epsilon = 0.0003125
create_gym_environment.environment_name = 'CartPole'
create_gym_environment.version = 'v0'
create_agent.agent_name = 'rainbow'
Runner.create_environment_fn = @gym_lib.create_gym_environment
Runner.num_iterations = 500
Runner.training_steps = 1000
Runner.evaluation_steps = 1000
Runner.max_steps_per_episode = 200 # Default max episode length.
WrappedPrioritizedReplayBuffer.replay_capacity = 50000
WrappedPrioritizedReplayBuffer.batch_size = 128


@@ -1,41 +1,41 @@
# Hyperparameters used in Bellemare et al. (2017).
import dopamine.agents.rainbow.rainbow_agent
import dopamine.discrete_domains.atari_lib
import dopamine.discrete_domains.run_experiment
import dopamine.replay_memory.prioritized_replay_buffer
import gin.tf.external_configurables
RainbowAgent.num_atoms = 51
RainbowAgent.dueltype = 'DUELTYPE'
RainbowAgent.game = 'GAME'
RainbowAgent.vmax = 10.
RainbowAgent.gamma = 0.99
RainbowAgent.update_horizon = 1
RainbowAgent.min_replay_history = 50000 # agent steps
RainbowAgent.update_period = 4
RainbowAgent.target_update_period = 10000 # agent steps
RainbowAgent.epsilon_train = 0.01
RainbowAgent.epsilon_eval = 0.001
RainbowAgent.epsilon_decay_period = 1000000 # agent steps
RainbowAgent.replay_scheme = 'uniform'
RainbowAgent.tf_device = '/gpu:0' # use '/cpu:*' for non-GPU version
RainbowAgent.optimizer = @tf.train.AdamOptimizer()
tf.train.AdamOptimizer.learning_rate = 0.00025
tf.train.AdamOptimizer.epsilon = 0.0003125
atari_lib.create_atari_environment.game_name = 'GAME' #'Pong'
# Deterministic ALE version used in the DQN Nature paper (Mnih et al., 2015).
atari_lib.create_atari_environment.sticky_actions = False
create_agent.agent_name = 'rainbow'
Runner.num_iterations = 200
Runner.dueltype = 'DUELTYPE'
Runner.game = 'GAME'
Runner.training_steps = 250000 # agent steps
Runner.evaluation_steps = 125000 # agent steps
Runner.max_steps_per_episode = 27000 # agent steps
AtariPreprocessing.terminal_on_life_loss = True
WrappedPrioritizedReplayBuffer.replay_capacity = 1000000
WrappedPrioritizedReplayBuffer.batch_size = 32


@@ -1,42 +1,42 @@
# Hyperparameters follow Hessel et al. (2018), except for sticky_actions,
# which was False (not using sticky actions) in the original paper.
import dopamine.agents.rainbow.rainbow_agent
import dopamine.discrete_domains.atari_lib
import dopamine.discrete_domains.run_experiment
import dopamine.replay_memory.prioritized_replay_buffer
import gin.tf.external_configurables
RainbowAgent.num_atoms = 51
RainbowAgent.runtype = 'RUNTYPE'
RainbowAgent.game = 'GAME'
RainbowAgent.vmax = 10.
RainbowAgent.gamma = 0.99
RainbowAgent.update_horizon = 3
RainbowAgent.min_replay_history = 20000 # agent steps
RainbowAgent.update_period = 4
RainbowAgent.target_update_period = 8000 # agent steps
RainbowAgent.epsilon_train = 0.01
RainbowAgent.epsilon_eval = 0.001
RainbowAgent.epsilon_decay_period = 250000 # agent steps
RainbowAgent.replay_scheme = 'prioritized'
RainbowAgent.tf_device = '/gpu:0' # use '/cpu:*' for non-GPU version
RainbowAgent.optimizer = @tf.train.AdamOptimizer()
# Note these parameters are different from C51's.
tf.train.AdamOptimizer.learning_rate = 0.0000625
tf.train.AdamOptimizer.epsilon = 0.00015
atari_lib.create_atari_environment.game_name = 'GAME' #'Pong'
# Sticky actions with probability 0.25, as suggested by (Machado et al., 2017).
#atari_lib.create_atari_environment.sticky_actions = True
atari_lib.create_atari_environment.sticky_actions = False
create_agent.agent_name = 'rainbow'
Runner.num_iterations = 200
Runner.runtype = 'RUNTYPE'
Runner.game = 'GAME'
Runner.training_steps = 250000 # agent steps
Runner.evaluation_steps = 125000 # agent steps
Runner.max_steps_per_episode = 27000 # agent steps
WrappedPrioritizedReplayBuffer.replay_capacity = 1000000
WrappedPrioritizedReplayBuffer.batch_size = 32
# Hyperparameters follow Hessel et al. (2018), except for sticky_actions,
# which was False (not using sticky actions) in the original paper.
import dopamine.agents.rainbow.rainbow_agent
import dopamine.discrete_domains.atari_lib
import dopamine.discrete_domains.run_experiment
import dopamine.replay_memory.prioritized_replay_buffer
import gin.tf.external_configurables
RainbowAgent.num_atoms = 51
RainbowAgent.runtype = 'RUNTYPE'
RainbowAgent.game = 'GAME'
RainbowAgent.vmax = 10.
RainbowAgent.gamma = 0.99
RainbowAgent.update_horizon = 3
RainbowAgent.min_replay_history = 20000 # agent steps
RainbowAgent.update_period = 4
RainbowAgent.target_update_period = 8000 # agent steps
RainbowAgent.epsilon_train = 0.01
RainbowAgent.epsilon_eval = 0.001
RainbowAgent.epsilon_decay_period = 250000 # agent steps
RainbowAgent.replay_scheme = 'prioritized'
RainbowAgent.tf_device = '/gpu:0' # use '/cpu:*' for non-GPU version
RainbowAgent.optimizer = @tf.train.AdamOptimizer()
# Note these parameters are different from C51's.
tf.train.AdamOptimizer.learning_rate = 0.0000625
tf.train.AdamOptimizer.epsilon = 0.00015
atari_lib.create_atari_environment.game_name = 'GAME' #'Pong'
# Sticky actions with probability 0.25, as suggested by (Machado et al., 2017).
#atari_lib.create_atari_environment.sticky_actions = True
atari_lib.create_atari_environment.sticky_actions = False
create_agent.agent_name = 'rainbow'
Runner.num_iterations = 200
Runner.runtype = 'RUNTYPE'
Runner.game = 'GAME'
Runner.training_steps = 250000 # agent steps
Runner.evaluation_steps = 125000 # agent steps
Runner.max_steps_per_episode = 27000 # agent steps
WrappedPrioritizedReplayBuffer.replay_capacity = 1000000
WrappedPrioritizedReplayBuffer.batch_size = 32
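The 'GAME' and 'RUNTYPE' strings above are placeholders that get substituted at launch time. Below is a minimal, hedged sketch of how a config like this is typically consumed, assuming stock Dopamine's `run_experiment` helpers plus this fork's extra `RainbowAgent`/`Runner` parameters; the config path, base_dir, and the 'Pong'/'baseline' values are illustrative, not taken from this commit.

```python
# Hedged sketch: load the Rainbow gin config and fill in the placeholders
# via gin bindings. Paths and values below are hypothetical; the
# runtype/game bindings assume this fork's modified RainbowAgent/Runner.
from dopamine.discrete_domains import run_experiment

gin_files = ['dopamine/agents/rainbow/configs/rainbow.gin']  # hypothetical path
gin_bindings = [
    "atari_lib.create_atari_environment.game_name = 'Pong'",
    "RainbowAgent.game = 'Pong'",
    "Runner.game = 'Pong'",
    "RainbowAgent.runtype = 'baseline'",
    "Runner.runtype = 'baseline'",
]

# Bindings are applied after the config file, so they override the placeholders.
run_experiment.load_gin_configs(gin_files, gin_bindings)
runner = run_experiment.create_runner('/tmp/rainbow_pong')  # hypothetical base_dir
runner.run_experiment()
```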


@@ -1,43 +1,43 @@
# Hyperparameters follow Hessel et al. (2018).
import dopamine.agents.rainbow.rainbow_agent
import dopamine.discrete_domains.atari_lib
import dopamine.discrete_domains.run_experiment
import dopamine.replay_memory.prioritized_replay_buffer
import gin.tf.external_configurables
import os
RainbowAgent.num_atoms = 51
RainbowAgent.runtype = 'RUNTYPE'
RainbowAgent.game = 'GAME'
RainbowAgent.vmax = 10.
RainbowAgent.gamma = 0.99
RainbowAgent.update_horizon = 1
RainbowAgent.min_replay_history = 20000 # agent steps
RainbowAgent.update_period = 4
RainbowAgent.target_update_period = 8000 # agent steps
RainbowAgent.epsilon_train = 0.01
RainbowAgent.epsilon_eval = 0.001
RainbowAgent.epsilon_decay_period = 250000 # agent steps
#RainbowAgent.replay_scheme = 'prioritized'
RainbowAgent.tf_device = '/gpu:0' # use '/cpu:*' for non-GPU version
RainbowAgent.optimizer = @tf.train.AdamOptimizer()
# Note these parameters are different from C51's.
tf.train.AdamOptimizer.learning_rate = 0.0000625
tf.train.AdamOptimizer.epsilon = 0.00015
atari_lib.create_atari_environment.game_name = 'GAME' #'StarGunner'
# Deterministic ALE version used in the AAAI paper.
atari_lib.create_atari_environment.sticky_actions = False
create_agent.agent_name = 'rainbow'
Runner.num_iterations = 200
Runner.runtype = 'RUNTYPE'
Runner.game = 'GAME'
Runner.training_steps = 250000 # agent steps
Runner.evaluation_steps = 125000 # agent steps
Runner.max_steps_per_episode = 27000 # agent steps
AtariPreprocessing.terminal_on_life_loss = True
WrappedPrioritizedReplayBuffer.replay_capacity = 1000000
WrappedPrioritizedReplayBuffer.batch_size = 32
# Hyperparameters follow Hessel et al. (2018).
import dopamine.agents.rainbow.rainbow_agent
import dopamine.discrete_domains.atari_lib
import dopamine.discrete_domains.run_experiment
import dopamine.replay_memory.prioritized_replay_buffer
import gin.tf.external_configurables
import os
RainbowAgent.num_atoms = 51
RainbowAgent.runtype = 'RUNTYPE'
RainbowAgent.game = 'GAME'
RainbowAgent.vmax = 10.
RainbowAgent.gamma = 0.99
RainbowAgent.update_horizon = 1
RainbowAgent.min_replay_history = 20000 # agent steps
RainbowAgent.update_period = 4
RainbowAgent.target_update_period = 8000 # agent steps
RainbowAgent.epsilon_train = 0.01
RainbowAgent.epsilon_eval = 0.001
RainbowAgent.epsilon_decay_period = 250000 # agent steps
#RainbowAgent.replay_scheme = 'prioritized'
RainbowAgent.tf_device = '/gpu:0' # use '/cpu:*' for non-GPU version
RainbowAgent.optimizer = @tf.train.AdamOptimizer()
# Note these parameters are different from C51's.
tf.train.AdamOptimizer.learning_rate = 0.0000625
tf.train.AdamOptimizer.epsilon = 0.00015
atari_lib.create_atari_environment.game_name = 'GAME' #'StarGunner'
# Deterministic ALE version used in the AAAI paper.
atari_lib.create_atari_environment.sticky_actions = False
create_agent.agent_name = 'rainbow'
Runner.num_iterations = 200
Runner.runtype = 'RUNTYPE'
Runner.game = 'GAME'
Runner.training_steps = 250000 # agent steps
Runner.evaluation_steps = 125000 # agent steps
Runner.max_steps_per_episode = 27000 # agent steps
AtariPreprocessing.terminal_on_life_loss = True
WrappedPrioritizedReplayBuffer.replay_capacity = 1000000
WrappedPrioritizedReplayBuffer.batch_size = 32


@@ -1,38 +1,38 @@
# Hyperparameters for a simple Rainbow-style Acrobot agent. The hyperparameters
# chosen achieve reasonable performance.
import dopamine.agents.rainbow.rainbow_agent
import dopamine.discrete_domains.gym_lib
import dopamine.discrete_domains.run_experiment
import dopamine.replay_memory.prioritized_replay_buffer
import gin.tf.external_configurables
RainbowAgent.observation_shape = %gym_lib.ACROBOT_OBSERVATION_SHAPE
RainbowAgent.observation_dtype = %gym_lib.ACROBOT_OBSERVATION_DTYPE
RainbowAgent.stack_size = %gym_lib.ACROBOT_STACK_SIZE
RainbowAgent.network = @gym_lib.acrobot_rainbow_network
RainbowAgent.num_atoms = 51
RainbowAgent.vmax = 10.
RainbowAgent.gamma = 0.99
RainbowAgent.update_horizon = 3
RainbowAgent.min_replay_history = 500
RainbowAgent.update_period = 4
RainbowAgent.target_update_period = 100
RainbowAgent.epsilon_fn = @dqn_agent.identity_epsilon
RainbowAgent.replay_scheme = 'prioritized'
RainbowAgent.tf_device = '/gpu:0' # use '/cpu:*' for non-GPU version
RainbowAgent.optimizer = @tf.train.AdamOptimizer()
tf.train.AdamOptimizer.learning_rate = 0.09
tf.train.AdamOptimizer.epsilon = 0.0003125
create_gym_environment.environment_name = 'Acrobot'
create_gym_environment.version = 'v1'
create_agent.agent_name = 'rainbow'
Runner.create_environment_fn = @gym_lib.create_gym_environment
Runner.num_iterations = 500
Runner.training_steps = 1000
Runner.evaluation_steps = 1000
Runner.max_steps_per_episode = 500
WrappedPrioritizedReplayBuffer.replay_capacity = 50000
WrappedPrioritizedReplayBuffer.batch_size = 128
# Hyperparameters for a simple Rainbow-style Acrobot agent. The hyperparameters
# chosen achieve reasonable performance.
import dopamine.agents.rainbow.rainbow_agent
import dopamine.discrete_domains.gym_lib
import dopamine.discrete_domains.run_experiment
import dopamine.replay_memory.prioritized_replay_buffer
import gin.tf.external_configurables
RainbowAgent.observation_shape = %gym_lib.ACROBOT_OBSERVATION_SHAPE
RainbowAgent.observation_dtype = %gym_lib.ACROBOT_OBSERVATION_DTYPE
RainbowAgent.stack_size = %gym_lib.ACROBOT_STACK_SIZE
RainbowAgent.network = @gym_lib.acrobot_rainbow_network
RainbowAgent.num_atoms = 51
RainbowAgent.vmax = 10.
RainbowAgent.gamma = 0.99
RainbowAgent.update_horizon = 3
RainbowAgent.min_replay_history = 500
RainbowAgent.update_period = 4
RainbowAgent.target_update_period = 100
RainbowAgent.epsilon_fn = @dqn_agent.identity_epsilon
RainbowAgent.replay_scheme = 'prioritized'
RainbowAgent.tf_device = '/gpu:0' # use '/cpu:*' for non-GPU version
RainbowAgent.optimizer = @tf.train.AdamOptimizer()
tf.train.AdamOptimizer.learning_rate = 0.09
tf.train.AdamOptimizer.epsilon = 0.0003125
create_gym_environment.environment_name = 'Acrobot'
create_gym_environment.version = 'v1'
create_agent.agent_name = 'rainbow'
Runner.create_environment_fn = @gym_lib.create_gym_environment
Runner.num_iterations = 500
Runner.training_steps = 1000
Runner.evaluation_steps = 1000
Runner.max_steps_per_episode = 500
WrappedPrioritizedReplayBuffer.replay_capacity = 50000
WrappedPrioritizedReplayBuffer.batch_size = 128


@@ -1,39 +1,39 @@
# Hyperparameters for a simple Rainbow-style Cartpole agent. The
# hyperparameters chosen achieve reasonable performance.
import dopamine.agents.dqn.dqn_agent
import dopamine.agents.rainbow.rainbow_agent
import dopamine.discrete_domains.gym_lib
import dopamine.discrete_domains.run_experiment
import dopamine.replay_memory.prioritized_replay_buffer
import gin.tf.external_configurables
RainbowAgent.observation_shape = %gym_lib.CARTPOLE_OBSERVATION_SHAPE
RainbowAgent.observation_dtype = %gym_lib.CARTPOLE_OBSERVATION_DTYPE
RainbowAgent.stack_size = %gym_lib.CARTPOLE_STACK_SIZE
RainbowAgent.network = @gym_lib.cartpole_rainbow_network
RainbowAgent.num_atoms = 51
RainbowAgent.vmax = 10.
RainbowAgent.gamma = 0.99
RainbowAgent.update_horizon = 3
RainbowAgent.min_replay_history = 500
RainbowAgent.update_period = 4
RainbowAgent.target_update_period = 100
RainbowAgent.epsilon_fn = @dqn_agent.identity_epsilon
RainbowAgent.replay_scheme = 'prioritized'
RainbowAgent.tf_device = '/gpu:0' # use '/cpu:*' for non-GPU version
RainbowAgent.optimizer = @tf.train.AdamOptimizer()
tf.train.AdamOptimizer.learning_rate = 0.09
tf.train.AdamOptimizer.epsilon = 0.0003125
create_gym_environment.environment_name = 'CartPole'
create_gym_environment.version = 'v0'
create_agent.agent_name = 'rainbow'
Runner.create_environment_fn = @gym_lib.create_gym_environment
Runner.num_iterations = 500
Runner.training_steps = 1000
Runner.evaluation_steps = 1000
Runner.max_steps_per_episode = 200 # Default max episode length.
WrappedPrioritizedReplayBuffer.replay_capacity = 50000
WrappedPrioritizedReplayBuffer.batch_size = 128
# Hyperparameters for a simple Rainbow-style Cartpole agent. The
# hyperparameters chosen achieve reasonable performance.
import dopamine.agents.dqn.dqn_agent
import dopamine.agents.rainbow.rainbow_agent
import dopamine.discrete_domains.gym_lib
import dopamine.discrete_domains.run_experiment
import dopamine.replay_memory.prioritized_replay_buffer
import gin.tf.external_configurables
RainbowAgent.observation_shape = %gym_lib.CARTPOLE_OBSERVATION_SHAPE
RainbowAgent.observation_dtype = %gym_lib.CARTPOLE_OBSERVATION_DTYPE
RainbowAgent.stack_size = %gym_lib.CARTPOLE_STACK_SIZE
RainbowAgent.network = @gym_lib.cartpole_rainbow_network
RainbowAgent.num_atoms = 51
RainbowAgent.vmax = 10.
RainbowAgent.gamma = 0.99
RainbowAgent.update_horizon = 3
RainbowAgent.min_replay_history = 500
RainbowAgent.update_period = 4
RainbowAgent.target_update_period = 100
RainbowAgent.epsilon_fn = @dqn_agent.identity_epsilon
RainbowAgent.replay_scheme = 'prioritized'
RainbowAgent.tf_device = '/gpu:0' # use '/cpu:*' for non-GPU version
RainbowAgent.optimizer = @tf.train.AdamOptimizer()
tf.train.AdamOptimizer.learning_rate = 0.09
tf.train.AdamOptimizer.epsilon = 0.0003125
create_gym_environment.environment_name = 'CartPole'
create_gym_environment.version = 'v0'
create_agent.agent_name = 'rainbow'
Runner.create_environment_fn = @gym_lib.create_gym_environment
Runner.num_iterations = 500
Runner.training_steps = 1000
Runner.evaluation_steps = 1000
Runner.max_steps_per_episode = 200 # Default max episode length.
WrappedPrioritizedReplayBuffer.replay_capacity = 50000
WrappedPrioritizedReplayBuffer.batch_size = 128
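The Acrobot and Cartpole configs follow the same pattern but bind Runner.create_environment_fn to gym_lib.create_gym_environment instead of the Atari factory, so no placeholder substitution is needed. A hedged sketch with hypothetical paths, overriding a couple of values for a quick run:

```python
# Hedged sketch: reuse the Cartpole Rainbow config with shorter training.
# The config path and base_dir are hypothetical.
from dopamine.discrete_domains import run_experiment

run_experiment.load_gin_configs(
    ['dopamine/agents/rainbow/configs/rainbow_cartpole.gin'],  # hypothetical
    ['Runner.num_iterations = 50',   # quick smoke test instead of 500 iterations
     'Runner.training_steps = 500'])
runner = run_experiment.create_runner('/tmp/rainbow_cartpole')
runner.run_experiment()
```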

Diff not shown because the file is too large.


@@ -1,31 +1,31 @@
# Colabs
This directory contains
[`utils.py`](https://github.com/google/dopamine/blob/master/dopamine/colab/utils.py),
which provides a number of useful utilities for loading experiment statistics.
We also provide a set of colabs to help illustrate how you can use Dopamine.
## Agents
In this
[colab](https://colab.research.google.com/github/google/dopamine/blob/master/dopamine/colab/agents.ipynb)
we illustrate how to create a new agent by either subclassing
[`DQN`](https://github.com/google/dopamine/blob/master/dopamine/agents/dqn/dqn_agent.py)
or by creating a new agent from scratch.
## Loading statistics
In this
[colab](https://colab.research.google.com/github/google/dopamine/blob/master/dopamine/colab/load_statistics.ipynb)
we illustrate how to load and visualize the logs data produced by Dopamine.
## Visualizing with Tensorboard
In this
[colab](https://colab.research.google.com/github/google/dopamine/blob/master/dopamine/colab/tensorboard.ipynb)
we illustrate how to download and visualize different agents with Tensorboard.
## Training on Cartpole
In this
[colab](https://colab.research.google.com/github/google/dopamine/blob/master/dopamine/colab/cartpole.ipynb)
we illustrate how to train DQN and C51 on the Cartpole environment.
# Colabs
This directory contains
[`utils.py`](https://github.com/google/dopamine/blob/master/dopamine/colab/utils.py),
which provides a number of useful utilities for loading experiment statistics.
We also provide a set of colabs to help illustrate how you can use Dopamine.
## Agents
In this
[colab](https://colab.research.google.com/github/google/dopamine/blob/master/dopamine/colab/agents.ipynb)
we illustrate how to create a new agent by either subclassing
[`DQN`](https://github.com/google/dopamine/blob/master/dopamine/agents/dqn/dqn_agent.py)
or by creating a new agent from scratch.
## Loading statistics
In this
[colab](https://colab.research.google.com/github/google/dopamine/blob/master/dopamine/colab/load_statistics.ipynb)
we illustrate how to load and visualize the logs data produced by Dopamine.
## Visualizing with Tensorboard
In this
[colab](https://colab.research.google.com/github/google/dopamine/blob/master/dopamine/colab/tensorboard.ipynb)
we illustrate how to download and visualize different agents with Tensorboard.
## Training on Cartpole
In this
[colab](https://colab.research.google.com/github/google/dopamine/blob/master/dopamine/colab/cartpole.ipynb)
we illustrate how to train DQN and C51 on the Cartpole environment.
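As a small, hedged illustration of the statistics-loading utilities in `utils.py` mentioned above, a run written by the Dopamine Runner can be summarized in a couple of calls; the log directory below is hypothetical and corresponds to the `base_dir/logs` folder of a previous experiment.

```python
# Hedged sketch: load and summarize returns from a local Dopamine run.
from dopamine.colab import utils as colab_utils

raw_data, last_iteration = colab_utils.load_statistics('/tmp/dopamine_run/logs')
summary = colab_utils.summarize_data(
    raw_data, ['train_episode_returns', 'eval_episode_returns'])
print('iterations:', last_iteration,
      'final eval return:', summary['eval_episode_returns'][-1])
```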


@@ -1,15 +1,15 @@
# coding=utf-8
# Copyright 2018 The Dopamine Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# coding=utf-8
# Copyright 2018 The Dopamine Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

File diff hidden because one or more lines are too long.

File diff hidden because one or more lines are too long.

File diff hidden because one or more lines are too long.


@@ -1,112 +1,112 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "tensorboard.ipynb",
"version": "0.3.2",
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
}
},
"cells": [
{
"metadata": {
"id": "VYNA79KmgvbY",
"colab_type": "text"
},
"cell_type": "markdown",
"source": [
"Copyright 2018 The Dopamine Authors.\n",
"\n",
"Licensed under the Apache License, Version 2.0 (the \"License\"); you may not use this file except in compliance with the License. You may obtain a copy of the License at\n",
"\n",
"https://www.apache.org/licenses/LICENSE-2.0\n",
"\n",
"Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License."
]
},
{
"metadata": {
"id": "Ctd9k0h6wnqT",
"colab_type": "text"
},
"cell_type": "markdown",
"source": [
"# Visualize Dopamine baselines with Tensorboard\n",
"This colab allows you to easily view the trained baselines with Tensorboard (even if you don't have Tensorboard on your local machine!).\n",
"\n",
"Simply specify the game you would like to visualize and then run the cells in order.\n",
"\n",
"_The instructions for setting up Tensorboard were obtained from https://www.dlology.com/blog/quick-guide-to-run-tensorboard-in-google-colab/_"
]
},
{
"metadata": {
"id": "s8r_45_0qpmb",
"colab_type": "code",
"colab": {},
"cellView": "form"
},
"cell_type": "code",
"source": [
"# @title Prepare all necessary files and binaries.\n",
"!wget https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip\n",
"!unzip ngrok-stable-linux-amd64.zip\n",
"!gsutil -q -m cp -R gs://download-dopamine-rl/compiled_tb_event_files.tar.gz /content/\n",
"!tar -xvzf /content/compiled_tb_event_files.tar.gz"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "D-oZRzeWwHZN",
"colab_type": "code",
"colab": {},
"cellView": "form"
},
"cell_type": "code",
"source": [
"# @title Select which game to visualize.\n",
"game = 'Asterix' # @param['AirRaid', 'Alien', 'Amidar', 'Assault', 'Asterix', 'Asteroids', 'Atlantis', 'BankHeist', 'BattleZone', 'BeamRider', 'Berzerk', 'Bowling', 'Boxing', 'Breakout', 'Carnival', 'Centipede', 'ChopperCommand', 'CrazyClimber', 'DemonAttack', 'DoubleDunk', 'ElevatorAction', 'Enduro', 'FishingDerby', 'Freeway', 'Frostbite', 'Gopher', 'Gravitar', 'Hero', 'IceHockey', 'Jamesbond', 'JourneyEscape', 'Kangaroo', 'Krull', 'KungFuMaster', 'MontezumaRevenge', 'MsPacman', 'NameThisGame', 'Phoenix', 'Pitfall', 'Pong', 'Pooyan', 'PrivateEye', 'Qbert', 'Riverraid', 'RoadRunner', 'Robotank', 'Seaquest', 'Skiing', 'Solaris', 'SpaceInvaders', 'StarGunner', 'Tennis', 'TimePilot', 'Tutankham', 'UpNDown', 'Venture', 'VideoPinball', 'WizardOfWor', 'YarsRevenge', 'Zaxxon']\n",
"agents = ['dqn', 'c51', 'rainbow', 'iqn']\n",
"for agent in agents:\n",
" for run in range(1, 6):\n",
" !mkdir -p \"/content/$game/$agent/$run\"\n",
" !cp -r \"/content/$agent/$game/$run\" \"/content/$game/$agent/$run\"\n",
"LOG_DIR = '/content/{}'.format(game)\n",
"get_ipython().system_raw(\n",
" 'tensorboard --logdir {} --host 0.0.0.0 --port 6006 &'\n",
" .format(LOG_DIR)\n",
")"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "zlKKnaP4y9FA",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 35
},
"cellView": "form",
"outputId": "3abff714-c484-436e-dc5f-88b15511f4f2"
},
"cell_type": "code",
"source": [
"# @title Start the tensorboard\n",
"get_ipython().system_raw('./ngrok http 6006 &')\n",
"! curl -s http://localhost:4040/api/tunnels | python3 -c \\\n",
" \"import sys, json; print(json.load(sys.stdin)['tunnels'][0]['public_url'])\""
],
"execution_count": 0,
"outputs": []
}
]
}
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "tensorboard.ipynb",
"version": "0.3.2",
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
}
},
"cells": [
{
"metadata": {
"id": "VYNA79KmgvbY",
"colab_type": "text"
},
"cell_type": "markdown",
"source": [
"Copyright 2018 The Dopamine Authors.\n",
"\n",
"Licensed under the Apache License, Version 2.0 (the \"License\"); you may not use this file except in compliance with the License. You may obtain a copy of the License at\n",
"\n",
"https://www.apache.org/licenses/LICENSE-2.0\n",
"\n",
"Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License."
]
},
{
"metadata": {
"id": "Ctd9k0h6wnqT",
"colab_type": "text"
},
"cell_type": "markdown",
"source": [
"# Visualize Dopamine baselines with Tensorboard\n",
"This colab allows you to easily view the trained baselines with Tensorboard (even if you don't have Tensorboard on your local machine!).\n",
"\n",
"Simply specify the game you would like to visualize and then run the cells in order.\n",
"\n",
"_The instructions for setting up Tensorboard were obtained from https://www.dlology.com/blog/quick-guide-to-run-tensorboard-in-google-colab/_"
]
},
{
"metadata": {
"id": "s8r_45_0qpmb",
"colab_type": "code",
"colab": {},
"cellView": "form"
},
"cell_type": "code",
"source": [
"# @title Prepare all necessary files and binaries.\n",
"!wget https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip\n",
"!unzip ngrok-stable-linux-amd64.zip\n",
"!gsutil -q -m cp -R gs://download-dopamine-rl/compiled_tb_event_files.tar.gz /content/\n",
"!tar -xvzf /content/compiled_tb_event_files.tar.gz"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "D-oZRzeWwHZN",
"colab_type": "code",
"colab": {},
"cellView": "form"
},
"cell_type": "code",
"source": [
"# @title Select which game to visualize.\n",
"game = 'Asterix' # @param['AirRaid', 'Alien', 'Amidar', 'Assault', 'Asterix', 'Asteroids', 'Atlantis', 'BankHeist', 'BattleZone', 'BeamRider', 'Berzerk', 'Bowling', 'Boxing', 'Breakout', 'Carnival', 'Centipede', 'ChopperCommand', 'CrazyClimber', 'DemonAttack', 'DoubleDunk', 'ElevatorAction', 'Enduro', 'FishingDerby', 'Freeway', 'Frostbite', 'Gopher', 'Gravitar', 'Hero', 'IceHockey', 'Jamesbond', 'JourneyEscape', 'Kangaroo', 'Krull', 'KungFuMaster', 'MontezumaRevenge', 'MsPacman', 'NameThisGame', 'Phoenix', 'Pitfall', 'Pong', 'Pooyan', 'PrivateEye', 'Qbert', 'Riverraid', 'RoadRunner', 'Robotank', 'Seaquest', 'Skiing', 'Solaris', 'SpaceInvaders', 'StarGunner', 'Tennis', 'TimePilot', 'Tutankham', 'UpNDown', 'Venture', 'VideoPinball', 'WizardOfWor', 'YarsRevenge', 'Zaxxon']\n",
"agents = ['dqn', 'c51', 'rainbow', 'iqn']\n",
"for agent in agents:\n",
" for run in range(1, 6):\n",
" !mkdir -p \"/content/$game/$agent/$run\"\n",
" !cp -r \"/content/$agent/$game/$run\" \"/content/$game/$agent/$run\"\n",
"LOG_DIR = '/content/{}'.format(game)\n",
"get_ipython().system_raw(\n",
" 'tensorboard --logdir {} --host 0.0.0.0 --port 6006 &'\n",
" .format(LOG_DIR)\n",
")"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "zlKKnaP4y9FA",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 35
},
"cellView": "form",
"outputId": "3abff714-c484-436e-dc5f-88b15511f4f2"
},
"cell_type": "code",
"source": [
"# @title Start the tensorboard\n",
"get_ipython().system_raw('./ngrok http 6006 &')\n",
"! curl -s http://localhost:4040/api/tunnels | python3 -c \\\n",
" \"import sys, json; print(json.load(sys.stdin)['tunnels'][0]['public_url'])\""
],
"execution_count": 0,
"outputs": []
}
]
}


@@ -1,280 +1,280 @@
# coding=utf-8
# Copyright 2018 The Dopamine Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""This provides utilities for dealing with Dopamine data.
See: dopamine/common/logger.py .
"""
import itertools
import os
import pickle
import sys
import numpy as np
import pandas as pd
import tensorflow as tf
FILE_PREFIX = 'log'
ITERATION_PREFIX = 'iteration_'
ALL_GAMES = ['AirRaid', 'Alien', 'Amidar', 'Assault', 'Asterix', 'Asteroids',
'Atlantis', 'BankHeist', 'BattleZone', 'BeamRider', 'Berzerk',
'Bowling', 'Boxing', 'Breakout', 'Carnival', 'Centipede',
'ChopperCommand', 'CrazyClimber', 'DemonAttack', 'DoubleDunk',
'ElevatorAction', 'Enduro', 'FishingDerby', 'Freeway', 'Frostbite',
'Gopher', 'Gravitar', 'Hero', 'IceHockey', 'Jamesbond',
'JourneyEscape', 'Kangaroo', 'Krull', 'KungFuMaster',
'MontezumaRevenge', 'MsPacman', 'NameThisGame', 'Phoenix',
'Pitfall', 'Pong', 'Pooyan', 'PrivateEye', 'Qbert', 'Riverraid',
'RoadRunner', 'Robotank', 'Seaquest', 'Skiing', 'Solaris',
'SpaceInvaders', 'StarGunner', 'Tennis', 'TimePilot', 'Tutankham',
'UpNDown', 'Venture', 'VideoPinball', 'WizardOfWor', 'YarsRevenge',
'Zaxxon']
def load_baselines(base_dir, verbose=False):
"""Reads in the baseline experimental data from a specified base directory.
Args:
base_dir: string, base directory where to read data from.
verbose: bool, whether to print warning messages.
Returns:
A dict containing pandas DataFrames for all available agents and games.
"""
experimental_data = {}
for game in ALL_GAMES:
for agent in ['dqn', 'c51', 'rainbow', 'iqn']:
game_data_file = os.path.join(base_dir, agent, '{}.pkl'.format(game))
if not tf.gfile.Exists(game_data_file):
if verbose:
# pylint: disable=superfluous-parens
print('Unable to load data for agent {} on game {}'.format(agent,
game))
# pylint: enable=superfluous-parens
continue
with tf.gfile.Open(game_data_file, 'rb') as f:
if sys.version_info.major >= 3:
# pylint: disable=unexpected-keyword-arg
single_agent_data = pickle.load(f, encoding='latin1')
# pylint: enable=unexpected-keyword-arg
else:
single_agent_data = pickle.load(f)
single_agent_data['agent'] = agent
if game in experimental_data:
experimental_data[game] = experimental_data[game].merge(
single_agent_data, how='outer')
else:
experimental_data[game] = single_agent_data
return experimental_data
def load_statistics(log_path, iteration_number=None, verbose=True):
"""Reads in a statistics object from log_path.
Args:
log_path: string, provides the full path to the training/eval statistics.
iteration_number: The iteration number of the statistics object we want
to read. If set to None, load the latest version.
verbose: Whether to output information about the load procedure.
Returns:
data: The requested statistics object.
iteration: The corresponding iteration number.
Raises:
Exception: if data is not present.
"""
# If no iteration is specified, we'll look for the most recent.
if iteration_number is None:
iteration_number = get_latest_iteration(log_path)
log_file = '%s/%s_%d' % (log_path, FILE_PREFIX, iteration_number)
if verbose:
# pylint: disable=superfluous-parens
print('Reading statistics from: {}'.format(log_file))
# pylint: enable=superfluous-parens
with tf.gfile.Open(log_file, 'rb') as f:
return pickle.load(f), iteration_number
def get_latest_file(path):
"""Return the file named 'path_[0-9]*' with the largest such number.
Args:
path: The base path (including directory and base name) to search.
Returns:
The latest file (in terms of given numbers).
"""
try:
latest_iteration = get_latest_iteration(path)
return os.path.join(path, '{}_{}'.format(FILE_PREFIX, latest_iteration))
except ValueError:
return None
def get_latest_iteration(path):
"""Return the largest iteration number corresponding to the given path.
Args:
path: The base path (including directory and base name) to search.
Returns:
The latest iteration number.
Raises:
ValueError: if no log data is available at the given path.
"""
glob = os.path.join(path, '{}_[0-9]*'.format(FILE_PREFIX))
log_files = tf.gfile.Glob(glob)
if not log_files:
raise ValueError('No log data found at {}'.format(path))
def extract_iteration(x):
return int(x[x.rfind('_') + 1:])
latest_iteration = max(extract_iteration(x) for x in log_files)
return latest_iteration
def summarize_data(data, summary_keys):
"""Processes log data into a per-iteration summary.
Args:
data: Dictionary loaded by load_statistics describing the data. This
dictionary has keys iteration_0, iteration_1, ... describing per-iteration
data.
summary_keys: List of per-iteration data to be summarized.
Example:
data = load_statistics(...)
summarize_data(data, ['train_episode_returns',
'eval_episode_returns'])
Returns:
A dictionary mapping each key in summary_keys to a per-iteration summary.
"""
summary = {}
latest_iteration_number = len(data.keys())
current_value = None
for key in summary_keys:
summary[key] = []
# Compute per-iteration average of the given key.
for i in range(latest_iteration_number):
iter_key = '{}{}'.format(ITERATION_PREFIX, i)
# We allow reporting the same value multiple times when data is missing.
# If there is no data for this iteration, use the previous value.
if iter_key in data:
current_value = np.mean(data[iter_key][key])
summary[key].append(current_value)
return summary
def read_experiment(log_path,
parameter_set=None,
job_descriptor='',
iteration_number=None,
summary_keys=('train_episode_returns',
'eval_episode_returns'),
verbose=False):
"""Reads in a set of experimental results from log_path.
The provided parameter_set is an ordered_dict which
1) defines the parameters of this experiment,
2) defines the order in which they occur in the job descriptor.
The method reads all experiments of the form
${log_path}/${job_descriptor}.format(params)/logs,
where params is constructed from the cross product of the elements in
the parameter_set.
For example:
parameter_set = collections.OrderedDict([
('game', ['Asterix', 'Pong']),
('epsilon', ['0', '0.1'])
])
read_experiment('/tmp/logs', parameter_set, job_descriptor='{}_{}')
Will try to read logs from:
- /tmp/logs/Asterix_0/logs
- /tmp/logs/Asterix_0.1/logs
- /tmp/logs/Pong_0/logs
- /tmp/logs/Pong_0.1/logs
Args:
log_path: string, base path specifying where results live.
parameter_set: An ordered_dict mapping parameter names to allowable values.
job_descriptor: A job descriptor string which is used to construct the full
path for each trial within an experiment.
iteration_number: Int, if not None determines the iteration number at which
we read in results.
summary_keys: Iterable of strings, iteration statistics to summarize.
verbose: If True, print out additional information.
Returns:
A Pandas dataframe containing experimental results.
"""
keys = [] if parameter_set is None else list(parameter_set.keys())
# Extract parameter value lists, one per parameter.
ordered_values = [parameter_set[key] for key in keys]
column_names = keys + ['iteration'] + list(summary_keys)
num_parameter_settings = len([_ for _ in itertools.product(*ordered_values)])
expected_num_iterations = 200
expected_num_rows = num_parameter_settings * expected_num_iterations
# Create DataFrame with predicted number of rows.
data_frame = pd.DataFrame(index=np.arange(0, expected_num_rows),
columns=column_names)
row_index = 0
# Now take their cross product. This generates tuples of the form
# (p1, p2, p3, ...) where p1, p2, p3 are parameter values for the first,
# second, etc. parameters as ordered in value_set.
for parameter_tuple in itertools.product(*ordered_values):
if job_descriptor is not None:
name = job_descriptor.format(*parameter_tuple)
else:
# Construct name for values.
name = '-'.join([keys[i] + '_' + str(parameter_tuple[i])
for i in range(len(keys))])
experiment_path = '{}/{}/logs'.format(log_path, name)
raw_data, last_iteration = load_statistics(
experiment_path, iteration_number=iteration_number, verbose=verbose)
summary = summarize_data(raw_data, summary_keys)
for iteration in range(last_iteration):
# The row contains all the parameters, the iteration, and finally the
# requested values.
row_data = (list(parameter_tuple) + [iteration] +
[summary[key][iteration] for key in summary_keys])
data_frame.loc[row_index] = row_data
row_index += 1
# Shed any unused rows.
return data_frame.drop(np.arange(row_index, expected_num_rows))
# coding=utf-8
# Copyright 2018 The Dopamine Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""This provides utilities for dealing with Dopamine data.
See: dopamine/common/logger.py .
"""
import itertools
import os
import pickle
import sys
import numpy as np
import pandas as pd
import tensorflow as tf
FILE_PREFIX = 'log'
ITERATION_PREFIX = 'iteration_'
ALL_GAMES = ['AirRaid', 'Alien', 'Amidar', 'Assault', 'Asterix', 'Asteroids',
'Atlantis', 'BankHeist', 'BattleZone', 'BeamRider', 'Berzerk',
'Bowling', 'Boxing', 'Breakout', 'Carnival', 'Centipede',
'ChopperCommand', 'CrazyClimber', 'DemonAttack', 'DoubleDunk',
'ElevatorAction', 'Enduro', 'FishingDerby', 'Freeway', 'Frostbite',
'Gopher', 'Gravitar', 'Hero', 'IceHockey', 'Jamesbond',
'JourneyEscape', 'Kangaroo', 'Krull', 'KungFuMaster',
'MontezumaRevenge', 'MsPacman', 'NameThisGame', 'Phoenix',
'Pitfall', 'Pong', 'Pooyan', 'PrivateEye', 'Qbert', 'Riverraid',
'RoadRunner', 'Robotank', 'Seaquest', 'Skiing', 'Solaris',
'SpaceInvaders', 'StarGunner', 'Tennis', 'TimePilot', 'Tutankham',
'UpNDown', 'Venture', 'VideoPinball', 'WizardOfWor', 'YarsRevenge',
'Zaxxon']
def load_baselines(base_dir, verbose=False):
"""Reads in the baseline experimental data from a specified base directory.
Args:
base_dir: string, base directory where to read data from.
verbose: bool, whether to print warning messages.
Returns:
A dict containing pandas DataFrames for all available agents and games.
"""
experimental_data = {}
for game in ALL_GAMES:
for agent in ['dqn', 'c51', 'rainbow', 'iqn']:
game_data_file = os.path.join(base_dir, agent, '{}.pkl'.format(game))
if not tf.gfile.Exists(game_data_file):
if verbose:
# pylint: disable=superfluous-parens
print('Unable to load data for agent {} on game {}'.format(agent,
game))
# pylint: enable=superfluous-parens
continue
with tf.gfile.Open(game_data_file, 'rb') as f:
if sys.version_info.major >= 3:
# pylint: disable=unexpected-keyword-arg
single_agent_data = pickle.load(f, encoding='latin1')
# pylint: enable=unexpected-keyword-arg
else:
single_agent_data = pickle.load(f)
single_agent_data['agent'] = agent
if game in experimental_data:
experimental_data[game] = experimental_data[game].merge(
single_agent_data, how='outer')
else:
experimental_data[game] = single_agent_data
return experimental_data
def load_statistics(log_path, iteration_number=None, verbose=True):
"""Reads in a statistics object from log_path.
Args:
log_path: string, provides the full path to the training/eval statistics.
iteration_number: The iteration number of the statistics object we want
to read. If set to None, load the latest version.
verbose: Whether to output information about the load procedure.
Returns:
data: The requested statistics object.
iteration: The corresponding iteration number.
Raises:
Exception: if data is not present.
"""
# If no iteration is specified, we'll look for the most recent.
if iteration_number is None:
iteration_number = get_latest_iteration(log_path)
log_file = '%s/%s_%d' % (log_path, FILE_PREFIX, iteration_number)
if verbose:
# pylint: disable=superfluous-parens
print('Reading statistics from: {}'.format(log_file))
# pylint: enable=superfluous-parens
with tf.gfile.Open(log_file, 'rb') as f:
return pickle.load(f), iteration_number
def get_latest_file(path):
"""Return the file named 'path_[0-9]*' with the largest such number.
Args:
path: The base path (including directory and base name) to search.
Returns:
The latest file (in terms of given numbers).
"""
try:
latest_iteration = get_latest_iteration(path)
return os.path.join(path, '{}_{}'.format(FILE_PREFIX, latest_iteration))
except ValueError:
return None
def get_latest_iteration(path):
"""Return the largest iteration number corresponding to the given path.
Args:
path: The base path (including directory and base name) to search.
Returns:
The latest iteration number.
Raises:
ValueError: if no log data is available at the given path.
"""
glob = os.path.join(path, '{}_[0-9]*'.format(FILE_PREFIX))
log_files = tf.gfile.Glob(glob)
if not log_files:
raise ValueError('No log data found at {}'.format(path))
def extract_iteration(x):
return int(x[x.rfind('_') + 1:])
latest_iteration = max(extract_iteration(x) for x in log_files)
return latest_iteration
def summarize_data(data, summary_keys):
"""Processes log data into a per-iteration summary.
Args:
data: Dictionary loaded by load_statistics describing the data. This
dictionary has keys iteration_0, iteration_1, ... describing per-iteration
data.
summary_keys: List of per-iteration data to be summarized.
Example:
data = load_statistics(...)
summarize_data(data, ['train_episode_returns',
'eval_episode_returns'])
Returns:
A dictionary mapping each key in summary_keys to a per-iteration summary.
"""
summary = {}
latest_iteration_number = len(data.keys())
current_value = None
for key in summary_keys:
summary[key] = []
# Compute per-iteration average of the given key.
for i in range(latest_iteration_number):
iter_key = '{}{}'.format(ITERATION_PREFIX, i)
# We allow reporting the same value multiple times when data is missing.
# If there is no data for this iteration, use the previous value.
if iter_key in data:
current_value = np.mean(data[iter_key][key])
summary[key].append(current_value)
return summary
def read_experiment(log_path,
parameter_set=None,
job_descriptor='',
iteration_number=None,
summary_keys=('train_episode_returns',
'eval_episode_returns'),
verbose=False):
"""Reads in a set of experimental results from log_path.
The provided parameter_set is an ordered_dict which
1) defines the parameters of this experiment,
2) defines the order in which they occur in the job descriptor.
The method reads all experiments of the form
${log_path}/${job_descriptor}.format(params)/logs,
where params is constructed from the cross product of the elements in
the parameter_set.
For example:
parameter_set = collections.OrderedDict([
('game', ['Asterix', 'Pong']),
('epsilon', ['0', '0.1'])
])
read_experiment('/tmp/logs', parameter_set, job_descriptor='{}_{}')
Will try to read logs from:
- /tmp/logs/Asterix_0/logs
- /tmp/logs/Asterix_0.1/logs
- /tmp/logs/Pong_0/logs
- /tmp/logs/Pong_0.1/logs
Args:
log_path: string, base path specifying where results live.
parameter_set: An ordered_dict mapping parameter names to allowable values.
job_descriptor: A job descriptor string which is used to construct the full
path for each trial within an experiment.
iteration_number: Int, if not None determines the iteration number at which
we read in results.
summary_keys: Iterable of strings, iteration statistics to summarize.
verbose: If True, print out additional information.
Returns:
A Pandas dataframe containing experimental results.
"""
keys = [] if parameter_set is None else list(parameter_set.keys())
# Extract parameter value lists, one per parameter.
ordered_values = [parameter_set[key] for key in keys]
column_names = keys + ['iteration'] + list(summary_keys)
num_parameter_settings = len([_ for _ in itertools.product(*ordered_values)])
expected_num_iterations = 200
expected_num_rows = num_parameter_settings * expected_num_iterations
# Create DataFrame with predicted number of rows.
data_frame = pd.DataFrame(index=np.arange(0, expected_num_rows),
columns=column_names)
row_index = 0
# Now take their cross product. This generates tuples of the form
# (p1, p2, p3, ...) where p1, p2, p3 are parameter values for the first,
# second, etc. parameters as ordered in value_set.
for parameter_tuple in itertools.product(*ordered_values):
if job_descriptor is not None:
name = job_descriptor.format(*parameter_tuple)
else:
# Construct name for values.
name = '-'.join([keys[i] + '_' + str(parameter_tuple[i])
for i in range(len(keys))])
experiment_path = '{}/{}/logs'.format(log_path, name)
raw_data, last_iteration = load_statistics(
experiment_path, iteration_number=iteration_number, verbose=verbose)
summary = summarize_data(raw_data, summary_keys)
for iteration in range(last_iteration):
# The row contains all the parameters, the iteration, and finally the
# requested values.
row_data = (list(parameter_tuple) + [iteration] +
[summary[key][iteration] for key in summary_keys])
data_frame.loc[row_index] = row_data
row_index += 1
# Shed any unused rows.
return data_frame.drop(np.arange(row_index, expected_num_rows))
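A hedged usage sketch for load_baselines above; the directory is hypothetical and must be laid out as <base_dir>/<agent>/<game>.pkl, which is the layout the function reads.

```python
# Hedged sketch: load baseline pickles and inspect one game's DataFrame.
# '/tmp/dopamine_baselines' is a hypothetical directory containing
# <agent>/<game>.pkl files for the agents dqn, c51, rainbow and iqn.
from dopamine.colab import utils as colab_utils

experimental_data = colab_utils.load_baselines('/tmp/dopamine_baselines',
                                               verbose=True)
if 'Asterix' in experimental_data:
    asterix = experimental_data['Asterix']
    print(asterix[asterix['agent'] == 'rainbow'].head())
```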

Binary data
dopamine/discrete_domains/.DS_Store (vendored)

Binary file not shown.


@@ -1 +1 @@
# coding=utf-8
# coding=utf-8

Diff not shown because the file is too large.


@@ -1,177 +1,177 @@
# coding=utf-8
# Copyright 2018 The Dopamine Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""A checkpointing mechanism for Dopamine agents.
This Checkpointer expects a base directory where checkpoints for different
iterations are stored. Specifically, Checkpointer.save_checkpoint() takes in
as input a dictionary 'data' to be pickled to disk. At each iteration, we
write a file called 'ckpt.#', where # is the iteration number. The
Checkpointer also cleans up old files, maintaining up to the CHECKPOINT_DURATION
most recent iterations.
The Checkpointer writes a sentinel file to indicate that checkpointing was
globally successful. This means that all other checkpointing activities
(saving the Tensorflow graph, the replay buffer) should be performed *prior*
to calling Checkpointer.save_checkpoint(). This allows the Checkpointer to
detect incomplete checkpoints.
#### Example
After running 10 iterations (numbered 0...9) with base_directory='/checkpoint',
the following files will exist:
```
/checkpoint/ckpt.6
/checkpoint/ckpt.7
/checkpoint/ckpt.8
/checkpoint/ckpt.9
/checkpoint/sentinel_checkpoint_complete.6
/checkpoint/sentinel_checkpoint_complete.7
/checkpoint/sentinel_checkpoint_complete.8
/checkpoint/sentinel_checkpoint_complete.9
```
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import pickle
import tensorflow as tf
CHECKPOINT_DURATION = 4
def get_latest_checkpoint_number(base_directory):
"""Returns the version number of the latest completed checkpoint.
Args:
base_directory: str, directory in which to look for checkpoint files.
Returns:
int, the iteration number of the latest checkpoint, or -1 if none was found.
"""
glob = os.path.join(base_directory, 'sentinel_checkpoint_complete.*')
def extract_iteration(x):
return int(x[x.rfind('.') + 1:])
try:
checkpoint_files = tf.gfile.Glob(glob)
except tf.errors.NotFoundError:
return -1
try:
latest_iteration = max(extract_iteration(x) for x in checkpoint_files)
return latest_iteration
except ValueError:
return -1
class Checkpointer(object):
"""Class for managing checkpoints for Dopamine agents.
"""
def __init__(self, base_directory, checkpoint_file_prefix='ckpt',
checkpoint_frequency=1):
"""Initializes Checkpointer.
Args:
base_directory: str, directory where all checkpoints are saved/loaded.
checkpoint_file_prefix: str, prefix to use for naming checkpoint files.
checkpoint_frequency: int, the frequency at which to checkpoint.
Raises:
ValueError: if base_directory is empty, or not creatable.
"""
if not base_directory:
raise ValueError('No path provided to Checkpointer.')
self._checkpoint_file_prefix = checkpoint_file_prefix
self._checkpoint_frequency = checkpoint_frequency
self._base_directory = base_directory
try:
tf.gfile.MakeDirs(base_directory)
except tf.errors.PermissionDeniedError:
# We catch the PermissionDeniedError and issue a more useful exception.
raise ValueError('Unable to create checkpoint path: {}.'.format(
base_directory))
def _generate_filename(self, file_prefix, iteration_number):
"""Returns a checkpoint filename from prefix and iteration number."""
filename = '{}.{}'.format(file_prefix, iteration_number)
return os.path.join(self._base_directory, filename)
def _save_data_to_file(self, data, filename):
"""Saves the given 'data' object to a file."""
with tf.gfile.GFile(filename, 'w') as fout:
pickle.dump(data, fout)
def save_checkpoint(self, iteration_number, data):
"""Saves a new checkpoint at the current iteration_number.
Args:
iteration_number: int, the current iteration number for this checkpoint.
data: Any (picklable) python object containing the data to store in the
checkpoint.
"""
if iteration_number % self._checkpoint_frequency != 0:
return
filename = self._generate_filename(self._checkpoint_file_prefix,
iteration_number)
self._save_data_to_file(data, filename)
filename = self._generate_filename('sentinel_checkpoint_complete',
iteration_number)
with tf.gfile.GFile(filename, 'wb') as fout:
fout.write('done')
self._clean_up_old_checkpoints(iteration_number)
def _clean_up_old_checkpoints(self, iteration_number):
"""Removes sufficiently old checkpoints."""
# After writing the checkpoint and sentinel file, we garbage collect files
# that are CHECKPOINT_DURATION * self._checkpoint_frequency versions old.
stale_iteration_number = iteration_number - (self._checkpoint_frequency *
CHECKPOINT_DURATION)
if stale_iteration_number >= 0:
stale_file = self._generate_filename(self._checkpoint_file_prefix,
stale_iteration_number)
stale_sentinel = self._generate_filename('sentinel_checkpoint_complete',
stale_iteration_number)
try:
tf.gfile.Remove(stale_file)
tf.gfile.Remove(stale_sentinel)
except tf.errors.NotFoundError:
# Ignore if file not found.
tf.logging.info('Unable to remove {} or {}.'.format(stale_file,
stale_sentinel))
def _load_data_from_file(self, filename):
if not tf.gfile.Exists(filename):
return None
with tf.gfile.GFile(filename, 'rb') as fin:
return pickle.load(fin)
def load_checkpoint(self, iteration_number):
"""Tries to reload a checkpoint at the selected iteration number.
Args:
iteration_number: The checkpoint iteration number to try to load.
Returns:
If the checkpoint file exists, the unpickled object that was passed in
as data to save_checkpoint; returns None if the file does not exist.
"""
checkpoint_file = self._generate_filename(self._checkpoint_file_prefix,
iteration_number)
return self._load_data_from_file(checkpoint_file)
# coding=utf-8
# Copyright 2018 The Dopamine Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""A checkpointing mechanism for Dopamine agents.
This Checkpointer expects a base directory where checkpoints for different
iterations are stored. Specifically, Checkpointer.save_checkpoint() takes in
as input a dictionary 'data' to be pickled to disk. At each iteration, we
write a file called 'ckpt.#', where # is the iteration number. The
Checkpointer also cleans up old files, maintaining up to the CHECKPOINT_DURATION
most recent iterations.
The Checkpointer writes a sentinel file to indicate that checkpointing was
globally successful. This means that all other checkpointing activities
(saving the Tensorflow graph, the replay buffer) should be performed *prior*
to calling Checkpointer.save_checkpoint(). This allows the Checkpointer to
detect incomplete checkpoints.
#### Example
After running 10 iterations (numbered 0...9) with base_directory='/checkpoint',
the following files will exist:
```
/checkpoint/ckpt.6
/checkpoint/ckpt.7
/checkpoint/ckpt.8
/checkpoint/ckpt.9
/checkpoint/sentinel_checkpoint_complete.6
/checkpoint/sentinel_checkpoint_complete.7
/checkpoint/sentinel_checkpoint_complete.8
/checkpoint/sentinel_checkpoint_complete.9
```
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import pickle
import tensorflow as tf
CHECKPOINT_DURATION = 4
def get_latest_checkpoint_number(base_directory):
"""Returns the version number of the latest completed checkpoint.
Args:
base_directory: str, directory in which to look for checkpoint files.
Returns:
int, the iteration number of the latest checkpoint, or -1 if none was found.
"""
glob = os.path.join(base_directory, 'sentinel_checkpoint_complete.*')
def extract_iteration(x):
return int(x[x.rfind('.') + 1:])
try:
checkpoint_files = tf.gfile.Glob(glob)
except tf.errors.NotFoundError:
return -1
try:
latest_iteration = max(extract_iteration(x) for x in checkpoint_files)
return latest_iteration
except ValueError:
return -1
class Checkpointer(object):
"""Class for managing checkpoints for Dopamine agents.
"""
def __init__(self, base_directory, checkpoint_file_prefix='ckpt',
checkpoint_frequency=1):
"""Initializes Checkpointer.
Args:
base_directory: str, directory where all checkpoints are saved/loaded.
checkpoint_file_prefix: str, prefix to use for naming checkpoint files.
checkpoint_frequency: int, the frequency at which to checkpoint.
Raises:
ValueError: if base_directory is empty, or not creatable.
"""
if not base_directory:
raise ValueError('No path provided to Checkpointer.')
self._checkpoint_file_prefix = checkpoint_file_prefix
self._checkpoint_frequency = checkpoint_frequency
self._base_directory = base_directory
try:
tf.gfile.MakeDirs(base_directory)
except tf.errors.PermissionDeniedError:
# We catch the PermissionDeniedError and issue a more useful exception.
raise ValueError('Unable to create checkpoint path: {}.'.format(
base_directory))
def _generate_filename(self, file_prefix, iteration_number):
"""Returns a checkpoint filename from prefix and iteration number."""
filename = '{}.{}'.format(file_prefix, iteration_number)
return os.path.join(self._base_directory, filename)
def _save_data_to_file(self, data, filename):
"""Saves the given 'data' object to a file."""
with tf.gfile.GFile(filename, 'w') as fout:
pickle.dump(data, fout)
def save_checkpoint(self, iteration_number, data):
"""Saves a new checkpoint at the current iteration_number.
Args:
iteration_number: int, the current iteration number for this checkpoint.
data: Any (picklable) python object containing the data to store in the
checkpoint.
"""
if iteration_number % self._checkpoint_frequency != 0:
return
filename = self._generate_filename(self._checkpoint_file_prefix,
iteration_number)
self._save_data_to_file(data, filename)
filename = self._generate_filename('sentinel_checkpoint_complete',
iteration_number)
with tf.gfile.GFile(filename, 'wb') as fout:
fout.write('done')
self._clean_up_old_checkpoints(iteration_number)
def _clean_up_old_checkpoints(self, iteration_number):
"""Removes sufficiently old checkpoints."""
# After writing the checkpoint and sentinel file, we garbage collect files
# that are CHECKPOINT_DURATION * self._checkpoint_frequency versions old.
stale_iteration_number = iteration_number - (self._checkpoint_frequency *
CHECKPOINT_DURATION)
if stale_iteration_number >= 0:
stale_file = self._generate_filename(self._checkpoint_file_prefix,
stale_iteration_number)
stale_sentinel = self._generate_filename('sentinel_checkpoint_complete',
stale_iteration_number)
try:
tf.gfile.Remove(stale_file)
tf.gfile.Remove(stale_sentinel)
except tf.errors.NotFoundError:
# Ignore if file not found.
tf.logging.info('Unable to remove {} or {}.'.format(stale_file,
stale_sentinel))
def _load_data_from_file(self, filename):
if not tf.gfile.Exists(filename):
return None
with tf.gfile.GFile(filename, 'rb') as fin:
return pickle.load(fin)
def load_checkpoint(self, iteration_number):
"""Tries to reload a checkpoint at the selected iteration number.
Args:
iteration_number: The checkpoint iteration number to try to load.
Returns:
If the checkpoint file exists, the unpickled object that was passed in
as data to save_checkpoint; returns None if the file does not exist.
"""
checkpoint_file = self._generate_filename(self._checkpoint_file_prefix,
iteration_number)
return self._load_data_from_file(checkpoint_file)
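A hedged end-to-end sketch of the Checkpointer above; the directory and data are hypothetical, and the import path assumes the module lives under dopamine/discrete_domains/ as in this tree.

```python
# Hedged sketch: write a few checkpoints, then resume from the latest one.
from dopamine.discrete_domains import checkpointer

ckpt = checkpointer.Checkpointer('/tmp/dopamine_ckpts')  # hypothetical dir
for iteration in range(10):
    ckpt.save_checkpoint(iteration, {'iteration': iteration})

latest = checkpointer.get_latest_checkpoint_number('/tmp/dopamine_ckpts')
if latest >= 0:
    data = ckpt.load_checkpoint(latest)  # older files past CHECKPOINT_DURATION
    print(latest, data['iteration'])     # have already been garbage collected
```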


@@ -1,335 +1,335 @@
# coding=utf-8
# Copyright 2018 The Dopamine Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Gym-specific (non-Atari) utilities.
Some network specifications specific to certain Gym environments are provided
here.
Includes a wrapper class around Gym environments. This class makes general Gym
environments conformant with the API Dopamine is expecting.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import itertools
import math
import gym
import numpy as np
import tensorflow as tf
import gin.tf
CARTPOLE_MIN_VALS = np.array([-2.4, -5., -math.pi/12., -math.pi*2.])
CARTPOLE_MAX_VALS = np.array([2.4, 5., math.pi/12., math.pi*2.])
ACROBOT_MIN_VALS = np.array([-1., -1., -1., -1., -5., -5.])
ACROBOT_MAX_VALS = np.array([1., 1., 1., 1., 5., 5.])
gin.constant('gym_lib.CARTPOLE_OBSERVATION_SHAPE', (4, 1))
gin.constant('gym_lib.CARTPOLE_OBSERVATION_DTYPE', tf.float32)
gin.constant('gym_lib.CARTPOLE_STACK_SIZE', 1)
gin.constant('gym_lib.ACROBOT_OBSERVATION_SHAPE', (6, 1))
gin.constant('gym_lib.ACROBOT_OBSERVATION_DTYPE', tf.float32)
gin.constant('gym_lib.ACROBOT_STACK_SIZE', 1)
slim = tf.contrib.slim
@gin.configurable
def create_gym_environment(environment_name=None, version='v0'):
"""Wraps a Gym environment with some basic preprocessing.
Args:
environment_name: str, the name of the environment to run.
version: str, version of the environment to run.
Returns:
A Gym environment with some standard preprocessing.
"""
assert environment_name is not None
full_game_name = '{}-{}'.format(environment_name, version)
env = gym.make(full_game_name)
# Strip out the TimeLimit wrapper from Gym, which caps us at 200 steps.
env = env.env
# Wrap the returned environment in a class which conforms to the API expected
# by Dopamine.
env = GymPreprocessing(env)
return env
@gin.configurable
def _basic_discrete_domain_network(min_vals, max_vals, num_actions, state,
num_atoms=None):
"""Builds a basic network for discrete domains, rescaling inputs to [-1, 1].
Args:
min_vals: float, minimum attainable values (must be same shape as `state`).
max_vals: float, maximum attainable values (must be same shape as `state`).
num_actions: int, number of actions.
state: `tf.Tensor`, the state input.
num_atoms: int or None, if None will construct a DQN-style network,
otherwise will construct a Rainbow-style network.
Returns:
The Q-values for DQN-style agents or logits for Rainbow-style agents.
"""
net = tf.cast(state, tf.float32)
net = slim.flatten(net)
net -= min_vals
net /= max_vals - min_vals
net = 2.0 * net - 1.0 # Rescale in range [-1, 1].
net = slim.fully_connected(net, 512)
net = slim.fully_connected(net, 512)
if num_atoms is None:
# We are constructing a DQN-style network.
return slim.fully_connected(net, num_actions, activation_fn=None)
else:
# We are constructing a rainbow-style network.
return slim.fully_connected(net, num_actions * num_atoms,
activation_fn=None)
@gin.configurable
def cartpole_dqn_network(num_actions, network_type, state):
"""Builds the deep network used to compute the agent's Q-values.
It rescales the input features to a range that yields improved performance.
Args:
num_actions: int, number of actions.
network_type: namedtuple, collection of expected values to return.
state: `tf.Tensor`, contains the agent's current state.
Returns:
net: _network_type object containing the tensors output by the network.
"""
q_values = _basic_discrete_domain_network(
CARTPOLE_MIN_VALS, CARTPOLE_MAX_VALS, num_actions, state)
return network_type(q_values)
class FourierBasis(object):
"""Fourier Basis linear function approximation.
Requires the ranges for each dimension, and is thus able to use only sine or
cosine (and uses cosine). So, this has half the coefficients that a full
Fourier approximation would use.
Many thanks to Will Dabney (wdabney@) for this implementation.
From the paper:
G.D. Konidaris, S. Osentoski and P.S. Thomas. (2011)
Value Function Approximation in Reinforcement Learning using the Fourier Basis
"""
def __init__(self, nvars, min_vals=0, max_vals=None, order=3):
self.order = order
self.min_vals = min_vals
self.max_vals = max_vals
terms = itertools.product(range(order + 1), repeat=nvars)
# Removing first iterate because it corresponds to the constant bias
self.multipliers = tf.constant(
[list(map(int, x)) for x in terms][1:], dtype=tf.float32)
def scale(self, values):
shifted = values - self.min_vals
if self.max_vals is None:
return shifted
return shifted / (self.max_vals - self.min_vals)
def compute_features(self, features):
# Important to rescale features to be between [0,1]
scaled = self.scale(features)
return tf.cos(np.pi * tf.matmul(scaled, self.multipliers, transpose_b=True))
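
A small worked example, assuming the CartPole constants above: with nvars=4 and order=3, itertools.product yields (3 + 1) ** 4 = 256 multiplier vectors, and dropping the constant term leaves 255 cosine features.

basis = FourierBasis(4, CARTPOLE_MIN_VALS, CARTPOLE_MAX_VALS, order=3)
states = tf.zeros([1, 4], dtype=tf.float32)    # a batch of one state
features = basis.compute_features(states)      # shape [1, 255], values in [-1, 1]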
@gin.configurable
def fourier_dqn_network(min_vals,
max_vals,
num_actions,
state,
fourier_basis_order=3):
"""Builds the function approximator used to compute the agent's Q-values.
It uses FourierBasis features and a linear layer.
Args:
min_vals: np.ndarray, the minimum attainable value for each state dimension.
max_vals: np.ndarray, the maximum attainable value for each state dimension.
num_actions: int, number of actions.
state: `tf.Tensor`, contains the agent's current state.
fourier_basis_order: int, order of the Fourier basis functions.
Returns:
The Q-values for DQN-style agents.
"""
net = tf.cast(state, tf.float32)
net = slim.flatten(net)
# Feed state through Fourier basis.
feature_generator = FourierBasis(
net.get_shape().as_list()[-1],
min_vals,
max_vals,
order=fourier_basis_order)
net = feature_generator.compute_features(net)
# Q-values are always linear w.r.t. last layer.
q_values = slim.fully_connected(
net, num_actions, activation_fn=None, biases_initializer=None)
return q_values
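
For concreteness, a sketch of the resulting shapes under the same CartPole assumptions (the Q-function is linear in the 255 Fourier features, with no bias term):

state_ph = tf.placeholder(tf.float32, [None, 4, 1])
q_values = fourier_dqn_network(CARTPOLE_MIN_VALS, CARTPOLE_MAX_VALS,
                               num_actions=2, state=state_ph,
                               fourier_basis_order=3)
# q_values has shape [None, 2].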
def cartpole_fourier_dqn_network(num_actions, network_type, state):
"""Builds the function approximator used to compute the agent's Q-values.
It uses the Fourier basis features and a linear function approximator.
Args:
num_actions: int, number of actions.
network_type: namedtuple, collection of expected values to return.
state: `tf.Tensor`, contains the agent's current state.
Returns:
net: _network_type object containing the tensors output by the network.
"""
q_values = fourier_dqn_network(CARTPOLE_MIN_VALS, CARTPOLE_MAX_VALS,
num_actions, state)
return network_type(q_values)
@gin.configurable
def cartpole_rainbow_network(num_actions, num_atoms, support, network_type,
state):
"""Build the deep network used to compute the agent's Q-value distributions.
Args:
num_actions: int, number of actions.
num_atoms: int, the number of buckets of the value function distribution.
support: tf.linspace, the support of the Q-value distribution.
network_type: `namedtuple`, collection of expected values to return.
state: `tf.Tensor`, contains the agent's current state.
Returns:
net: _network_type object containing the tensors output by the network.
"""
net = _basic_discrete_domain_network(
CARTPOLE_MIN_VALS, CARTPOLE_MAX_VALS, num_actions, state,
num_atoms=num_atoms)
logits = tf.reshape(net, [-1, num_actions, num_atoms])
probabilities = tf.contrib.layers.softmax(logits)
q_values = tf.reduce_sum(support * probabilities, axis=2)
return network_type(q_values, logits, probabilities)
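
A numeric sketch of the distributional head above: the per-atom logits are softmaxed for each action, and the expectation under `support` gives the Q-values; uniform logits over a symmetric support give Q = 0 for every action.

support = tf.linspace(-10., 10., 51)                       # 51 atoms
logits = tf.zeros([1, 2, 51])                              # [batch, actions, atoms]
probabilities = tf.contrib.layers.softmax(logits)          # uniform, 1/51 each
q_values = tf.reduce_sum(support * probabilities, axis=2)  # shape [1, 2], all zeros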
@gin.configurable
def acrobot_dqn_network(num_actions, network_type, state):
"""Builds the deep network used to compute the agent's Q-values.
It rescales the input features to a range that yields improved performance.
Args:
num_actions: int, number of actions.
network_type: namedtuple, collection of expected values to return.
state: `tf.Tensor`, contains the agent's current state.
Returns:
net: _network_type object containing the tensors output by the network.
"""
q_values = _basic_discrete_domain_network(
ACROBOT_MIN_VALS, ACROBOT_MAX_VALS, num_actions, state)
return network_type(q_values)
@gin.configurable
def acrobot_fourier_dqn_network(num_actions, network_type, state):
"""Builds the function approximator used to compute the agent's Q-values.
It uses the Fourier basis features and a linear function approximator.
Args:
num_actions: int, number of actions.
network_type: namedtuple, collection of expected values to return.
state: `tf.Tensor`, contains the agent's current state.
Returns:
net: _network_type object containing the tensors output by the network.
"""
q_values = fourier_dqn_network(ACROBOT_MIN_VALS, ACROBOT_MAX_VALS,
num_actions, state)
return network_type(q_values)
@gin.configurable
def acrobot_rainbow_network(num_actions, num_atoms, support, network_type,
state):
"""Build the deep network used to compute the agent's Q-value distributions.
Args:
num_actions: int, number of actions.
num_atoms: int, the number of buckets of the value function distribution.
support: tf.linspace, the support of the Q-value distribution.
network_type: `namedtuple`, collection of expected values to return.
state: `tf.Tensor`, contains the agent's current state.
Returns:
net: _network_type object containing the tensors output by the network.
"""
net = _basic_discrete_domain_network(
ACROBOT_MIN_VALS, ACROBOT_MAX_VALS, num_actions, state,
num_atoms=num_atoms)
logits = tf.reshape(net, [-1, num_actions, num_atoms])
probabilities = tf.contrib.layers.softmax(logits)
q_values = tf.reduce_sum(support * probabilities, axis=2)
return network_type(q_values, logits, probabilities)
@gin.configurable
class GymPreprocessing(object):
"""A Wrapper class around Gym environments."""
def __init__(self, environment):
self.environment = environment
self.game_over = False
@property
def observation_space(self):
return self.environment.observation_space
@property
def action_space(self):
return self.environment.action_space
@property
def reward_range(self):
return self.environment.reward_range
@property
def metadata(self):
return self.environment.metadata
def reset(self):
return self.environment.reset()
def step(self, action):
observation, reward, game_over, info = self.environment.step(action)
self.game_over = game_over
return observation, reward, game_over, info
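
A minimal sketch of using the wrapper directly (create_gym_environment above performs the same wrapping); random actions on CartPole without the TimeLimit wrapper still terminate once the pole falls:

raw_env = gym.make('CartPole-v0').env   # strip the TimeLimit wrapper, as above
env = GymPreprocessing(raw_env)
observation = env.reset()
while not env.game_over:
    observation, reward, done, info = env.step(env.action_space.sample())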
@ -1,49 +1,49 @@
# coding=utf-8
# Copyright 2018 The Dopamine Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""A class for storing iteration-specific metrics.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
class IterationStatistics(object):
"""A class for storing iteration-specific metrics.
The internal format is as follows: we maintain a mapping from keys to lists.
Each list contains all the values corresponding to the given key.
For example, self.data_lists['train_episode_returns'] might contain the
per-episode returns achieved during this iteration.
Attributes:
data_lists: dict mapping each metric_name (str) to a list of said metric
across episodes.
"""
def __init__(self):
self.data_lists = {}
def append(self, data_pairs):
"""Add the given values to their corresponding key-indexed lists.
Args:
data_pairs: A dictionary of key-value pairs to be recorded.
"""
for key, value in data_pairs.items():
if key not in self.data_lists:
self.data_lists[key] = []
self.data_lists[key].append(value)
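
A short usage sketch; the metric names are illustrative:

statistics = IterationStatistics()
statistics.append({'train_episode_returns': 12.0, 'train_episode_lengths': 200})
statistics.append({'train_episode_returns': 15.5, 'train_episode_lengths': 180})
assert statistics.data_lists['train_episode_returns'] == [12.0, 15.5]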
@ -1,105 +1,105 @@
# coding=utf-8
# Copyright 2018 The Dopamine Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""A lightweight logging mechanism for dopamine agents."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import pickle
import tensorflow as tf
CHECKPOINT_DURATION = 4
class Logger(object):
"""Class for maintaining a dictionary of data to log."""
def __init__(self, logging_dir):
"""Initializes Logger.
Args:
logging_dir: str, Directory to which logs are written.
"""
# Dict used by logger to store data.
self.data = {}
self._logging_enabled = True
if not logging_dir:
tf.logging.info('Logging directory not specified, will not log.')
self._logging_enabled = False
return
# Try to create logging directory.
try:
tf.gfile.MakeDirs(logging_dir)
except tf.errors.PermissionDeniedError:
# If it already exists, ignore exception.
pass
if not tf.gfile.Exists(logging_dir):
tf.logging.warning(
'Could not create directory %s, logging will be disabled.',
logging_dir)
self._logging_enabled = False
return
self._logging_dir = logging_dir
def __setitem__(self, key, value):
"""This method will set an entry at key with value in the dictionary.
It will effectively overwrite any previous data at the same key.
Args:
key: str, indicating key where to write the entry.
value: A python object to store.
"""
if self._logging_enabled:
self.data[key] = value
def _generate_filename(self, filename_prefix, iteration_number):
filename = '{}_{}'.format(filename_prefix, iteration_number)
return os.path.join(self._logging_dir, filename)
def log_to_file(self, filename_prefix, iteration_number):
"""Save the pickled dictionary to a file.
Args:
filename_prefix: str, name of the file to use (without iteration
number).
iteration_number: int, the iteration number, appended to the end of
filename_prefix.
"""
if not self._logging_enabled:
tf.logging.warning('Logging is disabled.')
return
log_file = self._generate_filename(filename_prefix, iteration_number)
with tf.gfile.GFile(log_file, 'w') as fout:
pickle.dump(self.data, fout, protocol=pickle.HIGHEST_PROTOCOL)
# After writing a checkpoint file, we garbage collect the log file
# that is CHECKPOINT_DURATION versions old.
stale_iteration_number = iteration_number - CHECKPOINT_DURATION
if stale_iteration_number >= 0:
stale_file = self._generate_filename(filename_prefix,
stale_iteration_number)
try:
tf.gfile.Remove(stale_file)
except tf.errors.NotFoundError:
# Ignore if file not found.
pass
def is_logging_enabled(self):
"""Return if logging is enabled."""
return self._logging_enabled
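
A brief usage sketch, assuming an illustrative '/tmp/dopamine_logs' directory and 'log' prefix; the logger behaves like a write-only dictionary that is pickled once per iteration, keeping only the most recent CHECKPOINT_DURATION files:

logger = Logger('/tmp/dopamine_logs')
logger['iteration_statistics'] = {'train_episode_returns': [12.0, 15.5]}
if logger.is_logging_enabled():
    logger.log_to_file('log', iteration_number=0)  # writes /tmp/dopamine_logs/log_0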
@ -7,10 +7,13 @@ mkdir ../agents/fqf/configs/gins &> /dev/null
n=0
#iqn_fqf-ws-sticky-0" "iqn_fqf-ws-sticky-0"
declare -a games=("Centipede")
declare -a seeds=(0 1 2)
declare -a factors=(0.00001)
declare -a ents=(0.0001)
#Berzerk Gopher Kangaroo ChopperCommand Centipede Breakout Amidar KungFuMaster DoubleDunk
declare -a seeds=(0)
declare -a factors=(0.00001 0.000001)
declare -a ents=(0.0001 0.00001)
declare -a optimizers=('rmsprop')
declare -a losses=('directbp' 'sqloss')
for game in "${games[@]}"
do
for opt in "${optimizers[@]}"
@ -21,12 +24,15 @@ do
do
for ent in "${ents[@]}"
do
d="iqn_fqf-ws-${opt}-f${factor}-e${ent}-s${seed}"
sed -e "s!GAME!${game}!" -e "s!RUNTYPE!$d!" -e "s!FQFFACTOR!${factor}!" -e "s!FQFENT!${ent}!" ../agents/fqf/configs/fqf.gin > ../agents/fqf/configs/gins/${d}_${game}.gin
CUDA_VISIBLE_DEVICES=$n nohup python train.py --base_dir=/tmp/${d}-${game} --gin_files="../agents/fqf/configs/gins/${d}_${game}.gin" >& logs/output_${game}_${d} &
echo "$i, $n"
n=$((($n+1) % 4))
sleep 2
for loss in "${losses[@]}"
do
d="iqn_fqf-ws-${loss}-${opt}-f${factor}-e${ent}-s${seed}"
sed -e "s!GAME!${game}!" -e "s!RUNTYPE!$d!" -e "s!FQFFACTOR!${factor}!" -e "s!FQFENT!${ent}!" ../agents/fqf/configs/fqf.gin > ../agents/fqf/configs/gins/${d}_${game}.gin
CUDA_VISIBLE_DEVICES=$n nohup python train.py --base_dir=/tmp/${d}-${game} --gin_files="../agents/fqf/configs/gins/${d}_${game}.gin" >& logs/output_${game}_${d} &
echo "$d, $n"
n=$((($n+1) % 4))
sleep 2
done
done
done
done

@ -14,7 +14,7 @@ do
d="iqn-s${seed}"
sed -e "s!GAME!${game}!" -e "s!RUNTYPE!$d!" ../agents/implicit_quantile/configs/implicit_quantile_icml.gin > ../agents/implicit_quantile/configs/gins/${d}_icml_${game}.gin
CUDA_VISIBLE_DEVICES=$n nohup python train.py --base_dir=/tmp/${d}-${game} --gin_files="../agents/implicit_quantile/configs/gins/${d}_icml_${game}.gin" >& logs/output_${game}_${d} &
echo "$i, $n"
echo "$d, $n"
n=$(($n+1))
sleep 2
done

The diff for this file is not shown because of its large size.


@ -1,65 +1,65 @@
# coding=utf-8
# Copyright 2018 The Dopamine Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
r"""The entry point for running a Dopamine agent.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import sys
sys.path = ['../../'] + sys.path
print (sys.path)
#exit(0)
from absl import app
from absl import flags
from dopamine.discrete_domains import run_experiment
import tensorflow as tf
flags.DEFINE_string('base_dir', None,
'Base directory to host all required sub-directories.')
flags.DEFINE_multi_string(
'gin_files', [], 'List of paths to gin configuration files (e.g.'
'"dopamine/agents/dqn/dqn.gin").')
flags.DEFINE_multi_string(
'gin_bindings', [],
'Gin bindings to override the values set in the config files '
'(e.g. "DQNAgent.epsilon_train=0.1",'
' "create_environment.game_name="Pong"").')
FLAGS = flags.FLAGS
def main(unused_argv):
"""Main method.
Args:
unused_argv: Arguments (unused).
"""
tf.logging.set_verbosity(tf.logging.INFO)
run_experiment.load_gin_configs(FLAGS.gin_files, FLAGS.gin_bindings)
runner = run_experiment.create_runner(FLAGS.base_dir)
runner.run_experiment()
if __name__ == '__main__':
flags.mark_flag_as_required('base_dir')
app.run(main)
@ -1,15 +1,15 @@
# coding=utf-8
# Copyright 2018 The Dopamine Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

The diff for this file is not shown because of its large size.


@ -1,357 +1,357 @@
# coding=utf-8
# Copyright 2018 The Dopamine Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""An implementation of Prioritized Experience Replay (PER).
This implementation is based on the paper "Prioritized Experience Replay"
by Tom Schaul et al. (2015). Many thanks to Tom Schaul, John Quan, and Matteo
Hessel for providing useful pointers on the algorithm and its implementation.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from dopamine.replay_memory import circular_replay_buffer
from dopamine.replay_memory import sum_tree
from dopamine.replay_memory.circular_replay_buffer import ReplayElement
import numpy as np
import tensorflow as tf
import gin.tf
class OutOfGraphPrioritizedReplayBuffer(
circular_replay_buffer.OutOfGraphReplayBuffer):
"""An out-of-graph Replay Buffer for Prioritized Experience Replay.
See circular_replay_buffer.py for details.
"""
def __init__(self,
observation_shape,
stack_size,
replay_capacity,
batch_size,
update_horizon=1,
gamma=0.99,
max_sample_attempts=circular_replay_buffer.MAX_SAMPLE_ATTEMPTS,
extra_storage_types=None,
observation_dtype=np.uint8,
action_shape=(),
action_dtype=np.int32,
reward_shape=(),
reward_dtype=np.float32):
"""Initializes OutOfGraphPrioritizedReplayBuffer.
Args:
observation_shape: tuple of ints.
stack_size: int, number of frames to use in state stack.
replay_capacity: int, number of transitions to keep in memory.
batch_size: int.
update_horizon: int, length of update ('n' in n-step update).
gamma: float, the discount factor.
max_sample_attempts: int, the maximum number of attempts allowed to
get a sample.
extra_storage_types: list of ReplayElements defining the type of the extra
contents that will be stored and returned by sample_transition_batch.
observation_dtype: np.dtype, type of the observations. Defaults to
np.uint8 for Atari 2600.
action_shape: tuple of ints, the shape for the action vector. Empty tuple
means the action is a scalar.
action_dtype: np.dtype, type of elements in the action.
reward_shape: tuple of ints, the shape of the reward vector. Empty tuple
means the reward is a scalar.
reward_dtype: np.dtype, type of elements in the reward.
"""
super(OutOfGraphPrioritizedReplayBuffer, self).__init__(
observation_shape=observation_shape,
stack_size=stack_size,
replay_capacity=replay_capacity,
batch_size=batch_size,
update_horizon=update_horizon,
gamma=gamma,
max_sample_attempts=max_sample_attempts,
extra_storage_types=extra_storage_types,
observation_dtype=observation_dtype,
action_shape=action_shape,
action_dtype=action_dtype,
reward_shape=reward_shape,
reward_dtype=reward_dtype)
self.sum_tree = sum_tree.SumTree(replay_capacity)
def get_add_args_signature(self):
"""The signature of the add function.
The signature is the same as the one for OutOfGraphReplayBuffer, with an
added priority.
Returns:
list of ReplayElements defining the type of the argument signature needed
by the add function.
"""
parent_add_signature = super(OutOfGraphPrioritizedReplayBuffer,
self).get_add_args_signature()
add_signature = parent_add_signature + [
ReplayElement('priority', (), np.float32)
]
return add_signature
def _add(self, *args):
"""Internal add method to add to the underlying memory arrays.
The arguments need to match add_arg_signature.
If priority is none, it is set to the maximum priority ever seen.
Args:
*args: All the elements in a transition.
"""
# Use Schaul et al.'s (2015) scheme of setting the priority of new elements
# to the maximum priority so far.
parent_add_args = []
# Picks out 'priority' from arguments and passes the other arguments to the
# parent method.
for i, element in enumerate(self.get_add_args_signature()):
if element.name == 'priority':
priority = args[i]
else:
parent_add_args.append(args[i])
self.sum_tree.set(self.cursor(), priority)
super(OutOfGraphPrioritizedReplayBuffer, self)._add(*parent_add_args)
def sample_index_batch(self, batch_size):
"""Returns a batch of valid indices sampled as in Schaul et al. (2015).
Args:
batch_size: int, number of indices returned.
Returns:
list of ints, a batch of valid indices sampled proportionally to priority (stratified as in Schaul et al., 2015).
Raises:
Exception: If the batch was not constructed after maximum number of tries.
"""
# Sample stratified indices. Some of them might be invalid.
indices = self.sum_tree.stratified_sample(batch_size)
allowed_attempts = self._max_sample_attempts
for i in range(len(indices)):
if not self.is_valid_transition(indices[i]):
if allowed_attempts == 0:
raise RuntimeError(
'Max sample attempts: Tried {} times but only sampled {}'
' valid indices. Batch size is {}'.format(
self._max_sample_attempts, i, batch_size))
index = indices[i]
while not self.is_valid_transition(index) and allowed_attempts > 0:
# If index i is not valid keep sampling others. Note that this
# is not stratified.
index = self.sum_tree.sample()
allowed_attempts -= 1
indices[i] = index
return indices
def sample_transition_batch(self, batch_size=None, indices=None):
"""Returns a batch of transitions with extra storage and the priorities.
The extra storage is defined through the extra_storage_types constructor
argument.
When the transition is terminal next_state_batch has undefined contents.
Args:
batch_size: int, number of transitions returned. If None, the default
batch_size will be used.
indices: None or list of ints, the indices of every transition in the
batch. If None, sample the indices uniformly.
Returns:
transition_batch: tuple of np.arrays with the shape and type as in
get_transition_elements().
"""
transition = (super(OutOfGraphPrioritizedReplayBuffer, self).
sample_transition_batch(batch_size, indices))
transition_elements = self.get_transition_elements(batch_size)
transition_names = [e.name for e in transition_elements]
probabilities_index = transition_names.index('sampling_probabilities')
indices_index = transition_names.index('indices')
indices = transition[indices_index]
# The parent returned an empty array for the probabilities. Fill it with the
# contents of the sum tree.
transition[probabilities_index][:] = self.get_priority(indices)
return transition
def set_priority(self, indices, priorities):
"""Sets the priority of the given elements according to Schaul et al.
Args:
indices: np.array with dtype int32, of indices in range
[0, replay_capacity).
priorities: float, the corresponding priorities.
"""
assert indices.dtype == np.int32, ('Indices must be integers, '
'given: {}'.format(indices.dtype))
for index, priority in zip(indices, priorities):
self.sum_tree.set(index, priority)
def get_priority(self, indices):
"""Fetches the priorities correspond to a batch of memory indices.
For any memory location not yet used, the corresponding priority is 0.
Args:
indices: np.array with dtype int32, of indices in range
[0, replay_capacity).
Returns:
priorities: float, the corresponding priorities.
"""
assert indices.shape, 'Indices must be an array.'
assert indices.dtype == np.int32, ('Indices must be int32s, '
'given: {}'.format(indices.dtype))
batch_size = len(indices)
priority_batch = np.empty((batch_size), dtype=np.float32)
for i, memory_index in enumerate(indices):
priority_batch[i] = self.sum_tree.get(memory_index)
return priority_batch
def get_transition_elements(self, batch_size=None):
"""Returns a 'type signature' for sample_transition_batch.
Args:
batch_size: int, number of transitions returned. If None, the default
batch_size will be used.
Returns:
signature: A namedtuple describing the method's return type signature.
"""
parent_transition_type = (
super(OutOfGraphPrioritizedReplayBuffer,
self).get_transition_elements(batch_size))
probabilities_type = [
ReplayElement('sampling_probabilities', (batch_size,), np.float32)
]
return parent_transition_type + probabilities_type
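
A rough usage sketch of the out-of-graph buffer, assuming a CartPole-like (4,) float32 observation and default settings otherwise: transitions are added together with a priority, and priorities can later be updated from TD errors.

replay = OutOfGraphPrioritizedReplayBuffer(
    observation_shape=(4,), stack_size=1, replay_capacity=1000, batch_size=32,
    observation_dtype=np.float32)
for _ in range(100):
    # (observation, action, reward, terminal, priority)
    replay.add(np.zeros((4,), dtype=np.float32), 0, 0.0, 0, 1.0)
indices = np.asarray(replay.sample_index_batch(8), dtype=np.int32)
old_priorities = replay.get_priority(indices)             # all 1.0 so far
replay.set_priority(indices, np.full(8, 0.5, dtype=np.float32))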
@gin.configurable(blacklist=['observation_shape', 'stack_size',
'update_horizon', 'gamma'])
class WrappedPrioritizedReplayBuffer(
circular_replay_buffer.WrappedReplayBuffer):
"""Wrapper of OutOfGraphPrioritizedReplayBuffer with in-graph sampling.
Usage:
* To add a transition: Call the add function.
* To sample a batch: Query any of the tensors in the transition dictionary.
Every sess.run that requires any of these tensors will
sample a new transition.
"""
def __init__(self,
observation_shape,
stack_size,
use_staging=True,
replay_capacity=1000000,
batch_size=32,
update_horizon=1,
gamma=0.99,
max_sample_attempts=circular_replay_buffer.MAX_SAMPLE_ATTEMPTS,
extra_storage_types=None,
observation_dtype=np.uint8,
action_shape=(),
action_dtype=np.int32,
reward_shape=(),
reward_dtype=np.float32):
"""Initializes WrappedPrioritizedReplayBuffer.
Args:
observation_shape: tuple of ints.
stack_size: int, number of frames to use in state stack.
use_staging: bool, when True, a staging area is used to prefetch the next
sampling batch.
replay_capacity: int, number of transitions to keep in memory.
batch_size: int.
update_horizon: int, length of update ('n' in n-step update).
gamma: float, the discount factor.
max_sample_attempts: int, the maximum number of attempts allowed to
get a sample.
extra_storage_types: list of ReplayElements defining the type of the extra
contents that will be stored and returned by sample_transition_batch.
observation_dtype: np.dtype, type of the observations. Defaults to
np.uint8 for Atari 2600.
action_shape: tuple of ints, the shape for the action vector. Empty tuple
means the action is a scalar.
action_dtype: np.dtype, type of elements in the action.
reward_shape: tuple of ints, the shape of the reward vector. Empty tuple
means the reward is a scalar.
reward_dtype: np.dtype, type of elements in the reward.
Raises:
ValueError: If update_horizon is not positive.
ValueError: If discount factor is not in [0, 1].
"""
memory = OutOfGraphPrioritizedReplayBuffer(
observation_shape, stack_size, replay_capacity, batch_size,
update_horizon, gamma, max_sample_attempts,
extra_storage_types=extra_storage_types,
observation_dtype=observation_dtype)
super(WrappedPrioritizedReplayBuffer, self).__init__(
observation_shape,
stack_size,
use_staging,
replay_capacity,
batch_size,
update_horizon,
gamma,
wrapped_memory=memory,
extra_storage_types=extra_storage_types,
observation_dtype=observation_dtype,
action_shape=action_shape,
action_dtype=action_dtype,
reward_shape=reward_shape,
reward_dtype=reward_dtype)
def tf_set_priority(self, indices, priorities):
"""Sets the priorities for the given indices.
Args:
indices: tf.Tensor with dtype int32 and shape [n].
priorities: tf.Tensor with dtype float and shape [n].
Returns:
A tf op setting the priorities for prioritized sampling.
"""
return tf.py_func(
self.memory.set_priority, [indices, priorities], [],
name='prioritized_replay_set_priority_py_func')
def tf_get_priority(self, indices):
"""Gets the priorities for the given indices.
Args:
indices: tf.Tensor with dtype int32 and shape [n].
Returns:
priorities: tf.Tensor with dtype float and shape [n], the priorities at
the indices.
"""
return tf.py_func(
self.memory.get_priority, [indices],
tf.float32,
name='prioritized_replay_get_priority_py_func')
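
A sketch of the in-graph side, loosely mirroring how Dopamine's Rainbow agent consumes it; `wrapped.transition` and `wrapped.indices` come from the wrapped buffer's sampling ops, and `td_errors` stands in for a per-sample loss the agent would compute:

wrapped = WrappedPrioritizedReplayBuffer(
    observation_shape=(4,), stack_size=1, use_staging=False,
    observation_dtype=np.float32)
probs = wrapped.transition['sampling_probabilities']
loss_weights = 1.0 / tf.sqrt(probs + 1e-10)     # importance-sampling weights
loss_weights /= tf.reduce_max(loss_weights)
# update_op = wrapped.tf_set_priority(wrapped.indices, tf.sqrt(td_errors + 1e-10))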
@ -1,205 +1,205 @@
# coding=utf-8
# Copyright 2018 The Dopamine Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""A sum tree data structure.
Used for prioritized experience replay. See prioritized_replay_buffer.py
and Schaul et al. (2015).
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import random
import numpy as np
class SumTree(object):
"""A sum tree data structure for storing replay priorities.
A sum tree is a complete binary tree whose leaves contain values called
priorities. Internal nodes maintain the sum of the priorities of all leaf
nodes in their subtree.
For capacity = 4, the tree may look like this:
             +---+
             |2.5|
             +-+-+
               |
       +-------+--------+
       |                |
     +-+-+            +-+-+
     |1.5|            |1.0|
     +-+-+            +-+-+
       |                |
  +----+----+      +----+----+
  |         |      |         |
+-+-+     +-+-+  +-+-+     +-+-+
|0.5|     |1.0|  |0.5|     |0.5|
+---+     +---+  +---+     +---+
This is stored in a list of numpy arrays:
self.nodes = [ [2.5], [1.5, 1], [0.5, 1, 0.5, 0.5] ]
For conciseness, we allocate arrays as powers of two, and pad the excess
elements with zero values.
This is similar to the usual array-based representation of a complete binary
tree, but is a little more user-friendly.
"""
def __init__(self, capacity):
"""Creates the sum tree data structure for the given replay capacity.
Args:
capacity: int, the maximum number of elements that can be stored in this
data structure.
Raises:
ValueError: If requested capacity is not positive.
"""
assert isinstance(capacity, int)
if capacity <= 0:
raise ValueError('Sum tree capacity should be positive. Got: {}'.
format(capacity))
self.nodes = []
tree_depth = int(math.ceil(np.log2(capacity)))
level_size = 1
for _ in range(tree_depth + 1):
nodes_at_this_depth = np.zeros(level_size)
self.nodes.append(nodes_at_this_depth)
level_size *= 2
self.max_recorded_priority = 1.0
def _total_priority(self):
"""Returns the sum of all priorities stored in this sum tree.
Returns:
float, sum of priorities stored in this sum tree.
"""
return self.nodes[0][0]
def sample(self, query_value=None):
"""Samples an element from the sum tree.
Each element has probability p_i / sum_j p_j of being picked, where p_i is
the (positive) value associated with node i (possibly unnormalized).
Args:
query_value: float in [0, 1], used as the random value to select a
sample. If None, will select one randomly in [0, 1).
Returns:
int, a random element from the sum tree.
Raises:
    Exception: If the sum tree is empty (i.e. its node values sum to 0).
    ValueError: If the supplied query_value is outside of [0, 1].
"""
if self._total_priority() == 0.0:
raise Exception('Cannot sample from an empty sum tree.')
if query_value and (query_value < 0. or query_value > 1.):
raise ValueError('query_value must be in [0, 1].')
# Sample a value in range [0, R), where R is the value stored at the root.
query_value = random.random() if query_value is None else query_value
query_value *= self._total_priority()
# Now traverse the sum tree.
node_index = 0
for nodes_at_this_depth in self.nodes[1:]:
# Compute children of previous depth's node.
left_child = node_index * 2
left_sum = nodes_at_this_depth[left_child]
# Each subtree describes a range [0, a), where a is its value.
if query_value < left_sum: # Recurse into left subtree.
node_index = left_child
else: # Recurse into right subtree.
node_index = left_child + 1
# Adjust query to be relative to right subtree.
query_value -= left_sum
return node_index
def stratified_sample(self, batch_size):
"""Performs stratified sampling using the sum tree.
Let R be the value at the root (total value of sum tree). This method will
divide [0, R) into batch_size segments, pick a random number from each of
those segments, and use that random number to sample from the sum_tree. This
is as specified in Schaul et al. (2015).
Args:
batch_size: int, the number of strata to use.
Returns:
list of batch_size elements sampled from the sum tree.
Raises:
Exception: If the sum tree is empty (i.e. its node values sum to 0).
"""
if self._total_priority() == 0.0:
raise Exception('Cannot sample from an empty sum tree.')
bounds = np.linspace(0., 1., batch_size + 1)
assert len(bounds) == batch_size + 1
segments = [(bounds[i], bounds[i+1]) for i in range(batch_size)]
query_values = [random.uniform(x[0], x[1]) for x in segments]
return [self.sample(query_value=x) for x in query_values]
def get(self, node_index):
"""Returns the value of the leaf node corresponding to the index.
Args:
node_index: The index of the leaf node.
Returns:
The value of the leaf node.
"""
return self.nodes[-1][node_index]
def set(self, node_index, value):
"""Sets the value of a leaf node and updates internal nodes accordingly.
This operation takes O(log(capacity)).
Args:
node_index: int, the index of the leaf node to be updated.
value: float, the value which we assign to the node. This value must be
nonnegative. Setting value = 0 will cause the element to never be
sampled.
Raises:
ValueError: If the given value is negative.
"""
if value < 0.0:
raise ValueError('Sum tree values should be nonnegative. Got {}'.
format(value))
self.max_recorded_priority = max(value, self.max_recorded_priority)
delta_value = value - self.nodes[-1][node_index]
# Now traverse back the tree, adjusting all sums along the way.
for nodes_at_this_depth in reversed(self.nodes):
# Note: Adding a delta leads to some tolerable numerical inaccuracies.
nodes_at_this_depth[node_index] += delta_value
node_index //= 2
assert node_index == 0, ('Sum tree traversal failed, final node index '
'is not 0.')
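As a quick illustration of the class above (a sketch, not part of the file), the snippet below rebuilds the capacity-4 tree from the docstring diagram and exercises set, get, sample and stratified_sample; the expected outputs follow from the traversal rule in sample.
# Rebuild the capacity-4 example from the SumTree docstring above.
tree = SumTree(capacity=4)
for index, priority in enumerate([0.5, 1.0, 0.5, 0.5]):
  tree.set(index, priority)

print(tree._total_priority())        # 2.5, the value stored at the root
print(tree.get(1))                   # 1.0, the priority of leaf 1
print(tree.sample(query_value=0.3))  # 1, since 0.3 * 2.5 = 0.75 lands in leaf 1
print(tree.stratified_sample(2))     # one sampled leaf index per stratum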


@@ -1,15 +1,15 @@
# coding=utf-8
# Copyright 2018 The Dopamine Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


@@ -1,34 +1,34 @@
# coding=utf-8
# Copyright 2018 The Dopamine Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Common testing utilities shared across agents."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import mock
import tensorflow as tf
class MockReplayBuffer(object):
"""Mock ReplayBuffer to verify the way the agent interacts with it."""
def __init__(self):
with tf.variable_scope('MockReplayBuffer', reuse=tf.AUTO_REUSE):
self.add = mock.Mock()
self.memory = mock.Mock()
self.memory.add_count = 0
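A short usage sketch (not from the file above): the argument order passed to add below is assumed to mirror the replay buffers' (observation, action, reward, terminal) signature, and the concrete values are illustrative.
import numpy as np

replay = MockReplayBuffer()
observation = np.zeros((84, 84), dtype=np.uint8)
replay.add(observation, 0, 1.0, False)  # what an agent under test would call
assert replay.add.call_count == 1       # the mock recorded exactly one add
assert replay.memory.add_count == 0     # counter starts at 0, as set above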

setup.py

@@ -1,92 +1,92 @@
# coding=utf-8
# Copyright 2018 The Dopamine Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Setup script for Dopamine.
This script will install Dopamine as a Python module.
See: https://github.com/google/dopamine
"""
import codecs
from os import path
from setuptools import find_packages
from setuptools import setup
here = path.abspath(path.dirname(__file__))
# Get the long description from the README file.
with codecs.open(path.join(here, 'README.md'), encoding='utf-8') as f:
long_description = f.read()
install_requires = ['gin-config == 0.1.4', 'absl-py >= 0.2.2',
'opencv-python >= 3.4.1.15',
'gym >= 0.10.5']
tests_require = ['gin-config >= 0.1.1', 'absl-py >= 0.2.2',
'opencv-python >= 3.4.1.15',
'gym >= 0.10.5', 'mock >= 1.0.0']
dopamine_description = (
'Dopamine: A framework for flexible Reinforcement Learning research')
setup(
name='dopamine_rl',
version='2.0.1',
include_package_data=True,
packages=find_packages(exclude=['docs']), # Required
package_data={'testdata': ['testdata/*.gin']},
install_requires=install_requires,
tests_require=tests_require,
description=dopamine_description,
long_description=long_description,
url='https://github.com/google/dopamine', # Optional
author='The Dopamine Team', # Optional
author_email='opensource@google.com',
classifiers=[ # Optional
'Development Status :: 4 - Beta',
# Indicate who your project is intended for
'Intended Audience :: Developers',
'Intended Audience :: Education',
'Intended Audience :: Science/Research',
# Pick your license as you wish
'License :: OSI Approved :: Apache Software License',
# Specify the Python versions you support here. In particular, ensure
# that you indicate whether you support Python 2, Python 3 or both.
'Programming Language :: Python :: 2',
'Programming Language :: Python :: 2.7',
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.4',
'Programming Language :: Python :: 3.5',
'Programming Language :: Python :: 3.6',
'Topic :: Scientific/Engineering',
'Topic :: Scientific/Engineering :: Mathematics',
'Topic :: Scientific/Engineering :: Artificial Intelligence',
'Topic :: Software Development',
'Topic :: Software Development :: Libraries',
'Topic :: Software Development :: Libraries :: Python Modules',
],
project_urls={ # Optional
'Documentation': 'https://github.com/google/dopamine',
'Bug Reports': 'https://github.com/google/dopamine/issues',
'Source': 'https://github.com/google/dopamine',
},
license='Apache 2.0',
keywords='dopamine reinforcement-learning python machine learning'
)
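After installing with this script (for example via `pip install .`), the declared metadata can be checked with setuptools' runtime API. The snippet below is a minimal convenience check, assuming the package has already been installed under the name declared above.
import pkg_resources

dist = pkg_resources.get_distribution('dopamine_rl')
print(dist.version)  # expected to match the version declared above: 2.0.1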