Ubuntu 2020-07-22 11:19:45 +00:00
Parent c9dd8e01bd
Commit a8403a7462
52 changed files with 8069 additions and 8053 deletions

.gitignore vendored
View File

@@ -1,129 +1,129 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/

View File

@@ -1,14 +1,14 @@
# Contributing
This project welcomes contributions and suggestions. Most contributions require you to
agree to a Contributor License Agreement (CLA) declaring that you have the right to,
and actually do, grant us the rights to use your contribution. For details, visit
https://cla.microsoft.com.
When you submit a pull request, a CLA-bot will automatically determine whether you need
to provide a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the
instructions provided by the bot. You will only need to do this once across all repositories using our CLA.
This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.

View File

@@ -1,2 +1,2 @@
# coding=utf-8
name = 'dopamine'

View File

@@ -1,15 +1,15 @@
# coding=utf-8
# Copyright 2018 The Dopamine Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@@ -1,15 +1,15 @@
# coding=utf-8
# Copyright 2018 The Dopamine Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@@ -1,37 +1,37 @@
# Hyperparameters follow the classic Nature DQN, but we modify as necessary to
# match those used in Rainbow (Hessel et al., 2018), to ensure apples-to-apples
# comparison.
import dopamine.discrete_domains.atari_lib
import dopamine.discrete_domains.run_experiment
import dopamine.agents.dqn.dqn_agent
import dopamine.replay_memory.circular_replay_buffer
import gin.tf.external_configurables
DQNAgent.gamma = 0.99
DQNAgent.update_horizon = 1
DQNAgent.min_replay_history = 20000 # agent steps
DQNAgent.update_period = 4
DQNAgent.target_update_period = 8000 # agent steps
DQNAgent.epsilon_train = 0.01
DQNAgent.epsilon_eval = 0.001
DQNAgent.epsilon_decay_period = 250000 # agent steps
DQNAgent.tf_device = '/gpu:0' # use '/cpu:*' for non-GPU version
DQNAgent.optimizer = @tf.train.RMSPropOptimizer()
tf.train.RMSPropOptimizer.learning_rate = 0.00025
tf.train.RMSPropOptimizer.decay = 0.95
tf.train.RMSPropOptimizer.momentum = 0.0
tf.train.RMSPropOptimizer.epsilon = 0.00001
tf.train.RMSPropOptimizer.centered = True
atari_lib.create_atari_environment.game_name = 'Pong'
# Sticky actions with probability 0.25, as suggested by (Machado et al., 2017).
atari_lib.create_atari_environment.sticky_actions = True
create_agent.agent_name = 'dqn'
Runner.num_iterations = 200
Runner.training_steps = 250000 # agent steps
Runner.evaluation_steps = 125000 # agent steps
Runner.max_steps_per_episode = 27000 # agent steps
WrappedReplayBuffer.replay_capacity = 1000000
WrappedReplayBuffer.batch_size = 32
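The config above is a gin file consumed by Dopamine's experiment runner. As a rough guide, a minimal launcher sketch follows, assuming the upstream Dopamine API; the config path and base_dir below are illustrative placeholders, not paths from this repository.

# Minimal launcher sketch (assumed upstream Dopamine API; paths are hypothetical).
from dopamine.discrete_domains import run_experiment

gin_files = ['dopamine/agents/dqn/configs/dqn.gin']  # hypothetical location of the config above
run_experiment.load_gin_configs(gin_files, gin_bindings=[])
runner = run_experiment.create_runner('/tmp/dqn_pong', schedule='continuous_train_and_eval')
runner.run_experiment()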

View File

@@ -1,35 +1,35 @@
# Hyperparameters for a simple DQN-style Acrobot agent. The hyperparameters
# chosen achieve reasonable performance.
import dopamine.discrete_domains.gym_lib
import dopamine.discrete_domains.run_experiment
import dopamine.agents.dqn.dqn_agent
import dopamine.replay_memory.circular_replay_buffer
import gin.tf.external_configurables
DQNAgent.observation_shape = %gym_lib.ACROBOT_OBSERVATION_SHAPE
DQNAgent.observation_dtype = %gym_lib.ACROBOT_OBSERVATION_DTYPE
DQNAgent.stack_size = %gym_lib.ACROBOT_STACK_SIZE
DQNAgent.network = @gym_lib.acrobot_dqn_network
DQNAgent.gamma = 0.99
DQNAgent.update_horizon = 1
DQNAgent.min_replay_history = 500
DQNAgent.update_period = 4
DQNAgent.target_update_period = 100
DQNAgent.epsilon_fn = @dqn_agent.identity_epsilon
DQNAgent.tf_device = '/gpu:0' # use '/cpu:*' for non-GPU version
DQNAgent.optimizer = @tf.train.AdamOptimizer()
tf.train.AdamOptimizer.learning_rate = 0.001
tf.train.AdamOptimizer.epsilon = 0.0003125
create_gym_environment.environment_name = 'Acrobot'
create_gym_environment.version = 'v1'
create_agent.agent_name = 'dqn'
Runner.create_environment_fn = @gym_lib.create_gym_environment
Runner.num_iterations = 500
Runner.training_steps = 1000
Runner.evaluation_steps = 1000
Runner.max_steps_per_episode = 500
WrappedReplayBuffer.replay_capacity = 50000
WrappedReplayBuffer.batch_size = 128

View File

@@ -1,35 +1,35 @@
# Hyperparameters for a simple DQN-style Cartpole agent. The hyperparameters
# chosen achieve reasonable performance.
import dopamine.discrete_domains.gym_lib
import dopamine.discrete_domains.run_experiment
import dopamine.agents.dqn.dqn_agent
import dopamine.replay_memory.circular_replay_buffer
import gin.tf.external_configurables
DQNAgent.observation_shape = %gym_lib.CARTPOLE_OBSERVATION_SHAPE
DQNAgent.observation_dtype = %gym_lib.CARTPOLE_OBSERVATION_DTYPE
DQNAgent.stack_size = %gym_lib.CARTPOLE_STACK_SIZE
DQNAgent.network = @gym_lib.cartpole_dqn_network
DQNAgent.gamma = 0.99
DQNAgent.update_horizon = 1
DQNAgent.min_replay_history = 500
DQNAgent.update_period = 4
DQNAgent.target_update_period = 100
DQNAgent.epsilon_fn = @dqn_agent.identity_epsilon
DQNAgent.tf_device = '/gpu:0' # use '/cpu:*' for non-GPU version
DQNAgent.optimizer = @tf.train.AdamOptimizer()
tf.train.AdamOptimizer.learning_rate = 0.001
tf.train.AdamOptimizer.epsilon = 0.0003125
create_gym_environment.environment_name = 'CartPole'
create_gym_environment.version = 'v0'
create_agent.agent_name = 'dqn'
Runner.create_environment_fn = @gym_lib.create_gym_environment
Runner.num_iterations = 500
Runner.training_steps = 1000
Runner.evaluation_steps = 1000
Runner.max_steps_per_episode = 200 # Default max episode length.
WrappedReplayBuffer.replay_capacity = 50000
WrappedReplayBuffer.batch_size = 128
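The Cartpole and Acrobot configs keep exploration fixed via dqn_agent.identity_epsilon, while the Atari configs anneal epsilon linearly over epsilon_decay_period steps. Below is a hedged sketch of both schedules; these are illustrative re-implementations, not the exact functions shipped in Dopamine.

import numpy as np

def identity_epsilon(decay_period, step, warmup_steps, epsilon):
    # Constant exploration rate, as used by the Cartpole/Acrobot configs.
    return epsilon

def linearly_decaying_epsilon(decay_period, step, warmup_steps, epsilon):
    # Linearly anneal from 1.0 down to `epsilon` over `decay_period` steps,
    # after `warmup_steps` of pure exploration (Atari-style schedule).
    steps_left = decay_period + warmup_steps - step
    bonus = (1.0 - epsilon) * steps_left / decay_period
    bonus = np.clip(bonus, 0.0, 1.0 - epsilon)
    return epsilon + bonus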

View File

@@ -1,37 +1,37 @@
# Hyperparameters used for reporting DQN results in Bellemare et al. (2017).
import dopamine.discrete_domains.atari_lib
import dopamine.discrete_domains.run_experiment
import dopamine.agents.dqn.dqn_agent
import dopamine.replay_memory.circular_replay_buffer
import gin.tf.external_configurables
DQNAgent.gamma = 0.99
DQNAgent.update_horizon = 1
DQNAgent.min_replay_history = 50000 # agent steps
DQNAgent.update_period = 4
DQNAgent.target_update_period = 10000 # agent steps
DQNAgent.epsilon_train = 0.01
DQNAgent.epsilon_eval = 0.001
DQNAgent.epsilon_decay_period = 1000000 # agent steps
DQNAgent.tf_device = '/gpu:0' # use '/cpu:*' for non-GPU version
DQNAgent.optimizer = @tf.train.RMSPropOptimizer()
tf.train.RMSPropOptimizer.learning_rate = 0.00025
tf.train.RMSPropOptimizer.decay = 0.95
tf.train.RMSPropOptimizer.momentum = 0.0
tf.train.RMSPropOptimizer.epsilon = 0.00001
tf.train.RMSPropOptimizer.centered = True
atari_lib.create_atari_environment.game_name = 'Pong'
# Deterministic ALE version used in the DQN Nature paper (Mnih et al., 2015).
atari_lib.create_atari_environment.sticky_actions = False
create_agent.agent_name = 'dqn'
Runner.num_iterations = 200
Runner.training_steps = 250000 # agent steps
Runner.evaluation_steps = 125000 # agent steps
Runner.max_steps_per_episode = 27000 # agent steps
AtariPreprocessing.terminal_on_life_loss = True
WrappedReplayBuffer.replay_capacity = 1000000
WrappedReplayBuffer.batch_size = 32

View File

@@ -1,41 +1,41 @@
# Hyperparameters used in Mnih et al. (2015).
import dopamine.discrete_domains.atari_lib
import dopamine.discrete_domains.run_experiment
import dopamine.agents.dqn.dqn_agent
import dopamine.replay_memory.circular_replay_buffer
import gin.tf.external_configurables
DQNAgent.gamma = 0.99
DQNAgent.update_horizon = 1
DQNAgent.runtype = 'RUNTYPE'
DQNAgent.game = 'GAME'
DQNAgent.min_replay_history = 50000 # agent steps
DQNAgent.update_period = 4
DQNAgent.target_update_period = 10000 # agent steps
DQNAgent.epsilon_train = 0.1
DQNAgent.epsilon_eval = 0.05
DQNAgent.epsilon_decay_period = 1000000 # agent steps
DQNAgent.tf_device = '/gpu:0' # use '/cpu:*' for non-GPU version
DQNAgent.optimizer = @tf.train.RMSPropOptimizer()
tf.train.RMSPropOptimizer.learning_rate = 0.00025
tf.train.RMSPropOptimizer.decay = 0.95
tf.train.RMSPropOptimizer.momentum = 0.0
tf.train.RMSPropOptimizer.epsilon = 0.00001
tf.train.RMSPropOptimizer.centered = True
atari_lib.create_atari_environment.game_name = 'GAME'
# Deterministic ALE version used in the DQN Nature paper (Mnih et al., 2015).
atari_lib.create_atari_environment.sticky_actions = False
create_agent.agent_name = 'dqn'
Runner.game = 'GAME'
Runner.runtype = 'RUNTYPE'
Runner.num_iterations = 200
Runner.training_steps = 250000 # agent steps
Runner.evaluation_steps = 125000 # agent steps
Runner.max_steps_per_episode = 27000 # agent steps
AtariPreprocessing.terminal_on_life_loss = True
WrappedReplayBuffer.replay_capacity = 1000000
WrappedReplayBuffer.batch_size = 32
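The two configs above run the deterministic ALE (sticky_actions = False), whereas dqn.gin enables sticky actions with repeat probability 0.25 (Machado et al., 2017). The wrapper below is only a conceptual sketch of that behaviour; in practice the stickiness is applied inside the ALE itself when sticky_actions is enabled.

import random

class StickyActionEnv(object):
    # Illustrative sketch: with probability `repeat_prob` the previously
    # executed action is repeated instead of the requested one.
    def __init__(self, env, repeat_prob=0.25):
        self.env = env
        self.repeat_prob = repeat_prob
        self.last_action = 0

    def step(self, action):
        if random.random() < self.repeat_prob:
            action = self.last_action
        self.last_action = action
        return self.env.step(action)

    def reset(self):
        self.last_action = 0
        return self.env.reset()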

File diff suppressed because it is too large.

View File

@@ -1,46 +1,46 @@
# Hyperparameters follow Dabney et al. (2018).
import dopamine.agents.fqf.fqf_agent
import dopamine.agents.rainbow.rainbow_agent
import dopamine.discrete_domains.atari_lib
import dopamine.discrete_domains.run_experiment
import dopamine.replay_memory.prioritized_replay_buffer
import gin.tf.external_configurables
FQFAgent.kappa = 1.0
FQFAgent.num_tau_samples = 32
FQFAgent.num_tau_prime_samples = 32
FQFAgent.num_quantile_samples = 32
FQFAgent.runtype = 'RUNTYPE'
FQFAgent.fqf_factor = 'FQFFACTOR'
FQFAgent.fqf_ent = 'FQFENT'
RainbowAgent.gamma = 0.99
RainbowAgent.game = 'GAME'
RainbowAgent.runtype = 'RUNTYPE'
RainbowAgent.update_horizon = 1
RainbowAgent.min_replay_history = 50000 # agent steps
RainbowAgent.update_period = 4
RainbowAgent.target_update_period = 10000 # agent steps
RainbowAgent.epsilon_train = 0.01
RainbowAgent.epsilon_eval = 0.001
RainbowAgent.epsilon_decay_period = 1000000 # agent steps
RainbowAgent.replay_scheme = 'uniform'
RainbowAgent.tf_device = '/gpu:0' # use '/cpu:*' for non-GPU version
RainbowAgent.optimizer = @tf.train.AdamOptimizer()
tf.train.AdamOptimizer.learning_rate = 0.00005
tf.train.AdamOptimizer.epsilon = 0.0003125
atari_lib.create_atari_environment.game_name = 'GAME'
atari_lib.create_atari_environment.sticky_actions = False
create_agent.agent_name = 'implicit_quantile'
Runner.num_iterations = 200
Runner.game = 'GAME'
Runner.runtype = 'RUNTYPE'
Runner.training_steps = 250000
Runner.evaluation_steps = 125000
Runner.max_steps_per_episode = 27000
AtariPreprocessing.terminal_on_life_loss = True
WrappedPrioritizedReplayBuffer.replay_capacity = 1000000
WrappedPrioritizedReplayBuffer.batch_size = 32
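This config drives the FQFAgent defined later in this diff, which trains its quantile outputs with the quantile Huber loss (Section 2.3 of the IQN paper). The NumPy sketch below shows the same computation for a single transition; the function and argument names are illustrative and do not come from this repository, whose actual implementation is the TensorFlow code in the FQFAgent class below.

import numpy as np

def quantile_huber_loss(target_q, pred_q, taus, kappa=1.0):
    # target_q: [num_tau_prime] target quantile values for one transition.
    # pred_q:   [num_tau] predicted quantile values for the chosen action.
    # taus:     [num_tau] quantile fractions corresponding to pred_q.
    errors = target_q[:, None] - pred_q[None, :]   # pairwise Bellman errors
    abs_err = np.abs(errors)
    huber = np.where(abs_err <= kappa,
                     0.5 * errors ** 2,
                     kappa * (abs_err - 0.5 * kappa))
    # Asymmetric weight |tau - 1{error < 0}| turns the Huber loss into quantile regression.
    weight = np.abs(taus[None, :] - (errors < 0).astype(np.float64))
    # Sum over predicted quantiles, average over target quantiles.
    return (weight * huber / kappa).sum(axis=1).mean()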

View File

@@ -1,410 +1,420 @@
# coding=utf-8
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import numpy as np
from dopamine.agents.rainbow import rainbow_agent
from dopamine.discrete_domains import atari_lib
import tensorflow as tf
import gin.tf
slim = tf.contrib.slim
@gin.configurable
class FQFAgent(rainbow_agent.RainbowAgent):
def __init__(self,
sess,
num_actions,
network=atari_lib.fqf_network,
kappa=1.0,
runtype=None,
fqf_factor=0.000001,
fqf_ent=0.001,
num_tau_samples=32,
num_tau_prime_samples=32,
num_quantile_samples=32,
quantile_embedding_dim=64,
double_dqn=False,
summary_writer=None,
summary_writing_frequency=500):
"""Initializes the agent and constructs the Graph.
Most of this constructor's parameters are IQN-specific hyperparameters whose
values are taken from Dabney et al. (2018).
Args:
sess: `tf.Session` object for running associated ops.
num_actions: int, number of actions the agent can take at any state.
network: function expecting three parameters:
(num_actions, network_type, state). This function will return the
network_type object containing the tensors output by the network.
See dopamine.discrete_domains.atari_lib.nature_dqn_network as
an example.
kappa: float, Huber loss cutoff.
num_tau_samples: int, number of online quantile samples for loss
estimation.
num_tau_prime_samples: int, number of target quantile samples for loss
estimation.
num_quantile_samples: int, number of quantile samples for computing
Q-values.
quantile_embedding_dim: int, embedding dimension for the quantile input.
double_dqn: boolean, whether to perform double DQN style learning
as described in Van Hasselt et al.: https://arxiv.org/abs/1509.06461.
summary_writer: SummaryWriter object for outputting training statistics.
Summary writing disabled if set to None.
summary_writing_frequency: int, frequency with which summaries will be
written. Lower values will result in slower training.
"""
print ('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>')
self._runtype = runtype
print (self._runtype)
self.fqf_factor = float(fqf_factor)
self.ent = float(fqf_ent)
self.kappa = kappa
print ('fqf factor:', self.fqf_factor)
# num_tau_samples = N below equation (3) in the paper.
self.num_tau_samples = num_tau_samples
# num_tau_prime_samples = N' below equation (3) in the paper.
self.num_tau_prime_samples = num_tau_prime_samples
# num_quantile_samples = k below equation (3) in the paper.
self.num_quantile_samples = num_quantile_samples
# quantile_embedding_dim = n above equation (4) in the paper.
self.quantile_embedding_dim = quantile_embedding_dim
# option to perform double dqn.
self.double_dqn = double_dqn
if 'adam' in self._runtype:
self.optimizer1 = tf.train.AdamOptimizer(
learning_rate=0.00005 * self.fqf_factor,
epsilon=0.0003125)
else:
self.optimizer1 = tf.train.RMSPropOptimizer(
learning_rate=0.00005 * self.fqf_factor,
decay=0.95,
momentum=0.0,
epsilon=0.00001,
centered=True)
super(FQFAgent, self).__init__(
sess=sess,
num_actions=num_actions,
network=network,
summary_writer=summary_writer,
summary_writing_frequency=summary_writing_frequency)
def _get_network_type(self):
return collections.namedtuple(
'iqn_network', ['quantile_values', 'quantiles', 'quantile_values_origin', 'quantiles_origin', 'Fv_diff', 'v_diff', 'quantile_values_mid', 'quantiles_mid', 'L_tau', 'gradient_tau', 'quantile_tau'])
def _network_template(self, state, num_quantiles):
return self.network(self.num_actions, self.quantile_embedding_dim,
self._get_network_type(), state, num_quantiles, self._runtype)
def _train_step(self):
"""Runs a single training step.
Runs a training op if both:
(1) A minimum number of frames have been added to the replay buffer.
(2) `training_steps` is a multiple of `update_period`.
Also, syncs weights from online to target network if training steps is a
multiple of target update period.
"""
# Run a train op at the rate of self.update_period if enough training steps
# have been run. This matches the Nature DQN behaviour.
if self._replay.memory.add_count > self.min_replay_history:
if self.training_steps % self.update_period == 0:
_, _, _, loss, loss1, quan_value, quan, vdiff = self._sess.run(self._train_op)
if self.training_steps % 50000 == 0:
batchsize = 32
quan_value = np.reshape(quan_value, [batchsize, self.num_tau_samples])
quan = np.reshape(quan, [batchsize, self.num_tau_samples])
quan_value = quan_value[0].tolist()
quan = quan[0].tolist()
vdiff = vdiff[:, 0].tolist()
print (">>> loss:", loss)
print (">>> loss1:", loss1)
print (">>> value:", quan_value)
print (">>> quans:", quan)
print (">>> vdiff:", vdiff)
print (">>> vdiff_sum:", np.sum(vdiff))
if (self.summary_writer is not None and
self.training_steps > 0 and
self.training_steps % self.summary_writing_frequency == 0):
summary = self._sess.run(self._merged_summaries)
self.summary_writer.add_summary(summary, self.training_steps)
if self.training_steps % self.target_update_period == 0:
self._sess.run(self._sync_qt_ops)
self.training_steps += 1
def _build_networks(self):
"""Builds the FQF computations needed for acting and training.
These are:
self.online_convnet: For computing the current state's quantile values.
self.target_convnet: For computing the next state's target quantile
values.
self._net_outputs: The actual quantile values.
self._q_argmax: The action maximizing the current state's Q-values.
self._replay_net_outputs: The replayed states' quantile values.
self._replay_next_target_net_outputs: The replayed next states' target
quantile values.
"""
# Calling online_convnet will generate a new graph as defined in
# self._get_network_template using whatever input is passed, but will always
# share the same weights.
self.online_convnet = tf.make_template('Online', self._network_template)
self.target_convnet = tf.make_template('Target', self._network_template)
# Compute the Q-values which are used for action selection in the current
# state.
self._net_outputs = self.online_convnet(self.state_ph,
self.num_quantile_samples)
# Shape of self._net_outputs.quantile_values:
# num_quantile_samples x num_actions.
# e.g. if num_actions is 2, it might look something like this:
# Vals for Quantile .2 Vals for Quantile .4 Vals for Quantile .6
# [[0.1, 0.5], [0.15, -0.3], [0.15, -0.2]]
# Q-values = [(0.1 + 0.15 + 0.15)/3, (0.5 + 0.15 + -0.2)/3].
if 'ws' in self._runtype:
self._q_values = tf.reduce_sum(self._net_outputs.quantile_values * self._net_outputs.v_diff, axis=0) #NOTE: quantile_values = quantile_values_mid
else:
self._q_values = tf.reduce_mean(self._net_outputs.quantile_values, axis=0)
self._q_argmax = tf.argmax(self._q_values, axis=0)
self._replay_net_outputs = self.online_convnet(self._replay.states,
self.num_tau_samples)
# Shape: (num_tau_samples x batch_size) x num_actions.
self._replay_net_quantile_values = self._replay_net_outputs.quantile_values
self._replay_net_quantiles = self._replay_net_outputs.quantiles
# Do the same for next states in the replay buffer.
self._replay_net_target_outputs = self.target_convnet(
self._replay.next_states, self.num_tau_prime_samples)
# Shape: (num_tau_prime_samples x batch_size) x num_actions.
vals = self._replay_net_target_outputs.quantile_values
self._replay_net_target_quantile_values = vals
# Compute Q-values which are used for action selection for the next states
# in the replay buffer. Compute the argmax over the Q-values.
if self.double_dqn:
outputs_action = self.online_convnet(self._replay.next_states,
self.num_quantile_samples)
else:
outputs_action = self.target_convnet(self._replay.next_states,
self.num_quantile_samples)
# Shape: (num_quantile_samples x batch_size) x num_actions.
target_quantile_values_action = outputs_action.quantile_values #NOTE: quantile_values = quantile_values_mid
# Shape: num_quantile_samples x batch_size x num_actions.
target_quantile_values_action = tf.reshape(target_quantile_values_action,
[self.num_quantile_samples,
self._replay.batch_size,
self.num_actions])
# Shape: batch_size x num_actions.
if 'ws' in self._runtype:
v_diff = tf.reshape(outputs_action.v_diff, [self.num_quantile_samples, self._replay.batch_size, 1])
self._replay_net_target_q_values = tf.squeeze(tf.reduce_sum(
target_quantile_values_action * v_diff, axis=0))
else:
self._replay_net_target_q_values = tf.squeeze(tf.reduce_mean(
target_quantile_values_action, axis=0))
self._replay_next_qt_argmax = tf.argmax(
self._replay_net_target_q_values, axis=1)
def _build_target_quantile_values_op(self):
"""Build an op used as a target for return values at given quantiles.
Returns:
An op calculating the target quantile return.
"""
batch_size = tf.shape(self._replay.rewards)[0]
# Shape of rewards: (num_tau_prime_samples x batch_size) x 1.
rewards = self._replay.rewards[:, None]
rewards = tf.tile(rewards, [self.num_tau_prime_samples, 1])
is_terminal_multiplier = 1. - tf.to_float(self._replay.terminals)
# Incorporate terminal state to discount factor.
# size of gamma_with_terminal: (num_tau_prime_samples x batch_size) x 1.
gamma_with_terminal = self.cumulative_gamma * is_terminal_multiplier
gamma_with_terminal = tf.tile(gamma_with_terminal[:, None],
[self.num_tau_prime_samples, 1])
# Get the indices of the maximum Q-value across the action dimension.
# Shape of replay_next_qt_argmax: (num_tau_prime_samples x batch_size) x 1.
replay_next_qt_argmax = tf.tile(
self._replay_next_qt_argmax[:, None], [self.num_tau_prime_samples, 1])
# Shape of batch_indices: (num_tau_prime_samples x batch_size) x 1.
batch_indices = tf.cast(tf.range(
self.num_tau_prime_samples * batch_size)[:, None], tf.int64)
# Shape of batch_indexed_target_values:
# (num_tau_prime_samples x batch_size) x 2.
batch_indexed_target_values = tf.concat(
[batch_indices, replay_next_qt_argmax], axis=1)
# Shape of next_target_values: (num_tau_prime_samples x batch_size) x 1.
target_quantile_values = tf.gather_nd(
self._replay_net_target_quantile_values,
batch_indexed_target_values)[:, None]
return rewards + gamma_with_terminal * target_quantile_values
def _build_train_op(self):
"""Builds a training op.
Returns:
train_op: An op performing one step of training from replay data.
"""
batch_size = tf.shape(self._replay.rewards)[0]
target_quantile_values = tf.stop_gradient(
self._build_target_quantile_values_op())
# Reshape to self.num_tau_prime_samples x batch_size x 1 since this is
# the manner in which the target_quantile_values are tiled.
target_quantile_values = tf.reshape(target_quantile_values,
[self.num_tau_prime_samples,
batch_size, 1])
# Transpose dimensions so that the dimensionality is batch_size x
# self.num_tau_prime_samples x 1 to prepare for computation of
# Bellman errors.
# Final shape of target_quantile_values:
# batch_size x num_tau_prime_samples x 1.
target_quantile_values = tf.transpose(target_quantile_values, [1, 0, 2])
# Shape of indices: (num_tau_samples x batch_size) x 1.
# Expand dimension by one so that it can be used to index into all the
# quantiles when using the tf.gather_nd function (see below).
indices = tf.range(self.num_tau_samples * batch_size)[:, None]
# Expand the dimension by one so that it can be used to index into all the
# quantiles when using the tf.gather_nd function (see below).
reshaped_actions = self._replay.actions[:, None]
reshaped_actions = tf.tile(reshaped_actions, [self.num_tau_samples, 1])
# Shape of reshaped_actions: (num_tau_samples x batch_size) x 2.
reshaped_actions = tf.concat([indices, reshaped_actions], axis=1)
chosen_action_quantile_values = tf.gather_nd(
self._replay_net_quantile_values, reshaped_actions)
print ('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>', self._replay_net_quantile_values)
# Transpose dimensions so that the dimensionality is batch_size x
# self.num_tau_samples x 1 to prepare for computation of
# Bellman errors.
# Reshape to self.num_tau_samples x batch_size x 1 since this is the manner
# in which the quantile values are tiled.
chosen_action_quantile_values = tf.reshape(chosen_action_quantile_values,
[self.num_tau_samples,
batch_size, 1])
# Final shape of chosen_action_quantile_values:
# batch_size x num_tau_samples x 1.
chosen_action_quantile_values = tf.transpose(
chosen_action_quantile_values, [1, 0, 2]) #batchsize x quan x 1
##########################################################################################
reshaped_actions1 = self._replay.actions[:, None]
reshaped_actions1 = tf.tile(reshaped_actions1, [self.num_tau_samples-1, 1])
# Shape of reshaped_actions1: (num_tau_samples-1 x batch_size) x 2.
indices1 = tf.range((self.num_tau_samples-1) * batch_size)[:, None]
reshaped_actions1 = tf.concat([indices1, reshaped_actions1], axis=1)
gradient_tau = tf.reshape(self._replay_net_outputs.gradient_tau, (-1, self.num_actions)) #31 x 32 x 18
print ('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>', gradient_tau)
gradient_tau = tf.gather_nd(
gradient_tau, reshaped_actions1)
print ('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>', gradient_tau)
chosen_action_gradient_tau = tf.reshape(gradient_tau,
[self.num_tau_samples-1,
batch_size, 1])
self.chosen_action_gradient_tau = tf.transpose(
chosen_action_gradient_tau, [1, 0, 2]) #batchsize x quan x 1 (32 x 31 x 18)
self.chosen_action_gradient_tau = self.chosen_action_gradient_tau[:,:,0] #(32 x 31)
##########################################################################################
# Shape of bellman_errors and huber_loss:
# batch_size x num_tau_prime_samples x num_tau_samples x 1.
bellman_errors = target_quantile_values[:, :, None, :] - chosen_action_quantile_values[:, None, :, :]
#if 'fqf12' in self._runtype and 'fixbugtarg' in self._runtype:
# print ("============================================================= fixbug")
# print (bellman_errors.shape, self._replay_net_outputs.v_diff.shape, self.num_tau_samples)
# bellman_errors = bellman_errors * self._replay_net_outputs.v_diff[:,:,None,None] * self.num_tau_samples
# The huber loss (see Section 2.3 of the paper) is defined via two cases:
# case_one: |bellman_errors| <= kappa
# case_two: |bellman_errors| > kappa
huber_loss_case_one = tf.to_float(
tf.abs(bellman_errors) <= self.kappa) * 0.5 * bellman_errors ** 2
huber_loss_case_two = tf.to_float(
tf.abs(bellman_errors) > self.kappa) * self.kappa * (
tf.abs(bellman_errors) - 0.5 * self.kappa)
huber_loss = huber_loss_case_one + huber_loss_case_two
# Reshape replay_quantiles to batch_size x num_tau_samples x 1
replay_quantiles = tf.reshape(
self._replay_net_quantiles, [self.num_tau_samples, batch_size, 1])
replay_quantiles = tf.transpose(replay_quantiles, [1, 0, 2]) #batchsize x quan x 1
# Tile by num_tau_prime_samples along a new dimension. Shape is now
# batch_size x num_tau_prime_samples x num_tau_samples x 1.
# These quantiles will be used for computation of the quantile huber loss
# below (see section 2.3 of the paper).
replay_quantiles = tf.to_float(tf.tile(
replay_quantiles[:, None, :, :], [1, self.num_tau_prime_samples, 1, 1]))
# Shape: batch_size x num_tau_prime_samples x num_tau_samples x 1.
quantile_huber_loss = (tf.abs(tf.stop_gradient(replay_quantiles) - tf.stop_gradient(
tf.to_float(bellman_errors < 0))) * huber_loss) / self.kappa
# Sum over current quantile value (num_tau_samples) dimension,
# average over target quantile value (num_tau_prime_samples) dimension.
# Shape: batch_size x num_tau_prime_samples x 1.
loss = tf.reduce_sum(quantile_huber_loss, axis=2)
# Shape: batch_size x 1.
loss = tf.reduce_mean(loss, axis=1)
chosen_action_L_tau = tf.gather_nd(self._replay_net_outputs.L_tau, reshaped_actions)
print (">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>", chosen_action_L_tau.shape)
loss1 = tf.reduce_mean(chosen_action_L_tau, axis=0)
print (loss1.shape)
update_priorities_op = tf.no_op()
with tf.control_dependencies([update_priorities_op]):
if self.summary_writer is not None:
with tf.variable_scope('Losses'):
tf.summary.scalar('QuantileLoss', tf.reduce_mean(loss))
iqn_params, fqf_params = [], []
params = tf.trainable_variables()
for p in params:
if 'fqf' in p.name and 'Target' not in p.name: fqf_params.append(p)
else: iqn_params.append(p)
print ("fqf_params:>>>>>>", fqf_params)
print ("iqn_params:>>>>>>", iqn_params)
#batchsize x quan
#batchsize x quan
#quan x batchsize
print ('================================================')
quantile_tau = tf.transpose(self._replay_net_outputs.quantile_tau, (1,0))
q_entropy = tf.reduce_sum(-quantile_tau * tf.log(quantile_tau), axis=1) * 0.001
#print (quantile_tau) #32x31
print ("q_entropy:", q_entropy)
print (self.chosen_action_gradient_tau) #32x31
print (fqf_params)
grads = tf.gradients(quantile_tau, fqf_params, grad_ys=self.chosen_action_gradient_tau)
print (grads)
grads_and_vars = [(grads[i], fqf_params[i]) for i in range(len(grads))]
return self.optimizer.minimize(tf.reduce_mean(loss), var_list=iqn_params), \
self.optimizer1.apply_gradients(grads_and_vars), \
self.optimizer1.minimize(self.ent * tf.reduce_mean(-q_entropy), var_list=fqf_params), \
tf.reduce_mean(loss), tf.reduce_mean(loss1), \
tf.squeeze(chosen_action_quantile_values), \
tf.squeeze(replay_quantiles[:,0,:,:]), \
self._replay_net_outputs.v_diff
# coding=utf-8
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import numpy as np
from dopamine.agents.rainbow import rainbow_agent
from dopamine.discrete_domains import atari_lib
import tensorflow as tf
import gin.tf
slim = tf.contrib.slim
@gin.configurable
class FQFAgent(rainbow_agent.RainbowAgent):
def __init__(self,
sess,
num_actions,
network=atari_lib.fqf_network,
kappa=1.0,
runtype=None,
fqf_factor=0.000001,
fqf_ent=0.001,
num_tau_samples=32,
num_tau_prime_samples=32,
num_quantile_samples=32,
quantile_embedding_dim=64,
double_dqn=False,
summary_writer=None,
summary_writing_frequency=500):
"""Initializes the agent and constructs the Graph.
Most of this constructor's parameters are IQN-specific hyperparameters whose
values are taken from Dabney et al. (2018).
Args:
sess: `tf.Session` object for running associated ops.
num_actions: int, number of actions the agent can take at any state.
network: function expecting three parameters:
(num_actions, network_type, state). This function will return the
network_type object containing the tensors output by the network.
See dopamine.discrete_domains.atari_lib.nature_dqn_network as
an example.
kappa: float, Huber loss cutoff.
num_tau_samples: int, number of online quantile samples for loss
estimation.
num_tau_prime_samples: int, number of target quantile samples for loss
estimation.
num_quantile_samples: int, number of quantile samples for computing
Q-values.
quantile_embedding_dim: int, embedding dimension for the quantile input.
double_dqn: boolean, whether to perform double DQN style learning
as described in Van Hasselt et al.: https://arxiv.org/abs/1509.06461.
summary_writer: SummaryWriter object for outputting training statistics.
Summary writing disabled if set to None.
summary_writing_frequency: int, frequency with which summaries will be
written. Lower values will result in slower training.
"""
print ('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>')
self._runtype = runtype
print (self._runtype)
self.fqf_factor = float(fqf_factor)
self.ent = float(fqf_ent)
self.kappa = kappa
print ('fqf factor:', self.fqf_factor)
# num_tau_samples = N below equation (3) in the paper.
self.num_tau_samples = num_tau_samples
# num_tau_prime_samples = N' below equation (3) in the paper.
self.num_tau_prime_samples = num_tau_prime_samples
# num_quantile_samples = k below equation (3) in the paper.
self.num_quantile_samples = num_quantile_samples
# quantile_embedding_dim = n above equation (4) in the paper.
self.quantile_embedding_dim = quantile_embedding_dim
# option to perform double dqn.
self.double_dqn = double_dqn
if 'adam' in self._runtype:
self.optimizer1 = tf.train.AdamOptimizer(
learning_rate=0.00005 * self.fqf_factor,
epsilon=0.0003125)
else:
self.optimizer1 = tf.train.RMSPropOptimizer(
learning_rate=0.00005 * self.fqf_factor,
decay=0.95,
momentum=0.0,
epsilon=0.00001,
centered=True)
super(FQFAgent, self).__init__(
sess=sess,
num_actions=num_actions,
network=network,
summary_writer=summary_writer,
summary_writing_frequency=summary_writing_frequency)
def _get_network_type(self):
return collections.namedtuple(
'iqn_network', ['quantile_values', 'quantiles', 'quantile_values_origin', 'quantiles_origin', 'Fv_diff', 'v_diff', 'quantile_values_mid', 'quantiles_mid', 'L_tau', 'gradient_tau', 'quantile_tau'])
def _network_template(self, state, num_quantiles):
return self.network(self.num_actions, self.quantile_embedding_dim,
self._get_network_type(), state, num_quantiles, self._runtype)
def _train_step(self):
"""Runs a single training step.
Runs a training op if both:
(1) A minimum number of frames have been added to the replay buffer.
(2) `training_steps` is a multiple of `update_period`.
Also, syncs weights from online to target network if training steps is a
multiple of target update period.
"""
# Run a train op at the rate of self.update_period if enough training steps
# have been run. This matches the Nature DQN behaviour.
if self._replay.memory.add_count > self.min_replay_history:
if self.training_steps % self.update_period == 0:
_, _, _, loss, loss1, quan_value, quan, vdiff = self._sess.run(self._train_op)
if self.training_steps % 50000 == 0:
batchsize = 32
quan_value = np.reshape(quan_value, [batchsize, self.num_tau_samples])
quan = np.reshape(quan, [batchsize, self.num_tau_samples])
quan_value = quan_value[0].tolist()
quan = quan[0].tolist()
vdiff = vdiff[:, 0].tolist()
print (">>> loss:", loss)
print (">>> loss1:", loss1)
print (">>> value:", quan_value)
print (">>> quans:", quan)
print (">>> vdiff:", vdiff)
print (">>> vdiff_sum:", np.sum(vdiff))
if (self.summary_writer is not None and
self.training_steps > 0 and
self.training_steps % self.summary_writing_frequency == 0):
summary = self._sess.run(self._merged_summaries)
self.summary_writer.add_summary(summary, self.training_steps)
if self.training_steps % self.target_update_period == 0:
self._sess.run(self._sync_qt_ops)
self.training_steps += 1
def _build_networks(self):
"""Builds the FQF computations needed for acting and training.
These are:
self.online_convnet: For computing the current state's quantile values.
self.target_convnet: For computing the next state's target quantile
values.
self._net_outputs: The actual quantile values.
self._q_argmax: The action maximizing the current state's Q-values.
self._replay_net_outputs: The replayed states' quantile values.
self._replay_next_target_net_outputs: The replayed next states' target
quantile values.
"""
# Calling online_convnet will generate a new graph as defined in
# self._get_network_template using whatever input is passed, but will always
# share the same weights.
self.online_convnet = tf.make_template('Online', self._network_template)
self.target_convnet = tf.make_template('Target', self._network_template)
# Compute the Q-values which are used for action selection in the current
# state.
self._net_outputs = self.online_convnet(self.state_ph,
self.num_quantile_samples)
# Shape of self._net_outputs.quantile_values:
# num_quantile_samples x num_actions.
# e.g. if num_actions is 2, it might look something like this:
# Vals for Quantile .2 Vals for Quantile .4 Vals for Quantile .6
# [[0.1, 0.5], [0.15, -0.3], [0.15, -0.2]]
# Q-values = [(0.1 + 0.15 + 0.15)/3, (0.5 + 0.15 + -0.2)/3].
if 'ws' in self._runtype:
self._q_values = tf.reduce_sum(self._net_outputs.quantile_values * self._net_outputs.v_diff, axis=0) #NOTE: quantile_values = quantile_values_mid
else:
self._q_values = tf.reduce_mean(self._net_outputs.quantile_values, axis=0)
self._q_argmax = tf.argmax(self._q_values, axis=0)
self._replay_net_outputs = self.online_convnet(self._replay.states,
self.num_tau_samples)
# Shape: (num_tau_samples x batch_size) x num_actions.
self._replay_net_quantile_values = self._replay_net_outputs.quantile_values
self._replay_net_quantiles = self._replay_net_outputs.quantiles
# Do the same for next states in the replay buffer.
self._replay_net_target_outputs = self.target_convnet(
self._replay.next_states, self.num_tau_prime_samples)
# Shape: (num_tau_prime_samples x batch_size) x num_actions.
vals = self._replay_net_target_outputs.quantile_values
self._replay_net_target_quantile_values = vals
# Compute Q-values which are used for action selection for the next states
# in the replay buffer. Compute the argmax over the Q-values.
if self.double_dqn:
outputs_action = self.online_convnet(self._replay.next_states,
self.num_quantile_samples)
else:
outputs_action = self.target_convnet(self._replay.next_states,
self.num_quantile_samples)
# Shape: (num_quantile_samples x batch_size) x num_actions.
target_quantile_values_action = outputs_action.quantile_values #NOTE: quantile_values = quantile_values_mid
# Shape: num_quantile_samples x batch_size x num_actions.
target_quantile_values_action = tf.reshape(target_quantile_values_action,
[self.num_quantile_samples,
self._replay.batch_size,
self.num_actions])
# Shape: batch_size x num_actions.
if 'ws' in self._runtype:
v_diff = tf.reshape(outputs_action.v_diff, [self.num_quantile_samples, self._replay.batch_size, 1])
self._replay_net_target_q_values = tf.squeeze(tf.reduce_sum(
target_quantile_values_action * v_diff, axis=0))
else:
self._replay_net_target_q_values = tf.squeeze(tf.reduce_mean(
target_quantile_values_action, axis=0))
self._replay_next_qt_argmax = tf.argmax(
self._replay_net_target_q_values, axis=1)
def _build_target_quantile_values_op(self):
"""Build an op used as a target for return values at given quantiles.
Returns:
An op calculating the target quantile return.
"""
batch_size = tf.shape(self._replay.rewards)[0]
# Shape of rewards: (num_tau_prime_samples x batch_size) x 1.
rewards = self._replay.rewards[:, None]
rewards = tf.tile(rewards, [self.num_tau_prime_samples, 1])
is_terminal_multiplier = 1. - tf.to_float(self._replay.terminals)
# Incorporate terminal state to discount factor.
# size of gamma_with_terminal: (num_tau_prime_samples x batch_size) x 1.
gamma_with_terminal = self.cumulative_gamma * is_terminal_multiplier
gamma_with_terminal = tf.tile(gamma_with_terminal[:, None],
[self.num_tau_prime_samples, 1])
# Get the indices of the maximum Q-value across the action dimension.
# Shape of replay_next_qt_argmax: (num_tau_prime_samples x batch_size) x 1.
replay_next_qt_argmax = tf.tile(
self._replay_next_qt_argmax[:, None], [self.num_tau_prime_samples, 1])
# Shape of batch_indices: (num_tau_prime_samples x batch_size) x 1.
batch_indices = tf.cast(tf.range(
self.num_tau_prime_samples * batch_size)[:, None], tf.int64)
# Shape of batch_indexed_target_values:
# (num_tau_prime_samples x batch_size) x 2.
batch_indexed_target_values = tf.concat(
[batch_indices, replay_next_qt_argmax], axis=1)
# Shape of next_target_values: (num_tau_prime_samples x batch_size) x 1.
target_quantile_values = tf.gather_nd(
self._replay_net_target_quantile_values,
batch_indexed_target_values)[:, None]
return rewards + gamma_with_terminal * target_quantile_values
def _build_train_op(self):
"""Builds a training op.
Returns:
train_op: An op performing one step of training from replay data.
"""
batch_size = tf.shape(self._replay.rewards)[0]
target_quantile_values = tf.stop_gradient(
self._build_target_quantile_values_op())
# Reshape to self.num_tau_prime_samples x batch_size x 1 since this is
# the manner in which the target_quantile_values are tiled.
target_quantile_values = tf.reshape(target_quantile_values,
[self.num_tau_prime_samples,
batch_size, 1])
# Transpose dimensions so that the dimensionality is batch_size x
# self.num_tau_prime_samples x 1 to prepare for computation of
# Bellman errors.
# Final shape of target_quantile_values:
# batch_size x num_tau_prime_samples x 1.
target_quantile_values = tf.transpose(target_quantile_values, [1, 0, 2])
# Shape of indices: (num_tau_samples x batch_size) x 1.
# Expand dimension by one so that it can be used to index into all the
# quantiles when using the tf.gather_nd function (see below).
indices = tf.range(self.num_tau_samples * batch_size)[:, None]
# Expand the dimension by one so that it can be used to index into all the
# quantiles when using the tf.gather_nd function (see below).
reshaped_actions = self._replay.actions[:, None]
reshaped_actions = tf.tile(reshaped_actions, [self.num_tau_samples, 1])
# Shape of reshaped_actions: (num_tau_samples x batch_size) x 2.
reshaped_actions = tf.concat([indices, reshaped_actions], axis=1)
chosen_action_quantile_values = tf.gather_nd(
self._replay_net_quantile_values, reshaped_actions)
print ('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>', self._replay_net_quantile_values)
# Transpose dimensions so that the dimensionality is batch_size x
# self.num_tau_samples x 1 to prepare for computation of
# Bellman errors.
# Reshape to self.num_tau_samples x batch_size x 1 since this is the manner
# in which the quantile values are tiled.
chosen_action_quantile_values = tf.reshape(chosen_action_quantile_values,
[self.num_tau_samples,
batch_size, 1])
# Final shape of chosen_action_quantile_values:
# batch_size x num_tau_samples x 1.
chosen_action_quantile_values = tf.transpose(
chosen_action_quantile_values, [1, 0, 2]) #batchsize x quan x 1
##########################################################################################
reshaped_actions1 = self._replay.actions[:, None]
reshaped_actions1 = tf.tile(reshaped_actions1, [self.num_tau_samples-1, 1])
# Shape of reshaped_actions1: (num_tau_samples-1 x batch_size) x 2.
indices1 = tf.range((self.num_tau_samples-1) * batch_size)[:, None]
reshaped_actions1 = tf.concat([indices1, reshaped_actions1], axis=1)
gradient_tau = tf.reshape(self._replay_net_outputs.gradient_tau, (-1, self.num_actions)) #31 x 32 x 18
print ('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>', gradient_tau)
gradient_tau = tf.gather_nd(
gradient_tau, reshaped_actions1)
print ('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>', gradient_tau)
chosen_action_gradient_tau = tf.reshape(gradient_tau,
[self.num_tau_samples-1,
batch_size, 1])
self.chosen_action_gradient_tau = tf.transpose(
chosen_action_gradient_tau, [1, 0, 2]) #batchsize x quan x 1 (32 x 31 x 18)
self.chosen_action_gradient_tau = self.chosen_action_gradient_tau[:,:,0] #(32 x 31)
##########################################################################################
# Shape of bellman_errors and huber_loss:
# batch_size x num_tau_prime_samples x num_tau_samples x 1.
bellman_errors = target_quantile_values[:, :, None, :] - chosen_action_quantile_values[:, None, :, :]
#if 'fqf12' in self._runtype and 'fixbugtarg' in self._runtype:
# print ("============================================================= fixbug")
# print (bellman_errors.shape, self._replay_net_outputs.v_diff.shape, self.num_tau_samples)
# bellman_errors = bellman_errors * self._replay_net_outputs.v_diff[:,:,None,None] * self.num_tau_samples
# The huber loss (see Section 2.3 of the paper) is defined via two cases:
# case_one: |bellman_errors| <= kappa
# case_two: |bellman_errors| > kappa
huber_loss_case_one = tf.to_float(
tf.abs(bellman_errors) <= self.kappa) * 0.5 * bellman_errors ** 2
huber_loss_case_two = tf.to_float(
tf.abs(bellman_errors) > self.kappa) * self.kappa * (
tf.abs(bellman_errors) - 0.5 * self.kappa)
huber_loss = huber_loss_case_one + huber_loss_case_two
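# Equivalently: L_kappa(u) = 0.5 * u**2 if |u| <= kappa,
#               else kappa * (|u| - 0.5 * kappa).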
# Reshape replay_quantiles to batch_size x num_tau_samples x 1
replay_quantiles = tf.reshape(
self._replay_net_quantiles, [self.num_tau_samples, batch_size, 1])
replay_quantiles = tf.transpose(replay_quantiles, [1, 0, 2])  # batch_size x num_tau_samples x 1
# Tile by num_tau_prime_samples along a new dimension. Shape is now
# batch_size x num_tau_prime_samples x num_tau_samples x 1.
# These quantiles will be used for computation of the quantile huber loss
# below (see section 2.3 of the paper).
replay_quantiles = tf.to_float(tf.tile(
replay_quantiles[:, None, :, :], [1, self.num_tau_prime_samples, 1, 1]))
# Shape: batch_size x num_tau_prime_samples x num_tau_samples x 1.
quantile_huber_loss = (tf.abs(tf.stop_gradient(replay_quantiles) - tf.stop_gradient(
tf.to_float(bellman_errors < 0))) * huber_loss) / self.kappa
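# This is the quantile Huber loss of Section 2.3,
# rho_tau^kappa(u) = |tau - 1{u < 0}| * L_kappa(u) / kappa.
# Note that both the quantiles and the indicator are wrapped in stop_gradient
# above, so this loss does not backpropagate into the proposed fractions.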
# Sum over current quantile value (num_tau_samples) dimension,
# average over target quantile value (num_tau_prime_samples) dimension.
# Shape: batch_size x num_tau_prime_samples x 1.
loss = tf.reduce_sum(quantile_huber_loss, axis=2)
# Shape: batch_size x 1.
loss = tf.reduce_mean(loss, axis=1)
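# The scalar minimized for the quantile network below is tf.reduce_mean(loss),
# i.e. the quantile Huber loss averaged over the batch, restricted to
# iqn_params.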
chosen_action_L_tau = tf.gather_nd(self._replay_net_outputs.L_tau, reshaped_actions)
print('chosen_action_L_tau shape:', chosen_action_L_tau.shape)
loss1 = tf.reduce_mean(chosen_action_L_tau, axis=0)
print('loss1 shape:', loss1.shape)
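# loss1 is the mean of the network's L_tau head for the chosen actions; it is
# used as the fraction-proposal loss when the 'sqloss' runtype is selected
# below.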
update_priorities_op = tf.no_op()
with tf.control_dependencies([update_priorities_op]):
if self.summary_writer is not None:
with tf.variable_scope('Losses'):
tf.summary.scalar('QuantileLoss', tf.reduce_mean(loss))
iqn_params, fqf_params = [], []
params = tf.trainable_variables()
for p in params:
if 'fqf' in p.name and 'Target' not in p.name: fqf_params.append(p)
else: iqn_params.append(p)
print ("fqf_params:>>>>>>", fqf_params)
print ("iqn_params:>>>>>>", iqn_params)
# self._replay_net_outputs.quantile_tau has shape (num_tau_samples-1) x
# batch_size; it is transposed below to batch_size x (num_tau_samples-1).
quantile_tau = tf.transpose(self._replay_net_outputs.quantile_tau, (1, 0))
q_entropy = tf.reduce_sum(-quantile_tau * tf.log(quantile_tau), axis=1) * 0.001
print('q_entropy:', q_entropy)
print('chosen_action_gradient_tau:', self.chosen_action_gradient_tau)
grads = tf.gradients(quantile_tau, fqf_params, grad_ys=self.chosen_action_gradient_tau)
print('fqf gradients:', grads)
grads_and_vars = [(grads[i], fqf_params[i]) for i in range(len(grads))]
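# Chain rule via grad_ys: each (grad, var) pair combines
# chosen_action_gradient_tau with d(quantile_tau)/d(var). Under the 'directBP'
# runtype these gradients are applied with optimizer1.apply_gradients, together
# with an entropy term on the proposed fractions; under 'sqloss' the fqf
# parameters are trained on loss1 instead.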
if 'sqloss' in self._runtype:
print ('use sqloss')
return self.optimizer.minimize(tf.reduce_mean(loss), var_list=iqn_params), \
self.optimizer1.minimize(tf.reduce_mean(loss1), var_list=fqf_params), \
tf.reduce_mean(loss), tf.reduce_mean(loss1), \
tf.squeeze(chosen_action_quantile_values), \
tf.squeeze(replay_quantiles[:,0,:,:]), \
self._replay_net_outputs.v_diff
else:
print ('use directBP')
return self.optimizer.minimize(tf.reduce_mean(loss), var_list=iqn_params), \
self.optimizer1.apply_gradients(grads_and_vars), \
self.optimizer1.minimize(self.ent * tf.reduce_mean(-q_entropy), var_list=fqf_params), \
tf.reduce_mean(loss), tf.reduce_mean(loss1), \
tf.squeeze(chosen_action_quantile_values), \
tf.squeeze(replay_quantiles[:,0,:,:]), \
self._replay_net_outputs.v_diff


@@ -1,15 +1,15 @@
# coding=utf-8
# Copyright 2018 The Dopamine Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


@@ -1,46 +1,46 @@
# Hyperparameters follow Dabney et al. (2018), but we modify as necessary to
# match those used in Rainbow (Hessel et al., 2018), to ensure apples-to-apples
# comparison.
import dopamine.agents.implicit_quantile.implicit_quantile_agent
import dopamine.agents.rainbow.rainbow_agent
import dopamine.discrete_domains.atari_lib
import dopamine.discrete_domains.run_experiment
import dopamine.replay_memory.prioritized_replay_buffer
import gin.tf.external_configurables
ImplicitQuantileAgent.kappa = 1.0
ImplicitQuantileAgent.num_tau_samples = 32
ImplicitQuantileAgent.num_tau_prime_samples = 32
ImplicitQuantileAgent.num_quantile_samples = 32
ImplicitQuantileAgent.runtype = 'RUNTYPE'
RainbowAgent.gamma = 0.99
RainbowAgent.game = 'GAME'
RainbowAgent.runtype = 'RUNTYPE'
RainbowAgent.update_horizon = 1
RainbowAgent.min_replay_history = 20000 # agent steps
RainbowAgent.update_period = 4
RainbowAgent.target_update_period = 8000 # agent steps
RainbowAgent.epsilon_train = 0.01
RainbowAgent.epsilon_eval = 0.001
RainbowAgent.epsilon_decay_period = 250000 # agent steps
# IQN currently does not support prioritized replay.
RainbowAgent.replay_scheme = 'uniform'
RainbowAgent.tf_device = '/gpu:0' # '/cpu:*' use for non-GPU version
RainbowAgent.optimizer = @tf.train.AdamOptimizer()
tf.train.AdamOptimizer.learning_rate = 0.00005
tf.train.AdamOptimizer.epsilon = 0.0003125
atari_lib.create_atari_environment.game_name = 'GAME'
# Sticky actions with probability 0.25, as suggested by (Machado et al., 2017).
atari_lib.create_atari_environment.sticky_actions = True
create_agent.agent_name = 'implicit_quantile'
Runner.num_iterations = 200
Runner.game = 'GAME'
Runner.runtype = 'RUNTYPE'
Runner.training_steps = 250000
Runner.evaluation_steps = 125000
Runner.max_steps_per_episode = 27000
WrappedPrioritizedReplayBuffer.replay_capacity = 1000000
WrappedPrioritizedReplayBuffer.batch_size = 32
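# Note: 'GAME' and 'RUNTYPE' above are placeholders, presumably substituted by
# the experiment launch script before this gin file is parsed.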


@@ -1,43 +1,43 @@
# Hyperparameters follow Dabney et al. (2018).
import dopamine.agents.implicit_quantile.implicit_quantile_agent
import dopamine.agents.rainbow.rainbow_agent
import dopamine.discrete_domains.atari_lib
import dopamine.discrete_domains.run_experiment
import dopamine.replay_memory.prioritized_replay_buffer
import gin.tf.external_configurables
ImplicitQuantileAgent.kappa = 1.0
ImplicitQuantileAgent.num_tau_samples = 32
ImplicitQuantileAgent.num_tau_prime_samples = 32
ImplicitQuantileAgent.num_quantile_samples = 32
RainbowAgent.gamma = 0.99
RainbowAgent.game = 'GAME'
RainbowAgent.runtype = 'RUNTYPE'
RainbowAgent.update_horizon = 1
RainbowAgent.min_replay_history = 50000 # agent steps
RainbowAgent.update_period = 4
RainbowAgent.target_update_period = 10000 # agent steps
RainbowAgent.epsilon_train = 0.01
RainbowAgent.epsilon_eval = 0.001
RainbowAgent.epsilon_decay_period = 1000000 # agent steps
RainbowAgent.replay_scheme = 'uniform'
RainbowAgent.tf_device = '/gpu:0' # '/cpu:*' use for non-GPU version
RainbowAgent.optimizer = @tf.train.AdamOptimizer()
tf.train.AdamOptimizer.learning_rate = 0.00005
tf.train.AdamOptimizer.epsilon = 0.0003125
atari_lib.create_atari_environment.game_name = 'GAME'
atari_lib.create_atari_environment.sticky_actions = False
create_agent.agent_name = 'implicit_quantile'
Runner.num_iterations = 200
Runner.game = 'GAME'
Runner.runtype = 'RUNTYPE'
Runner.training_steps = 250000
Runner.evaluation_steps = 125000
Runner.max_steps_per_episode = 27000
AtariPreprocessing.terminal_on_life_loss = True
WrappedPrioritizedReplayBuffer.replay_capacity = 1000000
WrappedPrioritizedReplayBuffer.batch_size = 32
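# Note: unlike the config above, this one follows the deterministic ALE
# evaluation protocol (sticky_actions = False, terminal_on_life_loss = True)
# and uses the original IQN replay and target-update settings
# (min_replay_history = 50000, target_update_period = 10000).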


@@ -1,348 +1,348 @@
# coding=utf-8
# Copyright 2018 The Dopamine Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The implicit quantile networks (IQN) agent.
The agent follows the description given in "Implicit Quantile Networks for
Distributional RL" (Dabney et. al, 2018).
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import numpy as np
from dopamine.agents.rainbow import rainbow_agent
from dopamine.discrete_domains import atari_lib
import tensorflow as tf
import gin.tf
slim = tf.contrib.slim
@gin.configurable
class ImplicitQuantileAgent(rainbow_agent.RainbowAgent):
"""An extension of Rainbow to perform implicit quantile regression."""
def __init__(self,
sess,
num_actions,
network=atari_lib.implicit_quantile_network,
kappa=1.0,
num_tau_samples=32,
num_tau_prime_samples=32,
num_quantile_samples=32,
quantile_embedding_dim=64,
double_dqn=False,
summary_writer=None,
summary_writing_frequency=500):
"""Initializes the agent and constructs the Graph.
Most of this constructor's parameters are IQN-specific hyperparameters whose
values are taken from Dabney et al. (2018).
Args:
sess: `tf.Session` object for running associated ops.
num_actions: int, number of actions the agent can take at any state.
network: function expecting three parameters:
(num_actions, network_type, state). This function will return the
network_type object containing the tensors output by the network.
See dopamine.discrete_domains.atari_lib.nature_dqn_network as
an example.
kappa: float, Huber loss cutoff.
num_tau_samples: int, number of online quantile samples for loss
estimation.
num_tau_prime_samples: int, number of target quantile samples for loss
estimation.
num_quantile_samples: int, number of quantile samples for computing
Q-values.
quantile_embedding_dim: int, embedding dimension for the quantile input.
double_dqn: boolean, whether to perform double DQN style learning
as described in Van Hasselt et al.: https://arxiv.org/abs/1509.06461.
summary_writer: SummaryWriter object for outputting training statistics.
Summary writing disabled if set to None.
summary_writing_frequency: int, frequency with which summaries will be
written. Lower values will result in slower training.
"""
self.kappa = kappa
# num_tau_samples = N below equation (3) in the paper.
self.num_tau_samples = num_tau_samples
# num_tau_prime_samples = N' below equation (3) in the paper.
self.num_tau_prime_samples = num_tau_prime_samples
# num_quantile_samples = k below equation (3) in the paper.
self.num_quantile_samples = num_quantile_samples
# quantile_embedding_dim = n above equation (4) in the paper.
self.quantile_embedding_dim = quantile_embedding_dim
# option to perform double dqn.
self.double_dqn = double_dqn
super(ImplicitQuantileAgent, self).__init__(
sess=sess,
num_actions=num_actions,
network=network,
summary_writer=summary_writer,
summary_writing_frequency=summary_writing_frequency)
def _get_network_type(self):
"""Returns the type of the outputs of the implicit quantile network.
Returns:
_network_type object defining the outputs of the network.
"""
return collections.namedtuple(
'iqn_network', ['quantile_values', 'quantiles'])
def _network_template(self, state, num_quantiles):
r"""Builds an Implicit Quantile ConvNet.
Takes state and quantile as inputs and outputs state-action quantile values.
Args:
state: A `tf.placeholder` for the RL state.
num_quantiles: int, number of quantile inputs.
Returns:
_network_type object containing quantile value outputs of the network.
"""
return self.network(self.num_actions, self.quantile_embedding_dim,
self._get_network_type(), state, num_quantiles)
def _train_step(self):
"""Runs a single training step.
Runs a training op if both:
(1) A minimum number of frames have been added to the replay buffer.
(2) `training_steps` is a multiple of `update_period`.
Also, syncs weights from online to target network if training steps is a
multiple of target update period.
"""
# Run a train op at the rate of self.update_period if enough training steps
# have been run. This matches the Nature DQN behaviour.
if self._replay.memory.add_count > self.min_replay_history:
if self.training_steps % self.update_period == 0:
self._sess.run(self._train_op)
if (self.summary_writer is not None and
self.training_steps > 0 and
self.training_steps % self.summary_writing_frequency == 0):
summary = self._sess.run(self._merged_summaries)
self.summary_writer.add_summary(summary, self.training_steps)
if self.training_steps % self.target_update_period == 0:
self._sess.run(self._sync_qt_ops)
self.training_steps += 1
def _build_networks(self):
"""Builds the IQN computations needed for acting and training.
These are:
self.online_convnet: For computing the current state's quantile values.
self.target_convnet: For computing the next state's target quantile
values.
self._net_outputs: The actual quantile values.
self._q_argmax: The action maximizing the current state's Q-values.
self._replay_net_outputs: The replayed states' quantile values.
self._replay_next_target_net_outputs: The replayed next states' target
quantile values.
"""
# Calling online_convnet will generate a new graph as defined in
# self._get_network_template using whatever input is passed, but will always
# share the same weights.
self.online_convnet = tf.make_template('Online', self._network_template)
self.target_convnet = tf.make_template('Target', self._network_template)
# Compute the Q-values which are used for action selection in the current
# state.
self._net_outputs = self.online_convnet(self.state_ph,
self.num_quantile_samples)
# Shape of self._net_outputs.quantile_values:
# num_quantile_samples x num_actions.
# e.g. if num_actions is 2, it might look something like this:
# Vals for Quantile .2 Vals for Quantile .4 Vals for Quantile .6
# [[0.1, 0.5], [0.15, -0.3], [0.15, -0.2]]
# Q-values = [(0.1 + 0.15 + 0.15)/3, (0.5 + 0.15 + -0.2)/3].
self._q_values = tf.reduce_mean(self._net_outputs.quantile_values, axis=0)
self._q_argmax = tf.argmax(self._q_values, axis=0)
self._replay_net_outputs = self.online_convnet(self._replay.states,
self.num_tau_samples)
# Shape: (num_tau_samples x batch_size) x num_actions.
self._replay_net_quantile_values = self._replay_net_outputs.quantile_values
self._replay_net_quantiles = self._replay_net_outputs.quantiles
# Do the same for next states in the replay buffer.
self._replay_net_target_outputs = self.target_convnet(
self._replay.next_states, self.num_tau_prime_samples)
# Shape: (num_tau_prime_samples x batch_size) x num_actions.
vals = self._replay_net_target_outputs.quantile_values
self._replay_net_target_quantile_values = vals
# Compute Q-values which are used for action selection for the next states
# in the replay buffer. Compute the argmax over the Q-values.
if self.double_dqn:
outputs_action = self.online_convnet(self._replay.next_states,
self.num_quantile_samples)
else:
outputs_action = self.target_convnet(self._replay.next_states,
self.num_quantile_samples)
# Shape: (num_quantile_samples x batch_size) x num_actions.
target_quantile_values_action = outputs_action.quantile_values
# Shape: num_quantile_samples x batch_size x num_actions.
target_quantile_values_action = tf.reshape(target_quantile_values_action,
[self.num_quantile_samples,
self._replay.batch_size,
self.num_actions])
# Shape: batch_size x num_actions.
self._replay_net_target_q_values = tf.squeeze(tf.reduce_mean(
target_quantile_values_action, axis=0))
self._replay_next_qt_argmax = tf.argmax(
self._replay_net_target_q_values, axis=1)
def _build_target_quantile_values_op(self):
"""Build an op used as a target for return values at given quantiles.
Returns:
An op calculating the target quantile return.
"""
batch_size = tf.shape(self._replay.rewards)[0]
# Shape of rewards: (num_tau_prime_samples x batch_size) x 1.
rewards = self._replay.rewards[:, None]
rewards = tf.tile(rewards, [self.num_tau_prime_samples, 1])
is_terminal_multiplier = 1. - tf.to_float(self._replay.terminals)
# Incorporate terminal state to discount factor.
# size of gamma_with_terminal: (num_tau_prime_samples x batch_size) x 1.
gamma_with_terminal = self.cumulative_gamma * is_terminal_multiplier
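# self.cumulative_gamma is the n-step discount gamma**update_horizon, so
# terminal transitions contribute no bootstrap term.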
gamma_with_terminal = tf.tile(gamma_with_terminal[:, None],
[self.num_tau_prime_samples, 1])
# Get the indices of the maximum Q-value across the action dimension.
# Shape of replay_next_qt_argmax: (num_tau_prime_samples x batch_size) x 1.
replay_next_qt_argmax = tf.tile(
self._replay_next_qt_argmax[:, None], [self.num_tau_prime_samples, 1])
# Shape of batch_indices: (num_tau_prime_samples x batch_size) x 1.
batch_indices = tf.cast(tf.range(
self.num_tau_prime_samples * batch_size)[:, None], tf.int64)
# Shape of batch_indexed_target_values:
# (num_tau_prime_samples x batch_size) x 2.
batch_indexed_target_values = tf.concat(
[batch_indices, replay_next_qt_argmax], axis=1)
# Shape of next_target_values: (num_tau_prime_samples x batch_size) x 1.
target_quantile_values = tf.gather_nd(
self._replay_net_target_quantile_values,
batch_indexed_target_values)[:, None]
return rewards + gamma_with_terminal * target_quantile_values
def _build_train_op(self):
"""Builds a training op.
Returns:
train_op: An op performing one step of training from replay data.
"""
batch_size = tf.shape(self._replay.rewards)[0]
target_quantile_values = tf.stop_gradient(
self._build_target_quantile_values_op())
# Reshape to self.num_tau_prime_samples x batch_size x 1 since this is
# the manner in which the target_quantile_values are tiled.
target_quantile_values = tf.reshape(target_quantile_values,
[self.num_tau_prime_samples,
batch_size, 1])
# Transpose dimensions so that the dimensionality is batch_size x
# self.num_tau_prime_samples x 1 to prepare for computation of
# Bellman errors.
# Final shape of target_quantile_values:
# batch_size x num_tau_prime_samples x 1.
target_quantile_values = tf.transpose(target_quantile_values, [1, 0, 2])
# Shape of indices: (num_tau_samples x batch_size) x 1.
# Expand dimension by one so that it can be used to index into all the
# quantiles when using the tf.gather_nd function (see below).
indices = tf.range(self.num_tau_samples * batch_size)[:, None]
# Expand the dimension by one so that it can be used to index into all the
# quantiles when using the tf.gather_nd function (see below).
reshaped_actions = self._replay.actions[:, None]
reshaped_actions = tf.tile(reshaped_actions, [self.num_tau_samples, 1])
# Shape of reshaped_actions: (num_tau_samples x batch_size) x 2.
reshaped_actions = tf.concat([indices, reshaped_actions], axis=1)
chosen_action_quantile_values = tf.gather_nd(
self._replay_net_quantile_values, reshaped_actions)
# Transpose dimensions so that the dimensionality is batch_size x
# self.num_tau_samples x 1 to prepare for computation of
# Bellman errors.
# Reshape to self.num_tau_samples x batch_size x 1 since this is the manner
# in which the quantile values are tiled.
chosen_action_quantile_values = tf.reshape(chosen_action_quantile_values,
[self.num_tau_samples,
batch_size, 1])
# Final shape of chosen_action_quantile_values:
# batch_size x num_tau_samples x 1.
chosen_action_quantile_values = tf.transpose(
chosen_action_quantile_values, [1, 0, 2])  # batch_size x num_tau_samples x 1
# Shape of bellman_errors and huber_loss:
# batch_size x num_tau_prime_samples x num_tau_samples x 1.
bellman_errors = target_quantile_values[:, :, None, :] - chosen_action_quantile_values[:, None, :, :]
# The huber loss (see Section 2.3 of the paper) is defined via two cases:
# case_one: |bellman_errors| <= kappa
# case_two: |bellman_errors| > kappa
huber_loss_case_one = tf.to_float(
tf.abs(bellman_errors) <= self.kappa) * 0.5 * bellman_errors ** 2
huber_loss_case_two = tf.to_float(
tf.abs(bellman_errors) > self.kappa) * self.kappa * (
tf.abs(bellman_errors) - 0.5 * self.kappa)
huber_loss = huber_loss_case_one + huber_loss_case_two
# Reshape replay_quantiles to batch_size x num_tau_samples x 1
replay_quantiles = tf.reshape(
self._replay_net_quantiles, [self.num_tau_samples, batch_size, 1])
replay_quantiles = tf.transpose(replay_quantiles, [1, 0, 2])  # batch_size x num_tau_samples x 1
# Tile by num_tau_prime_samples along a new dimension. Shape is now
# batch_size x num_tau_prime_samples x num_tau_samples x 1.
# These quantiles will be used for computation of the quantile huber loss
# below (see section 2.3 of the paper).
replay_quantiles = tf.to_float(tf.tile(
replay_quantiles[:, None, :, :], [1, self.num_tau_prime_samples, 1, 1]))
# Shape: batch_size x num_tau_prime_samples x num_tau_samples x 1.
quantile_huber_loss = (tf.abs(tf.stop_gradient(replay_quantiles) - tf.stop_gradient(
tf.to_float(bellman_errors < 0))) * huber_loss) / self.kappa
# Sum over current quantile value (num_tau_samples) dimension,
# average over target quantile value (num_tau_prime_samples) dimension.
# Shape: batch_size x num_tau_prime_samples x 1.
loss = tf.reduce_sum(quantile_huber_loss, axis=2)
# Shape: batch_size x 1.
loss = tf.reduce_mean(loss, axis=1)
# TODO(kumasaurabh): Add prioritized replay functionality here.
update_priorities_op = tf.no_op()
with tf.control_dependencies([update_priorities_op]):
if self.summary_writer is not None:
with tf.variable_scope('Losses'):
tf.summary.scalar('QuantileLoss', tf.reduce_mean(loss))
return self.optimizer.minimize(tf.reduce_mean(loss))


@@ -1,15 +1,15 @@
# coding=utf-8
# Copyright 2018 The Dopamine Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


@@ -1,42 +1,42 @@
# Hyperparameters follow the settings from Bellemare et al. (2017), but we
# modify as necessary to match those used in Rainbow (Hessel et al., 2018), to
# ensure apples-to-apples comparison.
import dopamine.agents.rainbow.rainbow_agent
import dopamine.discrete_domains.atari_lib
import dopamine.discrete_domains.run_experiment
import dopamine.replay_memory.prioritized_replay_buffer
import gin.tf.external_configurables
RainbowAgent.num_atoms = 51
RainbowAgent.dueltype = 'DUELTYPE'
RainbowAgent.game = 'GAME'
RainbowAgent.vmax = 10.
RainbowAgent.gamma = 0.99
RainbowAgent.update_horizon = 1
RainbowAgent.min_replay_history = 20000 # agent steps
RainbowAgent.update_period = 4
RainbowAgent.target_update_period = 8000 # agent steps
RainbowAgent.epsilon_train = 0.01
RainbowAgent.epsilon_eval = 0.001
RainbowAgent.epsilon_decay_period = 250000 # agent steps
RainbowAgent.replay_scheme = 'uniform'
RainbowAgent.tf_device = '/gpu:0' # use '/cpu:*' for non-GPU version
RainbowAgent.optimizer = @tf.train.AdamOptimizer()
tf.train.AdamOptimizer.learning_rate = 0.00025
tf.train.AdamOptimizer.epsilon = 0.0003125
atari_lib.create_atari_environment.game_name = 'GAME' #'Pong'
# Sticky actions with probability 0.25, as suggested by (Machado et al., 2017).
#atari_lib.create_atari_environment.sticky_actions = True
atari_lib.create_atari_environment.sticky_actions = False
create_agent.agent_name = 'rainbow'
Runner.num_iterations = 200
Runner.dueltype = 'DUELTYPE'
Runner.game = 'GAME'
Runner.training_steps = 250000 # agent steps
Runner.evaluation_steps = 125000 # agent steps
Runner.max_steps_per_episode = 27000 # agent steps
WrappedPrioritizedReplayBuffer.replay_capacity = 1000000
WrappedPrioritizedReplayBuffer.batch_size = 32


@@ -1,39 +1,39 @@
# Hyperparameters for a simple C51-style Acrobot agent. The hyperparameters
# chosen achieve reasonable performance.
import dopamine.agents.dqn.dqn_agent
import dopamine.agents.rainbow.rainbow_agent
import dopamine.discrete_domains.gym_lib
import dopamine.discrete_domains.run_experiment
import dopamine.replay_memory.prioritized_replay_buffer
import gin.tf.external_configurables
RainbowAgent.observation_shape = %gym_lib.ACROBOT_OBSERVATION_SHAPE
RainbowAgent.observation_dtype = %gym_lib.ACROBOT_OBSERVATION_DTYPE
RainbowAgent.stack_size = %gym_lib.ACROBOT_STACK_SIZE
RainbowAgent.network = @gym_lib.acrobot_rainbow_network
RainbowAgent.num_atoms = 51
RainbowAgent.vmax = 10.
RainbowAgent.gamma = 0.99
RainbowAgent.update_horizon = 1
RainbowAgent.min_replay_history = 500
RainbowAgent.update_period = 4
RainbowAgent.target_update_period = 100
RainbowAgent.epsilon_fn = @dqn_agent.identity_epsilon
RainbowAgent.replay_scheme = 'uniform'
RainbowAgent.tf_device = '/gpu:0' # use '/cpu:*' for non-GPU version
RainbowAgent.optimizer = @tf.train.AdamOptimizer()
tf.train.AdamOptimizer.learning_rate = 0.1
tf.train.AdamOptimizer.epsilon = 0.0003125
create_gym_environment.environment_name = 'Acrobot'
create_gym_environment.version = 'v1'
create_agent.agent_name = 'rainbow'
Runner.create_environment_fn = @gym_lib.create_gym_environment
Runner.num_iterations = 500
Runner.training_steps = 1000
Runner.evaluation_steps = 1000
Runner.max_steps_per_episode = 500
WrappedPrioritizedReplayBuffer.replay_capacity = 50000
WrappedPrioritizedReplayBuffer.batch_size = 128


@@ -1,39 +1,39 @@
# Hyperparameters for a simple C51-style Cartpole agent. The hyperparameters
# chosen achieve reasonable performance.
import dopamine.agents.dqn.dqn_agent
import dopamine.agents.rainbow.rainbow_agent
import dopamine.discrete_domains.gym_lib
import dopamine.discrete_domains.run_experiment
import dopamine.replay_memory.prioritized_replay_buffer
import gin.tf.external_configurables
RainbowAgent.observation_shape = %gym_lib.CARTPOLE_OBSERVATION_SHAPE
RainbowAgent.observation_dtype = %gym_lib.CARTPOLE_OBSERVATION_DTYPE
RainbowAgent.stack_size = %gym_lib.CARTPOLE_STACK_SIZE
RainbowAgent.network = @gym_lib.cartpole_rainbow_network
RainbowAgent.num_atoms = 51
RainbowAgent.vmax = 10.
RainbowAgent.gamma = 0.99
RainbowAgent.update_horizon = 1
RainbowAgent.min_replay_history = 500
RainbowAgent.update_period = 4
RainbowAgent.target_update_period = 100
RainbowAgent.epsilon_fn = @dqn_agent.identity_epsilon
RainbowAgent.replay_scheme = 'uniform'
RainbowAgent.tf_device = '/gpu:0' # use '/cpu:*' for non-GPU version
RainbowAgent.optimizer = @tf.train.AdamOptimizer()
tf.train.AdamOptimizer.learning_rate = 0.001
tf.train.AdamOptimizer.epsilon = 0.0003125
create_gym_environment.environment_name = 'CartPole'
create_gym_environment.version = 'v0'
create_agent.agent_name = 'rainbow'
Runner.create_environment_fn = @gym_lib.create_gym_environment
Runner.num_iterations = 500
Runner.training_steps = 1000
Runner.evaluation_steps = 1000
Runner.max_steps_per_episode = 200 # Default max episode length.
WrappedPrioritizedReplayBuffer.replay_capacity = 50000
WrappedPrioritizedReplayBuffer.batch_size = 128


@@ -1,41 +1,41 @@
# Hyperparameters used in Bellemare et al. (2017).
import dopamine.agents.rainbow.rainbow_agent
import dopamine.discrete_domains.atari_lib
import dopamine.discrete_domains.run_experiment
import dopamine.replay_memory.prioritized_replay_buffer
import gin.tf.external_configurables
RainbowAgent.num_atoms = 51
RainbowAgent.dueltype = 'DUELTYPE'
RainbowAgent.game = 'GAME'
RainbowAgent.vmax = 10.
RainbowAgent.gamma = 0.99
RainbowAgent.update_horizon = 1
RainbowAgent.min_replay_history = 50000 # agent steps
RainbowAgent.update_period = 4
RainbowAgent.target_update_period = 10000 # agent steps
RainbowAgent.epsilon_train = 0.01
RainbowAgent.epsilon_eval = 0.001
RainbowAgent.epsilon_decay_period = 1000000 # agent steps
RainbowAgent.replay_scheme = 'uniform'
RainbowAgent.tf_device = '/gpu:0' # use '/cpu:*' for non-GPU version
RainbowAgent.optimizer = @tf.train.AdamOptimizer()
tf.train.AdamOptimizer.learning_rate = 0.00025
tf.train.AdamOptimizer.epsilon = 0.0003125
atari_lib.create_atari_environment.game_name = 'GAME' #'Pong'
# Deterministic ALE version used in the DQN Nature paper (Mnih et al., 2015).
atari_lib.create_atari_environment.sticky_actions = False
create_agent.agent_name = 'rainbow'
Runner.num_iterations = 200
Runner.dueltype = 'DUELTYPE'
Runner.game = 'GAME'
Runner.training_steps = 250000 # agent steps
Runner.evaluation_steps = 125000 # agent steps
Runner.max_steps_per_episode = 27000 # agent steps
AtariPreprocessing.terminal_on_life_loss = True
WrappedPrioritizedReplayBuffer.replay_capacity = 1000000
WrappedPrioritizedReplayBuffer.batch_size = 32


@@ -1,42 +1,42 @@
# Hyperparameters follow Hessel et al. (2018), except for sticky_actions,
# which was False (not using sticky actions) in the original paper.
import dopamine.agents.rainbow.rainbow_agent
import dopamine.discrete_domains.atari_lib
import dopamine.discrete_domains.run_experiment
import dopamine.replay_memory.prioritized_replay_buffer
import gin.tf.external_configurables
RainbowAgent.num_atoms = 51
RainbowAgent.runtype = 'RUNTYPE'
RainbowAgent.game = 'GAME'
RainbowAgent.vmax = 10.
RainbowAgent.gamma = 0.99
RainbowAgent.update_horizon = 3
RainbowAgent.min_replay_history = 20000 # agent steps
RainbowAgent.update_period = 4
RainbowAgent.target_update_period = 8000 # agent steps
RainbowAgent.epsilon_train = 0.01
RainbowAgent.epsilon_eval = 0.001
RainbowAgent.epsilon_decay_period = 250000 # agent steps
RainbowAgent.replay_scheme = 'prioritized'
RainbowAgent.tf_device = '/gpu:0' # use '/cpu:*' for non-GPU version
RainbowAgent.optimizer = @tf.train.AdamOptimizer()
# Note these parameters are different from C51's.
tf.train.AdamOptimizer.learning_rate = 0.0000625
tf.train.AdamOptimizer.epsilon = 0.00015
atari_lib.create_atari_environment.game_name = 'GAME' #'Pong'
# Sticky actions with probability 0.25, as suggested by (Machado et al., 2017).
#atari_lib.create_atari_environment.sticky_actions = True
atari_lib.create_atari_environment.sticky_actions = False
create_agent.agent_name = 'rainbow'
Runner.num_iterations = 200
Runner.runtype = 'RUNTYPE'
Runner.game = 'GAME'
Runner.training_steps = 250000 # agent steps
Runner.evaluation_steps = 125000 # agent steps
Runner.max_steps_per_episode = 27000 # agent steps
WrappedPrioritizedReplayBuffer.replay_capacity = 1000000
WrappedPrioritizedReplayBuffer.batch_size = 32
# Hyperparameters follow Hessel et al. (2018), except for sticky_actions,
# which was False (not using sticky actions) in the original paper.
import dopamine.agents.rainbow.rainbow_agent
import dopamine.discrete_domains.atari_lib
import dopamine.discrete_domains.run_experiment
import dopamine.replay_memory.prioritized_replay_buffer
import gin.tf.external_configurables
RainbowAgent.num_atoms = 51
RainbowAgent.runtype = 'RUNTYPE'
RainbowAgent.game = 'GAME'
RainbowAgent.vmax = 10.
RainbowAgent.gamma = 0.99
RainbowAgent.update_horizon = 3
RainbowAgent.min_replay_history = 20000 # agent steps
RainbowAgent.update_period = 4
RainbowAgent.target_update_period = 8000 # agent steps
RainbowAgent.epsilon_train = 0.01
RainbowAgent.epsilon_eval = 0.001
RainbowAgent.epsilon_decay_period = 250000 # agent steps
RainbowAgent.replay_scheme = 'prioritized'
RainbowAgent.tf_device = '/gpu:0' # use '/cpu:*' for non-GPU version
RainbowAgent.optimizer = @tf.train.AdamOptimizer()
# Note these parameters are different from C51's.
tf.train.AdamOptimizer.learning_rate = 0.0000625
tf.train.AdamOptimizer.epsilon = 0.00015
atari_lib.create_atari_environment.game_name = 'GAME' #'Pong'
# Sticky actions with probability 0.25, as suggested by (Machado et al., 2017).
#atari_lib.create_atari_environment.sticky_actions = True
atari_lib.create_atari_environment.sticky_actions = False
create_agent.agent_name = 'rainbow'
Runner.num_iterations = 200
Runner.runtype = 'RUNTYPE'
Runner.game = 'GAME'
Runner.training_steps = 250000 # agent steps
Runner.evaluation_steps = 125000 # agent steps
Runner.max_steps_per_episode = 27000 # agent steps
WrappedPrioritizedReplayBuffer.replay_capacity = 1000000
WrappedPrioritizedReplayBuffer.batch_size = 32
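The 'GAME' and 'RUNTYPE' strings above are placeholders that get substituted at launch time. Below is a minimal, hedged sketch of how a config like this is typically consumed, assuming stock Dopamine's `run_experiment` helpers plus this fork's extra `RainbowAgent`/`Runner` parameters; the config path, base_dir, and the 'Pong'/'baseline' values are illustrative, not taken from this commit.

```python
# Hedged sketch: load the Rainbow gin config and fill in the placeholders
# via gin bindings. Paths and values below are hypothetical; the
# runtype/game bindings assume this fork's modified RainbowAgent/Runner.
from dopamine.discrete_domains import run_experiment

gin_files = ['dopamine/agents/rainbow/configs/rainbow.gin']  # hypothetical path
gin_bindings = [
    "atari_lib.create_atari_environment.game_name = 'Pong'",
    "RainbowAgent.game = 'Pong'",
    "Runner.game = 'Pong'",
    "RainbowAgent.runtype = 'baseline'",
    "Runner.runtype = 'baseline'",
]

# Bindings are applied after the config file, so they override the placeholders.
run_experiment.load_gin_configs(gin_files, gin_bindings)
runner = run_experiment.create_runner('/tmp/rainbow_pong')  # hypothetical base_dir
runner.run_experiment()
```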


@@ -1,43 +1,43 @@
# Hyperparameters follow Hessel et al. (2018).
import dopamine.agents.rainbow.rainbow_agent
import dopamine.discrete_domains.atari_lib
import dopamine.discrete_domains.run_experiment
import dopamine.replay_memory.prioritized_replay_buffer
import gin.tf.external_configurables
import os
RainbowAgent.num_atoms = 51
RainbowAgent.runtype = 'RUNTYPE'
RainbowAgent.game = 'GAME'
RainbowAgent.vmax = 10.
RainbowAgent.gamma = 0.99
RainbowAgent.update_horizon = 1
RainbowAgent.min_replay_history = 20000 # agent steps
RainbowAgent.update_period = 4
RainbowAgent.target_update_period = 8000 # agent steps
RainbowAgent.epsilon_train = 0.01
RainbowAgent.epsilon_eval = 0.001
RainbowAgent.epsilon_decay_period = 250000 # agent steps
#RainbowAgent.replay_scheme = 'prioritized'
RainbowAgent.tf_device = '/gpu:0' # use '/cpu:*' for non-GPU version
RainbowAgent.optimizer = @tf.train.AdamOptimizer()
# Note these parameters are different from C51's.
tf.train.AdamOptimizer.learning_rate = 0.0000625
tf.train.AdamOptimizer.epsilon = 0.00015
atari_lib.create_atari_environment.game_name = 'GAME' #'StarGunner'
# Deterministic ALE version used in the AAAI paper.
atari_lib.create_atari_environment.sticky_actions = False
create_agent.agent_name = 'rainbow'
Runner.num_iterations = 200
Runner.runtype = 'RUNTYPE'
Runner.game = 'GAME'
Runner.training_steps = 250000 # agent steps
Runner.evaluation_steps = 125000 # agent steps
Runner.max_steps_per_episode = 27000 # agent steps
AtariPreprocessing.terminal_on_life_loss = True
WrappedPrioritizedReplayBuffer.replay_capacity = 1000000
WrappedPrioritizedReplayBuffer.batch_size = 32
# Hyperparameters follow Hessel et al. (2018).
import dopamine.agents.rainbow.rainbow_agent
import dopamine.discrete_domains.atari_lib
import dopamine.discrete_domains.run_experiment
import dopamine.replay_memory.prioritized_replay_buffer
import gin.tf.external_configurables
import os
RainbowAgent.num_atoms = 51
RainbowAgent.runtype = 'RUNTYPE'
RainbowAgent.game = 'GAME'
RainbowAgent.vmax = 10.
RainbowAgent.gamma = 0.99
RainbowAgent.update_horizon = 1
RainbowAgent.min_replay_history = 20000 # agent steps
RainbowAgent.update_period = 4
RainbowAgent.target_update_period = 8000 # agent steps
RainbowAgent.epsilon_train = 0.01
RainbowAgent.epsilon_eval = 0.001
RainbowAgent.epsilon_decay_period = 250000 # agent steps
#RainbowAgent.replay_scheme = 'prioritized'
RainbowAgent.tf_device = '/gpu:0' # use '/cpu:*' for non-GPU version
RainbowAgent.optimizer = @tf.train.AdamOptimizer()
# Note these parameters are different from C51's.
tf.train.AdamOptimizer.learning_rate = 0.0000625
tf.train.AdamOptimizer.epsilon = 0.00015
atari_lib.create_atari_environment.game_name = 'GAME' #'StarGunner'
# Deterministic ALE version used in the AAAI paper.
atari_lib.create_atari_environment.sticky_actions = False
create_agent.agent_name = 'rainbow'
Runner.num_iterations = 200
Runner.runtype = 'RUNTYPE'
Runner.game = 'GAME'
Runner.training_steps = 250000 # agent steps
Runner.evaluation_steps = 125000 # agent steps
Runner.max_steps_per_episode = 27000 # agent steps
AtariPreprocessing.terminal_on_life_loss = True
WrappedPrioritizedReplayBuffer.replay_capacity = 1000000
WrappedPrioritizedReplayBuffer.batch_size = 32


@@ -1,38 +1,38 @@
# Hyperparameters for a simple Rainbow-style Acrobot agent. The hyperparameters
# chosen achieve reasonable performance.
import dopamine.agents.rainbow.rainbow_agent
import dopamine.discrete_domains.gym_lib
import dopamine.discrete_domains.run_experiment
import dopamine.replay_memory.prioritized_replay_buffer
import gin.tf.external_configurables
RainbowAgent.observation_shape = %gym_lib.ACROBOT_OBSERVATION_SHAPE
RainbowAgent.observation_dtype = %gym_lib.ACROBOT_OBSERVATION_DTYPE
RainbowAgent.stack_size = %gym_lib.ACROBOT_STACK_SIZE
RainbowAgent.network = @gym_lib.acrobot_rainbow_network
RainbowAgent.num_atoms = 51
RainbowAgent.vmax = 10.
RainbowAgent.gamma = 0.99
RainbowAgent.update_horizon = 3
RainbowAgent.min_replay_history = 500
RainbowAgent.update_period = 4
RainbowAgent.target_update_period = 100
RainbowAgent.epsilon_fn = @dqn_agent.identity_epsilon
RainbowAgent.replay_scheme = 'prioritized'
RainbowAgent.tf_device = '/gpu:0' # use '/cpu:*' for non-GPU version
RainbowAgent.optimizer = @tf.train.AdamOptimizer()
tf.train.AdamOptimizer.learning_rate = 0.09
tf.train.AdamOptimizer.epsilon = 0.0003125
create_gym_environment.environment_name = 'Acrobot'
create_gym_environment.version = 'v1'
create_agent.agent_name = 'rainbow'
Runner.create_environment_fn = @gym_lib.create_gym_environment
Runner.num_iterations = 500
Runner.training_steps = 1000
Runner.evaluation_steps = 1000
Runner.max_steps_per_episode = 500
WrappedPrioritizedReplayBuffer.replay_capacity = 50000
WrappedPrioritizedReplayBuffer.batch_size = 128
# Hyperparameters for a simple Rainbow-style Acrobot agent. The hyperparameters
# chosen achieve reasonable performance.
import dopamine.agents.rainbow.rainbow_agent
import dopamine.discrete_domains.gym_lib
import dopamine.discrete_domains.run_experiment
import dopamine.replay_memory.prioritized_replay_buffer
import gin.tf.external_configurables
RainbowAgent.observation_shape = %gym_lib.ACROBOT_OBSERVATION_SHAPE
RainbowAgent.observation_dtype = %gym_lib.ACROBOT_OBSERVATION_DTYPE
RainbowAgent.stack_size = %gym_lib.ACROBOT_STACK_SIZE
RainbowAgent.network = @gym_lib.acrobot_rainbow_network
RainbowAgent.num_atoms = 51
RainbowAgent.vmax = 10.
RainbowAgent.gamma = 0.99
RainbowAgent.update_horizon = 3
RainbowAgent.min_replay_history = 500
RainbowAgent.update_period = 4
RainbowAgent.target_update_period = 100
RainbowAgent.epsilon_fn = @dqn_agent.identity_epsilon
RainbowAgent.replay_scheme = 'prioritized'
RainbowAgent.tf_device = '/gpu:0' # use '/cpu:*' for non-GPU version
RainbowAgent.optimizer = @tf.train.AdamOptimizer()
tf.train.AdamOptimizer.learning_rate = 0.09
tf.train.AdamOptimizer.epsilon = 0.0003125
create_gym_environment.environment_name = 'Acrobot'
create_gym_environment.version = 'v1'
create_agent.agent_name = 'rainbow'
Runner.create_environment_fn = @gym_lib.create_gym_environment
Runner.num_iterations = 500
Runner.training_steps = 1000
Runner.evaluation_steps = 1000
Runner.max_steps_per_episode = 500
WrappedPrioritizedReplayBuffer.replay_capacity = 50000
WrappedPrioritizedReplayBuffer.batch_size = 128


@@ -1,39 +1,39 @@
# Hyperparameters for a simple Rainbow-style Cartpole agent. The
# hyperparameters chosen achieve reasonable performance.
import dopamine.agents.dqn.dqn_agent
import dopamine.agents.rainbow.rainbow_agent
import dopamine.discrete_domains.gym_lib
import dopamine.discrete_domains.run_experiment
import dopamine.replay_memory.prioritized_replay_buffer
import gin.tf.external_configurables
RainbowAgent.observation_shape = %gym_lib.CARTPOLE_OBSERVATION_SHAPE
RainbowAgent.observation_dtype = %gym_lib.CARTPOLE_OBSERVATION_DTYPE
RainbowAgent.stack_size = %gym_lib.CARTPOLE_STACK_SIZE
RainbowAgent.network = @gym_lib.cartpole_rainbow_network
RainbowAgent.num_atoms = 51
RainbowAgent.vmax = 10.
RainbowAgent.gamma = 0.99
RainbowAgent.update_horizon = 3
RainbowAgent.min_replay_history = 500
RainbowAgent.update_period = 4
RainbowAgent.target_update_period = 100
RainbowAgent.epsilon_fn = @dqn_agent.identity_epsilon
RainbowAgent.replay_scheme = 'prioritized'
RainbowAgent.tf_device = '/gpu:0' # use '/cpu:*' for non-GPU version
RainbowAgent.optimizer = @tf.train.AdamOptimizer()
tf.train.AdamOptimizer.learning_rate = 0.09
tf.train.AdamOptimizer.epsilon = 0.0003125
create_gym_environment.environment_name = 'CartPole'
create_gym_environment.version = 'v0'
create_agent.agent_name = 'rainbow'
Runner.create_environment_fn = @gym_lib.create_gym_environment
Runner.num_iterations = 500
Runner.training_steps = 1000
Runner.evaluation_steps = 1000
Runner.max_steps_per_episode = 200 # Default max episode length.
WrappedPrioritizedReplayBuffer.replay_capacity = 50000
WrappedPrioritizedReplayBuffer.batch_size = 128
# Hyperparameters for a simple Rainbow-style Cartpole agent. The
# hyperparameters chosen achieve reasonable performance.
import dopamine.agents.dqn.dqn_agent
import dopamine.agents.rainbow.rainbow_agent
import dopamine.discrete_domains.gym_lib
import dopamine.discrete_domains.run_experiment
import dopamine.replay_memory.prioritized_replay_buffer
import gin.tf.external_configurables
RainbowAgent.observation_shape = %gym_lib.CARTPOLE_OBSERVATION_SHAPE
RainbowAgent.observation_dtype = %gym_lib.CARTPOLE_OBSERVATION_DTYPE
RainbowAgent.stack_size = %gym_lib.CARTPOLE_STACK_SIZE
RainbowAgent.network = @gym_lib.cartpole_rainbow_network
RainbowAgent.num_atoms = 51
RainbowAgent.vmax = 10.
RainbowAgent.gamma = 0.99
RainbowAgent.update_horizon = 3
RainbowAgent.min_replay_history = 500
RainbowAgent.update_period = 4
RainbowAgent.target_update_period = 100
RainbowAgent.epsilon_fn = @dqn_agent.identity_epsilon
RainbowAgent.replay_scheme = 'prioritized'
RainbowAgent.tf_device = '/gpu:0' # use '/cpu:*' for non-GPU version
RainbowAgent.optimizer = @tf.train.AdamOptimizer()
tf.train.AdamOptimizer.learning_rate = 0.09
tf.train.AdamOptimizer.epsilon = 0.0003125
create_gym_environment.environment_name = 'CartPole'
create_gym_environment.version = 'v0'
create_agent.agent_name = 'rainbow'
Runner.create_environment_fn = @gym_lib.create_gym_environment
Runner.num_iterations = 500
Runner.training_steps = 1000
Runner.evaluation_steps = 1000
Runner.max_steps_per_episode = 200 # Default max episode length.
WrappedPrioritizedReplayBuffer.replay_capacity = 50000
WrappedPrioritizedReplayBuffer.batch_size = 128
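The Acrobot and Cartpole configs follow the same pattern but bind Runner.create_environment_fn to gym_lib.create_gym_environment instead of the Atari factory, so no placeholder substitution is needed. A hedged sketch with hypothetical paths, overriding a couple of values for a quick run:

```python
# Hedged sketch: reuse the Cartpole Rainbow config with shorter training.
# The config path and base_dir are hypothetical.
from dopamine.discrete_domains import run_experiment

run_experiment.load_gin_configs(
    ['dopamine/agents/rainbow/configs/rainbow_cartpole.gin'],  # hypothetical
    ['Runner.num_iterations = 50',   # quick smoke test instead of 500 iterations
     'Runner.training_steps = 500'])
runner = run_experiment.create_runner('/tmp/rainbow_cartpole')
runner.run_experiment()
```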

Diff not shown because the file is too large.


@@ -1,31 +1,31 @@
# Colabs
This directory contains
[`utils.py`](https://github.com/google/dopamine/blob/master/dopamine/colab/utils.py),
which provides a number of useful utilities for loading experiment statistics.
We also provide a set of colabs to help illustrate how you can use Dopamine.
## Agents
In this
[colab](https://colab.research.google.com/github/google/dopamine/blob/master/dopamine/colab/agents.ipynb)
we illustrate how to create a new agent by either subclassing
[`DQN`](https://github.com/google/dopamine/blob/master/dopamine/agents/dqn/dqn_agent.py)
or by creating a new agent from scratch.
## Loading statistics
In this
[colab](https://colab.research.google.com/github/google/dopamine/blob/master/dopamine/colab/load_statistics.ipynb)
we illustrate how to load and visualize the logs data produced by Dopamine.
## Visualizing with Tensorboard
In this
[colab](https://colab.research.google.com/github/google/dopamine/blob/master/dopamine/colab/tensorboard.ipynb)
we illustrate how to download and visualize different agents with Tensorboard.
## Training on Cartpole
In this
[colab](https://colab.research.google.com/github/google/dopamine/blob/master/dopamine/colab/cartpole.ipynb)
we illustrate how to train DQN and C51 on the Cartpole environment.
# Colabs
This directory contains
[`utils.py`](https://github.com/google/dopamine/blob/master/dopamine/colab/utils.py),
which provides a number of useful utilities for loading experiment statistics.
We also provide a set of colabs to help illustrate how you can use Dopamine.
## Agents
In this
[colab](https://colab.research.google.com/github/google/dopamine/blob/master/dopamine/colab/agents.ipynb)
we illustrate how to create a new agent by either subclassing
[`DQN`](https://github.com/google/dopamine/blob/master/dopamine/agents/dqn/dqn_agent.py)
or by creating a new agent from scratch.
## Loading statistics
In this
[colab](https://colab.research.google.com/github/google/dopamine/blob/master/dopamine/colab/load_statistics.ipynb)
we illustrate how to load and visualize the logs data produced by Dopamine.
## Visualizing with Tensorboard
In this
[colab](https://colab.research.google.com/github/google/dopamine/blob/master/dopamine/colab/tensorboard.ipynb)
we illustrate how to download and visualize different agents with Tensorboard.
## Training on Cartpole
In this
[colab](https://colab.research.google.com/github/google/dopamine/blob/master/dopamine/colab/cartpole.ipynb)
we illustrate how to train DQN and C51 on the Cartpole environment.
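As a small, hedged illustration of the statistics-loading utilities in `utils.py` mentioned above, a run written by the Dopamine Runner can be summarized in a couple of calls; the log directory below is hypothetical and corresponds to the `base_dir/logs` folder of a previous experiment.

```python
# Hedged sketch: load and summarize returns from a local Dopamine run.
from dopamine.colab import utils as colab_utils

raw_data, last_iteration = colab_utils.load_statistics('/tmp/dopamine_run/logs')
summary = colab_utils.summarize_data(
    raw_data, ['train_episode_returns', 'eval_episode_returns'])
print('iterations:', last_iteration,
      'final eval return:', summary['eval_episode_returns'][-1])
```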


@@ -1,15 +1,15 @@
# coding=utf-8
# Copyright 2018 The Dopamine Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# coding=utf-8
# Copyright 2018 The Dopamine Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

File diff hidden because one or more lines are too long.

File diff hidden because one or more lines are too long.

File diff hidden because one or more lines are too long.


@@ -1,112 +1,112 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "tensorboard.ipynb",
"version": "0.3.2",
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
}
},
"cells": [
{
"metadata": {
"id": "VYNA79KmgvbY",
"colab_type": "text"
},
"cell_type": "markdown",
"source": [
"Copyright 2018 The Dopamine Authors.\n",
"\n",
"Licensed under the Apache License, Version 2.0 (the \"License\"); you may not use this file except in compliance with the License. You may obtain a copy of the License at\n",
"\n",
"https://www.apache.org/licenses/LICENSE-2.0\n",
"\n",
"Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License."
]
},
{
"metadata": {
"id": "Ctd9k0h6wnqT",
"colab_type": "text"
},
"cell_type": "markdown",
"source": [
"# Visualize Dopamine baselines with Tensorboard\n",
"This colab allows you to easily view the trained baselines with Tensorboard (even if you don't have Tensorboard on your local machine!).\n",
"\n",
"Simply specify the game you would like to visualize and then run the cells in order.\n",
"\n",
"_The instructions for setting up Tensorboard were obtained from https://www.dlology.com/blog/quick-guide-to-run-tensorboard-in-google-colab/_"
]
},
{
"metadata": {
"id": "s8r_45_0qpmb",
"colab_type": "code",
"colab": {},
"cellView": "form"
},
"cell_type": "code",
"source": [
"# @title Prepare all necessary files and binaries.\n",
"!wget https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip\n",
"!unzip ngrok-stable-linux-amd64.zip\n",
"!gsutil -q -m cp -R gs://download-dopamine-rl/compiled_tb_event_files.tar.gz /content/\n",
"!tar -xvzf /content/compiled_tb_event_files.tar.gz"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "D-oZRzeWwHZN",
"colab_type": "code",
"colab": {},
"cellView": "form"
},
"cell_type": "code",
"source": [
"# @title Select which game to visualize.\n",
"game = 'Asterix' # @param['AirRaid', 'Alien', 'Amidar', 'Assault', 'Asterix', 'Asteroids', 'Atlantis', 'BankHeist', 'BattleZone', 'BeamRider', 'Berzerk', 'Bowling', 'Boxing', 'Breakout', 'Carnival', 'Centipede', 'ChopperCommand', 'CrazyClimber', 'DemonAttack', 'DoubleDunk', 'ElevatorAction', 'Enduro', 'FishingDerby', 'Freeway', 'Frostbite', 'Gopher', 'Gravitar', 'Hero', 'IceHockey', 'Jamesbond', 'JourneyEscape', 'Kangaroo', 'Krull', 'KungFuMaster', 'MontezumaRevenge', 'MsPacman', 'NameThisGame', 'Phoenix', 'Pitfall', 'Pong', 'Pooyan', 'PrivateEye', 'Qbert', 'Riverraid', 'RoadRunner', 'Robotank', 'Seaquest', 'Skiing', 'Solaris', 'SpaceInvaders', 'StarGunner', 'Tennis', 'TimePilot', 'Tutankham', 'UpNDown', 'Venture', 'VideoPinball', 'WizardOfWor', 'YarsRevenge', 'Zaxxon']\n",
"agents = ['dqn', 'c51', 'rainbow', 'iqn']\n",
"for agent in agents:\n",
" for run in range(1, 6):\n",
" !mkdir -p \"/content/$game/$agent/$run\"\n",
" !cp -r \"/content/$agent/$game/$run\" \"/content/$game/$agent/$run\"\n",
"LOG_DIR = '/content/{}'.format(game)\n",
"get_ipython().system_raw(\n",
" 'tensorboard --logdir {} --host 0.0.0.0 --port 6006 &'\n",
" .format(LOG_DIR)\n",
")"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "zlKKnaP4y9FA",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 35
},
"cellView": "form",
"outputId": "3abff714-c484-436e-dc5f-88b15511f4f2"
},
"cell_type": "code",
"source": [
"# @title Start the tensorboard\n",
"get_ipython().system_raw('./ngrok http 6006 &')\n",
"! curl -s http://localhost:4040/api/tunnels | python3 -c \\\n",
" \"import sys, json; print(json.load(sys.stdin)['tunnels'][0]['public_url'])\""
],
"execution_count": 0,
"outputs": []
}
]
}
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "tensorboard.ipynb",
"version": "0.3.2",
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
}
},
"cells": [
{
"metadata": {
"id": "VYNA79KmgvbY",
"colab_type": "text"
},
"cell_type": "markdown",
"source": [
"Copyright 2018 The Dopamine Authors.\n",
"\n",
"Licensed under the Apache License, Version 2.0 (the \"License\"); you may not use this file except in compliance with the License. You may obtain a copy of the License at\n",
"\n",
"https://www.apache.org/licenses/LICENSE-2.0\n",
"\n",
"Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License."
]
},
{
"metadata": {
"id": "Ctd9k0h6wnqT",
"colab_type": "text"
},
"cell_type": "markdown",
"source": [
"# Visualize Dopamine baselines with Tensorboard\n",
"This colab allows you to easily view the trained baselines with Tensorboard (even if you don't have Tensorboard on your local machine!).\n",
"\n",
"Simply specify the game you would like to visualize and then run the cells in order.\n",
"\n",
"_The instructions for setting up Tensorboard were obtained from https://www.dlology.com/blog/quick-guide-to-run-tensorboard-in-google-colab/_"
]
},
{
"metadata": {
"id": "s8r_45_0qpmb",
"colab_type": "code",
"colab": {},
"cellView": "form"
},
"cell_type": "code",
"source": [
"# @title Prepare all necessary files and binaries.\n",
"!wget https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip\n",
"!unzip ngrok-stable-linux-amd64.zip\n",
"!gsutil -q -m cp -R gs://download-dopamine-rl/compiled_tb_event_files.tar.gz /content/\n",
"!tar -xvzf /content/compiled_tb_event_files.tar.gz"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "D-oZRzeWwHZN",
"colab_type": "code",
"colab": {},
"cellView": "form"
},
"cell_type": "code",
"source": [
"# @title Select which game to visualize.\n",
"game = 'Asterix' # @param['AirRaid', 'Alien', 'Amidar', 'Assault', 'Asterix', 'Asteroids', 'Atlantis', 'BankHeist', 'BattleZone', 'BeamRider', 'Berzerk', 'Bowling', 'Boxing', 'Breakout', 'Carnival', 'Centipede', 'ChopperCommand', 'CrazyClimber', 'DemonAttack', 'DoubleDunk', 'ElevatorAction', 'Enduro', 'FishingDerby', 'Freeway', 'Frostbite', 'Gopher', 'Gravitar', 'Hero', 'IceHockey', 'Jamesbond', 'JourneyEscape', 'Kangaroo', 'Krull', 'KungFuMaster', 'MontezumaRevenge', 'MsPacman', 'NameThisGame', 'Phoenix', 'Pitfall', 'Pong', 'Pooyan', 'PrivateEye', 'Qbert', 'Riverraid', 'RoadRunner', 'Robotank', 'Seaquest', 'Skiing', 'Solaris', 'SpaceInvaders', 'StarGunner', 'Tennis', 'TimePilot', 'Tutankham', 'UpNDown', 'Venture', 'VideoPinball', 'WizardOfWor', 'YarsRevenge', 'Zaxxon']\n",
"agents = ['dqn', 'c51', 'rainbow', 'iqn']\n",
"for agent in agents:\n",
" for run in range(1, 6):\n",
" !mkdir -p \"/content/$game/$agent/$run\"\n",
" !cp -r \"/content/$agent/$game/$run\" \"/content/$game/$agent/$run\"\n",
"LOG_DIR = '/content/{}'.format(game)\n",
"get_ipython().system_raw(\n",
" 'tensorboard --logdir {} --host 0.0.0.0 --port 6006 &'\n",
" .format(LOG_DIR)\n",
")"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "zlKKnaP4y9FA",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 35
},
"cellView": "form",
"outputId": "3abff714-c484-436e-dc5f-88b15511f4f2"
},
"cell_type": "code",
"source": [
"# @title Start the tensorboard\n",
"get_ipython().system_raw('./ngrok http 6006 &')\n",
"! curl -s http://localhost:4040/api/tunnels | python3 -c \\\n",
" \"import sys, json; print(json.load(sys.stdin)['tunnels'][0]['public_url'])\""
],
"execution_count": 0,
"outputs": []
}
]
}


@@ -1,280 +1,280 @@
# coding=utf-8
# Copyright 2018 The Dopamine Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""This provides utilities for dealing with Dopamine data.
See: dopamine/common/logger.py .
"""
import itertools
import os
import pickle
import sys
import numpy as np
import pandas as pd
import tensorflow as tf
FILE_PREFIX = 'log'
ITERATION_PREFIX = 'iteration_'
ALL_GAMES = ['AirRaid', 'Alien', 'Amidar', 'Assault', 'Asterix', 'Asteroids',
'Atlantis', 'BankHeist', 'BattleZone', 'BeamRider', 'Berzerk',
'Bowling', 'Boxing', 'Breakout', 'Carnival', 'Centipede',
'ChopperCommand', 'CrazyClimber', 'DemonAttack', 'DoubleDunk',
'ElevatorAction', 'Enduro', 'FishingDerby', 'Freeway', 'Frostbite',
'Gopher', 'Gravitar', 'Hero', 'IceHockey', 'Jamesbond',
'JourneyEscape', 'Kangaroo', 'Krull', 'KungFuMaster',
'MontezumaRevenge', 'MsPacman', 'NameThisGame', 'Phoenix',
'Pitfall', 'Pong', 'Pooyan', 'PrivateEye', 'Qbert', 'Riverraid',
'RoadRunner', 'Robotank', 'Seaquest', 'Skiing', 'Solaris',
'SpaceInvaders', 'StarGunner', 'Tennis', 'TimePilot', 'Tutankham',
'UpNDown', 'Venture', 'VideoPinball', 'WizardOfWor', 'YarsRevenge',
'Zaxxon']
def load_baselines(base_dir, verbose=False):
"""Reads in the baseline experimental data from a specified base directory.
Args:
base_dir: string, base directory where to read data from.
verbose: bool, whether to print warning messages.
Returns:
A dict containing pandas DataFrames for all available agents and games.
"""
experimental_data = {}
for game in ALL_GAMES:
for agent in ['dqn', 'c51', 'rainbow', 'iqn']:
game_data_file = os.path.join(base_dir, agent, '{}.pkl'.format(game))
if not tf.gfile.Exists(game_data_file):
if verbose:
# pylint: disable=superfluous-parens
print('Unable to load data for agent {} on game {}'.format(agent,
game))
# pylint: enable=superfluous-parens
continue
with tf.gfile.Open(game_data_file, 'rb') as f:
if sys.version_info.major >= 3:
# pylint: disable=unexpected-keyword-arg
single_agent_data = pickle.load(f, encoding='latin1')
# pylint: enable=unexpected-keyword-arg
else:
single_agent_data = pickle.load(f)
single_agent_data['agent'] = agent
if game in experimental_data:
experimental_data[game] = experimental_data[game].merge(
single_agent_data, how='outer')
else:
experimental_data[game] = single_agent_data
return experimental_data
def load_statistics(log_path, iteration_number=None, verbose=True):
"""Reads in a statistics object from log_path.
Args:
log_path: string, provides the full path to the training/eval statistics.
iteration_number: The iteration number of the statistics object we want
to read. If set to None, load the latest version.
verbose: Whether to output information about the load procedure.
Returns:
data: The requested statistics object.
iteration: The corresponding iteration number.
Raises:
Exception: if data is not present.
"""
# If no iteration is specified, we'll look for the most recent.
if iteration_number is None:
iteration_number = get_latest_iteration(log_path)
log_file = '%s/%s_%d' % (log_path, FILE_PREFIX, iteration_number)
if verbose:
# pylint: disable=superfluous-parens
print('Reading statistics from: {}'.format(log_file))
# pylint: enable=superfluous-parens
with tf.gfile.Open(log_file, 'rb') as f:
return pickle.load(f), iteration_number
def get_latest_file(path):
"""Return the file named 'path_[0-9]*' with the largest such number.
Args:
path: The base path (including directory and base name) to search.
Returns:
The latest file (in terms of given numbers).
"""
try:
latest_iteration = get_latest_iteration(path)
return os.path.join(path, '{}_{}'.format(FILE_PREFIX, latest_iteration))
except ValueError:
return None
def get_latest_iteration(path):
"""Return the largest iteration number corresponding to the given path.
Args:
path: The base path (including directory and base name) to search.
Returns:
The latest iteration number.
Raises:
ValueError: if no log data is available at the given path.
"""
glob = os.path.join(path, '{}_[0-9]*'.format(FILE_PREFIX))
log_files = tf.gfile.Glob(glob)
if not log_files:
raise ValueError('No log data found at {}'.format(path))
def extract_iteration(x):
return int(x[x.rfind('_') + 1:])
latest_iteration = max(extract_iteration(x) for x in log_files)
return latest_iteration
def summarize_data(data, summary_keys):
"""Processes log data into a per-iteration summary.
Args:
data: Dictionary loaded by load_statistics describing the data. This
dictionary has keys iteration_0, iteration_1, ... describing per-iteration
data.
summary_keys: List of per-iteration data to be summarized.
Example:
data = load_statistics(...)
summarize_data(data, ['train_episode_returns',
'eval_episode_returns'])
Returns:
A dictionary mapping each key in summary_keys to a per-iteration summary.
"""
summary = {}
latest_iteration_number = len(data.keys())
current_value = None
for key in summary_keys:
summary[key] = []
# Compute per-iteration average of the given key.
for i in range(latest_iteration_number):
iter_key = '{}{}'.format(ITERATION_PREFIX, i)
# We allow reporting the same value multiple times when data is missing.
# If there is no data for this iteration, use the previous value.
if iter_key in data:
current_value = np.mean(data[iter_key][key])
summary[key].append(current_value)
return summary
def read_experiment(log_path,
parameter_set=None,
job_descriptor='',
iteration_number=None,
summary_keys=('train_episode_returns',
'eval_episode_returns'),
verbose=False):
"""Reads in a set of experimental results from log_path.
The provided parameter_set is an ordered_dict which
1) defines the parameters of this experiment,
2) defines the order in which they occur in the job descriptor.
The method reads all experiments of the form
${log_path}/${job_descriptor}.format(params)/logs,
where params is constructed from the cross product of the elements in
the parameter_set.
For example:
parameter_set = collections.OrderedDict([
('game', ['Asterix', 'Pong']),
('epsilon', ['0', '0.1'])
])
read_experiment('/tmp/logs', parameter_set, job_descriptor='{}_{}')
Will try to read logs from:
- /tmp/logs/Asterix_0/logs
- /tmp/logs/Asterix_0.1/logs
- /tmp/logs/Pong_0/logs
- /tmp/logs/Pong_0.1/logs
Args:
log_path: string, base path specifying where results live.
parameter_set: An ordered_dict mapping parameter names to allowable values.
job_descriptor: A job descriptor string which is used to construct the full
path for each trial within an experiment.
iteration_number: Int, if not None determines the iteration number at which
we read in results.
summary_keys: Iterable of strings, iteration statistics to summarize.
verbose: If True, print out additional information.
Returns:
A Pandas dataframe containing experimental results.
"""
keys = [] if parameter_set is None else list(parameter_set.keys())
# Extract parameter value lists, one per parameter.
ordered_values = [parameter_set[key] for key in keys]
column_names = keys + ['iteration'] + list(summary_keys)
num_parameter_settings = len([_ for _ in itertools.product(*ordered_values)])
expected_num_iterations = 200
expected_num_rows = num_parameter_settings * expected_num_iterations
# Create DataFrame with predicted number of rows.
data_frame = pd.DataFrame(index=np.arange(0, expected_num_rows),
columns=column_names)
row_index = 0
# Now take their cross product. This generates tuples of the form
# (p1, p2, p3, ...) where p1, p2, p3 are parameter values for the first,
# second, etc. parameters as ordered in value_set.
for parameter_tuple in itertools.product(*ordered_values):
if job_descriptor is not None:
name = job_descriptor.format(*parameter_tuple)
else:
# Construct name for values.
name = '-'.join([keys[i] + '_' + str(parameter_tuple[i])
for i in range(len(keys))])
experiment_path = '{}/{}/logs'.format(log_path, name)
raw_data, last_iteration = load_statistics(
experiment_path, iteration_number=iteration_number, verbose=verbose)
summary = summarize_data(raw_data, summary_keys)
for iteration in range(last_iteration):
# The row contains all the parameters, the iteration, and finally the
# requested values.
row_data = (list(parameter_tuple) + [iteration] +
[summary[key][iteration] for key in summary_keys])
data_frame.loc[row_index] = row_data
row_index += 1
# Shed any unused rows.
return data_frame.drop(np.arange(row_index, expected_num_rows))
# coding=utf-8
# Copyright 2018 The Dopamine Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""This provides utilities for dealing with Dopamine data.
See: dopamine/common/logger.py .
"""
import itertools
import os
import pickle
import sys
import numpy as np
import pandas as pd
import tensorflow as tf
FILE_PREFIX = 'log'
ITERATION_PREFIX = 'iteration_'
ALL_GAMES = ['AirRaid', 'Alien', 'Amidar', 'Assault', 'Asterix', 'Asteroids',
'Atlantis', 'BankHeist', 'BattleZone', 'BeamRider', 'Berzerk',
'Bowling', 'Boxing', 'Breakout', 'Carnival', 'Centipede',
'ChopperCommand', 'CrazyClimber', 'DemonAttack', 'DoubleDunk',
'ElevatorAction', 'Enduro', 'FishingDerby', 'Freeway', 'Frostbite',
'Gopher', 'Gravitar', 'Hero', 'IceHockey', 'Jamesbond',
'JourneyEscape', 'Kangaroo', 'Krull', 'KungFuMaster',
'MontezumaRevenge', 'MsPacman', 'NameThisGame', 'Phoenix',
'Pitfall', 'Pong', 'Pooyan', 'PrivateEye', 'Qbert', 'Riverraid',
'RoadRunner', 'Robotank', 'Seaquest', 'Skiing', 'Solaris',
'SpaceInvaders', 'StarGunner', 'Tennis', 'TimePilot', 'Tutankham',
'UpNDown', 'Venture', 'VideoPinball', 'WizardOfWor', 'YarsRevenge',
'Zaxxon']
def load_baselines(base_dir, verbose=False):
"""Reads in the baseline experimental data from a specified base directory.
Args:
base_dir: string, base directory where to read data from.
verbose: bool, whether to print warning messages.
Returns:
A dict containing pandas DataFrames for all available agents and games.
"""
experimental_data = {}
for game in ALL_GAMES:
for agent in ['dqn', 'c51', 'rainbow', 'iqn']:
game_data_file = os.path.join(base_dir, agent, '{}.pkl'.format(game))
if not tf.gfile.Exists(game_data_file):
if verbose:
# pylint: disable=superfluous-parens
print('Unable to load data for agent {} on game {}'.format(agent,
game))
# pylint: enable=superfluous-parens
continue
with tf.gfile.Open(game_data_file, 'rb') as f:
if sys.version_info.major >= 3:
# pylint: disable=unexpected-keyword-arg
single_agent_data = pickle.load(f, encoding='latin1')
# pylint: enable=unexpected-keyword-arg
else:
single_agent_data = pickle.load(f)
single_agent_data['agent'] = agent
if game in experimental_data:
experimental_data[game] = experimental_data[game].merge(
single_agent_data, how='outer')
else:
experimental_data[game] = single_agent_data
return experimental_data
def load_statistics(log_path, iteration_number=None, verbose=True):
"""Reads in a statistics object from log_path.
Args:
log_path: string, provides the full path to the training/eval statistics.
iteration_number: The iteration number of the statistics object we want
to read. If set to None, load the latest version.
verbose: Whether to output information about the load procedure.
Returns:
data: The requested statistics object.
iteration: The corresponding iteration number.
Raises:
Exception: if data is not present.
"""
# If no iteration is specified, we'll look for the most recent.
if iteration_number is None:
iteration_number = get_latest_iteration(log_path)
log_file = '%s/%s_%d' % (log_path, FILE_PREFIX, iteration_number)
if verbose:
# pylint: disable=superfluous-parens
print('Reading statistics from: {}'.format(log_file))
# pylint: enable=superfluous-parens
with tf.gfile.Open(log_file, 'rb') as f:
return pickle.load(f), iteration_number
def get_latest_file(path):
"""Return the file named 'path_[0-9]*' with the largest such number.
Args:
path: The base path (including directory and base name) to search.
Returns:
The latest file (in terms of given numbers).
"""
try:
latest_iteration = get_latest_iteration(path)
return os.path.join(path, '{}_{}'.format(FILE_PREFIX, latest_iteration))
except ValueError:
return None
def get_latest_iteration(path):
"""Return the largest iteration number corresponding to the given path.
Args:
path: The base path (including directory and base name) to search.
Returns:
The latest iteration number.
Raises:
ValueError: if no log data is available at the given path.
"""
glob = os.path.join(path, '{}_[0-9]*'.format(FILE_PREFIX))
log_files = tf.gfile.Glob(glob)
if not log_files:
raise ValueError('No log data found at {}'.format(path))
def extract_iteration(x):
return int(x[x.rfind('_') + 1:])
latest_iteration = max(extract_iteration(x) for x in log_files)
return latest_iteration
def summarize_data(data, summary_keys):
"""Processes log data into a per-iteration summary.
Args:
data: Dictionary loaded by load_statistics describing the data. This
dictionary has keys iteration_0, iteration_1, ... describing per-iteration
data.
summary_keys: List of per-iteration data to be summarized.
Example:
data = load_statistics(...)
summarize_data(data, ['train_episode_returns',
'eval_episode_returns'])
Returns:
A dictionary mapping each key in summary_keys to a per-iteration summary.
"""
summary = {}
latest_iteration_number = len(data.keys())
current_value = None
for key in summary_keys:
summary[key] = []
# Compute per-iteration average of the given key.
for i in range(latest_iteration_number):
iter_key = '{}{}'.format(ITERATION_PREFIX, i)
# We allow reporting the same value multiple times when data is missing.
# If there is no data for this iteration, use the previous value.
if iter_key in data:
current_value = np.mean(data[iter_key][key])
summary[key].append(current_value)
return summary
def read_experiment(log_path,
parameter_set=None,
job_descriptor='',
iteration_number=None,
summary_keys=('train_episode_returns',
'eval_episode_returns'),
verbose=False):
"""Reads in a set of experimental results from log_path.
The provided parameter_set is an ordered_dict which
1) defines the parameters of this experiment,
2) defines the order in which they occur in the job descriptor.
The method reads all experiments of the form
${log_path}/${job_descriptor}.format(params)/logs,
where params is constructed from the cross product of the elements in
the parameter_set.
For example:
parameter_set = collections.OrderedDict([
('game', ['Asterix', 'Pong']),
('epsilon', ['0', '0.1'])
])
read_experiment('/tmp/logs', parameter_set, job_descriptor='{}_{}')
Will try to read logs from:
- /tmp/logs/Asterix_0/logs
- /tmp/logs/Asterix_0.1/logs
- /tmp/logs/Pong_0/logs
- /tmp/logs/Pong_0.1/logs
Args:
log_path: string, base path specifying where results live.
parameter_set: An ordered_dict mapping parameter names to allowable values.
job_descriptor: A job descriptor string which is used to construct the full
path for each trial within an experiment.
iteration_number: Int, if not None determines the iteration number at which
we read in results.
summary_keys: Iterable of strings, iteration statistics to summarize.
verbose: If True, print out additional information.
Returns:
A Pandas dataframe containing experimental results.
"""
keys = [] if parameter_set is None else list(parameter_set.keys())
# Extract parameter value lists, one per parameter.
ordered_values = [parameter_set[key] for key in keys]
column_names = keys + ['iteration'] + list(summary_keys)
num_parameter_settings = len([_ for _ in itertools.product(*ordered_values)])
expected_num_iterations = 200
expected_num_rows = num_parameter_settings * expected_num_iterations
# Create DataFrame with predicted number of rows.
data_frame = pd.DataFrame(index=np.arange(0, expected_num_rows),
columns=column_names)
row_index = 0
# Now take their cross product. This generates tuples of the form
# (p1, p2, p3, ...) where p1, p2, p3 are parameter values for the first,
# second, etc. parameters as ordered in value_set.
for parameter_tuple in itertools.product(*ordered_values):
if job_descriptor is not None:
name = job_descriptor.format(*parameter_tuple)
else:
# Construct name for values.
name = '-'.join([keys[i] + '_' + str(parameter_tuple[i])
for i in range(len(keys))])
experiment_path = '{}/{}/logs'.format(log_path, name)
raw_data, last_iteration = load_statistics(
experiment_path, iteration_number=iteration_number, verbose=verbose)
summary = summarize_data(raw_data, summary_keys)
for iteration in range(last_iteration):
# The row contains all the parameters, the iteration, and finally the
# requested values.
row_data = (list(parameter_tuple) + [iteration] +
[summary[key][iteration] for key in summary_keys])
data_frame.loc[row_index] = row_data
row_index += 1
# Shed any unused rows.
return data_frame.drop(np.arange(row_index, expected_num_rows))
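A hedged usage sketch for load_baselines above; the directory is hypothetical and must be laid out as <base_dir>/<agent>/<game>.pkl, which is the layout the function reads.

```python
# Hedged sketch: load baseline pickles and inspect one game's DataFrame.
# '/tmp/dopamine_baselines' is a hypothetical directory containing
# <agent>/<game>.pkl files for the agents dqn, c51, rainbow and iqn.
from dopamine.colab import utils as colab_utils

experimental_data = colab_utils.load_baselines('/tmp/dopamine_baselines',
                                               verbose=True)
if 'Asterix' in experimental_data:
    asterix = experimental_data['Asterix']
    print(asterix[asterix['agent'] == 'rainbow'].head())
```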

Binary data
dopamine/discrete_domains/.DS_Store (vendored)

Binary file not shown.


@@ -1 +1 @@
# coding=utf-8
# coding=utf-8

Diff not shown because the file is too large.


@@ -1,177 +1,177 @@
# coding=utf-8
# Copyright 2018 The Dopamine Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""A checkpointing mechanism for Dopamine agents.
This Checkpointer expects a base directory where checkpoints for different
iterations are stored. Specifically, Checkpointer.save_checkpoint() takes in
as input a dictionary 'data' to be pickled to disk. At each iteration, we
write a file called 'ckpt.#', where # is the iteration number. The
Checkpointer also cleans up old files, maintaining up to the CHECKPOINT_DURATION
most recent iterations.
The Checkpointer writes a sentinel file to indicate that checkpointing was
globally successful. This means that all other checkpointing activities
(saving the Tensorflow graph, the replay buffer) should be performed *prior*
to calling Checkpointer.save_checkpoint(). This allows the Checkpointer to
detect incomplete checkpoints.
#### Example
After running 10 iterations (numbered 0...9) with base_directory='/checkpoint',
the following files will exist:
```
/checkpoint/ckpt.6
/checkpoint/ckpt.7
/checkpoint/ckpt.8
/checkpoint/ckpt.9
/checkpoint/sentinel_checkpoint_complete.6
/checkpoint/sentinel_checkpoint_complete.7
/checkpoint/sentinel_checkpoint_complete.8
/checkpoint/sentinel_checkpoint_complete.9
```
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import pickle
import tensorflow as tf
CHECKPOINT_DURATION = 4
def get_latest_checkpoint_number(base_directory):
"""Returns the version number of the latest completed checkpoint.
Args:
base_directory: str, directory in which to look for checkpoint files.
Returns:
int, the iteration number of the latest checkpoint, or -1 if none was found.
"""
glob = os.path.join(base_directory, 'sentinel_checkpoint_complete.*')
def extract_iteration(x):
return int(x[x.rfind('.') + 1:])
try:
checkpoint_files = tf.gfile.Glob(glob)
except tf.errors.NotFoundError:
return -1
try:
latest_iteration = max(extract_iteration(x) for x in checkpoint_files)
return latest_iteration
except ValueError:
return -1
class Checkpointer(object):
"""Class for managing checkpoints for Dopamine agents.
"""
def __init__(self, base_directory, checkpoint_file_prefix='ckpt',
checkpoint_frequency=1):
"""Initializes Checkpointer.
Args:
base_directory: str, directory where all checkpoints are saved/loaded.
checkpoint_file_prefix: str, prefix to use for naming checkpoint files.
checkpoint_frequency: int, the frequency at which to checkpoint.
Raises:
ValueError: if base_directory is empty, or not creatable.
"""
if not base_directory:
raise ValueError('No path provided to Checkpointer.')
self._checkpoint_file_prefix = checkpoint_file_prefix
self._checkpoint_frequency = checkpoint_frequency
self._base_directory = base_directory
try:
tf.gfile.MakeDirs(base_directory)
except tf.errors.PermissionDeniedError:
# We catch the PermissionDeniedError and issue a more useful exception.
raise ValueError('Unable to create checkpoint path: {}.'.format(
base_directory))
def _generate_filename(self, file_prefix, iteration_number):
"""Returns a checkpoint filename from prefix and iteration number."""
filename = '{}.{}'.format(file_prefix, iteration_number)
return os.path.join(self._base_directory, filename)
def _save_data_to_file(self, data, filename):
"""Saves the given 'data' object to a file."""
with tf.gfile.GFile(filename, 'w') as fout:
pickle.dump(data, fout)
def save_checkpoint(self, iteration_number, data):
"""Saves a new checkpoint at the current iteration_number.
Args:
iteration_number: int, the current iteration number for this checkpoint.
data: Any (picklable) python object containing the data to store in the
checkpoint.
"""
if iteration_number % self._checkpoint_frequency != 0:
return
filename = self._generate_filename(self._checkpoint_file_prefix,
iteration_number)
self._save_data_to_file(data, filename)
filename = self._generate_filename('sentinel_checkpoint_complete',
iteration_number)
with tf.gfile.GFile(filename, 'wb') as fout:
fout.write('done')
self._clean_up_old_checkpoints(iteration_number)
def _clean_up_old_checkpoints(self, iteration_number):
"""Removes sufficiently old checkpoints."""
# After writing the checkpoint and sentinel file, we garbage collect files
# that are CHECKPOINT_DURATION * self._checkpoint_frequency versions old.
stale_iteration_number = iteration_number - (self._checkpoint_frequency *
CHECKPOINT_DURATION)
if stale_iteration_number >= 0:
stale_file = self._generate_filename(self._checkpoint_file_prefix,
stale_iteration_number)
stale_sentinel = self._generate_filename('sentinel_checkpoint_complete',
stale_iteration_number)
try:
tf.gfile.Remove(stale_file)
tf.gfile.Remove(stale_sentinel)
except tf.errors.NotFoundError:
# Ignore if file not found.
tf.logging.info('Unable to remove {} or {}.'.format(stale_file,
stale_sentinel))
def _load_data_from_file(self, filename):
if not tf.gfile.Exists(filename):
return None
with tf.gfile.GFile(filename, 'rb') as fin:
return pickle.load(fin)
def load_checkpoint(self, iteration_number):
"""Tries to reload a checkpoint at the selected iteration number.
Args:
iteration_number: The checkpoint iteration number to try to load.
Returns:
If the checkpoint file exists, the unpickled object that was passed in
as data to save_checkpoint; returns None if the file does not exist.
"""
checkpoint_file = self._generate_filename(self._checkpoint_file_prefix,
iteration_number)
return self._load_data_from_file(checkpoint_file)
# coding=utf-8
# Copyright 2018 The Dopamine Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""A checkpointing mechanism for Dopamine agents.
This Checkpointer expects a base directory where checkpoints for different
iterations are stored. Specifically, Checkpointer.save_checkpoint() takes in
as input a dictionary 'data' to be pickled to disk. At each iteration, we
write a file called 'ckpt.#', where # is the iteration number. The
Checkpointer also cleans up old files, maintaining up to the CHECKPOINT_DURATION
most recent iterations.
The Checkpointer writes a sentinel file to indicate that checkpointing was
globally successful. This means that all other checkpointing activities
(saving the Tensorflow graph, the replay buffer) should be performed *prior*
to calling Checkpointer.save_checkpoint(). This allows the Checkpointer to
detect incomplete checkpoints.
#### Example
After running 10 iterations (numbered 0...9) with base_directory='/checkpoint',
the following files will exist:
```
/checkpoint/ckpt.6
/checkpoint/ckpt.7
/checkpoint/ckpt.8
/checkpoint/ckpt.9
/checkpoint/sentinel_checkpoint_complete.6
/checkpoint/sentinel_checkpoint_complete.7
/checkpoint/sentinel_checkpoint_complete.8
/checkpoint/sentinel_checkpoint_complete.9
```
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import pickle
import tensorflow as tf
CHECKPOINT_DURATION = 4
def get_latest_checkpoint_number(base_directory):
"""Returns the version number of the latest completed checkpoint.
Args:
base_directory: str, directory in which to look for checkpoint files.
Returns:
int, the iteration number of the latest checkpoint, or -1 if none was found.
"""
glob = os.path.join(base_directory, 'sentinel_checkpoint_complete.*')
def extract_iteration(x):
return int(x[x.rfind('.') + 1:])
try:
checkpoint_files = tf.gfile.Glob(glob)
except tf.errors.NotFoundError:
return -1
try:
latest_iteration = max(extract_iteration(x) for x in checkpoint_files)
return latest_iteration
except ValueError:
return -1
class Checkpointer(object):
"""Class for managing checkpoints for Dopamine agents.
"""
def __init__(self, base_directory, checkpoint_file_prefix='ckpt',
checkpoint_frequency=1):
"""Initializes Checkpointer.
Args:
base_directory: str, directory where all checkpoints are saved/loaded.
checkpoint_file_prefix: str, prefix to use for naming checkpoint files.
checkpoint_frequency: int, the frequency at which to checkpoint.
Raises:
ValueError: if base_directory is empty, or not creatable.
"""
if not base_directory:
raise ValueError('No path provided to Checkpointer.')
self._checkpoint_file_prefix = checkpoint_file_prefix
self._checkpoint_frequency = checkpoint_frequency
self._base_directory = base_directory
try:
tf.gfile.MakeDirs(base_directory)
except tf.errors.PermissionDeniedError:
# We catch the PermissionDeniedError and issue a more useful exception.
raise ValueError('Unable to create checkpoint path: {}.'.format(
base_directory))
def _generate_filename(self, file_prefix, iteration_number):
"""Returns a checkpoint filename from prefix and iteration number."""
filename = '{}.{}'.format(file_prefix, iteration_number)
return os.path.join(self._base_directory, filename)
def _save_data_to_file(self, data, filename):
"""Saves the given 'data' object to a file."""
with tf.gfile.GFile(filename, 'w') as fout:
pickle.dump(data, fout)
def save_checkpoint(self, iteration_number, data):
"""Saves a new checkpoint at the current iteration_number.
Args:
iteration_number: int, the current iteration number for this checkpoint.
data: Any (picklable) python object containing the data to store in the
checkpoint.
"""
if iteration_number % self._checkpoint_frequency != 0:
return
filename = self._generate_filename(self._checkpoint_file_prefix,
iteration_number)
self._save_data_to_file(data, filename)
filename = self._generate_filename('sentinel_checkpoint_complete',
iteration_number)
with tf.gfile.GFile(filename, 'wb') as fout:
fout.write('done')
self._clean_up_old_checkpoints(iteration_number)
def _clean_up_old_checkpoints(self, iteration_number):
"""Removes sufficiently old checkpoints."""
# After writing the checkpoint and sentinel file, we garbage collect files
# that are CHECKPOINT_DURATION * self._checkpoint_frequency versions old.
stale_iteration_number = iteration_number - (self._checkpoint_frequency *
CHECKPOINT_DURATION)
if stale_iteration_number >= 0:
stale_file = self._generate_filename(self._checkpoint_file_prefix,
stale_iteration_number)
stale_sentinel = self._generate_filename('sentinel_checkpoint_complete',
stale_iteration_number)
try:
tf.gfile.Remove(stale_file)
tf.gfile.Remove(stale_sentinel)
except tf.errors.NotFoundError:
# Ignore if file not found.
tf.logging.info('Unable to remove {} or {}.'.format(stale_file,
stale_sentinel))
def _load_data_from_file(self, filename):
if not tf.gfile.Exists(filename):
return None
with tf.gfile.GFile(filename, 'rb') as fin:
return pickle.load(fin)
def load_checkpoint(self, iteration_number):
"""Tries to reload a checkpoint at the selected iteration number.
Args:
iteration_number: The checkpoint iteration number to try to load.
Returns:
If the checkpoint file exists, the unpickled object that was passed in
as data to save_checkpoint; returns None if the file does not exist.
"""
checkpoint_file = self._generate_filename(self._checkpoint_file_prefix,
iteration_number)
return self._load_data_from_file(checkpoint_file)
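A hedged end-to-end sketch of the Checkpointer above; the directory and data are hypothetical, and the import path assumes the module lives under dopamine/discrete_domains/ as in this tree.

```python
# Hedged sketch: write a few checkpoints, then resume from the latest one.
from dopamine.discrete_domains import checkpointer

ckpt = checkpointer.Checkpointer('/tmp/dopamine_ckpts')  # hypothetical dir
for iteration in range(10):
    ckpt.save_checkpoint(iteration, {'iteration': iteration})

latest = checkpointer.get_latest_checkpoint_number('/tmp/dopamine_ckpts')
if latest >= 0:
    data = ckpt.load_checkpoint(latest)  # older files past CHECKPOINT_DURATION
    print(latest, data['iteration'])     # have already been garbage collected
```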


@@ -1,335 +1,335 @@
# coding=utf-8
# Copyright 2018 The Dopamine Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Gym-specific (non-Atari) utilities.
Some network specifications specific to certain Gym environments are provided
here.
Includes a wrapper class around Gym environments. This class makes general Gym
environments conformant with the API Dopamine is expecting.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import itertools
import math
import gym
import numpy as np
import tensorflow as tf
import gin.tf
CARTPOLE_MIN_VALS = np.array([-2.4, -5., -math.pi/12., -math.pi*2.])
CARTPOLE_MAX_VALS = np.array([2.4, 5., math.pi/12., math.pi*2.])
ACROBOT_MIN_VALS = np.array([-1., -1., -1., -1., -5., -5.])
ACROBOT_MAX_VALS = np.array([1., 1., 1., 1., 5., 5.])
gin.constant('gym_lib.CARTPOLE_OBSERVATION_SHAPE', (4, 1))
gin.constant('gym_lib.CARTPOLE_OBSERVATION_DTYPE', tf.float32)
gin.constant('gym_lib.CARTPOLE_STACK_SIZE', 1)
gin.constant('gym_lib.ACROBOT_OBSERVATION_SHAPE', (6, 1))
gin.constant('gym_lib.ACROBOT_OBSERVATION_DTYPE', tf.float32)
gin.constant('gym_lib.ACROBOT_STACK_SIZE', 1)
slim = tf.contrib.slim
@gin.configurable
def create_gym_environment(environment_name=None, version='v0'):
"""Wraps a Gym environment with some basic preprocessing.
Args:
environment_name: str, the name of the environment to run.
version: str, version of the environment to run.
Returns:
A Gym environment with some standard preprocessing.
"""
assert environment_name is not None
full_game_name = '{}-{}'.format(environment_name, version)
env = gym.make(full_game_name)
# Strip out the TimeLimit wrapper from Gym, which caps us at 200 steps.
env = env.env
# Wrap the returned environment in a class which conforms to the API expected
# by Dopamine.
env = GymPreprocessing(env)
return env
@gin.configurable
def _basic_discrete_domain_network(min_vals, max_vals, num_actions, state,
num_atoms=None):
"""Builds a basic network for discrete domains, rescaling inputs to [-1, 1].
Args:
min_vals: float, minimum attainable values (must be same shape as `state`).
max_vals: float, maximum attainable values (must be same shape as `state`).
num_actions: int, number of actions.
state: `tf.Tensor`, the state input.
num_atoms: int or None, if None will construct a DQN-style network,
otherwise will construct a Rainbow-style network.
Returns:
The Q-values for DQN-style agents or logits for Rainbow-style agents.
"""
net = tf.cast(state, tf.float32)
net = slim.flatten(net)
net -= min_vals
net /= max_vals - min_vals
net = 2.0 * net - 1.0 # Rescale in range [-1, 1].
net = slim.fully_connected(net, 512)
net = slim.fully_connected(net, 512)
if num_atoms is None:
# We are constructing a DQN-style network.
return slim.fully_connected(net, num_actions, activation_fn=None)
else:
# We are constructing a rainbow-style network.
return slim.fully_connected(net, num_actions * num_atoms,
activation_fn=None)
@gin.configurable
def cartpole_dqn_network(num_actions, network_type, state):
"""Builds the deep network used to compute the agent's Q-values.
It rescales the input features to a range that yields improved performance.
Args:
num_actions: int, number of actions.
network_type: namedtuple, collection of expected values to return.
state: `tf.Tensor`, contains the agent's current state.
Returns:
net: _network_type object containing the tensors output by the network.
"""
q_values = _basic_discrete_domain_network(
CARTPOLE_MIN_VALS, CARTPOLE_MAX_VALS, num_actions, state)
return network_type(q_values)
class FourierBasis(object):
"""Fourier Basis linear function approximation.
Requires the ranges for each dimension, and is thus able to use only sine or
cosine (and uses cosine). So, this has half the coefficients that a full
Fourier approximation would use.
Many thanks to Will Dabney (wdabney@) for this implementation.
From the paper:
G.D. Konidaris, S. Osentoski and P.S. Thomas. (2011)
Value Function Approximation in Reinforcement Learning using the Fourier Basis
"""
def __init__(self, nvars, min_vals=0, max_vals=None, order=3):
self.order = order
self.min_vals = min_vals
self.max_vals = max_vals
terms = itertools.product(range(order + 1), repeat=nvars)
# Removing first iterate because it corresponds to the constant bias
self.multipliers = tf.constant(
[list(map(int, x)) for x in terms][1:], dtype=tf.float32)
def scale(self, values):
shifted = values - self.min_vals
if self.max_vals is None:
return shifted
return shifted / (self.max_vals - self.min_vals)
def compute_features(self, features):
# Important to rescale features to be between [0,1]
scaled = self.scale(features)
return tf.cos(np.pi * tf.matmul(scaled, self.multipliers, transpose_b=True))
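
A small worked example, assuming the CartPole constants above: with nvars=4 and order=3, itertools.product yields (3 + 1) ** 4 = 256 multiplier vectors, and dropping the constant term leaves 255 cosine features.

basis = FourierBasis(4, CARTPOLE_MIN_VALS, CARTPOLE_MAX_VALS, order=3)
states = tf.zeros([1, 4], dtype=tf.float32)    # a batch of one state
features = basis.compute_features(states)      # shape [1, 255], values in [-1, 1]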
@gin.configurable
def fourier_dqn_network(min_vals,
max_vals,
num_actions,
state,
fourier_basis_order=3):
"""Builds the function approximator used to compute the agent's Q-values.
It uses FourierBasis features and a linear layer.
Args:
min_vals: np.ndarray, the minimum attainable value for each state dimension.
max_vals: np.ndarray, the maximum attainable value for each state dimension.
num_actions: int, number of actions.
state: `tf.Tensor`, contains the agent's current state.
fourier_basis_order: int, order of the Fourier basis functions.
Returns:
The Q-values for DQN-style agents.
"""
net = tf.cast(state, tf.float32)
net = slim.flatten(net)
# Feed state through Fourier basis.
feature_generator = FourierBasis(
net.get_shape().as_list()[-1],
min_vals,
max_vals,
order=fourier_basis_order)
net = feature_generator.compute_features(net)
# Q-values are always linear w.r.t. last layer.
q_values = slim.fully_connected(
net, num_actions, activation_fn=None, biases_initializer=None)
return q_values
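
For concreteness, a sketch of the resulting shapes under the same CartPole assumptions (the Q-function is linear in the 255 Fourier features, with no bias term):

state_ph = tf.placeholder(tf.float32, [None, 4, 1])
q_values = fourier_dqn_network(CARTPOLE_MIN_VALS, CARTPOLE_MAX_VALS,
                               num_actions=2, state=state_ph,
                               fourier_basis_order=3)
# q_values has shape [None, 2].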
def cartpole_fourier_dqn_network(num_actions, network_type, state):
"""Builds the function approximator used to compute the agent's Q-values.
It uses the Fourier basis features and a linear function approximator.
Args:
num_actions: int, number of actions.
network_type: namedtuple, collection of expected values to return.
state: `tf.Tensor`, contains the agent's current state.
Returns:
net: _network_type object containing the tensors output by the network.
"""
q_values = fourier_dqn_network(CARTPOLE_MIN_VALS, CARTPOLE_MAX_VALS,
num_actions, state)
return network_type(q_values)
@gin.configurable
def cartpole_rainbow_network(num_actions, num_atoms, support, network_type,
state):
"""Build the deep network used to compute the agent's Q-value distributions.
Args:
num_actions: int, number of actions.
num_atoms: int, the number of buckets of the value function distribution.
support: tf.linspace, the support of the Q-value distribution.
network_type: `namedtuple`, collection of expected values to return.
state: `tf.Tensor`, contains the agent's current state.
Returns:
net: _network_type object containing the tensors output by the network.
"""
net = _basic_discrete_domain_network(
CARTPOLE_MIN_VALS, CARTPOLE_MAX_VALS, num_actions, state,
num_atoms=num_atoms)
logits = tf.reshape(net, [-1, num_actions, num_atoms])
probabilities = tf.contrib.layers.softmax(logits)
q_values = tf.reduce_sum(support * probabilities, axis=2)
return network_type(q_values, logits, probabilities)
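
A numeric sketch of the distributional head above: the per-atom logits are softmaxed for each action, and the expectation under `support` gives the Q-values; uniform logits over a symmetric support give Q = 0 for every action.

support = tf.linspace(-10., 10., 51)                       # 51 atoms
logits = tf.zeros([1, 2, 51])                              # [batch, actions, atoms]
probabilities = tf.contrib.layers.softmax(logits)          # uniform, 1/51 each
q_values = tf.reduce_sum(support * probabilities, axis=2)  # shape [1, 2], all zeros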
@gin.configurable
def acrobot_dqn_network(num_actions, network_type, state):
"""Builds the deep network used to compute the agent's Q-values.
It rescales the input features to a range that yields improved performance.
Args:
num_actions: int, number of actions.
network_type: namedtuple, collection of expected values to return.
state: `tf.Tensor`, contains the agent's current state.
Returns:
net: _network_type object containing the tensors output by the network.
"""
q_values = _basic_discrete_domain_network(
ACROBOT_MIN_VALS, ACROBOT_MAX_VALS, num_actions, state)
return network_type(q_values)
@gin.configurable
def acrobot_fourier_dqn_network(num_actions, network_type, state):
"""Builds the function approximator used to compute the agent's Q-values.
It uses the Fourier basis features and a linear function approximator.
Args:
num_actions: int, number of actions.
network_type: namedtuple, collection of expected values to return.
state: `tf.Tensor`, contains the agent's current state.
Returns:
net: _network_type object containing the tensors output by the network.
"""
q_values = fourier_dqn_network(ACROBOT_MIN_VALS, ACROBOT_MAX_VALS,
num_actions, state)
return network_type(q_values)
@gin.configurable
def acrobot_rainbow_network(num_actions, num_atoms, support, network_type,
state):
"""Build the deep network used to compute the agent's Q-value distributions.
Args:
num_actions: int, number of actions.
num_atoms: int, the number of buckets of the value function distribution.
support: tf.linspace, the support of the Q-value distribution.
network_type: `namedtuple`, collection of expected values to return.
state: `tf.Tensor`, contains the agent's current state.
Returns:
net: _network_type object containing the tensors output by the network.
"""
net = _basic_discrete_domain_network(
ACROBOT_MIN_VALS, ACROBOT_MAX_VALS, num_actions, state,
num_atoms=num_atoms)
logits = tf.reshape(net, [-1, num_actions, num_atoms])
probabilities = tf.contrib.layers.softmax(logits)
q_values = tf.reduce_sum(support * probabilities, axis=2)
return network_type(q_values, logits, probabilities)
@gin.configurable
class GymPreprocessing(object):
"""A Wrapper class around Gym environments."""
def __init__(self, environment):
self.environment = environment
self.game_over = False
@property
def observation_space(self):
return self.environment.observation_space
@property
def action_space(self):
return self.environment.action_space
@property
def reward_range(self):
return self.environment.reward_range
@property
def metadata(self):
return self.environment.metadata
def reset(self):
return self.environment.reset()
def step(self, action):
observation, reward, game_over, info = self.environment.step(action)
self.game_over = game_over
return observation, reward, game_over, info
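
A minimal sketch of using the wrapper directly (create_gym_environment above performs the same wrapping); random actions on CartPole without the TimeLimit wrapper still terminate once the pole falls:

raw_env = gym.make('CartPole-v0').env   # strip the TimeLimit wrapper, as above
env = GymPreprocessing(raw_env)
observation = env.reset()
while not env.game_over:
    observation, reward, done, info = env.step(env.action_space.sample())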
@ -1,49 +1,49 @@
# coding=utf-8
# Copyright 2018 The Dopamine Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""A class for storing iteration-specific metrics.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
class IterationStatistics(object):
"""A class for storing iteration-specific metrics.
The internal format is as follows: we maintain a mapping from keys to lists.
Each list contains all the values corresponding to the given key.
For example, self.data_lists['train_episode_returns'] might contain the
per-episode returns achieved during this iteration.
Attributes:
data_lists: dict mapping each metric_name (str) to a list of said metric
across episodes.
"""
def __init__(self):
self.data_lists = {}
def append(self, data_pairs):
"""Add the given values to their corresponding key-indexed lists.
Args:
data_pairs: A dictionary of key-value pairs to be recorded.
"""
for key, value in data_pairs.items():
if key not in self.data_lists:
self.data_lists[key] = []
self.data_lists[key].append(value)
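
A short usage sketch; the metric names are illustrative:

statistics = IterationStatistics()
statistics.append({'train_episode_returns': 12.0, 'train_episode_lengths': 200})
statistics.append({'train_episode_returns': 15.5, 'train_episode_lengths': 180})
assert statistics.data_lists['train_episode_returns'] == [12.0, 15.5]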
@ -1,105 +1,105 @@
# coding=utf-8
# Copyright 2018 The Dopamine Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""A lightweight logging mechanism for dopamine agents."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import pickle
import tensorflow as tf
CHECKPOINT_DURATION = 4
class Logger(object):
"""Class for maintaining a dictionary of data to log."""
def __init__(self, logging_dir):
"""Initializes Logger.
Args:
logging_dir: str, Directory to which logs are written.
"""
# Dict used by logger to store data.
self.data = {}
self._logging_enabled = True
if not logging_dir:
tf.logging.info('Logging directory not specified, will not log.')
self._logging_enabled = False
return
# Try to create logging directory.
try:
tf.gfile.MakeDirs(logging_dir)
except tf.errors.PermissionDeniedError:
# If it already exists, ignore exception.
pass
if not tf.gfile.Exists(logging_dir):
tf.logging.warning(
'Could not create directory %s, logging will be disabled.',
logging_dir)
self._logging_enabled = False
return
self._logging_dir = logging_dir
def __setitem__(self, key, value):
"""This method will set an entry at key with value in the dictionary.
It will effectively overwrite any previous data at the same key.
Args:
key: str, indicating key where to write the entry.
value: A python object to store.
"""
if self._logging_enabled:
self.data[key] = value
def _generate_filename(self, filename_prefix, iteration_number):
filename = '{}_{}'.format(filename_prefix, iteration_number)
return os.path.join(self._logging_dir, filename)
def log_to_file(self, filename_prefix, iteration_number):
"""Save the pickled dictionary to a file.
Args:
filename_prefix: str, name of the file to use (without iteration
number).
iteration_number: int, the iteration number, appended to the end of
filename_prefix.
"""
if not self._logging_enabled:
tf.logging.warning('Logging is disabled.')
return
log_file = self._generate_filename(filename_prefix, iteration_number)
with tf.gfile.GFile(log_file, 'w') as fout:
pickle.dump(self.data, fout, protocol=pickle.HIGHEST_PROTOCOL)
# After writing a checkpoint file, we garbage collect the log file
# that is CHECKPOINT_DURATION versions old.
stale_iteration_number = iteration_number - CHECKPOINT_DURATION
if stale_iteration_number >= 0:
stale_file = self._generate_filename(filename_prefix,
stale_iteration_number)
try:
tf.gfile.Remove(stale_file)
except tf.errors.NotFoundError:
# Ignore if file not found.
pass
def is_logging_enabled(self):
"""Return if logging is enabled."""
return self._logging_enabled
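
A brief usage sketch, assuming an illustrative '/tmp/dopamine_logs' directory and 'log' prefix; the logger behaves like a write-only dictionary that is pickled once per iteration, keeping only the most recent CHECKPOINT_DURATION files:

logger = Logger('/tmp/dopamine_logs')
logger['iteration_statistics'] = {'train_episode_returns': [12.0, 15.5]}
if logger.is_logging_enabled():
    logger.log_to_file('log', iteration_number=0)  # writes /tmp/dopamine_logs/log_0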
@ -7,10 +7,13 @@ mkdir ../agents/fqf/configs/gins &> /dev/null
n=0
#iqn_fqf-ws-sticky-0" "iqn_fqf-ws-sticky-0"
declare -a games=("Centipede")
declare -a seeds=(0 1 2)
declare -a factors=(0.00001)
declare -a ents=(0.0001)
#Berzerk Gopher Kangaroo ChopperCommand Centipede Breakout Amidar KungFuMaster DoubleDunk
declare -a seeds=(0)
declare -a factors=(0.00001 0.000001)
declare -a ents=(0.0001 0.00001)
declare -a optimizers=('rmsprop')
declare -a losses=('directbp' 'sqloss')
for game in "${games[@]}"
do
for opt in "${optimizers[@]}"
@ -21,12 +24,15 @@ do
do
for ent in "${ents[@]}"
do
d="iqn_fqf-ws-${opt}-f${factor}-e${ent}-s${seed}"
sed -e "s!GAME!${game}!" -e "s!RUNTYPE!$d!" -e "s!FQFFACTOR!${factor}!" -e "s!FQFENT!${ent}!" ../agents/fqf/configs/fqf.gin > ../agents/fqf/configs/gins/${d}_${game}.gin
CUDA_VISIBLE_DEVICES=$n nohup python train.py --base_dir=/tmp/${d}-${game} --gin_files="../agents/fqf/configs/gins/${d}_${game}.gin" >& logs/output_${game}_${d} &
echo "$i, $n"
n=$((($n+1) % 4))
sleep 2
for loss in "${losses[@]}"
do
d="iqn_fqf-ws-${loss}-${opt}-f${factor}-e${ent}-s${seed}"
sed -e "s!GAME!${game}!" -e "s!RUNTYPE!$d!" -e "s!FQFFACTOR!${factor}!" -e "s!FQFENT!${ent}!" ../agents/fqf/configs/fqf.gin > ../agents/fqf/configs/gins/${d}_${game}.gin
CUDA_VISIBLE_DEVICES=$n nohup python train.py --base_dir=/tmp/${d}-${game} --gin_files="../agents/fqf/configs/gins/${d}_${game}.gin" >& logs/output_${game}_${d} &
echo "$d, $n"
n=$((($n+1) % 4))
sleep 2
done
done
done
done

@ -14,7 +14,7 @@ do
d="iqn-s${seed}"
sed -e "s!GAME!${game}!" -e "s!RUNTYPE!$d!" ../agents/implicit_quantile/configs/implicit_quantile_icml.gin > ../agents/implicit_quantile/configs/gins/${d}_icml_${game}.gin
CUDA_VISIBLE_DEVICES=$n nohup python train.py --base_dir=/tmp/${d}-${game} --gin_files="../agents/implicit_quantile/configs/gins/${d}_icml_${game}.gin" >& logs/output_${game}_${d} &
echo "$i, $n"
echo "$d, $n"
n=$(($n+1))
sleep 2
done

The diff for this file is not shown because of its large size.


@ -1,65 +1,65 @@
# coding=utf-8
# Copyright 2018 The Dopamine Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
r"""The entry point for running a Dopamine agent.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import sys
sys.path = ['../../'] + sys.path
print (sys.path)
#exit(0)
from absl import app
from absl import flags
from dopamine.discrete_domains import run_experiment
import tensorflow as tf
flags.DEFINE_string('base_dir', None,
'Base directory to host all required sub-directories.')
flags.DEFINE_multi_string(
'gin_files', [], 'List of paths to gin configuration files (e.g.'
'"dopamine/agents/dqn/dqn.gin").')
flags.DEFINE_multi_string(
'gin_bindings', [],
'Gin bindings to override the values set in the config files '
'(e.g. "DQNAgent.epsilon_train=0.1",'
' "create_environment.game_name="Pong"").')
FLAGS = flags.FLAGS
def main(unused_argv):
"""Main method.
Args:
unused_argv: Arguments (unused).
"""
tf.logging.set_verbosity(tf.logging.INFO)
run_experiment.load_gin_configs(FLAGS.gin_files, FLAGS.gin_bindings)
runner = run_experiment.create_runner(FLAGS.base_dir)
runner.run_experiment()
if __name__ == '__main__':
flags.mark_flag_as_required('base_dir')
app.run(main)
@ -1,15 +1,15 @@
# coding=utf-8
# Copyright 2018 The Dopamine Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

The diff for this file is not shown because of its large size.


@ -1,357 +1,357 @@
# coding=utf-8
# Copyright 2018 The Dopamine Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""An implementation of Prioritized Experience Replay (PER).
This implementation is based on the paper "Prioritized Experience Replay"
by Tom Schaul et al. (2015). Many thanks to Tom Schaul, John Quan, and Matteo
Hessel for providing useful pointers on the algorithm and its implementation.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from dopamine.replay_memory import circular_replay_buffer
from dopamine.replay_memory import sum_tree
from dopamine.replay_memory.circular_replay_buffer import ReplayElement
import numpy as np
import tensorflow as tf
import gin.tf
class OutOfGraphPrioritizedReplayBuffer(
circular_replay_buffer.OutOfGraphReplayBuffer):
"""An out-of-graph Replay Buffer for Prioritized Experience Replay.
See circular_replay_buffer.py for details.
"""
def __init__(self,
observation_shape,
stack_size,
replay_capacity,
batch_size,
update_horizon=1,
gamma=0.99,
max_sample_attempts=circular_replay_buffer.MAX_SAMPLE_ATTEMPTS,
extra_storage_types=None,
observation_dtype=np.uint8,
action_shape=(),
action_dtype=np.int32,
reward_shape=(),
reward_dtype=np.float32):
"""Initializes OutOfGraphPrioritizedReplayBuffer.
Args:
observation_shape: tuple of ints.
stack_size: int, number of frames to use in state stack.
replay_capacity: int, number of transitions to keep in memory.
batch_size: int.
update_horizon: int, length of update ('n' in n-step update).
gamma: float, the discount factor.
max_sample_attempts: int, the maximum number of attempts allowed to
get a sample.
extra_storage_types: list of ReplayElements defining the type of the extra
contents that will be stored and returned by sample_transition_batch.
observation_dtype: np.dtype, type of the observations. Defaults to
np.uint8 for Atari 2600.
action_shape: tuple of ints, the shape for the action vector. Empty tuple
means the action is a scalar.
action_dtype: np.dtype, type of elements in the action.
reward_shape: tuple of ints, the shape of the reward vector. Empty tuple
means the reward is a scalar.
reward_dtype: np.dtype, type of elements in the reward.
"""
super(OutOfGraphPrioritizedReplayBuffer, self).__init__(
observation_shape=observation_shape,
stack_size=stack_size,
replay_capacity=replay_capacity,
batch_size=batch_size,
update_horizon=update_horizon,
gamma=gamma,
max_sample_attempts=max_sample_attempts,
extra_storage_types=extra_storage_types,
observation_dtype=observation_dtype,
action_shape=action_shape,
action_dtype=action_dtype,
reward_shape=reward_shape,
reward_dtype=reward_dtype)
self.sum_tree = sum_tree.SumTree(replay_capacity)
def get_add_args_signature(self):
"""The signature of the add function.
The signature is the same as the one for OutOfGraphReplayBuffer, with an
added priority.
Returns:
list of ReplayElements defining the type of the argument signature needed
by the add function.
"""
parent_add_signature = super(OutOfGraphPrioritizedReplayBuffer,
self).get_add_args_signature()
add_signature = parent_add_signature + [
ReplayElement('priority', (), np.float32)
]
return add_signature
def _add(self, *args):
"""Internal add method to add to the underlying memory arrays.
The arguments need to match add_arg_signature.
If priority is none, it is set to the maximum priority ever seen.
Args:
*args: All the elements in a transition.
"""
# Use Schaul et al.'s (2015) scheme of setting the priority of new elements
# to the maximum priority so far.
parent_add_args = []
# Picks out 'priority' from arguments and passes the other arguments to the
# parent method.
for i, element in enumerate(self.get_add_args_signature()):
if element.name == 'priority':
priority = args[i]
else:
parent_add_args.append(args[i])
self.sum_tree.set(self.cursor(), priority)
super(OutOfGraphPrioritizedReplayBuffer, self)._add(*parent_add_args)
def sample_index_batch(self, batch_size):
"""Returns a batch of valid indices sampled as in Schaul et al. (2015).
Args:
batch_size: int, number of indices returned.
Returns:
list of ints, a batch of valid indices sampled proportionally to priority (stratified as in Schaul et al., 2015).
Raises:
Exception: If the batch was not constructed after maximum number of tries.
"""
# Sample stratified indices. Some of them might be invalid.
indices = self.sum_tree.stratified_sample(batch_size)
allowed_attempts = self._max_sample_attempts
for i in range(len(indices)):
if not self.is_valid_transition(indices[i]):
if allowed_attempts == 0:
raise RuntimeError(
'Max sample attempts: Tried {} times but only sampled {}'
' valid indices. Batch size is {}'.format(
self._max_sample_attempts, i, batch_size))
index = indices[i]
while not self.is_valid_transition(index) and allowed_attempts > 0:
# If index i is not valid keep sampling others. Note that this
# is not stratified.
index = self.sum_tree.sample()
allowed_attempts -= 1
indices[i] = index
return indices
def sample_transition_batch(self, batch_size=None, indices=None):
"""Returns a batch of transitions with extra storage and the priorities.
The extra storage is defined through the extra_storage_types constructor
argument.
When the transition is terminal next_state_batch has undefined contents.
Args:
batch_size: int, number of transitions returned. If None, the default
batch_size will be used.
indices: None or list of ints, the indices of every transition in the
batch. If None, sample the indices uniformly.
Returns:
transition_batch: tuple of np.arrays with the shape and type as in
get_transition_elements().
"""
transition = (super(OutOfGraphPrioritizedReplayBuffer, self).
sample_transition_batch(batch_size, indices))
transition_elements = self.get_transition_elements(batch_size)
transition_names = [e.name for e in transition_elements]
probabilities_index = transition_names.index('sampling_probabilities')
indices_index = transition_names.index('indices')
indices = transition[indices_index]
# The parent returned an empty array for the probabilities. Fill it with the
# contents of the sum tree.
transition[probabilities_index][:] = self.get_priority(indices)
return transition
def set_priority(self, indices, priorities):
"""Sets the priority of the given elements according to Schaul et al.
Args:
indices: np.array with dtype int32, of indices in range
[0, replay_capacity).
priorities: float, the corresponding priorities.
"""
assert indices.dtype == np.int32, ('Indices must be integers, '
'given: {}'.format(indices.dtype))
for index, priority in zip(indices, priorities):
self.sum_tree.set(index, priority)
def get_priority(self, indices):
"""Fetches the priorities correspond to a batch of memory indices.
For any memory location not yet used, the corresponding priority is 0.
Args:
indices: np.array with dtype int32, of indices in range
[0, replay_capacity).
Returns:
priorities: float, the corresponding priorities.
"""
assert indices.shape, 'Indices must be an array.'
assert indices.dtype == np.int32, ('Indices must be int32s, '
'given: {}'.format(indices.dtype))
batch_size = len(indices)
priority_batch = np.empty((batch_size), dtype=np.float32)
for i, memory_index in enumerate(indices):
priority_batch[i] = self.sum_tree.get(memory_index)
return priority_batch
def get_transition_elements(self, batch_size=None):
"""Returns a 'type signature' for sample_transition_batch.
Args:
batch_size: int, number of transitions returned. If None, the default
batch_size will be used.
Returns:
signature: A namedtuple describing the method's return type signature.
"""
parent_transition_type = (
super(OutOfGraphPrioritizedReplayBuffer,
self).get_transition_elements(batch_size))
probabilities_type = [
ReplayElement('sampling_probabilities', (batch_size,), np.float32)
]
return parent_transition_type + probabilities_type
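
A rough usage sketch of the out-of-graph buffer, assuming a CartPole-like (4,) float32 observation and default settings otherwise: transitions are added together with a priority, and priorities can later be updated from TD errors.

replay = OutOfGraphPrioritizedReplayBuffer(
    observation_shape=(4,), stack_size=1, replay_capacity=1000, batch_size=32,
    observation_dtype=np.float32)
for _ in range(100):
    # (observation, action, reward, terminal, priority)
    replay.add(np.zeros((4,), dtype=np.float32), 0, 0.0, 0, 1.0)
indices = np.asarray(replay.sample_index_batch(8), dtype=np.int32)
old_priorities = replay.get_priority(indices)             # all 1.0 so far
replay.set_priority(indices, np.full(8, 0.5, dtype=np.float32))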
@gin.configurable(blacklist=['observation_shape', 'stack_size',
'update_horizon', 'gamma'])
class WrappedPrioritizedReplayBuffer(
circular_replay_buffer.WrappedReplayBuffer):
"""Wrapper of OutOfGraphPrioritizedReplayBuffer with in-graph sampling.
Usage:
* To add a transition: Call the add function.
* To sample a batch: Query any of the tensors in the transition dictionary.
Every sess.run that requires any of these tensors will
sample a new transition.
"""
def __init__(self,
observation_shape,
stack_size,
use_staging=True,
replay_capacity=1000000,
batch_size=32,
update_horizon=1,
gamma=0.99,
max_sample_attempts=circular_replay_buffer.MAX_SAMPLE_ATTEMPTS,
extra_storage_types=None,
observation_dtype=np.uint8,
action_shape=(),
action_dtype=np.int32,
reward_shape=(),
reward_dtype=np.float32):
"""Initializes WrappedPrioritizedReplayBuffer.
Args:
observation_shape: tuple of ints.
stack_size: int, number of frames to use in state stack.
use_staging: bool, when True, a staging area is used to prefetch the next
sampling batch.
replay_capacity: int, number of transitions to keep in memory.
batch_size: int.
update_horizon: int, length of update ('n' in n-step update).
gamma: float, the discount factor.
max_sample_attempts: int, the maximum number of attempts allowed to
get a sample.
extra_storage_types: list of ReplayElements defining the type of the extra
contents that will be stored and returned by sample_transition_batch.
observation_dtype: np.dtype, type of the observations. Defaults to
np.uint8 for Atari 2600.
action_shape: tuple of ints, the shape for the action vector. Empty tuple
means the action is a scalar.
action_dtype: np.dtype, type of elements in the action.
reward_shape: tuple of ints, the shape of the reward vector. Empty tuple
means the reward is a scalar.
reward_dtype: np.dtype, type of elements in the reward.
Raises:
ValueError: If update_horizon is not positive.
ValueError: If discount factor is not in [0, 1].
"""
memory = OutOfGraphPrioritizedReplayBuffer(
observation_shape, stack_size, replay_capacity, batch_size,
update_horizon, gamma, max_sample_attempts,
extra_storage_types=extra_storage_types,
observation_dtype=observation_dtype)
super(WrappedPrioritizedReplayBuffer, self).__init__(
observation_shape,
stack_size,
use_staging,
replay_capacity,
batch_size,
update_horizon,
gamma,
wrapped_memory=memory,
extra_storage_types=extra_storage_types,
observation_dtype=observation_dtype,
action_shape=action_shape,
action_dtype=action_dtype,
reward_shape=reward_shape,
reward_dtype=reward_dtype)
def tf_set_priority(self, indices, priorities):
"""Sets the priorities for the given indices.
Args:
indices: tf.Tensor with dtype int32 and shape [n].
priorities: tf.Tensor with dtype float and shape [n].
Returns:
A tf op setting the priorities for prioritized sampling.
"""
return tf.py_func(
self.memory.set_priority, [indices, priorities], [],
name='prioritized_replay_set_priority_py_func')
def tf_get_priority(self, indices):
"""Gets the priorities for the given indices.
Args:
indices: tf.Tensor with dtype int32 and shape [n].
Returns:
priorities: tf.Tensor with dtype float and shape [n], the priorities at
the indices.
"""
return tf.py_func(
self.memory.get_priority, [indices],
tf.float32,
name='prioritized_replay_get_priority_py_func')
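
A sketch of the in-graph side, loosely mirroring how Dopamine's Rainbow agent consumes it; `wrapped.transition` and `wrapped.indices` come from the wrapped buffer's sampling ops, and `td_errors` stands in for a per-sample loss the agent would compute:

wrapped = WrappedPrioritizedReplayBuffer(
    observation_shape=(4,), stack_size=1, use_staging=False,
    observation_dtype=np.float32)
probs = wrapped.transition['sampling_probabilities']
loss_weights = 1.0 / tf.sqrt(probs + 1e-10)     # importance-sampling weights
loss_weights /= tf.reduce_max(loss_weights)
# update_op = wrapped.tf_set_priority(wrapped.indices, tf.sqrt(td_errors + 1e-10))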
@ -1,205 +1,205 @@
# coding=utf-8
# Copyright 2018 The Dopamine Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""A sum tree data structure.
Used for prioritized experience replay. See prioritized_replay_buffer.py
and Schaul et al. (2015).
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import random
import numpy as np
class SumTree(object):
"""A sum tree data structure for storing replay priorities.
A sum tree is a complete binary tree whose leaves contain values called
priorities. Internal nodes maintain the sum of the priorities of all leaf
nodes in their subtree.
For capacity = 4, the tree may look like this:
             +---+
             |2.5|
             +-+-+
               |
       +-------+--------+
       |                |
     +-+-+            +-+-+
     |1.5|            |1.0|
     +-+-+            +-+-+
       |                |
  +----+----+      +----+----+
  |         |      |         |
+-+-+     +-+-+  +-+-+     +-+-+
|0.5|     |1.0|  |0.5|     |0.5|
+---+     +---+  +---+     +---+
This is stored in a list of numpy arrays:
self.nodes = [ [2.5], [1.5, 1], [0.5, 1, 0.5, 0.5] ]
For conciseness, we allocate arrays as powers of two, and pad the excess
elements with zero values.
This is similar to the usual array-based representation of a complete binary
tree, but is a little more user-friendly.
"""
def __init__(self, capacity):
"""Creates the sum tree data structure for the given replay capacity.
Args:
capacity: int, the maximum number of elements that can be stored in this
data structure.
Raises:
ValueError: If requested capacity is not positive.
"""
assert isinstance(capacity, int)
if capacity <= 0:
raise ValueError('Sum tree capacity should be positive. Got: {}'.
format(capacity))
self.nodes = []
tree_depth = int(math.ceil(np.log2(capacity)))
level_size = 1
for _ in range(tree_depth + 1):
nodes_at_this_depth = np.zeros(level_size)
self.nodes.append(nodes_at_this_depth)
level_size *= 2
self.max_recorded_priority = 1.0
def _total_priority(self):
"""Returns the sum of all priorities stored in this sum tree.
Returns:
float, sum of priorities stored in this sum tree.
"""
return self.nodes[0][0]
def sample(self, query_value=None):
"""Samples an element from the sum tree.
Each element has probability p_i / sum_j p_j of being picked, where p_i is
the (positive) value associated with node i (possibly unnormalized).
Args:
query_value: float in [0, 1], used as the random value to select a
sample. If None, will select one randomly in [0, 1).
Returns:
int, a random element from the sum tree.
Raises:
    Exception: If the sum tree is empty (i.e. its node values sum to 0).
    ValueError: If the supplied query_value is outside of [0, 1].
"""
if self._total_priority() == 0.0:
raise Exception('Cannot sample from an empty sum tree.')
if query_value and (query_value < 0. or query_value > 1.):
raise ValueError('query_value must be in [0, 1].')
# Sample a value in range [0, R), where R is the value stored at the root.
query_value = random.random() if query_value is None else query_value
query_value *= self._total_priority()
# Now traverse the sum tree.
node_index = 0
for nodes_at_this_depth in self.nodes[1:]:
# Compute children of previous depth's node.
left_child = node_index * 2
left_sum = nodes_at_this_depth[left_child]
# Each subtree describes a range [0, a), where a is its value.
if query_value < left_sum: # Recurse into left subtree.
node_index = left_child
else: # Recurse into right subtree.
node_index = left_child + 1
# Adjust query to be relative to right subtree.
query_value -= left_sum
return node_index
def stratified_sample(self, batch_size):
"""Performs stratified sampling using the sum tree.
Let R be the value at the root (total value of sum tree). This method will
divide [0, R) into batch_size segments, pick a random number from each of
those segments, and use that random number to sample from the sum_tree. This
is as specified in Schaul et al. (2015).
Args:
batch_size: int, the number of strata to use.
Returns:
list of batch_size elements sampled from the sum tree.
Raises:
Exception: If the sum tree is empty (i.e. its node values sum to 0).
"""
if self._total_priority() == 0.0:
raise Exception('Cannot sample from an empty sum tree.')
bounds = np.linspace(0., 1., batch_size + 1)
assert len(bounds) == batch_size + 1
segments = [(bounds[i], bounds[i+1]) for i in range(batch_size)]
query_values = [random.uniform(x[0], x[1]) for x in segments]
return [self.sample(query_value=x) for x in query_values]
def get(self, node_index):
"""Returns the value of the leaf node corresponding to the index.
Args:
node_index: The index of the leaf node.
Returns:
The value of the leaf node.
"""
return self.nodes[-1][node_index]
def set(self, node_index, value):
"""Sets the value of a leaf node and updates internal nodes accordingly.
This operation takes O(log(capacity)).
Args:
node_index: int, the index of the leaf node to be updated.
value: float, the value which we assign to the node. This value must be
nonnegative. Setting value = 0 will cause the element to never be
sampled.
Raises:
ValueError: If the given value is negative.
"""
if value < 0.0:
raise ValueError('Sum tree values should be nonnegative. Got {}'.
format(value))
self.max_recorded_priority = max(value, self.max_recorded_priority)
delta_value = value - self.nodes[-1][node_index]
# Now traverse back the tree, adjusting all sums along the way.
for nodes_at_this_depth in reversed(self.nodes):
# Note: Adding a delta leads to some tolerable numerical inaccuracies.
nodes_at_this_depth[node_index] += delta_value
node_index //= 2
assert node_index == 0, ('Sum tree traversal failed, final node index '
'is not 0.')
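As a quick illustration of the class above (a sketch, not part of the file), the snippet below rebuilds the capacity-4 tree from the docstring diagram and exercises set, get, sample and stratified_sample; the expected outputs follow from the traversal rule in sample.
# Rebuild the capacity-4 example from the SumTree docstring above.
tree = SumTree(capacity=4)
for index, priority in enumerate([0.5, 1.0, 0.5, 0.5]):
  tree.set(index, priority)

print(tree._total_priority())        # 2.5, the value stored at the root
print(tree.get(1))                   # 1.0, the priority of leaf 1
print(tree.sample(query_value=0.3))  # 1, since 0.3 * 2.5 = 0.75 lands in leaf 1
print(tree.stratified_sample(2))     # one sampled leaf index per stratum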


@@ -1,15 +1,15 @@
# coding=utf-8
# Copyright 2018 The Dopamine Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


@@ -1,34 +1,34 @@
# coding=utf-8
# Copyright 2018 The Dopamine Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Common testing utilities shared across agents."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import mock
import tensorflow as tf
class MockReplayBuffer(object):
"""Mock ReplayBuffer to verify the way the agent interacts with it."""
def __init__(self):
with tf.variable_scope('MockReplayBuffer', reuse=tf.AUTO_REUSE):
self.add = mock.Mock()
self.memory = mock.Mock()
self.memory.add_count = 0
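A short usage sketch (not from the file above): the argument order passed to add below is assumed to mirror the replay buffers' (observation, action, reward, terminal) signature, and the concrete values are illustrative.
import numpy as np

replay = MockReplayBuffer()
observation = np.zeros((84, 84), dtype=np.uint8)
replay.add(observation, 0, 1.0, False)  # what an agent under test would call
assert replay.add.call_count == 1       # the mock recorded exactly one add
assert replay.memory.add_count == 0     # counter starts at 0, as set above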

setup.py

@@ -1,92 +1,92 @@
# coding=utf-8
# Copyright 2018 The Dopamine Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Setup script for Dopamine.
This script will install Dopamine as a Python module.
See: https://github.com/google/dopamine
"""
import codecs
from os import path
from setuptools import find_packages
from setuptools import setup
here = path.abspath(path.dirname(__file__))
# Get the long description from the README file.
with codecs.open(path.join(here, 'README.md'), encoding='utf-8') as f:
long_description = f.read()
install_requires = ['gin-config == 0.1.4', 'absl-py >= 0.2.2',
'opencv-python >= 3.4.1.15',
'gym >= 0.10.5']
tests_require = ['gin-config >= 0.1.1', 'absl-py >= 0.2.2',
'opencv-python >= 3.4.1.15',
'gym >= 0.10.5', 'mock >= 1.0.0']
dopamine_description = (
'Dopamine: A framework for flexible Reinforcement Learning research')
setup(
name='dopamine_rl',
version='2.0.1',
include_package_data=True,
packages=find_packages(exclude=['docs']), # Required
package_data={'testdata': ['testdata/*.gin']},
install_requires=install_requires,
tests_require=tests_require,
description=dopamine_description,
long_description=long_description,
url='https://github.com/google/dopamine', # Optional
author='The Dopamine Team', # Optional
author_email='opensource@google.com',
classifiers=[ # Optional
'Development Status :: 4 - Beta',
# Indicate who your project is intended for
'Intended Audience :: Developers',
'Intended Audience :: Education',
'Intended Audience :: Science/Research',
# Pick your license as you wish
'License :: OSI Approved :: Apache Software License',
# Specify the Python versions you support here. In particular, ensure
# that you indicate whether you support Python 2, Python 3 or both.
'Programming Language :: Python :: 2',
'Programming Language :: Python :: 2.7',
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.4',
'Programming Language :: Python :: 3.5',
'Programming Language :: Python :: 3.6',
'Topic :: Scientific/Engineering',
'Topic :: Scientific/Engineering :: Mathematics',
'Topic :: Scientific/Engineering :: Artificial Intelligence',
'Topic :: Software Development',
'Topic :: Software Development :: Libraries',
'Topic :: Software Development :: Libraries :: Python Modules',
],
project_urls={ # Optional
'Documentation': 'https://github.com/google/dopamine',
'Bug Reports': 'https://github.com/google/dopamine/issues',
'Source': 'https://github.com/google/dopamine',
},
license='Apache 2.0',
keywords='dopamine reinforcement-learning python machine learning'
)
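After installing with this script (for example via `pip install .`), the declared metadata can be checked with setuptools' runtime API. The snippet below is a minimal convenience check, assuming the package has already been installed under the name declared above.
import pkg_resources

dist = pkg_resources.get_distribution('dopamine_rl')
print(dist.version)  # expected to match the version declared above: 2.0.1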