Tests with `reuse_dist_env = True` often cause memory leaks. This PR
ignores `reuse_dist_env` and forcibly sets it to `False`. This change
might slow down the tests, but I think it is better than manually
restarting runners and relaunching tests.

Memory usage (see #6578):
- `reuse_dist_env == True`:
https://github.com/microsoft/DeepSpeed/actions/runs/11302940871/job/31439471512
- `reuse_dist_env == False`:
https://github.com/microsoft/DeepSpeed/actions/runs/11303250613/job/31440137894
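
Conceptually, the patch below is a warn-once override: a module-level flag ensures the notice is printed a single time per test session, and `reuse_dist_env` is then forced to `False` regardless of what the test requested. Here is a minimal standalone sketch of that pattern; the simplified `DistributedExec` and its `launch` method are illustrative stand-ins for the real test-harness class:

```python
# Minimal sketch of the warn-once override (simplified; the real
# DistributedExec in the DeepSpeed test harness has far more machinery).
warn_reuse_dist_env = False  # module-level, shared by every test instance


class DistributedExec:

    def __init__(self, reuse_dist_env=True):
        self.reuse_dist_env = reuse_dist_env

    def launch(self, num_procs):  # hypothetical method name
        global warn_reuse_dist_env
        if self.reuse_dist_env and not warn_reuse_dist_env:
            # Print the notice only once per session, however many tests
            # asked for a reused environment.
            print("Ignoring reuse_dist_env and forcibly setting it to False")
            warn_reuse_dist_env = True
        # Unconditionally force a fresh distributed environment, so the
        # cached-pool branch is never taken.
        self.reuse_dist_env = False


if __name__ == "__main__":
    for _ in range(3):
        DistributedExec(reuse_dist_env=True).launch(num_procs=2)
    # The notice prints once, even though three tests requested reuse.
```
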
This commit is contained in:
Masahiro Tanaka 2024-10-14 09:08:44 -07:00 committed by GitHub
Parent 5c4b97f109
Commit 7a5bc4fdf9
1 changed file with 9 additions and 0 deletions

```diff
@@ -25,6 +25,8 @@ from _pytest.fixtures import FixtureLookupError, FixtureFunctionMarker
 # Worker timeout for tests that hang
 DEEPSPEED_TEST_TIMEOUT = int(os.environ.get('DS_UNITTEST_TIMEOUT', '600'))
 
+warn_reuse_dist_env = False
+
 
 def is_rocm_pytorch():
     return hasattr(torch.version, 'hip') and torch.version.hip is not None
@@ -179,6 +181,13 @@ class DistributedExec(ABC):
             print("Ignoring reuse_dist_env for hpu")
             self.reuse_dist_env = False
 
+        global warn_reuse_dist_env
+        if self.reuse_dist_env and not warn_reuse_dist_env:
+            # Currently we see memory leak for tests that reuse distributed environment
+            print("Ignoring reuse_dist_env and forcibly setting it to False")
+            warn_reuse_dist_env = True
+        self.reuse_dist_env = False
+
         if self.reuse_dist_env:
             if num_procs not in self._pool_cache:
                 self._pool_cache[num_procs] = mp.Pool(processes=num_procs)
```
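
For context on why reuse can leak: `_pool_cache` holds long-lived `multiprocessing.Pool` objects, and a reused pool keeps its worker processes, plus any state they accumulate, alive across tests. The standalone sketch below shows one way such a leak can arise; the `run_test` function and `_leak` buffer are illustrative, not from the repo:

```python
import multiprocessing as mp
import os

_leak = []  # module-level state inside each worker process


def run_test(i):
    # Stand-in for a test that leaves memory behind in the worker
    # (e.g. caches or framework state): retain ~10 MB per call.
    _leak.append(bytearray(10 * 1024 * 1024))
    return os.getpid(), len(_leak)


if __name__ == "__main__":
    # reuse_dist_env == True: one long-lived pool serves every test, so
    # per-worker state keeps growing and is never reclaimed.
    pool = mp.Pool(processes=2)
    print(pool.map(run_test, range(4)))  # same PIDs, growing _leak lengths
    pool.close()
    pool.join()

    # reuse_dist_env == False: a fresh pool per test; worker memory is
    # released when each pool is torn down.
    for i in range(2):
        with mp.Pool(processes=2) as fresh_pool:
            print(fresh_pool.map(run_test, [i]))  # _leak length is always 1
```

Tearing the pool down after each test (the forced `reuse_dist_env = False` path) lets the OS reclaim everything the workers held, at the cost of re-spawning processes per test.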