зеркало из https://github.com/microsoft/DeepSpeed.git
Ignore reuse_dist_env (#6623)
Tests with `reuse_dist_env = True` often cause memory leaks. This PR ignores `reuse_dist_env` and forcibly sets it to `False`. This change might slow down the tests, but I think it is better than having to manually restart runners and relaunch tests. Memory usages (See #6578): - `reuse_dist_env == True`: https://github.com/microsoft/DeepSpeed/actions/runs/11302940871/job/31439471512 - `reuse_dist_env == False`: https://github.com/microsoft/DeepSpeed/actions/runs/11303250613/job/31440137894
This commit is contained in:
Родитель
5c4b97f109
Коммит
7a5bc4fdf9
|
@ -25,6 +25,8 @@ from _pytest.fixtures import FixtureLookupError, FixtureFunctionMarker
|
||||||
# Worker timeout for tests that hang
|
# Worker timeout for tests that hang
|
||||||
DEEPSPEED_TEST_TIMEOUT = int(os.environ.get('DS_UNITTEST_TIMEOUT', '600'))
|
DEEPSPEED_TEST_TIMEOUT = int(os.environ.get('DS_UNITTEST_TIMEOUT', '600'))
|
||||||
|
|
||||||
|
warn_reuse_dist_env = False
|
||||||
|
|
||||||
|
|
||||||
def is_rocm_pytorch():
|
def is_rocm_pytorch():
|
||||||
return hasattr(torch.version, 'hip') and torch.version.hip is not None
|
return hasattr(torch.version, 'hip') and torch.version.hip is not None
|
||||||
|
@ -179,6 +181,13 @@ class DistributedExec(ABC):
|
||||||
print("Ignoring reuse_dist_env for hpu")
|
print("Ignoring reuse_dist_env for hpu")
|
||||||
self.reuse_dist_env = False
|
self.reuse_dist_env = False
|
||||||
|
|
||||||
|
global warn_reuse_dist_env
|
||||||
|
if self.reuse_dist_env and not warn_reuse_dist_env:
|
||||||
|
# Currently we see memory leak for tests that reuse distributed environment
|
||||||
|
print("Ignoring reuse_dist_env and forcibly setting it to False")
|
||||||
|
warn_reuse_dist_env = True
|
||||||
|
self.reuse_dist_env = False
|
||||||
|
|
||||||
if self.reuse_dist_env:
|
if self.reuse_dist_env:
|
||||||
if num_procs not in self._pool_cache:
|
if num_procs not in self._pool_cache:
|
||||||
self._pool_cache[num_procs] = mp.Pool(processes=num_procs)
|
self._pool_cache[num_procs] = mp.Pool(processes=num_procs)
|
||||||
|
|
Загрузка…
Ссылка в новой задаче