Set KMP_AFFINITY and OMP_NUM_THREADS to fix slow performance when running with many threads

miguelgfierro 2020-03-22 02:44:42 +00:00
Parent 6021962790
Commit 0d02b8a06b
2 changed files with 8 additions and 3 deletions
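Why this helps (a hedged note, not part of the commit itself): with DistributedDataParallel one process is spawned per GPU, and by default each process's OpenMP/MKL runtime sizes its thread pool to every core on the machine, so N workers end up contending for the same cores. Capping OMP_NUM_THREADS per process avoids that oversubscription, and KMP_AFFINITY=verbose asks the Intel OpenMP runtime to log its thread-affinity decisions. A minimal sketch of the idea, with hypothetical numbers rather than anything from this repo:

import os
import torch

# One DDP worker per GPU; e.g. 4 workers on a 32-core box would otherwise
# each start a 32-thread OpenMP pool: 4 x 32 = 128 threads on 32 cores.
num_gpus = torch.cuda.device_count()

# Cap the pool each worker inherits; this must be set before the OpenMP
# runtime initialises (spawned children inherit the parent environment).
os.environ["OMP_NUM_THREADS"] = str(num_gpus)

# "verbose" makes the Intel OpenMP runtime log how threads are bound to
# cores, which is useful when diagnosing affinity problems.
os.environ["KMP_AFFINITY"] = "verbose"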

View file

@@ -14,7 +14,6 @@ if nlp_path not in sys.path:
sys.path.insert(0, nlp_path)
sys.path.insert(0, "./")
print(sys.path)
from utils_nlp.dataset.cnndm import CNNDMBertSumProcessedData, CNNDMSummarizationDataset
from utils_nlp.models.transformers.extractive_summarization import (
ExtractiveSummarizer,
@@ -25,7 +24,8 @@ from utils_nlp.models.transformers.extractive_summarization import (
# os.environ["NCCL_BLOCKING_WAIT"] = "1"
os.environ["NCCL_IB_DISABLE"] = "0"
os.environ['OMP_NUM_THREADS'] = str(torch.cuda.device_count())
os.environ["KMP_AFFINITY"] = "verbose"
parser = argparse.ArgumentParser()
parser.add_argument(
@@ -207,6 +207,7 @@ def main_worker(local_rank, ngpus_per_node, summarizer, args):
else:
save_every = SAVE_EVERY
# """
print("starting training")
summarizer.fit(
ext_sum_train,
num_gpus=world_size,

View file

@@ -3,11 +3,14 @@
import os
import pytest
import torch
@pytest.mark.gpu
@pytest.mark.integration
def test_ddp_extractive_summarization_cnndm_transformers(scripts, tmp):
ddp_env = os.environ.copy()
ddp_env["OMP_NUM_THREADS"] = str(torch.cuda.device_count())
ddp_env["KMP_AFFINITY"] = "verbose"
script = scripts["ddp_bertsumext"]
summary_filename = "bertsumext_prediction.txt"
import subprocess
@@ -27,6 +30,7 @@ def test_ddp_extractive_summarization_cnndm_transformers(scripts, tmp):
"--summary_filename",
summary_filename,
],
env=ddp_env,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
)
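The key detail in the test is the env= argument: subprocess replaces the child's entire environment with the mapping you pass, so the test copies os.environ first and then overrides only the threading knobs. A self-contained sketch of the same pattern (the command and the value "4" are stand-ins, not the repo's actual training invocation):

import os
import subprocess
import sys

child_env = os.environ.copy()              # keep PATH, CUDA vars, etc.
child_env["OMP_NUM_THREADS"] = "4"         # hypothetical per-child thread cap
child_env["KMP_AFFINITY"] = "verbose"      # log Intel OpenMP affinity decisions

# env= gives the child exactly this mapping; without the copy() above the
# child would lose everything else from the parent environment.
result = subprocess.run(
    [sys.executable, "-c", "import os; print(os.environ['OMP_NUM_THREADS'])"],
    env=child_env,
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
)
print(result.stdout.decode().strip())      # prints: 4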