Merge pull request #716 from Azure/app_pinning_tool_slurm

App pinning tool (slurm/srun integration)
Cormac Garvey 2023-04-05 19:20:25 -05:00 committed by GitHub
Parents: 0161fdc858 0440fcb691
Commit: 5da70fef67
No key matching this signature was found
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 147 additions and 9 deletions

View file

@@ -226,7 +226,7 @@ def one_numa(row_l):

def parse_lstopo():
    cmd = ["lstopo-no-graphics", "--no-caches", "--taskset"]
    cmd = ["lstopo-no-graphics", "--no-caches", "--taskset", "--whole-io"]
    try:
        cmdpipe = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except FileNotFoundError:
@@ -256,10 +256,10 @@ def parse_lstopo():
            row_l = row_s.split()
            core_id = re.findall(r'\d+',row_l[-2])[0]
            topo_d["numanode_ids"][numanode]["core_ids"].append(int(core_id))
        if re.search(r'GPU.*card', row_s):
        if re.search(r' {10,}GPU.*card', row_s):
            row_l = row_s.split()
            gpu_id = re.findall(r'\d+',row_l[-1])[0]
            topo_d["numanode_ids"][numanode]["gpu_ids"].append(int(gpu_id))
            topo_d["numanode_ids"][numanode]["gpu_ids"].append(int(gpu_id)-1)
    cmdpipe.stdout.close()
    cmdpipe.stderr.close()
    return topo_d
@@ -497,6 +497,80 @@ def calc_process_pinning(number_processes_per_vm, num_numa_domains, l3cache_topo
    return (pinning_l, number_processes_per_numa, number_cores_in_l3cache)

def calc_slurm_pinning(number_processes_per_numa, topo_2_d):
    slurm_pinning_l = []
    for numa_id in topo_2_d["numanode_ids"]:
        numa_pinning_l = []
        indx = 0
        while len(numa_pinning_l) < number_processes_per_numa:
            for l3cache_id in topo_2_d["numanode_ids"][numa_id]["l3cache_ids"]:
                if indx > len(topo_2_d["numanode_ids"][numa_id]["l3cache_ids"][l3cache_id])-1:
                    continue
                if len(numa_pinning_l) < number_processes_per_numa:
                    numa_pinning_l.append(topo_2_d["numanode_ids"][numa_id]["l3cache_ids"][l3cache_id][indx])
                else:
                    break
            indx += 1
        slurm_pinning_l += numa_pinning_l
    return (slurm_pinning_l)

def calc_slurm_pin_range(slurm_pinning_l, num_threads):
    core_id_range_l = []
    for core_id in slurm_pinning_l:
        range_end = core_id + num_threads - 1
        core_id_range = str(core_id) + "-" + str(range_end)
        core_id_range_l.append(core_id_range)
    return core_id_range_l

def execute_cmd(cmd_l):
    proc = subprocess.Popen(cmd_l, stdout=subprocess.PIPE, universal_newlines=True)
    cmd_out, errs = proc.communicate()
    return cmd_out

def convert_range_to_mask(core_id_range_l):
    slurm_mask_str = ""
    for core_id_range in core_id_range_l:
        hwloc_calc_arg = 'core:' + core_id_range
        cmd_l = ['hwloc-calc', "--taskset", hwloc_calc_arg]
        hwloc_calc_out = execute_cmd(cmd_l)
        slurm_mask_str += "," + hwloc_calc_out.rstrip()
    return slurm_mask_str[1:]

def create_gpu_numa_mask_str(topo_d, total_num_gpus):
    gpu_numa_mask_str = ""
    for gpu_id in range(0,total_num_gpus):
        for numa_id in topo_d["numanode_ids"]:
            gpu_ids_l = topo_d["numanode_ids"][numa_id]["gpu_ids"]
            if gpu_id in gpu_ids_l:
                gpu_numa_mask_str += "," + topo_d["numanode_ids"][numa_id]["mask"]
                break
    return gpu_numa_mask_str[1:]

def l3cache_id_in_numa(l3cache_l, numa_core_l):
    for core_id in l3cache_l:
        if core_id in numa_core_l:
            return True
        else:
            return False

def create_topo_2_d(topo_d, l3cache_topo_d):
    topo_2_d = {}
    topo_2_d = topo_d
    for numa_id in topo_2_d["numanode_ids"]:
        topo_2_d["numanode_ids"][numa_id]["l3cache_ids"] = {}
        for l3cache_id in l3cache_topo_d["l3cache_ids"]:
            if l3cache_id_in_numa(l3cache_topo_d["l3cache_ids"][l3cache_id], topo_d["numanode_ids"][numa_id]["core_ids"]):
                topo_2_d["numanode_ids"][numa_id]["l3cache_ids"][l3cache_id] = l3cache_topo_d["l3cache_ids"][l3cache_id]
    return topo_2_d

def check_process_numa_distribution(total_num_processes, total_num_numa_domains, process_d):
    num_numa_domains = min(total_num_processes, total_num_numa_domains)
    numas_l = []
@@ -715,7 +789,7 @@ def check_number_threads_per_l3cache(number_processes_per_vm, number_threads_per
    return have_warning

def report(app_pattern, print_pinning_syntax, topo_d, process_d, sku_name, l3cache_topo_d, number_cores_per_vm, total_number_vms, number_processes_per_vm, number_threads_per_process, pinning_syntax_l, number_processes_per_numa, number_cores_in_l3cache, mpi_type, have_warning, force, num_numas):
def report(app_pattern, print_pinning_syntax, topo_d, process_d, sku_name, l3cache_topo_d, number_cores_per_vm, total_number_vms, number_processes_per_vm, number_threads_per_process, pinning_syntax_l, slurm_pinning_l, slurm_mask_str, number_processes_per_numa, number_cores_in_l3cache, mpi_type, have_warning, force, num_numas, total_num_gpus):
    hostname = socket.gethostname()
    print("")
    print("Virtual Machine ({}, {}) Numa topology".format(sku_name, hostname))
@@ -773,6 +847,15 @@ def report(app_pattern, print_pinning_syntax, topo_d, process_d, sku_name, l3cac
        else:
            az_mpi_args = "--bind-to l3cache --map-by ppr:{}:numa -report-bindings".format(number_processes_per_numa)
        print("mpirun -np {} {}".format(total_number_processes, az_mpi_args))
    elif mpi_type == "srun":
        if total_num_gpus == 0 or total_num_gpus != number_processes_per_vm:
            az_mpi_args = "--mpi=pmix --cpu-bind=mask_cpu:{} --ntasks-per-node={}".format(slurm_mask_str, number_processes_per_vm)
            print("core id pinning: {}\n".format(slurm_pinning_l))
            print("srun {}".format(az_mpi_args))
        else:
            gpu_numa_mask_str = create_gpu_numa_mask_str(topo_d, total_num_gpus)
            az_mpi_args = "--mpi=pmix --cpu-bind=mask_cpu:{} --ntasks-per-node={} --gpus-per-node={}".format(gpu_numa_mask_str, number_processes_per_vm, total_num_gpus)
            print("srun {}".format(az_mpi_args))
    elif mpi_type == "intel":
        num_l3cache = len(l3cache_topo_d["l3cache_ids"])
        if number_threads_per_process == 1:
@@ -799,6 +882,8 @@ def main():
    number_processes_per_vm = 0
    number_threads_per_process = 0
    pinning_l = []
    slurm_pinning_l = []
    slurm_mask_str = ""
    process_d = {}
    number_processes_per_numa = 0
    number_cores_in_l3cache = 0
@@ -813,7 +898,7 @@ def main():
    parser.add_argument("-nv", "--total_number_vms", dest="total_number_vms", type=int, default=1, help="Total number of VM's (used with -pps)")
    parser.add_argument("-nppv", "--number_processes_per_vm", dest="number_processes_per_vm", type=int, help="Total number of MPI processes per VM (used with -pps)")
    parser.add_argument("-ntpp", "--number_threads_per_process", dest="number_threads_per_process", type=int, help="Number of threads per process (used with -pps)")
    parser.add_argument("-mt", "--mpi_type", dest="mpi_type", type=str, choices=["openmpi","intel","mvapich2"], default="openmpi", help="Select which type of MPI to generate pinning syntax (used with -pps)")
    parser.add_argument("-mt", "--mpi_type", dest="mpi_type", type=str, choices=["openmpi","intel","mvapich2","srun"], default="openmpi", help="Select which type of MPI to generate pinning syntax (used with -pps)(select srun when you are using a SLURM scheduler)")
    args = parser.parse_args()
    force = args.force
    if len(sys.argv) > 1 and not args.application_pattern and not args.print_pinning_syntax:
@@ -857,7 +942,14 @@ def main():
        have_warning = check_pinning_syntax(number_processes_per_vm, number_threads_per_process, topo_d, l3cache_topo_d)
        (pinning_l, number_processes_per_numa, number_cores_in_l3cache) = calc_process_pinning(number_processes_per_vm, total_num_numa_domains, l3cache_topo_d)
        report(args.application_pattern, args.print_pinning_syntax, topo_d, process_d, sku_name, l3cache_topo_d, number_cores_per_vm, total_number_vms, number_processes_per_vm, number_threads_per_process, pinning_l, number_processes_per_numa, number_cores_in_l3cache, mpi_type, have_warning, force, total_num_numa_domains)
        if mpi_type == "srun":
            if total_num_gpus == 0 or total_num_gpus != number_processes_per_vm:
                topo_2_d = create_topo_2_d(topo_d, l3cache_topo_d)
                slurm_pinning_l = calc_slurm_pinning(number_processes_per_numa, topo_2_d)
                slurm_pinning_l = calc_slurm_pin_range(slurm_pinning_l, number_threads_per_process)
                slurm_mask_str = convert_range_to_mask(slurm_pinning_l)
        report(args.application_pattern, args.print_pinning_syntax, topo_d, process_d, sku_name, l3cache_topo_d, number_cores_per_vm, total_number_vms, number_processes_per_vm, number_threads_per_process, pinning_l, slurm_pinning_l, slurm_mask_str, number_processes_per_numa, number_cores_in_l3cache, mpi_type, have_warning, force, total_num_numa_domains, total_num_gpus)
    check_app(args.application_pattern, total_num_numa_domains, total_num_gpus, topo_d, process_d, l3cache_topo_d)
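As a rough illustration of how the new helpers compose (not part of the diff), the sketch below runs `calc_slurm_pinning` and `calc_slurm_pin_range` on a made-up toy topology. `convert_range_to_mask` is replaced by the equivalent bit arithmetic so the snippet runs without `hwloc-calc`, assuming one PU per core (SMT off, as on the HB/ND SKUs this tool targets).

```python
# Illustrative sketch only -- assumes the calc_slurm_pinning() and
# calc_slurm_pin_range() helpers defined above are in scope.

# Toy layout: one NUMA domain with two L3 caches of 4 cores each
# (values invented for the example; the real dict comes from create_topo_2_d()).
toy_topo_2_d = {"numanode_ids": {0: {"l3cache_ids": {0: [0, 1, 2, 3], 1: [4, 5, 6, 7]}}}}

start_cores = calc_slurm_pinning(2, toy_topo_2_d)    # 2 ranks per NUMA -> [0, 4]
core_ranges = calc_slurm_pin_range(start_cores, 2)   # 2 threads each  -> ['0-1', '4-5']

# convert_range_to_mask() shells out to `hwloc-calc --taskset core:<range>`;
# with one PU per core that is ((1 << width) - 1) << start for each range.
mask_str = ",".join(hex(((1 << 2) - 1) << c) for c in start_cores)
print(core_ranges, mask_str)                         # ['0-1', '4-5'] 0x3,0x30
```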

View file

@@ -1,7 +1,7 @@
# HPC Application process/thread mapping/pinning checking tool
Correct mapping/pinning of HPC Application processes/threads is critical for optimal performance.
The HPC Application process/thread mapping/pinning checking tool has three main features: it lets you quickly verify that the processes/threads of your HPC application are mapped/pinned correctly and optimally; it can generate the MPI process/thread pinning syntax for OpenMPI/HPCX, Intel MPI and Mvapich2 (currently for HPC VMs based on AMD processors: HB (v1, v2 & v3) and NDv4); and it can be used directly in an MPI run script to pass in the optimal MPI pinning arguments. The tool shows the virtual machine NUMA topology (i.e. the location of core ids, GPUs and NUMA domains), where the processes/threads of your HPC application are mapped/pinned, and warnings if they are not mapped/pinned optimally.
The HPC Application process/thread mapping/pinning checking tool has three main features: it lets you quickly verify that the processes/threads of your HPC application are mapped/pinned correctly and optimally; it can generate the MPI process/thread pinning syntax for OpenMPI/HPCX, Intel MPI and Mvapich2 (currently for HPC VMs based on AMD processors: HB (v1, v2 & v3) and NDv4); and it can be used directly in an MPI run script (or slurm/srun) to pass in the optimal MPI pinning arguments. The tool shows the virtual machine NUMA topology (i.e. the location of core ids, GPUs and NUMA domains), where the processes/threads of your HPC application are mapped/pinned, and warnings if they are not mapped/pinned optimally.
## Prerequisites
@@ -34,9 +34,10 @@ optional arguments:
  -ntpp NUMBER_THREADS_PER_PROCESS, --number_threads_per_process NUMBER_THREADS_PER_PROCESS
                        Number of threads per process (used with -pps)
                        (default: None)
  -mt {openmpi,intel,mvapich2}, --mpi_type {openmpi,intel,mvapich2}
  -mt {openmpi,intel,mvapich2,srun}, --mpi_type {openmpi,intel,mvapich2,srun}
                        Select which type of MPI to generate pinning syntax
                        (used with -pps) (default: openmpi)
                        (used with -pps)(select srun when you are using a
                        SLURM scheduler) (default: openmpi)
```
## Examples
You are on a Standard_HB120-64rs_v3 virtual machine, you would like to know the correct HPCX pinning syntax to pin 16 MPI
@@ -259,3 +260,48 @@ To run 16 processes and 6 threads using Intel MPI on HB120-96rs_v3, just add -mt
check_app_pinning.py -pps -nv 1 -nppv 16 -ntpp $OMP_NUM_THREADS -mt intel
```
>Note: AZ_MPI_NP=16 and AZ_MPI_ARGS="-genv I_MPI_PIN_DOMAIN 6:compact -genv FI_PROVIDER mlx -genv I_MPI_COLL_EXTERNAL 1 -genv I_MPI_DEBUG 6"
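As an aside, `I_MPI_PIN_DOMAIN 6:compact` groups cores into consecutive blocks of six, one block per rank, which lines up with the srun masks generated in the next example. A rough sketch of the resulting domains, assuming one PU per core (SMT off on this SKU):
```python
# 16 ranks x 6 cores on the 96-core HB120-96rs_v3: compact domains are
# consecutive 6-core blocks (illustrative; assumes one PU per core).
ranks, width = 16, 6
domains = [f"{r * width}-{r * width + width - 1}" for r in range(ranks)]
print(domains[0], domains[1], domains[-1])   # 0-5 6-11 90-95
```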
Example of Slurm/srun integration: run 16 processes with 6 threads each on HB120-96rs_v3 using srun with a Slurm scheduler.
```
#!/bin/bash
#SBATCH --mem=0
#SBATCH --ntasks-per-node=16
#SBATCH --exclusive
module load gcc-9.2.0
module load mpi/hpcx
export SLURM_CPU_BIND=verbose
export OMP_NUM_THREADS=6
check_app_pinning.py -pps -nv $SLURM_NNODES -nppv $SLURM_NTASKS_PER_NODE -ntpp $OMP_NUM_THREADS -mt srun
AZ_MPI_NP=$(cat AZ_MPI_NP)
AZ_MPI_ARGS=$(cat AZ_MPI_ARGS)
srun $AZ_MPI_ARGS mpi_executable
```
>Note: AZ_MPI_ARGS="--mpi=pmix --cpu-bind=mask_cpu:0x3f,0xfc0,0x3f000,0xfc0000,0x3f000000,0xfc0000000,0x3f000000000,0xfc0000000000,0x3f000000000000,0xfc0000000000000,0x3f000000000000000,0xfc0000000000000000,0x3f000000000000000000,0xfc0000000000000000000,0x3f000000000000000000000,0xfc0000000000000000000000 --ntasks-per-node=16"
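Each of the 16 masks above covers one 6-core block, matching the core id pinning ranges the tool prints. A minimal sketch that expands the first few masks back into core ids (assuming one PU per core):
```python
# Expand the first few masks from the note above back into core-id ranges.
for mask in ["0x3f", "0xfc0", "0x3f000"]:
    bits = int(mask, 16)
    cores = [i for i in range(96) if bits >> i & 1]
    print(mask, "-> cores", f"{cores[0]}-{cores[-1]}")
# 0x3f -> cores 0-5
# 0xfc0 -> cores 6-11
# 0x3f000 -> cores 12-17
```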
Example of Slurm/srun integration: run 8 processes on NDm_A100_v4 using srun (Slurm scheduler).
```
#!/bin/bash
#SBATCH --mem=0
#SBATCH --ntasks-per-node=8
#SBATCH --exclusive
module load gcc-9.2.0
module load mpi/hpcx
export SLURM_CPU_BIND=verbose
export OMP_NUM_THREADS=1
check_app_pinning.py -pps -nv $SLURM_NNODES -nppv $SLURM_NTASKS_PER_NODE -ntpp $OMP_NUM_THREADS -mt srun
AZ_MPI_NP=$(cat AZ_MPI_NP)
AZ_MPI_ARGS=$(cat AZ_MPI_ARGS)
srun $AZ_MPI_ARGS mpi_executable
```
>Note: AZ_MPI_ARGS="--mpi=pmix --cpu-bind=mask_cpu:0xffffff000000,0xffffff000000,0xffffff,0xffffff,0xffffff000000000000000000,0xffffff000000000000000000,0xffffff000000000000,0xffffff000000000000 --ntasks-per-node=8 --gpus-per-node=8"
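The eight masks above come in pairs of four distinct values, consistent with NDm_A100_v4 exposing 4 NUMA domains of 24 cores with two of the eight GPUs attached to each, so `create_gpu_numa_mask_str` emits each NUMA-domain mask twice. A quick check of the first mask (illustrative):
```python
# 0xffffff000000 sets 24 consecutive bits starting at bit 24, i.e. the full
# 24-core NUMA domain (cores 24-47) that the first GPU in the list is attached to.
mask = 0xffffff000000
cores = [i for i in range(96) if mask >> i & 1]
print(len(cores), cores[0], cores[-1])   # 24 24 47
```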