Stabilize MPI tests for Azure Linux

This commit is contained in:
Sandeep Karambelkar 2024-11-20 14:14:56 +05:30
Родитель 5c8d0258a6
Коммит 81559cd0b8
3 изменённых файлов: 85 добавлений и 12 удалений

Просмотреть файл

@ -9,10 +9,11 @@ from assertpy import assert_that
from retry import retry
from lisa.base_tools import Cat, Sed, Uname, Wget
from lisa.tools.git import Git
from lisa.feature import Feature
from lisa.features import Disk
from lisa.operating_system import CBLMariner, Oracle, Redhat, Ubuntu
from lisa.tools import Firewall, Ls, Lspci, Make, Service
from lisa.tools import Chmod, Find, Firewall, Ls, Lspci, Make, Service
from lisa.tools.tar import Tar
from lisa.util import (
LisaException,
@ -466,7 +467,6 @@ class Infiniband(Feature):
def install_open_mpi(self) -> None:
node = self._node
# Install Open MPI
wget = node.tools[Wget]
tar_file = (
"https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.5.tar.gz"
@ -497,6 +497,55 @@ class Infiniband(Feature):
make.make("", cwd=openmpi_folder, sudo=True)
make.make_install(cwd=openmpi_folder, sudo=True)
def install_intel_mpi_benchmarking_tool(
    self, tool_names: List[str] = ["IMB-MPI1"]
) -> None:
    """Build the Intel MPI Benchmarks (IMB) from source on CBL-Mariner nodes.

    The method assumes the required MPI package is already built and
    installed on the node. On any OS other than CBL-Mariner it returns
    immediately without doing anything.

    Steps performed on CBL-Mariner:
      1. Clone https://github.com/intel/mpi-benchmarks.git into the node's
         working path.
      2. Locate the ``mpicc`` and ``mpicxx`` compiler wrappers by searching
         from ``/`` and make both of them mode 755.
      3. Run ``make <tool> CC=<mpicc> CXX=<mpicxx>`` in the cloned folder
         for every entry of ``tool_names`` and make the resulting binary
         mode 755.

    Args:
        tool_names: make targets / benchmark binaries to build, e.g.
            "IMB-MPI1", "IMB-RMA", "IMB-NBC". Defaults to ["IMB-MPI1"].

    Raises:
        An assertpy assertion failure when ``mpicc`` or ``mpicxx`` cannot be
        found on the node.
    """
    # NOTE(review): the default for tool_names is a shared list object. It is
    # only iterated here and must never be mutated. Switching to a None
    # sentinel would need an Optional import at file level.
    node = self._node
    if not isinstance(node.os, CBLMariner):
        # These tools are included in other distro packages
        return

    # Clone Intel MPI Benchmarks https://github.com/intel/mpi-benchmarks.git
    node.tools[Git].clone(
        url="https://github.com/intel/mpi-benchmarks.git",
        cwd=node.working_path,
    )
    imb_src_folder = node.get_pure_path(f"{node.working_path}/mpi-benchmarks")

    find = node.tools[Find]
    chmod = node.tools[Chmod]
    make = node.tools[Make]

    def _locate_compiler_wrapper(name: str) -> str:
        # Search the whole filesystem for the wrapper and return the first
        # hit; fail with the same message whether the result list is empty
        # or its first entry is an empty string.
        failure_message = f"Could not find location of {name} from MPI package"
        find_results = find.find_files(node.get_pure_path("/"), name, sudo=True)
        assert_that(len(find_results)).described_as(
            failure_message
        ).is_greater_than(0)
        wrapper_path = find_results[0]
        assert_that(wrapper_path).described_as(failure_message).is_not_empty()
        return wrapper_path

    mpicc_path = _locate_compiler_wrapper("mpicc")
    mpicxx_path = _locate_compiler_wrapper("mpicxx")

    # Make sure both compiler wrappers are executable before invoking make.
    chmod.chmod(mpicc_path, "755", sudo=True)
    chmod.chmod(mpicxx_path, "755", sudo=True)

    for tool in tool_names:
        # Each benchmark is its own make target; point the build at the
        # MPI compiler wrappers found above. The build is run without a
        # shell and without piping `yes ''` into make.
        make.make(
            f"{tool} CC={mpicc_path} CXX={mpicxx_path}",
            cwd=imb_src_folder,
            sudo=True,
            shell=False,
            sendYesCmd=False,
        )
        # The built binary lands in the source folder under the target name.
        chmod.chmod(f"{imb_src_folder}/{tool}", "755", sudo=True)
def install_ibm_mpi(self, platform_mpi_url: str) -> None:
node = self._node
if isinstance(node.os, Redhat):

Просмотреть файл

@ -74,6 +74,8 @@ class Make(Tool):
thread_count: int = 0,
update_envs: Optional[Dict[str, str]] = None,
ignore_error: bool = False,
shell: bool = True,
sendYesCmd: bool = True
) -> ExecutableResult:
expected_exit_code: Optional[int] = 0
if thread_count == 0:
@ -95,13 +97,17 @@ class Make(Tool):
if ignore_error:
expected_exit_code = None
# yes '' answers all questions with default value.
command = ""
if sendYesCmd:
# yes '' answers all questions with default value.
command = "yes '' | "
result = self.node.execute(
f"yes '' | make -j{thread_count} {arguments}",
f"{command} make -j{thread_count} {arguments}",
cwd=cwd,
timeout=timeout,
sudo=sudo,
shell=True,
shell=shell,
update_envs=update_envs,
expected_exit_code=expected_exit_code,
expected_exit_code_failure_message="Failed to make",

Просмотреть файл

@ -15,7 +15,7 @@ from lisa import (
simple_requirement,
)
from lisa.features import AvailabilitySetEnabled, Infiniband, Sriov
from lisa.operating_system import BSD, Windows
from lisa.operating_system import BSD, CBLMariner, Windows
from lisa.sut_orchestrator.azure.tools import Waagent
from lisa.tools import Find, KernelConfig, Ls, Modprobe, Ssh
from lisa.util import (
@ -286,6 +286,9 @@ class InfinibandSuite(TestSuite):
client_ssh.enable_public_key(server_ssh.generate_key_pairs())
server_ssh.add_known_host(client_ip)
client_ssh.add_known_host(server_ip)
sudo=False
if isinstance(server_node.os, CBLMariner):
sudo=True
# Note: Using bash because script is not supported by Dash
# sh points to dash on Ubuntu
@ -295,6 +298,7 @@ class InfinibandSuite(TestSuite):
"-env I_MPI_FABRICS=shm:ofi -env SECS_PER_SAMPLE=600 "
"-env FI_PROVIDER=mlx -env I_MPI_DEBUG=5 -env I_MPI_PIN_DOMAIN=numa "
"/opt/intel/oneapi/mpi/2021.1.1/bin/IMB-MPI1 pingpong",
sudo=sudo,
expected_exit_code=0,
expected_exit_code_failure_message="Failed intra-node pingpong test "
"with intel mpi",
@ -306,6 +310,7 @@ class InfinibandSuite(TestSuite):
"-env I_MPI_FABRICS=shm:ofi -env SECS_PER_SAMPLE=600 "
"-env FI_PROVIDER=mlx -env I_MPI_DEBUG=5 -env I_MPI_PIN_DOMAIN=numa "
"/opt/intel/oneapi/mpi/2021.1.1/bin/IMB-MPI1 pingpong",
sudo=sudo,
expected_exit_code=0,
expected_exit_code_failure_message="Failed inter-node pingpong test "
"with intel mpi",
@ -319,6 +324,7 @@ class InfinibandSuite(TestSuite):
"-n 44 -env I_MPI_FABRICS=shm:ofi -env SECS_PER_SAMPLE=600 "
"-env FI_PROVIDER=mlx -env I_MPI_DEBUG=5 -env I_MPI_PIN_DOMAIN=numa "
f"/opt/intel/oneapi/mpi/2021.1.1/bin/{test}",
sudo=sudo,
expected_exit_code=0,
expected_exit_code_failure_message=f"Failed {test} test with intel mpi",
timeout=3000,
@ -360,10 +366,13 @@ class InfinibandSuite(TestSuite):
raise SkippedException(err)
run_in_parallel([server_ib.install_open_mpi, client_ib.install_open_mpi])
server_node.execute("ldconfig", sudo=True)
client_node.execute("ldconfig", sudo=True)
# Only for mariner, we need to build intel benchmarking tools
# as they are not included in our packages
server_ib.install_intel_mpi_benchmarking_tool()
# Restart the ssh sessions for changes to /etc/security/limits.conf
# to take effect
server_node.close()
@ -386,7 +395,7 @@ class InfinibandSuite(TestSuite):
# Ping Pong test
find = server_node.tools[Find]
find_results = find.find_files(
server_node.get_pure_path("/usr"), "IMB-MPI1", sudo=True
server_node.get_pure_path("/"), "IMB-MPI1", sudo=True
)
assert_that(len(find_results)).described_as(
"Could not find location of IMB-MPI1 for Open MPI"
@ -407,7 +416,7 @@ class InfinibandSuite(TestSuite):
# IMB-MPI Tests
find_results = find.find_files(
server_node.get_pure_path("/usr"), "IMB-MPI1", sudo=True
server_node.get_pure_path("/"), "IMB-MPI1", sudo=True
)
assert_that(len(find_results)).described_as(
"Could not find location of Open MPI test: IMB-MPI1"
@ -417,7 +426,7 @@ class InfinibandSuite(TestSuite):
"Could not find location of Open MPI test: IMB-MPI1"
).is_not_empty()
server_node.execute(
f"/usr/local/bin/mpirun --host {server_ip},{client_ip} "
f"/usr/local/bin/mpirun -hosts {server_ip},{client_ip} "
"-n 2 --mca btl self,vader,openib --mca btl_openib_cq_size 4096 "
"--mca btl_openib_allow_ib 1 --mca "
f"btl_openib_warn_no_device_params_found 0 {test_path}",
@ -571,6 +580,12 @@ class InfinibandSuite(TestSuite):
raise SkippedException(err)
run_in_parallel([server_ib.install_mvapich_mpi, client_ib.install_mvapich_mpi])
test_names = ["IMB-MPI1", "IMB-RMA", "IMB-NBC"]
# Only for mariner, we need to build intel benchmarking tools
# as they are not included in our packages
server_ib.install_intel_mpi_benchmarking_tool(tool_names=test_names)
server_node.execute("ldconfig", sudo=True)
# Restart the ssh sessions for changes to /etc/security/limits.conf
# to take effect
@ -590,13 +605,15 @@ class InfinibandSuite(TestSuite):
client_ssh.enable_public_key(server_ssh.generate_key_pairs())
server_ssh.add_known_host(client_ip)
client_ssh.add_known_host(server_ip)
sudo=False
if isinstance(server_node.os, CBLMariner):
sudo=True
# Run MPI tests
find = server_node.tools[Find]
test_names = ["IMB-MPI1", "IMB-RMA", "IMB-NBC"]
for test in test_names:
find_results = find.find_files(
server_node.get_pure_path("/usr"), test, sudo=True
server_node.get_pure_path("/"), test, sudo=True
)
assert_that(len(find_results)).described_as(
f"Could not find location of MVAPICH MPI test: {test}"
@ -611,6 +628,7 @@ class InfinibandSuite(TestSuite):
expected_exit_code=0,
expected_exit_code_failure_message=f"Failed {test} test "
"with MVAPICH MPI",
sudo=sudo
)
def _check_nd_enabled(self, node: Node) -> None: