2019-04-26 18:27:27 +03:00
|
|
|
# Copyright (c) Microsoft Corporation. All rights reserved.
|
|
|
|
# Licensed under the Apache 2.0 License.
|
|
|
|
import os
|
|
|
|
import time
|
|
|
|
from enum import Enum
|
|
|
|
import paramiko
|
|
|
|
import subprocess
|
|
|
|
from contextlib import contextmanager
|
|
|
|
import infra.path
|
2019-05-29 12:42:17 +03:00
|
|
|
import uuid
|
2019-06-14 12:56:35 +03:00
|
|
|
import ctypes
|
|
|
|
import signal
|
2019-08-08 18:11:46 +03:00
|
|
|
import re
|
2019-08-19 17:40:15 +03:00
|
|
|
from collections import deque
|
2019-04-26 18:27:27 +03:00
|
|
|
|
|
|
|
from loguru import logger as LOG
|
|
|
|
|
|
|
|
# Debugger front-end used when building a node's debug command line; taken
# from the DBG environment variable, falling back to cgdb.
DBG = os.environ.get("DBG", "cgdb")

# Handle on the C library, used to call prctl() from spawned child processes.
_libc = ctypes.CDLL("libc.so.6")
|
|
|
|
|
|
|
|
|
|
|
|
def _term_on_pdeathsig():
    """
    Call prctl(PR_SET_PDEATHSIG, SIGTERM) for the calling process.

    Intended to be used as a `preexec_fn`, so that it runs in the child.
    """
    # /usr/include/linux/prctl.h: #define PR_SET_PDEATHSIG 1
    pr_set_pdeathsig = 1
    _libc.prctl(pr_set_pdeathsig, signal.SIGTERM)
|
|
|
|
|
|
|
|
|
|
|
|
def popen(*args, **kwargs):
    """
    Wrapper around subprocess.Popen that always installs _term_on_pdeathsig
    as the child's `preexec_fn` (any `preexec_fn` passed by the caller is
    overridden). All other arguments are forwarded untouched.
    """
    popen_kwargs = dict(kwargs, preexec_fn=_term_on_pdeathsig)
    return subprocess.Popen(*args, **popen_kwargs)
|
|
|
|
|
2019-04-26 18:27:27 +03:00
|
|
|
|
2019-06-25 17:37:05 +03:00
|
|
|
def coverage_enabled(bin):
    """
    Tell whether the binary at path `bin` exposes LLVM coverage mapping data,
    i.e. whether `nm -C` lists a symbol containing __llvm_coverage_mapping.

    `nm` is invoked with an argument list rather than through a shell, so
    paths containing spaces or shell metacharacters are handled safely.

    :param bin: path to the binary or library to inspect
    :return: True if the symbol is listed, False otherwise (including when
             `nm` cannot be run or cannot read the file)
    """
    try:
        symbols = subprocess.run(
            ["nm", "-C", str(bin)], stdout=subprocess.PIPE, check=False
        ).stdout
    except OSError:
        # nm is not installed or could not be executed
        return False
    return b"__llvm_coverage_mapping" in symbols
|
|
|
|
|
|
|
|
|
2019-04-26 18:27:27 +03:00
|
|
|
@contextmanager
def sftp_session(hostname):
    """
    Context manager yielding an SFTP session to `hostname` over a dedicated
    SSH connection. Unknown host keys are accepted automatically.

    Both the SFTP session and the SSH client are closed on exit. The client
    is also closed when connecting or opening the session fails.
    """
    client = paramiko.SSHClient()
    client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    try:
        # connect inside the try block so a failed connection is still closed
        client.connect(hostname)
        session = client.open_sftp()
        try:
            yield session
        finally:
            session.close()
    finally:
        client.close()
|
|
|
|
|
|
|
|
|
|
|
|
def log_errors(out_path, err_path):
    """
    Report the failures found in a node's captured output files.

    Every line of `out_path` containing "[fail ]" or "[fatal]" is logged as
    an error; if there is at least one, the last 10 lines of the file are
    logged too, for context. A non-empty `err_path` is logged in full.
    Files that cannot be read are logged and otherwise ignored.

    :return: tuple (matching lines of out_path, stripped of trailing
             whitespace; raw lines of err_path)
    """
    markers = ("[fail ]", "[fatal]")
    matched = []
    try:
        context = deque(maxlen=10)
        with open(out_path, "r", errors="replace") as out_file:
            for raw in out_file:
                text = raw.rstrip()
                context.append(text)
                if not any(marker in text for marker in markers):
                    continue
                LOG.error("{}: {}".format(out_path, text))
                matched.append(text)
        if matched:
            LOG.info(
                "{} errors found, printing end of output for context:", len(matched)
            )
            for text in context:
                LOG.info(text)
    except IOError:
        LOG.exception("Could not check output {} for errors".format(out_path))

    stderr_lines = []
    try:
        with open(err_path, "r", errors="replace") as err_file:
            stderr_lines = err_file.readlines()
        if stderr_lines:
            LOG.error(f"Contents of {err_path}:\n{''.join(stderr_lines)}")
    except IOError:
        LOG.exception("Could not read err output {}".format(err_path))

    return matched, stderr_lines
|
2019-04-26 18:27:27 +03:00
|
|
|
|
|
|
|
|
|
|
|
class CmdMixin(object):
    """
    Helpers shared by the remotes; expects `self.cmd` to be a list of
    command-line arguments.
    """

    def set_perf(self):
        """
        Prefix self.cmd with a `perf record` invocation.
        """
        perf_prefix = ["perf", "record", "--freq=1000", "--call-graph=dwarf", "-s"]
        self.cmd = perf_prefix + self.cmd

    def _get_perf(self, lines):
        """
        Return, as a float, the throughput captured from the first of `lines`
        (an iterable of bytes) that matches "=> <value>tx/s".

        Raises ValueError if no line matches.
        """
        pattern = "=> (.*)tx/s"
        for raw in lines:
            text = raw.decode()
            LOG.debug(text)
            match = re.search(pattern, text)
            if match is not None:
                return float(match.group(1))
        raise ValueError(f"No performance result found (pattern is {pattern})")
|
2019-08-08 18:11:46 +03:00
|
|
|
|
2019-04-26 18:27:27 +03:00
|
|
|
|
|
|
|
class SSHRemote(CmdMixin):
    def __init__(
        self,
        name,
        hostname,
        exe_files,
        data_files,
        cmd,
        workspace,
        label,
        common_dir,
        env=None,
        json_log_path=None,
    ):
        """
        Runs a command on a remote host, through an SSH connection. A temporary
        directory is created, and some files can be shipped over. The command is
        run out of that directory.

        Note that the name matters, since the temporary directory that will be first
        deleted, then created and populated is workspace/label_name. There is deliberately no
        cleanup on shutdown, to make debugging/inspection possible.

        setup() connects, creates the directory and ships over the files
        start() runs the specified command
        stop() fetches the logs and disconnects, which shuts down the command via SIGHUP

        json_log_path is accepted for signature parity with the other remotes
        and is not used by this class.
        """
        self.hostname = hostname
        self.exe_files = exe_files
        self.data_files = data_files
        self.cmd = cmd
        self.client = paramiko.SSHClient()
        # this client (proc_client) is used to execute commands on the remote host since the main client uses pty
        self.proc_client = paramiko.SSHClient()
        self.client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        self.proc_client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        self.common_dir = common_dir
        self.root = os.path.join(workspace, label + "_" + name)
        self.name = name
        self.env = env or {}
        self.out = os.path.join(self.root, "out")
        self.err = os.path.join(self.root, "err")
        self.suspension_proc = None
        # The pid is read from <basename of the executable>.pid under root
        self.pid_file = f"{os.path.basename(self.cmd[0])}.pid"
        self._pid = None

    def _rc(self, cmd):
        """
        Run `cmd` on the remote host through the main client and return its
        exit status.
        """
        LOG.info("[{}] {}".format(self.hostname, cmd))
        _, stdout, _ = self.client.exec_command(cmd)
        return stdout.channel.recv_exit_status()

    def _connect(self):
        """
        Connect both SSH clients to the remote host.
        """
        LOG.debug("[{}] connect".format(self.hostname))
        self.client.connect(self.hostname)
        self.proc_client.connect(self.hostname)

    def _setup_files(self):
        """
        Recreate the remote root directory and upload exe_files and data_files
        into it.
        """
        assert self._rc("rm -rf {}".format(self.root)) == 0
        assert self._rc("mkdir -p {}".format(self.root)) == 0
        # For SSHRemote, both executable files (host and enclave) and data
        # files (ledger, secrets) are copied to the remote
        session = self.client.open_sftp()
        try:
            for path in self.exe_files:
                tgt_path = os.path.join(self.root, os.path.basename(path))
                LOG.info("[{}] copy {} from {}".format(self.hostname, tgt_path, path))
                session.put(path, tgt_path)
                # Apply the local file mode to the uploaded copy
                stat = os.stat(path)
                session.chmod(tgt_path, stat.st_mode)
            for path in self.data_files:
                # Data files are looked up relative to common_dir
                src_path = os.path.join(self.common_dir, path)
                tgt_path = os.path.join(self.root, os.path.basename(src_path))
                LOG.info(
                    "[{}] copy {} from {}".format(self.hostname, tgt_path, src_path)
                )
                session.put(src_path, tgt_path)
        finally:
            # Close the SFTP session even if one of the transfers fails
            session.close()

    def get(self, file_name, dst_path, timeout=60, target_name=None):
        """
        Get file called `file_name` under the root of the remote. If the
        file is missing, keep retrying until `timeout` seconds have elapsed,
        then raise ValueError.

        If the file is present, it is copied to `dst_path` on the caller's
        host, as `target_name` if it is set, as `file_name` otherwise.

        This call spins up a separate client because we don't want to interrupt
        the main cmd that may be running.
        """
        with sftp_session(self.hostname) as session:
            end_time = time.time() + timeout
            start_time = time.time()
            target_name = target_name or file_name
            while time.time() < end_time:
                try:
                    session.get(
                        os.path.join(self.root, file_name),
                        os.path.join(dst_path, target_name),
                    )
                    LOG.debug(
                        "[{}] found {} after {}s".format(
                            self.hostname, file_name, int(time.time() - start_time)
                        )
                    )
                    break
                except FileNotFoundError:
                    time.sleep(0.1)
            else:
                # Loop ended without a successful download
                raise ValueError(file_name)

    def list_files(self, timeout=60):
        """
        Return the names of the entries under the remote root, retrying on
        any error until `timeout` seconds have elapsed, then raising
        ValueError.
        """
        files = []
        with sftp_session(self.hostname) as session:
            end_time = time.time() + timeout
            while time.time() < end_time:
                try:
                    files = session.listdir(self.root)
                    break
                except Exception:
                    time.sleep(0.1)
            else:
                raise ValueError(self.root)
        return files

    def get_logs(self):
        """
        Download the remote err and out files to common_dir, as
        <hostname>_<name>_err and <hostname>_<name>_out. A missing remote
        file is logged as a warning and skipped.
        """
        with sftp_session(self.hostname) as session:
            for filepath in (self.err, self.out):
                local_file_name = "{}_{}_{}".format(
                    self.hostname, self.name, os.path.basename(filepath),
                )
                dst_path = os.path.join(self.common_dir, local_file_name)
                try:
                    session.get(filepath, dst_path)
                    LOG.info("Downloaded {}".format(dst_path))
                except FileNotFoundError:
                    LOG.warning(
                        "Failed to download {} to {} (host: {})".format(
                            filepath, dst_path, self.hostname
                        )
                    )

    def start(self):
        """
        Start cmd on the remote host. stdout and stderr are redirected to the
        out and err files under the remote root.

        We create a pty on the remote host under which to run the command, so as to
        get a SIGHUP on disconnection.

        Reading the pid file afterwards raises TimeoutError if it does not
        show up in time.
        """
        cmd = self._cmd()
        LOG.info("[{}] {}".format(self.hostname, cmd))
        self.client.exec_command(cmd, get_pty=True)
        self.pid()

    def pid(self):
        """
        Return the pid read from the pid file under the remote root, polling
        for it every 0.1s for about 3s on first use. The value is cached.

        Raises TimeoutError if the pid file stays missing or empty.
        """
        if self._pid is None:
            pid_path = os.path.join(self.root, self.pid_file)
            time_left = 3
            while time_left > 0:
                _, stdout, _ = self.proc_client.exec_command(f'cat "{pid_path}"')
                res = stdout.read().strip()
                if res:
                    self._pid = int(res)
                    break
                time_left = max(time_left - 0.1, 0)
                if not time_left:
                    raise TimeoutError("Failed to read PID from file")
                time.sleep(0.1)
        return self._pid

    def suspend(self):
        """
        Send SIGSTOP to the remote process; raises RuntimeError on failure.
        """
        _, stdout, _ = self.proc_client.exec_command(f"kill -STOP {self.pid()}")
        if stdout.channel.recv_exit_status() == 0:
            LOG.info(f"Node {self.name} suspended...")
        else:
            raise RuntimeError(f"Node {self.name} could not be suspended")

    def resume(self):
        """
        Send SIGCONT to the remote process; raises RuntimeError on failure.
        """
        _, stdout, _ = self.proc_client.exec_command(f"kill -CONT {self.pid()}")
        if stdout.channel.recv_exit_status() != 0:
            raise RuntimeError(f"Could not resume node {self.name} from suspension!")
        LOG.info(f"Node {self.name} resuming from suspension...")

    def stop(self):
        """
        Fetch and scan the logs, then disconnect the clients, and therefore
        shut down the command as well.

        Returns the (errors, fatal_errors) tuple produced by log_errors().
        """
        LOG.info("[{}] closing".format(self.hostname))
        try:
            self.get_logs()
            errors, fatal_errors = log_errors(
                os.path.join(
                    self.common_dir, "{}_{}_out".format(self.hostname, self.name)
                ),
                os.path.join(
                    self.common_dir, "{}_{}_err".format(self.hostname, self.name)
                ),
            )
        finally:
            # Always disconnect, even if the logs could not be retrieved
            self.client.close()
            self.proc_client.close()
        return errors, fatal_errors

    def setup(self):
        """
        Connect to the remote host, empty the temporary directory if it exists,
        and populate it with the initial set of files.
        """
        self._connect()
        self._setup_files()

    def _cmd(self):
        """
        Build the shell command line run on the remote host: change to root,
        set the environment, run cmd with stdout/stderr redirected to the
        out/err files and stdin read from /dev/null.
        """
        env = " ".join(f"{key}={value}" for key, value in self.env.items())
        cmd = " ".join(self.cmd)
        return f"cd {self.root} && {env} {cmd} 1> {self.out} 2> {self.err} 0< /dev/null"

    def _dbg(self):
        """
        Build the shell command line that runs cmd under the DBG debugger.
        """
        cmd = " ".join(self.cmd)
        return f"cd {self.root} && {DBG} --args {cmd}"

    def _connect_new(self):
        """
        Return a new, connected SSH client; the caller must close it.
        """
        client = paramiko.SSHClient()
        client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        client.connect(self.hostname)
        return client

    def check_done(self):
        """
        Return True when `ps -p <pid>` exits with status 1 on the remote host.
        """
        client = self._connect_new()
        try:
            _, stdout, _ = client.exec_command(f"ps -p {self.pid()}")
            return stdout.channel.recv_exit_status() == 1
        finally:
            client.close()

    def get_result(self, line_count):
        """
        Parse the performance result out of the last `line_count` lines of
        the remote out file (see _get_perf).

        Returns None if `tail` fails on the remote host; raises ValueError
        if no result is found in the lines.
        """
        client = self._connect_new()
        try:
            _, stdout, _ = client.exec_command(f"tail -{line_count} {self.out}")
            if stdout.channel.recv_exit_status() == 0:
                lines = stdout.read().splitlines()
                result = lines[-line_count:]
                return self._get_perf(result)
        finally:
            client.close()
|
|
|
|
|
2019-04-26 18:27:27 +03:00
|
|
|
|
|
|
|
@contextmanager
def ssh_remote(*args, **kwargs):
    """
    Context manager around SSHRemote: builds it from the given arguments,
    runs setup() and start(), yields it, and always calls stop() on exit.
    """
    node = SSHRemote(*args, **kwargs)
    try:
        node.setup()
        node.start()
        yield node
    finally:
        node.stop()
|
|
|
|
|
|
|
|
|
|
|
|
class LocalRemote(CmdMixin):
    def __init__(
        self,
        name,
        hostname,
        exe_files,
        data_files,
        cmd,
        workspace,
        label,
        common_dir,
        env=None,
        json_log_path=None,
    ):
        """
        Local Equivalent to the SSHRemote

        The command is run as a local child process out of workspace/label_name.
        json_log_path is accepted for signature parity and is not used here.
        """
        self.hostname = hostname
        self.exe_files = exe_files
        self.data_files = data_files
        self.cmd = cmd
        self.root = os.path.join(workspace, label + "_" + name)
        self.common_dir = common_dir
        self.proc = None
        self.stdout = None
        self.stderr = None
        # Passed as-is to Popen: None means the child inherits our environment
        self.env = env
        self.name = name
        self.out = os.path.join(self.root, "out")
        self.err = os.path.join(self.root, "err")

    def _rc(self, cmd):
        """
        Run `cmd` through the local shell and return its exit status.
        """
        LOG.info("[{}] {}".format(self.hostname, cmd))
        return subprocess.call(cmd, shell=True)

    def _setup_files(self):
        """
        Recreate the root directory, symlink exe_files into it and copy
        data_files (looked up relative to common_dir) into it.
        """
        assert self._rc("rm -rf {}".format(self.root)) == 0
        assert self._rc("mkdir -p {}".format(self.root)) == 0
        for path in self.exe_files:
            dst_path = os.path.normpath(os.path.join(self.root, os.path.basename(path)))
            src_path = os.path.normpath(os.path.join(os.getcwd(), path))
            assert self._rc("ln -s {} {}".format(src_path, dst_path)) == 0
        for path in self.data_files:
            dst_path = self.root
            src_path = os.path.join(self.common_dir, path)
            assert self._rc("cp {} {}".format(src_path, dst_path)) == 0

    def get(self, file_name, dst_path, timeout=60, target_name=None):
        """
        Copy `file_name` from the root directory to `dst_path`, as
        `target_name` if it is set, as `file_name` otherwise.

        Waits up to `timeout` seconds for the file to exist and raises
        ValueError if it still does not.
        """
        path = os.path.join(self.root, file_name)
        end_time = time.time() + timeout
        while time.time() < end_time:
            if os.path.exists(path):
                break
            time.sleep(0.1)
        else:
            raise ValueError(path)
        target_name = target_name or file_name
        assert (
            self._rc("cp {} {}".format(path, os.path.join(dst_path, target_name))) == 0
        )

    def list_files(self, timeout=60):
        """
        Return the names of the entries under the root directory.

        `timeout` is accepted for signature parity with SSHRemote.list_files
        and is ignored.
        """
        return os.listdir(self.root)

    def start(self, timeout=10):
        """
        Start cmd. stdout and err are captured to file locally.

        The process is spawned directly from the self.cmd argument list; the
        shell-style command line is only built for logging. `timeout` is
        currently unused.
        """
        cmd = self._cmd()
        LOG.info(f"[{self.hostname}] {cmd} (env: {self.env})")
        self.stdout = open(self.out, "wb")
        self.stderr = open(self.err, "wb")
        self.proc = popen(
            self.cmd,
            cwd=self.root,
            stdout=self.stdout,
            stderr=self.stderr,
            env=self.env,
        )

    def suspend(self):
        """
        Send SIGSTOP to the child process.
        """
        self.proc.send_signal(signal.SIGSTOP)
        LOG.info(f"Node {self.name} suspended...")

    def resume(self):
        """
        Send SIGCONT to the child process.
        """
        self.proc.send_signal(signal.SIGCONT)
        LOG.info(f"Node {self.name} resuming from suspension...")

    def stop(self):
        """
        Terminate the child process (waiting up to 10s for it to exit), close
        the captured output files and scan them for errors.

        Returns the (errors, fatal_errors) tuple produced by log_errors().
        Raises subprocess.TimeoutExpired if the process does not exit in
        time; the output files are closed in that case too.
        """
        LOG.info("[{}] closing".format(self.hostname))
        try:
            if self.proc:
                self.proc.terminate()
                self.proc.wait(10)
        finally:
            # Never leak the file handles, even if the wait times out
            if self.stdout:
                self.stdout.close()
            if self.stderr:
                self.stderr.close()
        return log_errors(self.out, self.err)

    def setup(self):
        """
        Empty the temporary directory if it exists,
        and populate it with the initial set of files.
        """
        self._setup_files()

    def _cmd(self):
        """
        Build a shell-style rendering of the command line.
        """
        cmd = " ".join(self.cmd)
        return f"cd {self.root} && {cmd} 1> {self.out} 2> {self.err}"

    def _dbg(self):
        """
        Build the shell command line that runs cmd under the DBG debugger.
        """
        cmd = " ".join(self.cmd)
        return f"cd {self.root} && {DBG} --args {cmd}"

    def check_done(self):
        """
        Return True once the child process has exited.
        """
        return self.proc.poll() is not None

    def get_result(self, line_count):
        """
        Parse the performance result out of the last `line_count` lines of
        the out file (see _get_perf); raises ValueError if none is found.
        """
        with open(self.out, "rb") as out:
            lines = out.read().splitlines()
            result = lines[-line_count:]
            return self._get_perf(result)
|
2019-06-07 18:35:22 +03:00
|
|
|
|
2019-04-26 18:27:27 +03:00
|
|
|
|
2019-05-24 15:46:17 +03:00
|
|
|
# Maps a CCF host log level name to the value exported to the node through
# the OE_LOG_LEVEL environment variable.
CCF_TO_OE_LOG_LEVEL = dict(
    trace="VERBOSE",
    debug="INFO",
    info="WARNING",
    fail="ERROR",
    fatal="FATAL",
)
|
|
|
|
|
|
|
|
|
2019-04-26 18:27:27 +03:00
|
|
|
class CCFRemote(object):
    BIN = "cchost"
    DEPS = []

    def __init__(
        self,
        start_type,
        lib_path,
        local_node_id,
        host,
        pubhost,
        node_port,
        rpc_port,
        remote_class,
        enclave_type,
        workspace,
        label,
        common_dir,
        target_rpc_address=None,
        members_info=None,
        join_timer=None,
        host_log_level="info",
        sig_max_tx=1000,
        sig_max_ms=1000,
        raft_election_timeout=1000,
        pbft_view_change_timeout=5000,
        consensus="raft",
        worker_threads=0,
        memory_reserve_startup=0,
        notify_server=None,
        gov_script=None,
        ledger_file=None,
        sealed_secrets=None,
        json_log_path=None,
        binary_dir=".",
    ):
        """
        Run a ccf binary on a remote host.

        Builds the node's command line, environment and list of files to
        ship, then instantiates `remote_class` with them; nothing is run
        until setup() and start() are called.

        Raises ValueError if `start_type` is not a known StartType, if a new
        network is started without `gov_script` or `members_info`, or if
        `notify_server` is not of the form host:port.
        """
        self.start_type = start_type
        self.local_node_id = local_node_id
        self.host = host
        self.pubhost = pubhost
        self.node_port = node_port
        self.rpc_port = rpc_port
        self.pem = "{}.pem".format(local_node_id)
        self.BIN = infra.path.build_bin_path(
            self.BIN, enclave_type, binary_dir=binary_dir
        )
        self.ledger_file = ledger_file
        self.ledger_file_name = (
            os.path.basename(ledger_file) if ledger_file else f"{local_node_id}.ledger"
        )
        self.common_dir = common_dir

        exe_files = [self.BIN, lib_path] + self.DEPS
        data_files = [self.ledger_file] if self.ledger_file else []

        # exe_files may be relative or absolute. The remote implementation should
        # copy (or symlink) to the target workspace, and then node will be able
        # to reference the destination file locally in the target workspace.
        bin_path = os.path.join(".", os.path.basename(self.BIN))
        enclave_path = os.path.join(".", os.path.basename(lib_path))

        election_timeout_arg = (
            f"--pbft_view-change-timeout-ms={pbft_view_change_timeout}"
            if consensus == "pbft"
            else f"--raft-election-timeout-ms={raft_election_timeout}"
        )

        cmd = [
            bin_path,
            f"--enclave-file={enclave_path}",
            f"--enclave-type={enclave_type}",
            f"--node-address={host}:{node_port}",
            f"--rpc-address={host}:{rpc_port}",
            f"--public-rpc-address={pubhost}:{rpc_port}",
            f"--ledger-file={self.ledger_file_name}",
            f"--node-cert-file={self.pem}",
            f"--host-log-level={host_log_level}",
            election_timeout_arg,
            f"--consensus={consensus}",
            f"--worker-threads={worker_threads}",
        ]

        if json_log_path:
            log_file = f"{label}_{local_node_id}"
            cmd += [f"--json-log-path={os.path.join(json_log_path, log_file)}"]

        if sig_max_tx:
            cmd += [f"--sig-max-tx={sig_max_tx}"]

        if sig_max_ms:
            cmd += [f"--sig-max-ms={sig_max_ms}"]

        if memory_reserve_startup:
            cmd += [f"--memory-reserve-startup={memory_reserve_startup}"]

        if notify_server:
            notify_server_host, *notify_server_port = notify_server.split(":")

            if not notify_server_host or not (
                notify_server_port and notify_server_port[0]
            ):
                raise ValueError(
                    "Notification server host:port configuration is invalid"
                )

            cmd += [
                f"--notify-server-address={notify_server_host}:{notify_server_port[0]}"
            ]

        if start_type == StartType.new:
            # Validate the inputs before they are used to build the command
            if gov_script is None:
                raise ValueError("Starting node should be given a governance script")
            if members_info is None:
                raise ValueError(
                    "Starting node should be given at least one pair member certificate, member public encryption key"
                )
            cmd += [
                "start",
                "--network-cert-file=networkcert.pem",
                f"--gov-script={os.path.basename(gov_script)}",
            ]
            for mc, mk in members_info:
                cmd += [f"--member-info={mc},{mk}"]
                data_files.append(mc)
                data_files.append(mk)
            data_files += [os.path.basename(gov_script)]
        elif start_type == StartType.join:
            cmd += [
                "join",
                "--network-cert-file=networkcert.pem",
                f"--target-rpc-address={target_rpc_address}",
            ]
            # Only pass the option when a value was given, rather than the
            # literal string "None"
            if join_timer is not None:
                cmd += [f"--join-timer={join_timer}"]
            data_files += ["networkcert.pem"]
        elif start_type == StartType.recover:
            cmd += ["recover", "--network-cert-file=networkcert.pem"]
        else:
            raise ValueError(
                f"Unexpected CCFRemote start type {start_type}. Should be start, join or recover"
            )

        # Necessary for the az-dcap-client >=1.1 (https://github.com/microsoft/Azure-DCAP-Client/issues/84)
        env = {"HOME": os.environ["HOME"]}
        self.profraw = None
        if enclave_type == "virtual":
            env["UBSAN_OPTIONS"] = "print_stacktrace=1"
            if coverage_enabled(lib_path):
                # Unique profile file name, retrieved again in stop()
                self.profraw = f"{uuid.uuid4()}-{local_node_id}_{os.path.basename(lib_path)}.profraw"
                env["LLVM_PROFILE_FILE"] = self.profraw

        oe_log_level = CCF_TO_OE_LOG_LEVEL.get(host_log_level)
        if oe_log_level:
            env["OE_LOG_LEVEL"] = oe_log_level

        self.remote = remote_class(
            local_node_id,
            host,
            exe_files,
            data_files,
            cmd,
            workspace,
            label,
            common_dir,
            env,
            json_log_path,
        )

    def setup(self):
        """
        Prepare the underlying remote (see the remote's setup()).
        """
        self.remote.setup()

    def start(self):
        """
        Start the node on the underlying remote.
        """
        self.remote.start()

    def suspend(self):
        """
        Suspend the node; returns whatever the remote's suspend() returns.
        """
        return self.remote.suspend()

    def resume(self):
        """
        Resume a suspended node.
        """
        self.remote.resume()

    def get_startup_files(self, dst_path):
        """
        Fetch the node certificate into `dst_path`, plus networkcert.pem and
        network_enc_pubk.pem when the node was started with `new` or `recover`.
        """
        self.remote.get(self.pem, dst_path)
        if self.start_type in {StartType.new, StartType.recover}:
            self.remote.get("networkcert.pem", dst_path)
            self.remote.get("network_enc_pubk.pem", dst_path)

    def debug_node_cmd(self):
        """
        Return the command line that runs the node under a debugger.
        """
        return self.remote._dbg()

    def stop(self):
        """
        Stop the node and return the (errors, fatal_errors) found in its logs.

        Failures while stopping are logged, not raised, in which case two
        empty lists are returned. The coverage profile, if any, is fetched
        into common_dir on a best-effort basis.
        """
        errors, fatal_errors = [], []
        try:
            errors, fatal_errors = self.remote.stop()
        except Exception:
            LOG.exception("Failed to shut down {} cleanly".format(self.local_node_id))
        if self.profraw:
            try:
                self.remote.get(self.profraw, self.common_dir)
            except Exception:
                LOG.info(f"Could not retrieve {self.profraw}")
        return errors, fatal_errors

    def check_done(self):
        """
        Tell whether the node process has exited.
        """
        return self.remote.check_done()

    def set_perf(self):
        """
        Run the node under `perf record` (see the remote's set_perf()).
        """
        self.remote.set_perf()

    def get_sealed_secrets(self):
        """
        Fetch into common_dir the sealed secrets file of the node whose name
        sorts last, and return its local path.

        Raises IndexError if the node has no sealed_secrets.* file.
        """
        sealed_secrets_files = [
            f for f in self.remote.list_files() if f.startswith("sealed_secrets.")
        ]
        if not sealed_secrets_files:
            raise IndexError(
                f"No sealed secrets file found in {self.remote.root}"
            )
        latest_sealed_secrets = max(sealed_secrets_files)
        self.remote.get(latest_sealed_secrets, self.common_dir)

        return os.path.join(self.common_dir, latest_sealed_secrets)

    def get_ledger(self):
        """
        Fetch the ledger file into common_dir and return its file name (not
        its full path).
        """
        self.remote.get(self.ledger_file_name, self.common_dir)
        return self.ledger_file_name

    def ledger_path(self):
        """
        Return the path of the ledger file under the remote's root directory.
        """
        return os.path.join(self.remote.root, self.ledger_file_name)
|
|
|
|
|
2019-04-26 18:27:27 +03:00
|
|
|
|
|
|
|
@contextmanager
def ccf_remote(*args, **kwargs):
    """
    Context manager around CCFRemote: builds it from the given arguments,
    runs setup() and start(), yields it, and always calls stop() on exit.
    """
    node = CCFRemote(*args, **kwargs)
    try:
        node.setup()
        node.start()
        yield node
    finally:
        node.stop()
|
|
|
|
|
|
|
|
|
2019-08-28 12:57:45 +03:00
|
|
|
class StartType(Enum):
    """
    How a CCFRemote node is brought up; selects the sub-command appended to
    the node's command line.
    """

    # Node is launched with the "start" sub-command
    new = 0
    # Node is launched with the "join" sub-command
    join = 1
    # Node is launched with the "recover" sub-command
    recover = 2
|