mutual trust
This commit is contained in:
Родитель
70962cf9fb
Коммит
32a40ca298
|
@ -145,6 +145,7 @@ void* Process::ForkThread(void* arg)
|
|||
{
|
||||
Process* const p = static_cast<Process* const>(arg);
|
||||
std::string path;
|
||||
std::string nodesNum;
|
||||
|
||||
Start:
|
||||
int ret = p->CreateTaskFolder();
|
||||
|
@ -182,9 +183,16 @@ Start:
|
|||
Logger::Error(p->jobId, p->taskId, p->requeueCount, "Failed to create environment file for docker task.");
|
||||
goto Final;
|
||||
}
|
||||
|
||||
auto it = p->environments.find(std::string("CCP_NODES"));
|
||||
if (it != p->environments.end())
|
||||
{
|
||||
std::string ccp_nodes = it->second;
|
||||
nodesNum = ccp_nodes.substr(0, ccp_nodes.find(' '));
|
||||
}
|
||||
}
|
||||
|
||||
if (0 != p->ExecuteCommand("/bin/bash", "PrepareTask.sh", p->taskExecutionId, p->GetAffinity(), p->taskFolder, p->userName, p->dockerImage))
|
||||
if (0 != p->ExecuteCommand("/bin/bash", "PrepareTask.sh", p->taskExecutionId, p->GetAffinity(), p->taskFolder, p->userName, p->dockerImage, nodesNum))
|
||||
{
|
||||
goto Final;
|
||||
}
|
||||
|
|
|
@ -132,27 +132,41 @@ pplx::task<json::value> RemoteExecutor::StartTask(StartTaskArgs&& args, std::str
|
|||
taskInfo->Affinity = args.StartInfo.Affinity;
|
||||
taskInfo->SetTaskRequeueCount(args.StartInfo.TaskRequeueCount);
|
||||
|
||||
std::string userName = "root";
|
||||
auto jobUser = this->jobUsers.find(args.JobId);
|
||||
if (jobUser == this->jobUsers.end())
|
||||
{
|
||||
this->jobTaskTable.RemoveJob(args.JobId);
|
||||
throw std::runtime_error(String::Join(" ", "Job", args.JobId, "was not started on this node."));
|
||||
}
|
||||
else
|
||||
{
|
||||
userName = std::get<0>(jobUser->second);
|
||||
}
|
||||
|
||||
if (args.StartInfo.CommandLine.empty())
|
||||
{
|
||||
Logger::Info(args.JobId, args.TaskId, args.StartInfo.TaskRequeueCount, "MPI non-master task found, skip creating the process.");
|
||||
std::string dockerImage = args.StartInfo.EnvironmentVariables["CCP_DOCKERIMAGE"];
|
||||
if (!dockerImage.empty())
|
||||
{
|
||||
taskInfo->IsPrimaryTask = false;
|
||||
std::string output;
|
||||
if (0 != System::ExecuteCommandOut(output, "/bin/bash", "StartMpiContainer.sh", taskInfo->TaskId, userName, dockerImage))
|
||||
{
|
||||
Logger::Info(taskInfo->JobId, taskInfo->TaskId, taskInfo->GetTaskRequeueCount(), "Start MPI container successfully.");
|
||||
}
|
||||
else
|
||||
{
|
||||
Logger::Error(taskInfo->JobId, taskInfo->TaskId, taskInfo->GetTaskRequeueCount(), "Start MPI container failed. {0}", output);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (this->processes.find(taskInfo->ProcessKey) == this->processes.end() &&
|
||||
isNewEntry)
|
||||
{
|
||||
std::string userName = "root";
|
||||
auto jobUser = this->jobUsers.find(args.JobId);
|
||||
if (jobUser == this->jobUsers.end())
|
||||
{
|
||||
this->jobTaskTable.RemoveJob(args.JobId);
|
||||
throw std::runtime_error(String::Join(" ", "Job", args.JobId, "was not started on this node."));
|
||||
}
|
||||
else
|
||||
{
|
||||
userName = std::get<0>(jobUser->second);
|
||||
}
|
||||
|
||||
auto process = std::shared_ptr<Process>(new Process(
|
||||
taskInfo->JobId,
|
||||
taskInfo->TaskId,
|
||||
|
@ -264,7 +278,7 @@ pplx::task<json::value> RemoteExecutor::EndJob(hpc::arguments::EndJobArgs&& args
|
|||
{
|
||||
const auto* stat = this->TerminateTask(
|
||||
args.JobId, taskPair.first, taskInfo->GetTaskRequeueCount(),
|
||||
taskInfo->ProcessKey, (int)ErrorCodes::EndJobExitCode, true);
|
||||
taskInfo->ProcessKey, (int)ErrorCodes::EndJobExitCode, true, taskInfo->IsPrimaryTask == false);
|
||||
Logger::Debug(args.JobId, taskPair.first, taskInfo->GetTaskRequeueCount(), "EndJob: Terminating task");
|
||||
if (stat != nullptr)
|
||||
{
|
||||
|
@ -397,7 +411,8 @@ pplx::task<json::value> RemoteExecutor::EndTask(hpc::arguments::EndTaskArgs&& ar
|
|||
args.JobId, args.TaskId, taskInfo->GetTaskRequeueCount(),
|
||||
taskInfo->ProcessKey,
|
||||
(int)ErrorCodes::EndTaskExitCode,
|
||||
args.TaskCancelGracePeriodSeconds == 0);
|
||||
args.TaskCancelGracePeriodSeconds == 0
|
||||
taskInfo->IsPrimaryTask == false);
|
||||
|
||||
taskInfo->ExitCode = (int)ErrorCodes::EndTaskExitCode;
|
||||
|
||||
|
@ -466,7 +481,8 @@ void* RemoteExecutor::GracePeriodElapsed(void* data)
|
|||
jobId, taskId, requeueCount,
|
||||
processKey,
|
||||
(int)ErrorCodes::EndTaskExitCode,
|
||||
true);
|
||||
true,
|
||||
false);
|
||||
|
||||
if (stat != nullptr)
|
||||
{
|
||||
|
@ -670,8 +686,23 @@ pplx::task<json::value> RemoteExecutor::MetricConfig(
|
|||
|
||||
const ProcessStatistics* RemoteExecutor::TerminateTask(
|
||||
int jobId, int taskId, int requeueCount,
|
||||
uint64_t processKey, int exitCode, bool forced)
|
||||
uint64_t processKey, int exitCode, bool forced, bool mpiDockerTask)
|
||||
{
|
||||
if (mpiDockerTask)
|
||||
{
|
||||
std::string output;
|
||||
if (0 != System::ExecuteCommandOut(output, "/bin/bash", "StopMpiContainer.sh", taskId))
|
||||
{
|
||||
Logger::Info(jobId, taskId, requeueCount, "Stop MPI container successfully.");
|
||||
}
|
||||
else
|
||||
{
|
||||
Logger::Error(jobId, taskId, requeueCount, "Stop MPI container failed. {0}", output);
|
||||
}
|
||||
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
auto p = this->processes.find(processKey);
|
||||
// Logger::Debug(
|
||||
// jobId, taskId, requeueCount,
|
||||
|
|
|
@ -42,7 +42,7 @@ namespace hpc
|
|||
|
||||
const hpc::data::ProcessStatistics* TerminateTask(
|
||||
int jobId, int taskId, int requeueCount,
|
||||
uint64_t processKey, int exitCode, bool forced);
|
||||
uint64_t processKey, int exitCode, bool forced, bool mpiDockerTask);
|
||||
|
||||
void ReportTaskCompletion(int jobId, int taskId, int taskRequeueCount, json::value jsonBody, const std::string& callbackUri);
|
||||
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
docker version > /dev/nul
|
||||
if [ $? -eq 0 ]; then
|
||||
echo "Cleaning up docker containers..."
|
||||
docker rm -f $(docker ps -a -q -f name=^/$(GetContainerName))
|
||||
docker rm -f $(docker ps -a -q -f name=^/$(GetContainerName)) 2>/dev/nul
|
||||
fi
|
||||
|
||||
if $CGInstalled; then
|
||||
|
|
|
@ -13,6 +13,7 @@ isDockerTask=$(CheckNotEmpty $3)
|
|||
|
||||
if [ "$isDockerTask" == "1" ]; then
|
||||
docker rm -f $(GetContainerName $taskId)
|
||||
/etc/init.d/ssh start
|
||||
exit
|
||||
fi
|
||||
|
||||
|
|
|
@ -0,0 +1,34 @@
|
|||
#!/bin/bash

# Prepares an already-running MPI docker container for SSH access:
#   1. copies the SSH directory found at $tmpSshDir inside the container into
#      the user's SSH directory and gives that user ownership of it;
#   2. stops the SSH server on the host;
#   3. starts the SSH server inside the container.
# NOTE(review): presumably the host sshd is stopped because the container is
# started with "--network host" and both would want the same port -- confirm.
#
# Usage: MpiContainerPreparation.sh <containerName> <userName>
# Exit status: 0 on success, otherwise the status of the step that failed.

. common.sh

containerName=$1
userName=$2

userSshDir=$(GetUserSshDir "$userName")

docker exec "$containerName" /bin/cp -r "$tmpSshDir" "$userSshDir" 2>&1
ec1=$?
docker exec "$containerName" /bin/chown -R "$userName" "$userSshDir" 2>&1
ec2=$?
if [ $ec1 -ne 0 ] || [ $ec2 -ne 0 ]
then
    echo "Failed to set container ssh key"
    # Propagate the status of the first step that failed. Exiting with an
    # unset variable here would make the script report success (status of echo).
    if [ $ec1 -ne 0 ]
    then
        exit $ec1
    fi
    exit $ec2
fi

/etc/init.d/ssh stop
ec=$?
if [ $ec -ne 0 ]
then
    echo "Failed to stop host ssh server"
    exit $ec
fi

docker exec "$containerName" /etc/init.d/ssh start
ec=$?
if [ $ec -ne 0 ]
then
    echo "Failed to start container ssh server"
    exit $ec
fi
|
|
@ -10,17 +10,31 @@ affinity=$2
|
|||
|
||||
# for docker command
|
||||
taskFolder=$3
|
||||
dockerImage=$4
|
||||
userName=$4
|
||||
dockerImage=$5
|
||||
nodeNum=$6
|
||||
|
||||
isDockerTask=$(CheckNotEmpty $dockerImage)
|
||||
isMpiPrimaryTask=[]
|
||||
|
||||
if [ "$isDockerTask" == "1" ]; then
|
||||
placeholderCommand="/bin/bash"
|
||||
docker run -id --name $(GetContainerName $taskId) --cpuset-cpus $affinity -v $taskFolder:$taskFolder:z $dockerImage $placeholderCommand 2>&1
|
||||
containerName=$(GetContainerName $taskId)
|
||||
if [ "$nodeNum" -gt 1 ]; then
|
||||
mpiContainerStartOption=$(GetMpiContainerStartOption $userName)
|
||||
fi
|
||||
|
||||
docker run -id \
|
||||
--name $containerName \
|
||||
--cpuset-cpus $affinity \
|
||||
--env-file $taskFolder/environments \
|
||||
-v $taskFolder:$taskFolder:z \
|
||||
$mpiContainerStartOption \
|
||||
$dockerImage $containerPlaceholderCommand 2>&1
|
||||
|
||||
ec=$?
|
||||
if [ $ec -ne 0 ]
|
||||
then
|
||||
echo "Failed to start docker container"
|
||||
exit $ec
|
||||
fi
|
||||
|
||||
|
@ -34,6 +48,14 @@ if [ "$isDockerTask" == "1" ]; then
|
|||
echo "Failed to set docker container placeholder $tasks"
|
||||
exit $ec
|
||||
fi
|
||||
|
||||
docker exec $containerName useradd -m $userName
|
||||
docker exec $containerName chown $userName $taskFolder
|
||||
if [ "$nodeNum" -gt 1 ]; then
|
||||
/bin/bash MpiContainerPreparation.sh $containerName $userName
|
||||
fi
|
||||
|
||||
exit
|
||||
fi
|
||||
|
||||
if $CGInstalled; then
|
||||
|
|
|
@ -1,28 +0,0 @@
|
|||
#!/bin/bash
|
||||
|
||||
. common.sh
|
||||
|
||||
[ -z "$1" ] && echo "task execution id not specified" && exit 202
|
||||
[ -z "$2" ] && echo "run.sh not specified" && exit 202
|
||||
[ -z "$3" ] && echo "user name not specified" && exit 202
|
||||
|
||||
rootLogFolder=/opt/hpcnodemanager/logs
|
||||
trustLogFile="${rootLogFolder}/${1}_trust.txt"
|
||||
failedTrustLogFile="${rootLogFolder}/failed_${1}_trust.txt"
|
||||
trustKeysDir="${rootLogFolder}/${1}_${3}/"
|
||||
sshFolder="/home/${3}/.ssh/"
|
||||
if [ "$3" = "root" ]; then
|
||||
sshFolder=/root/.ssh/
|
||||
fi
|
||||
|
||||
/bin/bash WaitForTrust.sh "$3" "$1" > "$trustLogFile" 2>&1
|
||||
if [ $? -ne 0 ]; then
|
||||
mv "$trustLogFile" "$failedTrustLogFile"
|
||||
mkdir -p "$trustKeysDir" > /dev/null
|
||||
cp -rf "${sshFolder}*" "$trustKeysDir"
|
||||
|
||||
exit 203
|
||||
fi
|
||||
|
||||
/bin/bash "$2"
|
||||
|
|
@ -0,0 +1,27 @@
|
|||
#!/bin/bash

# Starts a detached placeholder docker container named after "MPI_<taskId>",
# creates the task user inside it and runs MpiContainerPreparation.sh on it.
#
# Usage: StartMpiContainer.sh <taskId> <userName> <dockerImage>
# Exit status:
#   202  a required argument is missing
#   the "docker run" status if the container could not be started
#   otherwise the status of MpiContainerPreparation.sh

. common.sh

[ -z "$1" ] && echo "task id not specified" && exit 202
[ -z "$2" ] && echo "user name not specified" && exit 202
[ -z "$3" ] && echo "docker image not specified" && exit 202

taskId=$1
userName=$2
dockerImage=$3

containerName=$(GetContainerName "MPI_$taskId")
mpiContainerStartOption=$(GetMpiContainerStartOption "$userName")

# $mpiContainerStartOption and $containerPlaceholderCommand are deliberately
# left unquoted: each holds several words that must become separate arguments.
docker run -id \
    --name "$containerName" \
    $mpiContainerStartOption \
    "$dockerImage" $containerPlaceholderCommand 2>&1

# Capture the status immediately; the following test overwrites $?.
ec=$?
if [ $ec -ne 0 ]
then
    exit $ec
fi

docker exec "$containerName" useradd -m "$userName"
/bin/bash MpiContainerPreparation.sh "$containerName" "$userName"
|
|
@ -12,10 +12,12 @@ userName=$3
|
|||
|
||||
dockerImage=$4
|
||||
isDockerTask=$(CheckNotEmpty $dockerImage)
|
||||
runDir=$(GetParentDir $runPath)
|
||||
cp {TestMutualTrust.sh,WaitForTrust.sh} $runDir
|
||||
|
||||
if [ "$isDockerTask" == "1" ]; then
|
||||
containerName=$(GetContainerName $taskId)
|
||||
docker exec $containerName useradd $userName 2> /dev/null
|
||||
docker exec $containerName /bin/bash -c "$runDir/TestMutualTrust.sh $taskId $runDir $userName" &&\
|
||||
docker exec -u $userName $containerName /bin/bash $runPath
|
||||
exit
|
||||
fi
|
||||
|
@ -23,8 +25,9 @@ fi
|
|||
if $CGInstalled; then
|
||||
groupName=$(GetCGroupName "$taskId")
|
||||
group=$CGroupSubSys:$groupName
|
||||
cgexec -g "$group" /bin/bash RunTask.sh "$@"
|
||||
cgexec -g "$group" /bin/bash $runDir/TestMutualTrust.sh "$taskId" "$runDir" "$userName" &&\
|
||||
cgexec -g "$group" /bin/bash $runPath
|
||||
else
|
||||
/bin/bash RunTask.sh "$@"
|
||||
/bin/bash $runDir/TestMutualTrust.sh "$taskId" "$runDir" "$userName" &&\
|
||||
/bin/bash $runPath
|
||||
fi
|
||||
|
||||
|
|
|
@ -0,0 +1,12 @@
|
|||
#!/bin/bash

# Force-removes the docker container named after "MPI_<taskId>" and then
# starts the SSH server on the host again.
#
# Usage: StopMpiContainer.sh <taskId>
# Exits with 202 when no task id is given; otherwise the exit status is that
# of the final "ssh start" command.

. common.sh

if [ -z "$1" ]; then
    echo "task id not specified" && exit 202
fi

taskId=$1

docker rm -f $(GetContainerName "MPI_$taskId")
/etc/init.d/ssh start
|
|
@ -0,0 +1,25 @@
|
|||
#!/bin/bash

# Runs WaitForTrust.sh from the run directory and reports whether it succeeded.
#
# Usage: TestMutualTrust.sh <taskExecutionId> <runDir> <userName>
#
# The output of WaitForTrust.sh is written to <runDir>/<taskExecutionId>_trust.txt.
# When WaitForTrust.sh fails, the content of the user's SSH folder is copied to
# <runDir>/<taskExecutionId>_<userName>/ and the script exits with 203.
# Exit status 202 means a required argument is missing.

[ -z "$1" ] && echo "task execution id not specified" && exit 202
[ -z "$2" ] && echo "run directory not specified" && exit 202
[ -z "$3" ] && echo "user name not specified" && exit 202

# use parent directory of run.sh as run directory
runDir="$2"
trustLogFile="${runDir}/${1}_trust.txt"
trustKeysDir="${runDir}/${1}_${3}/"
sshFolder="/home/${3}/.ssh/"
if [ "$3" = "root" ]; then
    sshFolder=/root/.ssh/
fi

mkdir -p "$runDir" > /dev/null
/bin/bash "$runDir/WaitForTrust.sh" "$3" "$1" "$runDir" > "$trustLogFile" 2>&1
if [ $? -ne 0 ]; then
    echo "Mutual trust failure." >&2
    mkdir -p "$trustKeysDir" > /dev/null
    # The glob must stay outside the quotes, otherwise cp looks for a file
    # literally named "*" and the snapshot is never taken.
    # NOTE(review): this copies everything in the SSH folder, which may include
    # private keys, into the run directory -- confirm that is acceptable.
    cp -rf "${sshFolder}"* "$trustKeysDir"

    exit 203
fi
|
|
@ -15,6 +15,7 @@ nodes=()
|
|||
testpids=()
|
||||
userName=$1
|
||||
taskExecutionId=$2
|
||||
rootLogFolder=$3
|
||||
totalWaitTime=90
|
||||
singleTryWaitTime=45
|
||||
|
||||
|
@ -36,7 +37,7 @@ fi
|
|||
|
||||
for node in "${nodes[@]}"
|
||||
do
|
||||
nodeLogFile=logs/ssh_${taskExecutionId}_${node}.log
|
||||
nodeLogFile=$rootLogFolder/ssh_${taskExecutionId}_${node}.log
|
||||
touch "$nodeLogFile"
|
||||
chown "$userName" "$nodeLogFile"
|
||||
echo >> "$nodeLogFile"
|
||||
|
@ -66,7 +67,7 @@ do
|
|||
|
||||
if [ $exitcode != 0 ]; then
|
||||
finished=false
|
||||
nodeLogFile=logs/ssh_${taskExecutionId}_${nodes[$i]}.log
|
||||
nodeLogFile=$rootLogFolder/ssh_${taskExecutionId}_${nodes[$i]}.log
|
||||
echo >> "$nodeLogFile"
|
||||
echo ">> SECONDS=$SECONDS" >> "$nodeLogFile"
|
||||
sync
|
||||
|
@ -95,7 +96,6 @@ else
|
|||
echo "not all trusted task=$taskExecutionId. If you pre-configured any ssh keys, make sure they are working for establishing trust relationship between nodes." > /dev/stderr
|
||||
echo
|
||||
echo "Saving logs"
|
||||
rootLogFolder=/opt/hpcnodemanager/logs
|
||||
trustKeysDir=${rootLogFolder}/${taskExecutionId}_${userName}/
|
||||
sshFolder=/home/${userName}/.ssh/
|
||||
if [ "$userName" = "root" ]; then
|
||||
|
|
|
@ -88,4 +88,42 @@ function CheckNotEmpty
|
|||
else
|
||||
echo 0
|
||||
fi
|
||||
}
|
||||
}
|
||||
|
||||
# Prints 1 when the file "id_rsa" exists in the SSH directory that
# GetUserSshDir reports for the given user, otherwise prints 0.
#   $1 - user name
function CheckUserSshKeyExistence
{
    local userName=$1
    local userSshDir
    userSshDir=$(GetUserSshDir "$userName")
    # Quoted so that a path containing whitespace is tested as a single file name.
    if [ -f "$userSshDir/id_rsa" ]; then
        echo 1
    else
        echo 0
    fi
}
|
||||
|
||||
# Prints the given path with its last "/<component>" removed, i.e. the parent
# directory of the path. A path without any "/" is printed unchanged.
#   $1 - path
function GetParentDir
{
    # Parameter expansion strips the shortest trailing "/..." match without
    # word-splitting or glob-expanding the path, so whitespace is preserved.
    printf '%s\n' "${1%/*}"
}
|
||||
|
||||
# Prints the SSH directory for a user: "/root/.ssh" for root and
# "/home/<user>/.ssh" for every other name.
#   $1 - user name
function GetUserSshDir
{
    case "$1" in
        root)
            echo "/root/.ssh"
            ;;
        *)
            echo "/home/$1/.ssh"
            ;;
    esac
}
|
||||
|
||||
# Path inside a container at which the user's SSH directory is mounted.
tmpSshDir="/tmp/hpcSshKey/.ssh"
# Command given to "docker run" when a task container is started.
containerPlaceholderCommand="/bin/bash"

# Prints the extra "docker run" options for an MPI container: a read-only
# mount of the user's SSH directory at $tmpSshDir, plus host networking.
#   $1 - user name
function GetMpiContainerStartOption
{
    local hostSshDir=$(GetUserSshDir $1)
    echo "-v $hostSshDir:$tmpSshDir:ro --network host"
}
|
||||
|
|
Загрузка…
Ссылка в новой задаче