backup
This commit is contained in:
Родитель
6a7ef893c6
Коммит
7c94523eb2
|
@ -144,14 +144,12 @@ void Process::OnCompletedInternal()
|
|||
void* Process::ForkThread(void* arg)
|
||||
{
|
||||
Process* const p = static_cast<Process* const>(arg);
|
||||
int ret;
|
||||
std::string path;
|
||||
auto dockerImageIt = p->environments.find("CCP_DOCKER_IMAGE");
|
||||
bool isDockerTask = dockerImageIt != p->environments.end() && !dockerImageIt->second.empty();
|
||||
auto disableCgroupIt = p->environments.find("CCP_DISABLE_CGROUP");
|
||||
bool disableCgroup = disableCgroupIt != p->environments.end() && disableCgroupIt->second == "1";
|
||||
std::string envFile;
|
||||
|
||||
Start:
|
||||
int ret = p->CreateTaskFolder();
|
||||
ret = p->CreateTaskFolder();
|
||||
if (ret != 0)
|
||||
{
|
||||
p->message << "Task " << p->taskId << ": error when create task folder, ret " << ret << std::endl;
|
||||
|
@ -171,33 +169,18 @@ Start:
|
|||
goto Final;
|
||||
}
|
||||
|
||||
if (isDockerTask)
|
||||
p -> environmentsBuffer.clear();
|
||||
std::transform(
|
||||
p->environments.cbegin(),
|
||||
p->environments.cend(),
|
||||
std::back_inserter(p->environmentsBuffer),
|
||||
[](const auto& v) { return String::Join("=", v.first, v.second); });
|
||||
envFile = p->taskFolder + "/environments";
|
||||
ret = System::WriteStringToFile(envFile, String::Join<'\n'>(p->environmentsBuffer));
|
||||
if (ret != 0)
|
||||
{
|
||||
p -> environmentsBuffer.clear();
|
||||
std::transform(
|
||||
p->environments.cbegin(),
|
||||
p->environments.cend(),
|
||||
std::back_inserter(p->environmentsBuffer),
|
||||
[](const auto& v) { return String::Join("=", v.first, v.second); });
|
||||
|
||||
std::string envFile = p->taskFolder + "/environments";
|
||||
int ret = System::WriteStringToFile(envFile, String::Join<'\n'>(p->environmentsBuffer));
|
||||
if (ret != 0)
|
||||
{
|
||||
Logger::Error(p->jobId, p->taskId, p->requeueCount, "Failed to create environment file for docker task. Exitcode: {0}", ret);
|
||||
goto Final;
|
||||
}
|
||||
}
|
||||
|
||||
if (disableCgroup)
|
||||
{
|
||||
std::string flagFile = p->taskFolder + "/disable_cgroup";
|
||||
int ret = System::WriteStringToFile(flagFile, "1");
|
||||
if (ret != 0)
|
||||
{
|
||||
Logger::Error(p->jobId, p->taskId, p->requeueCount, "Failed to create flag file to disable cgroup. Exitcode: {0}", ret);
|
||||
goto Final;
|
||||
}
|
||||
Logger::Error(p->jobId, p->taskId, p->requeueCount, "Failed to create task environments file {0}. Exitcode: {1}", envFile, ret);
|
||||
goto Final;
|
||||
}
|
||||
|
||||
if (0 != p->ExecuteCommand("/bin/bash", "PrepareTask.sh", p->taskExecutionId, p->GetAffinity(), p->taskFolder, p->userName))
|
||||
|
|
|
@ -10,7 +10,7 @@ taskId=$1
|
|||
processId=$2
|
||||
taskFolder=$3
|
||||
|
||||
isDockerTask=$(CheckDockerEnvFileExist $taskFolder)
|
||||
isDockerTask=$(CheckDockerImageNameNotEmpty $taskFolder)
|
||||
if $isDockerTask; then
|
||||
isDebugMode=$(CheckDockerDebugMode $taskFolder)
|
||||
if ! $isDebugMode; then
|
||||
|
@ -32,9 +32,9 @@ if $isDockerTask; then
|
|||
exit
|
||||
fi
|
||||
|
||||
/bin/bash ./EndTask.sh "$taskId" "$processId" "1"
|
||||
/bin/bash ./EndTask.sh "$taskId" "$processId" "1" "$taskFolder"
|
||||
|
||||
cgDisabled=$(CheckCgroupDisabledInFlagFile $taskFolder)
|
||||
cgDisabled=$(CheckDisableCgroupSet $taskFolder)
|
||||
if $CGInstalled && ! $cgDisabled; then
|
||||
groupName=$(GetCGroupName "$taskId")
|
||||
group=$CGroupSubSys:$groupName
|
||||
|
|
|
@ -12,8 +12,8 @@ processId=$2
|
|||
forced=$3
|
||||
taskFolder=$4
|
||||
|
||||
isDockerTask=$(CheckDockerEnvFileExist $taskFolder)
|
||||
cgDisabled=$(CheckCgroupDisabledInFlagFile $taskFolder)
|
||||
isDockerTask=$(CheckDockerImageNameNotEmpty $taskFolder)
|
||||
cgDisabled=$(CheckDisableCgroupSet $taskFolder)
|
||||
if $CGInstalled && ! $cgDisabled; then
|
||||
if $isDockerTask; then
|
||||
containerId=$(GetContainerId $taskFolder)
|
||||
|
|
|
@ -12,7 +12,7 @@ affinity=$2
|
|||
taskFolder=$3
|
||||
userName=$4
|
||||
|
||||
isDockerTask=$(CheckDockerEnvFileExist $taskFolder)
|
||||
isDockerTask=$(CheckDockerImageNameNotEmpty $taskFolder)
|
||||
if $isDockerTask; then
|
||||
isMpiTask=$(CheckMpiTask $taskFolder)
|
||||
if $isMpiTask; then
|
||||
|
@ -28,7 +28,7 @@ if $isDockerTask; then
|
|||
dockerImage=$(GetDockerImageName $taskFolder)
|
||||
volumeOption=$(GetDockerVolumeOption $taskFolder)
|
||||
additionalOption=$(GetDockerAdditionalOption $taskFolder)
|
||||
envFile=$(GetDockerTaskEnvFile $taskFolder)
|
||||
envFile=$(GetTaskEnvFile $taskFolder)
|
||||
containerIdFile=$(GetContainerIdFile $taskFolder)
|
||||
dockerEngine=$(GetDockerEngine $taskFolder)
|
||||
$dockerEngine run -id \
|
||||
|
@ -70,7 +70,7 @@ if $isDockerTask; then
|
|||
exit
|
||||
fi
|
||||
|
||||
cgDisabled=$(CheckCgroupDisabledInFlagFile $taskFolder)
|
||||
cgDisabled=$(CheckDisableCgroupSet $taskFolder)
|
||||
if $CGInstalled && ! $cgDisabled; then
|
||||
groupName=$(GetCGroupName "$taskId")
|
||||
group=$CGroupSubSys:$groupName
|
||||
|
@ -116,11 +116,11 @@ if $CGInstalled && ! $cgDisabled; then
|
|||
exit $ec
|
||||
fi
|
||||
|
||||
numaMaxIndex=$((`lscpu | grep 'NUMA node(s)' | awk '{print $NF}'` - 1))
|
||||
maxLoop=3
|
||||
while [ $maxLoop -gt 0 ]
|
||||
do
|
||||
memsFile=$(GetMemsFile "$groupName")
|
||||
numaMaxIndex=$((`lscpu | grep 'NUMA node(s)' | awk '{print $NF}'` - 1))
|
||||
echo 0-$numaMaxIndex > "$memsFile"
|
||||
ec=$?
|
||||
if [ $ec -eq 0 ]
|
||||
|
@ -138,6 +138,49 @@ if $CGInstalled && ! $cgDisabled; then
|
|||
exit $ec
|
||||
fi
|
||||
|
||||
memoryLimit=$(GetMemoryLimitBytes $taskFolder)
|
||||
maxLoop=3
|
||||
while [ $maxLoop -gt 0 ]
|
||||
do
|
||||
memoryLimitFile=$(GetMemoryLimitFile "$groupName")
|
||||
echo $memoryLimit > "$memoryLimitFile"
|
||||
ec=$?
|
||||
if [ $ec -eq 0 ]
|
||||
then
|
||||
break
|
||||
fi
|
||||
|
||||
echo "Failed to set memory limit for $group, error code $ec, retry after .5 seconds"
|
||||
((maxLoop--))
|
||||
sleep .5
|
||||
done
|
||||
|
||||
if [ $ec -ne 0 ]
|
||||
then
|
||||
exit $ec
|
||||
fi
|
||||
|
||||
maxLoop=3
|
||||
while [ $maxLoop -gt 0 ]
|
||||
do
|
||||
memorySwappinessFile=$(GetMemorySwappinessFile "$groupName")
|
||||
echo 0 > "$memorySwappinessFile"
|
||||
ec=$?
|
||||
if [ $ec -eq 0 ]
|
||||
then
|
||||
break
|
||||
fi
|
||||
|
||||
echo "Failed to disable memory swap for $group, error code $ec, retry after .5 seconds"
|
||||
((maxLoop--))
|
||||
sleep .5
|
||||
done
|
||||
|
||||
if [ $ec -ne 0 ]
|
||||
then
|
||||
exit $ec
|
||||
fi
|
||||
|
||||
tasks=$(GetCpusetTasksFile "$groupName")
|
||||
freezerState=$(GetFreezerStateFile "$groupName")
|
||||
|
||||
|
@ -145,5 +188,4 @@ if $CGInstalled && ! $cgDisabled; then
|
|||
[ ! -f "$freezerState" ] && echo "$freezerState doesn't exist" && exit 201
|
||||
|
||||
exit 0
|
||||
fi
|
||||
|
||||
fi
|
|
@ -23,7 +23,7 @@ case "$CCP_MPI_HOSTFILE_FORMAT" in
|
|||
*) echo $CCP_NODES_CORES | tr ' ' '\n' | sed -n 'n;p' > $CCP_MPI_HOSTFILE;;
|
||||
esac
|
||||
|
||||
isDockerTask=$(CheckDockerEnvFileExist $taskFolder)
|
||||
isDockerTask=$(CheckDockerImageNameNotEmpty $taskFolder)
|
||||
if $isDockerTask; then
|
||||
containerId=$(GetContainerId $taskFolder)
|
||||
docker exec $containerId /bin/bash -c "$taskFolder/TestMutualTrust.sh $taskId $taskFolder $userName" &&\
|
||||
|
@ -31,7 +31,7 @@ if $isDockerTask; then
|
|||
exit
|
||||
fi
|
||||
|
||||
cgDisabled=$(CheckCgroupDisabledInFlagFile $taskFolder)
|
||||
cgDisabled=$(CheckDisableCgroupSet $taskFolder)
|
||||
if $CGInstalled && ! $cgDisabled; then
|
||||
groupName=$(GetCGroupName "$taskId")
|
||||
group=$CGroupSubSys:$groupName
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
taskId=$1
|
||||
taskFolder=$2
|
||||
|
||||
isDockerTask=$(CheckDockerEnvFileExist $taskFolder)
|
||||
isDockerTask=$(CheckDockerImageNameNotEmpty $taskFolder)
|
||||
|
||||
userTime10Ms=0
|
||||
kernelTime10Ms=0
|
||||
|
@ -34,7 +34,7 @@ function GetMemoryMaxusageFile
|
|||
GetGroupFile "$groupName" memory memory.max_usage_in_bytes
|
||||
}
|
||||
|
||||
cgDisabled=$(CheckCgroupDisabledInFlagFile $taskFolder)
|
||||
cgDisabled=$(CheckDisableCgroupSet $taskFolder)
|
||||
if $CGInstalled && ! $cgDisabled; then
|
||||
if $isDockerTask; then
|
||||
containerId=$(GetContainerId $taskFolder)
|
||||
|
|
|
@ -55,6 +55,18 @@ function GetFreezerStateFile
|
|||
GetGroupFile "$groupName" freezer freezer.state
|
||||
}
|
||||
|
||||
# Resolves the memory.limit_in_bytes control file of a cgroup by delegating
# to GetGroupFile with the "memory" subsystem. Callers capture the output
# and redirect the limit value into it.
#   $1 - cgroup name
function GetMemoryLimitFile
{
    local groupName="${1}"
    GetGroupFile "${groupName}" memory memory.limit_in_bytes
}
|
||||
|
||||
# Resolves the memory.swappiness control file of a cgroup by delegating to
# GetGroupFile with the "memory" subsystem. Callers capture the output and
# write the swappiness value into it.
#   $1 - cgroup name
function GetMemorySwappinessFile
{
    local groupName="${1}"
    GetGroupFile "${groupName}" memory memory.swappiness
}
|
||||
|
||||
MpiContainerSuffix="MPI"
|
||||
DebugContainerSuffix="DEBUG"
|
||||
TmpSshDir="/tmp/hpcSshKey/.ssh"
|
||||
|
@ -85,7 +97,7 @@ function GetContainerPlaceholder
|
|||
echo "$taskFolder/placeholder"
|
||||
}
|
||||
|
||||
function GetDockerTaskEnvFile
|
||||
function GetTaskEnvFile
|
||||
{
|
||||
local taskFolder=$1
|
||||
echo "$taskFolder/environments"
|
||||
|
@ -103,10 +115,10 @@ function GetContainerId
|
|||
cat $(GetContainerIdFile $taskFolder)
|
||||
}
|
||||
|
||||
# Echoes "true" when the task environment file yields a non-empty
# CCP_DOCKER_IMAGE value (as reported by GetDockerImageName), and "false"
# otherwise, including when no task folder is supplied.
#   $1 - task folder
function CheckDockerImageNameNotEmpty
{
    local taskFolder=$1

    # Every expansion is quoted: an unquoted empty or multi-word value changes
    # the number of arguments `[` sees, so the result was only right by accident.
    if [ -z "$taskFolder" ]; then
        echo false
        return
    fi

    local imageName
    imageName=$(GetDockerImageName "$taskFolder")
    if [ -n "$imageName" ]; then
        echo true
    else
        echo false
    fi
}
|
||||
|
||||
function GetUserSshDir
|
||||
|
@ -127,14 +139,14 @@ function GetMpiContainerStartOption
|
|||
# Echoes "true" when the leading integer of the CCP_NODES entry in the task
# environment file is greater than 1, and "false" otherwise.
#   $1 - task folder
function CheckMpiTask
{
    local taskFolder=$1
    local nodeNum

    # Anchor on the start of the line and print only lines that actually match,
    # so unrelated entries that merely contain "CCP_NODES=" are ignored.
    nodeNum=$(sed -n -r 's/^CCP_NODES=([0-9]+) .*/\1/p' "$(GetTaskEnvFile "$taskFolder")")

    # Guard the numeric comparison: a missing or unparsable entry must yield
    # "false" without feeding a non-integer to -gt.
    if [[ "$nodeNum" =~ ^[0-9]+$ ]] && [ "$nodeNum" -gt 1 ]; then
        echo true
    else
        echo false
    fi
}
|
||||
|
||||
# Echoes "false" when CCP_DOCKER_DEBUG is absent, empty or "0" in the task
# environment file, and "true" for any other value.
#   $1 - task folder
function CheckDockerDebugMode
{
    local taskFolder=$1
    # Kept local so the value does not leak into the caller's scope.
    local debugOption

    # Anchored so only the CCP_DOCKER_DEBUG entry itself is considered.
    debugOption=$(grep "^CCP_DOCKER_DEBUG=" "$(GetTaskEnvFile "$taskFolder")" | cut -d '=' -f 2)

    if [ -z "$debugOption" ] || [ "$debugOption" == "0" ]; then
        echo false
    else
        echo true
    fi
}
|
||||
|
||||
|
@ -145,7 +157,7 @@ function GetDockerEngine
|
|||
echo "docker"
|
||||
else
|
||||
local taskFolder=$1
|
||||
local nvidiaOption=$(cat $(GetDockerTaskEnvFile $taskFolder) | grep "CCP_DOCKER_NVIDIA=" | cut -d '=' -f 2)
|
||||
local nvidiaOption=$(cat $(GetTaskEnvFile $taskFolder) | grep "CCP_DOCKER_NVIDIA=" | cut -d '=' -f 2)
|
||||
if [ -z $nvidiaOption ] || [ "$nvidiaOption" == "0" ]; then
|
||||
echo "docker"
|
||||
else
|
||||
|
@ -157,19 +169,19 @@ function GetDockerEngine
|
|||
# Prints the value of the CCP_DOCKER_IMAGE entry from the task environment
# file; prints nothing when the entry is absent.
#   $1 - task folder
function GetDockerImageName
{
    local taskFolder=$1

    # Anchor at line start so entries that merely contain the key are skipped,
    # and keep everything after the first '=' so the value is not truncated.
    grep "^CCP_DOCKER_IMAGE=" "$(GetTaskEnvFile "$taskFolder")" | cut -d '=' -f 2-
}
|
||||
|
||||
# Converts the comma-separated CCP_DOCKER_VOLUMES entry of the task
# environment file into a sequence of "-v <volume>" options. Prints nothing
# when the entry is absent or empty.
#   $1 - task folder
function GetDockerVolumeOption
{
    local taskFolder=$1

    # Require at least one character after '=' so an empty setting does not
    # produce a dangling "-v" with no argument; anchor so only the real entry
    # is rewritten.
    grep "^CCP_DOCKER_VOLUMES=." "$(GetTaskEnvFile "$taskFolder")" \
        | sed -e 's/^CCP_DOCKER_VOLUMES=/-v /' -e 's/,/ -v /g'
}
|
||||
|
||||
# Prints the value of the CCP_DOCKER_START_OPTION entry from the task
# environment file, with the "CCP_DOCKER_START_OPTION=" prefix removed.
#   $1 - task folder
function GetDockerAdditionalOption
{
    local taskFolder=$1

    # Anchored: an entry that only contains the key somewhere in the line
    # would otherwise pass through sed unchanged and be emitted as an option.
    grep "^CCP_DOCKER_START_OPTION=" "$(GetTaskEnvFile "$taskFolder")" \
        | sed 's/^CCP_DOCKER_START_OPTION=//'
}
|
||||
|
||||
function GetSshStartCommand
|
||||
|
@ -184,9 +196,17 @@ function GetSshStopCommand
|
|||
echo $version | grep -iq ubuntu && echo "service ssh stop" || echo "service sshd stop"
|
||||
}
|
||||
|
||||
# Echoes "true" when the task environment file sets CCP_DISABLE_CGROUP to
# "1", and "false" otherwise (including when no task folder is supplied).
#   $1 - task folder
function CheckDisableCgroupSet
{
    local taskFolder=$1

    if [ -z "$taskFolder" ]; then
        echo false
        # return rather than exit: exit would terminate the calling script if
        # this function were ever invoked outside a command substitution.
        return
    fi

    local disableCgroupValue
    # Anchored so only the CCP_DISABLE_CGROUP entry itself is read.
    disableCgroupValue=$(grep "^CCP_DISABLE_CGROUP=" "$(GetTaskEnvFile "$taskFolder")" | cut -d '=' -f 2)

    if [ "$disableCgroupValue" == "1" ]; then
        echo true
    else
        echo false
    fi
}
|
||||
|
||||
# Prints the memory limit for a task in the form written to the cgroup
# memory limit file: "<value>M" when CCP_MAXIMUMMEMORY is set in the task
# environment file, or -1 when it is absent or empty.
# NOTE(review): -1 presumably means "no limit" for memory.limit_in_bytes —
# confirm against the cgroup v1 memory controller documentation.
#   $1 - task folder
function GetMemoryLimitBytes
{
    local taskFolder=$1
    local memoryLimitMegaBytes

    # Anchored so only the CCP_MAXIMUMMEMORY entry itself is read.
    memoryLimitMegaBytes=$(grep "^CCP_MAXIMUMMEMORY=" "$(GetTaskEnvFile "$taskFolder")" | cut -d '=' -f 2)

    # Quoted test: an unquoted multi-word value would make `[ -z ... ]` fail
    # with a syntax error instead of being evaluated.
    if [ -z "$memoryLimitMegaBytes" ]; then
        echo -1
    else
        echo "${memoryLimitMegaBytes}M"
    fi
}
|
Загрузка…
Ссылка в новой задаче