From 7c94523eb255c0ed861363e4efceabc52862eb96 Mon Sep 17 00:00:00 2001 From: zclok010 Date: Thu, 23 May 2019 11:46:49 +0800 Subject: [PATCH] backup --- nodemanager/core/Process.cpp | 45 ++++++++----------------- nodemanager/scripts/CleanupTask.sh | 6 ++-- nodemanager/scripts/EndTask.sh | 4 +-- nodemanager/scripts/PrepareTask.sh | 54 ++++++++++++++++++++++++++---- nodemanager/scripts/StartTask.sh | 4 +-- nodemanager/scripts/Statistics.sh | 4 +-- nodemanager/scripts/common.sh | 44 +++++++++++++++++------- 7 files changed, 103 insertions(+), 58 deletions(-) diff --git a/nodemanager/core/Process.cpp b/nodemanager/core/Process.cpp index 81fcb5c..074238a 100644 --- a/nodemanager/core/Process.cpp +++ b/nodemanager/core/Process.cpp @@ -144,14 +144,12 @@ void Process::OnCompletedInternal() void* Process::ForkThread(void* arg) { Process* const p = static_cast(arg); + int ret; std::string path; - auto dockerImageIt = p->environments.find("CCP_DOCKER_IMAGE"); - bool isDockerTask = dockerImageIt != p->environments.end() && !dockerImageIt->second.empty(); - auto disableCgroupIt = p->environments.find("CCP_DISABLE_CGROUP"); - bool disableCgroup = disableCgroupIt != p->environments.end() && disableCgroupIt->second == "1"; + std::string envFile; Start: - int ret = p->CreateTaskFolder(); + ret = p->CreateTaskFolder(); if (ret != 0) { p->message << "Task " << p->taskId << ": error when create task folder, ret " << ret << std::endl; @@ -171,33 +169,18 @@ Start: goto Final; } - if (isDockerTask) + p -> environmentsBuffer.clear(); + std::transform( + p->environments.cbegin(), + p->environments.cend(), + std::back_inserter(p->environmentsBuffer), + [](const auto& v) { return String::Join("=", v.first, v.second); }); + envFile = p->taskFolder + "/environments"; + ret = System::WriteStringToFile(envFile, String::Join<'\n'>(p->environmentsBuffer)); + if (ret != 0) { - p -> environmentsBuffer.clear(); - std::transform( - p->environments.cbegin(), - p->environments.cend(), - std::back_inserter(p->environmentsBuffer), - [](const auto& v) { return String::Join("=", v.first, v.second); }); - - std::string envFile = p->taskFolder + "/environments"; - int ret = System::WriteStringToFile(envFile, String::Join<'\n'>(p->environmentsBuffer)); - if (ret != 0) - { - Logger::Error(p->jobId, p->taskId, p->requeueCount, "Failed to create environment file for docker task. Exitcode: {0}", ret); - goto Final; - } - } - - if (disableCgroup) - { - std::string flagFile = p->taskFolder + "/disable_cgroup"; - int ret = System::WriteStringToFile(flagFile, "1"); - if (ret != 0) - { - Logger::Error(p->jobId, p->taskId, p->requeueCount, "Failed to create flag file to disable cgroup. Exitcode: {0}", ret); - goto Final; - } + Logger::Error(p->jobId, p->taskId, p->requeueCount, "Failed to create task environments file {0}. Exitcode: {1}", envFile, ret); + goto Final; } if (0 != p->ExecuteCommand("/bin/bash", "PrepareTask.sh", p->taskExecutionId, p->GetAffinity(), p->taskFolder, p->userName)) diff --git a/nodemanager/scripts/CleanupTask.sh b/nodemanager/scripts/CleanupTask.sh index 45478ce..1d3325c 100644 --- a/nodemanager/scripts/CleanupTask.sh +++ b/nodemanager/scripts/CleanupTask.sh @@ -10,7 +10,7 @@ taskId=$1 processId=$2 taskFolder=$3 -isDockerTask=$(CheckDockerEnvFileExist $taskFolder) +isDockerTask=$(CheckDockerImageNameNotEmpty $taskFolder) if $isDockerTask; then isDebugMode=$(CheckDockerDebugMode $taskFolder) if ! $isDebugMode; then @@ -32,9 +32,9 @@ if $isDockerTask; then exit fi -/bin/bash ./EndTask.sh "$taskId" "$processId" "1" +/bin/bash ./EndTask.sh "$taskId" "$processId" "1" "$taskFolder" -cgDisabled=$(CheckCgroupDisabledInFlagFile $taskFolder) +cgDisabled=$(CheckDisableCgroupSet $taskFolder) if $CGInstalled && ! $cgDisabled; then groupName=$(GetCGroupName "$taskId") group=$CGroupSubSys:$groupName diff --git a/nodemanager/scripts/EndTask.sh b/nodemanager/scripts/EndTask.sh index 402219a..370eccf 100644 --- a/nodemanager/scripts/EndTask.sh +++ b/nodemanager/scripts/EndTask.sh @@ -12,8 +12,8 @@ processId=$2 forced=$3 taskFolder=$4 -isDockerTask=$(CheckDockerEnvFileExist $taskFolder) -cgDisabled=$(CheckCgroupDisabledInFlagFile $taskFolder) +isDockerTask=$(CheckDockerImageNameNotEmpty $taskFolder) +cgDisabled=$(CheckDisableCgroupSet $taskFolder) if $CGInstalled && ! $cgDisabled; then if $isDockerTask; then containerId=$(GetContainerId $taskFolder) diff --git a/nodemanager/scripts/PrepareTask.sh b/nodemanager/scripts/PrepareTask.sh index d27ffab..1d96b79 100644 --- a/nodemanager/scripts/PrepareTask.sh +++ b/nodemanager/scripts/PrepareTask.sh @@ -12,7 +12,7 @@ affinity=$2 taskFolder=$3 userName=$4 -isDockerTask=$(CheckDockerEnvFileExist $taskFolder) +isDockerTask=$(CheckDockerImageNameNotEmpty $taskFolder) if $isDockerTask; then isMpiTask=$(CheckMpiTask $taskFolder) if $isMpiTask; then @@ -28,7 +28,7 @@ if $isDockerTask; then dockerImage=$(GetDockerImageName $taskFolder) volumeOption=$(GetDockerVolumeOption $taskFolder) additionalOption=$(GetDockerAdditionalOption $taskFolder) - envFile=$(GetDockerTaskEnvFile $taskFolder) + envFile=$(GetTaskEnvFile $taskFolder) containerIdFile=$(GetContainerIdFile $taskFolder) dockerEngine=$(GetDockerEngine $taskFolder) $dockerEngine run -id \ @@ -70,7 +70,7 @@ if $isDockerTask; then exit fi -cgDisabled=$(CheckCgroupDisabledInFlagFile $taskFolder) +cgDisabled=$(CheckDisableCgroupSet $taskFolder) if $CGInstalled && ! $cgDisabled; then groupName=$(GetCGroupName "$taskId") group=$CGroupSubSys:$groupName @@ -116,11 +116,11 @@ if $CGInstalled && ! $cgDisabled; then exit $ec fi + numaMaxIndex=$((`lscpu | grep 'NUMA node(s)' | awk '{print $NF}'` - 1)) maxLoop=3 while [ $maxLoop -gt 0 ] do memsFile=$(GetMemsFile "$groupName") - numaMaxIndex=$((`lscpu | grep 'NUMA node(s)' | awk '{print $NF}'` - 1)) echo 0-$numaMaxIndex > "$memsFile" ec=$? if [ $ec -eq 0 ] @@ -138,6 +138,49 @@ if $CGInstalled && ! $cgDisabled; then exit $ec fi + memoryLimit=$(GetMemoryLimitBytes $taskFolder) + maxLoop=3 + while [ $maxLoop -gt 0 ] + do + memoryLimitFile=$(GetMemoryLimitFile "$groupName") + echo $memoryLimit > "$memoryLimitFile" + ec=$? + if [ $ec -eq 0 ] + then + break + fi + + echo "Failed to set memory limit for $group, error code $ec, retry after .5 seconds" + ((maxLoop--)) + sleep .5 + done + + if [ $ec -ne 0 ] + then + exit $ec + fi + + maxLoop=3 + while [ $maxLoop -gt 0 ] + do + memorySwappinessFile=$(GetMemorySwappinessFile "$groupName") + echo 0 > "$memorySwappinessFile" + ec=$? + if [ $ec -eq 0 ] + then + break + fi + + echo "Failed to disable memory swap for $group, error code $ec, retry after .5 seconds" + ((maxLoop--)) + sleep .5 + done + + if [ $ec -ne 0 ] + then + exit $ec + fi + tasks=$(GetCpusetTasksFile "$groupName") freezerState=$(GetFreezerStateFile "$groupName") @@ -145,5 +188,4 @@ if $CGInstalled && ! $cgDisabled; then [ ! -f "$freezerState" ] && echo "$freezerState doesn't exist" && exit 201 exit 0 -fi - +fi \ No newline at end of file diff --git a/nodemanager/scripts/StartTask.sh b/nodemanager/scripts/StartTask.sh index 19aacac..d4fe2fb 100644 --- a/nodemanager/scripts/StartTask.sh +++ b/nodemanager/scripts/StartTask.sh @@ -23,7 +23,7 @@ case "$CCP_MPI_HOSTFILE_FORMAT" in *) echo $CCP_NODES_CORES | tr ' ' '\n' | sed -n 'n;p' > $CCP_MPI_HOSTFILE;; esac -isDockerTask=$(CheckDockerEnvFileExist $taskFolder) +isDockerTask=$(CheckDockerImageNameNotEmpty $taskFolder) if $isDockerTask; then containerId=$(GetContainerId $taskFolder) docker exec $containerId /bin/bash -c "$taskFolder/TestMutualTrust.sh $taskId $taskFolder $userName" &&\ @@ -31,7 +31,7 @@ if $isDockerTask; then exit fi -cgDisabled=$(CheckCgroupDisabledInFlagFile $taskFolder) +cgDisabled=$(CheckDisableCgroupSet $taskFolder) if $CGInstalled && ! $cgDisabled; then groupName=$(GetCGroupName "$taskId") group=$CGroupSubSys:$groupName diff --git a/nodemanager/scripts/Statistics.sh b/nodemanager/scripts/Statistics.sh index 3db7a38..60ed8b1 100644 --- a/nodemanager/scripts/Statistics.sh +++ b/nodemanager/scripts/Statistics.sh @@ -9,7 +9,7 @@ taskId=$1 taskFolder=$2 -isDockerTask=$(CheckDockerEnvFileExist $taskFolder) +isDockerTask=$(CheckDockerImageNameNotEmpty $taskFolder) userTime10Ms=0 kernelTime10Ms=0 @@ -34,7 +34,7 @@ function GetMemoryMaxusageFile GetGroupFile "$groupName" memory memory.max_usage_in_bytes } -cgDisabled=$(CheckCgroupDisabledInFlagFile $taskFolder) +cgDisabled=$(CheckDisableCgroupSet $taskFolder) if $CGInstalled && ! $cgDisabled; then if $isDockerTask; then containerId=$(GetContainerId $taskFolder) diff --git a/nodemanager/scripts/common.sh b/nodemanager/scripts/common.sh index c57098a..db2fe12 100644 --- a/nodemanager/scripts/common.sh +++ b/nodemanager/scripts/common.sh @@ -55,6 +55,18 @@ function GetFreezerStateFile GetGroupFile "$groupName" freezer freezer.state } +function GetMemoryLimitFile +{ + local groupName=$1 + GetGroupFile "$groupName" memory memory.limit_in_bytes +} + +function GetMemorySwappinessFile +{ + local groupName=$1 + GetGroupFile "$groupName" memory memory.swappiness +} + MpiContainerSuffix="MPI" DebugContainerSuffix="DEBUG" TmpSshDir="/tmp/hpcSshKey/.ssh" @@ -85,7 +97,7 @@ function GetContainerPlaceholder echo "$taskFolder/placeholder" } -function GetDockerTaskEnvFile +function GetTaskEnvFile { local taskFolder=$1 echo "$taskFolder/environments" @@ -103,10 +115,10 @@ function GetContainerId cat $(GetContainerIdFile $taskFolder) } -function CheckDockerEnvFileExist +function CheckDockerImageNameNotEmpty { local taskFolder=$1 - [ -f $(GetDockerTaskEnvFile $taskFolder) ] && echo true || echo false + [ -z $taskFolder ] || [ -z $(GetDockerImageName $taskFolder) ] && echo false || echo true } function GetUserSshDir @@ -127,14 +139,14 @@ function GetMpiContainerStartOption function CheckMpiTask { local taskFolder=$1 - local nodeNum=$(cat $(GetDockerTaskEnvFile $taskFolder) | grep "CCP_NODES=" | sed -r 's/^CCP_NODES=([0-9]+) .*/\1/g') + local nodeNum=$(cat $(GetTaskEnvFile $taskFolder) | grep "CCP_NODES=" | sed -r 's/^CCP_NODES=([0-9]+) .*/\1/g') [ "$nodeNum" -gt 1 ] && echo true || echo false } function CheckDockerDebugMode { local taskFolder=$1 - debugOption=$(cat $(GetDockerTaskEnvFile $taskFolder) | grep "CCP_DOCKER_DEBUG=" | cut -d '=' -f 2) + debugOption=$(cat $(GetTaskEnvFile $taskFolder) | grep "CCP_DOCKER_DEBUG=" | cut -d '=' -f 2) [ -z $debugOption ] || [ "$debugOption" == "0" ] && echo false || echo true } @@ -145,7 +157,7 @@ function GetDockerEngine echo "docker" else local taskFolder=$1 - local nvidiaOption=$(cat $(GetDockerTaskEnvFile $taskFolder) | grep "CCP_DOCKER_NVIDIA=" | cut -d '=' -f 2) + local nvidiaOption=$(cat $(GetTaskEnvFile $taskFolder) | grep "CCP_DOCKER_NVIDIA=" | cut -d '=' -f 2) if [ -z $nvidiaOption ] || [ "$nvidiaOption" == "0" ]; then echo "docker" else @@ -157,19 +169,19 @@ function GetDockerEngine function GetDockerImageName { local taskFolder=$1 - cat $(GetDockerTaskEnvFile $taskFolder) | grep "CCP_DOCKER_IMAGE=" | cut -d '=' -f 2 + cat $(GetTaskEnvFile $taskFolder) | grep "CCP_DOCKER_IMAGE=" | cut -d '=' -f 2 } function GetDockerVolumeOption { local taskFolder=$1 - cat $(GetDockerTaskEnvFile $taskFolder) | grep "CCP_DOCKER_VOLUMES=" | sed -e 's/^CCP_DOCKER_VOLUMES=/-v /g' -e 's/,/ -v /g' + cat $(GetTaskEnvFile $taskFolder) | grep "CCP_DOCKER_VOLUMES=" | sed -e 's/^CCP_DOCKER_VOLUMES=/-v /g' -e 's/,/ -v /g' } function GetDockerAdditionalOption { local taskFolder=$1 - cat $(GetDockerTaskEnvFile $taskFolder) | grep "CCP_DOCKER_START_OPTION=" | sed 's/^CCP_DOCKER_START_OPTION=//' + cat $(GetTaskEnvFile $taskFolder) | grep "CCP_DOCKER_START_OPTION=" | sed 's/^CCP_DOCKER_START_OPTION=//' } function GetSshStartCommand @@ -184,9 +196,17 @@ function GetSshStopCommand echo $version | grep -iq ubuntu && echo "service ssh stop" || echo "service sshd stop" } -function CheckCgroupDisabledInFlagFile +function CheckDisableCgroupSet { local taskFolder=$1 - local flagFile="$taskFolder/disable_cgroup" - [ -f $flagFile ] && [ "$(head $flagFile)" == "1" ] && echo true || echo false + [ -z $taskFolder ] && echo false && exit + local disableCgroupValue=$(cat $(GetTaskEnvFile $taskFolder) | grep "CCP_DISABLE_CGROUP=" | cut -d '=' -f 2) + [ "$disableCgroupValue" == "1" ] && echo true || echo false +} + +function GetMemoryLimitBytes +{ + local taskFolder=$1 + local memoryLimitMegaBytes=$(cat $(GetTaskEnvFile $taskFolder) | grep "CCP_MAXIMUMMEMORY=" | cut -d '=' -f 2) + [ -z $memoryLimitMegaBytes ] && echo -1 || echo ${memoryLimitMegaBytes}M } \ No newline at end of file