diff --git a/nodemanager/scripts/EndTask.sh b/nodemanager/scripts/EndTask.sh index 77e8ef2..3b3279c 100644 --- a/nodemanager/scripts/EndTask.sh +++ b/nodemanager/scripts/EndTask.sh @@ -101,14 +101,14 @@ elif $CGInstalled && ! $cgDisabled; then ((maxLoop--)) done else # processes would run away if pstree is not installed - pid=$(pstree -l -p "$processId" | grep "([[:digit:]]*)" -o | tr -d '()') - if [ -n "$pid" ]; then - if [ "$forced" == "1" ]; then - kill -s 9 $pid - else - kill -s SIGINT $pid - fi - fi + pid=$(pstree -l -p "$processId" | grep "([[:digit:]]*)" -o | tr -d '()') + if [ -n "$pid" ]; then + if [ "$forced" == "1" ]; then + kill -s 9 $pid + else + kill -s SIGINT $pid + fi + fi fi exit 0 \ No newline at end of file diff --git a/nodemanager/scripts/PrepareTask.sh b/nodemanager/scripts/PrepareTask.sh index 52399e3..b739ed0 100644 --- a/nodemanager/scripts/PrepareTask.sh +++ b/nodemanager/scripts/PrepareTask.sh @@ -56,7 +56,7 @@ if $isDockerTask; then containerId=$(GetContainerId $taskFolder) docker exec $containerId useradd -m $userName - docker exec $containerId chown $userName $taskFolder + docker exec $containerId chown $userName $taskFolder if $isMpiTask && ! $skipSshSetup; then /bin/bash MpiContainerPreparation.sh $containerId $userName fi diff --git a/nodemanager/scripts/StartTask.sh b/nodemanager/scripts/StartTask.sh index 76b7411..dd33e89 100644 --- a/nodemanager/scripts/StartTask.sh +++ b/nodemanager/scripts/StartTask.sh @@ -17,45 +17,45 @@ cp {TestMutualTrust.sh,WaitForTrust.sh} $taskFolder # Generate hostfile or machinefile for Intel MPI, Open MPI, MPICH or other MPI applications export CCP_MPI_HOSTFILE=$taskFolder/mpi_hostfile case "$CCP_MPI_HOSTFILE_FORMAT" in - "1") echo $CCP_NODES_CORES | awk '{ORS=NR%2==0?":":"\n"}1' RS=" " | tail -n +2 > $CCP_MPI_HOSTFILE;; - "2") echo $CCP_NODES_CORES | awk '{ORS=NR%2==0?" slots=":"\n"}1' RS=" " | tail -n +2 > $CCP_MPI_HOSTFILE;; - "3") echo $CCP_NODES_CORES | awk '{ORS=NR%2==0?" ":"\n"}1' RS=" " | tail -n +2 > $CCP_MPI_HOSTFILE;; - *) echo $CCP_NODES_CORES | tr ' ' '\n' | sed -n 'n;p' > $CCP_MPI_HOSTFILE;; + "1") echo $CCP_NODES_CORES | awk '{ORS=NR%2==0?":":"\n"}1' RS=" " | tail -n +2 > $CCP_MPI_HOSTFILE;; + "2") echo $CCP_NODES_CORES | awk '{ORS=NR%2==0?" slots=":"\n"}1' RS=" " | tail -n +2 > $CCP_MPI_HOSTFILE;; + "3") echo $CCP_NODES_CORES | awk '{ORS=NR%2==0?" ":"\n"}1' RS=" " | tail -n +2 > $CCP_MPI_HOSTFILE;; + *) echo $CCP_NODES_CORES | tr ' ' '\n' | sed -n 'n;p' > $CCP_MPI_HOSTFILE;; esac isDockerTask=$(CheckDockerEnvFileExist $taskFolder) if $isDockerTask; then containerId=$(GetContainerId $taskFolder) - docker exec $containerId /bin/bash -c "$taskFolder/TestMutualTrust.sh $taskId $taskFolder $userName" &&\ - docker exec -u $userName -e CCP_MPI_HOSTFILE=$taskFolder/mpi_hostfile $containerId /bin/bash $runPath - exit + docker exec $containerId /bin/bash -c "$taskFolder/TestMutualTrust.sh $taskId $taskFolder $userName" &&\ + docker exec -u $userName -e CCP_MPI_HOSTFILE=$taskFolder/mpi_hostfile $containerId /bin/bash $runPath + exit fi cgDisabled=$(CheckCgroupDisabledInFlagFile $taskFolder) if ! $CGroupV1 && ! $cgDisabled; then - groupName=$(GetCGroupName "$taskId") - procsFile=$(GetCpusetTasksFileV2 "$groupName") - echo $$ > "$procsFile" - /bin/bash $taskFolder/TestMutualTrust.sh "$taskId" "$taskFolder" "$userName" || exit - if [ "$CCP_SWITCH_USER" == "1" ]; then - su $userName -m -c "/bin/bash $runPath" - else - sudo -H -E -u $userName env "PATH=$PATH" /bin/bash $runPath - fi + groupName=$(GetCGroupName "$taskId") + procsFile=$(GetCpusetTasksFileV2 "$groupName") + echo $$ > "$procsFile" + /bin/bash $taskFolder/TestMutualTrust.sh "$taskId" "$taskFolder" "$userName" || exit + if [ "$CCP_SWITCH_USER" == "1" ]; then + su $userName -m -c "/bin/bash $runPath" + else + sudo -H -E -u $userName env "PATH=$PATH" /bin/bash $runPath + fi elif $CGInstalled && ! $cgDisabled; then - groupName=$(GetCGroupName "$taskId") - group=$CGroupSubSys:$groupName - cgexec -g "$group" /bin/bash $taskFolder/TestMutualTrust.sh "$taskId" "$taskFolder" "$userName" || exit - if [ "$CCP_SWITCH_USER" == "1" ]; then - cgexec -g "$group" su $userName -m -c "/bin/bash $runPath" - else - cgexec -g "$group" sudo -H -E -u $userName env "PATH=$PATH" /bin/bash $runPath - fi + groupName=$(GetCGroupName "$taskId") + group=$CGroupSubSys:$groupName + cgexec -g "$group" /bin/bash $taskFolder/TestMutualTrust.sh "$taskId" "$taskFolder" "$userName" || exit + if [ "$CCP_SWITCH_USER" == "1" ]; then + cgexec -g "$group" su $userName -m -c "/bin/bash $runPath" + else + cgexec -g "$group" sudo -H -E -u $userName env "PATH=$PATH" /bin/bash $runPath + fi else - /bin/bash $taskFolder/TestMutualTrust.sh "$taskId" "$taskFolder" "$userName" || exit - if [ "$CCP_SWITCH_USER" == "1" ]; then - su $userName -m -c "/bin/bash $runPath" - else - sudo -H -E -u $userName env "PATH=$PATH" /bin/bash $runPath - fi + /bin/bash $taskFolder/TestMutualTrust.sh "$taskId" "$taskFolder" "$userName" || exit + if [ "$CCP_SWITCH_USER" == "1" ]; then + su $userName -m -c "/bin/bash $runPath" + else + sudo -H -E -u $userName env "PATH=$PATH" /bin/bash $runPath + fi fi \ No newline at end of file diff --git a/nodemanager/scripts/common.sh b/nodemanager/scripts/common.sh index dfc53fa..4c1fdf7 100644 --- a/nodemanager/scripts/common.sh +++ b/nodemanager/scripts/common.sh @@ -7,7 +7,7 @@ command -v cgexec > /dev/null 2>&1 && CGInstalled=true CGroupV1=true if [ "$(stat -fc %T /sys/fs/cgroup/)" == "cgroup2fs" ]; then - CGroupV1=false + CGroupV1=false fi CGroupV2Root="/sys/fs/cgroup" diff --git a/nodemanager/scripts/hpcagent b/nodemanager/scripts/hpcagent index dac454d..b4abf49 100644 --- a/nodemanager/scripts/hpcagent +++ b/nodemanager/scripts/hpcagent @@ -36,215 +36,215 @@ LogFile = '/opt/hpcnodemanager/logs/hpclinuxagent.log' RestartIntervalInSeconds = 60 def main(): - global config, logger - logger = agentUtil.LoggerInit(LogFile, '/dev/stdout') - logger.Log("The command line is: " + " ".join(sys.argv)) - config = configUtil.ConfigUtility(logger.Log, logger.Error) + global config, logger + logger = agentUtil.LoggerInit(LogFile, '/dev/stdout') + logger.Log("The command line is: " + " ".join(sys.argv)) + config = configUtil.ConfigUtility(logger.Log, logger.Error) - ret = 1 - if re.match("^([-/]*)(disable)", sys.argv[1]): - ret = disable() - elif re.match("^([-/]*)(enable)", sys.argv[1]): - ret = enable() - elif re.match("^([-/]*)(daemon)", sys.argv[1]): - ret = daemon() - elif re.match("^([-/]*)(restart)", sys.argv[1]): - disable() - ret = enable() - else: - logger.Log("Invalid parameter %s" % sys.argv[1]) - sys.exit(ret) + ret = 1 + if re.match("^([-/]*)(disable)", sys.argv[1]): + ret = disable() + elif re.match("^([-/]*)(enable)", sys.argv[1]): + ret = enable() + elif re.match("^([-/]*)(daemon)", sys.argv[1]): + ret = daemon() + elif re.match("^([-/]*)(restart)", sys.argv[1]): + disable() + ret = enable() + else: + logger.Log("Invalid parameter %s" % sys.argv[1]) + sys.exit(ret) def _is_nodemanager_daemon(pid): - retcode, output = agentUtil.RunGetOutput("ps -p {0} -o cmd=".format(pid)) - if retcode == 0: - logger.Log("The cmd for process {0} is {1}".format(pid, output)) - pattern = r'(.*[/\s])?{0}\s+[-/]*daemon$'.format(__file__) - if re.match(pattern, output): - return True - logger.Log("The process {0} is not HPC Linux node manager daemon".format(pid)) - return False + retcode, output = agentUtil.RunGetOutput("ps -p {0} -o cmd=".format(pid)) + if retcode == 0: + logger.Log("The cmd for process {0} is {1}".format(pid, output)) + pattern = r'(.*[/\s])?{0}\s+[-/]*daemon$'.format(__file__) + if re.match(pattern, output): + return True + logger.Log("The process {0} is not HPC Linux node manager daemon".format(pid)) + return False def _add_dns_search(domain_fqdn): - need_update = False - new_content = '' - for line in (open('/etc/resolv.conf', 'r')).readlines(): - if re.match('^search.* {0}'.format(domain_fqdn), line): - logger.Log('{0} was already added in /etc/resolv.conf'.format(domain_fqdn)) - return - if re.match('^search', line): - need_update = True - new_content += line.replace('search', 'search {0}'.format(domain_fqdn)) - else: - new_content += line - if need_update: - logger.Log('Adding {0} to /etc/resolv.conf'.format(domain_fqdn)) - agentUtil.SetFileContents('/etc/resolv.conf', new_content) + need_update = False + new_content = '' + for line in (open('/etc/resolv.conf', 'r')).readlines(): + if re.match('^search.* {0}'.format(domain_fqdn), line): + logger.Log('{0} was already added in /etc/resolv.conf'.format(domain_fqdn)) + return + if re.match('^search', line): + need_update = True + new_content += line.replace('search', 'search {0}'.format(domain_fqdn)) + else: + new_content += line + if need_update: + logger.Log('Adding {0} to /etc/resolv.conf'.format(domain_fqdn)) + agentUtil.SetFileContents('/etc/resolv.conf', new_content) def _mount_cgroup(): - if not os.path.isdir('/cgroup'): - os.mkdir('/cgroup') - if not os.listdir('/cgroup'): - retcode, mount_msg = agentUtil.RunGetOutput('mount -t cgroup cgroup /cgroup') - logger.Log("mount /cgroup directory {0}:{1}".format(retcode, mount_msg)) - if retcode == 0: - logger.Log("/cgroup directory is successfully mounted.") - else: - raise Exception("failed to mount /cgroup directory") - else: - logger.Log("/cgroup directory was already mounted.") + if not os.path.isdir('/cgroup'): + os.mkdir('/cgroup') + if not os.listdir('/cgroup'): + retcode, mount_msg = agentUtil.RunGetOutput('mount -t cgroup cgroup /cgroup') + logger.Log("mount /cgroup directory {0}:{1}".format(retcode, mount_msg)) + if retcode == 0: + logger.Log("/cgroup directory is successfully mounted.") + else: + raise Exception("failed to mount /cgroup directory") + else: + logger.Log("/cgroup directory was already mounted.") def configure_iptables(): - if agentUtil.Run("command -v iptables", chk_err=False) == 0: - agentUtil.Run("iptables -D INPUT -p tcp --dport 40000 -j ACCEPT", chk_err=False) - agentUtil.Run("iptables -D INPUT -p tcp --dport 40002 -j ACCEPT", chk_err=False) - agentUtil.Run("iptables -I INPUT -p tcp --dport 40000 -j ACCEPT") - agentUtil.Run("iptables -I INPUT -p tcp --dport 40002 -j ACCEPT") + if agentUtil.Run("command -v iptables", chk_err=False) == 0: + agentUtil.Run("iptables -D INPUT -p tcp --dport 40000 -j ACCEPT", chk_err=False) + agentUtil.Run("iptables -D INPUT -p tcp --dport 40002 -j ACCEPT", chk_err=False) + agentUtil.Run("iptables -I INPUT -p tcp --dport 40000 -j ACCEPT") + agentUtil.Run("iptables -I INPUT -p tcp --dport 40002 -j ACCEPT") def enable(): - # Check whether monitor process is running. - # If it does, return. Otherwise clear pid file - if os.path.isfile(PidFilePath): - pid = agentUtil.GetFileContents(PidFilePath) - if os.path.isdir(os.path.join("/proc", pid)) and _is_nodemanager_daemon(pid): - logger.Log('HPC Linux node manager daemon is already running') - return 0 - os.remove(PidFilePath) + # Check whether monitor process is running. + # If it does, return. Otherwise clear pid file + if os.path.isfile(PidFilePath): + pid = agentUtil.GetFileContents(PidFilePath) + if os.path.isdir(os.path.join("/proc", pid)) and _is_nodemanager_daemon(pid): + logger.Log('HPC Linux node manager daemon is already running') + return 0 + os.remove(PidFilePath) - args = [get_python_executor(), os.path.join(os.getcwd(), __file__), "daemon"] - devnull = open(os.devnull, 'w') - child = subprocess.Popen(args, stdout=devnull, stderr=devnull, preexec_fn=os.setsid) - if child.pid is None or child.pid < 1: - logger.Error('Failed to launch HPC Linux node manager daemon') - return 1 - else: - # Sleep 3 seconds to check if the process is still running - time.sleep(3) - if child.poll() is None: - agentUtil.SetFileContents(PidFilePath, str(child.pid)) - logger.Log("Daemon pid: {0}".format(child.pid)) - logger.Log('HPC Linux node manager daemon is enabled') - return 0 - else: - logger.Log('Failed to launch HPC Linux node manager daemon') - return 1 + args = [get_python_executor(), os.path.join(os.getcwd(), __file__), "daemon"] + devnull = open(os.devnull, 'w') + child = subprocess.Popen(args, stdout=devnull, stderr=devnull, preexec_fn=os.setsid) + if child.pid is None or child.pid < 1: + logger.Error('Failed to launch HPC Linux node manager daemon') + return 1 + else: + # Sleep 3 seconds to check if the process is still running + time.sleep(3) + if child.poll() is None: + agentUtil.SetFileContents(PidFilePath, str(child.pid)) + logger.Log("Daemon pid: {0}".format(child.pid)) + logger.Log('HPC Linux node manager daemon is enabled') + return 0 + else: + logger.Log('Failed to launch HPC Linux node manager daemon') + return 1 def get_dist_info(): - distroName = '' - distroVersion = '' - if 'linux_distribution' in dir(platform): - distinfo = platform.linux_distribution(full_distribution_name=0) - distroName = distinfo[0].strip() - distroVersion = distinfo[1] - # if the distroName is empty we get from /etc/*-release - if not distroName: - errCode, info = agentUtil.RunGetOutput("cat /etc/*-release") - if errCode != 0: - raise Exception('Failed to get Linux Distro info by running command "cat /etc/*release", error code: {}'.format(errCode)) - for line in info.splitlines(): - if line.startswith('PRETTY_NAME='): - line = line.lower() - if 'ubuntu' in line: - distroName = 'ubuntu' - elif 'centos' in line: - distroName = 'centos' - elif 'red hat' in line: - distroName = 'redhat' - elif 'suse' in line: - distroName = 'suse' - elif 'almalinux' in line: - distroName = 'almalinux' - elif 'rocky' in line: - distroName = 'rocky' - elif 'fedora' in line: - distroName = 'fedora' - elif 'freebsd' in line: - distroName = 'freebsd' - else: - raise Exception('Unknown linux distribution with {}'.format(line)) - if line.startswith('VERSION_ID='): - line = line.strip(' ') - quoteIndex = line.index('"') - if quoteIndex >= 0: - distroVersion = line[quoteIndex+1:-1] - return distroName.lower(), distroVersion + distroName = '' + distroVersion = '' + if 'linux_distribution' in dir(platform): + distinfo = platform.linux_distribution(full_distribution_name=0) + distroName = distinfo[0].strip() + distroVersion = distinfo[1] + # if the distroName is empty we get from /etc/*-release + if not distroName: + errCode, info = agentUtil.RunGetOutput("cat /etc/*-release") + if errCode != 0: + raise Exception('Failed to get Linux Distro info by running command "cat /etc/*release", error code: {}'.format(errCode)) + for line in info.splitlines(): + if line.startswith('PRETTY_NAME='): + line = line.lower() + if 'ubuntu' in line: + distroName = 'ubuntu' + elif 'centos' in line: + distroName = 'centos' + elif 'red hat' in line: + distroName = 'redhat' + elif 'suse' in line: + distroName = 'suse' + elif 'almalinux' in line: + distroName = 'almalinux' + elif 'rocky' in line: + distroName = 'rocky' + elif 'fedora' in line: + distroName = 'fedora' + elif 'freebsd' in line: + distroName = 'freebsd' + else: + raise Exception('Unknown linux distribution with {}'.format(line)) + if line.startswith('VERSION_ID='): + line = line.strip(' ') + quoteIndex = line.index('"') + if quoteIndex >= 0: + distroVersion = line[quoteIndex+1:-1] + return distroName.lower(), distroVersion def get_python_executor(): - cmd = '' - if sys.version_info.major == 2: - cmd = 'python2' - elif sys.version_info.major == 3: - cmd = 'python3' - if agentUtil.Run("command -v {0}".format(cmd), chk_err=False) != 0: - # If a user-installed python isn't available, check for a platform-python. This is typically only used in RHEL 8.0. - if agentUtil.Run("command -v /usr/libexec/platform-python", chk_err=False) == 0: - cmd = '/usr/libexec/platform-python' - return cmd + cmd = '' + if sys.version_info.major == 2: + cmd = 'python2' + elif sys.version_info.major == 3: + cmd = 'python3' + if agentUtil.Run("command -v {0}".format(cmd), chk_err=False) != 0: + # If a user-installed python isn't available, check for a platform-python. This is typically only used in RHEL 8.0. + if agentUtil.Run("command -v /usr/libexec/platform-python", chk_err=False) == 0: + cmd = '/usr/libexec/platform-python' + return cmd def daemon(): - conn_string = config.get_cluster_connectionstring() - logger.Log("The connection string is " + conn_string) - dns_name = conn_string.split(',')[0].strip() - if dns_name.find('.') > 0: - # The cluster name is FQDN, extract the domain FQDN - domain_fqdn = dns_name.split(".", 1)[1] - logger.Log("The domain FQDN is " + domain_fqdn) - _add_dns_search(domain_fqdn) + conn_string = config.get_cluster_connectionstring() + logger.Log("The connection string is " + conn_string) + dns_name = conn_string.split(',')[0].strip() + if dns_name.find('.') > 0: + # The cluster name is FQDN, extract the domain FQDN + domain_fqdn = dns_name.split(".", 1)[1] + logger.Log("The domain FQDN is " + domain_fqdn) + _add_dns_search(domain_fqdn) - if os.path.exists("/sys/fs/cgroup/cgroup.controllers"): - agentUtil.Run("echo \"+cpu +cpuset +memory\" > /sys/fs/cgroup/cgroup.subtree_control") - logger.Log("Cgroup v2 subsystem enabled: cpu, cpuset, memory") + if os.path.exists("/sys/fs/cgroup/cgroup.controllers"): + agentUtil.Run("echo \"+cpu +cpuset +memory\" > /sys/fs/cgroup/cgroup.subtree_control") + logger.Log("Cgroup v2 subsystem enabled: cpu, cpuset, memory") - try: - # Mount the directory /cgroup for centos 6.* - distroName, distroVersion = get_dist_info() - major_version = int(distroVersion.split('.')[0]) - if (distroName == 'centos' or distroName == 'redhat') and major_version < 7: - _mount_cgroup() - logger.Log("Configure iptables to allow incoming tcp connection to 40000 and 40002.") - configure_iptables() - while True: - exe_path = os.path.join(InstallRoot, "nodemanager") - devnull = open(os.devnull, 'w') - child_process = subprocess.Popen(exe_path, stdout=devnull, stderr=devnull, cwd=InstallRoot) - if child_process.pid is None or child_process.pid < 1: - logger.Log('Failed to start HPC node manager process') - return 1 - else: - # Sleep 1 second to check if the process is still running - time.sleep(1) - if child_process.poll() is None: - logger.Log('HPC node manager process started') - exit_code = child_process.wait() - exit_msg = "HPC node manager process exits: {0}".format(exit_code) - logger.Warn(exit_msg) - else: - exit_msg = "HPC node manager process crashes: {0}".format(child_process.returncode) - logger.Error(exit_msg) - logger.Log("Restart HPC node manager process after {0} seconds".format(RestartIntervalInSeconds)) - time.sleep(RestartIntervalInSeconds) + try: + # Mount the directory /cgroup for centos 6.* + distroName, distroVersion = get_dist_info() + major_version = int(distroVersion.split('.')[0]) + if (distroName == 'centos' or distroName == 'redhat') and major_version < 7: + _mount_cgroup() + logger.Log("Configure iptables to allow incoming tcp connection to 40000 and 40002.") + configure_iptables() + while True: + exe_path = os.path.join(InstallRoot, "nodemanager") + devnull = open(os.devnull, 'w') + child_process = subprocess.Popen(exe_path, stdout=devnull, stderr=devnull, cwd=InstallRoot) + if child_process.pid is None or child_process.pid < 1: + logger.Log('Failed to start HPC node manager process') + return 1 + else: + # Sleep 1 second to check if the process is still running + time.sleep(1) + if child_process.poll() is None: + logger.Log('HPC node manager process started') + exit_code = child_process.wait() + exit_msg = "HPC node manager process exits: {0}".format(exit_code) + logger.Warn(exit_msg) + else: + exit_msg = "HPC node manager process crashes: {0}".format(child_process.returncode) + logger.Error(exit_msg) + logger.Log("Restart HPC node manager process after {0} seconds".format(RestartIntervalInSeconds)) + time.sleep(RestartIntervalInSeconds) - except Exception as e: - logger.Error("Failed to start the daemon with error:{0}, stack trace: {1}".format(e, traceback.format_exc())) - return 1 + except Exception as e: + logger.Error("Failed to start the daemon with error:{0}, stack trace: {1}".format(e, traceback.format_exc())) + return 1 def disable(): - # Check whether daemon process is running. - # If it does, kill it. Otherwise clear pid file - if os.path.isfile(PidFilePath): - pid = agentUtil.GetFileContents(PidFilePath) - if os.path.isdir(os.path.join("/proc", pid)) and _is_nodemanager_daemon(pid): - logger.Log("Stop HPC node manager daemon: {0}".format(pid)) - os.killpg(int(pid), 9) - logger.Log('HPC node manager daemon is disabled') - os.remove(PidFilePath) - else: - logger.Log('HPC node manager daemon is not running') - return 0 + # Check whether daemon process is running. + # If it does, kill it. Otherwise clear pid file + if os.path.isfile(PidFilePath): + pid = agentUtil.GetFileContents(PidFilePath) + if os.path.isdir(os.path.join("/proc", pid)) and _is_nodemanager_daemon(pid): + logger.Log("Stop HPC node manager daemon: {0}".format(pid)) + os.killpg(int(pid), 9) + logger.Log('HPC node manager daemon is disabled') + os.remove(PidFilePath) + else: + logger.Log('HPC node manager daemon is not running') + return 0 if __name__ == '__main__': - main() \ No newline at end of file + main() \ No newline at end of file diff --git a/nodemanager/scripts/hpcagent.sh b/nodemanager/scripts/hpcagent.sh index be1b0d9..b6bc214 100644 --- a/nodemanager/scripts/hpcagent.sh +++ b/nodemanager/scripts/hpcagent.sh @@ -26,8 +26,8 @@ PIDFILE=/var/run/hpcnmdaemon.pid # Exit if not run as root if [[ $EUID != 0 ]]; then - echo "This script must be run as root" - exit 1 + echo "This script must be run as root" + exit 1 fi # Exit if the package is not installed @@ -37,11 +37,11 @@ fi [ -r /etc/default/$NAME ] && . /etc/default/$NAME if command -v python3 >/dev/null 2>&1 ; then - PYTHONEXECUTOR="python3" + PYTHONEXECUTOR="python3" elif command -v python2 >/dev/null 2>&1 ; then - PYTHONEXECUTOR="python2" + PYTHONEXECUTOR="python2" elif command -v /usr/libexec/platform-python >/dev/null 2>&1 ; then - PYTHONEXECUTOR="/usr/libexec/platform-python" + PYTHONEXECUTOR="/usr/libexec/platform-python" fi # @@ -49,16 +49,16 @@ fi # do_start() { - local RC=0 - echo "Starting $FriendlyName" - $PYTHONEXECUTOR $AgentPath enable - RC=$? - if [ $RC = 0 ]; then - echo "$FriendlyName was started" - else - echo "Failed to start $FriendlyName : $RC" - fi - return $RC + local RC=0 + echo "Starting $FriendlyName" + $PYTHONEXECUTOR $AgentPath enable + RC=$? + if [ $RC = 0 ]; then + echo "$FriendlyName was started" + else + echo "Failed to start $FriendlyName : $RC" + fi + return $RC } # @@ -66,16 +66,16 @@ do_start() # do_stop() { - local RC=0 - echo "Starting $FriendlyName" - $PYTHONEXECUTOR $AgentPath disable - RC=$? - if [ $RC = 0 ]; then - echo "$FriendlyName was stopped" - else - echo "Failed to stop $FriendlyName : $RC" - fi - return $RC + local RC=0 + echo "Starting $FriendlyName" + $PYTHONEXECUTOR $AgentPath disable + RC=$? + if [ $RC = 0 ]; then + echo "$FriendlyName was stopped" + else + echo "Failed to stop $FriendlyName : $RC" + fi + return $RC } # @@ -83,54 +83,54 @@ do_stop() # do_restart() { - local RC=0 - echo "restarting $FriendlyName" - $PYTHONEXECUTOR $AgentPath restart - RC=$? - if [ $RC = 0 ]; then - echo "$FriendlyName was restarted" - else - echo "Failed to restart $FriendlyName : $RC" - fi - return $RC + local RC=0 + echo "restarting $FriendlyName" + $PYTHONEXECUTOR $AgentPath restart + RC=$? + if [ $RC = 0 ]; then + echo "$FriendlyName was restarted" + else + echo "Failed to restart $FriendlyName : $RC" + fi + return $RC } do_status() { - local daemon_pid=0 - [ -r $PIDFILE ] && read daemon_pid < $PIDFILE - if [ $daemon_pid != 0 ]; then - local cmdline=`ps -p $daemon_pid -o cmd=` - [[ "$cmdline" = *"$AgentPath"* ]] || daemon_pid=0 - fi - if [ $daemon_pid = 0 ]; then - echo "$FriendlyName is not running" - else - echo "$FriendlyName is running" - fi - return 0 + local daemon_pid=0 + [ -r $PIDFILE ] && read daemon_pid < $PIDFILE + if [ $daemon_pid != 0 ]; then + local cmdline=`ps -p $daemon_pid -o cmd=` + [[ "$cmdline" = *"$AgentPath"* ]] || daemon_pid=0 + fi + if [ $daemon_pid = 0 ]; then + echo "$FriendlyName is not running" + else + echo "$FriendlyName is running" + fi + return 0 } case "$1" in start) - do_start - RETVAL=$? - ;; + do_start + RETVAL=$? + ;; stop) - do_stop - RETVAL=$? - ;; + do_stop + RETVAL=$? + ;; status) - do_status - RETVAL=$? - ;; + do_status + RETVAL=$? + ;; restart|reload|force-reload) - do_restart - RETVAL=$? - ;; + do_restart + RETVAL=$? + ;; *) - echo "Usage: $SCRIPTNAME {start|stop|status|restart|force-reload}" >&2 - RETVAL=3 - ;; + echo "Usage: $SCRIPTNAME {start|stop|status|restart|force-reload}" >&2 + RETVAL=3 + ;; esac exit $RETVAL \ No newline at end of file diff --git a/nodemanager/scripts/setup.py b/nodemanager/scripts/setup.py index 30c5429..023b5cf 100644 --- a/nodemanager/scripts/setup.py +++ b/nodemanager/scripts/setup.py @@ -42,692 +42,692 @@ SetupLogFile = None SupportSystemd = False if not hasattr(subprocess,'check_output'): - def check_output(*popenargs, **kwargs): - r"""Backport from subprocess module from python 2.7""" - if 'stdout' in kwargs: - raise ValueError('stdout argument not allowed, it will be overridden.') - process = subprocess.Popen(stdout=subprocess.PIPE, *popenargs, **kwargs) - output, unused_err = process.communicate() - retcode = process.poll() - if retcode: - cmd = kwargs.get("args") - if cmd is None: - cmd = popenargs[0] - raise subprocess.CalledProcessError(retcode, cmd, output=output) - return output + def check_output(*popenargs, **kwargs): + r"""Backport from subprocess module from python 2.7""" + if 'stdout' in kwargs: + raise ValueError('stdout argument not allowed, it will be overridden.') + process = subprocess.Popen(stdout=subprocess.PIPE, *popenargs, **kwargs) + output, unused_err = process.communicate() + retcode = process.poll() + if retcode: + cmd = kwargs.get("args") + if cmd is None: + cmd = popenargs[0] + raise subprocess.CalledProcessError(retcode, cmd, output=output) + return output - # Exception classes used by this module. - class CalledProcessError(Exception): - def __init__(self, returncode, cmd, output=None): - self.returncode = returncode - self.cmd = cmd - self.output = output - def __str__(self): - return "Command '%s' returned non-zero exit status %d" % (self.cmd, self.returncode) + # Exception classes used by this module. + class CalledProcessError(Exception): + def __init__(self, returncode, cmd, output=None): + self.returncode = returncode + self.cmd = cmd + self.output = output + def __str__(self): + return "Command '%s' returned non-zero exit status %d" % (self.cmd, self.returncode) - subprocess.check_output=check_output - subprocess.CalledProcessError=CalledProcessError + subprocess.check_output=check_output + subprocess.CalledProcessError=CalledProcessError def get_argvalue(argstr): - start_index = argstr.index(':') + 1 - return argstr[start_index:] + start_index = argstr.index(':') + 1 + return argstr[start_index:] def install_package(package_name): - if DistroName in ["centos", "redhat", "alma", "almalinux", "rocky", "rockylinux"]: - cmd = "yum -y install " + package_name - elif DistroName == "ubuntu": - cmd = "apt-get -y install " + package_name - elif DistroName == "suse": - cmd = "zypper -n install " + package_name - else: - raise Exception("Unsupported Linux Distro.") - Log("The command to install {0}: {1}".format(package_name, cmd)) - attempt = 1 - while(True): - Log("Installing package {0} (Attempt {1})".format(package_name, attempt)) - retcode, retoutput = RunGetOutput(cmd) - if retcode == 0: - Log("package {0} installation succeeded".format(package_name)) - break - else: - Log("package {0} installation failed {0}:\n {1}".format(package_name, retcode, retoutput)) - if attempt < 3: - attempt += 1 - time.sleep(5) - continue - else: - raise Exception("failed to install package {0}:{1}".format(package_name, retcode)) + if DistroName in ["centos", "redhat", "alma", "almalinux", "rocky", "rockylinux"]: + cmd = "yum -y install " + package_name + elif DistroName == "ubuntu": + cmd = "apt-get -y install " + package_name + elif DistroName == "suse": + cmd = "zypper -n install " + package_name + else: + raise Exception("Unsupported Linux Distro.") + Log("The command to install {0}: {1}".format(package_name, cmd)) + attempt = 1 + while(True): + Log("Installing package {0} (Attempt {1})".format(package_name, attempt)) + retcode, retoutput = RunGetOutput(cmd) + if retcode == 0: + Log("package {0} installation succeeded".format(package_name)) + break + else: + Log("package {0} installation failed {0}:\n {1}".format(package_name, retcode, retoutput)) + if attempt < 3: + attempt += 1 + time.sleep(5) + continue + else: + raise Exception("failed to install package {0}:{1}".format(package_name, retcode)) def extract_hpcagent_files(src): - srctar = tarfile.open(src, 'r:gz') - try: - Run("rm -rf {0}/nodemanager {0}/hpcagent {0}/*.sh {0}/*.py {0}/lib {0}/Utils".format(InstallRoot)) - if ( - DistroName in ["centos", "redhat", "alma", "almalinux", "rocky", "rockylinux"] - and re.match("^[8|9]", DistroVersion) - and 'filter' in inspect.signature(srctar.extractall).parameters.keys() - ): - srctar.extractall(path=InstallRoot, filter="fully_trusted") - else: - srctar.extractall(InstallRoot) - libdir = os.path.join(InstallRoot, 'lib') - os.chmod(libdir, 0o644) - os.chmod(os.path.join(InstallRoot, 'Utils'), 0o644) - # Copy setup.py itself to the InstallRoot - shutil.copy2(__file__, InstallRoot) - for tmpname in os.listdir(libdir): - tmppath = os.path.join(libdir, tmpname) - if tmpname.endswith(".tar.gz") and os.path.isfile(tmppath): - Run("tar xzvf {0} -C {1}".format(tmppath, libdir)) - os.remove(tmppath) - Run("chmod 755 {0}/nodemanager {0}/hpcagent {0}/*.sh {0}/*.py {0}/Utils/*".format(InstallRoot)) - Run("chmod -R 755 {0}/lib".format(InstallRoot)) - finally: - srctar.close() + srctar = tarfile.open(src, 'r:gz') + try: + Run("rm -rf {0}/nodemanager {0}/hpcagent {0}/*.sh {0}/*.py {0}/lib {0}/Utils".format(InstallRoot)) + if ( + DistroName in ["centos", "redhat", "alma", "almalinux", "rocky", "rockylinux"] + and re.match("^[8|9]", DistroVersion) + and 'filter' in inspect.signature(srctar.extractall).parameters.keys() + ): + srctar.extractall(path=InstallRoot, filter="fully_trusted") + else: + srctar.extractall(InstallRoot) + libdir = os.path.join(InstallRoot, 'lib') + os.chmod(libdir, 0o644) + os.chmod(os.path.join(InstallRoot, 'Utils'), 0o644) + # Copy setup.py itself to the InstallRoot + shutil.copy2(__file__, InstallRoot) + for tmpname in os.listdir(libdir): + tmppath = os.path.join(libdir, tmpname) + if tmpname.endswith(".tar.gz") and os.path.isfile(tmppath): + Run("tar xzvf {0} -C {1}".format(tmppath, libdir)) + os.remove(tmppath) + Run("chmod 755 {0}/nodemanager {0}/hpcagent {0}/*.sh {0}/*.py {0}/Utils/*".format(InstallRoot)) + Run("chmod -R 755 {0}/lib".format(InstallRoot)) + finally: + srctar.close() def remove_hpcagent_files(keep_log=True, keep_cert=True): - if os.path.isdir(InstallRoot): - for tmpname in os.listdir(InstallRoot): - if tmpname == 'logs' and keep_log: - continue - if tmpname == 'certs'and keep_cert: - continue - tmppath = os.path.join(InstallRoot, tmpname) - if os.path.isdir(tmppath): - shutil.rmtree(tmppath) - elif os.path.isfile(tmppath): - os.remove(tmppath) + if os.path.isdir(InstallRoot): + for tmpname in os.listdir(InstallRoot): + if tmpname == 'logs' and keep_log: + continue + if tmpname == 'certs'and keep_cert: + continue + tmppath = os.path.join(InstallRoot, tmpname) + if os.path.isdir(tmppath): + shutil.rmtree(tmppath) + elif os.path.isfile(tmppath): + os.remove(tmppath) def install_cgroup_tools(): - if os.path.exists("/sys/fs/cgroup/cgroup.controllers"): - Log("cgroup v2 enabled, skip cgroup tools installation") - elif Run("command -v cgexec", chk_err=False) == 0: - Log("cgroup tools was already installed") - else: - Log("Start to install cgroup tools") - if DistroName == "ubuntu": - if re.match("^1", DistroVersion): - cg_pkgname = 'cgroup-bin' - else: - cg_pkgname = 'cgroup-tools' - elif (DistroName == "centos" or DistroName == "redhat") and re.match("^6", DistroVersion): - cg_pkgname = 'libcgroup' - else: - cg_pkgname = 'libcgroup-tools' - install_package(cg_pkgname) - Log("cgroup tool was successfully installed") + if os.path.exists("/sys/fs/cgroup/cgroup.controllers"): + Log("cgroup v2 enabled, skip cgroup tools installation") + elif Run("command -v cgexec", chk_err=False) == 0: + Log("cgroup tools was already installed") + else: + Log("Start to install cgroup tools") + if DistroName == "ubuntu": + if re.match("^1", DistroVersion): + cg_pkgname = 'cgroup-bin' + else: + cg_pkgname = 'cgroup-tools' + elif (DistroName == "centos" or DistroName == "redhat") and re.match("^6", DistroVersion): + cg_pkgname = 'libcgroup' + else: + cg_pkgname = 'libcgroup-tools' + install_package(cg_pkgname) + Log("cgroup tool was successfully installed") def install_sysstat(): - if Run("command -v iostat", chk_err=False) == 0: - Log("sysstat was already installed") - else: - Log("Start to install sysstat") - install_package('sysstat') - Log("sysstat was successfully installed") + if Run("command -v iostat", chk_err=False) == 0: + Log("sysstat was already installed") + else: + Log("Start to install sysstat") + install_package('sysstat') + Log("sysstat was successfully installed") def install_pstree(): - if Run("command -v pstree", chk_err=False) == 0: - Log("pstree was already installed") - else: - Log("Start to install pstree") - install_package('psmisc') - Log("pstree was successfully installed") + if Run("command -v pstree", chk_err=False) == 0: + Log("pstree was already installed") + else: + Log("Start to install pstree") + install_package('psmisc') + Log("pstree was successfully installed") def install_chkconfig(): - if os.path.isfile("/usr/lib/systemd/systemd-sysv-install"): - Log("chkconfig was already installed") - else: - Log("Start to install chkconfig") - if os.path.isdir("/etc/init.d"): - shutil.rmtree("/etc/init.d") - install_package('chkconfig') - Log("chkconfig was successfully installed") + if os.path.isfile("/usr/lib/systemd/systemd-sysv-install"): + Log("chkconfig was already installed") + else: + Log("Start to install chkconfig") + if os.path.isdir("/etc/init.d"): + shutil.rmtree("/etc/init.d") + install_package('chkconfig') + Log("chkconfig was successfully installed") def copy_direcotry(src, dest): - if not os.path.exists(dest): - shutil.copytree(src, dest) - if not os.path.samefile(src, dest): - for filename in os.listdir(src): - srcname = os.path.join(src, filename) - destname = os.path.join(dest, filename) - if os.path.isfile(srcname): - shutil.copy2(srcname, destname) - elif os.path.isdir(srcname): - copy_direcotry(srcname, destname) + if not os.path.exists(dest): + shutil.copytree(src, dest) + if not os.path.samefile(src, dest): + for filename in os.listdir(src): + srcname = os.path.join(src, filename) + destname = os.path.join(dest, filename) + if os.path.isfile(srcname): + shutil.copy2(srcname, destname) + elif os.path.isdir(srcname): + copy_direcotry(srcname, destname) def Usage(): - usage = 'Usage: \n' \ - '*Fresh new install the HPC node agent:\n' \ - ' setup.py -install -connectionstring: -certfile: -certpasswd: -authenticationkey: [-managehosts]\n\n' \ - '*Install the HPC node agent with currently existing certificate:\n' \ - ' setup.py -install -connectionstring: -keepcert [-managehosts]\n\n' \ - '*Uninstall the HPC node agent:\n' \ - ' setup.py -uninstall [-keepcert]\n\n' \ - '*Update the binaries of HPC node agent:\n' \ - ' setup.py -update\n\n' \ - '*Update the certificate used to communicate with head node:\n' \ - ' setup.py -updatecert -certfile: -certpasswd: -authenticationkey:\n\n' \ - 'Description of the parameters:\n' \ - ' connectionstring: The connection string of the HPC cluster, typically a list of head node hostnames or full qualified domain names.\n' \ - ' certfile: The PFX certificate file used to communicate with head node\n' \ - ' certpasswd: The protection password of the PFX certificate\n' \ - ' authenticationkey: same as ClusterAuthenticationKey registry setting on the head node, used to secure communication between head node and compute nodes.\n\n' \ - ' keepcert: Keep the currently existing certificates\n' \ - ' managehosts: Specify that you want the /etc/hosts file managed by HPC\n\n' \ - 'Note: This command must be run as root user\n\n' \ - 'Examples: \n' \ - 'setup.py -install -connectionstring:\'hn1,hn2,hn3\' -certfile:\'/root/mycert.pfx\' -certpasswd:\'certpass\' -managehosts -authenticationkey:\'authenticationkey\'\n\n' \ - 'setup.py -install -connectionstring:\'hn1.hpc.local,hn2.hpc.local,hn3.hpc.local\' -keepcert\n\n' \ - 'setup.py -uninstall -keepcert\n\n' \ - 'setup.py -update\n\n' \ - 'setup.py -updatecert -certfile:\'/root/newcert.pfx\' -certpasswd:\'certpass\' -authenticationkey:\'authenticationkey\'\n' - print(usage) + usage = 'Usage: \n' \ + '*Fresh new install the HPC node agent:\n' \ + ' setup.py -install -connectionstring: -certfile: -certpasswd: -authenticationkey: [-managehosts]\n\n' \ + '*Install the HPC node agent with currently existing certificate:\n' \ + ' setup.py -install -connectionstring: -keepcert [-managehosts]\n\n' \ + '*Uninstall the HPC node agent:\n' \ + ' setup.py -uninstall [-keepcert]\n\n' \ + '*Update the binaries of HPC node agent:\n' \ + ' setup.py -update\n\n' \ + '*Update the certificate used to communicate with head node:\n' \ + ' setup.py -updatecert -certfile: -certpasswd: -authenticationkey:\n\n' \ + 'Description of the parameters:\n' \ + ' connectionstring: The connection string of the HPC cluster, typically a list of head node hostnames or full qualified domain names.\n' \ + ' certfile: The PFX certificate file used to communicate with head node\n' \ + ' certpasswd: The protection password of the PFX certificate\n' \ + ' authenticationkey: same as ClusterAuthenticationKey registry setting on the head node, used to secure communication between head node and compute nodes.\n\n' \ + ' keepcert: Keep the currently existing certificates\n' \ + ' managehosts: Specify that you want the /etc/hosts file managed by HPC\n\n' \ + 'Note: This command must be run as root user\n\n' \ + 'Examples: \n' \ + 'setup.py -install -connectionstring:\'hn1,hn2,hn3\' -certfile:\'/root/mycert.pfx\' -certpasswd:\'certpass\' -managehosts -authenticationkey:\'authenticationkey\'\n\n' \ + 'setup.py -install -connectionstring:\'hn1.hpc.local,hn2.hpc.local,hn3.hpc.local\' -keepcert\n\n' \ + 'setup.py -uninstall -keepcert\n\n' \ + 'setup.py -update\n\n' \ + 'setup.py -updatecert -certfile:\'/root/newcert.pfx\' -certpasswd:\'certpass\' -authenticationkey:\'authenticationkey\'\n' + print(usage) def is_hpcagent_installed(): - if os.path.isfile('/etc/init.d/hpcagent'): - return True - else: - return False + if os.path.isfile('/etc/init.d/hpcagent'): + return True + else: + return False def cleanup_host_entries(): - hostsfile = '/etc/hosts' - if not os.path.isfile(hostsfile): - return - try: - updated = False - newcontent='' - with open(hostsfile, 'r') as F: - for line in F.readlines(): - if re.match(r"^[0-9\.]+\s+[^\s#]+\s+#HPC\s*$", line): - updated = True - else: - newcontent += line - if updated: - Log("Clean HPC related host entries from hosts file") - ReplaceFileContentsAtomic(hostsfile,newcontent) - os.chmod(hostsfile, 0o644) - except : - raise + hostsfile = '/etc/hosts' + if not os.path.isfile(hostsfile): + return + try: + updated = False + newcontent='' + with open(hostsfile, 'r') as F: + for line in F.readlines(): + if re.match(r"^[0-9\.]+\s+[^\s#]+\s+#HPC\s*$", line): + updated = True + else: + newcontent += line + if updated: + Log("Clean HPC related host entries from hosts file") + ReplaceFileContentsAtomic(hostsfile,newcontent) + os.chmod(hostsfile, 0o644) + except : + raise def cleanup_hpc_agent(keepcert): - if os.path.isfile('/etc/init.d/hpcagent'): - Log("Stop the hpc node agent") - if SupportSystemd: - Run("systemctl stop hpcagent", chk_err=False) - Run("systemctl disable hpcagent", chk_err=False) - os.remove('/etc/init.d/hpcagent') - Run("systemctl reset-failed", chk_err=False) - else: - Run("service hpcagent stop", chk_err=False) - if DistroName == "ubuntu": - Run("update-rc.d -f hpcagent remove") - elif DistroName in ["centos", "redhat", "suse", "alma", "almalinux", "rocky", "rockylinux"]: - Run("chkconfig --del hpcagent") - else: - raise Exception("unsupported Linux Distro") - os.remove('/etc/init.d/hpcagent') - if Run("firewall-cmd --state", chk_err=False) == 0: - Log("Remove firewall policies for hpc agent") - Run("firewall-cmd --permanent --zone=public --query-port=40000/tcp && firewall-cmd --permanent --zone=public --remove-port=40000/tcp", chk_err=False) - Run("firewall-cmd --permanent --zone=public --query-port=40002/tcp && firewall-cmd --permanent --zone=public --remove-port=40002/tcp", chk_err=False) - Run("firewall-cmd --reload") - remove_hpcagent_files(keep_cert=keepcert) - cleanup_host_entries() + if os.path.isfile('/etc/init.d/hpcagent'): + Log("Stop the hpc node agent") + if SupportSystemd: + Run("systemctl stop hpcagent", chk_err=False) + Run("systemctl disable hpcagent", chk_err=False) + os.remove('/etc/init.d/hpcagent') + Run("systemctl reset-failed", chk_err=False) + else: + Run("service hpcagent stop", chk_err=False) + if DistroName == "ubuntu": + Run("update-rc.d -f hpcagent remove") + elif DistroName in ["centos", "redhat", "suse", "alma", "almalinux", "rocky", "rockylinux"]: + Run("chkconfig --del hpcagent") + else: + raise Exception("unsupported Linux Distro") + os.remove('/etc/init.d/hpcagent') + if Run("firewall-cmd --state", chk_err=False) == 0: + Log("Remove firewall policies for hpc agent") + Run("firewall-cmd --permanent --zone=public --query-port=40000/tcp && firewall-cmd --permanent --zone=public --remove-port=40000/tcp", chk_err=False) + Run("firewall-cmd --permanent --zone=public --query-port=40002/tcp && firewall-cmd --permanent --zone=public --remove-port=40002/tcp", chk_err=False) + Run("firewall-cmd --reload") + remove_hpcagent_files(keep_cert=keepcert) + cleanup_host_entries() def uninstall(): - keepcert = False - if len(sys.argv) > 3: - Usage() - sys.exit(1) - elif (len(sys.argv) == 3): - if re.match("^[-/]keepcert", sys.argv[2]): - keepcert = True - else: - Usage() - sys.exit(1) - cleanup_hpc_agent(keepcert) - Log("hpc agent removed") + keepcert = False + if len(sys.argv) > 3: + Usage() + sys.exit(1) + elif (len(sys.argv) == 3): + if re.match("^[-/]keepcert", sys.argv[2]): + keepcert = True + else: + Usage() + sys.exit(1) + cleanup_hpc_agent(keepcert) + Log("hpc agent removed") def update(): - if len(sys.argv) > 2: - Usage() - sys.exit(1) + if len(sys.argv) > 2: + Usage() + sys.exit(1) - setup_dir = os.path.dirname(os.path.abspath(__file__)) - if os.path.samefile(setup_dir, InstallRoot): - Log("Nothing to update") - sys.exit(1) + setup_dir = os.path.dirname(os.path.abspath(__file__)) + if os.path.samefile(setup_dir, InstallRoot): + Log("Nothing to update") + sys.exit(1) - srcpkg = os.path.join(setup_dir, 'hpcnodeagent.tar.gz') - if not os.path.isfile(srcpkg): - Log("Nothing to update: hpcnodeagent.tar.gz not found") - sys.exit(1) + srcpkg = os.path.join(setup_dir, 'hpcnodeagent.tar.gz') + if not os.path.isfile(srcpkg): + Log("Nothing to update: hpcnodeagent.tar.gz not found") + sys.exit(1) - if not is_hpcagent_installed(): - Log("No hpc agent installed") - sys.exit(1) + if not is_hpcagent_installed(): + Log("No hpc agent installed") + sys.exit(1) - pemfile = os.path.join(InstallRoot, "certs/nodemanager.pem") - rsakeyfile = os.path.join(InstallRoot, "certs/nodemanager_rsa.key") - if not os.path.isfile(pemfile) or not os.path.isfile(rsakeyfile): - Log("No certificates configured") - sys.exit(1) + pemfile = os.path.join(InstallRoot, "certs/nodemanager.pem") + rsakeyfile = os.path.join(InstallRoot, "certs/nodemanager_rsa.key") + if not os.path.isfile(pemfile) or not os.path.isfile(rsakeyfile): + Log("No certificates configured") + sys.exit(1) - configfile = os.path.join(InstallRoot, 'nodemanager.json') - if not os.path.isfile(configfile): - Log("nodemanager.json not found") - sys.exit(1) + configfile = os.path.join(InstallRoot, 'nodemanager.json') + if not os.path.isfile(configfile): + Log("nodemanager.json not found") + sys.exit(1) - Log("Stop the hpc node agent") - if SupportSystemd: - Run("systemctl stop hpcagent", chk_err=False) - else: - Run("service hpcagent stop", chk_err=False) + Log("Stop the hpc node agent") + if SupportSystemd: + Run("systemctl stop hpcagent", chk_err=False) + else: + Run("service hpcagent stop", chk_err=False) - with open(configfile, 'r') as F: - configjson = json.load(F) + with open(configfile, 'r') as F: + configjson = json.load(F) - Log("Update the binaries") - extract_hpcagent_files(srcpkg) - ReplaceFileContentsAtomic(configfile, json.dumps(configjson)) - os.chmod(configfile, 0o644) - shutil.move(os.path.join(InstallRoot, "hpcagent.sh"), "/etc/init.d/hpcagent") + Log("Update the binaries") + extract_hpcagent_files(srcpkg) + ReplaceFileContentsAtomic(configfile, json.dumps(configjson)) + os.chmod(configfile, 0o644) + shutil.move(os.path.join(InstallRoot, "hpcagent.sh"), "/etc/init.d/hpcagent") - Log("restart hpcagent") - if SupportSystemd: - Run("systemctl restart hpcagent") - else: - Run("service hpcagent restart") - Log("hpc agent updated") + Log("restart hpcagent") + if SupportSystemd: + Run("systemctl restart hpcagent") + else: + Run("service hpcagent restart") + Log("hpc agent updated") def generatekeypair(certfile, certpasswd): - certsdir = os.path.join(InstallRoot, "certs") - if not os.path.isdir(certsdir): - os.makedirs(certsdir, 0o750) - else: - os.chmod(certsdir, 0o750) - result = Run("openssl pkcs12 -in {0} -out {1}/nodemanager_rsa.key -nocerts -nodes -password pass:'{2}'".format(certfile, certsdir, certpasswd)) - if result != 0: - raise Exception("Failed to generate nodemanager_rsa.key, please check whether the certificate protection password is correct.") - result = Run("openssl pkcs12 -in {0} -out {1}/nodemanager.pem -password pass:'{2}' -nokeys".format(certfile, certsdir, certpasswd)) - if result != 0: - raise Exception("Failed to generate nodemanager.pem, please check whether the certificate protection password is correct.") - result = Run("openssl rsa -in {0}/nodemanager_rsa.key -out {0}/nodemanager.key".format(certsdir)) - if result != 0: - raise Exception("Failed to generate nodemanager.key.") - shutil.copy2(os.path.join(certsdir,'nodemanager.pem'), os.path.join(certsdir, 'nodemanager.crt')) + certsdir = os.path.join(InstallRoot, "certs") + if not os.path.isdir(certsdir): + os.makedirs(certsdir, 0o750) + else: + os.chmod(certsdir, 0o750) + result = Run("openssl pkcs12 -in {0} -out {1}/nodemanager_rsa.key -nocerts -nodes -password pass:'{2}'".format(certfile, certsdir, certpasswd)) + if result != 0: + raise Exception("Failed to generate nodemanager_rsa.key, please check whether the certificate protection password is correct.") + result = Run("openssl pkcs12 -in {0} -out {1}/nodemanager.pem -password pass:'{2}' -nokeys".format(certfile, certsdir, certpasswd)) + if result != 0: + raise Exception("Failed to generate nodemanager.pem, please check whether the certificate protection password is correct.") + result = Run("openssl rsa -in {0}/nodemanager_rsa.key -out {0}/nodemanager.key".format(certsdir)) + if result != 0: + raise Exception("Failed to generate nodemanager.key.") + shutil.copy2(os.path.join(certsdir,'nodemanager.pem'), os.path.join(certsdir, 'nodemanager.crt')) def updatecert(): - if not is_hpcagent_installed(): - Log("No hpc agent installed") - sys.exit(1) + if not is_hpcagent_installed(): + Log("No hpc agent installed") + sys.exit(1) - certfile = None - certpasswd = None - authenticationkey = None - for a in sys.argv[2:]: - if re.match("^[-/]certfile:.+", a): - certfile = get_argvalue(a) - elif re.match("^[-/]certpasswd:.+", a): - certpasswd = get_argvalue(a) - elif re.match("^[-/]authenticationkey:.+", a): - authenticationkey = get_argvalue(a) - else: - print("Invalid argument: %s" % a) - Usage() - sys.exit(1) + certfile = None + certpasswd = None + authenticationkey = None + for a in sys.argv[2:]: + if re.match("^[-/]certfile:.+", a): + certfile = get_argvalue(a) + elif re.match("^[-/]certpasswd:.+", a): + certpasswd = get_argvalue(a) + elif re.match("^[-/]authenticationkey:.+", a): + authenticationkey = get_argvalue(a) + else: + print("Invalid argument: %s" % a) + Usage() + sys.exit(1) - if not os.path.isfile(certfile): - print("certfile not found: %s" % certfile) - sys.exit(1) - if not certpasswd: - certpasswd = getpass.getpass(prompt='Please input the certificate protection password:') + if not os.path.isfile(certfile): + print("certfile not found: %s" % certfile) + sys.exit(1) + if not certpasswd: + certpasswd = getpass.getpass(prompt='Please input the certificate protection password:') - try: - generatekeypair(certfile, certpasswd) - if authenticationkey is not None: - configfile = os.path.join(InstallRoot, 'nodemanager.json') - if not os.path.isfile(configfile): - Log("nodemanager.json not found") - sys.exit(1) - with open(configfile, 'r') as F: - configjson = json.load(F) - configjson['ClusterAuthenticationKey'] = authenticationkey - SetFileContents(configfile, json.dumps(configjson)) - os.chmod(configfile, 0o640) - print("The credentials were successfully updated") - if SupportSystemd: - Run("systemctl restart hpcagent") - else: - Run("service hpcagent restart") - sys.exit(0) - except Exception as e: - print("Failed to update certificate: {0}".format(e)) - sys.exit(1) + try: + generatekeypair(certfile, certpasswd) + if authenticationkey is not None: + configfile = os.path.join(InstallRoot, 'nodemanager.json') + if not os.path.isfile(configfile): + Log("nodemanager.json not found") + sys.exit(1) + with open(configfile, 'r') as F: + configjson = json.load(F) + configjson['ClusterAuthenticationKey'] = authenticationkey + SetFileContents(configfile, json.dumps(configjson)) + os.chmod(configfile, 0o640) + print("The credentials were successfully updated") + if SupportSystemd: + Run("systemctl restart hpcagent") + else: + Run("service hpcagent restart") + sys.exit(0) + except Exception as e: + print("Failed to update certificate: {0}".format(e)) + sys.exit(1) def install(): - keepcert = False - managehosts = False - connectionstring = None - certfile = None - certpasswd = None - authenticationkey = None - for a in sys.argv[2:]: - if re.match(r"^[-/](help|usage|\?)", a): - Usage() - sys.exit(0) - if re.match("^[-/](connectionstring|clusname):.+", a): - connectionstring = get_argvalue(a) - elif re.match("^[-/]certfile:.+", a): - certfile = get_argvalue(a) - elif re.match("^[-/]certpasswd:.+", a): - certpasswd = get_argvalue(a) - elif re.match("^[-/]authenticationkey:.+", a): - authenticationkey = get_argvalue(a) - elif re.match("^[-/]keepcert", a): - keepcert = True - elif re.match("^[-/]managehosts", a): - managehosts = True - else: - print("Invalid argument: %s" % a) - Usage() - sys.exit(1) + keepcert = False + managehosts = False + connectionstring = None + certfile = None + certpasswd = None + authenticationkey = None + for a in sys.argv[2:]: + if re.match(r"^[-/](help|usage|\?)", a): + Usage() + sys.exit(0) + if re.match("^[-/](connectionstring|clusname):.+", a): + connectionstring = get_argvalue(a) + elif re.match("^[-/]certfile:.+", a): + certfile = get_argvalue(a) + elif re.match("^[-/]certpasswd:.+", a): + certpasswd = get_argvalue(a) + elif re.match("^[-/]authenticationkey:.+", a): + authenticationkey = get_argvalue(a) + elif re.match("^[-/]keepcert", a): + keepcert = True + elif re.match("^[-/]managehosts", a): + managehosts = True + else: + print("Invalid argument: %s" % a) + Usage() + sys.exit(1) - if not connectionstring or (not keepcert and not certfile): - print("One or more parameters are not specified.") - Usage() - sys.exit(1) + if not connectionstring or (not keepcert and not certfile): + print("One or more parameters are not specified.") + Usage() + sys.exit(1) - if keepcert and (certfile or certpasswd or authenticationkey): - print("The parameter keepcert cannot be specified with the parameter certfile, certpass or authenticationkey") - Usage() - sys.exit(1) + if keepcert and (certfile or certpasswd or authenticationkey): + print("The parameter keepcert cannot be specified with the parameter certfile, certpass or authenticationkey") + Usage() + sys.exit(1) - if keepcert: - pemfile = os.path.join(InstallRoot, "certs/nodemanager.pem") - rsakeyfile = os.path.join(InstallRoot, "certs/nodemanager_rsa.key") - if not os.path.isfile(pemfile) or not os.path.isfile(rsakeyfile): - Log("nodemanager.pem or nodemanager_rsa.key not found") - sys.exit(1) - configfiletemp = os.path.join(InstallRoot, 'nodemanager.json') - if not os.path.isfile(configfiletemp): - Log("nodemanager.json not found") - sys.exit(1) - with open(configfiletemp, 'r') as F: - configjsontemp = json.load(F) - authenticationkey = configjsontemp.get('ClusterAuthenticationKey') - else: - if not os.path.isfile(certfile): - print("certfile not found: %s" % certfile) - sys.exit(1) - if not certpasswd: - certpasswd = getpass.getpass(prompt='Please input the certificate protection password:') + if keepcert: + pemfile = os.path.join(InstallRoot, "certs/nodemanager.pem") + rsakeyfile = os.path.join(InstallRoot, "certs/nodemanager_rsa.key") + if not os.path.isfile(pemfile) or not os.path.isfile(rsakeyfile): + Log("nodemanager.pem or nodemanager_rsa.key not found") + sys.exit(1) + configfiletemp = os.path.join(InstallRoot, 'nodemanager.json') + if not os.path.isfile(configfiletemp): + Log("nodemanager.json not found") + sys.exit(1) + with open(configfiletemp, 'r') as F: + configjsontemp = json.load(F) + authenticationkey = configjsontemp.get('ClusterAuthenticationKey') + else: + if not os.path.isfile(certfile): + print("certfile not found: %s" % certfile) + sys.exit(1) + if not certpasswd: + certpasswd = getpass.getpass(prompt='Please input the certificate protection password:') - srcpkgdir = os.path.dirname(__file__) - srcpkg = os.path.join(srcpkgdir, 'hpcnodeagent.tar.gz') - if not os.path.isfile(srcpkg): - Log("hpcnodeagent.tar.gz not found") - sys.exit(1) + srcpkgdir = os.path.dirname(__file__) + srcpkg = os.path.join(srcpkgdir, 'hpcnodeagent.tar.gz') + if not os.path.isfile(srcpkg): + Log("hpcnodeagent.tar.gz not found") + sys.exit(1) - if is_hpcagent_installed(): - Log("hpc agent was already installed") - sys.exit(0) + if is_hpcagent_installed(): + Log("hpc agent was already installed") + sys.exit(0) - Log("Start to install HPC Linux node agent") - try: - extract_hpcagent_files(srcpkg) - logdir = os.path.join(InstallRoot, "logs") - certsdir = os.path.join(InstallRoot, "certs") - if not os.path.isdir(logdir): - os.makedirs(logdir) + Log("Start to install HPC Linux node agent") + try: + extract_hpcagent_files(srcpkg) + logdir = os.path.join(InstallRoot, "logs") + certsdir = os.path.join(InstallRoot, "certs") + if not os.path.isdir(logdir): + os.makedirs(logdir) - host_name = socket.gethostname().split('.')[0] - api_prefix = "https://{0}:443/HpcLinux/api/" - node_uri = api_prefix + host_name + "/computenodereported" - reg_uri = api_prefix + host_name + "/registerrequested" - metric_inst_uri = api_prefix + host_name + "/getinstanceids" - configjson = { - "NamingServiceUri": ['https://{0}:443/HpcNaming/api/fabric/resolve/singleton/'.format(h.strip()) for h in connectionstring.split(',')], - "HeartbeatUri": node_uri, - "RegisterUri": reg_uri, - "MetricInstanceIdsUri": metric_inst_uri, - "MetricUri": "", - "TrustedCAFile": os.path.join(certsdir, "nodemanager.pem"), - "CertificateChainFile": os.path.join(certsdir, "nodemanager.crt"), - "PrivateKeyFile": os.path.join(certsdir, "nodemanager.key"), - "ListeningUri": "https://0.0.0.0:40002", - "DefaultServiceName": "SchedulerStatefulService", - "UdpMetricServiceName": "MonitoringStatefulService", - "ClusterAuthenticationKey": authenticationkey if authenticationkey else "", - } - if managehosts: - configjson['HostsFileUri'] = api_prefix + "hostsfile" - configfile = os.path.join(InstallRoot, 'nodemanager.json') - SetFileContents(configfile, json.dumps(configjson)) - os.chmod(configfile, 0o640) - if not keepcert: - Log("Generating the key pair from {0}".format(certfile)) - generatekeypair(certfile, certpasswd) + host_name = socket.gethostname().split('.')[0] + api_prefix = "https://{0}:443/HpcLinux/api/" + node_uri = api_prefix + host_name + "/computenodereported" + reg_uri = api_prefix + host_name + "/registerrequested" + metric_inst_uri = api_prefix + host_name + "/getinstanceids" + configjson = { + "NamingServiceUri": ['https://{0}:443/HpcNaming/api/fabric/resolve/singleton/'.format(h.strip()) for h in connectionstring.split(',')], + "HeartbeatUri": node_uri, + "RegisterUri": reg_uri, + "MetricInstanceIdsUri": metric_inst_uri, + "MetricUri": "", + "TrustedCAFile": os.path.join(certsdir, "nodemanager.pem"), + "CertificateChainFile": os.path.join(certsdir, "nodemanager.crt"), + "PrivateKeyFile": os.path.join(certsdir, "nodemanager.key"), + "ListeningUri": "https://0.0.0.0:40002", + "DefaultServiceName": "SchedulerStatefulService", + "UdpMetricServiceName": "MonitoringStatefulService", + "ClusterAuthenticationKey": authenticationkey if authenticationkey else "", + } + if managehosts: + configjson['HostsFileUri'] = api_prefix + "hostsfile" + configfile = os.path.join(InstallRoot, 'nodemanager.json') + SetFileContents(configfile, json.dumps(configjson)) + os.chmod(configfile, 0o640) + if not keepcert: + Log("Generating the key pair from {0}".format(certfile)) + generatekeypair(certfile, certpasswd) - Log("Install depending tools ...") - install_cgroup_tools() - install_sysstat() - install_pstree() - if DistroName in ["centos", "redhat", "alma", "almalinux", "rocky", "rockylinux"]: - install_chkconfig() + Log("Install depending tools ...") + install_cgroup_tools() + install_sysstat() + install_pstree() + if DistroName in ["centos", "redhat", "alma", "almalinux", "rocky", "rockylinux"]: + install_chkconfig() - if Run("command -v setsebool", chk_err=False) == 0: - Log("Set SELinux boolean value httpd_can_network_connect and allow_httpd_anon_write to true") - Run("setsebool -P httpd_can_network_connect 1") - Run("setsebool -P allow_httpd_anon_write 1") + if Run("command -v setsebool", chk_err=False) == 0: + Log("Set SELinux boolean value httpd_can_network_connect and allow_httpd_anon_write to true") + Run("setsebool -P httpd_can_network_connect 1") + Run("setsebool -P allow_httpd_anon_write 1") - if Run("firewall-cmd --state", chk_err=False) == 0: - Log("Configuring firewalld settings") - Run("firewall-cmd --permanent --zone=public --add-port=40000/tcp") - Run("firewall-cmd --permanent --zone=public --add-port=40002/tcp") - Run("firewall-cmd --reload") - Log("firewalld settings configured") + if Run("firewall-cmd --state", chk_err=False) == 0: + Log("Configuring firewalld settings") + Run("firewall-cmd --permanent --zone=public --add-port=40000/tcp") + Run("firewall-cmd --permanent --zone=public --add-port=40002/tcp") + Run("firewall-cmd --reload") + Log("firewalld settings configured") - Log("Starting the hpc node agent daemon") - shutil.move(os.path.join(InstallRoot, "hpcagent.sh"), "/etc/init.d/hpcagent") - if SupportSystemd: - Run("systemctl enable hpcagent") - errCode, msg = RunGetOutput("systemctl start hpcagent") - else: - if DistroName == "ubuntu": - Run("update-rc.d hpcagent defaults") - elif DistroName in ["centos", "redhat", "suse", "alma", "almalinux", "rocky", "rockylinux"]: - Run("chkconfig --add hpcagent") - else: - raise Exception("unsupported Linux Distro") - errCode, msg = RunGetOutput("service hpcagent start") - if errCode == 0: - Log("The hpc node agent was installed") - else: - Log("The hpc node agent failed to start: " + msg) - sys.exit(1) - except Exception as e: - cleanup_hpc_agent(keepcert) - Log("Failed to install hpc node agent: {0}, stack trace: {1}".format(e, traceback.format_exc())) - sys.exit(1) + Log("Starting the hpc node agent daemon") + shutil.move(os.path.join(InstallRoot, "hpcagent.sh"), "/etc/init.d/hpcagent") + if SupportSystemd: + Run("systemctl enable hpcagent") + errCode, msg = RunGetOutput("systemctl start hpcagent") + else: + if DistroName == "ubuntu": + Run("update-rc.d hpcagent defaults") + elif DistroName in ["centos", "redhat", "suse", "alma", "almalinux", "rocky", "rockylinux"]: + Run("chkconfig --add hpcagent") + else: + raise Exception("unsupported Linux Distro") + errCode, msg = RunGetOutput("service hpcagent start") + if errCode == 0: + Log("The hpc node agent was installed") + else: + Log("The hpc node agent failed to start: " + msg) + sys.exit(1) + except Exception as e: + cleanup_hpc_agent(keepcert) + Log("Failed to install hpc node agent: {0}, stack trace: {1}".format(e, traceback.format_exc())) + sys.exit(1) def get_dist_info(): - distroName = '' - distroVersion = '' - if 'linux_distribution' in dir(platform): - distinfo = platform.linux_distribution(full_distribution_name=0) - distroName = distinfo[0].strip() - distroVersion = distinfo[1] - # if the distroName is empty we get from /etc/*-release - if not distroName: - errCode, info = RunGetOutput("cat /etc/*-release") - if errCode != 0: - raise Exception('Failed to get Linux Distro info by running command "cat /etc/*release", error code: {}'.format(errCode)) - for line in info.splitlines(): - if line.startswith('PRETTY_NAME='): - line = line.lower() - if 'ubuntu' in line: - distroName = 'ubuntu' - elif 'centos' in line: - distroName = 'centos' - elif 'red hat' in line: - distroName = 'redhat' - elif 'suse' in line: - distroName = 'suse' - elif 'almalinux' in line: - distroName = 'almalinux' - elif 'rocky' in line: - distroName = 'rocky' - elif 'fedora' in line: - distroName = 'fedora' - elif 'freebsd' in line: - distroName = 'freebsd' - else: - raise Exception('Unknown linux distribution with {}'.format(line)) - if line.startswith('VERSION_ID='): - line = line.strip(' ') - quoteIndex = line.index('"') - if quoteIndex >= 0: - distroVersion = line[quoteIndex+1:-1] - return distroName.lower(), distroVersion + distroName = '' + distroVersion = '' + if 'linux_distribution' in dir(platform): + distinfo = platform.linux_distribution(full_distribution_name=0) + distroName = distinfo[0].strip() + distroVersion = distinfo[1] + # if the distroName is empty we get from /etc/*-release + if not distroName: + errCode, info = RunGetOutput("cat /etc/*-release") + if errCode != 0: + raise Exception('Failed to get Linux Distro info by running command "cat /etc/*release", error code: {}'.format(errCode)) + for line in info.splitlines(): + if line.startswith('PRETTY_NAME='): + line = line.lower() + if 'ubuntu' in line: + distroName = 'ubuntu' + elif 'centos' in line: + distroName = 'centos' + elif 'red hat' in line: + distroName = 'redhat' + elif 'suse' in line: + distroName = 'suse' + elif 'almalinux' in line: + distroName = 'almalinux' + elif 'rocky' in line: + distroName = 'rocky' + elif 'fedora' in line: + distroName = 'fedora' + elif 'freebsd' in line: + distroName = 'freebsd' + else: + raise Exception('Unknown linux distribution with {}'.format(line)) + if line.startswith('VERSION_ID='): + line = line.strip(' ') + quoteIndex = line.index('"') + if quoteIndex >= 0: + distroVersion = line[quoteIndex+1:-1] + return distroName.lower(), distroVersion def main(): - t = time.localtime() - global DistroName, DistroVersion, SetupLogFile, SupportSystemd - DistroName, DistroVersion = get_dist_info() - SupportSystemd = Run("command -v systemctl", chk_err=False) == 0 + t = time.localtime() + global DistroName, DistroVersion, SetupLogFile, SupportSystemd + DistroName, DistroVersion = get_dist_info() + SupportSystemd = Run("command -v systemctl", chk_err=False) == 0 - if len(sys.argv) < 2: - Usage() - sys.exit(1) + if len(sys.argv) < 2: + Usage() + sys.exit(1) - if re.match(r"^[-/]*(help|usage|\?)", sys.argv[1]): - Usage() - sys.exit(0) + if re.match(r"^[-/]*(help|usage|\?)", sys.argv[1]): + Usage() + sys.exit(0) - if os.geteuid() != 0: - print("You must run this command as root user") - sys.exit(1) + if os.geteuid() != 0: + print("You must run this command as root user") + sys.exit(1) - if re.match("^[-/]*uninstall$", sys.argv[1]): - SetupLogFile = "/root/hpcagent_uninstall_%04u%02u%02u-%02u%02u%02u.log" % (t.tm_year, t.tm_mon, t.tm_mday, t.tm_hour, t.tm_min, t.tm_sec) - uninstall() - elif re.match("^[-/]*update$", sys.argv[1]): - SetupLogFile = "/root/hpcagent_update_%04u%02u%02u-%02u%02u%02u.log" % (t.tm_year, t.tm_mon, t.tm_mday, t.tm_hour, t.tm_min, t.tm_sec) - update() - elif re.match("^[-/]*install$", sys.argv[1]): - SetupLogFile = "/root/hpcagent_install_%04u%02u%02u-%02u%02u%02u.log" % (t.tm_year, t.tm_mon, t.tm_mday, t.tm_hour, t.tm_min, t.tm_sec) - install() - elif re.match("^[-/]*updatecert$", sys.argv[1]): - SetupLogFile = "/root/hpcagent_updatecert_%04u%02u%02u-%02u%02u%02u.log" % (t.tm_year, t.tm_mon, t.tm_mday, t.tm_hour, t.tm_min, t.tm_sec) - updatecert() - else: - print("Invalid arguments") - Usage() - sys.exit(1) - sys.exit(0) + if re.match("^[-/]*uninstall$", sys.argv[1]): + SetupLogFile = "/root/hpcagent_uninstall_%04u%02u%02u-%02u%02u%02u.log" % (t.tm_year, t.tm_mon, t.tm_mday, t.tm_hour, t.tm_min, t.tm_sec) + uninstall() + elif re.match("^[-/]*update$", sys.argv[1]): + SetupLogFile = "/root/hpcagent_update_%04u%02u%02u-%02u%02u%02u.log" % (t.tm_year, t.tm_mon, t.tm_mday, t.tm_hour, t.tm_min, t.tm_sec) + update() + elif re.match("^[-/]*install$", sys.argv[1]): + SetupLogFile = "/root/hpcagent_install_%04u%02u%02u-%02u%02u%02u.log" % (t.tm_year, t.tm_mon, t.tm_mday, t.tm_hour, t.tm_min, t.tm_sec) + install() + elif re.match("^[-/]*updatecert$", sys.argv[1]): + SetupLogFile = "/root/hpcagent_updatecert_%04u%02u%02u-%02u%02u%02u.log" % (t.tm_year, t.tm_mon, t.tm_mday, t.tm_hour, t.tm_min, t.tm_sec) + updatecert() + else: + print("Invalid arguments") + Usage() + sys.exit(1) + sys.exit(0) def Run(cmd,chk_err=True): - retcode,out=RunGetOutput(cmd,chk_err) - return retcode + retcode,out=RunGetOutput(cmd,chk_err) + return retcode def RunGetOutput(cmd,chk_err=True): - try: - output=subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True) - except subprocess.CalledProcessError as e: - if chk_err : - Error('CalledProcessError. Error Code is ' + str(e.returncode) ) - Error('CalledProcessError. Command result was ' + (e.output[:-1]).decode('latin-1')) - return e.returncode,e.output.decode('latin-1') - return 0,output.decode('latin-1') + try: + output=subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True) + except subprocess.CalledProcessError as e: + if chk_err : + Error('CalledProcessError. Error Code is ' + str(e.returncode) ) + Error('CalledProcessError. Command result was ' + (e.output[:-1]).decode('latin-1')) + return e.returncode,e.output.decode('latin-1') + return 0,output.decode('latin-1') def ReplaceStringInFile(fname,src,repl): - """ - Replace 'src' with 'repl' in file. - """ - try: - updated='' - with open(fname, 'r') as F: - for line in F.readlines(): - n = line.replace(src, repl) - updated += n - ReplaceFileContentsAtomic(fname,updated) - except : - raise - return + """ + Replace 'src' with 'repl' in file. + """ + try: + updated='' + with open(fname, 'r') as F: + for line in F.readlines(): + n = line.replace(src, repl) + updated += n + ReplaceFileContentsAtomic(fname,updated) + except : + raise + return def ReplaceFileContentsAtomic(filepath, contents): - """ - Write 'contents' to 'filepath' by creating a temp file, and replacing original. - """ - handle, temp = tempfile.mkstemp(dir = os.path.dirname(filepath)) - if type(contents) == str : - contents=contents.encode('latin-1') - try: - os.write(handle, contents) - except IOError as e: - Error('ReplaceFileContentsAtomic Writing to file ' + filepath + ' Exception is ' + str(e)) - return None - finally: - os.close(handle) - try: - os.rename(temp, filepath) - return None - except IOError as e: - Error('ReplaceFileContentsAtomic Renaming ' + temp + ' to ' + filepath + ' Exception is ' + str(e)) - try: - os.remove(filepath) - except IOError as e: - Error('ReplaceFileContentsAtomic Removing '+ filepath + ' Exception is ' + str(e)) - try: - os.rename(temp,filepath) - except IOError as e: - Error('ReplaceFileContentsAtomic Removing '+ filepath + ' Exception is ' + str(e)) - return 1 - return 0 + """ + Write 'contents' to 'filepath' by creating a temp file, and replacing original. + """ + handle, temp = tempfile.mkstemp(dir = os.path.dirname(filepath)) + if type(contents) == str : + contents=contents.encode('latin-1') + try: + os.write(handle, contents) + except IOError as e: + Error('ReplaceFileContentsAtomic Writing to file ' + filepath + ' Exception is ' + str(e)) + return None + finally: + os.close(handle) + try: + os.rename(temp, filepath) + return None + except IOError as e: + Error('ReplaceFileContentsAtomic Renaming ' + temp + ' to ' + filepath + ' Exception is ' + str(e)) + try: + os.remove(filepath) + except IOError as e: + Error('ReplaceFileContentsAtomic Removing '+ filepath + ' Exception is ' + str(e)) + try: + os.rename(temp,filepath) + except IOError as e: + Error('ReplaceFileContentsAtomic Removing '+ filepath + ' Exception is ' + str(e)) + return 1 + return 0 def SetFileContents(filepath, contents): - """ - Write 'contents' to 'filepath'. - """ - if type(contents) == str : - contents=contents.encode('latin-1', 'ignore') - try: - with open(filepath, "wb+") as F : - F.write(contents) - except IOError as e: - Error('Failed to Write to file ' + filepath + ' Exception is ' + str(e)) - return None - return 0 + """ + Write 'contents' to 'filepath'. + """ + if type(contents) == str : + contents=contents.encode('latin-1', 'ignore') + try: + with open(filepath, "wb+") as F : + F.write(contents) + except IOError as e: + Error('Failed to Write to file ' + filepath + ' Exception is ' + str(e)) + return None + return 0 def LogWithPrefix(prefix, message): - t = time.localtime() - t = "%04u/%02u/%02u %02u:%02u:%02u " % (t.tm_year, t.tm_mon, t.tm_mday, t.tm_hour, t.tm_min, t.tm_sec) - t += prefix - for line in message.split('\n'): - line = t + line - line = ''.join(filter(lambda x : x in string.printable, line)) - print(line) - try: - with open(SetupLogFile, "a") as F : - F.write(line + "\n") - except IOError as e: - print(e) - pass + t = time.localtime() + t = "%04u/%02u/%02u %02u:%02u:%02u " % (t.tm_year, t.tm_mon, t.tm_mday, t.tm_hour, t.tm_min, t.tm_sec) + t += prefix + for line in message.split('\n'): + line = t + line + line = ''.join(filter(lambda x : x in string.printable, line)) + print(line) + try: + with open(SetupLogFile, "a") as F : + F.write(line + "\n") + except IOError as e: + print(e) + pass def Log(message): - LogWithPrefix("", message) + LogWithPrefix("", message) def Error(message): - LogWithPrefix("ERROR:", message) - + LogWithPrefix("ERROR:", message) + def Warn(message): - LogWithPrefix("WARNING:", message) + LogWithPrefix("WARNING:", message) if __name__ == '__main__': - main() \ No newline at end of file + main() \ No newline at end of file