pulled from another branch
This commit is contained in:
Коммит
210722dd71
|
@ -418,13 +418,14 @@
|
||||||
"../common.h"
|
"../common.h"
|
||||||
"../details/log_msg.h"
|
"../details/log_msg.h"
|
||||||
|
|
||||||
1427353166 source:/home/evanc/whpc-linux-communicator/nodemanager/main.cpp
|
1428909167 source:/home/evanc/whpc-linux-communicator/nodemanager/main.cpp
|
||||||
<iostream>
|
<iostream>
|
||||||
<cpprest/http_listener.h>
|
<cpprest/http_listener.h>
|
||||||
<cpprest/json.h>
|
<cpprest/json.h>
|
||||||
"utils/Logger.h"
|
"utils/Logger.h"
|
||||||
"core/RemoteCommunicator.h"
|
"core/RemoteCommunicator.h"
|
||||||
"core/RemoteExecutor.h"
|
"core/RemoteExecutor.h"
|
||||||
|
"Version.h"
|
||||||
|
|
||||||
1428488455 /home/evanc/whpc-linux-communicator/nodemanager/utils/Logger.h
|
1428488455 /home/evanc/whpc-linux-communicator/nodemanager/utils/Logger.h
|
||||||
<syslog.h>
|
<syslog.h>
|
||||||
|
@ -671,7 +672,7 @@
|
||||||
"../utils/Logger.h"
|
"../utils/Logger.h"
|
||||||
"../utils/System.h"
|
"../utils/System.h"
|
||||||
|
|
||||||
1427353166 /home/evanc/whpc-linux-communicator/nodemanager/core/Reporter.h
|
1428578115 /home/evanc/whpc-linux-communicator/nodemanager/core/Reporter.h
|
||||||
<cpprest/json.h>
|
<cpprest/json.h>
|
||||||
<functional>
|
<functional>
|
||||||
|
|
||||||
|
@ -682,3 +683,12 @@
|
||||||
|
|
||||||
1428574569 /home/evanc/whpc-linux-communicator/nodemanager/common/ErrorCodes.h
|
1428574569 /home/evanc/whpc-linux-communicator/nodemanager/common/ErrorCodes.h
|
||||||
|
|
||||||
|
1428909113 source:/home/evanc/whpc-linux-communicator/nodemanager/Version.cpp
|
||||||
|
"Version.h"
|
||||||
|
|
||||||
|
1429007613 /home/evanc/whpc-linux-communicator/nodemanager/Version.h
|
||||||
|
<string>
|
||||||
|
<map>
|
||||||
|
<vector>
|
||||||
|
<iostream>
|
||||||
|
|
||||||
|
|
|
@ -2,18 +2,85 @@
|
||||||
#define VERSION_H_INCLUDED
|
#define VERSION_H_INCLUDED
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
|
#include <map>
|
||||||
|
#include <vector>
|
||||||
|
#include <iostream>
|
||||||
|
|
||||||
namespace hpc
|
namespace hpc
|
||||||
{
|
{
|
||||||
class Version
|
class Version
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
|
static const std::map<std::string, std::vector<std::string>>& GetVersionHistory()
|
||||||
|
{
|
||||||
|
static std::map<std::string, std::vector<std::string>> versionHistory =
|
||||||
|
{
|
||||||
|
{ "1.1.1.1",
|
||||||
|
{
|
||||||
|
"Node manager main functionality support",
|
||||||
|
"Added version support",
|
||||||
|
"Fixed network card reversed order issue",
|
||||||
|
"Added trace",
|
||||||
|
"Added error codes definition",
|
||||||
|
"Fixed a potential node error issue",
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ "1.1.1.2",
|
||||||
|
{
|
||||||
|
"Fixed a long running issue because of callback failure",
|
||||||
|
"Added version history support",
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ "1.1.1.3",
|
||||||
|
{
|
||||||
|
"Fixed a long running issue because of callback contract mismatch",
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ "1.1.1.4",
|
||||||
|
{
|
||||||
|
"Retry when create cgroup failed",
|
||||||
|
"Return the exit code and error message when PrepareTask",
|
||||||
|
"Record the output to message in Process",
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ "1.1.1.5",
|
||||||
|
{
|
||||||
|
"Print out version history",
|
||||||
|
}
|
||||||
|
},
|
||||||
|
};
|
||||||
|
|
||||||
|
return versionHistory;
|
||||||
|
}
|
||||||
|
|
||||||
static const std::string& GetVersion()
|
static const std::string& GetVersion()
|
||||||
{
|
{
|
||||||
static std::string version = "1.1.1.0";
|
auto& h = GetVersionHistory();
|
||||||
|
auto it = --h.end();
|
||||||
return version;
|
return it->first;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void PrintVersionHistory()
|
||||||
|
{
|
||||||
|
auto& h = GetVersionHistory();
|
||||||
|
for (auto& v : h)
|
||||||
|
{
|
||||||
|
std::cout << v.first << std::endl;
|
||||||
|
std::cout << "================================================================" << std::endl;
|
||||||
|
|
||||||
|
int number = 0;
|
||||||
|
for (auto& m : v.second)
|
||||||
|
{
|
||||||
|
number++;
|
||||||
|
std::cout << number << ". " << m << std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::cout << std::endl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -73,7 +73,11 @@ namespace hpc
|
||||||
if (ret != 0)
|
if (ret != 0)
|
||||||
{
|
{
|
||||||
std::string cmdLine = String::Join(" ", cmd, args...);
|
std::string cmdLine = String::Join(" ", cmd, args...);
|
||||||
this->message << "Task " << this->taskId << ": '" << cmdLine << "' failed. exitCode " << ret << "\r\n";
|
this->message
|
||||||
|
<< "Task " << this->taskId << ": '" << cmdLine
|
||||||
|
<< "' failed. exitCode " << ret << ". output "
|
||||||
|
<< output << std::endl;
|
||||||
|
|
||||||
Logger::Error(this->jobId, this->taskId, this->requeueCount, "'{0}' failed. exitCode {1}, output {2}.", cmdLine, ret, output);
|
Logger::Error(this->jobId, this->taskId, this->requeueCount, "'{0}' failed. exitCode {1}, output {2}.", cmdLine, ret, output);
|
||||||
|
|
||||||
this->SetExitCode(ret);
|
this->SetExitCode(ret);
|
||||||
|
|
|
@ -109,7 +109,8 @@ json::value RemoteExecutor::StartTask(StartTaskArgs&& args, const std::string& c
|
||||||
taskInfo->KernelProcessorTime = kernelTime.tv_sec * 1000000 + kernelTime.tv_usec;
|
taskInfo->KernelProcessorTime = kernelTime.tv_sec * 1000000 + kernelTime.tv_usec;
|
||||||
taskInfo->UserProcessorTime = userTime.tv_sec * 1000000 + userTime.tv_usec;
|
taskInfo->UserProcessorTime = userTime.tv_sec * 1000000 + userTime.tv_usec;
|
||||||
|
|
||||||
auto jsonBody = taskInfo->ToJson();
|
auto jsonBody = taskInfo->ToCompletionEventArgJson();
|
||||||
|
|
||||||
Logger::Debug(taskInfo->JobId, taskInfo->TaskId, taskInfo->TaskRequeueCount,
|
Logger::Debug(taskInfo->JobId, taskInfo->TaskId, taskInfo->TaskRequeueCount,
|
||||||
"Callback to {0} with {1}", callbackUri, jsonBody);
|
"Callback to {0} with {1}", callbackUri, jsonBody);
|
||||||
client::http_client_config config;
|
client::http_client_config config;
|
||||||
|
@ -121,6 +122,9 @@ json::value RemoteExecutor::StartTask(StartTaskArgs&& args, const std::string& c
|
||||||
"Callback to {0} response code {1}", callbackUri, response.status_code());
|
"Callback to {0} response code {1}", callbackUri, response.status_code());
|
||||||
}).wait();
|
}).wait();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// this won't remove the task entry added later as attempt id doesn't match
|
||||||
|
this->jobTaskTable.RemoveTask(taskInfo->JobId, taskInfo->TaskId, taskInfo->GetAttemptId());
|
||||||
}
|
}
|
||||||
catch (const std::exception& ex)
|
catch (const std::exception& ex)
|
||||||
{
|
{
|
||||||
|
@ -128,8 +132,6 @@ json::value RemoteExecutor::StartTask(StartTaskArgs&& args, const std::string& c
|
||||||
"Exception when sending back task result. {0}", ex.what());
|
"Exception when sending back task result. {0}", ex.what());
|
||||||
}
|
}
|
||||||
|
|
||||||
// this won't remove the task entry added later as attempt id doesn't match
|
|
||||||
this->jobTaskTable.RemoveTask(taskInfo->JobId, taskInfo->TaskId, taskInfo->GetAttemptId());
|
|
||||||
|
|
||||||
Logger::Debug(taskInfo->JobId, taskInfo->TaskId, taskInfo->TaskRequeueCount,
|
Logger::Debug(taskInfo->JobId, taskInfo->TaskId, taskInfo->TaskRequeueCount,
|
||||||
"attemptId {0}, erasing process", taskInfo->GetAttemptId());
|
"attemptId {0}, erasing process", taskInfo->GetAttemptId());
|
||||||
|
|
|
@ -20,7 +20,14 @@ json::value TaskInfo::ToJson() const
|
||||||
j["NumberOfProcesses"] = this->NumberOfProcesses;
|
j["NumberOfProcesses"] = this->NumberOfProcesses;
|
||||||
j["PrimaryTask"] = this->IsPrimaryTask;
|
j["PrimaryTask"] = this->IsPrimaryTask;
|
||||||
j["Message"] = JsonHelper<std::string>::ToJson(this->Message);
|
j["Message"] = JsonHelper<std::string>::ToJson(this->Message);
|
||||||
j["ProcessIds"] = JsonHelper<std::string>::ToJson(String::Join<','>(this->ProcessIds));
|
j["ProcessIds"] = JsonHelper<std::string>::ToJson(String::Join<','>(this->ProcessIds));
|
||||||
|
|
||||||
|
return j;
|
||||||
|
}
|
||||||
|
|
||||||
|
json::value TaskInfo::ToCompletionEventArgJson() const
|
||||||
|
{
|
||||||
|
json::value j = this->ToJson();
|
||||||
|
|
||||||
json::value jobIdArg;
|
json::value jobIdArg;
|
||||||
jobIdArg["JobId"] = this->JobId;
|
jobIdArg["JobId"] = this->JobId;
|
||||||
|
|
|
@ -23,6 +23,7 @@ namespace hpc
|
||||||
TaskInfo(TaskInfo&& t) = default;
|
TaskInfo(TaskInfo&& t) = default;
|
||||||
|
|
||||||
web::json::value ToJson() const;
|
web::json::value ToJson() const;
|
||||||
|
web::json::value ToCompletionEventArgJson() const;
|
||||||
|
|
||||||
const std::string& NodeName;
|
const std::string& NodeName;
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
|
#include <string>
|
||||||
#include <cpprest/http_listener.h>
|
#include <cpprest/http_listener.h>
|
||||||
#include <cpprest/json.h>
|
#include <cpprest/json.h>
|
||||||
|
|
||||||
|
@ -22,6 +23,13 @@ using namespace hpc::common;
|
||||||
|
|
||||||
int main(int argc, char* argv[])
|
int main(int argc, char* argv[])
|
||||||
{
|
{
|
||||||
|
if (argc > 1)
|
||||||
|
{
|
||||||
|
if (string("-v") == argv[1])
|
||||||
|
Version::PrintVersionHistory();
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
std::cout << "Node manager started." << std::endl;
|
std::cout << "Node manager started." << std::endl;
|
||||||
Logger::Info("Log system works.");
|
Logger::Info("Log system works.");
|
||||||
Logger::Info("Version: {0}", Version::GetVersion());
|
Logger::Info("Version: {0}", Version::GetVersion());
|
||||||
|
|
|
@ -44,7 +44,7 @@ if $CGInstalled; then
|
||||||
((maxLoop--))
|
((maxLoop--))
|
||||||
done
|
done
|
||||||
else
|
else
|
||||||
kill -s TERM $(pstree -l -p $taskId | grep "([[:digit:]]*)" -o | tr -d '()')
|
kill -s TERM $(pstree -l -p $processId | grep "([[:digit:]]*)" -o | tr -d '()')
|
||||||
fi
|
fi
|
||||||
|
|
||||||
exit 0
|
exit 0
|
||||||
|
|
|
@ -8,10 +8,67 @@
|
||||||
taskId=$1
|
taskId=$1
|
||||||
|
|
||||||
if $CGInstalled; then
|
if $CGInstalled; then
|
||||||
groupName=$(GetCGroupName $taskId)
|
groupName=$(GetCGroupName $taskId)
|
||||||
group=$CGroupSubSys:$groupName
|
group=$CGroupSubSys:$groupName
|
||||||
cgcreate -g $group
|
|
||||||
echo "$2" > $CGroupRoot/cpuset/$groupName/cpuset.cpus
|
maxLoop=3
|
||||||
echo 0 > $CGroupRoot/cpuset/$groupName/cpuset.mems
|
while [ $maxLoop -gt 0 ]
|
||||||
|
do
|
||||||
|
cgcreate -g $group
|
||||||
|
ec=$?
|
||||||
|
if [ $ec -eq 0 ]
|
||||||
|
then
|
||||||
|
break
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "Failed to create cgroup $group, error code $ec, retry after .5 seconds"
|
||||||
|
((maxLoop--))
|
||||||
|
sleep .5
|
||||||
|
done
|
||||||
|
|
||||||
|
if [ $ec -ne 0 ]
|
||||||
|
then
|
||||||
|
exit $ec
|
||||||
|
fi
|
||||||
|
|
||||||
|
maxLoop=3
|
||||||
|
while [ $maxLoop -gt 0 ]
|
||||||
|
do
|
||||||
|
echo "$2" > $CGroupRoot/cpuset/$groupName/cpuset.cpus
|
||||||
|
ec=$?
|
||||||
|
if [ $ec -eq 0 ]
|
||||||
|
then
|
||||||
|
break
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "Failed to set cpus for $group, error code $ec, retry after .5 seconds"
|
||||||
|
((maxLoop--))
|
||||||
|
sleep .5
|
||||||
|
done
|
||||||
|
|
||||||
|
if [ $ec -ne 0 ]
|
||||||
|
then
|
||||||
|
exit $ec
|
||||||
|
fi
|
||||||
|
|
||||||
|
maxLoop=3
|
||||||
|
while [ $maxLoop -gt 0 ]
|
||||||
|
do
|
||||||
|
echo 0 > $CGroupRoot/cpuset/$groupName/cpuset.mems
|
||||||
|
ec=$?
|
||||||
|
if [ $ec -eq 0 ]
|
||||||
|
then
|
||||||
|
break
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "Failed to set mems for $group, error code $ec, retry after .5 seconds"
|
||||||
|
((maxLoop--))
|
||||||
|
sleep .5
|
||||||
|
done
|
||||||
|
|
||||||
|
if [ $ec -ne 0 ]
|
||||||
|
then
|
||||||
|
exit $ec
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
|
Загрузка…
Ссылка в новой задаче