pulled from another branch
This commit is contained in:
Коммит
210722dd71
|
@ -418,13 +418,14 @@
|
|||
"../common.h"
|
||||
"../details/log_msg.h"
|
||||
|
||||
1427353166 source:/home/evanc/whpc-linux-communicator/nodemanager/main.cpp
|
||||
1428909167 source:/home/evanc/whpc-linux-communicator/nodemanager/main.cpp
|
||||
<iostream>
|
||||
<cpprest/http_listener.h>
|
||||
<cpprest/json.h>
|
||||
"utils/Logger.h"
|
||||
"core/RemoteCommunicator.h"
|
||||
"core/RemoteExecutor.h"
|
||||
"Version.h"
|
||||
|
||||
1428488455 /home/evanc/whpc-linux-communicator/nodemanager/utils/Logger.h
|
||||
<syslog.h>
|
||||
|
@ -671,7 +672,7 @@
|
|||
"../utils/Logger.h"
|
||||
"../utils/System.h"
|
||||
|
||||
1427353166 /home/evanc/whpc-linux-communicator/nodemanager/core/Reporter.h
|
||||
1428578115 /home/evanc/whpc-linux-communicator/nodemanager/core/Reporter.h
|
||||
<cpprest/json.h>
|
||||
<functional>
|
||||
|
||||
|
@ -682,3 +683,12 @@
|
|||
|
||||
1428574569 /home/evanc/whpc-linux-communicator/nodemanager/common/ErrorCodes.h
|
||||
|
||||
1428909113 source:/home/evanc/whpc-linux-communicator/nodemanager/Version.cpp
|
||||
"Version.h"
|
||||
|
||||
1429007613 /home/evanc/whpc-linux-communicator/nodemanager/Version.h
|
||||
<string>
|
||||
<map>
|
||||
<vector>
|
||||
<iostream>
|
||||
|
||||
|
|
|
@ -2,18 +2,85 @@
|
|||
#define VERSION_H_INCLUDED
|
||||
|
||||
#include <string>
|
||||
#include <map>
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
|
||||
namespace hpc
|
||||
{
|
||||
class Version
|
||||
{
|
||||
public:
|
||||
static const std::map<std::string, std::vector<std::string>>& GetVersionHistory()
|
||||
{
|
||||
static std::map<std::string, std::vector<std::string>> versionHistory =
|
||||
{
|
||||
{ "1.1.1.1",
|
||||
{
|
||||
"Node manager main functionality support",
|
||||
"Added version support",
|
||||
"Fixed network card reversed order issue",
|
||||
"Added trace",
|
||||
"Added error codes definition",
|
||||
"Fixed a potential node error issue",
|
||||
}
|
||||
},
|
||||
{ "1.1.1.2",
|
||||
{
|
||||
"Fixed a long running issue because of callback failure",
|
||||
"Added version history support",
|
||||
}
|
||||
},
|
||||
{ "1.1.1.3",
|
||||
{
|
||||
"Fixed a long running issue because of callback contract mismatch",
|
||||
}
|
||||
},
|
||||
{ "1.1.1.4",
|
||||
{
|
||||
"Retry when create cgroup failed",
|
||||
"Return the exit code and error message when PrepareTask",
|
||||
"Record the output to message in Process",
|
||||
}
|
||||
},
|
||||
{ "1.1.1.5",
|
||||
{
|
||||
"Print out version history",
|
||||
}
|
||||
},
|
||||
};
|
||||
|
||||
return versionHistory;
|
||||
}
|
||||
|
||||
static const std::string& GetVersion()
|
||||
{
|
||||
static std::string version = "1.1.1.0";
|
||||
|
||||
return version;
|
||||
auto& h = GetVersionHistory();
|
||||
auto it = --h.end();
|
||||
return it->first;
|
||||
}
|
||||
|
||||
static void PrintVersionHistory()
|
||||
{
|
||||
auto& h = GetVersionHistory();
|
||||
for (auto& v : h)
|
||||
{
|
||||
std::cout << v.first << std::endl;
|
||||
std::cout << "================================================================" << std::endl;
|
||||
|
||||
int number = 0;
|
||||
for (auto& m : v.second)
|
||||
{
|
||||
number++;
|
||||
std::cout << number << ". " << m << std::endl;
|
||||
}
|
||||
|
||||
std::cout << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
|
||||
};
|
||||
}
|
||||
|
||||
|
|
|
@ -73,7 +73,11 @@ namespace hpc
|
|||
if (ret != 0)
|
||||
{
|
||||
std::string cmdLine = String::Join(" ", cmd, args...);
|
||||
this->message << "Task " << this->taskId << ": '" << cmdLine << "' failed. exitCode " << ret << "\r\n";
|
||||
this->message
|
||||
<< "Task " << this->taskId << ": '" << cmdLine
|
||||
<< "' failed. exitCode " << ret << ". output "
|
||||
<< output << std::endl;
|
||||
|
||||
Logger::Error(this->jobId, this->taskId, this->requeueCount, "'{0}' failed. exitCode {1}, output {2}.", cmdLine, ret, output);
|
||||
|
||||
this->SetExitCode(ret);
|
||||
|
|
|
@ -109,7 +109,8 @@ json::value RemoteExecutor::StartTask(StartTaskArgs&& args, const std::string& c
|
|||
taskInfo->KernelProcessorTime = kernelTime.tv_sec * 1000000 + kernelTime.tv_usec;
|
||||
taskInfo->UserProcessorTime = userTime.tv_sec * 1000000 + userTime.tv_usec;
|
||||
|
||||
auto jsonBody = taskInfo->ToJson();
|
||||
auto jsonBody = taskInfo->ToCompletionEventArgJson();
|
||||
|
||||
Logger::Debug(taskInfo->JobId, taskInfo->TaskId, taskInfo->TaskRequeueCount,
|
||||
"Callback to {0} with {1}", callbackUri, jsonBody);
|
||||
client::http_client_config config;
|
||||
|
@ -121,6 +122,9 @@ json::value RemoteExecutor::StartTask(StartTaskArgs&& args, const std::string& c
|
|||
"Callback to {0} response code {1}", callbackUri, response.status_code());
|
||||
}).wait();
|
||||
}
|
||||
|
||||
// this won't remove the task entry added later as attempt id doesn't match
|
||||
this->jobTaskTable.RemoveTask(taskInfo->JobId, taskInfo->TaskId, taskInfo->GetAttemptId());
|
||||
}
|
||||
catch (const std::exception& ex)
|
||||
{
|
||||
|
@ -128,8 +132,6 @@ json::value RemoteExecutor::StartTask(StartTaskArgs&& args, const std::string& c
|
|||
"Exception when sending back task result. {0}", ex.what());
|
||||
}
|
||||
|
||||
// this won't remove the task entry added later as attempt id doesn't match
|
||||
this->jobTaskTable.RemoveTask(taskInfo->JobId, taskInfo->TaskId, taskInfo->GetAttemptId());
|
||||
|
||||
Logger::Debug(taskInfo->JobId, taskInfo->TaskId, taskInfo->TaskRequeueCount,
|
||||
"attemptId {0}, erasing process", taskInfo->GetAttemptId());
|
||||
|
|
|
@ -20,7 +20,14 @@ json::value TaskInfo::ToJson() const
|
|||
j["NumberOfProcesses"] = this->NumberOfProcesses;
|
||||
j["PrimaryTask"] = this->IsPrimaryTask;
|
||||
j["Message"] = JsonHelper<std::string>::ToJson(this->Message);
|
||||
j["ProcessIds"] = JsonHelper<std::string>::ToJson(String::Join<','>(this->ProcessIds));
|
||||
j["ProcessIds"] = JsonHelper<std::string>::ToJson(String::Join<','>(this->ProcessIds));
|
||||
|
||||
return j;
|
||||
}
|
||||
|
||||
json::value TaskInfo::ToCompletionEventArgJson() const
|
||||
{
|
||||
json::value j = this->ToJson();
|
||||
|
||||
json::value jobIdArg;
|
||||
jobIdArg["JobId"] = this->JobId;
|
||||
|
|
|
@ -23,6 +23,7 @@ namespace hpc
|
|||
TaskInfo(TaskInfo&& t) = default;
|
||||
|
||||
web::json::value ToJson() const;
|
||||
web::json::value ToCompletionEventArgJson() const;
|
||||
|
||||
const std::string& NodeName;
|
||||
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
#include <iostream>
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
#include <cpprest/http_listener.h>
|
||||
#include <cpprest/json.h>
|
||||
|
||||
|
@ -22,6 +23,13 @@ using namespace hpc::common;
|
|||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
if (argc > 1)
|
||||
{
|
||||
if (string("-v") == argv[1])
|
||||
Version::PrintVersionHistory();
|
||||
return 0;
|
||||
}
|
||||
|
||||
std::cout << "Node manager started." << std::endl;
|
||||
Logger::Info("Log system works.");
|
||||
Logger::Info("Version: {0}", Version::GetVersion());
|
||||
|
|
|
@ -44,7 +44,7 @@ if $CGInstalled; then
|
|||
((maxLoop--))
|
||||
done
|
||||
else
|
||||
kill -s TERM $(pstree -l -p $taskId | grep "([[:digit:]]*)" -o | tr -d '()')
|
||||
kill -s TERM $(pstree -l -p $processId | grep "([[:digit:]]*)" -o | tr -d '()')
|
||||
fi
|
||||
|
||||
exit 0
|
||||
|
|
|
@ -8,10 +8,67 @@
|
|||
taskId=$1
|
||||
|
||||
if $CGInstalled; then
|
||||
groupName=$(GetCGroupName $taskId)
|
||||
group=$CGroupSubSys:$groupName
|
||||
cgcreate -g $group
|
||||
echo "$2" > $CGroupRoot/cpuset/$groupName/cpuset.cpus
|
||||
echo 0 > $CGroupRoot/cpuset/$groupName/cpuset.mems
|
||||
groupName=$(GetCGroupName $taskId)
|
||||
group=$CGroupSubSys:$groupName
|
||||
|
||||
maxLoop=3
|
||||
while [ $maxLoop -gt 0 ]
|
||||
do
|
||||
cgcreate -g $group
|
||||
ec=$?
|
||||
if [ $ec -eq 0 ]
|
||||
then
|
||||
break
|
||||
fi
|
||||
|
||||
echo "Failed to create cgroup $group, error code $ec, retry after .5 seconds"
|
||||
((maxLoop--))
|
||||
sleep .5
|
||||
done
|
||||
|
||||
if [ $ec -ne 0 ]
|
||||
then
|
||||
exit $ec
|
||||
fi
|
||||
|
||||
maxLoop=3
|
||||
while [ $maxLoop -gt 0 ]
|
||||
do
|
||||
echo "$2" > $CGroupRoot/cpuset/$groupName/cpuset.cpus
|
||||
ec=$?
|
||||
if [ $ec -eq 0 ]
|
||||
then
|
||||
break
|
||||
fi
|
||||
|
||||
echo "Failed to set cpus for $group, error code $ec, retry after .5 seconds"
|
||||
((maxLoop--))
|
||||
sleep .5
|
||||
done
|
||||
|
||||
if [ $ec -ne 0 ]
|
||||
then
|
||||
exit $ec
|
||||
fi
|
||||
|
||||
maxLoop=3
|
||||
while [ $maxLoop -gt 0 ]
|
||||
do
|
||||
echo 0 > $CGroupRoot/cpuset/$groupName/cpuset.mems
|
||||
ec=$?
|
||||
if [ $ec -eq 0 ]
|
||||
then
|
||||
break
|
||||
fi
|
||||
|
||||
echo "Failed to set mems for $group, error code $ec, retry after .5 seconds"
|
||||
((maxLoop--))
|
||||
sleep .5
|
||||
done
|
||||
|
||||
if [ $ec -ne 0 ]
|
||||
then
|
||||
exit $ec
|
||||
fi
|
||||
fi
|
||||
|
||||
|
|
Загрузка…
Ссылка в новой задаче