This commit is contained in:
evanc 2015-04-14 04:19:54 -07:00
Родитель 2a94c8771a 2d4008c855
Коммит 210722dd71
9 изменённых файлов: 173 добавлений и 17 удалений

Просмотреть файл

@ -418,13 +418,14 @@
"../common.h"
"../details/log_msg.h"
1427353166 source:/home/evanc/whpc-linux-communicator/nodemanager/main.cpp
1428909167 source:/home/evanc/whpc-linux-communicator/nodemanager/main.cpp
<iostream>
<cpprest/http_listener.h>
<cpprest/json.h>
"utils/Logger.h"
"core/RemoteCommunicator.h"
"core/RemoteExecutor.h"
"Version.h"
1428488455 /home/evanc/whpc-linux-communicator/nodemanager/utils/Logger.h
<syslog.h>
@ -671,7 +672,7 @@
"../utils/Logger.h"
"../utils/System.h"
1427353166 /home/evanc/whpc-linux-communicator/nodemanager/core/Reporter.h
1428578115 /home/evanc/whpc-linux-communicator/nodemanager/core/Reporter.h
<cpprest/json.h>
<functional>
@ -682,3 +683,12 @@
1428574569 /home/evanc/whpc-linux-communicator/nodemanager/common/ErrorCodes.h
1428909113 source:/home/evanc/whpc-linux-communicator/nodemanager/Version.cpp
"Version.h"
1429007613 /home/evanc/whpc-linux-communicator/nodemanager/Version.h
<string>
<map>
<vector>
<iostream>

Просмотреть файл

@ -2,18 +2,85 @@
#define VERSION_H_INCLUDED
#include <string>
#include <map>
#include <vector>
#include <iostream>
namespace hpc
{
    /// Release-notes registry for the node manager.
    ///
    /// The version history table is the single source of truth: the current
    /// version is the last key of the table, so adding a new entry to the
    /// table is all that is needed to bump the reported version.
    class Version
    {
        public:
            /// Returns the release notes, keyed by version string.
            ///
            /// NOTE(review): std::map orders std::string keys lexicographically,
            /// so a key such as "1.1.1.10" would sort before "1.1.1.2". Keep the
            /// components single-digit or switch to a numeric comparator before
            /// that happens.
            static const std::map<std::string, std::vector<std::string>>& GetVersionHistory()
            {
                static const std::map<std::string, std::vector<std::string>> versionHistory =
                {
                    { "1.1.1.1",
                        {
                            "Node manager main functionality support",
                            "Added version support",
                            "Fixed network card reversed order issue",
                            "Added trace",
                            "Added error codes definition",
                            "Fixed a potential node error issue",
                        }
                    },
                    { "1.1.1.2",
                        {
                            "Fixed a long running issue because of callback failure",
                            "Added version history support",
                        }
                    },
                    { "1.1.1.3",
                        {
                            "Fixed a long running issue because of callback contract mismatch",
                        }
                    },
                    { "1.1.1.4",
                        {
                            "Retry when create cgroup failed",
                            "Return the exit code and error message when PrepareTask",
                            "Record the output to message in Process",
                        }
                    },
                    { "1.1.1.5",
                        {
                            "Print out version history",
                        }
                    },
                };

                return versionHistory;
            }

            /// Returns the current version, i.e. the greatest key in the
            /// version history. The reference stays valid for the lifetime of
            /// the program because it points into the function-local static
            /// table.
            static const std::string& GetVersion()
            {
                // The table is populated with at least one entry above, so
                // rbegin() is always dereferenceable.
                return GetVersionHistory().rbegin()->first;
            }

            /// Writes every version and its numbered release notes to stdout,
            /// in the map's key order.
            static void PrintVersionHistory()
            {
                for (const auto& entry : GetVersionHistory())
                {
                    std::cout << entry.first << '\n';
                    std::cout << "================================================================" << '\n';

                    int number = 0;
                    for (const auto& note : entry.second)
                    {
                        ++number;
                        std::cout << number << ". " << note << '\n';
                    }

                    std::cout << '\n';
                }

                std::cout << std::flush;
            }
    };
}

Просмотреть файл

@ -73,7 +73,11 @@ namespace hpc
if (ret != 0)
{
std::string cmdLine = String::Join(" ", cmd, args...);
this->message << "Task " << this->taskId << ": '" << cmdLine << "' failed. exitCode " << ret << "\r\n";
this->message
<< "Task " << this->taskId << ": '" << cmdLine
<< "' failed. exitCode " << ret << ". output "
<< output << std::endl;
Logger::Error(this->jobId, this->taskId, this->requeueCount, "'{0}' failed. exitCode {1}, output {2}.", cmdLine, ret, output);
this->SetExitCode(ret);

Просмотреть файл

@ -109,7 +109,8 @@ json::value RemoteExecutor::StartTask(StartTaskArgs&& args, const std::string& c
taskInfo->KernelProcessorTime = kernelTime.tv_sec * 1000000 + kernelTime.tv_usec;
taskInfo->UserProcessorTime = userTime.tv_sec * 1000000 + userTime.tv_usec;
auto jsonBody = taskInfo->ToJson();
auto jsonBody = taskInfo->ToCompletionEventArgJson();
Logger::Debug(taskInfo->JobId, taskInfo->TaskId, taskInfo->TaskRequeueCount,
"Callback to {0} with {1}", callbackUri, jsonBody);
client::http_client_config config;
@ -121,6 +122,9 @@ json::value RemoteExecutor::StartTask(StartTaskArgs&& args, const std::string& c
"Callback to {0} response code {1}", callbackUri, response.status_code());
}).wait();
}
// this won't remove the task entry added later as attempt id doesn't match
this->jobTaskTable.RemoveTask(taskInfo->JobId, taskInfo->TaskId, taskInfo->GetAttemptId());
}
catch (const std::exception& ex)
{
@ -128,8 +132,6 @@ json::value RemoteExecutor::StartTask(StartTaskArgs&& args, const std::string& c
"Exception when sending back task result. {0}", ex.what());
}
// this won't remove the task entry added later as attempt id doesn't match
this->jobTaskTable.RemoveTask(taskInfo->JobId, taskInfo->TaskId, taskInfo->GetAttemptId());
Logger::Debug(taskInfo->JobId, taskInfo->TaskId, taskInfo->TaskRequeueCount,
"attemptId {0}, erasing process", taskInfo->GetAttemptId());

Просмотреть файл

@ -22,6 +22,13 @@ json::value TaskInfo::ToJson() const
j["Message"] = JsonHelper<std::string>::ToJson(this->Message);
j["ProcessIds"] = JsonHelper<std::string>::ToJson(String::Join<','>(this->ProcessIds));
return j;
}
json::value TaskInfo::ToCompletionEventArgJson() const
{
json::value j = this->ToJson();
json::value jobIdArg;
jobIdArg["JobId"] = this->JobId;
jobIdArg["TaskInfo"] = j;

Просмотреть файл

@ -23,6 +23,7 @@ namespace hpc
TaskInfo(TaskInfo&& t) = default;
web::json::value ToJson() const;
web::json::value ToCompletionEventArgJson() const;
const std::string& NodeName;

Просмотреть файл

@ -1,4 +1,5 @@
#include <iostream>
#include <string>
#include <cpprest/http_listener.h>
#include <cpprest/json.h>
@ -22,6 +23,13 @@ using namespace hpc::common;
int main(int argc, char* argv[])
{
if (argc > 1)
{
if (string("-v") == argv[1])
Version::PrintVersionHistory();
return 0;
}
std::cout << "Node manager started." << std::endl;
Logger::Info("Log system works.");
Logger::Info("Version: {0}", Version::GetVersion());

Просмотреть файл

@ -44,7 +44,7 @@ if $CGInstalled; then
((maxLoop--))
done
else
kill -s TERM $(pstree -l -p $taskId | grep "([[:digit:]]*)" -o | tr -d '()')
kill -s TERM $(pstree -l -p $processId | grep "([[:digit:]]*)" -o | tr -d '()')
fi
exit 0

Просмотреть файл

@ -8,10 +8,67 @@
taskId=$1

# Runs a command up to three times, sleeping .5 seconds after every failed
# attempt.
#   $1    - description of the step, used in the failure message
#   $2... - the command and its arguments
# Returns the exit code of the last attempt (0 as soon as one succeeds).
RetryCGroupStep()
{
    local description=$1
    shift

    local attemptsLeft=3
    local ec=0

    while [ $attemptsLeft -gt 0 ]
    do
        "$@"
        ec=$?

        if [ $ec -eq 0 ]
        then
            break
        fi

        echo "Failed to $description, error code $ec, retry after .5 seconds"
        attemptsLeft=$((attemptsLeft - 1))
        sleep .5
    done

    return $ec
}

# Writes value $1 into file $2. Wrapped in a function so that the redirection
# is re-evaluated on every retry.
WriteCGroupValue()
{
    echo "$1" > "$2"
}

if $CGInstalled; then
    groupName=$(GetCGroupName "$taskId")
    group=$CGroupSubSys:$groupName

    # Each step is attempted exactly once per retry round; the script exits
    # with the step's last error code if all attempts fail.
    RetryCGroupStep "create cgroup $group" \
        cgcreate -g "$group" || exit $?

    # $2 is the value written to cpuset.cpus for this task's group.
    RetryCGroupStep "set cpus for $group" \
        WriteCGroupValue "$2" "$CGroupRoot/cpuset/$groupName/cpuset.cpus" || exit $?

    RetryCGroupStep "set mems for $group" \
        WriteCGroupValue 0 "$CGroupRoot/cpuset/$groupName/cpuset.mems" || exit $?
fi