Merge pull request #30 from AaronYll/fix-tasktimeout-after-network_partition

Fix race condition when a stale StartTask request is received
This commit is contained in:
Zihao Chen 2023-03-29 14:45:13 +08:00 коммит произвёл GitHub
Родитель 62449c5b3c 94e8108461
Коммит 059e887288
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
1 изменённых файлов: 17 добавлений и 7 удалений

Просмотреть файл

@ -56,17 +56,27 @@ namespace hpc
/// Updates the task requeue count, refusing any attempt to lower it.
///
/// A value smaller than the current count is rejected: a warning is logged
/// and no member is modified. Otherwise the count is stored and, the first
/// time this succeeds, ProcessKey is initialized from GetAttemptId().
///
/// NOTE(review): a lower count presumably comes from a stale/out-of-order
/// request (e.g. a delayed StartTask) - confirm against the callers.
///
/// @param c  Requested requeue count; accepted only if c >= current count.
void SetTaskRequeueCount(int c)
{
    const int oldC = this->taskRequeueCount;

    if (c < oldC)
    {
        // Reject before touching any state, so neither the count nor the
        // process key can be changed by a smaller value.
        hpc::utils::Logger::Warn(this->JobId, this->TaskId, this->taskRequeueCount,
            "The requeue count must be monotonically increasing, cannot change requeue count from {0} to {1}",
            oldC, c);
        return;
    }

    // Store the count first: GetAttemptId() is called afterwards and
    // presumably derives the attempt id from it - TODO confirm.
    this->taskRequeueCount = c;

    if (!this->processKeySet)
    {
        // The process key is assigned only once; later calls keep it.
        this->ProcessKey = this->GetAttemptId();
        this->processKeySet = true;
    }

    // Log once, after ProcessKey is final, so the message shows the key in use.
    hpc::utils::Logger::Info(this->JobId, this->TaskId, this->taskRequeueCount,
        "Change requeue count from {0} to {1}, processKey {2}",
        oldC, c, this->ProcessKey);
}
/// Returns ProcessIds.size() converted to int.
int GetProcessCount() const
{
    return static_cast<int>(this->ProcessIds.size());
}