зеркало из https://github.com/microsoft/pai.git
[Launcher]: Add RetryPolicy Doc (#812)
This commit is contained in:
Родитель
6ef03d094e
Коммит
7038888f18
|
@ -30,14 +30,20 @@
|
|||
- [EnvironmentVariables](#EnvironmentVariables)
|
||||
- [HDFS Published Informations](#HDFS_Published_Informations)
|
||||
- [ExitStatus Convention](#ExitStatus_Convention)
|
||||
- [RetryPolicy](#RetryPolicy)
|
||||
- [ApplicationCompletionPolicy](#ApplicationCompletionPolicy)
|
||||
- [Framework ACL](#Framework_ACL)
|
||||
- [Best Practices](#Best_Practices)
|
||||
|
||||
## <a name="Concepts">Concepts</a>
|
||||
### <a name="Concepts_Basic">Basic</a>
|
||||
* Different **TaskRoles** compose a **Framework**
|
||||
* Same **Tasks** compose a **TaskRole**
|
||||
* A **User Service** executed by all **Tasks** in a **TaskRole**
|
||||
* A **User Service** is executed by all **Tasks** in its corresponding **TaskRole**
|
||||
|
||||
### <a name="Concepts_YARN">YARN Related</a>
|
||||
* A YARN **Application** is an execution attempt of a **Framework**
|
||||
* A YARN **Container** is an execution attempt of a **Task**
|
||||
|
||||
## <a name="Quick_Start">Quick Start</a>
|
||||
1. **Prepare Framework**
|
||||
|
@ -633,24 +639,103 @@ Notes:
|
|||
|
||||
|
||||
## <a name="ExitStatus_Convention">ExitStatus Convention</a>
|
||||
You can check all the defined ExitStatus by: [ExitType](../src/main/java/com/microsoft/frameworklauncher/common/model/ExitType.java), [RetryPolicyDescriptor](../src/main/java/com/microsoft/frameworklauncher/common/model/RetryPolicyDescriptor.java), [RetryPolicyState](../src/main/java/com/microsoft/frameworklauncher/common/model/RetryPolicyState.java), [ExitDiagnostics](../src/main/java/com/microsoft/frameworklauncher/common/exit/ExitDiagnostics.java).
|
||||
You can check all the defined ExitStatus by: [ExitType](../src/main/java/com/microsoft/frameworklauncher/common/model/ExitType.java), [ExitDiagnostics](../src/main/java/com/microsoft/frameworklauncher/common/exit/ExitDiagnostics.java).
|
||||
|
||||
Recipes:
|
||||
1. Your LauncherClient can depend on the ExitStatus Convention
|
||||
2. If your Service failed, the Service can optionally return the ExitCode USER_APP_TRANSIENT_ERROR or USER_APP_NON_TRANSIENT_ERROR to help FancyRetryPolicy identify your Service's TRANSIENT_NORMAL or NON_TRANSIENT ExitType, respectively. If neither ExitCode is returned, the Service is considered to have exited due to the UNKNOWN ExitType.
|
||||
|
||||
|
||||
## <a name="RetryPolicy">RetryPolicy</a>
|
||||
### <a name="RetryPolicy_Overview">Overview</a>
|
||||
RetryPolicy can be configured for the whole Framework and each TaskRole to control:
|
||||
1. **Framework RetryPolicy**:<br>
|
||||
The conditions to retry the whole Framework after the Framework's current associated [Application](#Concepts_YARN) completed.<br>
|
||||
*It can also be considered as **Framework CompletionPolicy**, i.e. the conditions to complete the whole Framework.*
|
||||
|
||||
2. **Task RetryPolicy**:<br>
|
||||
The conditions to retry a single Task in the TaskRole after the Task's current associated [Container](#Concepts_YARN) completed.<br>
|
||||
*It can also be considered as **Task CompletionPolicy**, i.e. the conditions to complete a single Task in the TaskRole.*
|
||||
|
||||
### <a name="RetryPolicy_Usage">Usage</a>
|
||||
For details, please check: [RetryPolicyDescriptor](../src/main/java/com/microsoft/frameworklauncher/common/model/RetryPolicyDescriptor.java), [RetryPolicyState](../src/main/java/com/microsoft/frameworklauncher/common/model/RetryPolicyState.java).
|
||||
|
||||
### <a name="RetryPolicy_Examples">Examples</a>
|
||||
Notes:
|
||||
1. *Italic Conditions* can be inherited from the **DEFAULT** RetryPolicy, so no need to specify them explicitly.
|
||||
2. For the definition of each ExitType, such as transient failure, see [ExitStatus Convention](#ExitStatus_Convention).
|
||||
|
||||
<table>
|
||||
<tbody>
|
||||
<tr>
|
||||
<th>FrameworkType</th>
|
||||
<th>Framework RetryPolicy</th>
|
||||
<th>TaskRole</th>
|
||||
<th>Task RetryPolicy</th>
|
||||
<th>Description</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<td rowspan="2"><b>DEFAULT</b></td>
|
||||
<td rowspan="2"><i>FancyRetryPolicy = false<br>MaxRetryCount = 0</i></td>
|
||||
<td>TaskRole1</td>
|
||||
<td><i>FancyRetryPolicy = false<br>MaxRetryCount = 0</i></td>
|
||||
<td rowspan="2">The default RetryPolicy:<br>Never Retry for any failure or success.</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>TaskRole2</td>
|
||||
<td><i>FancyRetryPolicy = false<br>MaxRetryCount = 0</i></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td rowspan="1"><b>Service</b></td>
|
||||
<td rowspan="1"><i>FancyRetryPolicy = false</i><br>MaxRetryCount = -2</td>
|
||||
<td>TaskRole1</td>
|
||||
<td><i>FancyRetryPolicy = false</i><br>MaxRetryCount = -2</td>
|
||||
<td rowspan="1">Always Retry for any failure or success.</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td rowspan="1"><b>Blind Batch Job</b></td>
|
||||
<td rowspan="1"><i>FancyRetryPolicy = false</i><br>MaxRetryCount = -1</td>
|
||||
<td>TaskRole1</td>
|
||||
<td><i>FancyRetryPolicy = false</i><br>MaxRetryCount = -1</td>
|
||||
<td rowspan="1">Always Retry for any failure.<br>Never Retry for success.</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td rowspan="1"><b>Batch Job with Task Fault Tolerance</b></td>
|
||||
<td rowspan="1">FancyRetryPolicy = true<br>MaxRetryCount = 3</td>
|
||||
<td>TaskRole1</td>
|
||||
<td>FancyRetryPolicy = true<br>MaxRetryCount = 3</td>
|
||||
<td rowspan="1">Always Retry for transient failure.<br>Never Retry for non-transient failure or success.<br>Retry up to 3 times for unknown failure.</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td rowspan="1"><b>Batch Job without Task Fault Tolerance</b></td>
|
||||
<td rowspan="1">FancyRetryPolicy = true<br>MaxRetryCount = 3</td>
|
||||
<td>TaskRole1</td>
|
||||
<td><i>FancyRetryPolicy = false<br>MaxRetryCount = 0</i></td>
|
||||
<td rowspan="1">For Framework RetryPolicy, same as "Batch Job with Task Fault Tolerance".<br>For Task RetryPolicy, because the Task cannot tolerate any failed Container (for example, it cannot recover from a previously failed Container), Never Retry the Task for any failure or success.</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td rowspan="1"><b>Debug Mode</b></td>
|
||||
<td rowspan="1">FancyRetryPolicy = true<br><i>MaxRetryCount = 0</i></td>
|
||||
<td>TaskRole1</td>
|
||||
<td>FancyRetryPolicy = true<br><i>MaxRetryCount = 0</i></td>
|
||||
<td rowspan="1">Always Retry for transient failure.<br>Never Retry for non-transient failure or unknown failure or success.<br>This can help to capture the unexpected exit of User Service itself.</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
|
||||
|
||||
## <a name="ApplicationCompletionPolicy">ApplicationCompletionPolicy</a>
|
||||
### <a name="ApplicationCompletionPolicy_Overview">Overview</a>
|
||||
ApplicationCompletionPolicy can be configured for each TaskRole to control:
|
||||
1. The conditions to complete the Application.
|
||||
2. The ExitStatus of the completed Application.
|
||||
1. The conditions to complete the [Application](#Concepts_YARN).
|
||||
2. The ExitStatus of the completed [Application](#Concepts_YARN).
|
||||
|
||||
### <a name="ApplicationCompletionPolicy_Usage">Usage</a>
|
||||
For details, please check: [TaskRoleApplicationCompletionPolicyDescriptor](../src/main/java/com/microsoft/frameworklauncher/common/model/TaskRoleApplicationCompletionPolicyDescriptor.java).
|
||||
|
||||
### <a name="ApplicationCompletionPolicy_Examples">Examples</a>
|
||||
Notes, *Italic Conditions* can be inherited from the **DEFAULT** ApplicationCompletionPolicy, so no need to specify them explicitly.
|
||||
Notes:
|
||||
1. *Italic Conditions* can be inherited from the **DEFAULT** ApplicationCompletionPolicy, so no need to specify them explicitly.
|
||||
|
||||
<table>
|
||||
<tbody>
|
||||
|
@ -674,7 +759,7 @@ Notes, *Italic Conditions* can be inherited from the **DEFAULT** ApplicationComp
|
|||
<td rowspan="1"><b>Service</b></td>
|
||||
<td>TaskRole1</td>
|
||||
<td><i>MinFailedTaskCount = 1<br>MinSucceededTaskCount = null</i></td>
|
||||
<td rowspan="1">Actually, any ApplicationCompletionPolicy is fine, since Service's Task will never complete, i.e. its Task's MaxRetryCount is -2.</td>
|
||||
<td rowspan="1">Actually, any ApplicationCompletionPolicy is fine, since Service's Task will never complete, i.e. its Task's MaxRetryCount is -2, see <a href="#RetryPolicy_Examples">RetryPolicy Examples</a>.</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td rowspan="2"><b>MapReduce</b></td>
|
||||
|
|
|
@ -840,7 +840,7 @@ public class ApplicationMaster extends AbstractService {
|
|||
if (fancyRetryPolicy) {
|
||||
// FancyRetryPolicy only handles exit due to transient and non-transient failure specially,
|
||||
// Leave exit due to others to NormalRetryPolicy
|
||||
LOGGER.logWarning(logPrefix +
|
||||
LOGGER.logInfo(logPrefix +
|
||||
"Transfer the RetryDecision to NormalRetryPolicy. Reason: " +
|
||||
fancyRetryPolicyLogSuffix);
|
||||
}
|
||||
|
@ -859,7 +859,7 @@ public class ApplicationMaster extends AbstractService {
|
|||
return;
|
||||
} else {
|
||||
if (exitType == ExitType.SUCCEEDED) {
|
||||
LOGGER.logWarning(completeTaskLogPrefix +
|
||||
LOGGER.logInfo(completeTaskLogPrefix +
|
||||
"Task exited due to %s.", exitType);
|
||||
completeTask(taskStatus);
|
||||
return;
|
||||
|
|
|
@ -110,7 +110,7 @@ public class ExitDiagnostics {
|
|||
DEF.put(ExitStatusKey.AM_KILLED_BY_USER, new ExitStatusValue(
|
||||
ExitStatusKey.AM_KILLED_BY_USER.toInt(),
|
||||
"AM Killed by User", ExitType.TRANSIENT_NORMAL));
|
||||
// AM Internal TransientNormalError: hdfs error, env error...
|
||||
// AM Internal TransientNormalError: machine error, network error, configuration error, environment error...
|
||||
DEF.put(ExitStatusKey.AM_INTERNAL_TRANSIENT_NORMAL_ERROR, new ExitStatusValue(
|
||||
ExitStatusKey.AM_INTERNAL_TRANSIENT_NORMAL_ERROR.toInt(),
|
||||
"AM internal transient normal error", ExitType.TRANSIENT_NORMAL));
|
||||
|
@ -206,7 +206,8 @@ public class ExitDiagnostics {
|
|||
// Note that UserApplication should do some retries on the local machine first, return USER_APP_TRANSIENT_ERROR only if local retries do not work.
|
||||
DEF.put(ExitStatusKey.USER_APP_TRANSIENT_ERROR, new ExitStatusValue(
|
||||
ExitStatusKey.USER_APP_TRANSIENT_ERROR.toInt(),
|
||||
"UserApplication transient error: maybe hdfs error, env error, machine error, connection error...", ExitType.TRANSIENT_NORMAL));
|
||||
"UserApplication transient error: maybe dependent components shutdown, " +
|
||||
"machine error, network error, configuration error, environment error...", ExitType.TRANSIENT_NORMAL));
|
||||
// UserApplication failed, and it can ensure that it will fail on every retry:
|
||||
DEF.put(ExitStatusKey.USER_APP_NON_TRANSIENT_ERROR, new ExitStatusValue(
|
||||
ExitStatusKey.USER_APP_NON_TRANSIENT_ERROR.toInt(),
|
||||
|
|
|
@ -24,7 +24,8 @@ public enum ExitType implements Serializable {
|
|||
SUCCEEDED,
|
||||
|
||||
// Failed, and it can ensure that it will succeed within a finite number of retries:
|
||||
// such as hdfs error, env error, machine error, connection error...
|
||||
// such as dependent components shutdown, machine error, network error,
|
||||
// configuration error, environment error...
|
||||
TRANSIENT_NORMAL,
|
||||
|
||||
// A special TRANSIENT_NORMAL which indicates the exit due to resource conflict
|
||||
|
|
|
@ -34,24 +34,18 @@ import java.io.Serializable;
|
|||
* will retry and retriedCount++ if exit due to failure and maxRetryCount == -1,
|
||||
* will retry and retriedCount++ if exit due to failure and retriedCount < maxRetryCount,
|
||||
* will not retry if all above conditions are not satisfied.
|
||||
*
|
||||
* For all cases, the final ExitStatus is always the same as the ExitStatus of the last attempt.
|
||||
*/
|
||||
public class RetryPolicyDescriptor implements Serializable {
|
||||
@Valid
|
||||
@NotNull
|
||||
@Min(-2)
|
||||
private Integer maxRetryCount = 0;
|
||||
private Boolean fancyRetryPolicy = false;
|
||||
|
||||
@Valid
|
||||
@NotNull
|
||||
private Boolean fancyRetryPolicy = false;
|
||||
|
||||
public Integer getMaxRetryCount() {
|
||||
return maxRetryCount;
|
||||
}
|
||||
|
||||
public void setMaxRetryCount(Integer maxRetryCount) {
|
||||
this.maxRetryCount = maxRetryCount;
|
||||
}
|
||||
@Min(-2)
|
||||
private Integer maxRetryCount = 0;
|
||||
|
||||
public Boolean getFancyRetryPolicy() {
|
||||
return fancyRetryPolicy;
|
||||
|
@ -60,4 +54,12 @@ public class RetryPolicyDescriptor implements Serializable {
|
|||
public void setFancyRetryPolicy(Boolean fancyRetryPolicy) {
|
||||
this.fancyRetryPolicy = fancyRetryPolicy;
|
||||
}
|
||||
|
||||
public Integer getMaxRetryCount() {
|
||||
return maxRetryCount;
|
||||
}
|
||||
|
||||
public void setMaxRetryCount(Integer maxRetryCount) {
|
||||
this.maxRetryCount = maxRetryCount;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -100,11 +100,11 @@ public class RequestManager extends AbstractService { // THREAD SAFE
|
|||
private void pullRequest() throws Exception {
|
||||
Map<String, FrameworkRequest> newFrameworkRequests;
|
||||
try {
|
||||
LOGGER.logInfo("Pulling AllFrameworkRequests");
|
||||
LOGGER.logDebug("Pulling AllFrameworkRequests");
|
||||
|
||||
newFrameworkRequests = zkStore.getAllFrameworkRequests();
|
||||
|
||||
LOGGER.logInfo("Pulled AllFrameworkRequests");
|
||||
LOGGER.logDebug("Pulled AllFrameworkRequests");
|
||||
} catch (KeeperException.NoNodeException e) {
|
||||
LOGGER.logWarning(e,
|
||||
"Failed to getAllFrameworkRequests, LauncherRequest is deleted on ZK");
|
||||
|
|
|
@ -744,7 +744,7 @@ public class Service extends AbstractService {
|
|||
if (fancyRetryPolicy) {
|
||||
// FancyRetryPolicy only handles exit due to transient and non-transient failure specially,
|
||||
// Leave exit due to others to NormalRetryPolicy
|
||||
LOGGER.logWarning(logPrefix +
|
||||
LOGGER.logInfo(logPrefix +
|
||||
"Transfer the RetryDecision to NormalRetryPolicy. Reason: " +
|
||||
fancyRetryPolicyLogSuffix);
|
||||
}
|
||||
|
@ -763,7 +763,7 @@ public class Service extends AbstractService {
|
|||
return;
|
||||
} else {
|
||||
if (exitType == ExitType.SUCCEEDED) {
|
||||
LOGGER.logWarning(completeFrameworkLogPrefix +
|
||||
LOGGER.logInfo(completeFrameworkLogPrefix +
|
||||
"Framework exited due to %s.", exitType);
|
||||
completeFramework(frameworkStatus);
|
||||
return;
|
||||
|
|
|
@ -558,12 +558,12 @@ public class StatusManager extends AbstractService { // THREAD SAFE
|
|||
|
||||
// Initialize new Framework: Add or NonRolling Upgrade Framework
|
||||
if (!frameworkStatuses.containsKey(frameworkName)) {
|
||||
LOGGER.logDebug(logPrefix + "Add new Framework");
|
||||
LOGGER.logInfo(logPrefix + "Add new Framework");
|
||||
addFramework(frameworkRequest);
|
||||
} else {
|
||||
FrameworkStatus frameworkStatus = frameworkStatuses.get(frameworkName);
|
||||
if (!frameworkStatus.getFrameworkVersion().equals(frameworkVersion)) {
|
||||
LOGGER.logDebug(logPrefix + "NonRolling Upgrade Framework");
|
||||
LOGGER.logInfo(logPrefix + "NonRolling Upgrade Framework");
|
||||
upgradeFramework(frameworkRequest);
|
||||
}
|
||||
}
|
||||
|
@ -582,7 +582,7 @@ public class StatusManager extends AbstractService { // THREAD SAFE
|
|||
frameworkName, frameworkVersion);
|
||||
|
||||
if (!frameworkRequests.containsKey(frameworkName)) {
|
||||
LOGGER.logDebug(logPrefix + "Remove Framework permanently");
|
||||
LOGGER.logInfo(logPrefix + "Remove Framework permanently");
|
||||
removeFramework(frameworkName, false);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -106,7 +106,7 @@ public class StatusManager extends AbstractService { // THREAD SAFE
|
|||
* REGION InternalUtils
|
||||
*/
|
||||
private void pullStatus() throws Exception {
|
||||
LOGGER.logInfo("Pulling AggregatedLauncherStatus");
|
||||
LOGGER.logDebug("Pulling AggregatedLauncherStatus");
|
||||
|
||||
Map<String, AggregatedFrameworkStatus> reusableAggFrameworkStatuses =
|
||||
getReusableAggregatedFrameworkStatuses();
|
||||
|
@ -122,7 +122,7 @@ public class StatusManager extends AbstractService { // THREAD SAFE
|
|||
newAggFrameworkStatuses.putAll(nonreusableAggFrameworkStatuses);
|
||||
aggFrameworkStatuses = CommonExts.asReadOnly(newAggFrameworkStatuses);
|
||||
|
||||
LOGGER.logInfo("Pulled AggregatedLauncherStatus: " +
|
||||
LOGGER.logDebug("Pulled AggregatedLauncherStatus: " +
|
||||
"AggregatedFrameworkStatus Reused Percentage: [%s / %s]",
|
||||
reusableAggFrameworkStatuses.size(), aggFrameworkStatuses.size());
|
||||
|
||||
|
|
Загрузка…
Ссылка в новой задаче