Add ChaosContext and ChaosTargetFilter in ChaosParameters (#62)
This commit is contained in:
Родитель
489057c6dd
Коммит
32b4a257a9
|
@ -34,3 +34,7 @@
|
|||
# App and application tests
|
||||
src/sfctl/tests/app_test.py @iyyappam
|
||||
src/sfctl/custom_app.py @iyyappam
|
||||
|
||||
# Chaos and related tests
|
||||
src/sfctl/tests/chaos_test.py @motanv
|
||||
src/sfctl/custom_chaos.py @motanv
|
|
@ -27,6 +27,7 @@ Change Log
|
|||
- Application list related commands now support an optional argument to limit the number of results
|
||||
- Deployed application info can now optionally include health states
|
||||
- Numerous documentation improvements and corrections
|
||||
- ChaosContext (context) and ChaosTargetFilter (chaos-target-filter) arguments are added to Chaos start command (#62)
|
||||
|
||||
3.0.0
|
||||
-----
|
||||
|
|
|
@ -21,6 +21,7 @@ import sfctl.helps.health # pylint: disable=unused-import
|
|||
import sfctl.helps.cluster_upgrade # pylint: disable=unused-import
|
||||
import sfctl.helps.compose # pylint: disable=unused-import
|
||||
import sfctl.helps.app_type # pylint: disable=unused-import
|
||||
import sfctl.helps.chaos # pylint: disable=unused-import
|
||||
|
||||
class SFCommandHelp(CLIHelp):
|
||||
"""Service Fabric CLI help loader"""
|
||||
|
|
|
@ -4,46 +4,44 @@
|
|||
# license information.
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
"""Custom commands for the Service Fabric chaos test service"""
|
||||
"""Custom commands for the Service Fabric chaos service"""
|
||||
|
||||
def start( #pylint: disable=too-many-arguments,too-many-locals
|
||||
client, time_to_run="4294967295", max_cluster_stabilization=60,
|
||||
max_concurrent_faults=1, disable_move_replica_faults=False,
|
||||
wait_time_between_faults=20,
|
||||
wait_time_between_iterations=30, warning_as_error=False,
|
||||
max_percent_unhealthy_nodes=0,
|
||||
max_percent_unhealthy_apps=0,
|
||||
app_type_health_policy_map=None, timeout=60):
|
||||
"""
|
||||
If Chaos is not already running in the cluster, starts running Chaos with
|
||||
the specified Chaos parameters.
|
||||
:param str time_to_run: Total time (in seconds) for which Chaos will run
|
||||
before automatically stopping. The maximum allowed value is 4,294,967,295
|
||||
(System.UInt32.MaxValue).
|
||||
:param int max_cluster_stabilization: The maximum amount of time to wait
|
||||
for all cluster entities to become stable and healthy.
|
||||
:param int max_concurrent_faults: The maximum number of concurrent faults
|
||||
induced per iteration.
|
||||
:param bool disable_move_replica_faults: Disables the move primary and move
|
||||
secondary faults.
|
||||
:param int wait_time_between_faults: Wait time (in seconds) between
|
||||
consecutive faults within a single iteration.
|
||||
:param int wait_time_between_iterations: Time-separation (in seconds)
|
||||
between two consecutive iterations of Chaos.
|
||||
:param bool warning_as_error: When evaluating cluster health during
|
||||
Chaos, treat warnings with the same severity as errors.
|
||||
:param int max_percent_unhealthy_nodes: When evaluating cluster health
|
||||
during Chaos, the maximum allowed percentage of unhealthy nodes before
|
||||
reporting an error.
|
||||
:param int max_percent_unhealthy_apps: When evaluating cluster
|
||||
health during Chaos, the maximum allowed percentage of unhealthy
|
||||
applications before reporting an error.
|
||||
:param str app_type_health_policy_map: JSON encoded list with max
|
||||
percentage unhealthy applications for specific application types. Each
|
||||
entry specifies as a key the application type name and as a value an
|
||||
integer that represents the MaxPercentUnhealthyApplications percentage
|
||||
used to evaluate the applications of the specified application type.
|
||||
"""
|
||||
def parse_chaos_context(formatted_chaos_context):
    """Parse a chaos context from a formatted context.

    :param formatted_chaos_context: The decoded context value supplied to
        the Chaos start command, or None when no context was given.
    :return: None when the input is None, otherwise a ChaosContext
        constructed from the input.
    """
    # Guard first: when no context is supplied there is nothing to wrap,
    # so the SDK model does not need to be imported at all.
    if formatted_chaos_context is None:
        return None

    from azure.servicefabric.models.chaos_context import (
        ChaosContext
    )

    return ChaosContext(formatted_chaos_context)
|
||||
|
||||
def parse_chaos_target_filter(formatted_chaos_target_filter):
    """Parse a chaos target filter from a formatted filter.

    :param formatted_chaos_target_filter: Mapping that may contain the keys
        'NodeTypeInclusionList' and 'ApplicationInclusionList', or None
        when no filter was given.
    :return: None when the input is None, otherwise a ChaosTargetFilter
        built from the two inclusion lists. A key that is absent from the
        mapping is passed to the model as None.
    """
    # Guard first: without a filter there is nothing to build, so the SDK
    # model does not need to be imported at all.
    if formatted_chaos_target_filter is None:
        return None

    from azure.servicefabric.models.chaos_target_filter import (
        ChaosTargetFilter
    )

    nodetype_inclusion_list = formatted_chaos_target_filter.get(
        'NodeTypeInclusionList')
    application_inclusion_list = formatted_chaos_target_filter.get(
        'ApplicationInclusionList')

    return ChaosTargetFilter(nodetype_inclusion_list,
                             application_inclusion_list)
|
||||
|
||||
def start(client, time_to_run="4294967295", max_cluster_stabilization=60, #pylint: disable=too-many-arguments,too-many-locals,missing-docstring
|
||||
max_concurrent_faults=1, disable_move_replica_faults=False,
|
||||
wait_time_between_faults=20,
|
||||
wait_time_between_iterations=30,
|
||||
warning_as_error=False,
|
||||
max_percent_unhealthy_nodes=0,
|
||||
max_percent_unhealthy_apps=0,
|
||||
app_type_health_policy_map=None,
|
||||
context=None,
|
||||
chaos_target_filter=None,
|
||||
timeout=60):
|
||||
from azure.servicefabric.models.chaos_parameters import (
|
||||
ChaosParameters
|
||||
)
|
||||
|
@ -52,6 +50,8 @@ def start( #pylint: disable=too-many-arguments,too-many-locals
|
|||
)
|
||||
from sfctl.custom_health import parse_app_health_map
|
||||
|
||||
context = parse_chaos_context(context)
|
||||
|
||||
health_map = parse_app_health_map(app_type_health_policy_map)
|
||||
|
||||
health_policy = ClusterHealthPolicy(warning_as_error,
|
||||
|
@ -59,13 +59,17 @@ def start( #pylint: disable=too-many-arguments,too-many-locals
|
|||
max_percent_unhealthy_apps,
|
||||
health_map)
|
||||
|
||||
# Does not support Chaos Context currently
|
||||
target_filter = parse_chaos_target_filter(chaos_target_filter)
|
||||
|
||||
#pylint: disable=too-many-arguments
|
||||
chaos_params = ChaosParameters(time_to_run, max_cluster_stabilization,
|
||||
max_concurrent_faults,
|
||||
not disable_move_replica_faults,
|
||||
wait_time_between_faults,
|
||||
wait_time_between_iterations,
|
||||
health_policy,
|
||||
None)
|
||||
context,
|
||||
target_filter)
|
||||
#pylint: enable=too-many-arguments
|
||||
|
||||
client.start_chaos(chaos_params, timeout)
|
||||
|
|
|
@ -0,0 +1,183 @@
|
|||
# -----------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License. See License.txt in the project root for
|
||||
# license information.
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
"""Help documentation for Service Fabric Chaos commands."""
|
||||
|
||||
from knack.help_files import helps
|
||||
|
||||
helps['chaos start'] = """
|
||||
type: command
|
||||
short-summary: Starts Chaos in the cluster.
|
||||
long-summary: If Chaos is not already running in the cluster,
|
||||
it starts Chaos with the passed in Chaos parameters.
|
||||
If Chaos is already running when this call is made,
|
||||
the call fails with the error code FABRIC_E_CHAOS_ALREADY_RUNNING.
|
||||
parameters:
|
||||
- name: --time-to-run
|
||||
type: string
|
||||
short-summary: Total time (in seconds) for which Chaos will run
|
||||
before automatically stopping. The maximum allowed value is 4,294,967,295
|
||||
(System.UInt32.MaxValue).
|
||||
- name: --max-cluster-stabilization
|
||||
type: int
|
||||
short-summary: The maximum amount of time to wait
|
||||
for all cluster entities to become stable and healthy.
|
||||
long-summary: Chaos executes in iterations and at the start of
|
||||
each iteration it validates the health of cluster entities.
|
||||
During validation if a cluster entity is not stable and healthy
|
||||
within MaxClusterStabilizationTimeoutInSeconds,
|
||||
Chaos generates a validation failed event.
|
||||
- name: --max-concurrent-faults
|
||||
type: int
|
||||
short-summary: The maximum number of concurrent faults induced
|
||||
per iteration. Chaos executes in iterations and two consecutive
|
||||
iterations are separated by a validation phase. The higher
|
||||
the concurrency, the more aggressive the injection of
|
||||
faults -- inducing more complex series of states to uncover bugs.
|
||||
The recommendation is to start with a value of 2 or 3 and to
|
||||
exercise caution while moving up.
|
||||
- name: --disable-move-replica-faults
|
||||
type: bool
|
||||
short-summary: Disables the move primary and move secondary faults.
|
||||
- name: --wait-time-between-faults
|
||||
type: int
|
||||
short-summary: Wait time (in seconds) between consecutive faults
|
||||
within a single iteration.
|
||||
long-summary: The larger the value, the lower the overlapping
|
||||
between faults and the simpler the sequence of state transitions
|
||||
that the cluster goes through. The recommendation is to start
|
||||
with a value between 1 and 5 and exercise caution while moving up.
|
||||
- name: --wait-time-between-iterations
|
||||
type: int
|
||||
short-summary: Time-separation (in seconds) between two consecutive
|
||||
iterations of Chaos. The larger the value, the lower the fault
|
||||
injection rate.
|
||||
- name: --max-percent-unhealthy-nodes
|
||||
type: int
|
||||
short-summary: When evaluating cluster health during Chaos, the
|
||||
maximum allowed percentage of unhealthy nodes before
|
||||
reporting an error.
|
||||
long-summary: The maximum allowed percentage of unhealthy nodes
|
||||
before reporting an error. For example, to allow 10% of nodes
|
||||
to be unhealthy, this value would be 10. The percentage represents
|
||||
the maximum tolerated percentage of nodes that can be unhealthy
|
||||
before the cluster is considered in error. If the percentage is
|
||||
respected but there is at least one unhealthy node, the health
|
||||
is evaluated as Warning. The percentage is calculated by dividing
|
||||
the number of unhealthy nodes over the total number of nodes
|
||||
in the cluster. The computation rounds up to tolerate one failure
|
||||
on small numbers of nodes. Default percentage is zero.
|
||||
In large clusters, some nodes will always be down or out for
|
||||
repairs, so this percentage should be configured to tolerate that.
|
||||
- name: --max-percent-unhealthy-apps
|
||||
type: int
|
||||
short-summary: When evaluating cluster health during Chaos,
|
||||
the maximum allowed percentage of unhealthy applications
|
||||
before reporting an error.
|
||||
long-summary: The maximum allowed percentage of unhealthy
|
||||
applications before reporting an error. For example,
|
||||
to allow 10% of applications to be unhealthy, this value would be 10.
|
||||
The percentage represents the maximum tolerated percentage
|
||||
of applications that can be unhealthy before the cluster is
|
||||
considered in error. If the percentage is respected but
|
||||
there is at least one unhealthy application, the health
|
||||
is evaluated as Warning. This is calculated by dividing
|
||||
the number of unhealthy applications over the total number
|
||||
of application instances in the cluster, excluding applications
|
||||
of application types that are included in the
|
||||
ApplicationTypeHealthPolicyMap. The computation rounds up
|
||||
to tolerate one failure on small numbers of applications.
|
||||
Default percentage is zero.
|
||||
- name: --app-type-health-policy-map
|
||||
type: string
|
||||
short-summary: JSON encoded list with max
|
||||
percentage unhealthy applications for specific application
|
||||
types. Each entry specifies as a key the application type
|
||||
name and as a value an integer that represents the
|
||||
MaxPercentUnhealthyApplications percentage used to evaluate
|
||||
the applications of the specified application type.
|
||||
long-summary: Defines a map with max percentage unhealthy
|
||||
applications for specific application types. Each entry
|
||||
specifies as key the application type name and as value
|
||||
an integer that represents the MaxPercentUnhealthyApplications
|
||||
percentage used to evaluate the applications of the specified
|
||||
application type. The application type health policy map
|
||||
can be used during cluster health evaluation to describe
|
||||
special application types. The application types included
|
||||
in the map are evaluated against the percentage specified
|
||||
in the map, and not with the global MaxPercentUnhealthyApplications
|
||||
defined in the cluster health policy. The applications of
|
||||
application types specified in the map are not counted against
|
||||
the global pool of applications. For example, if some
|
||||
applications of a type are critical, the cluster administrator
|
||||
can add an entry to the map for that application type and assign
|
||||
it a value of 0% (that is, do not tolerate any failures).
|
||||
All other applications can be evaluated with
|
||||
MaxPercentUnhealthyApplications set to 20% to tolerate
|
||||
some failures out of the thousands of application instances.
|
||||
The application type health policy map is used only if the
|
||||
cluster manifest enables application type health evaluation
|
||||
using the configuration entry for
|
||||
HealthManager/EnableApplicationTypeHealthEvaluation.
|
||||
- name: --context
|
||||
type: string
|
||||
short-summary: JSON encoded map of (string, string) type key-value
|
||||
pairs. The map can be used to record information about the Chaos
|
||||
run. There cannot be more than 100 such pairs and each
|
||||
string (key or value) can be at most 4095 characters long.
|
||||
This map is set by the starter of the Chaos run to optionally
|
||||
store the context about the specific run.
|
||||
- name: --chaos-target-filter
|
||||
type: string
|
||||
short-summary: JSON encoded dictionary with two
|
||||
string type keys. The two keys are NodeTypeInclusionList and
|
||||
ApplicationInclusionList. Values for both of these keys are lists of
|
||||
strings. chaos_target_filter defines all filters for targeted
|
||||
Chaos faults, for example, faulting only certain node types or
|
||||
faulting only certain applications.
|
||||
long-summary: If chaos_target_filter is not used, Chaos faults all cluster entities.
|
||||
If chaos_target_filter is used, Chaos faults only the entities that
|
||||
meet the chaos_target_filter specification. NodeTypeInclusionList
|
||||
and ApplicationInclusionList allow union semantics only. It is
|
||||
not possible to specify an intersection of NodeTypeInclusionList
|
||||
and ApplicationInclusionList. For example,
|
||||
it is not possible to specify "fault this application only when
|
||||
it is on that node type." Once an entity is included in either
|
||||
NodeTypeInclusionList or ApplicationInclusionList, that entity cannot
|
||||
be excluded using ChaosTargetFilter. Even if applicationX does not
|
||||
appear in ApplicationInclusionList, in some Chaos iteration
|
||||
applicationX can be faulted because it happens to be on a node of
|
||||
nodeTypeY that is included in NodeTypeInclusionList.
|
||||
If both NodeTypeInclusionList and ApplicationInclusionList
|
||||
are empty, an ArgumentException is thrown.
|
||||
All types of faults (restart node, restart codepackage, remove replica,
|
||||
restart replica, move primary, and move secondary) are enabled for
|
||||
the nodes of these node types.
|
||||
If a nodetype (say NodeTypeX) does not appear in the
|
||||
NodeTypeInclusionList, then node level faults (like NodeRestart)
|
||||
will never be enabled for the nodes of NodeTypeX, but code package
|
||||
and replica faults can still be enabled for NodeTypeX
|
||||
if an application in the ApplicationInclusionList happens to
|
||||
reside on a node of NodeTypeX.
|
||||
At most 100 node type names can be included in this list,
|
||||
to increase this number, a config upgrade is required for
|
||||
MaxNumberOfNodeTypesInChaosEntityFilter configuration.
|
||||
All replicas belonging to services of these applications are
|
||||
amenable to replica faults (restart replica, remove replica,
|
||||
move primary, and move secondary) by Chaos.
|
||||
Chaos may restart a code package only if the code package hosts
|
||||
replicas of these applications only.
|
||||
If an application does not appear in this list, it can still
|
||||
be faulted in some Chaos iteration if the application ends
|
||||
up on a node of a node type that is included in NodeTypeInclusionList.
|
||||
However if applicationX is tied to nodeTypeY through placement
|
||||
constraints and applicationX is absent from ApplicationInclusionList
|
||||
and nodeTypeY is absent from NodeTypeInclusionList, then
|
||||
applicationX will never be faulted. At most 1000 application
|
||||
names can be included in this list, to increase this number,
|
||||
a config upgrade is required for
|
||||
MaxNumberOfApplicationsInChaosEntityFilter configuration.
|
||||
"""
|
|
@ -64,6 +64,8 @@ def custom_arguments(self, _): #pylint: disable=too-many-statements
|
|||
arg_context.argument('wait_time_between_iterations', type=int)
|
||||
arg_context.argument('max_percent_unhealthy_nodes', type=int)
|
||||
arg_context.argument('max_percent_unhealthy_apps', type=int)
|
||||
arg_context.argument('context', type=json_encoded)
|
||||
arg_context.argument('chaos_target_filter', type=json_encoded)
|
||||
|
||||
with ArgumentsContext(self, 'cluster health') as arg_context:
|
||||
arg_context.argument('nodes_health_state_filter', type=int)
|
||||
|
|
|
@ -0,0 +1,69 @@
|
|||
# -----------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License. See License.txt in the project root for
|
||||
# license information.
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
"""Custom Chaos command related tests"""
|
||||
|
||||
import unittest
|
||||
import sfctl.custom_chaos as sf_c
|
||||
|
||||
class ChaosTests(unittest.TestCase):
    """Unit tests for the Chaos argument parsing helpers"""

    def parse_none_context_test(self):
        """A None context is returned unchanged as None"""
        self.assertIsNone(sf_c.parse_chaos_context(None))

    def parse_populated_context_test(self):
        """A populated context is wrapped and keeps its key-value pairs"""
        from azure.servicefabric.models.chaos_context import (
            ChaosContext
        )

        expected_pairs = (
            ('key1', 'value1'),
            ('key2', 'value2'),
            ('key3', 'value3'),
        )
        parsed = sf_c.parse_chaos_context(dict(expected_pairs))

        self.assertIsInstance(parsed, ChaosContext)
        context_map = parsed.map
        self.assertIsInstance(context_map, dict)
        self.assertEqual(len(context_map), 3)
        for key, value in expected_pairs:
            self.assertEqual(context_map[key], value)

    def parse_none_target_filter_test(self):
        """A None chaos target filter is returned unchanged as None"""
        parsed = sf_c.parse_chaos_target_filter(None)
        self.assertIs(parsed, None)

    def parse_nodetype_list_test(self):
        """Only NodeTypeInclusionList given: node types kept, apps None"""
        expected_node_types = (
            'N0010Ref', 'N0020Ref', 'N0030Ref', 'N0070Ref')

        target = sf_c.parse_chaos_target_filter({
            'NodeTypeInclusionList': list(expected_node_types)
        })

        self.assertEqual(len(target.node_type_inclusion_list), 4)
        self.assertEqual(target.application_inclusion_list, None)
        # Check each entry by position, in the order it was supplied.
        for position, node_type in enumerate(expected_node_types):
            self.assertEqual(
                target.node_type_inclusion_list[position], node_type)

    def parse_application_list_test(self):
        """Only ApplicationInclusionList given: apps kept, node types None"""
        expected_apps = ('fabric:/TestApp1', 'fabric:/TestApp2')

        target = sf_c.parse_chaos_target_filter({
            'ApplicationInclusionList': list(expected_apps)
        })

        self.assertEqual(len(target.application_inclusion_list), 2)
        self.assertEqual(target.node_type_inclusion_list, None)
        # Check each entry by position, in the order it was supplied.
        for position, app_name in enumerate(expected_apps):
            self.assertEqual(
                target.application_inclusion_list[position], app_name)
|
Загрузка…
Ссылка в новой задаче