Add ChaosContext and ChaosTargetFilter in ChaosParameters (#62)

This commit is contained in:
Mohammad Tanviruzzaman 2018-02-08 12:02:02 -08:00 коммит произвёл samedder
Родитель 489057c6dd
Коммит 32b4a257a9
7 изменённых файлов: 305 добавлений и 41 удалений

Просмотреть файл

@ -34,3 +34,7 @@
# App and application tests
src/sfctl/tests/app_test.py @iyyappam
src/sfctl/custom_app.py @iyyappam
# Chaos and related tests
src/sfctl/tests/chaos_test.py @motanv
src/sfctl/custom_chaos.py @motanv

Просмотреть файл

@ -27,6 +27,7 @@ Change Log
- Application list related commands now support an optional argument to limit the number of results
- Deployed application info can now optionally include health states
- Numerous documentation improvements and corrections
- ChaosContext (context) and ChaosTargetFilter (chaos-target-filter) arguments are added to Chaos start command (#62)
3.0.0
-----

Просмотреть файл

@ -21,6 +21,7 @@ import sfctl.helps.health # pylint: disable=unused-import
import sfctl.helps.cluster_upgrade # pylint: disable=unused-import
import sfctl.helps.compose # pylint: disable=unused-import
import sfctl.helps.app_type # pylint: disable=unused-import
import sfctl.helps.chaos # pylint: disable=unused-import
class SFCommandHelp(CLIHelp):
"""Service Fabric CLI help loader"""

Просмотреть файл

@ -4,46 +4,44 @@
# license information.
# -----------------------------------------------------------------------------
"""Custom commands for the Service Fabric chaos test service"""
"""Custom commands for the Service Fabric chaos service"""
def start( #pylint: disable=too-many-arguments,too-many-locals
client, time_to_run="4294967295", max_cluster_stabilization=60,
max_concurrent_faults=1, disable_move_replica_faults=False,
wait_time_between_faults=20,
wait_time_between_iterations=30, warning_as_error=False,
max_percent_unhealthy_nodes=0,
max_percent_unhealthy_apps=0,
app_type_health_policy_map=None, timeout=60):
"""
If Chaos is not already running in the cluster, starts running Chaos with
the specified Chaos parameters.
:param str time_to_run: Total time (in seconds) for which Chaos will run
before automatically stopping. The maximum allowed value is 4,294,967,295
(System.UInt32.MaxValue).
:param int max_cluster_stabilization: The maximum amount of time to wait
for all cluster entities to become stable and healthy.
:param int max_concurrent_faults: The maximum number of concurrent faults
induced per iteration.
:param bool disable_move_replica_faults: Disables the move primary and move
secondary faults.
:param int wait_time_between_faults: Wait time (in seconds) between
consecutive faults within a single iteration.
:param int wait_time_between_iterations: Time-separation (in seconds)
between two consecutive iterations of Chaos.
:param bool warning_as_error: When evaluating cluster health during
Chaos, treat warnings with the same severity as errors.
:param int max_percent_unhealthy_nodes: When evaluating cluster health
during Chaos, the maximum allowed percentage of unhealthy nodes before
reporting an error.
:param int max_percent_unhealthy_apps: When evaluating cluster
health during Chaos, the maximum allowed percentage of unhealthy
applications before reporting an error.
:param str app_type_health_policy_map: JSON encoded list with max
percentage unhealthy applications for specific application types. Each
entry specifies as a key the application type name and as a value an
integer that represents the MaxPercentUnhealthyApplications percentage
used to evaluate the applications of the specified application type.
"""
def parse_chaos_context(formatted_chaos_context):
    """Parse a chaos context from a formatted context.

    :param formatted_chaos_context: The already-decoded context value, or
        None when no context was supplied. The unit tests pass a dict of
        string keys to string values.
    :return: None when the input is None; otherwise a ChaosContext
        constructed from the input.
    """
    # Nothing to wrap, so return before touching the SDK model import.
    if formatted_chaos_context is None:
        return None

    from azure.servicefabric.models.chaos_context import (
        ChaosContext
    )

    return ChaosContext(formatted_chaos_context)
def parse_chaos_target_filter(formatted_chaos_target_filter):
    """Parse a chaos target filter from a formatted filter.

    :param formatted_chaos_target_filter: The already-decoded filter, or
        None when no filter was supplied. It must support ``.get``; the
        keys read are 'NodeTypeInclusionList' and
        'ApplicationInclusionList'.
    :return: None when the input is None; otherwise a ChaosTargetFilter
        built from the two inclusion lists. A key that is absent from the
        input is passed to the model as None.
    """
    # Nothing to parse, so return before touching the SDK model import.
    if formatted_chaos_target_filter is None:
        return None

    from azure.servicefabric.models.chaos_target_filter import (
        ChaosTargetFilter
    )

    # dict.get already yields None for a missing key.
    nodetype_inclusion_list = formatted_chaos_target_filter.get(
        'NodeTypeInclusionList')
    application_inclusion_list = formatted_chaos_target_filter.get(
        'ApplicationInclusionList')

    return ChaosTargetFilter(nodetype_inclusion_list,
                             application_inclusion_list)
def start(client, time_to_run="4294967295", max_cluster_stabilization=60, #pylint: disable=too-many-arguments,too-many-locals,missing-docstring
max_concurrent_faults=1, disable_move_replica_faults=False,
wait_time_between_faults=20,
wait_time_between_iterations=30,
warning_as_error=False,
max_percent_unhealthy_nodes=0,
max_percent_unhealthy_apps=0,
app_type_health_policy_map=None,
context=None,
chaos_target_filter=None,
timeout=60):
from azure.servicefabric.models.chaos_parameters import (
ChaosParameters
)
@ -52,6 +50,8 @@ def start( #pylint: disable=too-many-arguments,too-many-locals
)
from sfctl.custom_health import parse_app_health_map
context = parse_chaos_context(context)
health_map = parse_app_health_map(app_type_health_policy_map)
health_policy = ClusterHealthPolicy(warning_as_error,
@ -59,13 +59,17 @@ def start( #pylint: disable=too-many-arguments,too-many-locals
max_percent_unhealthy_apps,
health_map)
# Does not support Chaos Context currently
target_filter = parse_chaos_target_filter(chaos_target_filter)
#pylint: disable=too-many-arguments
chaos_params = ChaosParameters(time_to_run, max_cluster_stabilization,
max_concurrent_faults,
not disable_move_replica_faults,
wait_time_between_faults,
wait_time_between_iterations,
health_policy,
None)
context,
target_filter)
#pylint: enable=too-many-arguments
client.start_chaos(chaos_params, timeout)

183
src/sfctl/helps/chaos.py Normal file
Просмотреть файл

@ -0,0 +1,183 @@
# -----------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# -----------------------------------------------------------------------------
"""Help documentation for Service Fabric Chaos commands."""
from knack.help_files import helps
helps['chaos start'] = """
type: command
short-summary: Starts Chaos in the cluster.
long-summary: If Chaos is not already running in the cluster,
it starts Chaos with the passed in Chaos parameters.
If Chaos is already running when this call is made,
the call fails with the error code FABRIC_E_CHAOS_ALREADY_RUNNING.
parameters:
- name: --time-to-run
type: string
short-summary: Total time (in seconds) for which Chaos will run
before automatically stopping. The maximum allowed value is 4,294,967,295
(System.UInt32.MaxValue).
- name: --max-cluster-stabilization
type: int
short-summary: The maximum amount of time to wait
for all cluster entities to become stable and healthy.
long-summary: Chaos executes in iterations and at the start of
each iteration it validates the health of cluster entities.
During validation if a cluster entity is not stable and healthy
within MaxClusterStabilizationTimeoutInSeconds,
Chaos generates a validation failed event.
- name: --max-concurrent-faults
type: int
short-summary: The maximum number of concurrent faults induced
per iteration. Chaos executes in iterations and two consecutive
iterations are separated by a validation phase. The higher
the concurrency, the more aggressive the injection of
faults -- inducing more complex series of states to uncover bugs.
The recommendation is to start with a value of 2 or 3 and to
exercise caution while moving up.
- name: --disable-move-replica-faults
type: bool
short-summary: Disables the move primary and move secondary faults.
- name: --wait-time-between-faults
type: int
short-summary: Wait time (in seconds) between consecutive faults
within a single iteration.
long-summary: The larger the value, the lower the overlapping
between faults and the simpler the sequence of state transitions
that the cluster goes through. The recommendation is to start
with a value between 1 and 5 and exercise caution while moving up.
- name: --wait-time-between-iterations
type: int
short-summary: Time-separation (in seconds) between two consecutive
iterations of Chaos. The larger the value, the lower the fault
injection rate.
- name: --max-percent-unhealthy-nodes
type: int
short-summary: When evaluating cluster health during Chaos, the
maximum allowed percentage of unhealthy nodes before
reporting an error.
long-summary: The maximum allowed percentage of unhealthy nodes
before reporting an error. For example, to allow 10% of nodes
to be unhealthy, this value would be 10. The percentage represents
the maximum tolerated percentage of nodes that can be unhealthy
before the cluster is considered in error. If the percentage is
respected but there is at least one unhealthy node, the health
is evaluated as Warning. The percentage is calculated by dividing
the number of unhealthy nodes over the total number of nodes
in the cluster. The computation rounds up to tolerate one failure
on small numbers of nodes. Default percentage is zero.
In large clusters, some nodes will always be down or out for
repairs, so this percentage should be configured to tolerate that.
- name: --max-percent-unhealthy-apps
type: int
short-summary: When evaluating cluster health during Chaos,
the maximum allowed percentage of unhealthy applications
before reporting an error.
long-summary: The maximum allowed percentage of unhealthy
applications before reporting an error. For example,
to allow 10% of applications to be unhealthy, this value would be 10.
The percentage represents the maximum tolerated percentage
of applications that can be unhealthy before the cluster is
considered in error. If the percentage is respected but
there is at least one unhealthy application, the health
is evaluated as Warning. This is calculated by dividing
the number of unhealthy applications over the total number
of application instances in the cluster, excluding applications
of application types that are included in the
ApplicationTypeHealthPolicyMap. The computation rounds up
to tolerate one failure on small numbers of applications.
Default percentage is zero.
- name: --app-type-health-policy-map
type: string
short-summary: JSON encoded list with max
percentage unhealthy applications for specific application
types. Each entry specifies as a key the application type
name and as a value an integer that represents the
MaxPercentUnhealthyApplications percentage used to evaluate
the applications of the specified application type.
long-summary: Defines a map with max percentage unhealthy
applications for specific application types. Each entry
specifies as key the application type name and as value
an integer that represents the MaxPercentUnhealthyApplications
percentage used to evaluate the applications of the specified
application type. The application type health policy map
can be used during cluster health evaluation to describe
special application types. The application types included
in the map are evaluated against the percentage specified
in the map, and not with the global MaxPercentUnhealthyApplications
defined in the cluster health policy. The applications of
application types specified in the map are not counted against
the global pool of applications. For example, if some
applications of a type are critical, the cluster administrator
can add an entry to the map for that application type and assign
it a value of 0% (that is, do not tolerate any failures).
All other applications can be evaluated with
MaxPercentUnhealthyApplications set to 20% to tolerate
some failures out of the thousands of application instances.
The application type health policy map is used only if the
cluster manifest enables application type health evaluation
using the configuration entry for
HealthManager/EnableApplicationTypeHealthEvaluation.
- name: --context
type: string
short-summary: JSON encoded map of (string, string) type key-value
pairs. The map can be used to record information about the Chaos
run. There cannot be more than 100 such pairs and each
string (key or value) can be at most 4095 characters long.
This map is set by the starter of the Chaos run to optionally
store the context about the specific run.
- name: --chaos-target-filter
type: string
short-summary: JSON encoded dictionary with two
string type keys. The two keys are NodeTypeInclusionList and
ApplicationInclusionList. Values for both of these keys are list of
string. chaos_target_filter defines all filters for targeted
Chaos faults, for example, faulting only certain node types or
faulting only certain applications.
long-summary: If chaos_target_filter is not used, Chaos faults all cluster entities.
If chaos_target_filter is used, Chaos faults only the entities that
meet the chaos_target_filter specification. NodeTypeInclusionList
and ApplicationInclusionList allow a union semantics only. It is
not possible to specify an intersection of NodeTypeInclusionList
and ApplicationInclusionList. For example,
it is not possible to specify "fault this application only when
it is on that node type." Once an entity is included in either
NodeTypeInclusionList or ApplicationInclusionList, that entity cannot
be excluded using ChaosTargetFilter. Even if applicationX does not
appear in ApplicationInclusionList, in some Chaos iteration
applicationX can be faulted because it happens to be on a node of
nodeTypeY that is included in NodeTypeInclusionList.
If both NodeTypeInclusionList and ApplicationInclusionList
are empty, an ArgumentException is thrown.
All types of faults (restart node, restart codepackage, remove replica,
restart replica, move primary, and move secondary) are enabled for
the nodes of these node types.
If a nodetype (say NodeTypeX) does not appear in the
NodeTypeInclusionList, then node level faults (like NodeRestart)
will never be enabled for the nodes of NodeTypeX, but code package
and replica faults can still be enabled for NodeTypeX
if an application in the ApplicationInclusionList happens to
reside on a node of NodeTypeX.
At most 100 node type names can be included in this list,
to increase this number, a config upgrade is required for
MaxNumberOfNodeTypesInChaosEntityFilter configuration.
All replicas belonging to services of these applications are
amenable to replica faults (restart replica, remove replica,
move primary, and move secondary) by Chaos.
Chaos may restart a code package only if the code package hosts
replicas of these applications only.
If an application does not appear in this list, it can still
be faulted in some Chaos iteration if the application ends
up on a node of a node type that is included in NodeTypeInclusionList.
However if applicationX is tied to nodeTypeY through placement
constraints and applicationX is absent from ApplicationInclusionList
and nodeTypeY is absent from NodeTypeInclusionList, then
applicationX will never be faulted. At most 1000 application
names can be included in this list, to increase this number,
a config upgrade is required for
MaxNumberOfApplicationsInChaosEntityFilter configuration.
"""

Просмотреть файл

@ -64,6 +64,8 @@ def custom_arguments(self, _): #pylint: disable=too-many-statements
arg_context.argument('wait_time_between_iterations', type=int)
arg_context.argument('max_percent_unhealthy_nodes', type=int)
arg_context.argument('max_percent_unhealthy_apps', type=int)
arg_context.argument('context', type=json_encoded)
arg_context.argument('chaos_target_filter', type=json_encoded)
with ArgumentsContext(self, 'cluster health') as arg_context:
arg_context.argument('nodes_health_state_filter', type=int)

Просмотреть файл

@ -0,0 +1,69 @@
# -----------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# -----------------------------------------------------------------------------
"""Custom Chaos command related tests"""
import unittest
import sfctl.custom_chaos as sf_c
class ChaosTests(unittest.TestCase):
    """Unit tests for the Chaos argument parsing helpers"""

    def parse_none_context_test(self):
        """A None context is handed back as None"""
        parsed = sf_c.parse_chaos_context(None)
        self.assertIs(parsed, None)

    def parse_populated_context_test(self):
        """A populated context is wrapped and keeps every pair"""
        from azure.servicefabric.models.chaos_context import (
            ChaosContext
        )

        expected = {
            'key1': 'value1',
            'key2': 'value2',
            'key3': 'value3'
        }
        # Hand over a copy so the expectations stay independent of the
        # object held by the wrapper.
        wrapper = sf_c.parse_chaos_context(dict(expected))
        self.assertIsInstance(wrapper, ChaosContext)

        context_map = wrapper.map
        self.assertIsInstance(context_map, dict)
        self.assertEqual(len(context_map), 3)
        for key in ('key1', 'key2', 'key3'):
            self.assertEqual(context_map[key], expected[key])

    def parse_none_target_filter_test(self):
        """A None chaos target filter is handed back as None"""
        parsed = sf_c.parse_chaos_target_filter(None)
        self.assertIs(parsed, None)

    def parse_nodetype_list_test(self):
        """Only NodeTypeInclusionList supplied"""
        node_types = ['N0010Ref', 'N0020Ref', 'N0030Ref', 'N0070Ref']
        target_filter = sf_c.parse_chaos_target_filter({
            'NodeTypeInclusionList': list(node_types)
        })

        self.assertEqual(len(target_filter.node_type_inclusion_list), 4)
        self.assertEqual(target_filter.application_inclusion_list, None)
        for index, node_type in enumerate(node_types):
            self.assertEqual(
                target_filter.node_type_inclusion_list[index], node_type)

    def parse_application_list_test(self):
        """Only ApplicationInclusionList supplied"""
        applications = ['fabric:/TestApp1', 'fabric:/TestApp2']
        target_filter = sf_c.parse_chaos_target_filter({
            'ApplicationInclusionList': list(applications)
        })

        self.assertEqual(len(target_filter.application_inclusion_list), 2)
        self.assertEqual(target_filter.node_type_inclusion_list, None)
        for index, application in enumerate(applications):
            self.assertEqual(
                target_filter.application_inclusion_list[index], application)