Add ChaosContext and ChaosTargetFilter in ChaosParameters (#62)

2018-02-08 12:02:02 -08:00 · 2018-02-08 12:02:02 -08:00 · 32b4a257a9
--- a/4
+++ b/4
@ -34,3 +34,7 @@
 # App and application tests
 src/sfctl/tests/app_test.py @iyyappam
 src/sfctl/custom_app.py @iyyappam
+
+# Chaos and related tests
+src/sfctl/tests/chaos_test.py @motanv
+src/sfctl/custom_chaos.py @motanv
--- a/src/README.rst
+++ b/src/README.rst
@ -27,6 +27,7 @@ Change Log
 - Application list related commands now support an optional argument to limit the number of results
 - Deployed application info can now optionally include health states
 - Numerous documentation improvements and corrections
+- ChaosContext (context) and ChaosTargetFilter (chaos-target-filter) arguments are added to Chaos start command (#62)

 3.0.0
 -----
--- a/src/sfctl/commands.py
+++ b/src/sfctl/commands.py
@ -21,6 +21,7 @@ import sfctl.helps.health # pylint: disable=unused-import
 import sfctl.helps.cluster_upgrade # pylint: disable=unused-import
 import sfctl.helps.compose # pylint: disable=unused-import
 import sfctl.helps.app_type # pylint: disable=unused-import
+import sfctl.helps.chaos # pylint: disable=unused-import

 class SFCommandHelp(CLIHelp):
    """Service Fabric CLI help loader"""
--- a/src/sfctl/custom_chaos.py
+++ b/src/sfctl/custom_chaos.py
@ -4,46 +4,44 @@
 # license information.
 # -----------------------------------------------------------------------------

-"""Custom commands for the Service Fabric chaos test service"""
+"""Custom commands for the Service Fabric chaos service"""

-def start( #pylint: disable=too-many-arguments,too-many-locals
-        client, time_to_run="4294967295", max_cluster_stabilization=60,
-        max_concurrent_faults=1, disable_move_replica_faults=False,
-        wait_time_between_faults=20,
-        wait_time_between_iterations=30, warning_as_error=False,
-        max_percent_unhealthy_nodes=0,
-        max_percent_unhealthy_apps=0,
-        app_type_health_policy_map=None, timeout=60):
-    """
-    If Chaos is not already running in the cluster, starts running Chaos with
-    the specified in Chaos parameters.
-    :param str time_to_run: Total time (in seconds) for which Chaos will run
-    before automatically stopping. The maximum allowed value is 4,294,967,295
-    (System.UInt32.MaxValue).
-    :param int max_cluster_stabilization: The maximum amount of time to wait
-    for all cluster entities to become stable and healthy.
-    :param int max_concurrent_faults: The maximum number of concurrent faults
-    induced per iteration.
-    :param bool disable_move_replica_faults: Disables the move primary and move
-    secondary faults.
-    :param int wait_time_between_faults: Wait time (in seconds) between
-    consecutive faults within a single iteration.
-    :param int wait_time_between_iterations: Time-separation (in seconds)
-    between two consecutive iterations of Chaos.
-    :param bool warning_as_error: When evaluating cluster health during
-    Chaos, treat warnings with the same severity as errors.
-    :param int max_percent_unhealthy_nodes: When evaluating cluster health
-    during Chaos, the maximum allowed percentage of unhealthy nodes before
-    reporting an error.
-    :param int max_percent_unhealthy_apps: When evaluating cluster
-    health during Chaos, the maximum allowed percentage of unhealthy
-    applications before reporting an error.
-    :param str app_type_health_policy_map: JSON encoded list with max
-    percentage unhealthy applications for specific application types. Each
-    entry specifies as a key the application type name and as  a value an
-    integer that represents the MaxPercentUnhealthyApplications percentage
-    used to evaluate the applications of the specified application type.
-    """
+def parse_chaos_context(formatted_chaos_context):
+    """Build a ChaosContext model from a formatted context.
+
+    Returns None when formatted_chaos_context is None; otherwise the value
+    is passed unchanged as the only positional argument to ChaosContext.
+    """
+    from azure.servicefabric.models.chaos_context import (
+        ChaosContext
+    )
+
+    if formatted_chaos_context is None:
+        return None
+
+    return ChaosContext(formatted_chaos_context)
+
+def parse_chaos_target_filter(formatted_chaos_target_filter):
+    """Build a ChaosTargetFilter model from a formatted filter.
+
+    Returns None when formatted_chaos_target_filter is None. Otherwise the
+    'NodeTypeInclusionList' and 'ApplicationInclusionList' entries are read
+    with .get() (a missing key yields None) and passed, in that order, to
+    ChaosTargetFilter.
+    """
+    from azure.servicefabric.models.chaos_target_filter import (
+        ChaosTargetFilter
+    )
+
+    if formatted_chaos_target_filter is None:
+        return None
+
+    nodetype_inclusion_list = formatted_chaos_target_filter.get('NodeTypeInclusionList', None) # pylint: disable=line-too-long
+    application_inclusion_list = formatted_chaos_target_filter.get('ApplicationInclusionList', None) # pylint: disable=line-too-long
+
+    return ChaosTargetFilter(nodetype_inclusion_list, application_inclusion_list) # pylint: disable=line-too-long
+
+def start(client, time_to_run="4294967295", max_cluster_stabilization=60, #pylint: disable=too-many-arguments,too-many-locals,missing-docstring
+          max_concurrent_faults=1, disable_move_replica_faults=False,
+          wait_time_between_faults=20,
+          wait_time_between_iterations=30,
+          warning_as_error=False,
+          max_percent_unhealthy_nodes=0,
+          max_percent_unhealthy_apps=0,
+          app_type_health_policy_map=None,
+          context=None,
+          chaos_target_filter=None,
+          timeout=60):
    from azure.servicefabric.models.chaos_parameters import (
        ChaosParameters
    )
@ -52,6 +50,8 @@ def start( #pylint: disable=too-many-arguments,too-many-locals
    )
    from sfctl.custom_health import parse_app_health_map

+    context = parse_chaos_context(context)
+
    health_map = parse_app_health_map(app_type_health_policy_map)

    health_policy = ClusterHealthPolicy(warning_as_error,
@ -59,13 +59,17 @@ def start( #pylint: disable=too-many-arguments,too-many-locals
                                        max_percent_unhealthy_apps,
                                        health_map)

-    # Does not support Chaos Context currently
+    target_filter = parse_chaos_target_filter(chaos_target_filter)
+
+    #pylint: disable=too-many-arguments
    chaos_params = ChaosParameters(time_to_run, max_cluster_stabilization,
                                   max_concurrent_faults,
                                   not disable_move_replica_faults,
                                   wait_time_between_faults,
                                   wait_time_between_iterations,
                                   health_policy,
-                                   None)
+                                   context,
+                                   target_filter)
+    #pylint: enable=too-many-arguments

    client.start_chaos(chaos_params, timeout)
--- a/src/sfctl/helps/chaos.py
+++ b/src/sfctl/helps/chaos.py
@ -0,0 +1,183 @@
+# -----------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License. See License.txt in the project root for
+# license information.
+# -----------------------------------------------------------------------------
+
+"""Help documentation for Service Fabric Chaos commands."""
+
+from knack.help_files import helps
+
+helps['chaos start'] = """
+    type: command
+    short-summary: Starts Chaos in the cluster.
+    long-summary: If Chaos is not already running in the cluster,
+        it starts Chaos with the passed in Chaos parameters.
+        If Chaos is already running when this call is made,
+        the call fails with the error code FABRIC_E_CHAOS_ALREADY_RUNNING.
+    parameters:
+        - name: --time-to-run
+          type: string
+          short-summary: Total time (in seconds) for which Chaos will run
+            before automatically stopping. The maximum allowed value is 4,294,967,295
+            (System.UInt32.MaxValue).
+        - name: --max-cluster-stabilization
+          type: int
+          short-summary: The maximum amount of time to wait
+            for all cluster entities to become stable and healthy.
+          long-summary: Chaos executes in iterations and at the start of
+            each iteration it validates the health of cluster entities.
+            During validation if a cluster entity is not stable and healthy
+            within MaxClusterStabilizationTimeoutInSeconds,
+            Chaos generates a validation failed event.
+        - name: --max-concurrent-faults
+          type: int
+          short-summary: The maximum number of concurrent faults induced
+            per iteration. Chaos executes in iterations and two consecutive
+            iterations are separated by a validation phase. The higher
+            the concurrency, the more aggressive the injection of
+            faults -- inducing more complex series of states to uncover bugs.
+            The recommendation is to start with a value of 2 or 3 and to
+            exercise caution while moving up.
+        - name: --disable-move-replica-faults
+          type: bool
+          short-summary: Disables the move primary and move secondary faults.
+        - name: --wait-time-between-faults
+          type: int
+          short-summary: Wait time (in seconds) between consecutive faults
+            within a single iteration.
+          long-summary: The larger the value, the lower the overlapping
+            between faults and the simpler the sequence of state transitions
+            that the cluster goes through. The recommendation is to start
+            with a value between 1 and 5 and exercise caution while moving up.
+        - name: --wait-time-between-iterations
+          type: int
+          short-summary: Time-separation (in seconds) between two consecutive
+            iterations of Chaos. The larger the value, the lower the fault
+            injection rate.
+        - name: --max-percent-unhealthy-nodes
+          type: int
+          short-summary: When evaluating cluster health during Chaos, the
+            maximum allowed percentage of unhealthy nodes before
+            reporting an error.
+          long-summary: The maximum allowed percentage of unhealthy nodes
+            before reporting an error. For example, to allow 10% of nodes
+            to be unhealthy, this value would be 10. The percentage represents
+            the maximum tolerated percentage of nodes that can be unhealthy
+            before the cluster is considered in error. If the percentage is
+            respected but there is at least one unhealthy node, the health
+            is evaluated as Warning. The percentage is calculated by dividing
+            the number of unhealthy nodes over the total number of nodes
+            in the cluster. The computation rounds up to tolerate one failure
+            on small numbers of nodes. Default percentage is zero.
+            In large clusters, some nodes will always be down or out for
+            repairs, so this percentage should be configured to tolerate that.
+        - name: --max-percent-unhealthy-apps
+          type: int
+          short-summary: When evaluating cluster health during Chaos,
+            the maximum allowed percentage of unhealthy applications
+            before reporting an error.
+          long-summary: The maximum allowed percentage of unhealthy
+            applications before reporting an error. For example,
+            to allow 10% of applications to be unhealthy, this value would be 10.
+            The percentage represents the maximum tolerated percentage
+            of applications that can be unhealthy before the cluster is
+            considered in error. If the percentage is respected but
+            there is at least one unhealthy application, the health
+            is evaluated as Warning. This is calculated by dividing
+            the number of unhealthy applications over the total number
+            of application instances in the cluster, excluding applications
+            of application types that are included in the
+            ApplicationTypeHealthPolicyMap. The computation rounds up
+            to tolerate one failure on small numbers of applications.
+            Default percentage is zero.
+        - name: --app-type-health-policy-map
+          type: string
+          short-summary: JSON encoded list with max
+            percentage unhealthy applications for specific application
+            types. Each entry specifies as a key the application type
+            name and as a value an integer that represents the
+            MaxPercentUnhealthyApplications percentage used to evaluate
+            the applications of the specified application type.
+          long-summary: Defines a map with max percentage unhealthy
+            applications for specific application types. Each entry
+            specifies as key the application type name and as value
+            an integer that represents the MaxPercentUnhealthyApplications
+            percentage used to evaluate the applications of the specified
+            application type. The application type health policy map
+            can be used during cluster health evaluation to describe
+            special application types. The application types included
+            in the map are evaluated against the percentage specified
+            in the map, and not with the global MaxPercentUnhealthyApplications
+            defined in the cluster health policy. The applications of
+            application types specified in the map are not counted against
+            the global pool of applications. For example, if some
+            applications of a type are critical, the cluster administrator
+            can add an entry to the map for that application type and assign
+            it a value of 0% (that is, do not tolerate any failures).
+            All other applications can be evaluated with
+            MaxPercentUnhealthyApplications set to 20% to tolerate
+            some failures out of the thousands of application instances.
+            The application type health policy map is used only if the
+            cluster manifest enables application type health evaluation
+            using the configuration entry for
+            HealthManager/EnableApplicationTypeHealthEvaluation.
+        - name: --context
+          type: string
+          short-summary: JSON encoded map of (string, string) type key-value
+            pairs. The map can be used to record information about the Chaos
+            run. There cannot be more than 100 such pairs and each
+            string (key or value) can be at most 4095 characters long.
+            This map is set by the starter of the Chaos run to optionally
+            store the context about the specific run.
+        - name: --chaos-target-filter
+          type: string
+          short-summary: JSON encoded dictionary with two
+            string type keys. The two keys are NodeTypeInclusionList and
+            ApplicationInclusionList. Values for both of these keys are lists of
+            strings. chaos_target_filter defines all filters for targeted
+            Chaos faults, for example, faulting only certain node types or
+            faulting only certain applications.
+          long-summary: If chaos_target_filter is not used, Chaos faults all cluster entities.
+            If chaos_target_filter is used, Chaos faults only the entities that
+            meet the chaos_target_filter specification. NodeTypeInclusionList
+            and ApplicationInclusionList allow union semantics only. It is
+            not possible to specify an intersection of NodeTypeInclusionList
+            and ApplicationInclusionList. For example,
+            it is not possible to specify "fault this application only when
+            it is on that node type." Once an entity is included in either
+            NodeTypeInclusionList or ApplicationInclusionList, that entity cannot
+            be excluded using ChaosTargetFilter. Even if applicationX does not 
+            appear in ApplicationInclusionList, in some Chaos iteration
+            applicationX can be faulted because it happens to be on a node of
+            nodeTypeY that is included in NodeTypeInclusionList.
+            If both NodeTypeInclusionList and ApplicationInclusionList
+            are empty, an ArgumentException is thrown.
+            All types of faults (restart node, restart codepackage, remove replica,
+            restart replica, move primary, and move secondary) are enabled for
+            the nodes of these node types.
+            If a nodetype (say NodeTypeX) does not appear in the
+            NodeTypeInclusionList, then node level faults (like NodeRestart)
+            will never be enabled for the nodes of NodeTypeX, but code package
+            and replica faults can still be enabled for NodeTypeX 
+            if an application in the ApplicationInclusionList happens to
+            reside on a node of NodeTypeX. 
+            At most 100 node type names can be included in this list;
+            to increase this number, a config upgrade is required for
+            MaxNumberOfNodeTypesInChaosEntityFilter configuration.
+            All replicas belonging to services of these applications are
+            amenable to replica faults (restart replica, remove replica,
+            move primary, and move secondary) by Chaos.
+            Chaos may restart a code package only if the code package hosts
+            replicas of these applications only.
+            If an application does not appear in this list, it can still
+            be faulted in some Chaos iteration if the application ends
+            up on a node of a node type that is included in NodeTypeInclusionList.
+            However if applicationX is tied to nodeTypeY through placement
+            constraints and applicationX is absent from ApplicationInclusionList
+            and nodeTypeY is absent from NodeTypeInclusionList, then
+            applicationX will never be faulted. At most 1000 application
+            names can be included in this list; to increase this number,
+            a config upgrade is required for 
+            MaxNumberOfApplicationsInChaosEntityFilter configuration.
+"""
--- a/src/sfctl/params.py
+++ b/src/sfctl/params.py
@ -64,6 +64,8 @@ def custom_arguments(self, _): #pylint: disable=too-many-statements
        arg_context.argument('wait_time_between_iterations', type=int)
        arg_context.argument('max_percent_unhealthy_nodes', type=int)
        arg_context.argument('max_percent_unhealthy_apps', type=int)
+        arg_context.argument('context', type=json_encoded)
+        arg_context.argument('chaos_target_filter', type=json_encoded)

    with ArgumentsContext(self, 'cluster health') as arg_context:
        arg_context.argument('nodes_health_state_filter', type=int)
--- a/src/sfctl/tests/chaos_test.py
+++ b/src/sfctl/tests/chaos_test.py
@ -0,0 +1,69 @@
+# -----------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License. See License.txt in the project root for
+# license information.
+# -----------------------------------------------------------------------------
+
+"""Custom Chaos command related tests"""
+
+import unittest
+import sfctl.custom_chaos as sf_c
+
+class ChaosTests(unittest.TestCase):
+    """Tests for the chaos context and chaos target filter parsing helpers"""
+
+    def parse_none_context_test(self):
+        """Parsing None context returns None"""
+        res = sf_c.parse_chaos_context(None)
+        self.assertIs(res, None)
+
+    def parse_populated_context_test(self):
+        """Parse context with contents"""
+        from azure.servicefabric.models.chaos_context import (
+            ChaosContext
+        )
+
+        wrapper = sf_c.parse_chaos_context({
+            'key1': 'value1',
+            'key2': 'value2',
+            'key3': 'value3'
+        })
+
+        self.assertIsInstance(wrapper, ChaosContext)
+        res = wrapper.map
+        self.assertIsInstance(res, dict)
+        self.assertEqual(len(res), 3)
+        self.assertEqual(res['key1'], 'value1')
+        self.assertEqual(res['key2'], 'value2')
+        self.assertEqual(res['key3'], 'value3')
+
+    def parse_none_target_filter_test(self):
+        """Parse None chaos target filter returns None"""
+        self.assertIs(sf_c.parse_chaos_target_filter(None), None)
+
+    def parse_nodetype_list_test(self):
+        """Parse nodetypeinclusionlist list"""
+
+        res = sf_c.parse_chaos_target_filter({
+            'NodeTypeInclusionList': [
+                'N0010Ref', 'N0020Ref', 'N0030Ref', 'N0070Ref']
+        })
+
+        self.assertEqual(len(res.node_type_inclusion_list), 4)
+        self.assertEqual(res.application_inclusion_list, None)
+        self.assertEqual(res.node_type_inclusion_list[0], 'N0010Ref')
+        self.assertEqual(res.node_type_inclusion_list[1], 'N0020Ref')
+        self.assertEqual(res.node_type_inclusion_list[2], 'N0030Ref')
+        self.assertEqual(res.node_type_inclusion_list[3], 'N0070Ref')
+
+    def parse_application_list_test(self):
+        """Parse application inclusion list"""
+
+        res = sf_c.parse_chaos_target_filter({
+            'ApplicationInclusionList': ['fabric:/TestApp1', 'fabric:/TestApp2'] #pylint: disable=line-too-long
+        })
+
+        self.assertEqual(len(res.application_inclusion_list), 2)
+        self.assertEqual(res.node_type_inclusion_list, None)
+        self.assertEqual(res.application_inclusion_list[0], 'fabric:/TestApp1')
+        self.assertEqual(res.application_inclusion_list[1], 'fabric:/TestApp2')