added retries for agent cgroups test (#3075)

* retries for agent cgroups test * pylint warn * addressed comment
2024-03-01 12:47:14 -08:00 · 2024-03-01 12:47:14 -08:00 · cc94d46faa
--- a/tests_e2e/tests/lib/cgroup_helpers.py
+++ b/tests_e2e/tests/lib/cgroup_helpers.py
@ -8,6 +8,7 @@ from azurelinuxagent.common.utils import shellutil
 from azurelinuxagent.common.version import DISTRO_NAME, DISTRO_VERSION
 from tests_e2e.tests.lib.agent_log import AgentLog
 from tests_e2e.tests.lib.logging import log
 from tests_e2e.tests.lib.retry import retry_if_false
 BASE_CGROUP = '/sys/fs/cgroup'
 AGENT_CGROUP_NAME = 'WALinuxAgent'
@ -93,23 +94,27 @@ def verify_agent_cgroup_assigned_correctly():
    This method checks agent is running and assigned to the correct cgroup using service status output
    """
    log.info("===== Verifying the daemon and the agent are assigned to the same correct cgroup using systemd")
    service_status = shellutil.run_command(["systemctl", "status", systemd.get_agent_unit_name()])
    log.info("Agent service status output:\n%s", service_status)
    is_active = False
    is_cgroup_assigned = False
    cgroup_mount_path = get_agent_cgroup_mount_path()
-    is_active_pattern = re.compile(r".*Active:\s+active.*")
+    service_status = ""
-    for line in service_status.splitlines():
+    def check_agent_service_cgroup():
-        if re.match(is_active_pattern, line):
+        is_active = False
-            is_active = True
+        is_cgroup_assigned = False
-        elif cgroup_mount_path in line:
+        service_status = shellutil.run_command(["systemctl", "status", systemd.get_agent_unit_name()])
-            is_cgroup_assigned = True
+        log.info("Agent service status output:\n%s", service_status)
        is_active_pattern = re.compile(r".*Active:\s+active.*")
-    if not is_active:
+        for line in service_status.splitlines():
-        fail('walinuxagent service was not active/running. Service status:{0}'.format(service_status))
+            if re.match(is_active_pattern, line):
-    if not is_cgroup_assigned:
+                is_active = True
-        fail('walinuxagent service was not assigned to the expected cgroup:{0}'.format(cgroup_mount_path))
+            elif cgroup_mount_path in line:
                is_cgroup_assigned = True
        return is_active and is_cgroup_assigned
    # The check can run before the correct cgroup is assigned and reflected in the service status, so retry it a few times
    if not retry_if_false(check_agent_service_cgroup):
        fail('walinuxagent service was not assigned to the expected cgroup:{0}. Current agent status:{1}'.format(cgroup_mount_path, service_status))
    log.info("Successfully verified the agent cgroup assigned correctly by systemd\n")
--- a/tests_e2e/tests/scripts/agent_cgroups-check_cgroups_agent.py
+++ b/tests_e2e/tests/scripts/agent_cgroups-check_cgroups_agent.py
@ -61,22 +61,26 @@ def verify_agent_cgroup_created_on_file_system():
    """
    log.info("===== Verifying the agent cgroup paths exist on file system")
    agent_cgroup_mount_path = get_agent_cgroup_mount_path()
-    all_agent_cgroup_controllers_path_exist = True
+    log.info("expected agent cgroup mount path: %s", agent_cgroup_mount_path)
    missing_agent_cgroup_controllers_path = []
    verified_agent_cgroup_controllers_path = []
-    log.info("expected agent cgroup mount path: %s", agent_cgroup_mount_path)
+    def is_agent_cgroup_controllers_path_exist():
        all_controllers_path_exist = True
-    for controller in AGENT_CONTROLLERS:
+        for controller in AGENT_CONTROLLERS:
-        agent_controller_path = os.path.join(BASE_CGROUP, controller, agent_cgroup_mount_path[1:])
+            agent_controller_path = os.path.join(BASE_CGROUP, controller, agent_cgroup_mount_path[1:])
-        if not os.path.exists(agent_controller_path):
+            if not os.path.exists(agent_controller_path):
-            all_agent_cgroup_controllers_path_exist = False
+                all_controllers_path_exist = False
-            missing_agent_cgroup_controllers_path.append(agent_controller_path)
+                missing_agent_cgroup_controllers_path.append(agent_controller_path)
-        else:
+            else:
-            verified_agent_cgroup_controllers_path.append(agent_controller_path)
+                verified_agent_cgroup_controllers_path.append(agent_controller_path)
        return all_controllers_path_exist
-    if not all_agent_cgroup_controllers_path_exist:
+    # The check can run before the agent has set up its cgroup configuration, so retry it a few times
    if not retry_if_false(is_agent_cgroup_controllers_path_exist):
        fail("Agent's cgroup paths couldn't be found on file system. Missing agent cgroups path :{0}.\n Verified agent cgroups path:{1}".format(missing_agent_cgroup_controllers_path, verified_agent_cgroup_controllers_path))
    log.info('Verified all agent cgroup paths are present.\n {0}'.format(verified_agent_cgroup_controllers_path))