* Monitor RAM usage for VMAgent

* address comments

* fix Unit test

* address new comments

* refactor metricvalue

* fix tests

* fix monitor UTS

* fix pylint warning

* update periodic report

* addressed few more comments

* fix ut errors

* pylint warnings

* pylint error

* fix tests

* address private members

* log counter found error

* fix test errors
This commit is contained in:
Nageswara Nandigam 2022-06-02 09:26:46 -07:00 коммит произвёл GitHub
Родитель 48158a8b56
Коммит 8320fee03c
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
16 изменённых файлов: 313 добавлений и 123 удалений

Просмотреть файл

@ -13,21 +13,60 @@
# limitations under the License.
#
# Requires Python 2.6+ and Openssl 1.0+
from collections import namedtuple
import errno
import os
import re
from datetime import timedelta
from azurelinuxagent.common import logger
from azurelinuxagent.common import logger, conf
from azurelinuxagent.common.exception import CGroupsException
from azurelinuxagent.common.future import ustr
from azurelinuxagent.common.osutil import get_osutil
from azurelinuxagent.common.utils import fileutil
_REPORT_EVERY_HOUR = timedelta(hours=1)
_DEFAULT_REPORT_PERIOD = timedelta(seconds=conf.get_cgroup_check_period())
AGENT_NAME_TELEMETRY = "walinuxagent.service" # Name used for telemetry; it needs to be consistent even if the name of the service changes
MetricValue = namedtuple('Metric', ['category', 'counter', 'instance', 'value'])
class CounterNotFound(Exception):
    """Raised when a requested counter has no entry in a cgroup stat file."""
class MetricValue(object):
    """
    Class for defining all the required metric fields to send telemetry.

    Every field is exposed through a getter-only property, so a field cannot be
    reassigned through its public name after construction.

    :param category: category the metric belongs to
    :param counter: name of the counter being reported
    :param instance: name of the entity the value was collected for
    :param value: the collected value
    :param report_period: interval associated with reporting this metric;
        defaults to _DEFAULT_REPORT_PERIOD
    """

    def __init__(self, category, counter, instance, value, report_period=_DEFAULT_REPORT_PERIOD):
        self._category = category
        self._counter = counter
        self._instance = instance
        self._value = value
        self._report_period = report_period

    def __repr__(self):
        # Keeps log messages and assertion failures readable, as they were when this type was a namedtuple.
        return "MetricValue(category={0!r}, counter={1!r}, instance={2!r}, value={3!r}, report_period={4!r})".format(
            self._category, self._counter, self._instance, self._value, self._report_period)

    @property
    def category(self):
        return self._category

    @property
    def counter(self):
        return self._counter

    @property
    def instance(self):
        return self._instance

    @property
    def value(self):
        return self._value

    @property
    def report_period(self):
        return self._report_period
class MetricsCategory(object):
@ -40,6 +79,7 @@ class MetricsCounter(object):
TOTAL_MEM_USAGE = "Total Memory Usage"
MAX_MEM_USAGE = "Max Memory Usage"
THROTTLED_TIME = "Throttled Time"
SWAP_MEM_USAGE = "Swap Memory Usage"
re_user_system_times = re.compile(r'user (\d+)\nsystem (\d+)\n')
@ -166,7 +206,8 @@ class CpuCgroup(CGroup):
#
match = re_user_system_times.match(cpuacct_stat)
if not match:
raise CGroupsException("The contents of {0} are invalid: {1}".format(self._get_cgroup_file('cpuacct.stat'), cpuacct_stat))
raise CGroupsException(
"The contents of {0} are invalid: {1}".format(self._get_cgroup_file('cpuacct.stat'), cpuacct_stat))
cpu_ticks = int(match.groups()[0]) + int(match.groups()[1])
return cpu_ticks
@ -239,7 +280,8 @@ class CpuCgroup(CGroup):
return float(self.get_throttled_time() / 1E9)
if not self._cpu_usage_initialized():
raise CGroupsException("initialize_cpu_usage() must be invoked before the first call to get_throttled_time()")
raise CGroupsException(
"initialize_cpu_usage() must be invoked before the first call to get_throttled_time()")
self._previous_throttled_time = self._current_throttled_time
self._current_throttled_time = self.get_throttled_time()
@ -250,53 +292,99 @@ class CpuCgroup(CGroup):
tracked = []
cpu_usage = self.get_cpu_usage()
if cpu_usage >= float(0):
tracked.append(MetricValue(MetricsCategory.CPU_CATEGORY, MetricsCounter.PROCESSOR_PERCENT_TIME, self.name, cpu_usage))
tracked.append(
MetricValue(MetricsCategory.CPU_CATEGORY, MetricsCounter.PROCESSOR_PERCENT_TIME, self.name, cpu_usage))
if 'track_throttled_time' in kwargs and kwargs['track_throttled_time']:
throttled_time = self.get_cpu_throttled_time()
if cpu_usage >= float(0) and throttled_time >= float(0):
tracked.append(MetricValue(MetricsCategory.CPU_CATEGORY, MetricsCounter.THROTTLED_TIME, self.name, throttled_time))
tracked.append(
MetricValue(MetricsCategory.CPU_CATEGORY, MetricsCounter.THROTTLED_TIME, self.name, throttled_time))
return tracked
class MemoryCgroup(CGroup):
    """
    Cgroup of the memory controller; reads memory counters from the files under the cgroup's path.
    """

    def __init__(self, name, cgroup_path):
        super(MemoryCgroup, self).__init__(name, cgroup_path)

        # Number of times the swap counter was not found; the warning is logged only for the first miss.
        self._counter_not_found_error_count = 0

    def _get_memory_stat_counter(self, counter_name):
        """
        Read a single counter from the cgroup's memory.stat file.

        :param counter_name: name of the counter as it appears at the start of a line (e.g. "rss")
        :return: value of the counter
        :rtype: int
        :raises IOError, OSError: memory.stat does not exist (errno ENOENT is propagated unchanged)
        :raises CGroupsException: any other failure while reading or parsing the file
        :raises CounterNotFound: the file has no line for counter_name
        """
        try:
            # Compiled once per call instead of once per line; the name is escaped so it is always
            # matched literally. The trailing \s+ keeps "rss" from matching "rss_huge".
            re_memory_counter = re.compile(r'{0}\s+(\d+)'.format(re.escape(counter_name)))
            with open(os.path.join(self.path, 'memory.stat')) as memory_stat:
                # cat /sys/fs/cgroup/memory/azure.slice/memory.stat
                # cache 67178496
                # rss 42340352
                # rss_huge 6291456
                # swap 0
                for line in memory_stat:
                    match = re_memory_counter.match(line)
                    if match is not None:
                        return int(match.group(1))
        except (IOError, OSError) as e:
            if e.errno == errno.ENOENT:
                raise
            raise CGroupsException("Failed to read memory.stat: {0}".format(ustr(e)))
        except Exception as e:
            raise CGroupsException("Failed to read memory.stat: {0}".format(ustr(e)))

        raise CounterNotFound("Cannot find counter: {0}".format(counter_name))

    def get_memory_usage(self):
        """
        Collect RSS+CACHE from memory.stat cgroup.

        :return: Memory usage in bytes
        :rtype: int
        """
        cache = self._get_memory_stat_counter("cache")
        rss = self._get_memory_stat_counter("rss")
        return cache + rss

    def try_swap_memory_usage(self):
        """
        Collect SWAP from memory.stat cgroup.

        :return: Memory usage in bytes; 0 when memory.stat has no swap counter
        :rtype: int
        Note: stat file is the only place to get the SWAP since other swap related file memory.memsw.usage_in_bytes is for total Memory+SWAP.
        """
        try:
            return self._get_memory_stat_counter("swap")
        except CounterNotFound as e:
            # Warn only the first time the counter is missing; later misses are just counted.
            if self._counter_not_found_error_count < 1:
                logger.periodic_warn(logger.EVERY_HALF_HOUR,
                                     'Could not find swap counter from "memory.stat" file in the cgroup: {0}.'
                                     ' Internal error: {1}'.format(self.path, ustr(e)))
                self._counter_not_found_error_count += 1
            return 0

    def get_max_memory_usage(self):
        """
        Collect memory.max_usage_in_bytes from the cgroup.

        :return: Memory usage in bytes
        :rtype: int
        :raises IOError, OSError: the file does not exist (errno ENOENT is propagated unchanged)
        :raises CGroupsException: any other failure while reading or converting the value
        """
        usage = 0
        try:
            usage = int(self._get_parameters('memory.max_usage_in_bytes', first_line_only=True))
        except Exception as e:
            if isinstance(e, (IOError, OSError)) and e.errno == errno.ENOENT:  # pylint: disable=E1101
                raise
            raise CGroupsException("Exception while attempting to read {0}".format("memory.max_usage_in_bytes"), e)

        return usage

    def get_tracked_metrics(self, **_):
        """
        Collect the memory metrics of this cgroup.

        Total usage is created with the default report period; max usage and swap usage are
        created with _REPORT_EVERY_HOUR.

        :return: list of MetricValue
        """
        return [
            MetricValue(MetricsCategory.MEMORY_CATEGORY, MetricsCounter.TOTAL_MEM_USAGE, self.name,
                        self.get_memory_usage()),
            MetricValue(MetricsCategory.MEMORY_CATEGORY, MetricsCounter.MAX_MEM_USAGE, self.name,
                        self.get_max_memory_usage(), _REPORT_EVERY_HOUR),
            MetricValue(MetricsCategory.MEMORY_CATEGORY, MetricsCounter.SWAP_MEM_USAGE, self.name,
                        self.try_swap_memory_usage(), _REPORT_EVERY_HOUR)
        ]

Просмотреть файл

@ -23,7 +23,7 @@ import threading
from azurelinuxagent.common import conf
from azurelinuxagent.common import logger
from azurelinuxagent.common.cgroup import CpuCgroup, AGENT_NAME_TELEMETRY, MetricsCounter
from azurelinuxagent.common.cgroup import CpuCgroup, AGENT_NAME_TELEMETRY, MetricsCounter, MemoryCgroup
from azurelinuxagent.common.cgroupapi import CGroupsApi, SystemdCgroupsApi, SystemdRunError, EXTENSION_SLICE_PREFIX
from azurelinuxagent.common.cgroupstelemetry import CGroupsTelemetry
from azurelinuxagent.common.exception import ExtensionErrorCodes, CGroupsException
@ -98,6 +98,13 @@ _DROP_IN_FILE_CPU_QUOTA_CONTENTS_FORMAT = """
[Service]
CPUQuota={0}
"""
_DROP_IN_FILE_MEMORY_ACCOUNTING = "13-MemoryAccounting.conf"
_DROP_IN_FILE_MEMORY_ACCOUNTING_CONTENTS = """
# This drop-in unit file was created by the Azure VM Agent.
# Do not edit.
[Service]
MemoryAccounting=yes
"""
class DisableCgroups(object):
@ -176,11 +183,18 @@ class CGroupConfigurator(object):
cpu_controller_root,
memory_controller_root)
if self._agent_cpu_cgroup_path is not None or self._agent_memory_cgroup_path is not None:
self.enable()
if self._agent_cpu_cgroup_path is not None:
_log_cgroup_info("Agent CPU cgroup: {0}", self._agent_cpu_cgroup_path)
self.enable()
self.__set_cpu_quota(conf.get_agent_cpu_quota())
CGroupsTelemetry.track_cgroup(CpuCgroup(AGENT_NAME_TELEMETRY, self._agent_cpu_cgroup_path))
if self._agent_memory_cgroup_path is not None:
_log_cgroup_info("Agent Memory cgroup: {0}", self._agent_memory_cgroup_path)
CGroupsTelemetry.track_cgroup(MemoryCgroup(AGENT_NAME_TELEMETRY, self._agent_memory_cgroup_path))
_log_cgroup_info('Agent cgroups enabled: {0}', self._agent_cgroups_enabled)
except Exception as exception:
@ -322,6 +336,7 @@ class CGroupConfigurator(object):
agent_drop_in_path = systemd.get_agent_drop_in_path()
agent_drop_in_file_slice = os.path.join(agent_drop_in_path, _AGENT_DROP_IN_FILE_SLICE)
agent_drop_in_file_cpu_accounting = os.path.join(agent_drop_in_path, _DROP_IN_FILE_CPU_ACCOUNTING)
agent_drop_in_file_memory_accounting = os.path.join(agent_drop_in_path, _DROP_IN_FILE_MEMORY_ACCOUNTING)
files_to_create = []
@ -349,6 +364,12 @@ class CGroupConfigurator(object):
if not os.path.exists(agent_drop_in_file_cpu_accounting):
files_to_create.append((agent_drop_in_file_cpu_accounting, _DROP_IN_FILE_CPU_ACCOUNTING_CONTENTS))
if fileutil.findre_in_file(agent_unit_file, r"MemoryAccounting=") is not None:
CGroupConfigurator._Impl.__cleanup_unit_file(agent_drop_in_file_memory_accounting)
else:
if not os.path.exists(agent_drop_in_file_memory_accounting):
files_to_create.append((agent_drop_in_file_memory_accounting, _DROP_IN_FILE_MEMORY_ACCOUNTING_CONTENTS))
if len(files_to_create) > 0:
# create the unit files, but if 1 fails remove all and return
try:
@ -482,7 +503,6 @@ class CGroupConfigurator(object):
"Attempted to enable cgroups, but they are not supported on the current platform")
self._agent_cgroups_enabled = True
self._extensions_cgroups_enabled = True
self.__set_cpu_quota(conf.get_agent_cpu_quota())
def disable(self, reason, disable_cgroups):
if disable_cgroups == DisableCgroups.ALL: # disable all

Просмотреть файл

@ -47,16 +47,21 @@ class PollResourceUsage(PeriodicOperation):
Periodic operation to poll the tracked cgroups for resource usage data.
It also checks whether there are processes in the agent's cgroup that should not be there.
"""
def __init__(self):
    """Schedule the operation with the configured cgroup check period and initialize its state."""
    super(PollResourceUsage, self).__init__(conf.get_cgroup_check_period())
    # Time of the last report of each metric; read and updated by _operation().
    self.__periodic_metrics = {}
    self.__log_metrics = conf.get_cgroup_log_metrics()
def _operation(self):
    """
    Poll all tracked cgroups, report the metrics that are due, and run the cgroup checks.

    A metric is reported the first time it is seen, and afterwards only once its
    report_period has elapsed since it was last reported. All polled metrics, reported
    or not, are passed to CGroupConfigurator.check_cgroups().
    """
    tracked_metrics = CGroupsTelemetry.poll_all_tracked()

    for metric in tracked_metrics:
        # A tuple key cannot collide the way concatenated strings can ("ab" + "c" == "a" + "bc"),
        # and it does not require every field to be a string.
        key = (metric.category, metric.counter, metric.instance)
        # Read the clock once so the due-check and the stored timestamp use the same instant.
        now = datetime.datetime.now()
        last_reported = self.__periodic_metrics.get(key)
        if last_reported is None or last_reported + metric.report_period <= now:
            report_metric(metric.category, metric.counter, metric.instance, metric.value, log_event=self.__log_metrics)
            self.__periodic_metrics[key] = now

    CGroupConfigurator.get_instance().check_cgroups(tracked_metrics)

Просмотреть файл

@ -13,6 +13,7 @@ Restart=always
RestartSec=5
Slice=azure.slice
CPUAccounting=yes
MemoryAccounting=yes
[Install]
WantedBy=multi-user.target

Просмотреть файл

@ -13,6 +13,7 @@ Restart=always
RestartSec=5
Slice=azure.slice
CPUAccounting=yes
MemoryAccounting=yes
[Install]
WantedBy=multi-user.target

Просмотреть файл

@ -19,6 +19,7 @@ ExecStart=/usr/bin/python3 -u /usr/sbin/waagent -daemon
Restart=always
Slice=azure.slice
CPUAccounting=yes
MemoryAccounting=yes
[Install]
WantedBy=multi-user.target

Просмотреть файл

@ -97,6 +97,7 @@ class UnitFilePaths:
slice = "/lib/systemd/system/walinuxagent.service.d/10-Slice.conf"
cpu_accounting = "/lib/systemd/system/walinuxagent.service.d/11-CPUAccounting.conf"
cpu_quota = "/lib/systemd/system/walinuxagent.service.d/12-CPUQuota.conf"
memory_accounting = "/lib/systemd/system/walinuxagent.service.d/13-MemoryAccounting.conf"
extension_service_cpu_accounting = '/lib/systemd/system/extension.service.d/11-CPUAccounting.conf'
extension_service_cpu_quota = '/lib/systemd/system/extension.service.d/12-CPUQuota.conf'
extension_service_memory_accounting = '/lib/systemd/system/extension.service.d/13-MemoryAccounting.conf'

Просмотреть файл

@ -77,6 +77,8 @@ class CGroupConfiguratorSystemdTestCase(AgentTestCase):
self.assertTrue(configurator.enabled(), "Cgroups should be enabled")
self.assertTrue(any(cg for cg in tracked.values() if cg.name == AGENT_NAME_TELEMETRY and 'cpu' in cg.path),
"The Agent's CPU is not being tracked. Tracked: {0}".format(tracked))
self.assertTrue(any(cg for cg in tracked.values() if cg.name == AGENT_NAME_TELEMETRY and 'memory' in cg.path),
"The Agent's Memory is not being tracked. Tracked: {0}".format(tracked))
def test_initialize_should_start_tracking_other_controllers_when_one_is_not_present(self):
command_mocks = [MockCommand(r"^mount -t cgroup$",
@ -147,6 +149,7 @@ cgroup on /sys/fs/cgroup/blkio type cgroup (rw,nosuid,nodev,noexec,relatime,blki
extensions_slice_unit_file = configurator.mocks.get_mapped_path(UnitFilePaths.vmextensions)
agent_drop_in_file_slice = configurator.mocks.get_mapped_path(UnitFilePaths.slice)
agent_drop_in_file_cpu_accounting = configurator.mocks.get_mapped_path(UnitFilePaths.cpu_accounting)
agent_drop_in_file_memory_accounting = configurator.mocks.get_mapped_path(UnitFilePaths.memory_accounting)
# The mock creates the slice unit files; delete them
os.remove(azure_slice_unit_file)
@ -158,6 +161,7 @@ cgroup on /sys/fs/cgroup/blkio type cgroup (rw,nosuid,nodev,noexec,relatime,blki
self.assertFalse(os.path.exists(extensions_slice_unit_file), "{0} should not have been created".format(extensions_slice_unit_file))
self.assertFalse(os.path.exists(agent_drop_in_file_slice), "{0} should not have been created".format(agent_drop_in_file_slice))
self.assertFalse(os.path.exists(agent_drop_in_file_cpu_accounting), "{0} should not have been created".format(agent_drop_in_file_cpu_accounting))
self.assertFalse(os.path.exists(agent_drop_in_file_memory_accounting), "{0} should not have been created".format(agent_drop_in_file_memory_accounting))
def test_initialize_should_create_unit_files_when_the_agent_service_file_is_not_updated(self):
with self._get_cgroup_configurator(initialize=False) as configurator:
@ -166,6 +170,7 @@ cgroup on /sys/fs/cgroup/blkio type cgroup (rw,nosuid,nodev,noexec,relatime,blki
extensions_slice_unit_file = configurator.mocks.get_mapped_path(UnitFilePaths.vmextensions)
agent_drop_in_file_slice = configurator.mocks.get_mapped_path(UnitFilePaths.slice)
agent_drop_in_file_cpu_accounting = configurator.mocks.get_mapped_path(UnitFilePaths.cpu_accounting)
agent_drop_in_file_memory_accounting = configurator.mocks.get_mapped_path(UnitFilePaths.memory_accounting)
# The mock creates the service and slice unit files; replace the former and delete the latter
configurator.mocks.add_data_file(os.path.join(data_dir, 'init', "walinuxagent.service.previous"), UnitFilePaths.walinuxagent)
@ -180,6 +185,7 @@ cgroup on /sys/fs/cgroup/blkio type cgroup (rw,nosuid,nodev,noexec,relatime,blki
self.assertTrue(os.path.exists(extensions_slice_unit_file), "{0} was not created".format(extensions_slice_unit_file))
self.assertTrue(os.path.exists(agent_drop_in_file_slice), "{0} was not created".format(agent_drop_in_file_slice))
self.assertTrue(os.path.exists(agent_drop_in_file_cpu_accounting), "{0} was not created".format(agent_drop_in_file_cpu_accounting))
self.assertTrue(os.path.exists(agent_drop_in_file_memory_accounting), "{0} was not created".format(agent_drop_in_file_memory_accounting))
def test_setup_extension_slice_should_create_unit_files(self):
with self._get_cgroup_configurator() as configurator:
@ -229,12 +235,12 @@ cgroup on /sys/fs/cgroup/blkio type cgroup (rw,nosuid,nodev,noexec,relatime,blki
self.assertIn("Attempted to enable cgroups, but they are not supported on the current platform", str(context_manager.exception))
def test_enable_should_set_agent_cpu_quota_and_track_throttled_time(self):
with self._get_cgroup_configurator(enable=False) as configurator:
with self._get_cgroup_configurator(initialize=False) as configurator:
agent_drop_in_file_cpu_quota = configurator.mocks.get_mapped_path(UnitFilePaths.cpu_quota)
if os.path.exists(agent_drop_in_file_cpu_quota):
raise Exception("{0} should not have been created during test setup".format(agent_drop_in_file_cpu_quota))
configurator.enable()
configurator.initialize()
expected_quota = "CPUQuota={0}%".format(conf.get_agent_cpu_quota())
self.assertTrue(os.path.exists(agent_drop_in_file_cpu_quota), "{0} was not created".format(agent_drop_in_file_cpu_quota))
@ -244,13 +250,13 @@ cgroup on /sys/fs/cgroup/blkio type cgroup (rw,nosuid,nodev,noexec,relatime,blki
self.assertTrue(CGroupsTelemetry.get_track_throttled_time(), "Throttle time should be tracked")
def test_enable_should_not_track_throttled_time_when_setting_the_cpu_quota_fails(self):
    """initialize() must not start tracking throttled time when writing the CPU quota file raises."""
    with self._get_cgroup_configurator(initialize=False) as configurator:
        already_tracking = CGroupsTelemetry.get_track_throttled_time()
        if already_tracking:
            raise Exception("Test setup should not start tracking Throttle Time")

        # Make the mock raise when the CPU quota drop-in file is accessed.
        quota_failure = Exception("A TEST EXCEPTION")
        configurator.mocks.add_file(UnitFilePaths.cpu_quota, quota_failure)

        configurator.initialize()

        self.assertFalse(CGroupsTelemetry.get_track_throttled_time(), "Throttle time should not be tracked")
@ -268,7 +274,9 @@ cgroup on /sys/fs/cgroup/blkio type cgroup (rw,nosuid,nodev,noexec,relatime,blki
self.assertTrue(
fileutil.findre_in_file(agent_drop_in_file_cpu_quota, "^CPUQuota=$"),
"CPUQuota was not set correctly. Expected an empty value. Got:\n{0}".format(fileutil.read_file(agent_drop_in_file_cpu_quota)))
self.assertEqual(len(CGroupsTelemetry._tracked), 0, "No cgroups should be tracked after disable. Tracking: {0}".format(CGroupsTelemetry._tracked))
self.assertEqual(len(CGroupsTelemetry._tracked), 1, "Memory cgroups should be tracked after disable. Tracking: {0}".format(CGroupsTelemetry._tracked))
self.assertFalse(any(cg for cg in CGroupsTelemetry._tracked.values() if cg.name == 'walinuxagent.service' and 'cpu' in cg.path),
"The Agent's cpu should not be tracked. Tracked: {0}".format(CGroupsTelemetry._tracked))
def test_disable_should_reset_cpu_quota_for_all_cgroups(self):
service_list = [

Просмотреть файл

@ -22,7 +22,7 @@ import os
import random
import shutil
from azurelinuxagent.common.cgroup import CpuCgroup, MemoryCgroup, MetricsCounter
from azurelinuxagent.common.cgroup import CpuCgroup, MemoryCgroup, MetricsCounter, CounterNotFound
from azurelinuxagent.common.exception import CGroupsException
from azurelinuxagent.common.osutil import get_osutil
from azurelinuxagent.common.utils import fileutil
@ -206,11 +206,14 @@ class TestMemoryCgroup(AgentTestCase):
test_mem_cg = MemoryCgroup("test_extension", os.path.join(data_dir, "cgroups", "memory_mount"))
memory_usage = test_mem_cg.get_memory_usage()
self.assertEqual(100000, memory_usage)
self.assertEqual(150000, memory_usage)
max_memory_usage = test_mem_cg.get_max_memory_usage()
self.assertEqual(1000000, max_memory_usage)
swap_memory_usage = test_mem_cg.try_swap_memory_usage()
self.assertEqual(20000, swap_memory_usage)
def test_get_metrics_when_files_not_present(self):
test_mem_cg = MemoryCgroup("test_extension", os.path.join(data_dir, "cgroups"))
@ -223,3 +226,17 @@ class TestMemoryCgroup(AgentTestCase):
test_mem_cg.get_max_memory_usage()
self.assertEqual(e.exception.errno, errno.ENOENT)
with self.assertRaises(IOError) as e:
test_mem_cg.try_swap_memory_usage()
self.assertEqual(e.exception.errno, errno.ENOENT)
def test_get_memory_usage_counters_not_found(self):
    """
    For the "missing_memory_counters" test data, get_memory_usage() must raise CounterNotFound
    and try_swap_memory_usage() must return 0.
    """
    cgroup_dir = os.path.join(data_dir, "cgroups", "missing_memory_counters")
    memory_cgroup = MemoryCgroup("test_extension", cgroup_dir)

    self.assertRaises(CounterNotFound, memory_cgroup.get_memory_usage)

    self.assertEqual(0, memory_cgroup.try_swap_memory_usage())

Просмотреть файл

@ -116,18 +116,20 @@ class TestCGroupsTelemetry(AgentTestCase):
self.assertTrue(CGroupsTelemetry.is_tracked("dummy_cpu_path_{0}".format(i)))
self.assertTrue(CGroupsTelemetry.is_tracked("dummy_memory_path_{0}".format(i)))
def _assert_polled_metrics_equal(self, metrics, cpu_metric_value, memory_metric_value, max_memory_metric_value, swap_memory_value):
    """
    Assert that every polled metric has a known category and counter and carries the
    value expected for that counter.
    """
    expected_memory_values = {
        "Total Memory Usage": memory_metric_value,
        "Max Memory Usage": max_memory_metric_value,
        "Swap Memory Usage": swap_memory_value,
    }

    for polled in metrics:
        self.assertIn(polled.category, ["CPU", "Memory"])

        if polled.category == "CPU":
            self.assertEqual(polled.counter, "% Processor Time")
            self.assertEqual(polled.value, cpu_metric_value)
            continue

        # Only "Memory" can reach this point; the counter is validated before the lookup.
        self.assertIn(polled.counter, ["Total Memory Usage", "Max Memory Usage", "Swap Memory Usage"])
        self.assertEqual(polled.value, expected_memory_values[polled.counter])
def test_telemetry_polling_with_active_cgroups(self, *args): # pylint: disable=unused-argument
num_extensions = 3
@ -136,27 +138,30 @@ class TestCGroupsTelemetry(AgentTestCase):
with patch("azurelinuxagent.common.cgroup.MemoryCgroup.get_max_memory_usage") as patch_get_memory_max_usage:
with patch("azurelinuxagent.common.cgroup.MemoryCgroup.get_memory_usage") as patch_get_memory_usage:
with patch("azurelinuxagent.common.cgroup.CpuCgroup.get_cpu_usage") as patch_get_cpu_usage:
with patch("azurelinuxagent.common.cgroup.CGroup.is_active") as patch_is_active:
patch_is_active.return_value = True
with patch("azurelinuxagent.common.cgroup.MemoryCgroup.get_memory_usage") as patch_get_memory_usage:
with patch("azurelinuxagent.common.cgroup.MemoryCgroup.try_swap_memory_usage") as patch_try_swap_memory_usage:
with patch("azurelinuxagent.common.cgroup.CpuCgroup.get_cpu_usage") as patch_get_cpu_usage:
with patch("azurelinuxagent.common.cgroup.CGroup.is_active") as patch_is_active:
patch_is_active.return_value = True
current_cpu = 30
current_memory = 209715200
current_max_memory = 471859200
current_cpu = 30
current_memory = 209715200
current_max_memory = 471859200
current_swap_memory = 20971520
# 1 CPU metric + 1 Current Memory + 1 Max memory
num_of_metrics_per_extn_expected = 3
patch_get_cpu_usage.return_value = current_cpu
patch_get_memory_usage.return_value = current_memory # example 200 MB
patch_get_memory_max_usage.return_value = current_max_memory # example 450 MB
num_polls = 10
# 1 CPU metric + 1 Current Memory + 1 Max memory + 1 swap memory
num_of_metrics_per_extn_expected = 4
patch_get_cpu_usage.return_value = current_cpu
patch_get_memory_usage.return_value = current_memory # example 200 MB
patch_get_memory_max_usage.return_value = current_max_memory # example 450 MB
patch_try_swap_memory_usage.return_value = current_swap_memory # example 20MB
num_polls = 12
for data_count in range(1, num_polls + 1): # pylint: disable=unused-variable
metrics = CGroupsTelemetry.poll_all_tracked()
self.assertEqual(len(metrics), num_extensions * num_of_metrics_per_extn_expected)
self._assert_polled_metrics_equal(metrics, current_cpu, current_memory, current_max_memory)
for data_count in range(1, num_polls + 1): # pylint: disable=unused-variable
metrics = CGroupsTelemetry.poll_all_tracked()
self.assertEqual(len(metrics), num_extensions * num_of_metrics_per_extn_expected)
self._assert_polled_metrics_equal(metrics, current_cpu, current_memory, current_max_memory, current_swap_memory)
@patch("azurelinuxagent.common.cgroup.MemoryCgroup.get_max_memory_usage", side_effect=raise_ioerror)
@patch("azurelinuxagent.common.cgroup.MemoryCgroup.get_memory_usage", side_effect=raise_ioerror)
@ -177,7 +182,6 @@ class TestCGroupsTelemetry(AgentTestCase):
self.assertEqual(len(metrics), 0)
@patch("azurelinuxagent.common.cgroup.MemoryCgroup.get_max_memory_usage")
@patch("azurelinuxagent.common.cgroup.MemoryCgroup.get_memory_usage")
@patch("azurelinuxagent.common.cgroup.CpuCgroup.get_cpu_usage")
@ -216,7 +220,6 @@ class TestCGroupsTelemetry(AgentTestCase):
self.assertFalse(CGroupsTelemetry.is_tracked("dummy_cpu_path_{0}".format(i)))
self.assertFalse(CGroupsTelemetry.is_tracked("dummy_memory_path_{0}".format(i)))
# mocking get_proc_stat to make it run on Mac and other systems. This test does not need to read the values of the
# /proc/stat file on the filesystem.
@patch("azurelinuxagent.common.logger.periodic_warn")
@ -238,7 +241,7 @@ class TestCGroupsTelemetry(AgentTestCase):
@patch("azurelinuxagent.common.logger.periodic_warn")
def test_telemetry_polling_to_generate_transient_logs_ioerror_permission_denied(self, patch_periodic_warn):
num_extensions = 1
num_controllers = 2
num_controllers = 1
is_active_check_per_controller = 2
self._track_new_extension_cgroups(num_extensions)
@ -251,7 +254,7 @@ class TestCGroupsTelemetry(AgentTestCase):
with patch("azurelinuxagent.common.utils.fileutil.read_file", side_effect=io_error_3):
poll_count = 1
expected_count_per_call = num_controllers + is_active_check_per_controller
# each collect per controller would generate a log statement, and each cgroup would invoke a
# get_max_memory_usage memory controller would generate a log statement, and each cgroup would invoke a
# is active check raising an exception
for data_count in range(poll_count, 10): # pylint: disable=unused-variable
@ -266,16 +269,18 @@ class TestCGroupsTelemetry(AgentTestCase):
# Trying to invoke IndexError during the getParameter call
with patch("azurelinuxagent.common.utils.fileutil.read_file", return_value=''):
with patch("azurelinuxagent.common.logger.periodic_warn") as patch_periodic_warn:
expected_call_count = 2 # 1 periodic warning for the cpu cgroups, and 1 for memory
expected_call_count = 1 # 1 periodic warning for memory
for data_count in range(1, 10): # pylint: disable=unused-variable
CGroupsTelemetry.poll_all_tracked()
self.assertEqual(expected_call_count, patch_periodic_warn.call_count)
@patch("azurelinuxagent.common.cgroup.MemoryCgroup.try_swap_memory_usage")
@patch("azurelinuxagent.common.cgroup.MemoryCgroup.get_max_memory_usage")
@patch("azurelinuxagent.common.cgroup.MemoryCgroup.get_memory_usage")
@patch("azurelinuxagent.common.cgroup.CpuCgroup.get_cpu_usage")
@patch("azurelinuxagent.common.cgroup.CGroup.is_active")
def test_telemetry_calculations(self, patch_is_active, patch_get_cpu_usage, patch_get_memory_usage, patch_get_memory_max_usage, *args): # pylint: disable=unused-argument
def test_telemetry_calculations(self, patch_is_active, patch_get_cpu_usage, patch_get_memory_usage, patch_get_memory_max_usage, patch_try_memory_swap_usage,
*args): # pylint: disable=unused-argument
num_polls = 10
num_extensions = 1
@ -284,6 +289,7 @@ class TestCGroupsTelemetry(AgentTestCase):
# only verifying calculations and not validity of the values.
memory_usage_values = [random.randint(0, 8 * 1024 ** 3) for _ in range(num_polls)]
max_memory_usage_values = [random.randint(0, 8 * 1024 ** 3) for _ in range(num_polls)]
swap_usage_values = [random.randint(0, 8 * 1024 ** 3) for _ in range(num_polls)]
self._track_new_extension_cgroups(num_extensions)
self.assertEqual(2 * num_extensions, len(CGroupsTelemetry._tracked))
@ -291,14 +297,15 @@ class TestCGroupsTelemetry(AgentTestCase):
for i in range(num_polls):
patch_is_active.return_value = True
patch_get_cpu_usage.return_value = cpu_percent_values[i]
patch_get_memory_usage.return_value = memory_usage_values[i] # example 200 MB
patch_get_memory_max_usage.return_value = max_memory_usage_values[i] # example 450 MB
patch_get_memory_usage.return_value = memory_usage_values[i]
patch_get_memory_max_usage.return_value = max_memory_usage_values[i]
patch_try_memory_swap_usage.return_value = swap_usage_values[i]
metrics = CGroupsTelemetry.poll_all_tracked()
# 1 CPU metric + 1 Current Memory + 1 Max memory
self.assertEqual(len(metrics), 3 * num_extensions)
self._assert_polled_metrics_equal(metrics, cpu_percent_values[i], memory_usage_values[i], max_memory_usage_values[i])
# 1 CPU metric + 1 Current Memory + 1 Max memory + 1 swap memory
self.assertEqual(len(metrics), 4 * num_extensions)
self._assert_polled_metrics_equal(metrics, cpu_percent_values[i], memory_usage_values[i], max_memory_usage_values[i], swap_usage_values[i])
def test_cgroup_tracking(self, *args): # pylint: disable=unused-argument
num_extensions = 5
@ -332,8 +339,7 @@ class TestCGroupsTelemetry(AgentTestCase):
metrics = CGroupsTelemetry.poll_all_tracked()
self.assertEqual(len(metrics), num_extensions * 1) # Only CPU populated
self._assert_polled_metrics_equal(metrics, current_cpu, 0, 0)
self._assert_polled_metrics_equal(metrics, current_cpu, 0, 0, 0)
@patch("azurelinuxagent.common.cgroup.CpuCgroup.get_cpu_usage", side_effect=raise_ioerror)
def test_process_cgroup_metric_with_no_cpu_cgroup_mounted(self, *args): # pylint: disable=unused-argument
@ -343,20 +349,23 @@ class TestCGroupsTelemetry(AgentTestCase):
with patch("azurelinuxagent.common.cgroup.MemoryCgroup.get_max_memory_usage") as patch_get_memory_max_usage:
with patch("azurelinuxagent.common.cgroup.MemoryCgroup.get_memory_usage") as patch_get_memory_usage:
with patch("azurelinuxagent.common.cgroup.CGroup.is_active") as patch_is_active:
patch_is_active.return_value = True
with patch("azurelinuxagent.common.cgroup.MemoryCgroup.try_swap_memory_usage") as patch_try_swap_memory_usage:
with patch("azurelinuxagent.common.cgroup.CGroup.is_active") as patch_is_active:
patch_is_active.return_value = True
current_memory = 209715200
current_max_memory = 471859200
current_memory = 209715200
current_max_memory = 471859200
current_swap_memory = 20971520
patch_get_memory_usage.return_value = current_memory # example 200 MB
patch_get_memory_max_usage.return_value = current_max_memory # example 450 MB
num_polls = 10
for data_count in range(1, num_polls + 1): # pylint: disable=unused-variable
metrics = CGroupsTelemetry.poll_all_tracked()
# Memory is only populated, CPU is not. Thus 2 metrics per cgroup.
self.assertEqual(len(metrics), num_extensions * 2)
self._assert_polled_metrics_equal(metrics, 0, current_memory, current_max_memory)
patch_get_memory_usage.return_value = current_memory # example 200 MB
patch_get_memory_max_usage.return_value = current_max_memory # example 450 MB
patch_try_swap_memory_usage.return_value = current_swap_memory # example 20MB
num_polls = 10
for data_count in range(1, num_polls + 1): # pylint: disable=unused-variable
metrics = CGroupsTelemetry.poll_all_tracked()
# Memory is only populated, CPU is not. Thus 3 metrics for memory.
self.assertEqual(len(metrics), num_extensions * 3)
self._assert_polled_metrics_equal(metrics, 0, current_memory, current_max_memory, current_swap_memory)
@patch("azurelinuxagent.common.cgroup.MemoryCgroup.get_memory_usage", side_effect=raise_ioerror)
@patch("azurelinuxagent.common.cgroup.MemoryCgroup.get_max_memory_usage", side_effect=raise_ioerror)
@ -408,4 +417,3 @@ class TestCGroupsTelemetry(AgentTestCase):
for metric in metrics:
self.assertGreaterEqual(metric.value, 0, "telemetry should not report negative value")

Просмотреть файл

@ -31,9 +31,10 @@ from mock import MagicMock
from azurelinuxagent.common.utils import textutil, fileutil
from azurelinuxagent.common import event, logger
from azurelinuxagent.common.AgentGlobals import AgentGlobals
from azurelinuxagent.common.event import add_event, add_periodic, add_log_event, elapsed_milliseconds, report_metric, \
from azurelinuxagent.common.event import add_event, add_periodic, add_log_event, elapsed_milliseconds, \
WALAEventOperation, parse_xml_event, parse_json_event, AGENT_EVENT_FILE_EXTENSION, EVENTS_DIRECTORY, \
TELEMETRY_EVENT_EVENT_ID, TELEMETRY_EVENT_PROVIDER_ID, TELEMETRY_LOG_EVENT_ID, TELEMETRY_LOG_PROVIDER_ID
TELEMETRY_EVENT_EVENT_ID, TELEMETRY_EVENT_PROVIDER_ID, TELEMETRY_LOG_EVENT_ID, TELEMETRY_LOG_PROVIDER_ID, \
report_metric
from azurelinuxagent.common.future import ustr
from azurelinuxagent.common.osutil import get_osutil
from azurelinuxagent.common.telemetryevent import CommonTelemetryEventSchema, GuestAgentGenericLogsSchema, \

Просмотреть файл

@ -0,0 +1,36 @@
cache 50000
rss 100000
rss_huge 4194304
shmem 8192
mapped_file 540672
dirty 0
writeback 0
swap 20000
pgpgin 42584
pgpgout 24188
pgfault 71983
pgmajfault 402
inactive_anon 32854016
active_anon 12288
inactive_file 47472640
active_file 1290240
unevictable 0
hierarchical_memory_limit 9223372036854771712
hierarchical_memsw_limit 9223372036854771712
total_cache 48771072
total_rss 32845824
total_rss_huge 4194304
total_shmem 8192
total_mapped_file 540672
total_dirty 0
total_writeback 0
total_swap 0
total_pgpgin 42584
total_pgpgout 24188
total_pgfault 71983
total_pgmajfault 402
total_inactive_anon 32854016
total_active_anon 12288
total_inactive_file 47472640
total_active_file 1290240
total_unevictable 0

Просмотреть файл

@ -1 +0,0 @@
100000

Просмотреть файл

@ -0,0 +1,34 @@
cache 50000
rss_huge 4194304
shmem 8192
mapped_file 540672
dirty 0
writeback 0
pgpgin 42584
pgpgout 24188
pgfault 71983
pgmajfault 402
inactive_anon 32854016
active_anon 12288
inactive_file 47472640
active_file 1290240
unevictable 0
hierarchical_memory_limit 9223372036854771712
hierarchical_memsw_limit 9223372036854771712
total_cache 48771072
total_rss 32845824
total_rss_huge 4194304
total_shmem 8192
total_mapped_file 540672
total_dirty 0
total_writeback 0
total_swap 0
total_pgpgin 42584
total_pgpgout 24188
total_pgfault 71983
total_pgmajfault 402
total_inactive_anon 32854016
total_active_anon 12288
total_inactive_file 47472640
total_active_file 1290240
total_unevictable 0

Просмотреть файл

@ -16,6 +16,7 @@ ExecStart=/usr/bin/python3 -u /usr/sbin/waagent -daemon
Restart=always
Slice=azure.slice
CPUAccounting=yes
MemoryAccounting=yes
[Install]
WantedBy=multi-user.target

Просмотреть файл

@ -21,7 +21,7 @@ import random
import string
from azurelinuxagent.common import event, logger
from azurelinuxagent.common.cgroup import CpuCgroup, MemoryCgroup, MetricValue
from azurelinuxagent.common.cgroup import CpuCgroup, MemoryCgroup, MetricValue, _REPORT_EVERY_HOUR
from azurelinuxagent.common.cgroupstelemetry import CGroupsTelemetry
from azurelinuxagent.common.event import EVENTS_DIRECTORY
from azurelinuxagent.common.protocol.healthservice import HealthService
@ -197,22 +197,23 @@ class TestExtensionMetricsDataTelemetry(AgentTestCase):
self.get_protocol.stop()
@patch('azurelinuxagent.common.event.EventLogger.add_metric')
@patch("azurelinuxagent.common.cgroupstelemetry.CGroupsTelemetry.poll_all_tracked")
def test_send_extension_metrics_telemetry(self, patch_poll_all_tracked,  # pylint: disable=unused-argument
                                          patch_add_metric, *args):
    """
    Run PollResourceUsage once against a mocked poll_all_tracked() result and
    assert that the poll happened exactly once and that add_metric was called
    once for each of the four MetricValue objects returned by the poll.
    """
    # Decorators are applied bottom-up, so the first mock argument is the
    # poll_all_tracked patch and the second one is the add_metric patch.
    # The last two metrics are built with an explicit _REPORT_EVERY_HOUR
    # report period; the first two rely on MetricValue's default.
    patch_poll_all_tracked.return_value = [
        MetricValue("Process", "% Processor Time", "service", 1),
        MetricValue("Memory", "Total Memory Usage", "service", 1),
        MetricValue("Memory", "Max Memory Usage", "service", 1, _REPORT_EVERY_HOUR),
        MetricValue("Memory", "Swap Memory Usage", "service", 1, _REPORT_EVERY_HOUR)
    ]

    PollResourceUsage().run()

    self.assertEqual(1, patch_poll_all_tracked.call_count)
    self.assertEqual(4, patch_add_metric.call_count)  # Four metrics being sent.
@patch('azurelinuxagent.common.event.EventLogger.add_metric')
@patch("azurelinuxagent.common.cgroupstelemetry.CGroupsTelemetry.poll_all_tracked")
def test_send_extension_metrics_telemetry_for_empty_cgroup(self, patch_poll_all_tracked, # pylint: disable=unused-argument
patch_add_metric,*args):
patch_add_metric, *args):
patch_poll_all_tracked.return_value = []
PollResourceUsage().run()
@ -245,41 +246,9 @@ class TestExtensionMetricsDataTelemetry(AgentTestCase):
ioerror.errno = 2
patch_cpu_usage.side_effect = ioerror
CGroupsTelemetry._tracked["/test/path"]= CpuCgroup("cgroup_name", "/test/path")
CGroupsTelemetry._tracked["/test/path"] = CpuCgroup("cgroup_name", "/test/path")
PollResourceUsage().run()
self.assertEqual(0, patch_periodic_warn.call_count)
self.assertEqual(0, patch_add_metric.call_count) # No metrics should be sent.
def test_generate_extension_metrics_telemetry_dictionary(self, *args): # pylint: disable=unused-argument
num_polls = 10
num_extensions = 1
cpu_percent_values = [random.randint(0, 100) for _ in range(num_polls)]
# only verifying calculations and not validity of the values.
memory_usage_values = [random.randint(0, 8 * 1024 ** 3) for _ in range(num_polls)]
max_memory_usage_values = [random.randint(0, 8 * 1024 ** 3) for _ in range(num_polls)]
# no need to initialize the CPU usage, since we mock get_cpu_usage() below
with patch("azurelinuxagent.common.cgroup.CpuCgroup.initialize_cpu_usage"):
for i in range(num_extensions):
dummy_cpu_cgroup = CpuCgroup("dummy_extension_{0}".format(i), "dummy_cpu_path_{0}".format(i))
CGroupsTelemetry.track_cgroup(dummy_cpu_cgroup)
dummy_memory_cgroup = MemoryCgroup("dummy_extension_{0}".format(i), "dummy_memory_path_{0}".format(i))
CGroupsTelemetry.track_cgroup(dummy_memory_cgroup)
self.assertEqual(2 * num_extensions, len(CGroupsTelemetry._tracked))
with patch("azurelinuxagent.common.cgroup.MemoryCgroup.get_max_memory_usage") as patch_get_memory_max_usage:
with patch("azurelinuxagent.common.cgroup.MemoryCgroup.get_memory_usage") as patch_get_memory_usage:
with patch("azurelinuxagent.common.cgroup.CpuCgroup.get_cpu_usage") as patch_get_cpu_usage:
with patch("azurelinuxagent.common.cgroup.CGroup.is_active") as patch_is_active:
for i in range(num_polls):
patch_is_active.return_value = True
patch_get_cpu_usage.return_value = cpu_percent_values[i]
patch_get_memory_usage.return_value = memory_usage_values[i] # example 200 MB
patch_get_memory_max_usage.return_value = max_memory_usage_values[i] # example 450 MB
CGroupsTelemetry.poll_all_tracked()