Merge ~calvinmwadime/ubuntu/+source/walinuxagent:mantic-2.9.1.1 into ubuntu/+source/walinuxagent:ubuntu/mantic-devel

Proposed by Calvin Mwadime Makokha
Status: Work in progress
Proposed branch: ~calvinmwadime/ubuntu/+source/walinuxagent:mantic-2.9.1.1
Merge into: ubuntu/+source/walinuxagent:ubuntu/mantic-devel
Diff against target: 74236 lines (+43007/-14731)
403 files modified
.github/PULL_REQUEST_TEMPLATE.md (+1/-2)
.github/codecov.yml (+2/-0)
.github/workflows/ci_pr.yml (+128/-0)
.gitignore (+1/-2)
CODEOWNERS (+3/-1)
README.md (+110/-33)
SECURITY.md (+41/-0)
azurelinuxagent/agent.py (+203/-66)
azurelinuxagent/common/AgentGlobals.py (+39/-0)
azurelinuxagent/common/agent_supported_feature.py (+122/-0)
azurelinuxagent/common/cgroup.py (+208/-77)
azurelinuxagent/common/cgroupapi.py (+214/-429)
azurelinuxagent/common/cgroupconfigurator.py (+1000/-118)
azurelinuxagent/common/cgroupstelemetry.py (+25/-265)
azurelinuxagent/common/conf.py (+272/-21)
azurelinuxagent/common/datacontract.py (+4/-2)
azurelinuxagent/common/dhcp.py (+14/-11)
azurelinuxagent/common/event.py (+286/-112)
azurelinuxagent/common/exception.py (+72/-18)
azurelinuxagent/common/future.py (+91/-15)
azurelinuxagent/common/interfaces.py (+49/-0)
azurelinuxagent/common/logcollector.py (+401/-0)
azurelinuxagent/common/logcollector_manifests.py (+122/-0)
azurelinuxagent/common/logger.py (+35/-7)
azurelinuxagent/common/osutil/alpine.py (+2/-2)
azurelinuxagent/common/osutil/arch.py (+9/-2)
azurelinuxagent/common/osutil/bigip.py (+31/-30)
azurelinuxagent/common/osutil/clearlinux.py (+28/-16)
azurelinuxagent/common/osutil/coreos.py (+9/-3)
azurelinuxagent/common/osutil/debian.py (+13/-13)
azurelinuxagent/common/osutil/default.py (+441/-315)
azurelinuxagent/common/osutil/devuan.py (+52/-0)
azurelinuxagent/common/osutil/factory.py (+60/-33)
azurelinuxagent/common/osutil/fedora.py (+77/-0)
azurelinuxagent/common/osutil/freebsd.py (+43/-33)
azurelinuxagent/common/osutil/gaia.py (+29/-17)
azurelinuxagent/common/osutil/iosxe.py (+19/-7)
azurelinuxagent/common/osutil/mariner.py (+69/-0)
azurelinuxagent/common/osutil/nsbsd.py (+28/-26)
azurelinuxagent/common/osutil/openbsd.py (+16/-19)
azurelinuxagent/common/osutil/openwrt.py (+14/-13)
azurelinuxagent/common/osutil/photonos.py (+65/-0)
azurelinuxagent/common/osutil/redhat.py (+45/-16)
azurelinuxagent/common/osutil/suse.py (+93/-36)
azurelinuxagent/common/osutil/systemd.py (+86/-0)
azurelinuxagent/common/osutil/ubuntu.py (+32/-16)
azurelinuxagent/common/persist_firewall_rules.py (+338/-0)
azurelinuxagent/common/protocol/__init__.py (+0/-5)
azurelinuxagent/common/protocol/extensions_goal_state.py (+244/-0)
azurelinuxagent/common/protocol/extensions_goal_state_factory.py (+36/-0)
azurelinuxagent/common/protocol/extensions_goal_state_from_extensions_config.py (+571/-0)
azurelinuxagent/common/protocol/extensions_goal_state_from_vm_settings.py (+583/-0)
azurelinuxagent/common/protocol/goal_state.py (+705/-0)
azurelinuxagent/common/protocol/hostplugin.py (+371/-38)
azurelinuxagent/common/protocol/imds.py (+30/-14)
azurelinuxagent/common/protocol/metadata_server_migration_util.py (+79/-0)
azurelinuxagent/common/protocol/ovfenv.py (+7/-6)
azurelinuxagent/common/protocol/restapi.py (+165/-91)
azurelinuxagent/common/protocol/util.py (+109/-159)
azurelinuxagent/common/protocol/wire.py (+516/-1095)
azurelinuxagent/common/rdma.py (+199/-56)
azurelinuxagent/common/singletonperthread.py (+30/-0)
azurelinuxagent/common/telemetryevent.py (+72/-3)
azurelinuxagent/common/utils/archive.py (+204/-111)
azurelinuxagent/common/utils/cryptutil.py (+28/-25)
azurelinuxagent/common/utils/extensionprocessutil.py (+31/-7)
azurelinuxagent/common/utils/fileutil.py (+9/-7)
azurelinuxagent/common/utils/flexible_version.py (+29/-9)
azurelinuxagent/common/utils/networkutil.py (+172/-3)
azurelinuxagent/common/utils/restutil.py (+153/-56)
azurelinuxagent/common/utils/shellutil.py (+260/-57)
azurelinuxagent/common/utils/textutil.py (+63/-10)
azurelinuxagent/common/utils/timeutil.py (+39/-0)
azurelinuxagent/common/version.py (+102/-31)
azurelinuxagent/daemon/main.py (+36/-22)
azurelinuxagent/daemon/resourcedisk/default.py (+13/-12)
azurelinuxagent/daemon/resourcedisk/factory.py (+3/-7)
azurelinuxagent/daemon/resourcedisk/freebsd.py (+1/-1)
azurelinuxagent/daemon/resourcedisk/openwrt.py (+2/-2)
azurelinuxagent/daemon/scvmm.py (+3/-3)
azurelinuxagent/ga/collect_logs.py (+353/-0)
azurelinuxagent/ga/collect_telemetry_events.py (+586/-0)
azurelinuxagent/ga/env.py (+177/-123)
azurelinuxagent/ga/exthandlers.py (+1493/-645)
azurelinuxagent/ga/monitor.py (+249/-450)
azurelinuxagent/ga/periodic_operation.py (+81/-0)
azurelinuxagent/ga/remoteaccess.py (+81/-90)
azurelinuxagent/ga/send_telemetry_events.py (+164/-0)
azurelinuxagent/ga/update.py (+1089/-380)
azurelinuxagent/pa/deprovision/arch.py (+1/-1)
azurelinuxagent/pa/deprovision/clearlinux.py (+4/-2)
azurelinuxagent/pa/deprovision/coreos.py (+1/-1)
azurelinuxagent/pa/deprovision/default.py (+55/-17)
azurelinuxagent/pa/deprovision/factory.py (+5/-8)
azurelinuxagent/pa/deprovision/ubuntu.py (+2/-2)
azurelinuxagent/pa/provision/cloudinit.py (+33/-91)
azurelinuxagent/pa/provision/cloudinitdetect.py (+72/-0)
azurelinuxagent/pa/provision/default.py (+28/-42)
azurelinuxagent/pa/provision/factory.py (+3/-3)
azurelinuxagent/pa/rdma/centos.py (+6/-6)
azurelinuxagent/pa/rdma/factory.py (+9/-7)
azurelinuxagent/pa/rdma/suse.py (+12/-3)
azurelinuxagent/pa/rdma/ubuntu.py (+14/-14)
bin/py3/waagent (+53/-0)
bin/waagent (+5/-1)
bin/waagent2.0 (+5/-1)
ci/2.7.pylintrc (+42/-0)
ci/3.6.pylintrc (+40/-0)
ci/nosetests.sh (+25/-0)
config/66-azure-storage.rules (+23/-17)
config/alpine/waagent.conf (+4/-11)
config/arch/waagent.conf (+4/-6)
config/bigip/waagent.conf (+3/-10)
config/clearlinux/waagent.conf (+3/-5)
config/coreos/waagent.conf (+4/-11)
config/debian/waagent.conf (+10/-11)
config/devuan/waagent.conf (+130/-0)
config/freebsd/waagent.conf (+6/-13)
config/gaia/waagent.conf (+4/-6)
config/iosxe/waagent.conf (+4/-6)
config/mariner/waagent.conf (+88/-0)
config/nsbsd/waagent.conf (+4/-6)
config/openbsd/waagent.conf (+4/-6)
config/photonos/waagent.conf (+80/-0)
config/suse/waagent.conf (+12/-8)
config/ubuntu/waagent.conf (+10/-11)
config/waagent.conf (+26/-10)
debian/changelog (+35/-0)
debian/control (+11/-4)
debian/docs (+0/-1)
debian/install (+3/-3)
debian/patches/disable_udev_overrides.patch (+16/-0)
debian/patches/fix_cgroup_v2_mounting_and_systemd_process.patch (+232/-0)
debian/patches/fix_systemd_networkd_lease_file_path (+66/-0)
debian/patches/series (+3/-2)
debian/patches/update_dhcp_client_ubuntu_supported_versions.patch (+39/-0)
debian/rules (+9/-12)
debian/walinuxagent.manpages (+1/-0)
debian/watch (+4/-3)
dev/null (+0/-29)
init/azure-vmextensions.slice (+7/-0)
init/azure.slice (+4/-0)
init/devuan/default/walinuxagent (+2/-0)
init/devuan/walinuxagent (+344/-0)
init/mariner/waagent.service (+16/-0)
init/photonos/waagent.service (+16/-0)
init/redhat/py2/waagent.service (+19/-0)
init/redhat/waagent.service (+19/-0)
init/sles/waagent.service (+16/-0)
init/ubuntu/walinuxagent.service (+3/-0)
makepkg.py (+66/-51)
setup.py (+143/-60)
test-requirements.txt (+19/-3)
tests/common/dhcp/test_dhcp.py (+27/-14)
tests/common/mock_cgroup_environment.py (+122/-0)
tests/common/mock_command.py (+17/-0)
tests/common/mock_environment.py (+168/-0)
tests/common/osutil/test_alpine.py (+3/-2)
tests/common/osutil/test_arch.py (+3/-2)
tests/common/osutil/test_bigip.py (+20/-21)
tests/common/osutil/test_clearlinux.py (+3/-2)
tests/common/osutil/test_coreos.py (+3/-2)
tests/common/osutil/test_default.py (+429/-316)
tests/common/osutil/test_default_osutil.py (+3/-162)
tests/common/osutil/test_factory.py (+144/-69)
tests/common/osutil/test_freebsd.py (+8/-7)
tests/common/osutil/test_nsbsd.py (+12/-11)
tests/common/osutil/test_openbsd.py (+3/-2)
tests/common/osutil/test_openwrt.py (+3/-2)
tests/common/osutil/test_photonos.py (+37/-0)
tests/common/osutil/test_redhat.py (+3/-2)
tests/common/osutil/test_suse.py (+3/-2)
tests/common/osutil/test_ubuntu.py (+1/-1)
tests/common/test_agent_supported_feature.py (+55/-0)
tests/common/test_cgroupapi.py (+130/-548)
tests/common/test_cgroupconfigurator.py (+973/-261)
tests/common/test_cgroups.py (+62/-82)
tests/common/test_cgroupstelemetry.py (+120/-406)
tests/common/test_conf.py (+28/-53)
tests/common/test_errorstate.py (+2/-1)
tests/common/test_event.py (+583/-314)
tests/common/test_logcollector.py (+477/-0)
tests/common/test_logger.py (+45/-45)
tests/common/test_persist_firewall_rules.py (+416/-0)
tests/common/test_singletonperthread.py (+164/-0)
tests/common/test_telemetryevent.py (+20/-19)
tests/common/test_version.py (+80/-36)
tests/daemon/test_daemon.py (+15/-14)
tests/daemon/test_resourcedisk.py (+5/-5)
tests/data/cgroups/cpu.stat (+3/-0)
tests/data/cgroups/cpu.stat_t0 (+3/-0)
tests/data/cgroups/cpu.stat_t1 (+3/-0)
tests/data/cgroups/cpuacct.stat (+2/-0)
tests/data/cgroups/memory_mount/memory.stat (+36/-0)
tests/data/cgroups/missing_memory_counters/memory.stat (+34/-0)
tests/data/cgroups/proc_pid_cgroup (+13/-0)
tests/data/cgroups/proc_self_cgroup (+13/-0)
tests/data/cgroups/sys_fs_cgroup_unified_cgroup.controllers (+7/-0)
tests/data/cloud-init/set-hostname (+4/-0)
tests/data/events/custom_script_1.tld (+30/-0)
tests/data/events/custom_script_2.tld (+30/-0)
tests/data/events/custom_script_extra_parameters.tld (+66/-0)
tests/data/events/custom_script_invalid_json.tld (+30/-0)
tests/data/events/custom_script_no_read_access.tld (+30/-0)
tests/data/events/custom_script_nonascii_characters.tld (+30/-0)
tests/data/events/event_with_callstack.waagent.tld (+1/-0)
tests/data/events/extension_events/different_cases/1591918616.json (+22/-0)
tests/data/events/extension_events/empty_message/1592350454.json (+24/-0)
tests/data/events/extension_events/extra_parameters/1592273009.json (+35/-0)
tests/data/events/extension_events/int_type/1519934744.json (+10/-0)
tests/data/events/extension_events/large_messages/1591921510.json (+12/-0)
tests/data/events/extension_events/malformed_files/1592008079.json (+13/-0)
tests/data/events/extension_events/malformed_files/1594857360.tld (+11/-0)
tests/data/events/extension_events/malformed_files/bad_json_files/1591816395.json (+3/-0)
tests/data/events/extension_events/malformed_files/bad_name_file.json (+24/-0)
tests/data/events/extension_events/missing_parameters/1592273793.json (+74/-0)
tests/data/events/extension_events/mix_files/1591835369.json (+3/-0)
tests/data/events/extension_events/mix_files/1591835848.json (+85/-0)
tests/data/events/extension_events/mix_files/1591835859.json (+11/-0)
tests/data/events/extension_events/special_chars/1591918939.json (+10/-0)
tests/data/events/extension_events/well_formed_files/1591905451.json (+82/-0)
tests/data/events/extension_events/well_formed_files/1592355539.json (+72/-0)
tests/data/events/extension_events/well_formed_files/9999999999.json (+82/-0)
tests/data/events/legacy_agent.tld (+66/-0)
tests/data/events/legacy_agent_no_timestamp.tld (+62/-0)
tests/data/ext/event_from_agent.json (+119/-1)
tests/data/ext/event_from_extension.xml (+9/-6)
tests/data/ext/sample-status-invalid-format-emptykey-line7.json (+37/-0)
tests/data/ext/sample-status-invalid-json-format.json (+37/-0)
tests/data/ext/sample-status-invalid-status-no-status-status-key.json (+35/-0)
tests/data/ext/sample-status-very-large-multiple-substatuses.json (+408/-0)
tests/data/ext/sample-status-very-large.json (+39/-0)
tests/data/ext/sample-status.json (+36/-0)
tests/data/ext/sample_ext-1.3.0/python.sh (+11/-0)
tests/data/ext/sample_ext-1.3.0/sample.py (+82/-23)
tests/data/hostgaplugin/ext_conf-empty_depends_on.xml (+56/-0)
tests/data/hostgaplugin/ext_conf-invalid_blob_type.xml (+94/-0)
tests/data/hostgaplugin/ext_conf-no_status_upload_blob.xml (+39/-0)
tests/data/hostgaplugin/ext_conf-requested_version.xml (+148/-0)
tests/data/hostgaplugin/ext_conf.xml (+146/-0)
tests/data/hostgaplugin/in_vm_artifacts_profile.json (+1/-0)
tests/data/hostgaplugin/vm_settings-difference_in_required_features.json (+201/-0)
tests/data/hostgaplugin/vm_settings-empty_depends_on.json (+69/-0)
tests/data/hostgaplugin/vm_settings-fabric-no_thumbprints.json (+192/-0)
tests/data/hostgaplugin/vm_settings-invalid_blob_type.json (+104/-0)
tests/data/hostgaplugin/vm_settings-missing_cert.json (+68/-0)
tests/data/hostgaplugin/vm_settings-no_manifests.json (+73/-0)
tests/data/hostgaplugin/vm_settings-no_status_upload_blob.json (+66/-0)
tests/data/hostgaplugin/vm_settings-out-of-sync.json (+66/-0)
tests/data/hostgaplugin/vm_settings-parse_error.json (+72/-0)
tests/data/hostgaplugin/vm_settings-requested_version.json (+141/-0)
tests/data/hostgaplugin/vm_settings-unsupported_version.json (+72/-0)
tests/data/hostgaplugin/vm_settings.json (+201/-0)
tests/data/init/azure-vmextensions.slice (+6/-0)
tests/data/init/azure-walinuxagent-logcollector.slice (+9/-0)
tests/data/init/azure.slice (+4/-0)
tests/data/init/walinuxagent.service (+23/-0)
tests/data/init/walinuxagent.service.previous (+20/-0)
tests/data/init/walinuxagent.service_system-slice (+23/-0)
tests/data/test_waagent.conf (+6/-5)
tests/data/wire/certs-2.xml (+85/-0)
tests/data/wire/certs.xml (+80/-76)
tests/data/wire/certs_no_format_specified.xml (+78/-74)
tests/data/wire/ext_conf-no_gs_metadata.xml (+27/-0)
tests/data/wire/ext_conf.xml (+7/-5)
tests/data/wire/ext_conf_additional_locations.xml (+34/-0)
tests/data/wire/ext_conf_aks_extension.xml (+70/-0)
tests/data/wire/ext_conf_autoupgrade.xml (+9/-7)
tests/data/wire/ext_conf_autoupgrade_internalversion.xml (+9/-7)
tests/data/wire/ext_conf_dependencies_with_empty_settings.xml (+33/-0)
tests/data/wire/ext_conf_in_vm_artifacts_profile.xml (+29/-0)
tests/data/wire/ext_conf_in_vm_empty_artifacts_profile.xml (+29/-0)
tests/data/wire/ext_conf_in_vm_metadata.xml (+29/-0)
tests/data/wire/ext_conf_internalversion.xml (+9/-7)
tests/data/wire/ext_conf_invalid_and_valid_handlers.xml (+35/-0)
tests/data/wire/ext_conf_invalid_vm_metadata.xml (+29/-0)
tests/data/wire/ext_conf_missing_family.xml (+15/-14)
tests/data/wire/ext_conf_missing_requested_version.xml (+39/-0)
tests/data/wire/ext_conf_multiple_extensions.xml (+13/-32)
tests/data/wire/ext_conf_no_extensions-block_blob.xml (+13/-0)
tests/data/wire/ext_conf_no_extensions-no_status_blob.xml (+12/-0)
tests/data/wire/ext_conf_no_extensions-page_blob.xml (+25/-0)
tests/data/wire/ext_conf_no_public.xml (+25/-24)
tests/data/wire/ext_conf_no_settings.xml (+24/-23)
tests/data/wire/ext_conf_requested_version.xml (+29/-0)
tests/data/wire/ext_conf_required_features.xml (+41/-0)
tests/data/wire/ext_conf_sequencing.xml (+9/-7)
tests/data/wire/ext_conf_settings_case_mismatch.xml (+57/-0)
tests/data/wire/ext_conf_upgradeguid.xml (+7/-5)
tests/data/wire/ga_manifest.xml (+10/-31)
tests/data/wire/ga_manifest_no_upgrade.xml (+21/-21)
tests/data/wire/goal_state.xml (+7/-7)
tests/data/wire/goal_state_no_certs.xml (+27/-0)
tests/data/wire/goal_state_no_ext.xml (+6/-5)
tests/data/wire/goal_state_noop.xml (+14/-0)
tests/data/wire/goal_state_remote_access.xml (+9/-8)
tests/data/wire/in_vm_artifacts_profile.json (+1/-0)
tests/data/wire/invalid_config/ext_conf_multiple_depends_on_for_single_handler.xml (+45/-0)
tests/data/wire/invalid_config/ext_conf_multiple_runtime_settings_same_plugin.xml (+31/-0)
tests/data/wire/invalid_config/ext_conf_multiple_settings_for_same_handler.xml (+33/-0)
tests/data/wire/invalid_config/ext_conf_plugin_settings_version_mismatch.xml (+31/-0)
tests/data/wire/invalid_config/ext_conf_single_and_multi_config_settings_same_plugin.xml (+31/-0)
tests/data/wire/manifest.xml (+16/-16)
tests/data/wire/manifest_deletion.xml (+1/-1)
tests/data/wire/multi-config/ext_conf_mc_disabled_extensions.xml (+84/-0)
tests/data/wire/multi-config/ext_conf_mc_update_extensions.xml (+75/-0)
tests/data/wire/multi-config/ext_conf_multi_config_no_dependencies.xml (+75/-0)
tests/data/wire/multi-config/ext_conf_with_disabled_multi_config.xml (+129/-0)
tests/data/wire/multi-config/ext_conf_with_multi_config.xml (+131/-0)
tests/data/wire/multi-config/ext_conf_with_multi_config_dependencies.xml (+99/-0)
tests/data/wire/trans_cert (+17/-17)
tests/data/wire/trans_prv (+26/-26)
tests/data/wire/trans_pub (+7/-7)
tests/distro/test_resourceDisk.py (+1/-1)
tests/distro/test_scvmm.py (+6/-5)
tests/ga/extension_emulator.py (+373/-0)
tests/ga/mocks.py (+119/-0)
tests/ga/test_collect_logs.py (+239/-0)
tests/ga/test_collect_telemetry_events.py (+576/-0)
tests/ga/test_env.py (+50/-50)
tests/ga/test_extension.py (+2088/-1355)
tests/ga/test_exthandlers.py (+283/-108)
tests/ga/test_exthandlers_download_extension.py (+116/-58)
tests/ga/test_exthandlers_exthandlerinstance.py (+10/-12)
tests/ga/test_monitor.py (+155/-1141)
tests/ga/test_multi_config_extension.py (+1229/-0)
tests/ga/test_periodic_operation.py (+156/-0)
tests/ga/test_remoteaccess.py (+41/-49)
tests/ga/test_remoteaccess_handler.py (+429/-446)
tests/ga/test_report_status.py (+119/-0)
tests/ga/test_send_telemetry_events.py (+430/-0)
tests/ga/test_update.py (+1913/-733)
tests/pa/test_deprovision.py (+4/-4)
tests/pa/test_provision.py (+34/-27)
tests/protocol/HttpRequestPredicates.py (+101/-0)
tests/protocol/mocks.py (+167/-0)
tests/protocol/mockwiredata.py (+260/-47)
tests/protocol/test_datacontract.py (+5/-5)
tests/protocol/test_extensions_goal_state_from_extensions_config.py (+62/-0)
tests/protocol/test_extensions_goal_state_from_vm_settings.py (+156/-0)
tests/protocol/test_goal_state.py (+545/-0)
tests/protocol/test_healthservice.py (+1/-1)
tests/protocol/test_hostplugin.py (+620/-445)
tests/protocol/test_image_info_matcher.py (+2/-1)
tests/protocol/test_imds.py (+49/-46)
tests/protocol/test_metadata_server_migration_util.py (+134/-0)
tests/protocol/test_protocol_util.py (+208/-85)
tests/protocol/test_wire.py (+864/-934)
tests/test_agent.py (+154/-17)
tests/tools.py (+87/-86)
tests/utils/cgroups_tools.py (+1/-2)
tests/utils/event_logger_tools.py (+65/-0)
tests/utils/miscellaneous_tools.py (+62/-0)
tests/utils/test_archive.py (+123/-179)
tests/utils/test_crypt_util.py (+2/-7)
tests/utils/test_extension_process_util.py (+103/-54)
tests/utils/test_file_util.py (+19/-20)
tests/utils/test_flexible_version.py (+21/-19)
tests/utils/test_network_util.py (+36/-1)
tests/utils/test_rest_util.py (+74/-55)
tests/utils/test_shell_util.py (+337/-70)
tests/utils/test_text_util.py (+32/-14)
tests_e2e/orchestrator/docker/Dockerfile (+85/-0)
tests_e2e/orchestrator/lib/agent_junit.py (+66/-0)
tests_e2e/orchestrator/lib/agent_test_loader.py (+257/-0)
tests_e2e/orchestrator/lib/agent_test_suite.py (+645/-0)
tests_e2e/orchestrator/lib/agent_test_suite_combinator.py (+249/-0)
tests_e2e/orchestrator/runbook.yml (+142/-0)
tests_e2e/orchestrator/sample_runbooks/existing_vm.yml (+143/-0)
tests_e2e/orchestrator/sample_runbooks/local_machine/hello_world.py (+32/-0)
tests_e2e/orchestrator/sample_runbooks/local_machine/local.yml (+32/-0)
tests_e2e/orchestrator/scripts/check-agent-log.py (+49/-0)
tests_e2e/orchestrator/scripts/collect-logs (+34/-0)
tests_e2e/orchestrator/scripts/get-agent-bin-path (+56/-0)
tests_e2e/orchestrator/scripts/get-agent-modules-path (+37/-0)
tests_e2e/orchestrator/scripts/get-agent-python (+59/-0)
tests_e2e/orchestrator/scripts/install-agent (+137/-0)
tests_e2e/orchestrator/scripts/install-tools (+135/-0)
tests_e2e/orchestrator/scripts/uncompress.py (+33/-0)
tests_e2e/orchestrator/scripts/unzip.py (+36/-0)
tests_e2e/pipeline/pipeline-cleanup.yml (+58/-0)
tests_e2e/pipeline/pipeline.yml (+119/-0)
tests_e2e/pipeline/scripts/execute_tests.sh (+120/-0)
tests_e2e/test_suites/agent_bvt.yml (+8/-0)
tests_e2e/test_suites/fail.yml (+5/-0)
tests_e2e/test_suites/images.yml (+94/-0)
tests_e2e/test_suites/pass.yml (+4/-0)
tests_e2e/tests/bvts/extension_operations.py (+94/-0)
tests_e2e/tests/bvts/run_command.py (+94/-0)
tests_e2e/tests/bvts/vm_access.py (+79/-0)
tests_e2e/tests/error_test.py (+32/-0)
tests_e2e/tests/fail_test.py (+33/-0)
tests_e2e/tests/lib/agent_log.py (+446/-0)
tests_e2e/tests/lib/agent_test.py (+66/-0)
tests_e2e/tests/lib/agent_test_context.py (+164/-0)
tests_e2e/tests/lib/identifiers.py (+63/-0)
tests_e2e/tests/lib/logging.py (+155/-0)
tests_e2e/tests/lib/retry.py (+59/-0)
tests_e2e/tests/lib/shell.py (+56/-0)
tests_e2e/tests/lib/ssh_client.py (+85/-0)
tests_e2e/tests/lib/virtual_machine.py (+143/-0)
tests_e2e/tests/lib/vm_extension.py (+239/-0)
tests_e2e/tests/pass_test.py (+33/-0)
Reviewer Review Type Date Requested Status
Lukas Märdian (community) Needs Information
Lucas Kanashiro (community) Needs Fixing
Review via email: mp+461115@code.launchpad.net
To post a comment you must log in.
Revision history for this message
Lucas Kanashiro (lucaskanashiro) wrote :

Hi Calvin,

Thanks for the MPs updating walinuxagent. You may know that to update a package you need to follow the SRU process:

https://wiki.ubuntu.com/StableReleaseUpdates

And we need to have an LP bug associated with the update, so please start by filing a bug against the walinuxagent package. To do that, you need to also check its SRU exception:

https://wiki.ubuntu.com/walinuxagentUpdates

There you will find info on the procedure to update walinuxagent to a new version (in this case to version 2.9.1.1) in a stable release. And before sponsoring any of your changes I want to make sure you performed all the steps described in the wiki page above, which includes (copy&paste from there):

Pre-SRU Test Cases

These are the test cases that all walinuxagent are subjected to before even getting to SRU:

1.) Launch instance on Azure
2.) Upgrade walinuxagent (usually from PPA)
3.) Confirm that "waagent" is running, check /var/log/waagent.log for errors
4.) Reboot, repeat step 3

SRU Test Cases

These are automated tests:

1.) Build new cloud image with -proposed package
2.) Boot instance
3.) Confirm that instance provisioned
4.) Run standard tests and regression tests
5.) Repeat from step 2 for all other Azure VM Sizes.

Once you prove you did the above with your proposed package and everything worked as expected, we can move on with the upload.

What I mentioned above is true for all your MPs against supported releases (mantic, jammy, focal, and bionic). I am not copying and pasting everything, but bear in mind you need the same thing for all of them.

If you have any questions, do not hesitate to ask here, or maybe use the #ubuntu-devel IRC channel on Libera.chat.

review: Needs Fixing
Revision history for this message
Lucas Kanashiro (lucaskanashiro) wrote :

Sorry, I see the bug in the changelog now, it is not linked here in the MP, I am doing that. However, I still want to see the results of the testing before moving on.

Revision history for this message
Lucas Kanashiro (lucaskanashiro) wrote :

From a quick glance, I see you are also submitting the same changes that landed in Noble. And I'd confirm that after seeing the "~23.10" in the version string. Is that right? If so, I'd expect to see the Noble changelog entry (as-is in Noble) and then a new changelog entry simply stating the backport and any change needed for the backport (which does not seem needed here). When your sponsor uploads the package both changelog entries should be included in the .changes file.

To make it more concrete, I'd expect something like this:

walinuxagent (2.9.1.1-0ubuntu1~23.10) mantic; urgency=medium

  * Backport version from Noble to Mantic (LP: #2035331).

 -- Calvin Mwadime <email address hidden> Thu, 14 Mar 2024 12:10:06 +0300

walinuxagent (2.9.1.1-0ubuntu1) noble; urgency=medium

  [ Calvin Mwadime ]
  * New upstream version 2.9.1.1
  * debian/watch: Fix package version regex pattern
  * debian/docs: Remove the changelog
  * debian/install: add new udev rules
  * debian/patches:
    - Remove deprecated patches
    - Update list of supported ubuntu versions
    - Correctly point to the right dhcp client lease file
    - Update the cgroup handling to include v2
  * disable_udev_overrides.patch: Remove udev overrides in setup.py
  * debian: Make python unit tests work
  * debian: Remove upstart config
  * debian/control: Remove isc-dhcp-client
  * debian/manpages: Added manpages

  [ Christian Ehrhardt ]
  * d/control, d/compat: switch to compat level 13
  * d/rules, d/install: install services once
  * d/control: drop old unneeded X-Python3-Version statement
  * d/rules: stop using the old systemd dh sequence
  * d/control: add iproute2 build dependency (test only)
  * d/control: fix lintian priority-extra-is-replaced-by-priority-optional
  * d/install: move remaining things to /usr

 -- Calvin Mwadime <email address hidden> Thu, 30 Nov 2023 11:07:05 +0300

Revision history for this message
Calvin Mwadime Makokha (calvinmwadime) wrote :

> From a quick glance, I see you are also submitting the same changes that
> landed in Noble. And I'd confirm that after seeing the "~23.10" in the version
> string. Is that right? If so, I'd expect to see the Noble changelog entry (as-
> is in Noble) and then a new changelog entry simply stating the backport and
> any change needed for the backport (which does not seem needed here). When
> your sponsor uploads the package both changelog entries should be included in
> the .changes file.
>
> To make it more concrete, I'd expect something like this:
>
> walinuxagent (2.9.1.1-0ubuntu1~23.10) mantic; urgency=medium
>
> * Backport version from Noble to Mantic (LP: #2035331).
>
> -- Calvin Mwadime <email address hidden> Thu, 14 Mar 2024 12:10:06
> +0300
>
> walinuxagent (2.9.1.1-0ubuntu1) noble; urgency=medium
>
> [ Calvin Mwadime ]
> * New upstream version 2.9.1.1
> * debian/watch: Fix package version regex pattern
> * debian/docs: Remove the changelog
> * debian/install: add new udev rules
> * debian/patches:
> - Remove deprecated patches
> - Update list of supported ubuntu versions
> - Correctly point to the right dhcp client lease file
> - Update the cgroup handling to include v2
> * disable_udev_overrides.patch: Remove udev overrides in setup.py
> * debian: Make python unit tests work
> * debian: Remove upstart config
> * debian/control: Remove isc-dhcp-client
> * debian/manpages: Added manpages
>
> [ Christian Ehrhardt ]
> * d/control, d/compat: switch to compat level 13
> * d/rules, d/install: install services once
> * d/control: drop old unneeded X-Python3-Version statement
> * d/rules: stop using the old systemd dh sequence
> * d/control: add iproute2 build dependency (test only)
> * d/control: fix lintian priority-extra-is-replaced-by-priority-optional
> * d/install: move remaining things to /usr
>
> -- Calvin Mwadime <email address hidden> Thu, 30 Nov 2023 11:07:05
> +0300

Thanks for this information.

Revision history for this message
Lukas Märdian (slyon) wrote (last edit ):

Please re-request ~ubuntu-sponsors review once Lucas' comments are addressed and the pre-SRU test cases have been executed. Also try improving your SRU bug report a little bit, e.g. it is missing the "[ Where problems could occur ]" section.

review: Needs Information

Unmerged commits

dc00f5a... by Calvin Mwadime Makokha

debian/changelog: Update version

8f7a52e... by Calvin Mwadime Makokha

d/install: move remaining things to /usr

3eb8d54... by Calvin Mwadime Makokha

d/control: fix lintian priority-extra-is-replaced-by-priority-optional

df4d35b... by Calvin Mwadime Makokha

d/control: add iproute2 build dependency (test only)

49af44b... by Calvin Mwadime Makokha

d/rules: stop using the old systemd dh sequence

Matching our switch to dh_installsystemd we need to stop calling
for the older systemd-sequence in dh.

This fixes:
error: The systemd-sequence is no longer provided in compat >= 11,
please rely on dh_installsystemd instead

1748d96... by Calvin Mwadime Makokha

d/control: drop old unneeded X-Python3-Version statement

09d440c... by Calvin Mwadime Makokha

d/rules, d/install: install services once

Since "7d793bf debian: Remove upstart config" --no-restart-on-upgrade
behavior was lost. Add it back with the new name --no-stop-on-upgrade.
Furthermore the install was a mix of the upstream putting files
under /lib and then debian/install putting it under /usr/lib leading
to file-in-root-and-usr warnings.
Furthermore dh_systemd_enable and the formerly dropped dh_installinit
have been consumed into dh_installsystemd for systemd units.
Finally dh_installsystemd will drop the executable bit which is
wrong in the upstream archive and triggers executable-not-elf-or-script.
So overall to hopefully do it right we do:
- ephemeral-disk-warning.service gets installed by d/install
- walinuxagent.service gets installed badly by upstream and removed in
  d/rules
- both services are then picked up and configured by dh_installsystemd

58f3c76... by Calvin Mwadime Makokha

d/control, d/compat: switch to compat level 13

11d4a04... by Calvin Mwadime Makokha

d/manpages: add man pages.

abfdefa... by Calvin Mwadime Makokha

debian/patches: Update the cgroup handling to include v2

WALA only supported cgroup v1 systems. This causes failures when running some
commands on cgroup v2 systems. Newer Ubuntu versions support cgroup v2 by
default

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
diff --git a/.flake8 b/.flake8
deleted file mode 100644
index 63303c3..0000000
--- a/.flake8
+++ /dev/null
@@ -1,32 +0,0 @@
1#
2# The project did not use flake8 since inception so there are a number
3# of time-consuming flake8-identified improvements that are just a lot
4# of busy work. Each of these should be disabled and code cleaned up.
5#
6# W503: Line break occurred before a binary operator
7# W504: Line break occurred after a binary operator
8# E126: Continuation line over-indented for hanging indent
9# E127: Continuation line over-indented for visual indent
10# E128: Continuation line under-indented for visual indent
11# E201: Whitespace after '('
12# E202: Whitespace before ')'
13# E203: Whitespace before ':'
14# E221: Multiple spaces before operator
15# E225: Missing whitespace around operator
16# E226: Missing whitespace around arithmetic operator
17# E231: Missing whitespace after ',', ';', or ':'
18# E261: At least two spaces before inline comment
19# E265: Block comment should start with '# '
20# E302: Expected 2 blank lines, found 0
21# E501: Line too long (xx > yy characters)
22# E502: The backslash is redundant between brackets
23# F401: Module imported but unused
24# F403: 'from module import *' used; unable to detect undefined names
25# F405: Name may be undefined, or defined from star imports: module
26#
27
28[flake8]
29ignore = W503,W504,E126,E127,E128,E201,E202,E203,E221,E225,E226,E231,E261,E265,E302,E501,E502,F401,F403,F405
30exclude = .git,__pycache__,docs/source/conf.py,old,build,dist,tests
31max-complexity = 30
32max-line-length = 120
\ No newline at end of file
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
index edfa1e6..fdcc07c 100644
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -14,9 +14,8 @@ This will expedite the process of getting your pull request merged and avoid ext
14### PR information14### PR information
15- [ ] The title of the PR is clear and informative.15- [ ] The title of the PR is clear and informative.
16- [ ] There are a small number of commits, each of which has an informative message. This means that previously merged commits do not appear in the history of the PR. For information on cleaning up the commits in your pull request, [see this page](https://github.com/Azure/azure-powershell/blob/master/documentation/development-docs/cleaning-up-commits.md).16- [ ] There are a small number of commits, each of which has an informative message. This means that previously merged commits do not appear in the history of the PR. For information on cleaning up the commits in your pull request, [see this page](https://github.com/Azure/azure-powershell/blob/master/documentation/development-docs/cleaning-up-commits.md).
17- [ ] Except for special cases involving multiple contributors, the PR is started from a fork of the main repository, not a branch.
18- [ ] If applicable, the PR references the bug/issue that it fixes in the description.17- [ ] If applicable, the PR references the bug/issue that it fixes in the description.
19- [ ] New Unit tests were added for the changes made and Travis.CI is passing.18- [ ] New Unit tests were added for the changes made
2019
21### Quality of Code and Contribution Guidelines20### Quality of Code and Contribution Guidelines
22- [ ] I have read the [contribution guidelines](https://github.com/Azure/WALinuxAgent/blob/master/.github/CONTRIBUTING.md).21- [ ] I have read the [contribution guidelines](https://github.com/Azure/WALinuxAgent/blob/master/.github/CONTRIBUTING.md).
23\ No newline at end of file22\ No newline at end of file
diff --git a/.github/codecov.yml b/.github/codecov.yml
24new file mode 10064423new file mode 100644
index 0000000..77707aa
--- /dev/null
+++ b/.github/codecov.yml
@@ -0,0 +1,2 @@
1github_checks:
2 annotations: false
diff --git a/.github/workflows/ci_pr.yml b/.github/workflows/ci_pr.yml
0new file mode 1006443new file mode 100644
index 0000000..e559268
--- /dev/null
+++ b/.github/workflows/ci_pr.yml
@@ -0,0 +1,128 @@
1name: CI Unit tests
2
3on:
4 push:
5 branches: [ "*" ]
6 pull_request:
7 branches: [ "*" ]
8 workflow_dispatch:
9
10jobs:
11 test-legacy-python-versions:
12
13 strategy:
14 fail-fast: false
15 matrix:
16 include:
17 - python-version: 2.6
18 - python-version: 3.4
19
20 name: "Python ${{ matrix.python-version }} Unit Tests"
21 runs-on: ubuntu-20.04
22 container:
23 image: ubuntu:16.04
24 volumes:
25 - /home/waagent:/home/waagent
26 defaults:
27 run:
28 shell: bash -l {0}
29
30 env:
31 NOSEOPTS: "--verbose"
32
33 steps:
34 - uses: actions/checkout@v3
35
36 - name: Install Python ${{ matrix.python-version }}
37 run: |
38 apt-get update
39 apt-get install -y curl bzip2 sudo python3
40 curl https://dcrdata.blob.core.windows.net/python/python-${{ matrix.python-version }}.tar.bz2 -o python-${{ matrix.python-version }}.tar.bz2
41 sudo tar xjvf python-${{ matrix.python-version }}.tar.bz2 --directory /
42
43 - name: Test with nosetests
44 run: |
45 if [[ ${{ matrix.python-version }} == 2.6 ]]; then
46 source /home/waagent/virtualenv/python2.6.9/bin/activate
47 else
48 source /home/waagent/virtualenv/python3.4.8/bin/activate
49 fi
50 ./ci/nosetests.sh
51 exit $?
52
53 test-current-python-versions:
54
55 strategy:
56 fail-fast: false
57 matrix:
58 include:
59
60 - python-version: 2.7
61 PYLINTOPTS: "--rcfile=ci/2.7.pylintrc --ignore=tests_e2e,makepkg.py"
62
63 - python-version: 3.5
64 PYLINTOPTS: "--rcfile=ci/3.6.pylintrc --ignore=tests_e2e,makepkg.py"
65
66 - python-version: 3.6
67 PYLINTOPTS: "--rcfile=ci/3.6.pylintrc --ignore=tests_e2e"
68
69 - python-version: 3.7
70 PYLINTOPTS: "--rcfile=ci/3.6.pylintrc --ignore=tests_e2e"
71
72 - python-version: 3.8
73 PYLINTOPTS: "--rcfile=ci/3.6.pylintrc --ignore=tests_e2e"
74
75 - python-version: 3.9
76 PYLINTOPTS: "--rcfile=ci/3.6.pylintrc"
77 additional-nose-opts: "--with-coverage --cover-erase --cover-inclusive --cover-branches --cover-package=azurelinuxagent"
78
79 name: "Python ${{ matrix.python-version }} Unit Tests"
80 runs-on: ubuntu-20.04
81
82 env:
83 PYLINTOPTS: ${{ matrix.PYLINTOPTS }}
84 PYLINTFILES: "azurelinuxagent setup.py makepkg.py tests tests_e2e"
85 NOSEOPTS: "--with-timer ${{ matrix.additional-nose-opts }}"
86 PYTHON_VERSION: ${{ matrix.python-version }}
87
88 steps:
89
90 - name: Checkout WALinuxAgent repo
91 uses: actions/checkout@v3
92
93 - name: Setup Python ${{ matrix.python-version }}
94 uses: actions/setup-python@v4
95 with:
96 python-version: ${{ matrix.python-version }}
97
98 - name: Install dependencies
99 id: install-dependencies
100 run: |
101 sudo env "PATH=$PATH" python -m pip install --upgrade pip
102 sudo env "PATH=$PATH" pip install -r requirements.txt
103 sudo env "PATH=$PATH" pip install -r test-requirements.txt
104
105 - name: Run pylint
106 run: |
107 pylint $PYLINTOPTS --jobs=0 $PYLINTFILES
108
109 - name: Test with nosetests
110 if: success() || (failure() && steps.install-dependencies.outcome == 'success')
111 run: |
112 ./ci/nosetests.sh
113 exit $?
114
115 - name: Compile Coverage
116 if: matrix.python-version == 3.9
117 run: |
118 echo looking for coverage files :
119 ls -alh | grep -i coverage
120 sudo env "PATH=$PATH" coverage combine coverage.*.data
121 sudo env "PATH=$PATH" coverage xml
122 sudo env "PATH=$PATH" coverage report
123
124 - name: Upload Coverage
125 if: matrix.python-version == 3.9
126 uses: codecov/codecov-action@v2
127 with:
128 file: ./coverage.xml
0\ No newline at end of file129\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index 0a31340..fd64d33 100644
--- a/.gitignore
+++ b/.gitignore
@@ -17,8 +17,6 @@ develop-eggs/
17dist/17dist/
18downloads/18downloads/
19eggs/19eggs/
20lib/
21lib64/
22parts/20parts/
23sdist/21sdist/
24var/22var/
@@ -92,3 +90,4 @@ ENV/
9290
93# pyenv91# pyenv
94.python-version92.python-version
93.vscode/
diff --git a/.travis.yml b/.travis.yml
95deleted file mode 10064494deleted file mode 100644
index fa672d3..0000000
--- a/.travis.yml
+++ /dev/null
@@ -1,43 +0,0 @@
1---
2os: linux
3dist: xenial
4language: python
5env:
6 - NOSEOPTS="--verbose" SETUPOPTS=""
7 # Add SETUPOPTS="check flake8" to enable flake8 checks
8
9matrix:
10 # exclude the default "python" build - we're being specific here...
11 exclude:
12 - python:
13 env:
14 - NOSEOPTS="" SETUPOPTS="check flake8"
15
16 include:
17 - python: 2.6
18 dist: trusty
19 env:
20 - NOSEOPTS="--verbose" SETUPOPTS=""
21 - python: 2.7
22 - python: 3.4
23 - python: 3.6
24 - python: 3.7
25 env:
26 - >-
27 NOSEOPTS="--verbose --with-coverage --cover-inclusive
28 --cover-min-percentage=60 --cover-branches
29 --cover-package=azurelinuxagent --cover-xml"
30 SETUPOPTS=""
31
32install:
33 - pip install -r requirements.txt
34 - pip install -r test-requirements.txt
35
36script:
37 # future: - pylint setup.py makepkg.py azurelinuxagent/
38 - nosetests $NOSEOPTS --attr '!requires_sudo' tests
39 - sudo env "PATH=$PATH" nosetests $NOSEOPTS --verbose --attr 'requires_sudo' tests
40 - if [ ! -z "$SETUPOPTS" ]; then /usr/bin/env python setup.py $SETUPOPTS; fi
41
42after_success:
43 - if [[ $TRAVIS_PYTHON_VERSION == 3.7 ]]; then codecov; fi
44\ No newline at end of file0\ No newline at end of file
diff --git a/CODEOWNERS b/CODEOWNERS
index a8f2d5d..8707e60 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -1,3 +1,4 @@
11
1# See https://help.github.com/articles/about-codeowners/2# See https://help.github.com/articles/about-codeowners/
2# for more info about CODEOWNERS file3# for more info about CODEOWNERS file
34
@@ -9,6 +10,7 @@
9# when there are requests for changes in the provisioning agent. For any10# when there are requests for changes in the provisioning agent. For any
10# questions, please feel free to reach out to thstring@microsoft.com.11# questions, please feel free to reach out to thstring@microsoft.com.
11/azurelinuxagent/pa/ @trstringer @anhvoms12/azurelinuxagent/pa/ @trstringer @anhvoms
13/tests/pa/ @trstringer @anhvoms
1214
13#15#
14# RDMA16# RDMA
@@ -19,4 +21,4 @@
19#21#
20# Linux Agent team22# Linux Agent team
21#23#
22* @narrieta @vrdmr @pgombar @larohra24* @narrieta @ZhidongPeng @nagworld9 @maddieford
diff --git a/Changelog b/Changelog
23deleted file mode 10064425deleted file mode 100644
index da68890..0000000
--- a/Changelog
+++ /dev/null
@@ -1,38 +0,0 @@
1WALinuxAgent Changelog
2|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
3
4Refer to releases WALinuxAgent release page: https://github.com/Azure/WALinuxAgent/releases for detailed changelog after v2.2.0
5
612 August 2016, v2.1.6
7 . Improved RDMA support
8 . Extension state migration
9 . Alpine Linux support
10 . Fixes for #347, #351, #353
11
1215 July 2016, v2.1.5
13 . Goal state processing extension
14 . Multi-nic improvements
15 . Bug fixes for #145, #141, #133, #116, #187, #169, #104, #127, #163,
16 #190, #185, #174
17
1809 Mar 2016, WALinuxAgent 2.1.4
19 . Add support for FreeBSD
20 . Fix a bug for internal extension version resolving
21
2229 Jan 2016, WALinuxAgent 2.1.3
23 . Fixed endpoint probing for Azure Stack
24 . Multiple fixes for extension handling
25
2607 Dec 2015, WALinuxAgent 2.1.2
27 . Multiple fixes for extension handling and provisioning
28
2907 Aug 2015, WALinuxAgent 2.1.1
30 . Support python3
31 . Fixed bugs for metadata protocol
32 . Fixed a few pylint warnings
33 . Enabled travis-ci
34
35|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
3601 Jul 2015, WALinuxAgent 2.1.0
37 . Divide waagent into different modules
38
diff --git a/README.md b/README.md
index 0069d46..ae6a851 100644
--- a/README.md
+++ b/README.md
@@ -1,31 +1,15 @@
1
1# Microsoft Azure Linux Agent2# Microsoft Azure Linux Agent
23
3## Develop branch status4## Linux distributions support
45
5[![Travis CI](https://travis-ci.org/Azure/WALinuxAgent.svg?branch=develop)](https://travis-ci.org/Azure/WALinuxAgent/branches)6Our daily automation tests most of the [Linux distributions supported by Azure](https://docs.microsoft.com/en-us/azure/virtual-machines/linux/endorsed-distros); the Agent can be
6[![CodeCov](https://codecov.io/gh/Azure/WALinusAgent/branch/develop/graph/badge.svg)](https://codecov.io/gh/Azure/WALinuxAgent/branch/develop)7used on other distributions as well, but development, testing and support for those are done by the open source community.
78
8Each badge below represents our basic validation tests for an image, which are executed several times each day. These include provisioning, user account, disk, extension and networking scenarios.9Testing is done using the develop branch, which can be unstable. For a stable build please use the master branch instead.
910
10Note: These badges represent testing to our develop branch which might not be stable. For a stable build please use master branch instead. 11[![CodeCov](https://codecov.io/gh/Azure/WALinuxAgent/branch/develop/graph/badge.svg)](https://codecov.io/gh/Azure/WALinuxAgent/branch/develop)
1112
12Image | Status |
13------|--------|
14Canonical UbuntuServer 14.04.5-LTS|![badge](https://dcrbadges.blob.core.windows.net/scenarios/Canonical_UbuntuServer_14.04.5-LTS__agent--bvt.svg)
15Canonical UbuntuServer 14.04.5-DAILY-LTS|![badge](https://dcrbadges.blob.core.windows.net/scenarios/Canonical_UbuntuServer_14.04.5-DAILY-LTS__agent--bvt.svg)
16Canonical UbuntuServer 16.04-LTS|![badge](https://dcrbadges.blob.core.windows.net/scenarios/Canonical_UbuntuServer_16.04-LTS__agent--bvt.svg)
17Canonical UbuntuServer 16.04-DAILY-LTS|![badge](https://dcrbadges.blob.core.windows.net/scenarios/Canonical_UbuntuServer_16.04-DAILY-LTS__agent--bvt.svg)
18Canonical UbuntuServer 18.04-LTS|![badge](https://dcrbadges.blob.core.windows.net/scenarios/Canonical_UbuntuServer_18.04-LTS__agent--bvt.svg)
19Canonical UbuntuServer 18.04-DAILY-LTS|![badge](https://dcrbadges.blob.core.windows.net/scenarios/Canonical_UbuntuServer_18.04-DAILY-LTS__agent--bvt.svg)
20Credativ Debian 8|![badge](https://dcrbadges.blob.core.windows.net/scenarios/Credativ_Debian_8__agent--bvt.svg)
21Credativ Debian 8-DAILY|![badge](https://dcrbadges.blob.core.windows.net/scenarios/Credativ_Debian_8-DAILY__agent--bvt.svg)
22Credativ Debian 9|![badge](https://dcrbadges.blob.core.windows.net/scenarios/Credativ_Debian_9__agent--bvt.svg)
23Credativ Debian 9-DAILY|![badge](https://dcrbadges.blob.core.windows.net/scenarios/Credativ_Debian_9-DAILY__agent--bvt.svg)
24OpenLogic CentOS 6.9|![badge](https://dcrbadges.blob.core.windows.net/scenarios/OpenLogic_CentOS_6.9__agent--bvt.svg)
25OpenLogic CentOS 7.4|![badge](https://dcrbadges.blob.core.windows.net/scenarios/OpenLogic_CentOS_7.4__agent--bvt.svg)
26RedHat RHEL 6.9|![badge](https://dcrbadges.blob.core.windows.net/scenarios/RedHat_RHEL_6.9__agent--bvt.svg)
27RedHat RHEL 7-RAW|![badge](https://dcrbadges.blob.core.windows.net/scenarios/RedHat_RHEL_7-RAW__agent--bvt.svg)
28SUSE SLES 12-SP3|![badge](https://dcrbadges.blob.core.windows.net/scenarios/SUSE_SLES_12-SP3__agent--bvt.svg)
2913
30## Introduction14## Introduction
3115
@@ -49,7 +33,6 @@ functionality for Linux IaaS deployments:
4933
50* Kernel34* Kernel
51 * Configure virtual NUMA (disable for kernel <2.6.37)35 * Configure virtual NUMA (disable for kernel <2.6.37)
52 * Consume Hyper-V entropy for /dev/random
53 * Configure SCSI timeouts for the root device (which could be remote)36 * Configure SCSI timeouts for the root device (which could be remote)
5437
55* Diagnostics38* Diagnostics
@@ -79,13 +62,15 @@ The agent will use an HTTP proxy if provided via the `http_proxy` (for `http` re
79`https_proxy` (for `https` requests) environment variables. The `HttpProxy.Host` and62`https_proxy` (for `https` requests) environment variables. The `HttpProxy.Host` and
80`HttpProxy.Port` configuration variables (see below), if used, will override the environment63`HttpProxy.Port` configuration variables (see below), if used, will override the environment
81settings. Due to limitations of Python, the agent *does not* support HTTP proxies requiring64settings. Due to limitations of Python, the agent *does not* support HTTP proxies requiring
82authentication.65authentication. Note that when the agent service is managed by systemd, environment variables
66such as `http_proxy` and `https_proxy` should be defined using one of the mechanisms provided by
67systemd (e.g. by using Environment or EnvironmentFile in the service file).
8368
84## Requirements69## Requirements
8570
86The following systems have been tested and are known to work with the Azure71The following systems have been tested and are known to work with the Azure
87Linux Agent. Please note that this list may differ from the official list72Linux Agent. Please note that this list may differ from the official list
88of supported systems on the Microsoft Azure Platform as described [here](http://support.microsoft.com/kb/2805216).73of supported systems on the Microsoft Azure Platform as described [here](https://docs.microsoft.com/en-us/azure/virtual-machines/linux/endorsed-distros).
8974
90Waagent depends on some system packages in order to function properly:75Waagent depends on some system packages in order to function properly:
9176
@@ -109,6 +94,12 @@ For more advanced installation options, such as installing to custom locations o
109 sudo python setup.py install --register-service94 sudo python setup.py install --register-service
110```95```
11196
97For Python 3, use:
98
99```bash
100 sudo python3 setup.py install --register-service
101```
102
112You can view more installation options by running:103You can view more installation options by running:
113104
114```bash105```bash
@@ -177,6 +168,8 @@ For CoreOS, use:
177168
178`-start`: Run waagent as a background process169`-start`: Run waagent as a background process
179170
171`-collect-logs [-full]`: Runs the log collector utility that collects relevant agent logs for debugging and stores them in the agent folder on disk. Exact location will be shown when run. Use flag `-full` for more exhaustive log collection.
172
180## Configuration173## Configuration
181174
182A configuration file (/etc/waagent.conf) controls the actions of waagent. Blank lines and lines whose first character is a `#` are ignored (end-of-line comments are *not* supported).175A configuration file (/etc/waagent.conf) controls the actions of waagent. Blank lines and lines whose first character is a `#` are ignored (end-of-line comments are *not* supported).
@@ -185,6 +178,7 @@ A sample configuration file is shown below:
185178
186```yml179```yml
187Extensions.Enabled=y180Extensions.Enabled=y
181Extensions.GoalStatePeriod=6
188Provisioning.Agent=auto182Provisioning.Agent=auto
189Provisioning.DeleteRootPassword=n183Provisioning.DeleteRootPassword=n
190Provisioning.RegenerateSshHostKeyPair=y184Provisioning.RegenerateSshHostKeyPair=y
@@ -202,6 +196,8 @@ ResourceDisk.EnableSwap=n
202ResourceDisk.EnableSwapEncryption=n196ResourceDisk.EnableSwapEncryption=n
203ResourceDisk.SwapSizeMB=0197ResourceDisk.SwapSizeMB=0
204Logs.Verbose=n198Logs.Verbose=n
199Logs.Collect=y
200Logs.CollectPeriod=3600
205OS.AllowHTTP=n201OS.AllowHTTP=n
206OS.RootDeviceScsiTimeout=300202OS.RootDeviceScsiTimeout=300
207OS.EnableFIPS=n203OS.EnableFIPS=n
@@ -210,8 +206,6 @@ OS.SshClientAliveInterval=180
210OS.SshDir=/etc/ssh206OS.SshDir=/etc/ssh
211HttpProxy.Host=None207HttpProxy.Host=None
212HttpProxy.Port=None208HttpProxy.Port=None
213CGroups.EnforceLimits=y
214CGroups.Excluded=customscript,runcommand
215```209```
216210
217The various configuration options are described in detail below. Configuration211The various configuration options are described in detail below. Configuration
@@ -238,6 +232,32 @@ without the agent. In order to do that, the `provisionVMAgent` flag must be set
238provisioning time, via whichever API is being used. We will provide more details on232provisioning time, via whichever API is being used. We will provide more details on
239this on our wiki when it is generally available. 233this on our wiki when it is generally available.
240234
235#### __Extensions.GoalStatePeriod__
236
237_Type: Integer_
238_Default: 6_
239
240How often to poll for new goal states (in seconds) and report the status of the VM
241and extensions. Goal states describe the desired state of the extensions on the VM.
242
243_Note_: setting up this parameter to more than a few minutes can make the state of
244the VM be reported as unresponsive/unavailable on the Azure portal. Also, this
245setting affects how fast the agent starts executing extensions.
246
247#### __AutoUpdate.Enabled__
248
249_Type: Boolean_
250_Default: y_
251
252Enables auto-update of the Extension Handler. The Extension Handler is responsible
253for managing extensions and reporting VM status. The core functionality of the agent
254is contained in the Extension Handler, and we encourage users to enable this option
255in order to maintain an up to date version.
256
257On most distros the default value is 'y'.
258
259For more information on the agent version, see our [FAQ](https://github.com/Azure/WALinuxAgent/wiki/FAQ#what-does-goal-state-agent-mean-in-waagent---version-output).
260
241#### __Provisioning.Agent__261#### __Provisioning.Agent__
242262
243_Type: String_263_Type: String_
@@ -261,7 +281,22 @@ _Note_: This configuration option has been removed and has no effect. waagent
261now auto-detects cloud-init as a provisioning agent (with an option to override281now auto-detects cloud-init as a provisioning agent (with an option to override
262with `Provisioning.Agent`).282with `Provisioning.Agent`).
263283
264#### __Provisioning.UseCloudInit__ (*removed in 2.2.45*)284#### __Provisioning.MonitorHostName__
285
286_Type: Boolean_
287_Default: n_
288
289Monitor host name changes and publish changes via DHCP requests.
290
291#### __Provisioning.MonitorHostNamePeriod__
292
293_Type: Integer_
294_Default: 30_
295
296How often to monitor host name changes (in seconds). This setting is ignored if
297MonitorHostName is not set.
298
299#### __Provisioning.UseCloudInit__
265300
266_Type: Boolean_ 301_Type: Boolean_
267_Default: n_302_Default: n_
@@ -397,7 +432,7 @@ system swap space.
397_Type: Boolean_ 432_Type: Boolean_
398_Default: n_433_Default: n_
399434
400If set, the swap file (/swapfile) is mounted as an encrypted filesystem.435If set, the swap file (/swapfile) is mounted as an encrypted filesystem (flag supported only on FreeBSD.)
401436
402#### __ResourceDisk.SwapSizeMB__437#### __ResourceDisk.SwapSizeMB__
403438
@@ -414,6 +449,25 @@ _Default: n_
414If set, log verbosity is boosted. Waagent logs to /var/log/waagent.log and449If set, log verbosity is boosted. Waagent logs to /var/log/waagent.log and
415leverages the system logrotate functionality to rotate logs.450leverages the system logrotate functionality to rotate logs.
416451
452
453#### __Logs.Collect__
454
455_Type: Boolean_
456_Default: y_
457
458If set, agent logs will be periodically collected and uploaded to a secure location for improved supportability.
459
460NOTE: This feature relies on the agent's resource usage features (cgroups); this flag will not take effect on any distro not supported.
461
462#### __Logs.CollectPeriod__
463
464_Type: Integer_
465_Default: 3600_
466
467This configures how frequently to collect and upload logs. Default is each hour.
468
469NOTE: This only takes effect if the Logs.Collect option is enabled.
470
417#### __OS.AllowHTTP__471#### __OS.AllowHTTP__
418472
419_Type: Boolean_ 473_Type: Boolean_
@@ -442,6 +496,14 @@ OpenSSL commands. This signals OpenSSL to use any installed FIPS-compliant libra
442Note that the agent itself has no FIPS-specific code. _If no FIPS-compliant certificates are496Note that the agent itself has no FIPS-specific code. _If no FIPS-compliant certificates are
443installed, then enabling this option will cause all OpenSSL commands to fail._497installed, then enabling this option will cause all OpenSSL commands to fail._
444498
499#### __OS.MonitorDhcpClientRestartPeriod__
500
501_Type: Integer_
502_Default: 30_
503
504The agent monitors restarts of the DHCP client and restores network rules when it happens. This
505setting determines how often (in seconds) to monitor for restarts.
506
445#### __OS.RootDeviceScsiTimeout__507#### __OS.RootDeviceScsiTimeout__
446508
447_Type: Integer_ 509_Type: Integer_
@@ -450,6 +512,14 @@ _Default: 300_
450This configures the SCSI timeout in seconds on the root device. If not set, the512This configures the SCSI timeout in seconds on the root device. If not set, the
451system defaults are used.513system defaults are used.
452514
515#### __OS.RootDeviceScsiTimeoutPeriod__
516
517_Type: Integer_
518_Default: 30_
519
520How often to set the SCSI timeout on the root device (in seconds). This setting is
521ignored if RootDeviceScsiTimeout is not set.
522
453#### __OS.OpensslPath__523#### __OS.OpensslPath__
454524
455_Type: String_ 525_Type: String_
@@ -458,6 +528,13 @@ _Default: None_
458This can be used to specify an alternate path for the openssl binary to use for528This can be used to specify an alternate path for the openssl binary to use for
459cryptographic operations.529cryptographic operations.
460530
531#### __OS.RemovePersistentNetRulesPeriod__
532_Type: Integer_
533_Default: 30_
534
535How often to remove the udev rules for persistent network interface names (75-persistent-net-generator.rules
536and /etc/udev/rules.d/70-persistent-net.rules) (in seconds)
537
461#### __OS.SshClientAliveInterval__538#### __OS.SshClientAliveInterval__
462539
463_Type: Integer_ 540_Type: Integer_
diff --git a/SECURITY.md b/SECURITY.md
464new file mode 100644541new file mode 100644
index 0000000..e138ec5
--- /dev/null
+++ b/SECURITY.md
@@ -0,0 +1,41 @@
1<!-- BEGIN MICROSOFT SECURITY.MD V0.0.8 BLOCK -->
2
3## Security
4
5Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/).
6
7If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/opensource/security/definition), please report it to us as described below.
8
9## Reporting Security Issues
10
11**Please do not report security vulnerabilities through public GitHub issues.**
12
13Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/opensource/security/create-report).
14
15If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey).
16
17You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc).
18
19Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:
20
21 * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
22 * Full paths of source file(s) related to the manifestation of the issue
23 * The location of the affected source code (tag/branch/commit or direct URL)
24 * Any special configuration required to reproduce the issue
25 * Step-by-step instructions to reproduce the issue
26 * Proof-of-concept or exploit code (if possible)
27 * Impact of the issue, including how an attacker might exploit the issue
28
29This information will help us triage your report more quickly.
30
31If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/opensource/security/bounty) page for more details about our active programs.
32
33## Preferred Languages
34
35We prefer all communications to be in English.
36
37## Policy
38
39Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/opensource/security/cvd).
40
41<!-- END MICROSOFT SECURITY.MD BLOCK -->
diff --git a/azurelinuxagent/agent.py b/azurelinuxagent/agent.py
index 6e65084..8c30348 100644
--- a/azurelinuxagent/agent.py
+++ b/azurelinuxagent/agent.py
@@ -24,21 +24,47 @@ Module agent
24from __future__ import print_function24from __future__ import print_function
2525
26import os26import os
27import sys
28import re27import re
29import subprocess28import subprocess
29import sys
30import threading30import threading
31import traceback31from azurelinuxagent.common import cgroupconfigurator, logcollector
32from azurelinuxagent.common.cgroupapi import SystemdCgroupsApi
3233
33import azurelinuxagent.common.logger as logger
34import azurelinuxagent.common.event as event
35import azurelinuxagent.common.conf as conf34import azurelinuxagent.common.conf as conf
36from azurelinuxagent.common.version import AGENT_NAME, AGENT_LONG_VERSION, \35import azurelinuxagent.common.event as event
37 DISTRO_NAME, DISTRO_VERSION, \36import azurelinuxagent.common.logger as logger
38 PY_VERSION_MAJOR, PY_VERSION_MINOR, \37from azurelinuxagent.common.future import ustr
39 PY_VERSION_MICRO, GOAL_STATE_AGENT_VERSION38from azurelinuxagent.common.logcollector import LogCollector, OUTPUT_RESULTS_FILE_PATH
40from azurelinuxagent.common.osutil import get_osutil39from azurelinuxagent.common.osutil import get_osutil
41from azurelinuxagent.common.utils import fileutil40from azurelinuxagent.common.utils import fileutil, textutil
41from azurelinuxagent.common.utils.flexible_version import FlexibleVersion
42from azurelinuxagent.common.utils.networkutil import AddFirewallRules
43from azurelinuxagent.common.version import AGENT_NAME, AGENT_LONG_VERSION, AGENT_VERSION, \
44 DISTRO_NAME, DISTRO_VERSION, \
45 PY_VERSION_MAJOR, PY_VERSION_MINOR, \
46 PY_VERSION_MICRO, GOAL_STATE_AGENT_VERSION, \
47 get_daemon_version, set_daemon_version
48from azurelinuxagent.ga.collect_logs import CollectLogsHandler, get_log_collector_monitor_handler
49from azurelinuxagent.pa.provision.default import ProvisionHandler
50
51
52class AgentCommands(object):
53 """
54 This is the list of all commands that the Linux Guest Agent supports
55 """
56 DeprovisionUser = "deprovision+user"
57 Deprovision = "deprovision"
58 Daemon = "daemon"
59 Start = "start"
60 RegisterService = "register-service"
61 RunExthandlers = "run-exthandlers"
62 Version = "version"
63 ShowConfig = "show-configuration"
64 Help = "help"
65 CollectLogs = "collect-logs"
66 SetupFirewall = "setup-firewall"
67 Provision = "provision"
4268
4369
44class Agent(object):70class Agent(object):
@@ -49,24 +75,24 @@ class Agent(object):
49 self.conf_file_path = conf_file_path75 self.conf_file_path = conf_file_path
50 self.osutil = get_osutil()76 self.osutil = get_osutil()
5177
52 #Init stdout log78 # Init stdout log
53 level = logger.LogLevel.VERBOSE if verbose else logger.LogLevel.INFO79 level = logger.LogLevel.VERBOSE if verbose else logger.LogLevel.INFO
54 logger.add_logger_appender(logger.AppenderType.STDOUT, level)80 logger.add_logger_appender(logger.AppenderType.STDOUT, level)
5581
56 #Init config82 # Init config
57 conf_file_path = self.conf_file_path \83 conf_file_path = self.conf_file_path \
58 if self.conf_file_path is not None \84 if self.conf_file_path is not None \
59 else self.osutil.get_agent_conf_file_path()85 else self.osutil.get_agent_conf_file_path()
60 conf.load_conf_from_file(conf_file_path)86 conf.load_conf_from_file(conf_file_path)
6187
62 #Init log88 # Init log
63 verbose = verbose or conf.get_logs_verbose()89 verbose = verbose or conf.get_logs_verbose()
64 level = logger.LogLevel.VERBOSE if verbose else logger.LogLevel.INFO90 level = logger.LogLevel.VERBOSE if verbose else logger.LogLevel.INFO
65 logger.add_logger_appender(logger.AppenderType.FILE, level,91 logger.add_logger_appender(logger.AppenderType.FILE, level, path=conf.get_agent_log_file())
66 path="/var/log/waagent.log")92
67 if conf.get_logs_console():93 # echo the log to /dev/console if the machine will be provisioned
68 logger.add_logger_appender(logger.AppenderType.CONSOLE, level,94 if conf.get_logs_console() and not ProvisionHandler.is_provisioned():
69 path="/dev/console")95 self.__add_console_appender(level)
7096
71 if event.send_logs_to_telemetry():97 if event.send_logs_to_telemetry():
72 logger.add_logger_appender(logger.AppenderType.TELEMETRY,98 logger.add_logger_appender(logger.AppenderType.TELEMETRY,
@@ -84,22 +110,30 @@ class Agent(object):
84 "Exception occurred while creating extension "110 "Exception occurred while creating extension "
85 "log directory {0}: {1}".format(ext_log_dir, e))111 "log directory {0}: {1}".format(ext_log_dir, e))
86112
87 #Init event reporter113 # Init event reporter
114 # Note that the reporter is not fully initialized here yet. Some telemetry fields are filled with data
115 # originating from the goal state or IMDS, which requires a WireProtocol instance. Once a protocol
116 # has been established, those fields must be explicitly initialized using
117 # initialize_event_logger_vminfo_common_parameters(). Any events created before that initialization
118 # will contain dummy values on those fields.
88 event.init_event_status(conf.get_lib_dir())119 event.init_event_status(conf.get_lib_dir())
89 event_dir = os.path.join(conf.get_lib_dir(), "events")120 event_dir = os.path.join(conf.get_lib_dir(), event.EVENTS_DIRECTORY)
90 event.init_event_logger(event_dir)121 event.init_event_logger(event_dir)
91 event.enable_unhandled_err_dump("WALA")122 event.enable_unhandled_err_dump("WALA")
92123
124 def __add_console_appender(self, level):
125 logger.add_logger_appender(logger.AppenderType.CONSOLE, level, path="/dev/console")
126
93 def daemon(self):127 def daemon(self):
94 """128 """
95 Run agent daemon129 Run agent daemon
96 """130 """
131 set_daemon_version(AGENT_VERSION)
97 logger.set_prefix("Daemon")132 logger.set_prefix("Daemon")
98 threading.current_thread().setName("Daemon")133 threading.current_thread().setName("Daemon")
99 child_args = None \134 child_args = None \
100 if self.conf_file_path is None \135 if self.conf_file_path is None \
101 else "-configuration-path:{0}".format(self.conf_file_path)136 else "-configuration-path:{0}".format(self.conf_file_path)
102
103 from azurelinuxagent.daemon import get_daemon_handler137 from azurelinuxagent.daemon import get_daemon_handler
104 daemon_handler = get_daemon_handler()138 daemon_handler = get_daemon_handler()
105 daemon_handler.run(child_args=child_args)139 daemon_handler.run(child_args=child_args)
@@ -137,6 +171,21 @@ class Agent(object):
137 """171 """
138 logger.set_prefix("ExtHandler")172 logger.set_prefix("ExtHandler")
139 threading.current_thread().setName("ExtHandler")173 threading.current_thread().setName("ExtHandler")
174
175 #
176 # Agents < 2.2.53 used to echo the log to the console. Since the extension handler could have been started by
177 # one of those daemons, output a message indicating that output to the console will stop, otherwise users
178 # may think that the agent died if they noticed that output to the console stops abruptly.
179 #
180 # Feel free to remove this code if telemetry shows there are no more agents <= 2.2.53 in the field.
181 #
182 if conf.get_logs_console() and get_daemon_version() < FlexibleVersion("2.2.53"):
183 self.__add_console_appender(logger.LogLevel.INFO)
184 try:
185 logger.info(u"The agent will now check for updates and then will process extensions. Output to /dev/console will be suspended during those operations.")
186 finally:
187 logger.disable_console_output()
188
140 from azurelinuxagent.ga.update import get_update_handler189 from azurelinuxagent.ga.update import get_update_handler
141 update_handler = get_update_handler()190 update_handler = get_update_handler()
142 update_handler.run(debug)191 update_handler.run(debug)
@@ -146,91 +195,175 @@ class Agent(object):
146 for k in sorted(configuration.keys()):195 for k in sorted(configuration.keys()):
147 print("{0} = {1}".format(k, configuration[k]))196 print("{0} = {1}".format(k, configuration[k]))
148197
198 def collect_logs(self, is_full_mode):
199 logger.set_prefix("LogCollector")
200
201 if is_full_mode:
202 logger.info("Running log collector mode full")
203 else:
204 logger.info("Running log collector mode normal")
205
206 # Check the cgroups unit
207 cpu_cgroup_path, memory_cgroup_path, log_collector_monitor = None, None, None
208 if CollectLogsHandler.should_validate_cgroups():
209 cgroups_api = SystemdCgroupsApi()
210 cpu_cgroup_path, memory_cgroup_path = cgroups_api.get_process_cgroup_paths("self")
149211
150def main(args=[]):212 cpu_slice_matches = (cgroupconfigurator.LOGCOLLECTOR_SLICE in cpu_cgroup_path)
213 memory_slice_matches = (cgroupconfigurator.LOGCOLLECTOR_SLICE in memory_cgroup_path)
214
215 if not cpu_slice_matches or not memory_slice_matches:
216 logger.info("The Log Collector process is not in the proper cgroups:")
217 if not cpu_slice_matches:
218 logger.info("\tunexpected cpu slice")
219 if not memory_slice_matches:
220 logger.info("\tunexpected memory slice")
221
222 sys.exit(logcollector.INVALID_CGROUPS_ERRCODE)
223
224 try:
225 log_collector = LogCollector(is_full_mode, cpu_cgroup_path, memory_cgroup_path)
226 log_collector_monitor = get_log_collector_monitor_handler(log_collector.cgroups)
227 log_collector_monitor.run()
228 archive = log_collector.collect_logs_and_get_archive()
229 logger.info("Log collection successfully completed. Archive can be found at {0} "
230 "and detailed log output can be found at {1}".format(archive, OUTPUT_RESULTS_FILE_PATH))
231 except Exception as e:
232 logger.error("Log collection completed unsuccessfully. Error: {0}".format(ustr(e)))
233 logger.info("Detailed log output can be found at {0}".format(OUTPUT_RESULTS_FILE_PATH))
234 sys.exit(1)
235 finally:
236 if log_collector_monitor is not None:
237 log_collector_monitor.stop()
238
239 @staticmethod
240 def setup_firewall(firewall_metadata):
241
242 print("Setting up firewall for the WALinux Agent with args: {0}".format(firewall_metadata))
243 try:
244 AddFirewallRules.add_iptables_rules(firewall_metadata['wait'], firewall_metadata['dst_ip'],
245 firewall_metadata['uid'])
246 print("Successfully set the firewall rules")
247 except Exception as error:
248 print("Unable to add firewall rules. Error: {0}".format(ustr(error)))
249 sys.exit(1)
250
251
def main(args=None):
    """
    Parse the command line and invoke the method that implements the requested command.

    :param args: list of command-line arguments. When None or empty, sys.argv[1:] is used instead.

    The version, help and start commands are handled without creating an Agent instance. Every other
    command runs against a new Agent; any Exception raised while creating the Agent or running the
    command is logged with logger.error() and is not re-raised. SystemExit does not derive from
    Exception, so a sys.exit() issued by a command still terminates the process.
    """
    if not args:
        # Covers both the None default and an explicitly empty list
        args = sys.argv[1:]

    command, force, verbose, debug, conf_file_path, log_collector_full_mode, firewall_metadata = parse_args(args)

    if command == AgentCommands.Version:
        version()
    elif command == AgentCommands.Help:
        print(usage())
    elif command == AgentCommands.Start:
        start(conf_file_path=conf_file_path)
    else:
        try:
            agent = Agent(verbose, conf_file_path=conf_file_path)
            if command == AgentCommands.DeprovisionUser:
                agent.deprovision(force, deluser=True)
            elif command == AgentCommands.Deprovision:
                agent.deprovision(force, deluser=False)
            elif command == AgentCommands.Provision:
                agent.provision()
            elif command == AgentCommands.RegisterService:
                agent.register_service()
            elif command == AgentCommands.Daemon:
                agent.daemon()
            elif command == AgentCommands.RunExthandlers:
                agent.run_exthandlers(debug)
            elif command == AgentCommands.ShowConfig:
                agent.show_configuration()
            elif command == AgentCommands.CollectLogs:
                agent.collect_logs(log_collector_full_mode)
            elif command == AgentCommands.SetupFirewall:
                agent.setup_firewall(firewall_metadata)
        except Exception as e:
            logger.error(u"Failed to run '{0}': {1}",
                         command,
                         textutil.format_exception(e))
293
185294
def parse_args(sys_args):
    """
    Parse command line arguments.

    :param sys_args: iterable of argument strings; empty strings are ignored.
    :return: tuple (cmd, force, verbose, debug, conf_file_path, log_collector_full_mode, firewall_metadata)
             where firewall_metadata is a dict with the keys "dst_ip", "uid" and "wait".

    Every option may be prefixed by any number of '-' or '/' characters. Options are matched with
    re.match(), i.e. anchored at the start of the argument only, and in the order of the chain below.
    The first argument that matches nothing sets the command to help and stops the parsing.
    If -configuration-path: names a file that does not exist, an error and the usage message are
    printed and the process exits with status 1.
    """
    cmd = AgentCommands.Help
    force = False
    verbose = False
    debug = False
    conf_file_path = None
    log_collector_full_mode = False
    firewall_metadata = {
        "dst_ip": None,
        "uid": None,
        "wait": ""
    }

    regex_cmd_format = "^([-/]*){0}"

    # Loop-invariant patterns. Raw strings keep the backslashes for the regex engine without relying on
    # Python passing unknown escape sequences through unchanged.
    conf_path_regex = r"^(?:[-/]*)configuration-path:([\w/\.\-_]+)"
    dst_ip_regex = regex_cmd_format.format(r"dst_ip=(?P<dst_ip>[\d.]{7,})")
    uid_regex = regex_cmd_format.format(r"uid=(?P<uid>[\d]+)")

    for arg in sys_args:
        if arg == "":
            # Don't parse an empty parameter
            continue
        m = re.match(conf_path_regex, arg)
        if m is not None:
            conf_file_path = m.group(1)
            if not os.path.exists(conf_file_path):
                print("Error: Configuration file {0} does not exist".format(
                    conf_file_path), file=sys.stderr)
                print(usage())
                sys.exit(1)
        # deprovision+user must be tested before deprovision, which is a prefix of it
        elif re.match(r"^([-/]*)deprovision\+user", arg):
            cmd = AgentCommands.DeprovisionUser
        elif re.match(regex_cmd_format.format(AgentCommands.Deprovision), arg):
            cmd = AgentCommands.Deprovision
        elif re.match(regex_cmd_format.format(AgentCommands.Daemon), arg):
            cmd = AgentCommands.Daemon
        elif re.match(regex_cmd_format.format(AgentCommands.Start), arg):
            cmd = AgentCommands.Start
        elif re.match(regex_cmd_format.format(AgentCommands.RegisterService), arg):
            cmd = AgentCommands.RegisterService
        elif re.match(regex_cmd_format.format(AgentCommands.RunExthandlers), arg):
            cmd = AgentCommands.RunExthandlers
        elif re.match(regex_cmd_format.format(AgentCommands.Version), arg):
            cmd = AgentCommands.Version
        elif re.match(regex_cmd_format.format("verbose"), arg):
            verbose = True
        elif re.match(regex_cmd_format.format("debug"), arg):
            debug = True
        elif re.match(regex_cmd_format.format("force"), arg):
            force = True
        elif re.match(regex_cmd_format.format(AgentCommands.ShowConfig), arg):
            cmd = AgentCommands.ShowConfig
        elif re.match(r"^([-/]*)(help|usage|\?)", arg):
            cmd = AgentCommands.Help
        elif re.match(regex_cmd_format.format(AgentCommands.CollectLogs), arg):
            cmd = AgentCommands.CollectLogs
        elif re.match(regex_cmd_format.format("full"), arg):
            log_collector_full_mode = True
        elif re.match(regex_cmd_format.format(AgentCommands.SetupFirewall), arg):
            cmd = AgentCommands.SetupFirewall
        elif re.match(dst_ip_regex, arg):
            firewall_metadata['dst_ip'] = re.match(dst_ip_regex, arg).group('dst_ip')
        elif re.match(uid_regex, arg):
            firewall_metadata['uid'] = re.match(uid_regex, arg).group('uid')
        elif re.match(regex_cmd_format.format("(w|wait)$"), arg):
            firewall_metadata['wait'] = "-w"
        else:
            cmd = AgentCommands.Help
            break

    return cmd, force, verbose, debug, conf_file_path, log_collector_full_mode, firewall_metadata
234367
235368
236def version():369def version():
@@ -245,29 +378,33 @@ def version():
245 PY_VERSION_MICRO))378 PY_VERSION_MICRO))
246 print("Goal state agent: {0}".format(GOAL_STATE_AGENT_VERSION))379 print("Goal state agent: {0}".format(GOAL_STATE_AGENT_VERSION))
247380
381
def usage():
    """
    Return agent usage message.

    The message starts and ends with a newline and names the program as sys.argv[0].
    """
    s = "\n"
    # The trailing space after the configuration-path option keeps it from running into -deprovision
    s += ("usage: {0} [-verbose] [-force] [-help] "
          "-configuration-path:<path to configuration file> "
          "-deprovision[+user]|-register-service|-version|-daemon|-start|"
          "-run-exthandlers|-show-configuration|-collect-logs [-full]|-setup-firewall [-dst_ip=<IP> -uid=<UID> [-w/--wait]]"
          "").format(sys.argv[0])
    s += "\n"
    return s
260394
395
def start(conf_file_path=None):
    """
    Start agent daemon in a background process and set stdout/stderr to
    /dev/null

    :param conf_file_path: when not None, forwarded to the daemon as -configuration-path:<path>.

    The child is launched with Popen and is not waited for.
    """
    args = [sys.argv[0], '-daemon']
    if conf_file_path is not None:
        args.append('-configuration-path:{0}'.format(conf_file_path))

    # The handle is only needed while the child is being spawned; the with block closes this
    # process' copy as soon as Popen returns.
    with open(os.devnull, 'w') as devnull:
        subprocess.Popen(args, stdout=devnull, stderr=devnull)
407
271408
272if __name__ == '__main__' :409if __name__ == '__main__' :
273 main()410 main()
diff --git a/azurelinuxagent/common/AgentGlobals.py b/azurelinuxagent/common/AgentGlobals.py
274new file mode 100644411new file mode 100644
index 0000000..dbfda92
--- /dev/null
+++ b/azurelinuxagent/common/AgentGlobals.py
@@ -0,0 +1,39 @@
1# Microsoft Azure Linux Agent
2#
3# Copyright 2020 Microsoft Corporation
4#
5# Licensed under the Apache License, Version 2.0 (the "License");
6# you may not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9# http://www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an "AS IS" BASIS,
13# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
16#
17# Requires Python 2.6+ and Openssl 1.0+
18
19
class AgentGlobals(object):
    """
    This class is used for setting AgentGlobals which can be used all throughout the Agent.

    The values are stored as class attributes, so they are shared by every user of the class.
    """

    GUID_ZERO = "00000000-0000-0000-0000-000000000000"

    #
    # Some modules (e.g. telemetry) require an up-to-date container ID. We update this variable each time we
    # fetch the goal state.
    #
    _container_id = GUID_ZERO

    @staticmethod
    def get_container_id():
        """
        Return the container ID last stored with update_container_id(); GUID_ZERO if none was stored.
        """
        return AgentGlobals._container_id

    @staticmethod
    def update_container_id(container_id):
        """
        Store container_id as the current container ID. The value is not validated.
        """
        AgentGlobals._container_id = container_id
diff --git a/azurelinuxagent/common/agent_supported_feature.py b/azurelinuxagent/common/agent_supported_feature.py
0new file mode 10064440new file mode 100644
index 0000000..d7f93e2
--- /dev/null
+++ b/azurelinuxagent/common/agent_supported_feature.py
@@ -0,0 +1,122 @@
1# Copyright 2018 Microsoft Corporation
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7# http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14#
15# Requires Python 2.6+ and Openssl 1.0+
16#
17
18
class SupportedFeatureNames(object):
    """
    Enum defining the feature names for all the features that the agent supports
    """
    MultiConfig = "MultipleExtensionsPerHandler"
    ExtensionTelemetryPipeline = "ExtensionTelemetryPipeline"
    FastTrack = "FastTrack"
26
27
class AgentSupportedFeature(object):
    """
    Interface for defining the features that the Linux Guest Agent supports and for reporting whether
    they are supported back to CRP
    """

    def __init__(self, name, version="1.0", supported=False):
        """
        :param name: name of the feature
        :param version: version of the feature; defaults to "1.0"
        :param supported: whether the agent supports the feature; defaults to False
        """
        # Name-mangled attributes: the values are exposed only through the read-only properties below
        self.__name = name
        self.__version = version
        self.__supported = supported

    @property
    def name(self):
        return self.__name

    @property
    def version(self):
        return self.__version

    @property
    def is_supported(self):
        return self.__supported
49
50
class _MultiConfigFeature(AgentSupportedFeature):
    """
    Describes the SupportedFeatureNames.MultiConfig feature: version "1.0", supported.
    """

    __NAME = SupportedFeatureNames.MultiConfig
    __VERSION = "1.0"
    __SUPPORTED = True

    def __init__(self):
        super(_MultiConfigFeature, self).__init__(name=_MultiConfigFeature.__NAME,
                                                  version=_MultiConfigFeature.__VERSION,
                                                  supported=_MultiConfigFeature.__SUPPORTED)
61
62
class _ETPFeature(AgentSupportedFeature):
    """
    Describes the SupportedFeatureNames.ExtensionTelemetryPipeline feature: version "1.0", supported.
    """

    __NAME = SupportedFeatureNames.ExtensionTelemetryPipeline
    __VERSION = "1.0"
    __SUPPORTED = True

    def __init__(self):
        # The constants are read through the class name, as _MultiConfigFeature does
        super(_ETPFeature, self).__init__(name=_ETPFeature.__NAME,
                                          version=_ETPFeature.__VERSION,
                                          supported=_ETPFeature.__SUPPORTED)
73
74
# Features that the Agent supports and advertises to CRP, keyed by feature name
__CRP_ADVERTISED_FEATURES = {
    SupportedFeatureNames.MultiConfig: _MultiConfigFeature()
}


# Features that the Agent supports and advertises to Extensions, keyed by feature name
__EXTENSION_ADVERTISED_FEATURES = {
    SupportedFeatureNames.ExtensionTelemetryPipeline: _ETPFeature()
}
85
86
def get_supported_feature_by_name(feature_name):
    """
    Return the feature object registered under feature_name.

    The features advertised to CRP are searched first, then the features advertised to Extensions.

    :raises NotImplementedError: if feature_name is in neither collection
    """
    if feature_name in __CRP_ADVERTISED_FEATURES:
        return __CRP_ADVERTISED_FEATURES[feature_name]

    if feature_name in __EXTENSION_ADVERTISED_FEATURES:
        return __EXTENSION_ADVERTISED_FEATURES[feature_name]

    raise NotImplementedError("Feature with Name: {0} not found".format(feature_name))
95
96
def get_agent_supported_features_list_for_crp():
    """
    List of features that the GuestAgent currently supports (like FastTrack, MultiConfig, etc).
    We need to send this list as part of Status reporting to inform CRP of all the features the agent supports.
    :return: Dict containing all CRP supported features with the key as their names and the AgentFeature object as
             the value if they are supported by the Agent
        Eg: {
                MultipleExtensionsPerHandler: _MultiConfigFeature()
            }
    """
    # Entries whose is_supported is False are left out; a new dict is built on every call
    return dict((name, feature) for name, feature in __CRP_ADVERTISED_FEATURES.items() if feature.is_supported)
109
110
def get_agent_supported_features_list_for_extensions():
    """
    List of features that the GuestAgent currently supports (like Extension Telemetry Pipeline, etc) needed by Extensions.
    We need to send this list as environment variables when calling extension commands to inform Extensions of all the
    features the agent supports.
    :return: Dict containing all Extension supported features with the key as their names and the AgentFeature object as
             the value if the feature is supported by the Agent.
        Eg: {
                SupportedFeatureNames.ExtensionTelemetryPipeline: _ETPFeature()
            }
    """
    # Entries whose is_supported is False are left out; a new dict is built on every call
    return dict((name, feature) for name, feature in __EXTENSION_ADVERTISED_FEATURES.items() if feature.is_supported)
diff --git a/azurelinuxagent/common/cgroup.py b/azurelinuxagent/common/cgroup.py
index 2ad70c1..b2bf32f 100644
--- a/azurelinuxagent/common/cgroup.py
+++ b/azurelinuxagent/common/cgroup.py
@@ -13,47 +13,94 @@
13# limitations under the License.13# limitations under the License.
14#14#
15# Requires Python 2.6+ and Openssl 1.0+15# Requires Python 2.6+ and Openssl 1.0+
16
16import errno17import errno
17import os18import os
18import re19import re
20from datetime import timedelta
1921
20from azurelinuxagent.common import logger22from azurelinuxagent.common import logger, conf
21from azurelinuxagent.common.exception import CGroupsException23from azurelinuxagent.common.exception import CGroupsException
22from azurelinuxagent.common.future import ustr24from azurelinuxagent.common.future import ustr
23from azurelinuxagent.common.osutil import get_osutil25from azurelinuxagent.common.osutil import get_osutil
24from azurelinuxagent.common.utils import fileutil26from azurelinuxagent.common.utils import fileutil
2527
26re_user_system_times = re.compile(r'user (\d+)\nsystem (\d+)\n')28_REPORT_EVERY_HOUR = timedelta(hours=1)
29_DEFAULT_REPORT_PERIOD = timedelta(seconds=conf.get_cgroup_check_period())
2730
31AGENT_NAME_TELEMETRY = "walinuxagent.service" # Name used for telemetry; it needs to be consistent even if the name of the service changes
32AGENT_LOG_COLLECTOR = "azure-walinuxagent-logcollector"
2833
29class CGroupContollers(object):
30 CPU = "cpu"
31 MEMORY = "memory"
3234
class CounterNotFound(Exception):
    """
    Exception type declared by this module; it adds nothing to Exception.
    """
    pass
3337
34class CGroup(object):
35 @staticmethod
36 def create(cgroup_path, controller, extension_name):
37 """
38 Factory method to create the correct CGroup.
39 """
40 if controller == CGroupContollers.CPU:
41 return CpuCgroup(extension_name, cgroup_path)
42 if controller == CGroupContollers.MEMORY:
43 return MemoryCgroup(extension_name, cgroup_path)
44 raise CGroupsException('CGroup controller {0} is not supported'.format(controller))
4538
class MetricValue(object):

    """
    Class for defining all the required metric fields to send telemetry.

    The fields are set once in the constructor and exposed through read-only properties.
    """

    def __init__(self, category, counter, instance, value, report_period=_DEFAULT_REPORT_PERIOD):
        """
        :param category: metric category (the MetricsCategory constants are defined for this)
        :param counter: counter name (the MetricsCounter constants are defined for this)
        :param instance: name of the instance the value belongs to
        :param value: the value of the metric
        :param report_period: defaults to _DEFAULT_REPORT_PERIOD, which is evaluated once, when the
                              class is defined
        """
        self._category = category
        self._counter = counter
        self._instance = instance
        self._value = value
        self._report_period = report_period

    @property
    def category(self):
        return self._category

    @property
    def counter(self):
        return self._counter

    @property
    def instance(self):
        return self._instance

    @property
    def value(self):
        return self._value

    @property
    def report_period(self):
        return self._report_period
71
72
class MetricsCategory(object):
    """
    Names of the metric categories.
    """
    MEMORY_CATEGORY = "Memory"
    CPU_CATEGORY = "CPU"
76
77
class MetricsCounter(object):
    """
    Names of the metric counters.
    """
    PROCESSOR_PERCENT_TIME = "% Processor Time"
    TOTAL_MEM_USAGE = "Total Memory Usage"
    MAX_MEM_USAGE = "Max Memory Usage"
    THROTTLED_TIME = "Throttled Time"
    SWAP_MEM_USAGE = "Swap Memory Usage"
    AVAILABLE_MEM = "Available MBytes"
    USED_MEM = "Used MBytes"
86
87
88re_user_system_times = re.compile(r'user (\d+)\nsystem (\d+)\n')
89
90
91class CGroup(object):
92 def __init__(self, name, cgroup_path):
47 """93 """
48 Initialize _data collection for the Memory controller94 Initialize _data collection for the Memory controller
49 :param: name: Name of the CGroup95 :param: name: Name of the CGroup
50 :param: cgroup_path: Path of the controller96 :param: cgroup_path: Path of the controller
51 :param: controller_type:
52 :return:97 :return:
53 """98 """
54 self.name = name99 self.name = name
55 self.path = cgroup_path100 self.path = cgroup_path
56 self.controller = controller_type101
102 def __str__(self):
103 return "{0} [{1}]".format(self.name, self.path)
57104
58 def _get_cgroup_file(self, file_name):105 def _get_cgroup_file(self, file_name):
59 return os.path.join(self.path, file_name)106 return os.path.join(self.path, file_name)
@@ -89,7 +136,7 @@ class CGroup(object):
89 logger.error("File {0} is empty but should not be".format(parameter_filename))136 logger.error("File {0} is empty but should not be".format(parameter_filename))
90 raise CGroupsException("File {0} is empty but should not be".format(parameter_filename))137 raise CGroupsException("File {0} is empty but should not be".format(parameter_filename))
91 except Exception as e:138 except Exception as e:
92 if isinstance(e, (IOError, OSError)) and e.errno == errno.ENOENT:139 if isinstance(e, (IOError, OSError)) and e.errno == errno.ENOENT: # pylint: disable=E1101
93 raise e140 raise e
94 parameter_filename = self._get_cgroup_file(parameter_name)141 parameter_filename = self._get_cgroup_file(parameter_name)
95 raise CGroupsException("Exception while attempting to read {0}".format(parameter_filename), e)142 raise CGroupsException("Exception while attempting to read {0}".format(parameter_filename), e)
@@ -114,42 +161,26 @@ class CGroup(object):
114 ' Internal error: {1}'.format(self.path, ustr(e)))161 ' Internal error: {1}'.format(self.path, ustr(e)))
115 return False162 return False
116163
117 def get_tracked_processes(self):164 def get_tracked_metrics(self, **_):
118 """165 """
119 :return: List of Str (Pids). Will return an empty string if we couldn't fetch any tracked processes.166 Retrieves the current value of the metrics tracked for this cgroup and returns them as an array.
167
168 Note: Agent won't track the metrics if the current cpu ticks less than previous value and returns empty array.
120 """169 """
121 procs = []170 raise NotImplementedError()
122 try:
123 procs = self._get_parameters("cgroup.procs")
124 except (IOError, OSError) as e:
125 if e.errno == errno.ENOENT:
126 # only suppressing file not found exceptions.
127 pass
128 else:
129 logger.periodic_warn(logger.EVERY_HALF_HOUR,
130 'Could not get list of procs from "cgroup.procs" file in the cgroup: {0}.'
131 ' Internal error: {1}'.format(self.path, ustr(e)))
132 except CGroupsException as e:
133 logger.periodic_warn(logger.EVERY_HALF_HOUR,
134 'Could not get list of tasks from "cgroup.procs" file in the cgroup: {0}.'
135 ' Internal error: {1}'.format(self.path, ustr(e)))
136 return procs
137171
138172
139class CpuCgroup(CGroup):173class CpuCgroup(CGroup):
140 def __init__(self, name, cgroup_path):174 def __init__(self, name, cgroup_path):
141 super(CpuCgroup, self).__init__(name, cgroup_path, CGroupContollers.CPU)175 super(CpuCgroup, self).__init__(name, cgroup_path)
142176
143 self._osutil = get_osutil()177 self._osutil = get_osutil()
144 self._previous_cgroup_cpu = None178 self._previous_cgroup_cpu = None
145 self._previous_system_cpu = None179 self._previous_system_cpu = None
146 self._current_cgroup_cpu = None180 self._current_cgroup_cpu = None
147 self._current_system_cpu = None181 self._current_system_cpu = None
148182 self._previous_throttled_time = None
149 def __str__(self):183 self._current_throttled_time = None
150 return "cgroup: Name: {0}, cgroup_path: {1}; Controller: {2}".format(
151 self.name, self.path, self.controller
152 )
153184
154 def _get_cpu_ticks(self, allow_no_such_file_or_directory_error=False):185 def _get_cpu_ticks(self, allow_no_such_file_or_directory_error=False):
155 """186 """
@@ -159,24 +190,54 @@ class CpuCgroup(CGroup):
159 returns 0; this is useful when the function can be called before the cgroup has been created.190 returns 0; this is useful when the function can be called before the cgroup has been created.
160 """191 """
161 try:192 try:
162 cpu_stat = self._get_file_contents('cpuacct.stat')193 cpuacct_stat = self._get_file_contents('cpuacct.stat')
163 except Exception as e:194 except Exception as e:
164 if not isinstance(e, (IOError, OSError)) or e.errno != errno.ENOENT:195 if not isinstance(e, (IOError, OSError)) or e.errno != errno.ENOENT: # pylint: disable=E1101
165 raise CGroupsException("Failed to read cpuacct.stat: {0}".format(ustr(e)))196 raise CGroupsException("Failed to read cpuacct.stat: {0}".format(ustr(e)))
166 if not allow_no_such_file_or_directory_error:197 if not allow_no_such_file_or_directory_error:
167 raise e198 raise e
168 cpu_stat = None199 cpuacct_stat = None
169200
170 cpu_ticks = 0201 cpu_ticks = 0
171202
172 if cpu_stat is not None:203 if cpuacct_stat is not None:
173 match = re_user_system_times.match(cpu_stat)204 #
205 # Sample file:
206 # # cat /sys/fs/cgroup/cpuacct/azure.slice/walinuxagent.service/cpuacct.stat
207 # user 10190
208 # system 3160
209 #
210 match = re_user_system_times.match(cpuacct_stat)
174 if not match:211 if not match:
175 raise CGroupsException("The contents of {0} are invalid: {1}".format(self._get_cgroup_file('cpuacct.stat'), cpu_stat))212 raise CGroupsException(
213 "The contents of {0} are invalid: {1}".format(self._get_cgroup_file('cpuacct.stat'), cpuacct_stat))
176 cpu_ticks = int(match.groups()[0]) + int(match.groups()[1])214 cpu_ticks = int(match.groups()[0]) + int(match.groups()[1])
177215
178 return cpu_ticks216 return cpu_ticks
179217
218 def get_throttled_time(self):
219 try:
220 with open(os.path.join(self.path, 'cpu.stat')) as cpu_stat:
221 #
222 # Sample file:
223 #
224 # # cat /sys/fs/cgroup/cpuacct/azure.slice/walinuxagent.service/cpu.stat
225 # nr_periods 51660
226 # nr_throttled 19461
227 # throttled_time 1529590856339
228 #
229 for line in cpu_stat:
230 match = re.match(r'throttled_time\s+(\d+)', line)
231 if match is not None:
232 return int(match.groups()[0])
233 raise Exception("Cannot find throttled_time")
234 except (IOError, OSError) as e:
235 if e.errno == errno.ENOENT:
236 return 0
237 raise CGroupsException("Failed to read cpu.stat: {0}".format(ustr(e)))
238 except Exception as e:
239 raise CGroupsException("Failed to read cpu.stat: {0}".format(ustr(e)))
240
180 def _cpu_usage_initialized(self):241 def _cpu_usage_initialized(self):
181 return self._current_cgroup_cpu is not None and self._current_system_cpu is not None242 return self._current_cgroup_cpu is not None and self._current_system_cpu is not None
182243
@@ -188,13 +249,14 @@ class CpuCgroup(CGroup):
188 raise CGroupsException("initialize_cpu_usage() should be invoked only once")249 raise CGroupsException("initialize_cpu_usage() should be invoked only once")
189 self._current_cgroup_cpu = self._get_cpu_ticks(allow_no_such_file_or_directory_error=True)250 self._current_cgroup_cpu = self._get_cpu_ticks(allow_no_such_file_or_directory_error=True)
190 self._current_system_cpu = self._osutil.get_total_cpu_ticks_since_boot()251 self._current_system_cpu = self._osutil.get_total_cpu_ticks_since_boot()
252 self._current_throttled_time = self.get_throttled_time()
191253
192 def get_cpu_usage(self):254 def get_cpu_usage(self):
193 """255 """
194 Computes the CPU used by the cgroup since the last call to this function.256 Computes the CPU used by the cgroup since the last call to this function.
195257
196 The usage is measured as a percentage of utilization of all cores in the system. For example,258 The usage is measured as a percentage of utilization of 1 core in the system. For example,
197 using 1 core at 100% on a 4-core system would be reported as 25%.259 using 1 core all of the time on a 4-core system would be reported as 100%.
198260
199 NOTE: initialize_cpu_usage() must be invoked before calling get_cpu_usage()261 NOTE: initialize_cpu_usage() must be invoked before calling get_cpu_usage()
200 """262 """
@@ -209,53 +271,122 @@ class CpuCgroup(CGroup):
209 cgroup_delta = self._current_cgroup_cpu - self._previous_cgroup_cpu271 cgroup_delta = self._current_cgroup_cpu - self._previous_cgroup_cpu
210 system_delta = max(1, self._current_system_cpu - self._previous_system_cpu)272 system_delta = max(1, self._current_system_cpu - self._previous_system_cpu)
211273
212 return round(100.0 * float(cgroup_delta) / float(system_delta), 3)274 return round(100.0 * self._osutil.get_processor_cores() * float(cgroup_delta) / float(system_delta), 3)
275
276 def get_cpu_throttled_time(self, read_previous_throttled_time=True):
277 """
278 Computes the throttled time (in seconds) since the last call to this function.
279 NOTE: initialize_cpu_usage() must be invoked before calling this function
280 Compute only current throttled time if read_previous_throttled_time set to False
281 """
282 if not read_previous_throttled_time:
283 return float(self.get_throttled_time() / 1E9)
284
285 if not self._cpu_usage_initialized():
286 raise CGroupsException(
287 "initialize_cpu_usage() must be invoked before the first call to get_throttled_time()")
288
289 self._previous_throttled_time = self._current_throttled_time
290 self._current_throttled_time = self.get_throttled_time()
291
292 return float(self._current_throttled_time - self._previous_throttled_time) / 1E9
293
294 def get_tracked_metrics(self, **kwargs):
295 tracked = []
296 cpu_usage = self.get_cpu_usage()
297 if cpu_usage >= float(0):
298 tracked.append(
299 MetricValue(MetricsCategory.CPU_CATEGORY, MetricsCounter.PROCESSOR_PERCENT_TIME, self.name, cpu_usage))
300
301 if 'track_throttled_time' in kwargs and kwargs['track_throttled_time']:
302 throttled_time = self.get_cpu_throttled_time()
303 if cpu_usage >= float(0) and throttled_time >= float(0):
304 tracked.append(
305 MetricValue(MetricsCategory.CPU_CATEGORY, MetricsCounter.THROTTLED_TIME, self.name, throttled_time))
306
307 return tracked
213308
214309
215class MemoryCgroup(CGroup):310class MemoryCgroup(CGroup):
216 def __init__(self, name, cgroup_path):311 def __init__(self, name, cgroup_path):
312 super(MemoryCgroup, self).__init__(name, cgroup_path)
313
314 self._counter_not_found_error_count = 0
315
316 def _get_memory_stat_counter(self, counter_name):
317 try:
318 with open(os.path.join(self.path, 'memory.stat')) as memory_stat:
319 # cat /sys/fs/cgroup/memory/azure.slice/memory.stat
320 # cache 67178496
321 # rss 42340352
322 # rss_huge 6291456
323 # swap 0
324 for line in memory_stat:
325 re_memory_counter = r'{0}\s+(\d+)'.format(counter_name)
326 match = re.match(re_memory_counter, line)
327 if match is not None:
328 return int(match.groups()[0])
329 except (IOError, OSError) as e:
330 if e.errno == errno.ENOENT:
331 raise
332 raise CGroupsException("Failed to read memory.stat: {0}".format(ustr(e)))
333 except Exception as e:
334 raise CGroupsException("Failed to read memory.stat: {0}".format(ustr(e)))
335
336 raise CounterNotFound("Cannot find counter: {0}".format(counter_name))
337
338 def get_memory_usage(self):
217 """339 """
218 Initialize _data collection for the Memory controller340 Collect RSS+CACHE from memory.stat cgroup.
219341
220 :return: MemoryCgroup342 :return: Memory usage in bytes
343 :rtype: int
221 """344 """
222 super(MemoryCgroup, self).__init__(name, cgroup_path, CGroupContollers.MEMORY)
223345
224 def __str__(self):346 cache = self._get_memory_stat_counter("cache")
225 return "cgroup: Name: {0}, cgroup_path: {1}; Controller: {2}".format(347 rss = self._get_memory_stat_counter("rss")
226 self.name, self.path, self.controller348 return cache + rss
227 )
228349
229 def get_memory_usage(self):350 def try_swap_memory_usage(self):
230 """351 """
231 Collect memory.usage_in_bytes from the cgroup.352 Collect SWAP from memory.stat cgroup.
232353
233 :return: Memory usage in bytes354 :return: Memory usage in bytes
234 :rtype: int355 :rtype: int
356 Note: stat file is the only place to get the SWAP since other swap related file memory.memsw.usage_in_bytes is for total Memory+SWAP.
235 """357 """
236 usage = None
237 try:358 try:
238 usage = self._get_parameters('memory.usage_in_bytes', first_line_only=True)359 return self._get_memory_stat_counter("swap")
239 except Exception as e:360 except CounterNotFound as e:
240 if isinstance(e, (IOError, OSError)) and e.errno == errno.ENOENT:361 if self._counter_not_found_error_count < 1:
241 raise362 logger.periodic_info(logger.EVERY_HALF_HOUR,
242 raise CGroupsException("Exception while attempting to read {0}".format("memory.usage_in_bytes"), e)363 '{0} from "memory.stat" file in the cgroup: {1}---[Note: This log for informational purpose only and can be ignored]'.format(ustr(e), self.path))
243364 self._counter_not_found_error_count += 1
244 return int(usage)365 return 0
245366
246 def get_max_memory_usage(self):367 def get_max_memory_usage(self):
247 """368 """
248 Collect memory.usage_in_bytes from the cgroup.369 Collect memory.max_usage_in_bytes from the cgroup.
249370
250 :return: Memory usage in bytes371 :return: Memory usage in bytes
251 :rtype: int372 :rtype: int
252 """373 """
253 usage = None374 usage = 0
254 try:375 try:
255 usage = self._get_parameters('memory.max_usage_in_bytes', first_line_only=True)376 usage = int(self._get_parameters('memory.max_usage_in_bytes', first_line_only=True))
256 except Exception as e:377 except Exception as e:
257 if isinstance(e, (IOError, OSError)) and e.errno == errno.ENOENT:378 if isinstance(e, (IOError, OSError)) and e.errno == errno.ENOENT: # pylint: disable=E1101
258 raise379 raise
259 raise CGroupsException("Exception while attempting to read {0}".format("memory.usage_in_bytes"), e)380 raise CGroupsException("Exception while attempting to read {0}".format("memory.max_usage_in_bytes"), e)
260381
261 return int(usage)382 return usage
383
384 def get_tracked_metrics(self, **_):
385 return [
386 MetricValue(MetricsCategory.MEMORY_CATEGORY, MetricsCounter.TOTAL_MEM_USAGE, self.name,
387 self.get_memory_usage()),
388 MetricValue(MetricsCategory.MEMORY_CATEGORY, MetricsCounter.MAX_MEM_USAGE, self.name,
389 self.get_max_memory_usage(), _REPORT_EVERY_HOUR),
390 MetricValue(MetricsCategory.MEMORY_CATEGORY, MetricsCounter.SWAP_MEM_USAGE, self.name,
391 self.try_swap_memory_usage(), _REPORT_EVERY_HOUR)
392 ]
diff --git a/azurelinuxagent/common/cgroupapi.py b/azurelinuxagent/common/cgroupapi.py
index c671a2e..ca0ef3b 100644
--- a/azurelinuxagent/common/cgroupapi.py
+++ b/azurelinuxagent/common/cgroupapi.py
@@ -1,3 +1,4 @@
1# -*- coding: utf-8 -*-
1# Copyright 2018 Microsoft Corporation2# Copyright 2018 Microsoft Corporation
2#3#
3# Licensed under the Apache License, Version 2.0 (the "License");4# Licensed under the Apache License, Version 2.0 (the "License");
@@ -14,102 +15,65 @@
14#15#
15# Requires Python 2.6+ and Openssl 1.0+16# Requires Python 2.6+ and Openssl 1.0+
1617
17import errno
18import os18import os
19import re
19import shutil20import shutil
20import subprocess21import subprocess
22import threading
21import uuid23import uuid
2224
23from azurelinuxagent.common import logger25from azurelinuxagent.common import logger
24from azurelinuxagent.common.cgroup import CGroup26from azurelinuxagent.common.cgroup import CpuCgroup, MemoryCgroup
25from azurelinuxagent.common.cgroupstelemetry import CGroupsTelemetry27from azurelinuxagent.common.cgroupstelemetry import CGroupsTelemetry
26from azurelinuxagent.common.conf import get_agent_pid_file_path28from azurelinuxagent.common.conf import get_agent_pid_file_path
27from azurelinuxagent.common.event import add_event, WALAEventOperation
28from azurelinuxagent.common.exception import CGroupsException, ExtensionErrorCodes, ExtensionError, \29from azurelinuxagent.common.exception import CGroupsException, ExtensionErrorCodes, ExtensionError, \
29 ExtensionOperationError30 ExtensionOperationError
30from azurelinuxagent.common.future import ustr31from azurelinuxagent.common.future import ustr
32from azurelinuxagent.common.osutil import systemd
31from azurelinuxagent.common.utils import fileutil, shellutil33from azurelinuxagent.common.utils import fileutil, shellutil
32from azurelinuxagent.common.utils.extensionprocessutil import handle_process_completion, read_output34from azurelinuxagent.common.utils.extensionprocessutil import handle_process_completion, read_output, \
33from azurelinuxagent.common.version import AGENT_NAME, CURRENT_VERSION35 TELEMETRY_MESSAGE_MAX_LEN
36from azurelinuxagent.common.utils.flexible_version import FlexibleVersion
37from azurelinuxagent.common.version import get_distro
3438
35CGROUPS_FILE_SYSTEM_ROOT = '/sys/fs/cgroup'39CGROUPS_FILE_SYSTEM_ROOT = '/sys/fs/cgroup'
36CGROUP_CONTROLLERS = ["cpu", "memory"]40CGROUP_CONTROLLERS = ["cpu", "memory"]
37VM_AGENT_CGROUP_NAME = "walinuxagent.service"41EXTENSION_SLICE_PREFIX = "azure-vmextensions"
38EXTENSIONS_ROOT_CGROUP_NAME = "walinuxagent.extensions"
39UNIT_FILES_FILE_SYSTEM_PATH = "/etc/systemd/system"
4042
4143
42class CGroupsApi(object):44class SystemdRunError(CGroupsException):
43 """45 """
44 Interface for the cgroups API46 Raised when systemd-run fails
45 """47 """
46 def create_agent_cgroups(self):
47 raise NotImplementedError()
48
49 def create_extension_cgroups_root(self):
50 raise NotImplementedError()
51
52 def create_extension_cgroups(self, extension_name):
53 raise NotImplementedError()
54
55 def remove_extension_cgroups(self, extension_name):
56 raise NotImplementedError()
5748
58 def get_extension_cgroups(self, extension_name):49 def __init__(self, msg=None):
59 raise NotImplementedError()50 super(SystemdRunError, self).__init__(msg)
6051
61 def start_extension_command(self, extension_name, command, timeout, shell, cwd, env, stdout, stderr, error_code):
62 raise NotImplementedError()
6352
64 def cleanup_legacy_cgroups(self):53class CGroupsApi(object):
65 raise NotImplementedError()54 @staticmethod
55 def cgroups_supported():
56 distro_info = get_distro()
57 distro_name = distro_info[0]
58 try:
59 distro_version = FlexibleVersion(distro_info[1])
60 except ValueError:
61 return False
62 return distro_name.lower() == 'ubuntu' and distro_version.major >= 16
6663
67 @staticmethod64 @staticmethod
68 def track_cgroups(extension_cgroups):65 def track_cgroups(extension_cgroups):
69 try:66 try:
70 for cgroup in extension_cgroups:67 for cgroup in extension_cgroups:
71 CGroupsTelemetry.track_cgroup(cgroup)68 CGroupsTelemetry.track_cgroup(cgroup)
72 except Exception as e:69 except Exception as exception:
73 logger.warn("Cannot add cgroup '{0}' to tracking list; resource usage will not be tracked. "70 logger.warn("Cannot add cgroup '{0}' to tracking list; resource usage will not be tracked. "
74 "Error: {1}".format(cgroup.path, ustr(e)))71 "Error: {1}".format(cgroup.path, ustr(exception)))
7572
76 @staticmethod73 @staticmethod
77 def _get_extension_cgroup_name(extension_name):74 def get_processes_in_cgroup(cgroup_path):
78 # Since '-' is used as a separator in systemd unit names, we replace it with '_' to prevent side-effects.75 with open(os.path.join(cgroup_path, "cgroup.procs"), "r") as cgroup_procs:
79 return extension_name.replace('-', '_')76 return [int(pid) for pid in cgroup_procs.read().split()]
80
81 @staticmethod
82 def create():
83 """
84 Factory method to create the correct API for the current platform
85 """
86 return SystemdCgroupsApi() if CGroupsApi._is_systemd() else FileSystemCgroupsApi()
87
88 @staticmethod
89 def _is_systemd():
90 """
91 Determine if systemd is managing system services; the implementation follows the same strategy as, for example,
92 sd_booted() in libsystemd, or /usr/sbin/service
93 """
94 return os.path.exists('/run/systemd/system/')
95
96 @staticmethod
97 def _foreach_controller(operation, message):
98 """
99 Executes the given operation on all controllers that need to be tracked; outputs 'message' if the controller
100 is not mounted or if an error occurs in the operation
101 :return: Returns a list of error messages or an empty list if no errors occurred
102 """
103 mounted_controllers = os.listdir(CGROUPS_FILE_SYSTEM_ROOT)
104
105 for controller in CGROUP_CONTROLLERS:
106 try:
107 if controller not in mounted_controllers:
108 logger.warn('Cgroup controller "{0}" is not mounted. {1}', controller, message)
109 else:
110 operation(controller)
111 except Exception as e:
112 logger.warn('Error in cgroup controller "{0}": {1}. {2}', controller, ustr(e), message)
11377
114 @staticmethod78 @staticmethod
115 def _foreach_legacy_cgroup(operation):79 def _foreach_legacy_cgroup(operation):
@@ -138,429 +102,250 @@ class CGroupsApi(object):
138102
139 if os.path.exists(procs_file):103 if os.path.exists(procs_file):
140 procs_file_contents = fileutil.read_file(procs_file).strip()104 procs_file_contents = fileutil.read_file(procs_file).strip()
141 daemon_pid = fileutil.read_file(get_agent_pid_file_path()).strip()105 daemon_pid = CGroupsApi.get_daemon_pid()
142106
143 if daemon_pid in procs_file_contents:107 if ustr(daemon_pid) in procs_file_contents:
144 operation(controller, daemon_pid)108 operation(controller, daemon_pid)
145 finally:109 finally:
146 for _, cgroup in legacy_cgroups:110 for _, cgroup in legacy_cgroups:
147 logger.info('Removing {0}', cgroup)111 logger.info('Removing {0}', cgroup)
148 shutil.rmtree(cgroup, ignore_errors=True)112 shutil.rmtree(cgroup, ignore_errors=True)
113 return len(legacy_cgroups)
149114
150
151class FileSystemCgroupsApi(CGroupsApi):
152 """
153 Cgroups interface using the cgroups file system directly
154 """
155 @staticmethod115 @staticmethod
156 def _try_mkdir(path):116 def get_daemon_pid():
157 """117 return int(fileutil.read_file(get_agent_pid_file_path()).strip())
158 Try to create a directory, recursively. If it already exists as such, do nothing. Raise the appropriate
159 exception should an error occur.
160
161 :param path: str
162 """
163 if not os.path.isdir(path):
164 try:
165 os.makedirs(path, 0o755)
166 except OSError as e:
167 if e.errno == errno.EEXIST:
168 if not os.path.isdir(path):
169 raise CGroupsException("Create directory for cgroup {0}: normal file already exists with that name".format(path))
170 else:
171 pass # There was a race to create the directory, but it's there now, and that's fine
172 elif e.errno == errno.EACCES:
173 # This is unexpected, as the agent runs as root
174 raise CGroupsException("Create directory for cgroup {0}: permission denied".format(path))
175 else:
176 raise
177
178 @staticmethod
179 def _get_agent_cgroup_path(controller):
180 return os.path.join(CGROUPS_FILE_SYSTEM_ROOT, controller, VM_AGENT_CGROUP_NAME)
181
182 @staticmethod
183 def _get_extension_cgroups_root_path(controller):
184 return os.path.join(CGROUPS_FILE_SYSTEM_ROOT, controller, EXTENSIONS_ROOT_CGROUP_NAME)
185
186 def _get_extension_cgroup_path(self, controller, extension_name):
187 extensions_root = self._get_extension_cgroups_root_path(controller)
188
189 if not os.path.exists(extensions_root):
190 logger.warn("Root directory {0} does not exist.".format(extensions_root))
191
192 cgroup_name = self._get_extension_cgroup_name(extension_name)
193118
194 return os.path.join(extensions_root, cgroup_name)
195119
196 def _create_extension_cgroup(self, controller, extension_name):120class SystemdCgroupsApi(CGroupsApi):
197 return CGroup.create(self._get_extension_cgroup_path(controller, extension_name), controller, extension_name)121 """
122 Cgroups interface via systemd
123 """
198124
199 @staticmethod125 def __init__(self):
200 def _add_process_to_cgroup(pid, cgroup_path):126 self._cgroup_mountpoints = None
201 tasks_file = os.path.join(cgroup_path, 'cgroup.procs')127 self._agent_unit_name = None
202 fileutil.append_file(tasks_file, "{0}\n".format(pid))128 self._systemd_run_commands = []
203 logger.info("Added PID {0} to cgroup {1}".format(pid, cgroup_path))129 self._systemd_run_commands_lock = threading.RLock()
204130
205 def cleanup_legacy_cgroups(self):131 def get_systemd_run_commands(self):
206 """132 """
207 Previous versions of the daemon (2.2.31-2.2.40) wrote their PID to /sys/fs/cgroup/{cpu,memory}/WALinuxAgent/WALinuxAgent;133 Returns a list of the systemd-run commands currently running (given as PIDs)
208 starting from version 2.2.41 we track the agent service in walinuxagent.service instead of WALinuxAgent/WALinuxAgent. This
209 method moves the daemon's PID from the legacy cgroups to the newer cgroups.
210 """134 """
211 def move_daemon_pid(controller, daemon_pid):135 with self._systemd_run_commands_lock:
212 new_path = FileSystemCgroupsApi._get_agent_cgroup_path(controller)136 return self._systemd_run_commands[:]
213 logger.info("Writing daemon's PID ({0}) to {1}", daemon_pid, new_path)
214 fileutil.append_file(os.path.join(new_path, "cgroup.procs"), daemon_pid)
215 msg = "Moved daemon's PID from legacy cgroup to {0}".format(new_path)
216 add_event(AGENT_NAME, version=CURRENT_VERSION, op=WALAEventOperation.CGroupsCleanUp, is_success=True, message=msg)
217137
218 CGroupsApi._foreach_legacy_cgroup(move_daemon_pid)138 def get_cgroup_mount_points(self):
219
220 def create_agent_cgroups(self):
221 """139 """
222 Creates a cgroup for the VM Agent in each of the controllers we are tracking; returns the created cgroups.140 Returns a tuple with the mount points for the cpu and memory controllers; the values can be None
141 if the corresponding controller is not mounted
223 """142 """
224 cgroups = []143 # the output of mount is similar to
225144 # $ mount -t cgroup
226 pid = int(os.getpid())145 # cgroup on /sys/fs/cgroup/systemd type cgroup (rw,nosuid,nodev,noexec,relatime,xattr,name=systemd)
227146 # cgroup on /sys/fs/cgroup/cpu,cpuacct type cgroup (rw,nosuid,nodev,noexec,relatime,cpu,cpuacct)
228 def create_cgroup(controller):147 # cgroup on /sys/fs/cgroup/memory type cgroup (rw,nosuid,nodev,noexec,relatime,memory)
229 path = FileSystemCgroupsApi._get_agent_cgroup_path(controller)148 # etc
230149 #
231 if not os.path.isdir(path):150 if self._cgroup_mountpoints is None:
232 FileSystemCgroupsApi._try_mkdir(path)151 cpu = None
233 logger.info("Created cgroup {0}".format(path))152 memory = None
234153 for line in shellutil.run_command(['mount', '-t', 'cgroup']).splitlines():
235 self._add_process_to_cgroup(pid, path)154 match = re.search(r'on\s+(?P<path>/\S+(memory|cpuacct))\s', line)
236155 if match is not None:
237 cgroups.append(CGroup.create(path, controller, VM_AGENT_CGROUP_NAME))156 path = match.group('path')
238157 if 'cpuacct' in path:
239 self._foreach_controller(create_cgroup, 'Failed to create a cgroup for the VM Agent; resource usage will not be tracked')158 cpu = path
240159 else:
241 if len(cgroups) == 0:160 memory = path
242 raise CGroupsException("Failed to create any cgroup for the VM Agent")161 self._cgroup_mountpoints = {'cpu': cpu, 'memory': memory}
243162
244 return cgroups163 return self._cgroup_mountpoints['cpu'], self._cgroup_mountpoints['memory']
245164
246 def create_extension_cgroups_root(self):165 @staticmethod
166 def get_process_cgroup_relative_paths(process_id):
247 """167 """
248 Creates the directory within the cgroups file system that will contain the cgroups for the extensions.168 Returns a tuple with the path of the cpu and memory cgroups for the given process (relative to the mount point of the corresponding
169 controller).
170 The 'process_id' can be a numeric PID or the string "self" for the current process.
171 The values returned can be None if the process is not in a cgroup for that controller (e.g. the controller is not mounted).
249 """172 """
250 def create_cgroup(controller):173 # The contents of the file are similar to
251 path = self._get_extension_cgroups_root_path(controller)174 # # cat /proc/1218/cgroup
252175 # 10:memory:/system.slice/walinuxagent.service
253 if not os.path.isdir(path):176 # 3:cpu,cpuacct:/system.slice/walinuxagent.service
254 FileSystemCgroupsApi._try_mkdir(path)177 # etc
255 logger.info("Created {0}".format(path))178 cpu_path = None
179 memory_path = None
180 for line in fileutil.read_file("/proc/{0}/cgroup".format(process_id)).splitlines():
181 match = re.match(r'\d+:(?P<controller>(memory|.*cpuacct.*)):(?P<path>.+)', line)
182 if match is not None:
183 controller = match.group('controller')
184 path = match.group('path').lstrip('/') if match.group('path') != '/' else None
185 if controller == 'memory':
186 memory_path = path
187 else:
188 cpu_path = path
256189
257 self._foreach_controller(create_cgroup, 'Failed to create a root cgroup for extensions')190 return cpu_path, memory_path
258191
259 def create_extension_cgroups(self, extension_name):192 def get_process_cgroup_paths(self, process_id):
260 """193 """
261 Creates a cgroup for the given extension in each of the controllers we are tracking; returns the created cgroups.194 Returns a tuple with the path of the cpu and memory cgroups for the given process. The 'process_id' can be a numeric PID or the string "self" for the current process.
195 The values returned can be None if the process is not in a cgroup for that controller (e.g. the controller is not mounted).
262 """196 """
263 cgroups = []197 cpu_cgroup_relative_path, memory_cgroup_relative_path = self.get_process_cgroup_relative_paths(process_id)
264
265 def create_cgroup(controller):
266 cgroup = self._create_extension_cgroup(controller, extension_name)
267198
268 if not os.path.isdir(cgroup.path):199 cpu_mount_point, memory_mount_point = self.get_cgroup_mount_points()
269 FileSystemCgroupsApi._try_mkdir(cgroup.path)
270 logger.info("Created cgroup {0}".format(cgroup.path))
271200
272 cgroups.append(cgroup)201 cpu_cgroup_path = os.path.join(cpu_mount_point, cpu_cgroup_relative_path) \
202 if cpu_mount_point is not None and cpu_cgroup_relative_path is not None else None
273203
274 self._foreach_controller(create_cgroup, 'Failed to create a cgroup for extension {0}'.format(extension_name))204 memory_cgroup_path = os.path.join(memory_mount_point, memory_cgroup_relative_path) \
205 if memory_mount_point is not None and memory_cgroup_relative_path is not None else None
275206
276 return cgroups207 return cpu_cgroup_path, memory_cgroup_path
277208
278 def remove_extension_cgroups(self, extension_name):209 def get_unit_cgroup_paths(self, unit_name):
279 """210 """
280 Deletes the cgroups for the given extension.211 Returns a tuple with the path of the cpu and memory cgroups for the given unit.
212 The values returned can be None if the controller is not mounted.
213 Ex: ControlGroup=/azure.slice/walinuxagent.service
214 controlgroup_path[1:] = azure.slice/walinuxagent.service
281 """215 """
282 def remove_cgroup(controller):216 controlgroup_path = systemd.get_unit_property(unit_name, "ControlGroup")
283 path = self._get_extension_cgroup_path(controller, extension_name)217 cpu_mount_point, memory_mount_point = self.get_cgroup_mount_points()
284
285 if os.path.exists(path):
286 try:
287 os.rmdir(path)
288 logger.info('Deleted cgroup "{0}".'.format(path))
289 except OSError as exception:
290 if exception.errno == 16: # [Errno 16] Device or resource busy
291 logger.warn('CGroup "{0}" still has active tasks; will not remove it.'.format(path))
292
293 self._foreach_controller(remove_cgroup, 'Failed to delete cgroups for extension {0}'.format(extension_name))
294
295 def get_extension_cgroups(self, extension_name):
296 """
297 Returns the cgroups for the given extension.
298 """
299
300 cgroups = []
301218
302 def get_cgroup(controller):219 cpu_cgroup_path = os.path.join(cpu_mount_point, controlgroup_path[1:]) \
303 cgroup = self._create_extension_cgroup(controller, extension_name)220 if cpu_mount_point is not None else None
304 cgroups.append(cgroup)
305221
306 self._foreach_controller(get_cgroup, 'Failed to retrieve cgroups for extension {0}'.format(extension_name))222 memory_cgroup_path = os.path.join(memory_mount_point, controlgroup_path[1:]) \
223 if memory_mount_point is not None else None
307224
308 return cgroups225 return cpu_cgroup_path, memory_cgroup_path
309226
310 def start_extension_command(self, extension_name, command, timeout, shell, cwd, env, stdout, stderr,227 @staticmethod
311 error_code=ExtensionErrorCodes.PluginUnknownFailure):228 def get_cgroup2_controllers():
312 """229 """
313 Starts a command (install/enable/etc) for an extension and adds the command's PID to the extension's cgroup230 Returns a tuple with the mount point for the cgroups v2 controllers, and the currently mounted controllers;
314 :param extension_name: The extension executing the command231 either value can be None if cgroups v2 or its controllers are not mounted
315 :param command: The command to invoke
316 :param timeout: Number of seconds to wait for command completion
317 :param cwd: The working directory for the command
318 :param env: The environment to pass to the command's process
319 :param stdout: File object to redirect stdout to
320 :param stderr: File object to redirect stderr to
321 :param error_code: Extension error code to raise in case of error
322 """232 """
323 try:233 # the output of mount is similar to
324 extension_cgroups = self.create_extension_cgroups(extension_name)234 # $ mount -t cgroup2
325 except Exception as exception:235 # cgroup2 on /sys/fs/cgroup/unified type cgroup2 (rw,nosuid,nodev,noexec,relatime,nsdelegate)
326 extension_cgroups = []236 #
327 logger.warn("Failed to create cgroups for extension '{0}'; resource usage will not be tracked. "237 for line in shellutil.run_command(['mount', '-t', 'cgroup2']).splitlines():
328 "Error: {1}".format(extension_name, ustr(exception)))238 match = re.search(r'on\s+(?P<path>/\S+)\s', line)
329239 if match is not None:
330 def pre_exec_function():240 mount_point = match.group('path')
331 os.setsid()241 controllers = None
332242 controllers_file = os.path.join(mount_point, 'cgroup.controllers')
333 try:243 if os.path.exists(controllers_file):
334 pid = os.getpid()244 controllers = fileutil.read_file(controllers_file)
335245 return mount_point, controllers
336 for cgroup in extension_cgroups:246 return None, None
337 try:
338 self._add_process_to_cgroup(pid, cgroup.path)
339 except Exception as exception:
340 logger.warn("Failed to add PID {0} to the cgroups for extension '{1}'. "
341 "Resource usage will not be tracked. Error: {2}".format(pid,
342 extension_name,
343 ustr(exception)))
344 except Exception as e:
345 logger.warn("Failed to add extension {0} to its cgroup. Resource usage will not be tracked. "
346 "Error: {1}".format(extension_name, ustr(e)))
347
348 process = subprocess.Popen(command,
349 shell=shell,
350 cwd=cwd,
351 env=env,
352 stdout=stdout,
353 stderr=stderr,
354 preexec_fn=pre_exec_function)
355
356 self.track_cgroups(extension_cgroups)
357 process_output = handle_process_completion(process=process,
358 command=command,
359 timeout=timeout,
360 stdout=stdout,
361 stderr=stderr,
362 error_code=error_code)
363
364 return extension_cgroups, process_output
365
366
367class SystemdCgroupsApi(CGroupsApi):
368 """
369 Cgroups interface via systemd
370 """
371
372 @staticmethod
373 def create_and_start_unit(unit_filename, unit_contents):
374 try:
375 unit_path = os.path.join(UNIT_FILES_FILE_SYSTEM_PATH, unit_filename)
376 fileutil.write_file(unit_path, unit_contents)
377 shellutil.run_command(["systemctl", "daemon-reload"])
378 shellutil.run_command(["systemctl", "start", unit_filename])
379 except Exception as e:
380 raise CGroupsException("Failed to create and start {0}. Error: {1}".format(unit_filename, ustr(e)))
381247
382 @staticmethod248 @staticmethod
383 def _get_extensions_slice_root_name():249 def _is_systemd_failure(scope_name, stderr):
384 return "system-{0}.slice".format(EXTENSIONS_ROOT_CGROUP_NAME)250 stderr.seek(0)
385251 stderr = ustr(stderr.read(TELEMETRY_MESSAGE_MAX_LEN), encoding='utf-8', errors='backslashreplace')
386 def _get_extension_slice_name(self, extension_name):252 unit_not_found = "Unit {0} not found.".format(scope_name)
387 return "system-{0}-{1}.slice".format(EXTENSIONS_ROOT_CGROUP_NAME, self._get_extension_cgroup_name(extension_name))253 return unit_not_found in stderr or scope_name not in stderr
388
389 def create_agent_cgroups(self):
390 try:
391 cgroup_unit = None
392 cgroup_paths = fileutil.read_file("/proc/self/cgroup")
393 for entry in cgroup_paths.splitlines():
394 fields = entry.split(':')
395 if fields[1] == "name=systemd":
396 cgroup_unit = fields[2].lstrip(os.path.sep)
397
398 cpu_cgroup_path = os.path.join(CGROUPS_FILE_SYSTEM_ROOT, 'cpu', cgroup_unit)
399 memory_cgroup_path = os.path.join(CGROUPS_FILE_SYSTEM_ROOT, 'memory', cgroup_unit)
400
401 return [CGroup.create(cpu_cgroup_path, 'cpu', VM_AGENT_CGROUP_NAME),
402 CGroup.create(memory_cgroup_path, 'memory', VM_AGENT_CGROUP_NAME)]
403 except Exception as e:
404 raise CGroupsException("Failed to get paths of agent's cgroups. Error: {0}".format(ustr(e)))
405
406 def create_extension_cgroups_root(self):
407 unit_contents = """
408[Unit]
409Description=Slice for walinuxagent extensions
410DefaultDependencies=no
411Before=slices.target
412Requires=system.slice
413After=system.slice"""
414 unit_filename = self._get_extensions_slice_root_name()
415 self.create_and_start_unit(unit_filename, unit_contents)
416 logger.info("Created slice for walinuxagent extensions {0}".format(unit_filename))
417
418 def create_extension_cgroups(self, extension_name):
419 # TODO: The slice created by this function is not used currently. We need to create the extension scopes within
420 # this slice and use the slice to monitor the cgroups. Also see comment in get_extension_cgroups.
421 # the slice.
422 unit_contents = """
423[Unit]
424Description=Slice for extension {0}
425DefaultDependencies=no
426Before=slices.target
427Requires=system-{1}.slice
428After=system-{1}.slice""".format(extension_name, EXTENSIONS_ROOT_CGROUP_NAME)
429 unit_filename = self._get_extension_slice_name(extension_name)
430 self.create_and_start_unit(unit_filename, unit_contents)
431 logger.info("Created slice for {0}".format(unit_filename))
432
433 return self.get_extension_cgroups(extension_name)
434
435 def remove_extension_cgroups(self, extension_name):
436 # For transient units, cgroups are released automatically when the unit stops, so it is sufficient
437 # to call stop on them. Persistent cgroups are released when the unit is disabled and its configuration
438 # file is deleted.
439 # The assumption is that this method is called after the extension has been uninstalled. For now, since
440 # we're running extensions within transient scopes which clean up after they finish running, no removal
441 # of units is needed. In the future, when the extension is running under its own slice,
442 # the following clean up is needed.
443 unit_filename = self._get_extension_slice_name(extension_name)
444 try:
445 unit_path = os.path.join(UNIT_FILES_FILE_SYSTEM_PATH, unit_filename)
446 shellutil.run_command(["systemctl", "stop", unit_filename])
447 fileutil.rm_files(unit_path)
448 shellutil.run_command(["systemctl", "daemon-reload"])
449 except Exception as e:
450 raise CGroupsException("Failed to remove {0}. Error: {1}".format(unit_filename, ustr(e)))
451
452 def get_extension_cgroups(self, extension_name):
453 # TODO: The slice returned by this function is not used currently. We need to create the extension scopes within
454 # this slice and use the slice to monitor the cgroups. Also see comment in create_extension_cgroups.
455 slice_name = self._get_extension_cgroup_name(extension_name)
456
457 cgroups = []
458
459 def create_cgroup(controller):
460 cpu_cgroup_path = os.path.join(CGROUPS_FILE_SYSTEM_ROOT, controller, 'system.slice', slice_name)
461 cgroups.append(CGroup.create(cpu_cgroup_path, controller, extension_name))
462
463 self._foreach_controller(create_cgroup, 'Cannot retrieve cgroup for extension {0}; resource usage will not be tracked.'.format(extension_name))
464
465 return cgroups
466254
467 @staticmethod255 @staticmethod
468 def _is_systemd_failure(scope_name, process_output):256 def get_extension_slice_name(extension_name, old_slice=False):
469 unit_not_found = "Unit {0} not found.".format(scope_name)257 # The old slice makes it difficult for user to override the limits because they need to place drop-in files on every upgrade if extension slice is different for each version.
470 return unit_not_found in process_output or scope_name not in process_output258 # old slice includes <HandlerName>.<ExtensionName>-<HandlerVersion>
259 # new slice without version <HandlerName>.<ExtensionName>
260 if not old_slice:
261 extension_name = extension_name.rsplit("-", 1)[0]
262 # Since '-' is used as a separator in systemd unit names, we replace it with '_' to prevent side-effects.
263 return EXTENSION_SLICE_PREFIX + "-" + extension_name.replace('-', '_') + ".slice"
471264
472 def start_extension_command(self, extension_name, command, timeout, shell, cwd, env, stdout, stderr,265 def start_extension_command(self, extension_name, command, cmd_name, timeout, shell, cwd, env, stdout, stderr,
473 error_code=ExtensionErrorCodes.PluginUnknownFailure):266 error_code=ExtensionErrorCodes.PluginUnknownFailure):
474 scope_name = "{0}_{1}".format(self._get_extension_cgroup_name(extension_name), uuid.uuid4())267 scope = "{0}_{1}".format(cmd_name, uuid.uuid4())
475268 extension_slice_name = self.get_extension_slice_name(extension_name)
476 process = subprocess.Popen(269 with self._systemd_run_commands_lock:
477 "systemd-run --unit={0} --scope {1}".format(scope_name, command),270 process = subprocess.Popen( # pylint: disable=W1509
478 shell=shell,271 # Some distros like ubuntu20 by default cpu and memory accounting enabled. Thus create nested cgroups under the extension slice
479 cwd=cwd,272 # So disabling CPU and Memory accounting prevents from creating nested cgroups, so that all the counters will be present in extension Cgroup
480 stdout=stdout,273 # since slice unit file configured with accounting enabled.
481 stderr=stderr,274 "systemd-run --property=CPUAccounting=no --property=MemoryAccounting=no --unit={0} --scope --slice={1} {2}".format(scope, extension_slice_name, command),
482 env=env,275 shell=shell,
483 preexec_fn=os.setsid)276 cwd=cwd,
277 stdout=stdout,
278 stderr=stderr,
279 env=env,
280 preexec_fn=os.setsid)
281
282 # We start systemd-run with shell == True so process.pid is the shell's pid, not the pid for systemd-run
283 self._systemd_run_commands.append(process.pid)
284
285 scope_name = scope + '.scope'
286
287 logger.info("Started extension in unit '{0}'", scope_name)
288
289 cpu_cgroup = None
290 try:
291 cgroup_relative_path = os.path.join('azure.slice/azure-vmextensions.slice', extension_slice_name)
484292
485 logger.info("Started extension using scope '{0}'", scope_name)293 cpu_cgroup_mountpoint, memory_cgroup_mountpoint = self.get_cgroup_mount_points()
486 extension_cgroups = []
487294
488 def create_cgroup(controller):295 if cpu_cgroup_mountpoint is None:
489 cgroup_path = os.path.join(CGROUPS_FILE_SYSTEM_ROOT, controller, 'system.slice', scope_name + ".scope")296 logger.info("The CPU controller is not mounted; will not track resource usage")
490 extension_cgroups.append(CGroup.create(cgroup_path, controller, extension_name))297 else:
298 cpu_cgroup_path = os.path.join(cpu_cgroup_mountpoint, cgroup_relative_path)
299 cpu_cgroup = CpuCgroup(extension_name, cpu_cgroup_path)
300 CGroupsTelemetry.track_cgroup(cpu_cgroup)
491301
492 self._foreach_controller(create_cgroup, 'Cannot create cgroup for extension {0}; '302 if memory_cgroup_mountpoint is None:
493 'resource usage will not be tracked.'.format(extension_name))303 logger.info("The Memory controller is not mounted; will not track resource usage")
494 self.track_cgroups(extension_cgroups)304 else:
305 memory_cgroup_path = os.path.join(memory_cgroup_mountpoint, cgroup_relative_path)
306 memory_cgroup = MemoryCgroup(extension_name, memory_cgroup_path)
307 CGroupsTelemetry.track_cgroup(memory_cgroup)
308
309 except IOError as e:
310 if e.errno == 2: # 'No such file or directory'
311 logger.info("The extension command already completed; will not track resource usage")
312 logger.info("Failed to start tracking resource usage for the extension: {0}", ustr(e))
313 except Exception as e:
314 logger.info("Failed to start tracking resource usage for the extension: {0}", ustr(e))
495315
496 # Wait for process completion or timeout316 # Wait for process completion or timeout
497 try:317 try:
498 process_output = handle_process_completion(process=process,318 return handle_process_completion(process=process, command=command, timeout=timeout, stdout=stdout,
499 command=command,319 stderr=stderr, error_code=error_code, cpu_cgroup=cpu_cgroup)
500 timeout=timeout,
501 stdout=stdout,
502 stderr=stderr,
503 error_code=error_code)
504 except ExtensionError as e:320 except ExtensionError as e:
505 # The extension didn't terminate successfully. Determine whether it was due to systemd errors or321 # The extension didn't terminate successfully. Determine whether it was due to systemd errors or
506 # extension errors.322 # extension errors.
507 process_output = read_output(stdout, stderr)323 if not self._is_systemd_failure(scope, stderr):
508 systemd_failure = self._is_systemd_failure(scope_name, process_output)
509
510 if not systemd_failure:
511 # There was an extension error; it either timed out or returned a non-zero exit code. Re-raise the error324 # There was an extension error; it either timed out or returned a non-zero exit code. Re-raise the error
512 raise325 raise
326
327 # There was an issue with systemd-run. We need to log it and retry the extension without systemd.
328 process_output = read_output(stdout, stderr)
329 # Reset the stdout and stderr
330 stdout.truncate(0)
331 stderr.truncate(0)
332
333 if isinstance(e, ExtensionOperationError):
334 # no-member: Instance of 'ExtensionError' has no 'exit_code' member (no-member) - Disabled: e is actually an ExtensionOperationError
335 err_msg = 'Systemd process exited with code %s and output %s' % (
336 e.exit_code, process_output) # pylint: disable=no-member
513 else:337 else:
514 # There was an issue with systemd-run. We need to log it and retry the extension without systemd.338 err_msg = "Systemd timed-out, output: %s" % process_output
515 err_msg = 'Systemd process exited with code %s and output %s' % (e.exit_code, process_output) \339 raise SystemdRunError(err_msg)
516 if isinstance(e, ExtensionOperationError) else "Systemd timed-out, output: %s" % process_output340 finally:
517 event_msg = 'Failed to run systemd-run for unit {0}.scope. ' \341 with self._systemd_run_commands_lock:
518 'Will retry invoking the extension without systemd. ' \342 self._systemd_run_commands.remove(process.pid)
519 'Systemd-run error: {1}'.format(scope_name, err_msg)
520 add_event(AGENT_NAME,
521 version=CURRENT_VERSION,
522 op=WALAEventOperation.InvokeCommandUsingSystemd,
523 is_success=False,
524 log_event=False,
525 message=event_msg)
526 logger.warn(event_msg)
527
528 # Reset the stdout and stderr
529 stdout.truncate(0)
530 stderr.truncate(0)
531
532 # Try invoking the process again, this time without systemd-run
533 logger.info('Extension invocation using systemd failed, falling back to regular invocation '
534 'without cgroups tracking.')
535 process = subprocess.Popen(command,
536 shell=shell,
537 cwd=cwd,
538 env=env,
539 stdout=stdout,
540 stderr=stderr,
541 preexec_fn=os.setsid)
542
543 process_output = handle_process_completion(process=process,
544 command=command,
545 timeout=timeout,
546 stdout=stdout,
547 stderr=stderr,
548 error_code=error_code)
549
550 return [], process_output
551
552 # The process terminated in time and successfully
553 return extension_cgroups, process_output
554343
555 def cleanup_legacy_cgroups(self):344 def cleanup_legacy_cgroups(self):
556 """345 """
557 Previous versions of the daemon (2.2.31-2.2.40) wrote their PID to /sys/fs/cgroup/{cpu,memory}/WALinuxAgent/WALinuxAgent;346 Previous versions of the daemon (2.2.31-2.2.40) wrote their PID to /sys/fs/cgroup/{cpu,memory}/WALinuxAgent/WALinuxAgent;
558 starting from version 2.2.41 we track the agent service in walinuxagent.service instead of WALinuxAgent/WALinuxAgent. If347 starting from version 2.2.41 we track the agent service in walinuxagent.service instead of WALinuxAgent/WALinuxAgent. If
559 we find that any of the legacy groups include the PID of the daemon then we disable data collection for this instance348 we find that any of the legacy groups include the PID of the daemon then we need to disable data collection for this
560 (under systemd, moving PIDs across the cgroup file system can produce unpredictable results)349 instance (under systemd, moving PIDs across the cgroup file system can produce unpredictable results)
561 """350 """
562 def report_error(_, daemon_pid):351 return CGroupsApi._foreach_legacy_cgroup(lambda *_: None)
563 raise CGroupsException(
564 "The daemon's PID ({0}) was already added to the legacy cgroup; this invalidates resource usage data.".format(daemon_pid))
565
566 CGroupsApi._foreach_legacy_cgroup(report_error)
diff --git a/azurelinuxagent/common/cgroupconfigurator.py b/azurelinuxagent/common/cgroupconfigurator.py
index ea6983f..767786f 100644
--- a/azurelinuxagent/common/cgroupconfigurator.py
+++ b/azurelinuxagent/common/cgroupconfigurator.py
@@ -1,3 +1,4 @@
1# -*- encoding: utf-8 -*-
1# Copyright 2018 Microsoft Corporation2# Copyright 2018 Microsoft Corporation
2#3#
3# Licensed under the Apache License, Version 2.0 (the "License");4# Licensed under the Apache License, Version 2.0 (the "License");
@@ -13,157 +14,870 @@
13# limitations under the License.14# limitations under the License.
14#15#
15# Requires Python 2.6+ and Openssl 1.0+16# Requires Python 2.6+ and Openssl 1.0+
1617import glob
18import json
17import os19import os
20import re
18import subprocess21import subprocess
22import threading
1923
24from azurelinuxagent.common import conf
20from azurelinuxagent.common import logger25from azurelinuxagent.common import logger
21from azurelinuxagent.common.cgroupapi import CGroupsApi26from azurelinuxagent.common.cgroup import CpuCgroup, AGENT_NAME_TELEMETRY, MetricsCounter, MemoryCgroup
27from azurelinuxagent.common.cgroupapi import CGroupsApi, SystemdCgroupsApi, SystemdRunError, EXTENSION_SLICE_PREFIX
22from azurelinuxagent.common.cgroupstelemetry import CGroupsTelemetry28from azurelinuxagent.common.cgroupstelemetry import CGroupsTelemetry
23from azurelinuxagent.common.exception import CGroupsException, ExtensionErrorCodes29from azurelinuxagent.common.exception import ExtensionErrorCodes, CGroupsException, AgentMemoryExceededException
24from azurelinuxagent.common.future import ustr30from azurelinuxagent.common.future import ustr
25from azurelinuxagent.common.osutil import get_osutil31from azurelinuxagent.common.osutil import get_osutil, systemd
32from azurelinuxagent.common.version import get_distro
33from azurelinuxagent.common.utils import shellutil, fileutil
26from azurelinuxagent.common.utils.extensionprocessutil import handle_process_completion34from azurelinuxagent.common.utils.extensionprocessutil import handle_process_completion
27from azurelinuxagent.common.version import AGENT_NAME, CURRENT_VERSION
28from azurelinuxagent.common.event import add_event, WALAEventOperation35from azurelinuxagent.common.event import add_event, WALAEventOperation
2936
37AZURE_SLICE = "azure.slice"
38_AZURE_SLICE_CONTENTS = """
39[Unit]
40Description=Slice for Azure VM Agent and Extensions
41DefaultDependencies=no
42Before=slices.target
43"""
44_VMEXTENSIONS_SLICE = EXTENSION_SLICE_PREFIX + ".slice"
45_AZURE_VMEXTENSIONS_SLICE = AZURE_SLICE + "/" + _VMEXTENSIONS_SLICE
46_VMEXTENSIONS_SLICE_CONTENTS = """
47[Unit]
48Description=Slice for Azure VM Extensions
49DefaultDependencies=no
50Before=slices.target
51[Slice]
52CPUAccounting=yes
53MemoryAccounting=yes
54"""
55_EXTENSION_SLICE_CONTENTS = """
56[Unit]
57Description=Slice for Azure VM extension {extension_name}
58DefaultDependencies=no
59Before=slices.target
60[Slice]
61CPUAccounting=yes
62CPUQuota={cpu_quota}
63MemoryAccounting=yes
64"""
65LOGCOLLECTOR_SLICE = "azure-walinuxagent-logcollector.slice"
66# More info on resource limits properties in systemd here:
67# https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/7/html/resource_management_guide/sec-modifying_control_groups
68_LOGCOLLECTOR_SLICE_CONTENTS_FMT = """
69[Unit]
70Description=Slice for Azure VM Agent Periodic Log Collector
71DefaultDependencies=no
72Before=slices.target
73[Slice]
74CPUAccounting=yes
75CPUQuota={cpu_quota}
76MemoryAccounting=yes
77"""
78_LOGCOLLECTOR_CPU_QUOTA = "5%"
79LOGCOLLECTOR_MEMORY_LIMIT = 30 * 1024 ** 2 # 30Mb
80
81_AGENT_DROP_IN_FILE_SLICE = "10-Slice.conf"
82_AGENT_DROP_IN_FILE_SLICE_CONTENTS = """
83# This drop-in unit file was created by the Azure VM Agent.
84# Do not edit.
85[Service]
86Slice=azure.slice
87"""
88_DROP_IN_FILE_CPU_ACCOUNTING = "11-CPUAccounting.conf"
89_DROP_IN_FILE_CPU_ACCOUNTING_CONTENTS = """
90# This drop-in unit file was created by the Azure VM Agent.
91# Do not edit.
92[Service]
93CPUAccounting=yes
94"""
95_DROP_IN_FILE_CPU_QUOTA = "12-CPUQuota.conf"
96_DROP_IN_FILE_CPU_QUOTA_CONTENTS_FORMAT = """
97# This drop-in unit file was created by the Azure VM Agent.
98# Do not edit.
99[Service]
100CPUQuota={0}
101"""
102_DROP_IN_FILE_MEMORY_ACCOUNTING = "13-MemoryAccounting.conf"
103_DROP_IN_FILE_MEMORY_ACCOUNTING_CONTENTS = """
104# This drop-in unit file was created by the Azure VM Agent.
105# Do not edit.
106[Service]
107MemoryAccounting=yes
108"""
109
110
111class DisableCgroups(object):
112 ALL = "all"
113 AGENT = "agent"
114 EXTENSIONS = "extensions"
115
116
117def _log_cgroup_info(format_string, *args):
118 message = format_string.format(*args)
119 logger.info("[CGI] " + message)
120 add_event(op=WALAEventOperation.CGroupsInfo, message=message)
121
122
123def _log_cgroup_warning(format_string, *args):
124 message = format_string.format(*args)
125 logger.info("[CGW] " + message) # log as INFO for now, in the future it should be logged as WARNING
126 add_event(op=WALAEventOperation.CGroupsInfo, message=message, is_success=False, log_event=False)
127
30128
31class CGroupConfigurator(object):129class CGroupConfigurator(object):
32 """130 """
33 This class implements the high-level operations on CGroups (e.g. initialization, creation, etc)131 This class implements the high-level operations on CGroups (e.g. initialization, creation, etc)
34132
35 NOTE: with the exception of start_extension_command, none of the methods in this class raise exceptions (cgroup operations should not block extensions)133 NOTE: with the exception of start_extension_command, none of the methods in this class
134 raise exceptions (cgroup operations should not block extensions)
36 """135 """
37 class __impl(object):136
137 class _Impl(object):
38 def __init__(self):138 def __init__(self):
139 self._initialized = False
140 self._cgroups_supported = False
141 self._agent_cgroups_enabled = False
142 self._extensions_cgroups_enabled = False
143 self._cgroups_api = None
144 self._agent_cpu_cgroup_path = None
145 self._agent_memory_cgroup_path = None
146 self._agent_memory_cgroup = None
147 self._check_cgroups_lock = threading.RLock() # Protect the check_cgroups which is called from Monitor thread and main loop.
148
149 def initialize(self):
150 try:
151 if self._initialized:
152 return
153 # This check is to reset the quotas if agent goes from cgroup supported to unsupported distros later in time.
154 if not CGroupsApi.cgroups_supported():
155 agent_drop_in_path = systemd.get_agent_drop_in_path()
156 try:
157 if os.path.exists(agent_drop_in_path) and os.path.isdir(agent_drop_in_path):
158 files_to_cleanup = []
159 agent_drop_in_file_slice = os.path.join(agent_drop_in_path, _AGENT_DROP_IN_FILE_SLICE)
160 agent_drop_in_file_cpu_accounting = os.path.join(agent_drop_in_path,
161 _DROP_IN_FILE_CPU_ACCOUNTING)
162 agent_drop_in_file_memory_accounting = os.path.join(agent_drop_in_path,
163 _DROP_IN_FILE_MEMORY_ACCOUNTING)
164 agent_drop_in_file_cpu_quota = os.path.join(agent_drop_in_path, _DROP_IN_FILE_CPU_QUOTA)
165 files_to_cleanup.extend([agent_drop_in_file_slice, agent_drop_in_file_cpu_accounting,
166 agent_drop_in_file_memory_accounting, agent_drop_in_file_cpu_quota])
167 self.__cleanup_all_files(files_to_cleanup)
168 self.__reload_systemd_config()
169 logger.info("Agent reset the quotas if distro: {0} goes from supported to unsupported list", get_distro())
170 except Exception as err:
171 logger.warn("Unable to delete Agent drop-in files while resetting the quotas: {0}".format(err))
172
173 # check whether cgroup monitoring is supported on the current distro
174 self._cgroups_supported = CGroupsApi.cgroups_supported()
175 if not self._cgroups_supported:
176 logger.info("Cgroup monitoring is not supported on {0}", get_distro())
177 return
178
179 # check that systemd is detected correctly
180 self._cgroups_api = SystemdCgroupsApi()
181 if not systemd.is_systemd():
182 _log_cgroup_warning("systemd was not detected on {0}", get_distro())
183 return
184
185 _log_cgroup_info("systemd version: {0}", systemd.get_version())
186
187 # This is temporarily disabled while we analyze telemetry. Likely it will be removed.
188 # self.__collect_azure_unit_telemetry()
189 # self.__collect_agent_unit_files_telemetry()
190
191 if not self.__check_no_legacy_cgroups():
192 return
193
194 agent_unit_name = systemd.get_agent_unit_name()
195 agent_slice = systemd.get_unit_property(agent_unit_name, "Slice")
196 if agent_slice not in (AZURE_SLICE, "system.slice"):
197 _log_cgroup_warning("The agent is within an unexpected slice: {0}", agent_slice)
198 return
199
200 self.__setup_azure_slice()
201
202 cpu_controller_root, memory_controller_root = self.__get_cgroup_controllers()
203 self._agent_cpu_cgroup_path, self._agent_memory_cgroup_path = self.__get_agent_cgroups(agent_slice,
204 cpu_controller_root,
205 memory_controller_root)
206
207 if self._agent_cpu_cgroup_path is not None or self._agent_memory_cgroup_path is not None:
208 self.enable()
209
210 if self._agent_cpu_cgroup_path is not None:
211 _log_cgroup_info("Agent CPU cgroup: {0}", self._agent_cpu_cgroup_path)
212 self.__set_cpu_quota(conf.get_agent_cpu_quota())
213 CGroupsTelemetry.track_cgroup(CpuCgroup(AGENT_NAME_TELEMETRY, self._agent_cpu_cgroup_path))
214
215 if self._agent_memory_cgroup_path is not None:
216 _log_cgroup_info("Agent Memory cgroup: {0}", self._agent_memory_cgroup_path)
217 self._agent_memory_cgroup = MemoryCgroup(AGENT_NAME_TELEMETRY, self._agent_memory_cgroup_path)
218 CGroupsTelemetry.track_cgroup(self._agent_memory_cgroup)
219
220 _log_cgroup_info('Agent cgroups enabled: {0}', self._agent_cgroups_enabled)
221
222 except Exception as exception:
223 _log_cgroup_warning("Error initializing cgroups: {0}", ustr(exception))
224 finally:
225 self._initialized = True
226
227 @staticmethod
228 def __collect_azure_unit_telemetry():
229 azure_units = []
230
231 try:
232 units = shellutil.run_command(['systemctl', 'list-units', 'azure*', '-all'])
233 for line in units.split('\n'):
234 match = re.match(r'\s?(azure[^\s]*)\s?', line, re.IGNORECASE)
235 if match is not None:
236 azure_units.append((match.group(1), line))
237 except shellutil.CommandError as command_error:
238 _log_cgroup_warning("Failed to list systemd units: {0}", ustr(command_error))
239
240 for unit_name, unit_description in azure_units:
241 unit_slice = "Unknown"
242 try:
243 unit_slice = systemd.get_unit_property(unit_name, "Slice")
244 except Exception as exception:
245 _log_cgroup_warning("Failed to query Slice for {0}: {1}", unit_name, ustr(exception))
246
247 _log_cgroup_info("Found an Azure unit under slice {0}: {1}", unit_slice, unit_description)
248
249 if len(azure_units) == 0:
250 try:
251 cgroups = shellutil.run_command('systemd-cgls')
252 for line in cgroups.split('\n'):
253 if re.match(r'[^\x00-\xff]+azure\.slice\s*', line, re.UNICODE):
254 logger.info(ustr("Found a cgroup for azure.slice\n{0}").format(cgroups))
255 # Don't add the output of systemd-cgls to the telemetry, since currently it does not support Unicode
256 add_event(op=WALAEventOperation.CGroupsInfo, message="Found a cgroup for azure.slice")
257 except shellutil.CommandError as command_error:
258 _log_cgroup_warning("Failed to list systemd units: {0}", ustr(command_error))
259
260 @staticmethod
261 def __collect_agent_unit_files_telemetry():
262 agent_unit_files = []
263 agent_service_name = get_osutil().get_service_name()
264 try:
265 fragment_path = systemd.get_unit_property(agent_service_name, "FragmentPath")
266 if fragment_path != systemd.get_agent_unit_file():
267 agent_unit_files.append(fragment_path)
268 except Exception as exception:
269 _log_cgroup_warning("Failed to query the agent's FragmentPath: {0}", ustr(exception))
270
271 try:
272 drop_in_paths = systemd.get_unit_property(agent_service_name, "DropInPaths")
273 for path in drop_in_paths.split():
274 agent_unit_files.append(path)
275 except Exception as exception:
276 _log_cgroup_warning("Failed to query the agent's DropInPaths: {0}", ustr(exception))
277
278 for unit_file in agent_unit_files:
279 try:
280 with open(unit_file, "r") as file_object:
281 _log_cgroup_info("Found a custom unit file for the agent: {0}\n{1}", unit_file,
282 file_object.read())
283 except Exception as exception:
284 _log_cgroup_warning("Can't read {0}: {1}", unit_file, ustr(exception))
285
286 def __check_no_legacy_cgroups(self):
287 """
288 Older versions of the daemon (2.2.31-2.2.40) wrote their PID to /sys/fs/cgroup/{cpu,memory}/WALinuxAgent/WALinuxAgent. When running
289 under systemd this could produce invalid resource usage data. Cgroups should not be enabled under this condition.
290 """
291 legacy_cgroups = self._cgroups_api.cleanup_legacy_cgroups()
292 if legacy_cgroups > 0:
293 _log_cgroup_warning("The daemon's PID was added to a legacy cgroup; will not monitor resource usage.")
294 return False
295 return True
296
297 def __get_cgroup_controllers(self):
298 #
299 # check v1 controllers
300 #
301 cpu_controller_root, memory_controller_root = self._cgroups_api.get_cgroup_mount_points()
302
303 if cpu_controller_root is not None:
304 logger.info("The CPU cgroup controller is mounted at {0}", cpu_controller_root)
305 else:
306 _log_cgroup_warning("The CPU cgroup controller is not mounted")
307
308 if memory_controller_root is not None:
309 logger.info("The memory cgroup controller is mounted at {0}", memory_controller_root)
310 else:
311 _log_cgroup_warning("The memory cgroup controller is not mounted")
312
313 #
314 # check v2 controllers
315 #
316 cgroup2_mount_point, cgroup2_controllers = self._cgroups_api.get_cgroup2_controllers()
317 if cgroup2_mount_point is not None:
318 _log_cgroup_info("cgroups v2 mounted at {0}. Controllers: [{1}]", cgroup2_mount_point,
319 cgroup2_controllers)
320
321 return cpu_controller_root, memory_controller_root
322
323 @staticmethod
324 def __setup_azure_slice():
39 """325 """
40 Ensures the cgroups file system is mounted and selects the correct API to interact with it326 The agent creates "azure.slice" for use by extensions and the agent. The agent runs under "azure.slice" directly and each
327 extension runs under its own slice ("Microsoft.CPlat.Extension.slice" in the example below). All the slices for
328 extensions are grouped under "vmextensions.slice".
329
330 Example: -.slice
331 ├─user.slice
332 ├─system.slice
333 └─azure.slice
334 ├─walinuxagent.service
335 │ ├─5759 /usr/bin/python3 -u /usr/sbin/waagent -daemon
336 │ └─5764 python3 -u bin/WALinuxAgent-2.2.53-py2.7.egg -run-exthandlers
337 └─azure-vmextensions.slice
338 └─Microsoft.CPlat.Extension.slice
339 └─5894 /usr/bin/python3 /var/lib/waagent/Microsoft.CPlat.Extension-1.0.0.0/enable.py
340
341 This method ensures that the "azure" and "vmextensions" slices are created. Setup should create those slices
342 under /lib/systemd/system; but if they do not exist, __ensure_azure_slices_exist will create them.
343
344 It also creates drop-in files to set the agent's Slice and CPUAccounting if they have not been
345 set up in the agent's unit file.
346
347 Lastly, the method also cleans up unit files left over from previous versions of the agent.
41 """348 """
42 osutil = get_osutil()
43349
44 self._cgroups_supported = osutil.is_cgroups_supported()350 # Older agents used to create this slice, but it was never used. Cleanup the file.
351 CGroupConfigurator._Impl.__cleanup_unit_file("/etc/systemd/system/system-walinuxagent.extensions.slice")
352
353 unit_file_install_path = systemd.get_unit_file_install_path()
354 azure_slice = os.path.join(unit_file_install_path, AZURE_SLICE)
355 vmextensions_slice = os.path.join(unit_file_install_path, _VMEXTENSIONS_SLICE)
356 logcollector_slice = os.path.join(unit_file_install_path, LOGCOLLECTOR_SLICE)
357 agent_unit_file = systemd.get_agent_unit_file()
358 agent_drop_in_path = systemd.get_agent_drop_in_path()
359 agent_drop_in_file_slice = os.path.join(agent_drop_in_path, _AGENT_DROP_IN_FILE_SLICE)
360 agent_drop_in_file_cpu_accounting = os.path.join(agent_drop_in_path, _DROP_IN_FILE_CPU_ACCOUNTING)
361 agent_drop_in_file_memory_accounting = os.path.join(agent_drop_in_path, _DROP_IN_FILE_MEMORY_ACCOUNTING)
362
363 files_to_create = []
364
365 if not os.path.exists(azure_slice):
366 files_to_create.append((azure_slice, _AZURE_SLICE_CONTENTS))
367
368 if not os.path.exists(vmextensions_slice):
369 files_to_create.append((vmextensions_slice, _VMEXTENSIONS_SLICE_CONTENTS))
370
371 # Update log collector slice contents
372 slice_contents = _LOGCOLLECTOR_SLICE_CONTENTS_FMT.format(cpu_quota=_LOGCOLLECTOR_CPU_QUOTA)
373 files_to_create.append((logcollector_slice, slice_contents))
374
375 if fileutil.findre_in_file(agent_unit_file, r"Slice=") is not None:
376 CGroupConfigurator._Impl.__cleanup_unit_file(agent_drop_in_file_slice)
377 else:
378 if not os.path.exists(agent_drop_in_file_slice):
379 files_to_create.append((agent_drop_in_file_slice, _AGENT_DROP_IN_FILE_SLICE_CONTENTS))
380
381 if fileutil.findre_in_file(agent_unit_file, r"CPUAccounting=") is not None:
382 CGroupConfigurator._Impl.__cleanup_unit_file(agent_drop_in_file_cpu_accounting)
383 else:
384 if not os.path.exists(agent_drop_in_file_cpu_accounting):
385 files_to_create.append((agent_drop_in_file_cpu_accounting, _DROP_IN_FILE_CPU_ACCOUNTING_CONTENTS))
386
387 if fileutil.findre_in_file(agent_unit_file, r"MemoryAccounting=") is not None:
388 CGroupConfigurator._Impl.__cleanup_unit_file(agent_drop_in_file_memory_accounting)
389 else:
390 if not os.path.exists(agent_drop_in_file_memory_accounting):
391 files_to_create.append(
392 (agent_drop_in_file_memory_accounting, _DROP_IN_FILE_MEMORY_ACCOUNTING_CONTENTS))
393
394 if len(files_to_create) > 0:
395 # create the unit files, but if 1 fails remove all and return
396 try:
397 for path, contents in files_to_create:
398 CGroupConfigurator._Impl.__create_unit_file(path, contents)
399 except Exception as exception:
400 _log_cgroup_warning("Failed to create unit files for the azure slice: {0}", ustr(exception))
401 for unit_file in files_to_create:
402 CGroupConfigurator._Impl.__cleanup_unit_file(unit_file)
403 return
404
405 CGroupConfigurator._Impl.__reload_systemd_config()
45406
46 if self._cgroups_supported:407 @staticmethod
47 self._enabled = True408 def __reload_systemd_config():
409 # reload the systemd configuration; the new slices will be used once the agent's service restarts
410 try:
411 logger.info("Executing systemctl daemon-reload...")
412 shellutil.run_command(["systemctl", "daemon-reload"])
413 except Exception as exception:
414 _log_cgroup_warning("daemon-reload failed (create azure slice): {0}", ustr(exception))
415
416 @staticmethod
417 def __create_unit_file(path, contents):
418 parent, _ = os.path.split(path)
419 if not os.path.exists(parent):
420 fileutil.mkdir(parent, mode=0o755)
421 exists = os.path.exists(path)
422 fileutil.write_file(path, contents)
423 _log_cgroup_info("{0} {1}", "Updated" if exists else "Created", path)
424
425 @staticmethod
426 def __cleanup_unit_file(path):
427 if os.path.exists(path):
48 try:428 try:
49 osutil.mount_cgroups()429 os.remove(path)
50 self._cgroups_api = CGroupsApi.create()430 _log_cgroup_info("Removed {0}", path)
51 status = "The cgroup filesystem is ready to use"431 except Exception as exception:
52 except Exception as e:432 _log_cgroup_warning("Failed to remove {0}: {1}", path, ustr(exception))
53 status = ustr(e)433
54 self._enabled = False434 @staticmethod
435 def __cleanup_all_files(files_to_cleanup):
436 for path in files_to_cleanup:
437 if os.path.exists(path):
438 try:
439 os.remove(path)
440 _log_cgroup_info("Removed {0}", path)
441 except Exception as exception:
442 _log_cgroup_warning("Failed to remove {0}: {1}", path, ustr(exception))
443
444 @staticmethod
445 def __create_all_files(files_to_create):
446 # create the unit files, but if 1 fails remove all and return
447 try:
448 for path, contents in files_to_create:
449 CGroupConfigurator._Impl.__create_unit_file(path, contents)
450 except Exception as exception:
451 _log_cgroup_warning("Failed to create unit files : {0}", ustr(exception))
452 for unit_file in files_to_create:
453 CGroupConfigurator._Impl.__cleanup_unit_file(unit_file)
454 return
455
456 def is_extension_resource_limits_setup_completed(self, extension_name, cpu_quota=None):
457 unit_file_install_path = systemd.get_unit_file_install_path()
458 old_extension_slice_path = os.path.join(unit_file_install_path, SystemdCgroupsApi.get_extension_slice_name(extension_name, old_slice=True))
459 # clean up the old slice from the disk
460 if os.path.exists(old_extension_slice_path):
461 CGroupConfigurator._Impl.__cleanup_unit_file(old_extension_slice_path)
462
463 extension_slice_path = os.path.join(unit_file_install_path,
464 SystemdCgroupsApi.get_extension_slice_name(extension_name))
465 cpu_quota = str(
466 cpu_quota) + "%" if cpu_quota is not None else "" # setting an empty value resets to the default (infinity)
467 slice_contents = _EXTENSION_SLICE_CONTENTS.format(extension_name=extension_name,
468 cpu_quota=cpu_quota)
469 if os.path.exists(extension_slice_path):
470 with open(extension_slice_path, "r") as file_:
471 if file_.read() == slice_contents:
472 return True
473 return False
474
475 def __get_agent_cgroups(self, agent_slice, cpu_controller_root, memory_controller_root):
476 agent_unit_name = systemd.get_agent_unit_name()
477
478 expected_relative_path = os.path.join(agent_slice, agent_unit_name)
479 cpu_cgroup_relative_path, memory_cgroup_relative_path = self._cgroups_api.get_process_cgroup_relative_paths(
480 "self")
481
482 if cpu_cgroup_relative_path is None:
483 _log_cgroup_warning("The agent's process is not within a CPU cgroup")
484 else:
485 if cpu_cgroup_relative_path == expected_relative_path:
486 _log_cgroup_info('CPUAccounting: {0}', systemd.get_unit_property(agent_unit_name, "CPUAccounting"))
487 _log_cgroup_info('CPUQuota: {0}', systemd.get_unit_property(agent_unit_name, "CPUQuotaPerSecUSec"))
488 else:
489 _log_cgroup_warning(
490 "The Agent is not in the expected CPU cgroup; will not enable monitoring. Cgroup:[{0}] Expected:[{1}]",
491 cpu_cgroup_relative_path,
492 expected_relative_path)
493 cpu_cgroup_relative_path = None # Set the path to None to prevent monitoring
494
495 if memory_cgroup_relative_path is None:
496 _log_cgroup_warning("The agent's process is not within a memory cgroup")
55 else:497 else:
56 self._enabled = False498 if memory_cgroup_relative_path == expected_relative_path:
57 self._cgroups_api = None499 memory_accounting = systemd.get_unit_property(agent_unit_name, "MemoryAccounting")
58 status = "Cgroups are not supported by the platform"500 _log_cgroup_info('MemoryAccounting: {0}', memory_accounting)
501 else:
502 _log_cgroup_info(
503 "The Agent is not in the expected memory cgroup; will not enable monitoring. CGroup:[{0}] Expected:[{1}]",
504 memory_cgroup_relative_path,
505 expected_relative_path)
506 memory_cgroup_relative_path = None # Set the path to None to prevent monitoring
59507
60 logger.info("CGroups Status: {0}".format(status))508 if cpu_controller_root is not None and cpu_cgroup_relative_path is not None:
509 agent_cpu_cgroup_path = os.path.join(cpu_controller_root, cpu_cgroup_relative_path)
510 else:
511 agent_cpu_cgroup_path = None
512
513 if memory_controller_root is not None and memory_cgroup_relative_path is not None:
514 agent_memory_cgroup_path = os.path.join(memory_controller_root, memory_cgroup_relative_path)
515 else:
516 agent_memory_cgroup_path = None
61517
62 add_event(518 return agent_cpu_cgroup_path, agent_memory_cgroup_path
63 AGENT_NAME,519
64 version=CURRENT_VERSION,520 def supported(self):
65 op=WALAEventOperation.InitializeCGroups,521 return self._cgroups_supported
66 is_success=self._enabled,
67 message=status,
68 log_event=False)
69522
70 def enabled(self):523 def enabled(self):
71 return self._enabled524 return self._agent_cgroups_enabled or self._extensions_cgroups_enabled
525
def agent_enabled(self):
    """
    Returns the current value of the agent's cgroups flag (self._agent_cgroups_enabled), which is
    set by enable() and cleared by disable()
    """
    agent_flag = self._agent_cgroups_enabled
    return agent_flag
528
def extensions_enabled(self):
    """
    Returns the current value of the extensions' cgroups flag (self._extensions_cgroups_enabled), which is
    set by enable() and cleared by disable() when all cgroups are disabled
    """
    extensions_flag = self._extensions_cgroups_enabled
    return extensions_flag
72531
73 def enable(self):532 def enable(self):
74 if not self._cgroups_supported:533 if not self.supported():
75 raise CGroupsException("cgroups are not supported on the current platform")534 raise CGroupsException(
535 "Attempted to enable cgroups, but they are not supported on the current platform")
536 self._agent_cgroups_enabled = True
537 self._extensions_cgroups_enabled = True
76538
77 self._enabled = True539 def disable(self, reason, disable_cgroups):
540 if disable_cgroups == DisableCgroups.ALL: # disable all
541 # Reset quotas
542 self.__reset_agent_cpu_quota()
543 extension_services = self.get_extension_services_list()
544 for extension in extension_services:
545 logger.info("Resetting extension : {0} and it's services: {1} CPUQuota".format(extension, extension_services[extension]))
546 self.__reset_extension_cpu_quota(extension_name=extension)
547 self.__reset_extension_services_cpu_quota(extension_services[extension])
548 self.__reload_systemd_config()
78549
79 def disable(self):550 CGroupsTelemetry.reset()
80 self._enabled = False551 self._agent_cgroups_enabled = False
81 CGroupsTelemetry.reset()552 self._extensions_cgroups_enabled = False
553 elif disable_cgroups == DisableCgroups.AGENT: # disable agent
554 self._agent_cgroups_enabled = False
555 self.__reset_agent_cpu_quota()
556 CGroupsTelemetry.stop_tracking(CpuCgroup(AGENT_NAME_TELEMETRY, self._agent_cpu_cgroup_path))
82557
83 def _invoke_cgroup_operation(self, operation, error_message, on_error=None):558 message = "[CGW] Disabling resource usage monitoring. Reason: {0}".format(reason)
559 logger.info(message) # log as INFO for now, in the future it should be logged as WARNING
560 add_event(op=WALAEventOperation.CGroupsDisabled, message=message, is_success=False, log_event=False)
561
562 @staticmethod
563 def __set_cpu_quota(quota):
84 """564 """
85 Ensures the given operation is invoked only if cgroups are enabled and traps any errors on the operation.565 Sets the agent's CPU quota to the given percentage (100% == 1 CPU)
566
567 NOTE: This is done using a dropin file in the default dropin directory; any local overrides on the VM will take precedence
568 over this setting.
86 """569 """
87 if not self.enabled():570 quota_percentage = "{0}%".format(quota)
88 return571 _log_cgroup_info("Ensuring the agent's CPUQuota is {0}", quota_percentage)
572 if CGroupConfigurator._Impl.__try_set_cpu_quota(quota_percentage):
573 CGroupsTelemetry.set_track_throttled_time(True)
574
575 @staticmethod
576 def __reset_agent_cpu_quota():
577 """
578 Removes any CPUQuota on the agent
89579
580 NOTE: This resets the quota on the agent's default dropin file; any local overrides on the VM will take precedence
581 over this setting.
582 """
583 logger.info("Resetting agent's CPUQuota")
584 if CGroupConfigurator._Impl.__try_set_cpu_quota(''): # setting an empty value resets to the default (infinity)
585 _log_cgroup_info('CPUQuota: {0}',
586 systemd.get_unit_property(systemd.get_agent_unit_name(), "CPUQuotaPerSecUSec"))
587
588 @staticmethod
589 def __try_set_cpu_quota(quota):
90 try:590 try:
91 return operation()591 drop_in_file = os.path.join(systemd.get_agent_drop_in_path(), _DROP_IN_FILE_CPU_QUOTA)
92 except Exception as e:592 contents = _DROP_IN_FILE_CPU_QUOTA_CONTENTS_FORMAT.format(quota)
93 logger.warn("{0} Error: {1}".format(error_message, ustr(e)))593 if os.path.exists(drop_in_file):
94 if on_error is not None:594 with open(drop_in_file, "r") as file_:
95 try:595 if file_.read() == contents:
96 on_error(e)596 return True # no need to update the file; return here to avoid doing a daemon-reload
97 except Exception as ex:597 CGroupConfigurator._Impl.__create_unit_file(drop_in_file, contents)
98 logger.warn("CGroupConfigurator._invoke_cgroup_operation: {0}".format(ustr(e)))598 except Exception as exception:
599 _log_cgroup_warning('Failed to set CPUQuota: {0}', ustr(exception))
600 return False
601 try:
602 logger.info("Executing systemctl daemon-reload...")
603 shellutil.run_command(["systemctl", "daemon-reload"])
604 except Exception as exception:
605 _log_cgroup_warning("daemon-reload failed (set quota): {0}", ustr(exception))
606 return False
607 return True
608
609 def check_cgroups(self, cgroup_metrics):
610 self._check_cgroups_lock.acquire()
611 try:
612 if not self.enabled():
613 return
614
615 errors = []
616
617 process_check_success = False
618 try:
619 self._check_processes_in_agent_cgroup()
620 process_check_success = True
621 except CGroupsException as exception:
622 errors.append(exception)
99623
100 def create_agent_cgroups(self, track_cgroups):624 quota_check_success = False
625 try:
626 if cgroup_metrics:
627 self._check_agent_throttled_time(cgroup_metrics)
628 quota_check_success = True
629 except CGroupsException as exception:
630 errors.append(exception)
631
632 reason = "Check on cgroups failed:\n{0}".format("\n".join([ustr(e) for e in errors]))
633
634 if not process_check_success and conf.get_cgroup_disable_on_process_check_failure():
635 self.disable(reason, DisableCgroups.ALL)
636
637 if not quota_check_success and conf.get_cgroup_disable_on_quota_check_failure():
638 self.disable(reason, DisableCgroups.AGENT)
639 finally:
640 self._check_cgroups_lock.release()
641
642 def _check_processes_in_agent_cgroup(self):
101 """643 """
102 Creates and returns the cgroups needed to track the VM Agent644 Verifies that the agent's cgroup includes only the current process, its parent, commands started using shellutil and instances of systemd-run
645 (those processes correspond, respectively, to the extension handler, the daemon, commands started by the extension handler, and the systemd-run
646 commands used to start extensions on their own cgroup).
647 Other processes started by the agent (e.g. extensions) and processes not started by the agent (e.g. services installed by extensions) are reported
648 as unexpected, since they should belong to their own cgroup.
649
650 Raises a CGroupsException if the check fails
103 """651 """
104 def __impl():652 unexpected = []
105 cgroups = self._cgroups_api.create_agent_cgroups()653 agent_cgroup_proc_names = []
654 try:
655 daemon = os.getppid()
656 extension_handler = os.getpid()
657 agent_commands = set()
658 agent_commands.update(shellutil.get_running_commands())
659 systemd_run_commands = set()
660 systemd_run_commands.update(self._cgroups_api.get_systemd_run_commands())
661 agent_cgroup = CGroupsApi.get_processes_in_cgroup(self._agent_cpu_cgroup_path)
662 # get the running commands again in case new commands started or completed while we were fetching the processes in the cgroup;
663 agent_commands.update(shellutil.get_running_commands())
664 systemd_run_commands.update(self._cgroups_api.get_systemd_run_commands())
106665
107 if track_cgroups:666 for process in agent_cgroup:
108 for cgroup in cgroups:667 agent_cgroup_proc_names.append(self.__format_process(process))
109 CGroupsTelemetry.track_cgroup(cgroup)668 # Note that the agent uses systemd-run to start extensions; systemd-run belongs to the agent cgroup, though the extensions don't.
669 if process in (daemon, extension_handler) or process in systemd_run_commands:
670 continue
671 # check shell systemd_run process if above process check didn't catch it
672 if self._check_systemd_run_process(process):
673 continue
674 # systemd_run_commands contains the shell that started systemd-run, so we also need to check for the parent
675 if self._get_parent(process) in systemd_run_commands and self._get_command(
676 process) == 'systemd-run':
677 continue
678 # check if the process is a command started by the agent or a descendant of one of those commands
679 current = process
680 while current != 0 and current not in agent_commands:
681 current = self._get_parent(current)
682 # Verify if Process started by agent based on the marker found in process environment or process is in Zombie state.
683 # If so, consider it as valid process in agent cgroup.
684 if current == 0 and not (self.__is_process_descendant_of_the_agent(process) or self.__is_zombie_process(process)):
685 unexpected.append(self.__format_process(process))
686 if len(unexpected) >= 5: # collect just a small sample
687 break
688 except Exception as exception:
689 _log_cgroup_warning("Error checking the processes in the agent's cgroup: {0}".format(ustr(exception)))
110690
111 return cgroups691 if len(unexpected) > 0:
692 self._report_agent_cgroups_procs(agent_cgroup_proc_names, unexpected)
693 raise CGroupsException("The agent's cgroup includes unexpected processes: {0}".format(unexpected))
112694
113 self._invoke_cgroup_operation(__impl, "Failed to create a cgroup for the VM Agent; resource usage for the Agent will not be tracked.")695 @staticmethod
696 def _get_command(pid):
697 try:
698 with open('/proc/{0}/comm'.format(pid), "r") as file_:
699 comm = file_.read()
700 if comm and comm[-1] == '\x00': # if null-terminated, remove the null
701 comm = comm[:-1]
702 return comm.rstrip()
703 except Exception:
704 return "UNKNOWN"
114705
115 def cleanup_legacy_cgroups(self):706 @staticmethod
116 def __impl():707 def __format_process(pid):
117 self._cgroups_api.cleanup_legacy_cgroups()708 """
709 Formats the given PID as a string containing the PID and the corresponding command line truncated to 64 chars
710 """
711 try:
712 cmdline = '/proc/{0}/cmdline'.format(pid)
713 if os.path.exists(cmdline):
714 with open(cmdline, "r") as cmdline_file:
715 return "[PID: {0}] {1:64.64}".format(pid, cmdline_file.read())
716 except Exception:
717 pass
718 return "[PID: {0}] UNKNOWN".format(pid)
118719
119 message = 'Failed to process legacy cgroups. Collection of resource usage data will be disabled.'720 @staticmethod
721 def __is_process_descendant_of_the_agent(pid):
722 """
723 Returns True if the process is descendant of the agent by looking at the env flag(AZURE_GUEST_AGENT_PARENT_PROCESS_NAME)
724 that we set when the process starts otherwise False.
725 """
726 try:
727 env = '/proc/{0}/environ'.format(pid)
728 if os.path.exists(env):
729 with open(env, "r") as env_file:
730 environ = env_file.read()
731 if environ and environ[-1] == '\x00':
732 environ = environ[:-1]
733 return "{0}={1}".format(shellutil.PARENT_PROCESS_NAME, shellutil.AZURE_GUEST_AGENT) in environ
734 except Exception:
735 pass
736 return False
120737
121 def disable_cgroups(exception):738 @staticmethod
122 self.disable()739 def __is_zombie_process(pid):
123 add_event(740 """
124 AGENT_NAME,741 Returns True if process is in Zombie state otherwise False.
125 version=CURRENT_VERSION,
126 op=WALAEventOperation.CGroupsCleanUp,
127 is_success=False,
128 log_event=False,
129 message='{0} {1}'.format(message, ustr(exception)))
130742
131 self._invoke_cgroup_operation(__impl, message, on_error=disable_cgroups)743 Ex: cat /proc/18171/stat
744 18171 (python3) S 18103 18103 18103 0 -1 4194624 57736 64902 0 3
745 """
746 try:
747 stat = '/proc/{0}/stat'.format(pid)
748 if os.path.exists(stat):
749 with open(stat, "r") as stat_file:
750 return stat_file.read().split()[2] == 'Z'
751 except Exception:
752 pass
753 return False
132754
133 def create_extension_cgroups_root(self):755 @staticmethod
756 def _check_systemd_run_process(process):
134 """757 """
135 Creates the container (directory/cgroup) that includes the cgroups for all extensions (/sys/fs/cgroup/*/walinuxagent.extensions)758 Returns True if process is shell systemd-run process started by agent otherwise False.
759
760 Ex: sh,7345 -c systemd-run --unit=enable_7c5cab19-eb79-4661-95d9-9e5091bd5ae0 --scope --slice=azure-vmextensions-Microsoft.OSTCExtensions.VMAccessForLinux_1.5.11.slice /var/lib/waagent/Microsoft.OSTCExtensions.VMAccessForLinux-1.5.11/processes.sh
136 """761 """
137 def __impl():762 try:
138 self._cgroups_api.create_extension_cgroups_root()763 process_name = "UNKNOWN"
764 cmdline = '/proc/{0}/cmdline'.format(process)
765 if os.path.exists(cmdline):
766 with open(cmdline, "r") as cmdline_file:
767 process_name = "{0}".format(cmdline_file.read())
768 match = re.search(r'systemd-run.*--unit=.*--scope.*--slice=azure-vmextensions.*', process_name)
769 if match is not None:
770 return True
771 except Exception:
772 pass
773 return False
774
775 @staticmethod
776 def _report_agent_cgroups_procs(agent_cgroup_proc_names, unexpected):
777 for proc_name in unexpected:
778 if 'UNKNOWN' in proc_name:
779 msg = "Agent includes following processes when UNKNOWN process found: {0}".format("\n".join([ustr(proc) for proc in agent_cgroup_proc_names]))
780 add_event(op=WALAEventOperation.CGroupsInfo, message=msg)
139781
140 self._invoke_cgroup_operation(__impl, "Failed to create a root cgroup for extensions; resource usage for extensions will not be tracked.")782 @staticmethod
783 def _check_agent_throttled_time(cgroup_metrics):
784 for metric in cgroup_metrics:
785 if metric.instance == AGENT_NAME_TELEMETRY and metric.counter == MetricsCounter.THROTTLED_TIME:
786 if metric.value > conf.get_agent_cpu_throttled_time_threshold():
787 raise CGroupsException("The agent has been throttled for {0} seconds".format(metric.value))
141788
142 def create_extension_cgroups(self, name):789 def check_agent_memory_usage(self):
790 if self.enabled() and self._agent_memory_cgroup:
791 metrics = self._agent_memory_cgroup.get_tracked_metrics()
792 current_usage = 0
793 for metric in metrics:
794 if metric.counter == MetricsCounter.TOTAL_MEM_USAGE:
795 current_usage += metric.value
796 elif metric.counter == MetricsCounter.SWAP_MEM_USAGE:
797 current_usage += metric.value
798
799 if current_usage > conf.get_agent_memory_quota():
800 raise AgentMemoryExceededException("The agent memory limit {0} bytes exceeded. The current reported usage is {1} bytes.".format(conf.get_agent_memory_quota(), current_usage))
801
802 @staticmethod
803 def _get_parent(pid):
143 """804 """
144 Creates and returns the cgroups for the given extension805 Returns the parent of the given process. If the parent cannot be determined returns 0 (which is the PID for the scheduler)
145 """806 """
146 def __impl():807 try:
147 return self._cgroups_api.create_extension_cgroups(name)808 stat = '/proc/{0}/stat'.format(pid)
809 if os.path.exists(stat):
810 with open(stat, "r") as stat_file:
811 return int(stat_file.read().split()[3])
812 except Exception:
813 pass
814 return 0
148815
149 return self._invoke_cgroup_operation(__impl, "Failed to create a cgroup for extension '{0}'; resource usage will not be tracked.".format(name))816 def start_tracking_unit_cgroups(self, unit_name):
817 """
818 TODO: Start tracking Memory Cgroups
819 """
820 try:
821 cpu_cgroup_path, memory_cgroup_path = self._cgroups_api.get_unit_cgroup_paths(unit_name)
822
823 if cpu_cgroup_path is None:
824 logger.info("The CPU controller is not mounted; will not track resource usage")
825 else:
826 CGroupsTelemetry.track_cgroup(CpuCgroup(unit_name, cpu_cgroup_path))
827
828 if memory_cgroup_path is None:
829 logger.info("The Memory controller is not mounted; will not track resource usage")
830 else:
831 CGroupsTelemetry.track_cgroup(MemoryCgroup(unit_name, memory_cgroup_path))
832
833 except Exception as exception:
834 logger.info("Failed to start tracking resource usage for the extension: {0}", ustr(exception))
150835
151 def remove_extension_cgroups(self, name):836 def stop_tracking_unit_cgroups(self, unit_name):
152 """837 """
153 Deletes the cgroup for the given extension838 TODO: remove Memory cgroups from tracked list.
154 """839 """
155 def __impl():840 try:
156 cgroups = self._cgroups_api.remove_extension_cgroups(name)841 cpu_cgroup_path, memory_cgroup_path = self._cgroups_api.get_unit_cgroup_paths(unit_name)
157 return cgroups842
843 if cpu_cgroup_path is not None:
844 CGroupsTelemetry.stop_tracking(CpuCgroup(unit_name, cpu_cgroup_path))
845
846 if memory_cgroup_path is not None:
847 CGroupsTelemetry.stop_tracking(MemoryCgroup(unit_name, memory_cgroup_path))
158848
159 self._invoke_cgroup_operation(__impl, "Failed to delete cgroups for extension '{0}'.".format(name))849 except Exception as exception:
850 logger.info("Failed to stop tracking resource usage for the extension service: {0}", ustr(exception))
160851
161 def start_extension_command(self, extension_name, command, timeout, shell, cwd, env, stdout, stderr,852 def stop_tracking_extension_cgroups(self, extension_name):
853 """
854 TODO: remove extension Memory cgroups from tracked list
855 """
856 try:
857 extension_slice_name = SystemdCgroupsApi.get_extension_slice_name(extension_name)
858 cgroup_relative_path = os.path.join(_AZURE_VMEXTENSIONS_SLICE,
859 extension_slice_name)
860
861 cpu_cgroup_mountpoint, memory_cgroup_mountpoint = self._cgroups_api.get_cgroup_mount_points()
862 cpu_cgroup_path = os.path.join(cpu_cgroup_mountpoint, cgroup_relative_path)
863 memory_cgroup_path = os.path.join(memory_cgroup_mountpoint, cgroup_relative_path)
864
865 if cpu_cgroup_path is not None:
866 CGroupsTelemetry.stop_tracking(CpuCgroup(extension_name, cpu_cgroup_path))
867
868 if memory_cgroup_path is not None:
869 CGroupsTelemetry.stop_tracking(MemoryCgroup(extension_name, memory_cgroup_path))
870
871 except Exception as exception:
872 logger.info("Failed to stop tracking resource usage for the extension service: {0}", ustr(exception))
873
874 def start_extension_command(self, extension_name, command, cmd_name, timeout, shell, cwd, env, stdout, stderr,
162 error_code=ExtensionErrorCodes.PluginUnknownFailure):875 error_code=ExtensionErrorCodes.PluginUnknownFailure):
163 """876 """
164 Starts a command (install/enable/etc) for an extension and adds the command's PID to the extension's cgroup877 Starts a command (install/enable/etc) for an extension and adds the command's PID to the extension's cgroup
165 :param extension_name: The extension executing the command878 :param extension_name: The extension executing the command
166 :param command: The command to invoke879 :param command: The command to invoke
880 :param cmd_name: The type of the command(enable, install, etc.)
167 :param timeout: Number of seconds to wait for command completion881 :param timeout: Number of seconds to wait for command completion
168 :param cwd: The working directory for the command882 :param cwd: The working directory for the command
169 :param env: The environment to pass to the command's process883 :param env: The environment to pass to the command's process
@@ -172,39 +886,207 @@ class CGroupConfigurator(object):
172 :param stderr: File object to redirect stderr to886 :param stderr: File object to redirect stderr to
173 :param error_code: Extension error code to raise in case of error887 :param error_code: Extension error code to raise in case of error
174 """888 """
175 if not self.enabled():889 if self.enabled():
176 process = subprocess.Popen(command,890 try:
177 shell=shell,891 return self._cgroups_api.start_extension_command(extension_name, command, cmd_name, timeout,
178 cwd=cwd,892 shell=shell, cwd=cwd, env=env, stdout=stdout,
179 env=env,893 stderr=stderr, error_code=error_code)
180 stdout=stdout,894 except SystemdRunError as exception:
181 stderr=stderr,895 reason = 'Failed to start {0} using systemd-run, will try invoking the extension directly. Error: {1}'.format(
182 preexec_fn=os.setsid)896 extension_name, ustr(exception))
183897 self.disable(reason, DisableCgroups.ALL)
184 process_output = handle_process_completion(process=process,898 # fall-through and re-invoke the extension
185 command=command,899
186 timeout=timeout,900 # subprocess-popen-preexec-fn<W1509> Disabled: code is not multi-threaded
187 stdout=stdout,901 process = subprocess.Popen(command, shell=shell, cwd=cwd, env=env, stdout=stdout, stderr=stderr, preexec_fn=os.setsid) # pylint: disable=W1509
188 stderr=stderr,902 return handle_process_completion(process=process, command=command, timeout=timeout, stdout=stdout, stderr=stderr, error_code=error_code)
189 error_code=error_code)903
190 else:904 def __reset_extension_cpu_quota(self, extension_name):
191 extension_cgroups, process_output = self._cgroups_api.start_extension_command(extension_name,905 """
192 command,906 Removes any CPUQuota on the extension
193 timeout,907
194 shell=shell,908 NOTE: This resets the quota on the extension's slice; any local overrides on the VM will take precedence
195 cwd=cwd,909 over this setting.
196 env=env,910 """
197 stdout=stdout,911 if self.enabled():
198 stderr=stderr,912 self.setup_extension_slice(extension_name, cpu_quota=None)
199 error_code=error_code)913
200914 def setup_extension_slice(self, extension_name, cpu_quota):
201 return process_output915 """
202916 Each extension runs under its own slice (Ex "Microsoft.CPlat.Extension.slice"). All the slices for
203 # unique instance for the singleton (TODO: find a better pattern for a singleton)917 extensions are grouped under "azure-vmextensions.slice.
918
919 This method ensures that the extension slice is created. Setup should create
920 under /lib/systemd/system if it is not exist.
921 TODO: set memory quotas
922 """
923 if self.enabled():
924 unit_file_install_path = systemd.get_unit_file_install_path()
925 extension_slice_path = os.path.join(unit_file_install_path,
926 SystemdCgroupsApi.get_extension_slice_name(extension_name))
927 try:
928 cpu_quota = str(cpu_quota) + "%" if cpu_quota is not None else "" # setting an empty value resets to the default (infinity)
929 if cpu_quota == "":
930 _log_cgroup_info("CPUQuota not set for {0}", extension_name)
931 else:
932 _log_cgroup_info("Ensuring the {0}'s CPUQuota is {1}", extension_name, cpu_quota)
933 slice_contents = _EXTENSION_SLICE_CONTENTS.format(extension_name=extension_name,
934 cpu_quota=cpu_quota)
935 CGroupConfigurator._Impl.__create_unit_file(extension_slice_path, slice_contents)
936 except Exception as exception:
937 _log_cgroup_warning("Failed to set the extension {0} slice and quotas: {1}", extension_name,
938 ustr(exception))
939 CGroupConfigurator._Impl.__cleanup_unit_file(extension_slice_path)
940
def remove_extension_slice(self, extension_name):
    """
    If cgroups are enabled and the slice unit file for the given extension exists under the unit file
    install path, stops tracking the extension's cgroups and then cleans up that unit file.

    Does nothing if cgroups are disabled or the unit file does not exist.
    NOTE(review): the previous description said the unit is also stopped so that the /sys/fs/cgroup controller
    paths are cleaned up; presumably that is done by __cleanup_unit_file -- confirm.
    """
    if not self.enabled():
        return
    install_path = systemd.get_unit_file_install_path()
    slice_name = SystemdCgroupsApi.get_extension_slice_name(extension_name)
    slice_path = os.path.join(install_path, slice_name)
    if not os.path.exists(slice_path):
        return
    self.stop_tracking_extension_cgroups(extension_name)
    CGroupConfigurator._Impl.__cleanup_unit_file(slice_path)
953
def set_extension_services_cpu_memory_quota(self, services_list):
    """
    Creates the drop-in files for each service in the given list; each service is a dict with the service's
    'name' and, optionally, its 'cpuQuotaPercentage'.

    For every service with a name, the CPU accounting and memory accounting drop-in files are created under
    <unit file install path>/<service name>.d, plus the CPU quota drop-in file when a quota is given.
    ex: /lib/systemd/system/extension.service.d/11-CPUAccounting.conf

    Does nothing if cgroups are disabled or the list is None.
    TODO: set memory quotas
    """
    if not self.enabled() or services_list is None:
        return

    for service in services_list:
        service_name = service.get('name', None)
        unit_file_path = systemd.get_unit_file_install_path()
        if service_name is None or unit_file_path is None:
            continue

        drop_in_path = os.path.join(unit_file_path, "{0}.d".format(service_name))
        # accounting drop-ins are always created; the quota drop-in only when the service declares a quota
        files_to_create = [
            (os.path.join(drop_in_path, _DROP_IN_FILE_CPU_ACCOUNTING), _DROP_IN_FILE_CPU_ACCOUNTING_CONTENTS),
            (os.path.join(drop_in_path, _DROP_IN_FILE_MEMORY_ACCOUNTING), _DROP_IN_FILE_MEMORY_ACCOUNTING_CONTENTS),
        ]

        cpu_quota = service.get('cpuQuotaPercentage', None)
        if cpu_quota is not None:
            quota_percentage = str(cpu_quota) + "%"
            _log_cgroup_info("Ensuring the {0}'s CPUQuota is {1}", service_name, quota_percentage)
            quota_file = os.path.join(drop_in_path, _DROP_IN_FILE_CPU_QUOTA)
            files_to_create.append((quota_file, _DROP_IN_FILE_CPU_QUOTA_CONTENTS_FORMAT.format(quota_percentage)))

        self.__create_all_files(files_to_create)
        self.__reload_systemd_config()
986
def __reset_extension_services_cpu_quota(self, services_list):
    """
    Removes any CPUQuota on the given extension services; each service is a dict with the service's 'name'.

    For every service with a name, the CPU quota drop-in file under <unit file install path>/<service name>.d
    is rewritten with an empty quota, unless it already has exactly that content.

    Does nothing if cgroups are disabled or the list is None. Errors are logged as warnings and not raised.

    NOTE: This resets the quota on the extension service's default dropin file; any local overrides on the VM will take precedence
    over this setting.
    """
    if not self.enabled() or services_list is None:
        return

    service_name = None  # kept outside the loop so the warning below can name the service that failed
    try:
        for service in services_list:
            service_name = service.get('name', None)
            unit_file_path = systemd.get_unit_file_install_path()
            if service_name is None or unit_file_path is None:
                continue

            drop_in_path = os.path.join(unit_file_path, "{0}.d".format(service_name))
            cpu_quota = ""  # setting an empty value resets to the default (infinity)
            drop_in_file_cpu_quota = os.path.join(drop_in_path, _DROP_IN_FILE_CPU_QUOTA)
            cpu_quota_contents = _DROP_IN_FILE_CPU_QUOTA_CONTENTS_FORMAT.format(cpu_quota)
            if os.path.exists(drop_in_file_cpu_quota):
                with open(drop_in_file_cpu_quota, "r") as file_:
                    if file_.read() == cpu_quota_contents:
                        # This service is already reset; skip only this one. Returning here would leave
                        # the quota of every remaining service in the list untouched.
                        continue
            self.__create_all_files([(drop_in_file_cpu_quota, cpu_quota_contents)])
    except Exception as exception:
        _log_cgroup_warning('Failed to reset CPUQuota for {0} : {1}', service_name, ustr(exception))
1014
def remove_extension_services_drop_in_files(self, services_list):
    """
    Removes the drop-in files from the <service name>.d folder of each service in the given list; each
    service is a dict with the service's 'name' and, optionally, its 'cpuQuotaPercentage'.

    The CPU accounting and memory accounting drop-in files are always cleaned up; the CPU quota drop-in file
    only when the service declares a quota. Does nothing if the list is None.
    """
    if services_list is None:
        return

    for service in services_list:
        service_name = service.get('name', None)
        unit_file_path = systemd.get_unit_file_install_path()
        if service_name is None or unit_file_path is None:
            continue

        drop_in_path = os.path.join(unit_file_path, "{0}.d".format(service_name))
        files_to_cleanup = [
            os.path.join(drop_in_path, _DROP_IN_FILE_CPU_ACCOUNTING),
            os.path.join(drop_in_path, _DROP_IN_FILE_MEMORY_ACCOUNTING),
        ]
        if service.get('cpuQuotaPercentage', None) is not None:
            files_to_cleanup.append(os.path.join(drop_in_path, _DROP_IN_FILE_CPU_QUOTA))

        CGroupConfigurator._Impl.__cleanup_all_files(files_to_cleanup)
        _log_cgroup_info("Drop in files removed for {0}".format(service_name))
1039
def stop_tracking_extension_services_cgroups(self, services_list):
    """
    Calls stop_tracking_unit_cgroups for each service in the given list that has a 'name'.

    Does nothing if cgroups are disabled or the list is None.
    """
    if not self.enabled() or services_list is None:
        return
    for service in services_list:
        service_name = service.get('name', None)
        if service_name is None:
            continue
        self.stop_tracking_unit_cgroups(service_name)
1049
def start_tracking_extension_services_cgroups(self, services_list):
    """
    Calls start_tracking_unit_cgroups for each service in the given list that has a 'name'.

    Does nothing if cgroups are disabled or the list is None.
    """
    if not self.enabled() or services_list is None:
        return
    for service in services_list:
        service_name = service.get('name', None)
        if service_name is None:
            continue
        self.start_tracking_unit_cgroups(service_name)
1059
1060 @staticmethod
def get_extension_services_list():
    """
    ResourceLimits for extensions are coming from <extName>/HandlerManifest.json file.
    This method looks for the HandlerManifest.json of every directory under conf.get_lib_dir() and
    reads the extension services if ResourceLimits are present.

    Returns a dict mapping the extension name (taken from the manifest's path) to the value of
    resourceLimits.services in the first element of the manifest, or to None when the manifest has no
    resourceLimits. Directories whose name starts with 'WALinuxAgent' are skipped.

    Manifests that cannot be read, are not valid JSON, or do not have the expected structure are logged
    as warnings and omitted from the result; no exception is raised for them.
    """
    extensions_services = {}
    for manifest_path in glob.iglob(os.path.join(conf.get_lib_dir(), "*/HandlerManifest.json")):
        match = re.search("(?P<extname>[\\w+\\.-]+).HandlerManifest\\.json", manifest_path)
        if match is None:
            continue
        extensions_name = match.group('extname')
        if extensions_name.startswith('WALinuxAgent'):
            continue
        try:
            data = json.loads(fileutil.read_file(manifest_path))
            resource_limits = data[0].get('resourceLimits', None)
            services = resource_limits.get('services') if resource_limits else None
            extensions_services[extensions_name] = services
        except (IOError, OSError) as e:
            _log_cgroup_warning(
                'Failed to load manifest file ({0}): {1}'.format(manifest_path, e.strerror))
        except (ValueError, IndexError, KeyError, TypeError, AttributeError):
            # ValueError: the file is not valid JSON. The other errors: the JSON is valid but is not a
            # non-empty list of objects (e.g. "[]" or "{}"); treat it the same way instead of letting
            # the exception escape to the caller.
            _log_cgroup_warning('Malformed manifest file ({0}).'.format(manifest_path))
    return extensions_services
1084
1085 # unique instance for the singleton
204 _instance = None1086 _instance = None
2051087
206 @staticmethod1088 @staticmethod
207 def get_instance():1089 def get_instance():
208 if CGroupConfigurator._instance is None:1090 if CGroupConfigurator._instance is None:
209 CGroupConfigurator._instance = CGroupConfigurator.__impl()1091 CGroupConfigurator._instance = CGroupConfigurator._Impl()
210 return CGroupConfigurator._instance1092 return CGroupConfigurator._instance
diff --git a/azurelinuxagent/common/cgroupstelemetry.py b/azurelinuxagent/common/cgroupstelemetry.py
index 4bbcba1..7b6bba0 100644
--- a/azurelinuxagent/common/cgroupstelemetry.py
+++ b/azurelinuxagent/common/cgroupstelemetry.py
@@ -15,101 +15,26 @@
15# Requires Python 2.6+ and Openssl 1.0+15# Requires Python 2.6+ and Openssl 1.0+
16import errno16import errno
17import threading17import threading
18from collections import namedtuple
19from datetime import datetime as dt
2018
21from azurelinuxagent.common import logger19from azurelinuxagent.common import logger
22from azurelinuxagent.common.cgroup import CpuCgroup, CGroupContollers20from azurelinuxagent.common.cgroup import CpuCgroup
23from azurelinuxagent.common.exception import CGroupsException
24from azurelinuxagent.common.future import ustr21from azurelinuxagent.common.future import ustr
25from azurelinuxagent.common.logger import EVERY_SIX_HOURS
26from azurelinuxagent.common.resourceusage import MemoryResourceUsage, ProcessInfo
27
28MetricValue = namedtuple('Metric', ['category', 'counter', 'instance', 'value'])
29StatmMetricValue = namedtuple('StatmMetricValue', ['pid_name_cmdline', 'resource_metric'])
30
31DELIM = " | "
32DEFAULT_PROCESS_NAME = "NO_PROCESS_FOUND"
33DEFAULT_PROCESS_COMMANDLINE = "NO_CMDLINE_FOUND"
34
35
36class MetricsCategory(object):
37 MEMORY_CATEGORY = "Memory"
38 PROCESS_CATEGORY = "Process"
39
40
41class MetricsCounter(object):
42 PROCESSOR_PERCENT_TIME = "% Processor Time"
43 TOTAL_MEM_USAGE = "Total Memory Usage"
44 MAX_MEM_USAGE = "Max Memory Usage"
45 MEM_USED_BY_PROCESS = "Memory Used by Process"
4622
4723
48class CGroupsTelemetry(object):24class CGroupsTelemetry(object):
49 """25 """
50 """26 """
51 _tracked = []27 _tracked = {}
52 _cgroup_metrics = {}28 _track_throttled_time = False
53 _rlock = threading.RLock()29 _rlock = threading.RLock()
5430
55 @staticmethod31 @staticmethod
56 def get_process_info_summary(process_id):32 def set_track_throttled_time(value):
57 process_cmdline = DEFAULT_PROCESS_COMMANDLINE33 CGroupsTelemetry._track_throttled_time = value
58 process_name = DEFAULT_PROCESS_NAME
59
60 # The ProcessName and ProcessCommandLine can generate Exception if the file /proc/<pid>/{comm,cmdline} cease to
61 # exist; eg: the process can die, or finish. Which is why we need Default Names, in case we fail to fetch the
62 # details from those files.
63 try:
64 process_cmdline = ProcessInfo.get_proc_cmdline(process_id) if not None else DEFAULT_PROCESS_COMMANDLINE
65 except Exception as e:
66 logger.periodic_info(EVERY_SIX_HOURS, "[PERIODIC] {0}", ustr(e))
67
68 try:
69 process_name = ProcessInfo.get_proc_name(process_id) if not None else DEFAULT_PROCESS_NAME
70 except Exception as e:
71 logger.periodic_info(EVERY_SIX_HOURS, "[PERIODIC] {0}", ustr(e))
72
73 return process_id + DELIM + process_name + DELIM + process_cmdline
7434
75 @staticmethod35 @staticmethod
76 def _get_metrics_list(metric):36 def get_track_throttled_time():
77 return [metric.average(), metric.min(), metric.max(), metric.median(), metric.count(),37 return CGroupsTelemetry._track_throttled_time
78 metric.first_poll_time(), metric.last_poll_time()]
79
80 @staticmethod
81 def _process_cgroup_metric(cgroup_metrics):
82 memory_usage = cgroup_metrics.get_memory_metrics()
83 max_memory_usage = cgroup_metrics.get_max_memory_metrics()
84 cpu_usage = cgroup_metrics.get_cpu_metrics()
85 memory_usage_per_process = cgroup_metrics.get_proc_statm_memory_metrics()
86
87 processed_extension = {}