Merge ~calvinmwadime/ubuntu/+source/walinuxagent:mantic-merge-2.9.1.1 into ubuntu/+source/walinuxagent:ubuntu/mantic-devel

Proposed by Calvin Mwadime Makokha
Status: Superseded
Proposed branch: ~calvinmwadime/ubuntu/+source/walinuxagent:mantic-merge-2.9.1.1
Merge into: ubuntu/+source/walinuxagent:ubuntu/mantic-devel
Diff against target: 74801 lines (+43588/-14724)
404 files modified
.github/PULL_REQUEST_TEMPLATE.md (+1/-2)
.github/codecov.yml (+2/-0)
.github/workflows/ci_pr.yml (+128/-0)
.gitignore (+1/-2)
CODEOWNERS (+3/-1)
README.md (+110/-33)
SECURITY.md (+41/-0)
azurelinuxagent/agent.py (+203/-66)
azurelinuxagent/common/AgentGlobals.py (+39/-0)
azurelinuxagent/common/agent_supported_feature.py (+122/-0)
azurelinuxagent/common/cgroup.py (+208/-77)
azurelinuxagent/common/cgroupapi.py (+214/-429)
azurelinuxagent/common/cgroupconfigurator.py (+1000/-118)
azurelinuxagent/common/cgroupstelemetry.py (+25/-265)
azurelinuxagent/common/conf.py (+272/-21)
azurelinuxagent/common/datacontract.py (+4/-2)
azurelinuxagent/common/dhcp.py (+14/-11)
azurelinuxagent/common/event.py (+286/-112)
azurelinuxagent/common/exception.py (+72/-18)
azurelinuxagent/common/future.py (+91/-15)
azurelinuxagent/common/interfaces.py (+49/-0)
azurelinuxagent/common/logcollector.py (+401/-0)
azurelinuxagent/common/logcollector_manifests.py (+122/-0)
azurelinuxagent/common/logger.py (+35/-7)
azurelinuxagent/common/osutil/alpine.py (+2/-2)
azurelinuxagent/common/osutil/arch.py (+9/-2)
azurelinuxagent/common/osutil/bigip.py (+31/-30)
azurelinuxagent/common/osutil/clearlinux.py (+28/-16)
azurelinuxagent/common/osutil/coreos.py (+9/-3)
azurelinuxagent/common/osutil/debian.py (+13/-13)
azurelinuxagent/common/osutil/default.py (+441/-315)
azurelinuxagent/common/osutil/devuan.py (+52/-0)
azurelinuxagent/common/osutil/factory.py (+60/-33)
azurelinuxagent/common/osutil/fedora.py (+77/-0)
azurelinuxagent/common/osutil/freebsd.py (+43/-33)
azurelinuxagent/common/osutil/gaia.py (+29/-17)
azurelinuxagent/common/osutil/iosxe.py (+19/-7)
azurelinuxagent/common/osutil/mariner.py (+69/-0)
azurelinuxagent/common/osutil/nsbsd.py (+28/-26)
azurelinuxagent/common/osutil/openbsd.py (+16/-19)
azurelinuxagent/common/osutil/openwrt.py (+14/-13)
azurelinuxagent/common/osutil/photonos.py (+65/-0)
azurelinuxagent/common/osutil/redhat.py (+45/-16)
azurelinuxagent/common/osutil/suse.py (+93/-36)
azurelinuxagent/common/osutil/systemd.py (+86/-0)
azurelinuxagent/common/osutil/ubuntu.py (+32/-16)
azurelinuxagent/common/persist_firewall_rules.py (+338/-0)
azurelinuxagent/common/protocol/__init__.py (+0/-5)
azurelinuxagent/common/protocol/extensions_goal_state.py (+244/-0)
azurelinuxagent/common/protocol/extensions_goal_state_factory.py (+36/-0)
azurelinuxagent/common/protocol/extensions_goal_state_from_extensions_config.py (+571/-0)
azurelinuxagent/common/protocol/extensions_goal_state_from_vm_settings.py (+583/-0)
azurelinuxagent/common/protocol/goal_state.py (+705/-0)
azurelinuxagent/common/protocol/hostplugin.py (+371/-38)
azurelinuxagent/common/protocol/imds.py (+30/-14)
azurelinuxagent/common/protocol/metadata_server_migration_util.py (+79/-0)
azurelinuxagent/common/protocol/ovfenv.py (+7/-6)
azurelinuxagent/common/protocol/restapi.py (+165/-91)
azurelinuxagent/common/protocol/util.py (+109/-159)
azurelinuxagent/common/protocol/wire.py (+516/-1095)
azurelinuxagent/common/rdma.py (+199/-56)
azurelinuxagent/common/singletonperthread.py (+30/-0)
azurelinuxagent/common/telemetryevent.py (+72/-3)
azurelinuxagent/common/utils/archive.py (+204/-111)
azurelinuxagent/common/utils/cryptutil.py (+28/-25)
azurelinuxagent/common/utils/extensionprocessutil.py (+31/-7)
azurelinuxagent/common/utils/fileutil.py (+9/-7)
azurelinuxagent/common/utils/flexible_version.py (+29/-9)
azurelinuxagent/common/utils/networkutil.py (+172/-3)
azurelinuxagent/common/utils/restutil.py (+153/-56)
azurelinuxagent/common/utils/shellutil.py (+260/-57)
azurelinuxagent/common/utils/textutil.py (+63/-10)
azurelinuxagent/common/utils/timeutil.py (+39/-0)
azurelinuxagent/common/version.py (+102/-31)
azurelinuxagent/daemon/main.py (+36/-22)
azurelinuxagent/daemon/resourcedisk/default.py (+13/-12)
azurelinuxagent/daemon/resourcedisk/factory.py (+3/-7)
azurelinuxagent/daemon/resourcedisk/freebsd.py (+1/-1)
azurelinuxagent/daemon/resourcedisk/openwrt.py (+2/-2)
azurelinuxagent/daemon/scvmm.py (+3/-3)
azurelinuxagent/ga/collect_logs.py (+353/-0)
azurelinuxagent/ga/collect_telemetry_events.py (+586/-0)
azurelinuxagent/ga/env.py (+177/-123)
azurelinuxagent/ga/exthandlers.py (+1493/-645)
azurelinuxagent/ga/monitor.py (+249/-450)
azurelinuxagent/ga/periodic_operation.py (+81/-0)
azurelinuxagent/ga/remoteaccess.py (+81/-90)
azurelinuxagent/ga/send_telemetry_events.py (+164/-0)
azurelinuxagent/ga/update.py (+1089/-380)
azurelinuxagent/pa/deprovision/arch.py (+1/-1)
azurelinuxagent/pa/deprovision/clearlinux.py (+4/-2)
azurelinuxagent/pa/deprovision/coreos.py (+1/-1)
azurelinuxagent/pa/deprovision/default.py (+55/-17)
azurelinuxagent/pa/deprovision/factory.py (+5/-8)
azurelinuxagent/pa/deprovision/ubuntu.py (+2/-2)
azurelinuxagent/pa/provision/cloudinit.py (+33/-91)
azurelinuxagent/pa/provision/cloudinitdetect.py (+72/-0)
azurelinuxagent/pa/provision/default.py (+28/-42)
azurelinuxagent/pa/provision/factory.py (+3/-3)
azurelinuxagent/pa/rdma/centos.py (+6/-6)
azurelinuxagent/pa/rdma/factory.py (+9/-7)
azurelinuxagent/pa/rdma/suse.py (+12/-3)
azurelinuxagent/pa/rdma/ubuntu.py (+14/-14)
bin/py3/waagent (+53/-0)
bin/waagent (+5/-1)
bin/waagent2.0 (+5/-1)
ci/2.7.pylintrc (+42/-0)
ci/3.6.pylintrc (+40/-0)
ci/nosetests.sh (+25/-0)
config/66-azure-storage.rules (+23/-17)
config/alpine/waagent.conf (+4/-11)
config/arch/waagent.conf (+4/-6)
config/bigip/waagent.conf (+3/-10)
config/clearlinux/waagent.conf (+3/-5)
config/coreos/waagent.conf (+4/-11)
config/debian/waagent.conf (+10/-11)
config/devuan/waagent.conf (+130/-0)
config/freebsd/waagent.conf (+6/-13)
config/gaia/waagent.conf (+4/-6)
config/iosxe/waagent.conf (+4/-6)
config/mariner/waagent.conf (+88/-0)
config/nsbsd/waagent.conf (+4/-6)
config/openbsd/waagent.conf (+4/-6)
config/photonos/waagent.conf (+80/-0)
config/suse/waagent.conf (+12/-8)
config/ubuntu/waagent.conf (+10/-11)
config/waagent.conf (+26/-10)
debian/changelog (+21/-0)
debian/control (+0/-1)
debian/docs (+0/-1)
debian/install (+1/-1)
debian/patches/add_manpage.patch (+471/-0)
debian/patches/disable_udev_overrides.patch (+17/-3)
debian/patches/fix_cgroup_v2_mounting_and_systemd_process.patch (+164/-0)
debian/patches/fix_systemd_networkd_lease_file_path (+83/-0)
debian/patches/series (+4/-2)
debian/patches/sru_v2_9_1_1.patch (+233/-0)
debian/rules (+3/-8)
debian/walinuxagent.manpages (+1/-0)
debian/watch (+2/-2)
dev/null (+0/-29)
init/azure-vmextensions.slice (+7/-0)
init/azure.slice (+4/-0)
init/devuan/default/walinuxagent (+2/-0)
init/devuan/walinuxagent (+344/-0)
init/mariner/waagent.service (+16/-0)
init/photonos/waagent.service (+16/-0)
init/redhat/py2/waagent.service (+19/-0)
init/redhat/waagent.service (+19/-0)
init/sles/waagent.service (+16/-0)
init/ubuntu/walinuxagent.service (+3/-0)
makepkg.py (+66/-51)
setup.py (+143/-60)
test-requirements.txt (+19/-3)
tests/common/dhcp/test_dhcp.py (+27/-14)
tests/common/mock_cgroup_environment.py (+122/-0)
tests/common/mock_command.py (+17/-0)
tests/common/mock_environment.py (+168/-0)
tests/common/osutil/test_alpine.py (+3/-2)
tests/common/osutil/test_arch.py (+3/-2)
tests/common/osutil/test_bigip.py (+20/-21)
tests/common/osutil/test_clearlinux.py (+3/-2)
tests/common/osutil/test_coreos.py (+3/-2)
tests/common/osutil/test_default.py (+429/-316)
tests/common/osutil/test_default_osutil.py (+3/-162)
tests/common/osutil/test_factory.py (+144/-69)
tests/common/osutil/test_freebsd.py (+8/-7)
tests/common/osutil/test_nsbsd.py (+12/-11)
tests/common/osutil/test_openbsd.py (+3/-2)
tests/common/osutil/test_openwrt.py (+3/-2)
tests/common/osutil/test_photonos.py (+37/-0)
tests/common/osutil/test_redhat.py (+3/-2)
tests/common/osutil/test_suse.py (+3/-2)
tests/common/osutil/test_ubuntu.py (+1/-1)
tests/common/test_agent_supported_feature.py (+55/-0)
tests/common/test_cgroupapi.py (+130/-548)
tests/common/test_cgroupconfigurator.py (+973/-261)
tests/common/test_cgroups.py (+62/-82)
tests/common/test_cgroupstelemetry.py (+120/-406)
tests/common/test_conf.py (+28/-53)
tests/common/test_errorstate.py (+2/-1)
tests/common/test_event.py (+583/-314)
tests/common/test_logcollector.py (+477/-0)
tests/common/test_logger.py (+45/-45)
tests/common/test_persist_firewall_rules.py (+416/-0)
tests/common/test_singletonperthread.py (+164/-0)
tests/common/test_telemetryevent.py (+20/-19)
tests/common/test_version.py (+80/-36)
tests/daemon/test_daemon.py (+15/-14)
tests/daemon/test_resourcedisk.py (+5/-5)
tests/data/cgroups/cpu.stat (+3/-0)
tests/data/cgroups/cpu.stat_t0 (+3/-0)
tests/data/cgroups/cpu.stat_t1 (+3/-0)
tests/data/cgroups/cpuacct.stat (+2/-0)
tests/data/cgroups/memory_mount/memory.stat (+36/-0)
tests/data/cgroups/missing_memory_counters/memory.stat (+34/-0)
tests/data/cgroups/proc_pid_cgroup (+13/-0)
tests/data/cgroups/proc_self_cgroup (+13/-0)
tests/data/cgroups/sys_fs_cgroup_unified_cgroup.controllers (+7/-0)
tests/data/cloud-init/set-hostname (+4/-0)
tests/data/events/custom_script_1.tld (+30/-0)
tests/data/events/custom_script_2.tld (+30/-0)
tests/data/events/custom_script_extra_parameters.tld (+66/-0)
tests/data/events/custom_script_invalid_json.tld (+30/-0)
tests/data/events/custom_script_no_read_access.tld (+30/-0)
tests/data/events/custom_script_nonascii_characters.tld (+30/-0)
tests/data/events/event_with_callstack.waagent.tld (+1/-0)
tests/data/events/extension_events/different_cases/1591918616.json (+22/-0)
tests/data/events/extension_events/empty_message/1592350454.json (+24/-0)
tests/data/events/extension_events/extra_parameters/1592273009.json (+35/-0)
tests/data/events/extension_events/int_type/1519934744.json (+10/-0)
tests/data/events/extension_events/large_messages/1591921510.json (+12/-0)
tests/data/events/extension_events/malformed_files/1592008079.json (+13/-0)
tests/data/events/extension_events/malformed_files/1594857360.tld (+11/-0)
tests/data/events/extension_events/malformed_files/bad_json_files/1591816395.json (+3/-0)
tests/data/events/extension_events/malformed_files/bad_name_file.json (+24/-0)
tests/data/events/extension_events/missing_parameters/1592273793.json (+74/-0)
tests/data/events/extension_events/mix_files/1591835369.json (+3/-0)
tests/data/events/extension_events/mix_files/1591835848.json (+85/-0)
tests/data/events/extension_events/mix_files/1591835859.json (+11/-0)
tests/data/events/extension_events/special_chars/1591918939.json (+10/-0)
tests/data/events/extension_events/well_formed_files/1591905451.json (+82/-0)
tests/data/events/extension_events/well_formed_files/1592355539.json (+72/-0)
tests/data/events/extension_events/well_formed_files/9999999999.json (+82/-0)
tests/data/events/legacy_agent.tld (+66/-0)
tests/data/events/legacy_agent_no_timestamp.tld (+62/-0)
tests/data/ext/event_from_agent.json (+119/-1)
tests/data/ext/event_from_extension.xml (+9/-6)
tests/data/ext/sample-status-invalid-format-emptykey-line7.json (+37/-0)
tests/data/ext/sample-status-invalid-json-format.json (+37/-0)
tests/data/ext/sample-status-invalid-status-no-status-status-key.json (+35/-0)
tests/data/ext/sample-status-very-large-multiple-substatuses.json (+408/-0)
tests/data/ext/sample-status-very-large.json (+39/-0)
tests/data/ext/sample-status.json (+36/-0)
tests/data/ext/sample_ext-1.3.0/python.sh (+11/-0)
tests/data/ext/sample_ext-1.3.0/sample.py (+82/-23)
tests/data/hostgaplugin/ext_conf-empty_depends_on.xml (+56/-0)
tests/data/hostgaplugin/ext_conf-invalid_blob_type.xml (+94/-0)
tests/data/hostgaplugin/ext_conf-no_status_upload_blob.xml (+39/-0)
tests/data/hostgaplugin/ext_conf-requested_version.xml (+148/-0)
tests/data/hostgaplugin/ext_conf.xml (+146/-0)
tests/data/hostgaplugin/in_vm_artifacts_profile.json (+1/-0)
tests/data/hostgaplugin/vm_settings-difference_in_required_features.json (+201/-0)
tests/data/hostgaplugin/vm_settings-empty_depends_on.json (+69/-0)
tests/data/hostgaplugin/vm_settings-fabric-no_thumbprints.json (+192/-0)
tests/data/hostgaplugin/vm_settings-invalid_blob_type.json (+104/-0)
tests/data/hostgaplugin/vm_settings-missing_cert.json (+68/-0)
tests/data/hostgaplugin/vm_settings-no_manifests.json (+73/-0)
tests/data/hostgaplugin/vm_settings-no_status_upload_blob.json (+66/-0)
tests/data/hostgaplugin/vm_settings-out-of-sync.json (+66/-0)
tests/data/hostgaplugin/vm_settings-parse_error.json (+72/-0)
tests/data/hostgaplugin/vm_settings-requested_version.json (+141/-0)
tests/data/hostgaplugin/vm_settings-unsupported_version.json (+72/-0)
tests/data/hostgaplugin/vm_settings.json (+201/-0)
tests/data/init/azure-vmextensions.slice (+6/-0)
tests/data/init/azure-walinuxagent-logcollector.slice (+9/-0)
tests/data/init/azure.slice (+4/-0)
tests/data/init/walinuxagent.service (+23/-0)
tests/data/init/walinuxagent.service.previous (+20/-0)
tests/data/init/walinuxagent.service_system-slice (+23/-0)
tests/data/test_waagent.conf (+6/-5)
tests/data/wire/certs-2.xml (+85/-0)
tests/data/wire/certs.xml (+80/-76)
tests/data/wire/certs_no_format_specified.xml (+78/-74)
tests/data/wire/ext_conf-no_gs_metadata.xml (+27/-0)
tests/data/wire/ext_conf.xml (+7/-5)
tests/data/wire/ext_conf_additional_locations.xml (+34/-0)
tests/data/wire/ext_conf_aks_extension.xml (+70/-0)
tests/data/wire/ext_conf_autoupgrade.xml (+9/-7)
tests/data/wire/ext_conf_autoupgrade_internalversion.xml (+9/-7)
tests/data/wire/ext_conf_dependencies_with_empty_settings.xml (+33/-0)
tests/data/wire/ext_conf_in_vm_artifacts_profile.xml (+29/-0)
tests/data/wire/ext_conf_in_vm_empty_artifacts_profile.xml (+29/-0)
tests/data/wire/ext_conf_in_vm_metadata.xml (+29/-0)
tests/data/wire/ext_conf_internalversion.xml (+9/-7)
tests/data/wire/ext_conf_invalid_and_valid_handlers.xml (+35/-0)
tests/data/wire/ext_conf_invalid_vm_metadata.xml (+29/-0)
tests/data/wire/ext_conf_missing_family.xml (+15/-14)
tests/data/wire/ext_conf_missing_requested_version.xml (+39/-0)
tests/data/wire/ext_conf_multiple_extensions.xml (+13/-32)
tests/data/wire/ext_conf_no_extensions-block_blob.xml (+13/-0)
tests/data/wire/ext_conf_no_extensions-no_status_blob.xml (+12/-0)
tests/data/wire/ext_conf_no_extensions-page_blob.xml (+25/-0)
tests/data/wire/ext_conf_no_public.xml (+25/-24)
tests/data/wire/ext_conf_no_settings.xml (+24/-23)
tests/data/wire/ext_conf_requested_version.xml (+29/-0)
tests/data/wire/ext_conf_required_features.xml (+41/-0)
tests/data/wire/ext_conf_sequencing.xml (+9/-7)
tests/data/wire/ext_conf_settings_case_mismatch.xml (+57/-0)
tests/data/wire/ext_conf_upgradeguid.xml (+7/-5)
tests/data/wire/ga_manifest.xml (+10/-31)
tests/data/wire/ga_manifest_no_upgrade.xml (+21/-21)
tests/data/wire/goal_state.xml (+7/-7)
tests/data/wire/goal_state_no_certs.xml (+27/-0)
tests/data/wire/goal_state_no_ext.xml (+6/-5)
tests/data/wire/goal_state_noop.xml (+14/-0)
tests/data/wire/goal_state_remote_access.xml (+9/-8)
tests/data/wire/in_vm_artifacts_profile.json (+1/-0)
tests/data/wire/invalid_config/ext_conf_multiple_depends_on_for_single_handler.xml (+45/-0)
tests/data/wire/invalid_config/ext_conf_multiple_runtime_settings_same_plugin.xml (+31/-0)
tests/data/wire/invalid_config/ext_conf_multiple_settings_for_same_handler.xml (+33/-0)
tests/data/wire/invalid_config/ext_conf_plugin_settings_version_mismatch.xml (+31/-0)
tests/data/wire/invalid_config/ext_conf_single_and_multi_config_settings_same_plugin.xml (+31/-0)
tests/data/wire/manifest.xml (+16/-16)
tests/data/wire/manifest_deletion.xml (+1/-1)
tests/data/wire/multi-config/ext_conf_mc_disabled_extensions.xml (+84/-0)
tests/data/wire/multi-config/ext_conf_mc_update_extensions.xml (+75/-0)
tests/data/wire/multi-config/ext_conf_multi_config_no_dependencies.xml (+75/-0)
tests/data/wire/multi-config/ext_conf_with_disabled_multi_config.xml (+129/-0)
tests/data/wire/multi-config/ext_conf_with_multi_config.xml (+131/-0)
tests/data/wire/multi-config/ext_conf_with_multi_config_dependencies.xml (+99/-0)
tests/data/wire/trans_cert (+17/-17)
tests/data/wire/trans_prv (+26/-26)
tests/data/wire/trans_pub (+7/-7)
tests/distro/test_resourceDisk.py (+1/-1)
tests/distro/test_scvmm.py (+6/-5)
tests/ga/extension_emulator.py (+373/-0)
tests/ga/mocks.py (+119/-0)
tests/ga/test_collect_logs.py (+239/-0)
tests/ga/test_collect_telemetry_events.py (+576/-0)
tests/ga/test_env.py (+50/-50)
tests/ga/test_extension.py (+2088/-1355)
tests/ga/test_exthandlers.py (+283/-108)
tests/ga/test_exthandlers_download_extension.py (+116/-58)
tests/ga/test_exthandlers_exthandlerinstance.py (+10/-12)
tests/ga/test_monitor.py (+155/-1141)
tests/ga/test_multi_config_extension.py (+1229/-0)
tests/ga/test_periodic_operation.py (+156/-0)
tests/ga/test_remoteaccess.py (+41/-49)
tests/ga/test_remoteaccess_handler.py (+429/-446)
tests/ga/test_report_status.py (+119/-0)
tests/ga/test_send_telemetry_events.py (+430/-0)
tests/ga/test_update.py (+1913/-733)
tests/pa/test_deprovision.py (+4/-4)
tests/pa/test_provision.py (+34/-27)
tests/protocol/HttpRequestPredicates.py (+101/-0)
tests/protocol/mocks.py (+167/-0)
tests/protocol/mockwiredata.py (+260/-47)
tests/protocol/test_datacontract.py (+5/-5)
tests/protocol/test_extensions_goal_state_from_extensions_config.py (+62/-0)
tests/protocol/test_extensions_goal_state_from_vm_settings.py (+156/-0)
tests/protocol/test_goal_state.py (+545/-0)
tests/protocol/test_healthservice.py (+1/-1)
tests/protocol/test_hostplugin.py (+620/-445)
tests/protocol/test_image_info_matcher.py (+2/-1)
tests/protocol/test_imds.py (+49/-46)
tests/protocol/test_metadata_server_migration_util.py (+134/-0)
tests/protocol/test_protocol_util.py (+208/-85)
tests/protocol/test_wire.py (+864/-934)
tests/test_agent.py (+154/-17)
tests/tools.py (+87/-86)
tests/utils/cgroups_tools.py (+1/-2)
tests/utils/event_logger_tools.py (+65/-0)
tests/utils/miscellaneous_tools.py (+62/-0)
tests/utils/test_archive.py (+123/-179)
tests/utils/test_crypt_util.py (+2/-7)
tests/utils/test_extension_process_util.py (+103/-54)
tests/utils/test_file_util.py (+19/-20)
tests/utils/test_flexible_version.py (+21/-19)
tests/utils/test_network_util.py (+36/-1)
tests/utils/test_rest_util.py (+74/-55)
tests/utils/test_shell_util.py (+337/-70)
tests/utils/test_text_util.py (+32/-14)
tests_e2e/orchestrator/docker/Dockerfile (+85/-0)
tests_e2e/orchestrator/lib/agent_junit.py (+66/-0)
tests_e2e/orchestrator/lib/agent_test_loader.py (+257/-0)
tests_e2e/orchestrator/lib/agent_test_suite.py (+645/-0)
tests_e2e/orchestrator/lib/agent_test_suite_combinator.py (+249/-0)
tests_e2e/orchestrator/runbook.yml (+142/-0)
tests_e2e/orchestrator/sample_runbooks/existing_vm.yml (+143/-0)
tests_e2e/orchestrator/sample_runbooks/local_machine/hello_world.py (+32/-0)
tests_e2e/orchestrator/sample_runbooks/local_machine/local.yml (+32/-0)
tests_e2e/orchestrator/scripts/check-agent-log.py (+49/-0)
tests_e2e/orchestrator/scripts/collect-logs (+34/-0)
tests_e2e/orchestrator/scripts/get-agent-bin-path (+56/-0)
tests_e2e/orchestrator/scripts/get-agent-modules-path (+37/-0)
tests_e2e/orchestrator/scripts/get-agent-python (+59/-0)
tests_e2e/orchestrator/scripts/install-agent (+137/-0)
tests_e2e/orchestrator/scripts/install-tools (+135/-0)
tests_e2e/orchestrator/scripts/uncompress.py (+33/-0)
tests_e2e/orchestrator/scripts/unzip.py (+36/-0)
tests_e2e/pipeline/pipeline-cleanup.yml (+58/-0)
tests_e2e/pipeline/pipeline.yml (+119/-0)
tests_e2e/pipeline/scripts/execute_tests.sh (+120/-0)
tests_e2e/test_suites/agent_bvt.yml (+8/-0)
tests_e2e/test_suites/fail.yml (+5/-0)
tests_e2e/test_suites/images.yml (+94/-0)
tests_e2e/test_suites/pass.yml (+4/-0)
tests_e2e/tests/bvts/extension_operations.py (+94/-0)
tests_e2e/tests/bvts/run_command.py (+94/-0)
tests_e2e/tests/bvts/vm_access.py (+79/-0)
tests_e2e/tests/error_test.py (+32/-0)
tests_e2e/tests/fail_test.py (+33/-0)
tests_e2e/tests/lib/agent_log.py (+446/-0)
tests_e2e/tests/lib/agent_test.py (+66/-0)
tests_e2e/tests/lib/agent_test_context.py (+164/-0)
tests_e2e/tests/lib/identifiers.py (+63/-0)
tests_e2e/tests/lib/logging.py (+155/-0)
tests_e2e/tests/lib/retry.py (+59/-0)
tests_e2e/tests/lib/shell.py (+56/-0)
tests_e2e/tests/lib/ssh_client.py (+85/-0)
tests_e2e/tests/lib/virtual_machine.py (+143/-0)
tests_e2e/tests/lib/vm_extension.py (+239/-0)
tests_e2e/tests/pass_test.py (+33/-0)
Reviewer Review Type Date Requested Status
Lucas Kanashiro Pending
git-ubuntu import Pending
Review via email: mp+452920@code.launchpad.net

This proposal has been superseded by a proposal from 2023-10-31.

Commit message

added sru tests

added manpages

debian/patches: update cgroup logic to include v2

fix_systemd_networkd_lease_file_path.patch: patch osutil/ubuntu.py

debian/control: Remove isc-dhcp-client from Depends

Remove upstart config

debian/rules: stop installing SysV init files and stop running tests

disable_udev_overrides.patch: patch setup.py

debian/patches: Remove deprecated patches

debian/install: add new udev rules

debian/docs: Remove Changelog

debian/watch: Fix package version regex pattern

New upstream version 2.9.1

To post a comment you must log in.
Revision history for this message
Bryce Harrington (bryce) wrote :

Hi Calvin, it's a bit late in the release for merges, did you have an MRE or SRU bug to accompany this?

Also, I'm not spotting changes to debian/changelog, was that intentional to be excluded?

Revision history for this message
Calvin Mwadime Makokha (calvinmwadime) wrote :

Hi Bryce,

> Hi Calvin, it's a bit late in the release for merges, did you have an MRE or
> SRU bug to accompany this?
After discussion with Utkarsh, it was decided we shall revisit this MP later since I was late to raise it.

> Also, I'm not spotting changes to debian/changelog, was that intentional to be
> excluded?
There are changes made to it. I think the diff display might have been truncated; however, you can see the file in the list of modified files above.

Revision history for this message
Lucas Kanashiro (lucaskanashiro) wrote :

Hi Calvin,

Since you decided to work on this later, I'd ask you to check if there is a way to remove the review slot for ubuntu-sponsors. Otherwise, it will be kept in the general sponsorship queue and patch pilots will keep revisiting this.

Revision history for this message
Calvin Mwadime Makokha (calvinmwadime) wrote :

Hi Lucas, what is the best way to do so? I have changed its status from `needs review` to `work in progress`; will that help?

Revision history for this message
Calvin Mwadime Makokha (calvinmwadime) wrote :

Hi Lucas, there has been a change in urgency for this merge. I would like to request a review of this merge request. Changing the status to `needs review`.

8a67b96... by Calvin Mwadime Makokha

fix_systemd_networkd_lease_file_path.patch: patch osutil/ubuntu.py

Use systemd_networkd for newer ubuntu servers

7d40672... by Calvin Mwadime Makokha

changelog: update version

0b1a8de... by Calvin Mwadime Makokha

debian/patches: update cgroup logic to include v2

Properly handle systemd using cgroup v2

c7bd079... by Calvin Mwadime Makokha

changelog update

dbbfc63... by Calvin Mwadime Makokha

Add manpages

fbe4dd5... by Calvin Mwadime Makokha

changelog update

1d131ad... by Calvin Mwadime Makokha

Added sru tests

9c007ee... by Calvin Mwadime Makokha

changelog update

Unmerged commits

9c007ee... by Calvin Mwadime Makokha

changelog update

1d131ad... by Calvin Mwadime Makokha

Added sru tests

fbe4dd5... by Calvin Mwadime Makokha

changelog update

dbbfc63... by Calvin Mwadime Makokha

Add manpages

c7bd079... by Calvin Mwadime Makokha

changelog update

0b1a8de... by Calvin Mwadime Makokha

debian/patches: update cgroup logic to include v2

Properly handle systemd using cgroup v2

7d40672... by Calvin Mwadime Makokha

changelog: update version

8a67b96... by Calvin Mwadime Makokha

fix_systemd_networkd_lease_file_path.patch: patch osutil/ubuntu.py

Use systemd_networkd for newer ubuntu servers

95c00e8... by Calvin Mwadime Makokha

debian/control: Remove isc-dhcp-client from Depends

9f935bf... by Calvin Mwadime Makokha

Remove upstart config

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1diff --git a/.flake8 b/.flake8
2deleted file mode 100644
3index 63303c3..0000000
4--- a/.flake8
5+++ /dev/null
6@@ -1,32 +0,0 @@
7-#
8-# The project did not use flake8 since inception so there are a number
9-# of time-consuming flake8-identified improvements that are just a lot
10-# of busy work. Each of these should be disabled and code cleaned up.
11-#
12-# W503: Line break occurred before a binary operator
13-# W504: Line break occurred after a binary operator
14-# E126: Continuation line over-indented for hanging indent
15-# E127: Continuation line over-indented for visual indent
16-# E128: Continuation line under-indented for visual indent
17-# E201: Whitespace after '('
18-# E202: Whitespace before ')'
19-# E203: Whitespace before ':'
20-# E221: Multiple spaces before operator
21-# E225: Missing whitespace around operator
22-# E226: Missing whitespace around arithmetic operator
23-# E231: Missing whitespace after ',', ';', or ':'
24-# E261: At least two spaces before inline comment
25-# E265: Block comment should start with '# '
26-# E302: Expected 2 blank lines, found 0
27-# E501: Line too long (xx > yy characters)
28-# E502: The backslash is redundant between brackets
29-# F401: Module imported but unused
30-# F403: 'from module import *' used; unable to detect undefined names
31-# F405: Name may be undefined, or defined from star imports: module
32-#
33-
34-[flake8]
35-ignore = W503,W504,E126,E127,E128,E201,E202,E203,E221,E225,E226,E231,E261,E265,E302,E501,E502,F401,F403,F405
36-exclude = .git,__pycache__,docs/source/conf.py,old,build,dist,tests
37-max-complexity = 30
38-max-line-length = 120
39\ No newline at end of file
40diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
41index edfa1e6..fdcc07c 100644
42--- a/.github/PULL_REQUEST_TEMPLATE.md
43+++ b/.github/PULL_REQUEST_TEMPLATE.md
44@@ -14,9 +14,8 @@ This will expedite the process of getting your pull request merged and avoid ext
45 ### PR information
46 - [ ] The title of the PR is clear and informative.
47 - [ ] There are a small number of commits, each of which has an informative message. This means that previously merged commits do not appear in the history of the PR. For information on cleaning up the commits in your pull request, [see this page](https://github.com/Azure/azure-powershell/blob/master/documentation/development-docs/cleaning-up-commits.md).
48-- [ ] Except for special cases involving multiple contributors, the PR is started from a fork of the main repository, not a branch.
49 - [ ] If applicable, the PR references the bug/issue that it fixes in the description.
50-- [ ] New Unit tests were added for the changes made and Travis.CI is passing.
51+- [ ] New Unit tests were added for the changes made
52
53 ### Quality of Code and Contribution Guidelines
54 - [ ] I have read the [contribution guidelines](https://github.com/Azure/WALinuxAgent/blob/master/.github/CONTRIBUTING.md).
55\ No newline at end of file
56diff --git a/.github/codecov.yml b/.github/codecov.yml
57new file mode 100644
58index 0000000..77707aa
59--- /dev/null
60+++ b/.github/codecov.yml
61@@ -0,0 +1,2 @@
62+github_checks:
63+ annotations: false
64diff --git a/.github/workflows/ci_pr.yml b/.github/workflows/ci_pr.yml
65new file mode 100644
66index 0000000..e559268
67--- /dev/null
68+++ b/.github/workflows/ci_pr.yml
69@@ -0,0 +1,128 @@
70+name: CI Unit tests
71+
72+on:
73+ push:
74+ branches: [ "*" ]
75+ pull_request:
76+ branches: [ "*" ]
77+ workflow_dispatch:
78+
79+jobs:
80+ test-legacy-python-versions:
81+
82+ strategy:
83+ fail-fast: false
84+ matrix:
85+ include:
86+ - python-version: 2.6
87+ - python-version: 3.4
88+
89+ name: "Python ${{ matrix.python-version }} Unit Tests"
90+ runs-on: ubuntu-20.04
91+ container:
92+ image: ubuntu:16.04
93+ volumes:
94+ - /home/waagent:/home/waagent
95+ defaults:
96+ run:
97+ shell: bash -l {0}
98+
99+ env:
100+ NOSEOPTS: "--verbose"
101+
102+ steps:
103+ - uses: actions/checkout@v3
104+
105+ - name: Install Python ${{ matrix.python-version }}
106+ run: |
107+ apt-get update
108+ apt-get install -y curl bzip2 sudo python3
109+ curl https://dcrdata.blob.core.windows.net/python/python-${{ matrix.python-version }}.tar.bz2 -o python-${{ matrix.python-version }}.tar.bz2
110+ sudo tar xjvf python-${{ matrix.python-version }}.tar.bz2 --directory /
111+
112+ - name: Test with nosetests
113+ run: |
114+ if [[ ${{ matrix.python-version }} == 2.6 ]]; then
115+ source /home/waagent/virtualenv/python2.6.9/bin/activate
116+ else
117+ source /home/waagent/virtualenv/python3.4.8/bin/activate
118+ fi
119+ ./ci/nosetests.sh
120+ exit $?
121+
122+ test-current-python-versions:
123+
124+ strategy:
125+ fail-fast: false
126+ matrix:
127+ include:
128+
129+ - python-version: 2.7
130+ PYLINTOPTS: "--rcfile=ci/2.7.pylintrc --ignore=tests_e2e,makepkg.py"
131+
132+ - python-version: 3.5
133+ PYLINTOPTS: "--rcfile=ci/3.6.pylintrc --ignore=tests_e2e,makepkg.py"
134+
135+ - python-version: 3.6
136+ PYLINTOPTS: "--rcfile=ci/3.6.pylintrc --ignore=tests_e2e"
137+
138+ - python-version: 3.7
139+ PYLINTOPTS: "--rcfile=ci/3.6.pylintrc --ignore=tests_e2e"
140+
141+ - python-version: 3.8
142+ PYLINTOPTS: "--rcfile=ci/3.6.pylintrc --ignore=tests_e2e"
143+
144+ - python-version: 3.9
145+ PYLINTOPTS: "--rcfile=ci/3.6.pylintrc"
146+ additional-nose-opts: "--with-coverage --cover-erase --cover-inclusive --cover-branches --cover-package=azurelinuxagent"
147+
148+ name: "Python ${{ matrix.python-version }} Unit Tests"
149+ runs-on: ubuntu-20.04
150+
151+ env:
152+ PYLINTOPTS: ${{ matrix.PYLINTOPTS }}
153+ PYLINTFILES: "azurelinuxagent setup.py makepkg.py tests tests_e2e"
154+ NOSEOPTS: "--with-timer ${{ matrix.additional-nose-opts }}"
155+ PYTHON_VERSION: ${{ matrix.python-version }}
156+
157+ steps:
158+
159+ - name: Checkout WALinuxAgent repo
160+ uses: actions/checkout@v3
161+
162+ - name: Setup Python ${{ matrix.python-version }}
163+ uses: actions/setup-python@v4
164+ with:
165+ python-version: ${{ matrix.python-version }}
166+
167+ - name: Install dependencies
168+ id: install-dependencies
169+ run: |
170+ sudo env "PATH=$PATH" python -m pip install --upgrade pip
171+ sudo env "PATH=$PATH" pip install -r requirements.txt
172+ sudo env "PATH=$PATH" pip install -r test-requirements.txt
173+
174+ - name: Run pylint
175+ run: |
176+ pylint $PYLINTOPTS --jobs=0 $PYLINTFILES
177+
178+ - name: Test with nosetests
179+ if: success() || (failure() && steps.install-dependencies.outcome == 'success')
180+ run: |
181+ ./ci/nosetests.sh
182+ exit $?
183+
184+ - name: Compile Coverage
185+ if: matrix.python-version == 3.9
186+ run: |
187+ echo looking for coverage files :
188+ ls -alh | grep -i coverage
189+ sudo env "PATH=$PATH" coverage combine coverage.*.data
190+ sudo env "PATH=$PATH" coverage xml
191+ sudo env "PATH=$PATH" coverage report
192+
193+ - name: Upload Coverage
194+ if: matrix.python-version == 3.9
195+ uses: codecov/codecov-action@v2
196+ with:
197+ file: ./coverage.xml
198\ No newline at end of file
199diff --git a/.gitignore b/.gitignore
200index 0a31340..fd64d33 100644
201--- a/.gitignore
202+++ b/.gitignore
203@@ -17,8 +17,6 @@ develop-eggs/
204 dist/
205 downloads/
206 eggs/
207-lib/
208-lib64/
209 parts/
210 sdist/
211 var/
212@@ -92,3 +90,4 @@ ENV/
213
214 # pyenv
215 .python-version
216+.vscode/
217diff --git a/.travis.yml b/.travis.yml
218deleted file mode 100644
219index fa672d3..0000000
220--- a/.travis.yml
221+++ /dev/null
222@@ -1,43 +0,0 @@
223----
224-os: linux
225-dist: xenial
226-language: python
227-env:
228- - NOSEOPTS="--verbose" SETUPOPTS=""
229- # Add SETUPOPTS="check flake8" to enable flake8 checks
230-
231-matrix:
232- # exclude the default "python" build - we're being specific here...
233- exclude:
234- - python:
235- env:
236- - NOSEOPTS="" SETUPOPTS="check flake8"
237-
238- include:
239- - python: 2.6
240- dist: trusty
241- env:
242- - NOSEOPTS="--verbose" SETUPOPTS=""
243- - python: 2.7
244- - python: 3.4
245- - python: 3.6
246- - python: 3.7
247- env:
248- - >-
249- NOSEOPTS="--verbose --with-coverage --cover-inclusive
250- --cover-min-percentage=60 --cover-branches
251- --cover-package=azurelinuxagent --cover-xml"
252- SETUPOPTS=""
253-
254-install:
255- - pip install -r requirements.txt
256- - pip install -r test-requirements.txt
257-
258-script:
259- # future: - pylint setup.py makepkg.py azurelinuxagent/
260- - nosetests $NOSEOPTS --attr '!requires_sudo' tests
261- - sudo env "PATH=$PATH" nosetests $NOSEOPTS --verbose --attr 'requires_sudo' tests
262- - if [ ! -z "$SETUPOPTS" ]; then /usr/bin/env python setup.py $SETUPOPTS; fi
263-
264-after_success:
265- - if [[ $TRAVIS_PYTHON_VERSION == 3.7 ]]; then codecov; fi
266\ No newline at end of file
267diff --git a/CODEOWNERS b/CODEOWNERS
268index a8f2d5d..8707e60 100644
269--- a/CODEOWNERS
270+++ b/CODEOWNERS
271@@ -1,3 +1,4 @@
272+1
273 # See https://help.github.com/articles/about-codeowners/
274 # for more info about CODEOWNERS file
275
276@@ -9,6 +10,7 @@
277 # when there are requests for changes in the provisioning agent. For any
278 # questions, please feel free to reach out to thstring@microsoft.com.
279 /azurelinuxagent/pa/ @trstringer @anhvoms
280+/tests/pa/ @trstringer @anhvoms
281
282 #
283 # RDMA
284@@ -19,4 +21,4 @@
285 #
286 # Linux Agent team
287 #
288-* @narrieta @vrdmr @pgombar @larohra
289+* @narrieta @ZhidongPeng @nagworld9 @maddieford
290diff --git a/Changelog b/Changelog
291deleted file mode 100644
292index da68890..0000000
293--- a/Changelog
294+++ /dev/null
295@@ -1,38 +0,0 @@
296-WALinuxAgent Changelog
297-|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
298-
299-Refer to releases WALinuxAgent release page: https://github.com/Azure/WALinuxAgent/releases for detailed changelog after v2.2.0
300-
301-12 August 2016, v2.1.6
302- . Improved RDMA support
303- . Extension state migration
304- . Alpine Linux support
305- . Fixes for #347, #351, #353
306-
307-15 July 2016, v2.1.5
308- . Goal state processing extension
309- . Multi-nic improvements
310- . Bug fixes for #145, #141, #133, #116, #187, #169, #104, #127, #163,
311- #190, #185, #174
312-
313-09 Mar 2016, WALinuxAgent 2.1.4
314- . Add support for FreeBSD
315- . Fix a bug for internal extension version resolving
316-
317-29 Jan 2016, WALinuxAgent 2.1.3
318- . Fixed endpoint probing for Azure Stack
319- . Multiple fixes for extension handling
320-
321-07 Dec 2015, WALinuxAgent 2.1.2
322- . Multiple fixes for extension handling and provisioning
323-
324-07 Aug 2015, WALinuxAgent 2.1.1
325- . Support python3
326- . Fixed bugs for metadata protocol
327- . Fixed a few pylint warnings
328- . Enabled travis-ci
329-
330-|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
331-01 Jul 2015, WALinuxAgent 2.1.0
332- . Divide waagent into different modules
333-
334diff --git a/README.md b/README.md
335index 0069d46..ae6a851 100644
336--- a/README.md
337+++ b/README.md
338@@ -1,31 +1,15 @@
339+
340 # Microsoft Azure Linux Agent
341
342-## Develop branch status
343-
344-[![Travis CI](https://travis-ci.org/Azure/WALinuxAgent.svg?branch=develop)](https://travis-ci.org/Azure/WALinuxAgent/branches)
345-[![CodeCov](https://codecov.io/gh/Azure/WALinusAgent/branch/develop/graph/badge.svg)](https://codecov.io/gh/Azure/WALinuxAgent/branch/develop)
346-
347-Each badge below represents our basic validation tests for an image, which are executed several times each day. These include provisioning, user account, disk, extension and networking scenarios.
348-
349-Note: These badges represent testing to our develop branch which might not be stable. For a stable build please use master branch instead.
350-
351-Image | Status |
352-------|--------|
353-Canonical UbuntuServer 14.04.5-LTS|![badge](https://dcrbadges.blob.core.windows.net/scenarios/Canonical_UbuntuServer_14.04.5-LTS__agent--bvt.svg)
354-Canonical UbuntuServer 14.04.5-DAILY-LTS|![badge](https://dcrbadges.blob.core.windows.net/scenarios/Canonical_UbuntuServer_14.04.5-DAILY-LTS__agent--bvt.svg)
355-Canonical UbuntuServer 16.04-LTS|![badge](https://dcrbadges.blob.core.windows.net/scenarios/Canonical_UbuntuServer_16.04-LTS__agent--bvt.svg)
356-Canonical UbuntuServer 16.04-DAILY-LTS|![badge](https://dcrbadges.blob.core.windows.net/scenarios/Canonical_UbuntuServer_16.04-DAILY-LTS__agent--bvt.svg)
357-Canonical UbuntuServer 18.04-LTS|![badge](https://dcrbadges.blob.core.windows.net/scenarios/Canonical_UbuntuServer_18.04-LTS__agent--bvt.svg)
358-Canonical UbuntuServer 18.04-DAILY-LTS|![badge](https://dcrbadges.blob.core.windows.net/scenarios/Canonical_UbuntuServer_18.04-DAILY-LTS__agent--bvt.svg)
359-Credativ Debian 8|![badge](https://dcrbadges.blob.core.windows.net/scenarios/Credativ_Debian_8__agent--bvt.svg)
360-Credativ Debian 8-DAILY|![badge](https://dcrbadges.blob.core.windows.net/scenarios/Credativ_Debian_8-DAILY__agent--bvt.svg)
361-Credativ Debian 9|![badge](https://dcrbadges.blob.core.windows.net/scenarios/Credativ_Debian_9__agent--bvt.svg)
362-Credativ Debian 9-DAILY|![badge](https://dcrbadges.blob.core.windows.net/scenarios/Credativ_Debian_9-DAILY__agent--bvt.svg)
363-OpenLogic CentOS 6.9|![badge](https://dcrbadges.blob.core.windows.net/scenarios/OpenLogic_CentOS_6.9__agent--bvt.svg)
364-OpenLogic CentOS 7.4|![badge](https://dcrbadges.blob.core.windows.net/scenarios/OpenLogic_CentOS_7.4__agent--bvt.svg)
365-RedHat RHEL 6.9|![badge](https://dcrbadges.blob.core.windows.net/scenarios/RedHat_RHEL_6.9__agent--bvt.svg)
366-RedHat RHEL 7-RAW|![badge](https://dcrbadges.blob.core.windows.net/scenarios/RedHat_RHEL_7-RAW__agent--bvt.svg)
367-SUSE SLES 12-SP3|![badge](https://dcrbadges.blob.core.windows.net/scenarios/SUSE_SLES_12-SP3__agent--bvt.svg)
368+## Linux distributions support
369+
370+Our daily automation tests most of the [Linux distributions supported by Azure](https://docs.microsoft.com/en-us/azure/virtual-machines/linux/endorsed-distros); the Agent can be
371+used on other distributions as well, but development, testing and support for those are done by the open source community.
372+
373+Testing is done using the develop branch, which can be unstable. For a stable build please use the master branch instead.
374+
375+[![CodeCov](https://codecov.io/gh/Azure/WALinuxAgent/branch/develop/graph/badge.svg)](https://codecov.io/gh/Azure/WALinuxAgent/branch/develop)
376+
377
378 ## Introduction
379
380@@ -49,7 +33,6 @@ functionality for Linux IaaS deployments:
381
382 * Kernel
383 * Configure virtual NUMA (disable for kernel <2.6.37)
384- * Consume Hyper-V entropy for /dev/random
385 * Configure SCSI timeouts for the root device (which could be remote)
386
387 * Diagnostics
388@@ -79,13 +62,15 @@ The agent will use an HTTP proxy if provided via the `http_proxy` (for `http` re
389 `https_proxy` (for `https` requests) environment variables. The `HttpProxy.Host` and
390 `HttpProxy.Port` configuration variables (see below), if used, will override the environment
391 settings. Due to limitations of Python, the agent *does not* support HTTP proxies requiring
392-authentication.
393+authentication. Note that when the agent service is managed by systemd, environment variables
394+such as `http_proxy` and `https_proxy` should be defined using one of the mechanisms provided by
395+systemd (e.g. by using Environment or EnvironmentFile in the service file).
396
397 ## Requirements
398
399 The following systems have been tested and are known to work with the Azure
400 Linux Agent. Please note that this list may differ from the official list
401-of supported systems on the Microsoft Azure Platform as described [here](http://support.microsoft.com/kb/2805216).
402+of supported systems on the Microsoft Azure Platform as described [here](https://docs.microsoft.com/en-us/azure/virtual-machines/linux/endorsed-distros).
403
404 Waagent depends on some system packages in order to function properly:
405
406@@ -109,6 +94,12 @@ For more advanced installation options, such as installing to custom locations o
407 sudo python setup.py install --register-service
408 ```
409
410+For Python 3, use:
411+
412+```bash
413+ sudo python3 setup.py install --register-service
414+```
415+
416 You can view more installation options by running:
417
418 ```bash
419@@ -177,6 +168,8 @@ For CoreOS, use:
420
421 `-start`: Run waagent as a background process
422
423+`-collect-logs [-full]`: Runs the log collector utility that collects relevant agent logs for debugging and stores them in the agent folder on disk. Exact location will be shown when run. Use flag `-full` for more exhaustive log collection.
424+
425 ## Configuration
426
427 A configuration file (/etc/waagent.conf) controls the actions of waagent. Blank lines and lines whose first character is a `#` are ignored (end-of-line comments are *not* supported).
428@@ -185,6 +178,7 @@ A sample configuration file is shown below:
429
430 ```yml
431 Extensions.Enabled=y
432+Extensions.GoalStatePeriod=6
433 Provisioning.Agent=auto
434 Provisioning.DeleteRootPassword=n
435 Provisioning.RegenerateSshHostKeyPair=y
436@@ -202,6 +196,8 @@ ResourceDisk.EnableSwap=n
437 ResourceDisk.EnableSwapEncryption=n
438 ResourceDisk.SwapSizeMB=0
439 Logs.Verbose=n
440+Logs.Collect=y
441+Logs.CollectPeriod=3600
442 OS.AllowHTTP=n
443 OS.RootDeviceScsiTimeout=300
444 OS.EnableFIPS=n
445@@ -210,8 +206,6 @@ OS.SshClientAliveInterval=180
446 OS.SshDir=/etc/ssh
447 HttpProxy.Host=None
448 HttpProxy.Port=None
449-CGroups.EnforceLimits=y
450-CGroups.Excluded=customscript,runcommand
451 ```
452
453 The various configuration options are described in detail below. Configuration
454@@ -238,6 +232,32 @@ without the agent. In order to do that, the `provisionVMAgent` flag must be set
455 provisioning time, via whichever API is being used. We will provide more details on
456 this on our wiki when it is generally available.
457
458+#### __Extensions.GoalStatePeriod__
459+
460+_Type: Integer_
461+_Default: 6_
462+
463+How often to poll for new goal states (in seconds) and report the status of the VM
464+and extensions. Goal states describe the desired state of the extensions on the VM.
465+
466+_Note_: setting this parameter to more than a few minutes can cause the state of
467+the VM to be reported as unresponsive/unavailable on the Azure portal. Also, this
468+setting affects how fast the agent starts executing extensions.
469+
470+#### __AutoUpdate.Enabled__
471+
472+_Type: Boolean_
473+_Default: y_
474+
475+Enables auto-update of the Extension Handler. The Extension Handler is responsible
476+for managing extensions and reporting VM status. The core functionality of the agent
477+is contained in the Extension Handler, and we encourage users to enable this option
478+in order to maintain an up to date version.
479+
480+On most distros the default value is 'y'.
481+
482+For more information on the agent version, see our [FAQ](https://github.com/Azure/WALinuxAgent/wiki/FAQ#what-does-goal-state-agent-mean-in-waagent---version-output).
483+
484 #### __Provisioning.Agent__
485
486 _Type: String_
487@@ -261,7 +281,22 @@ _Note_: This configuration option has been removed and has no effect. waagent
488 now auto-detects cloud-init as a provisioning agent (with an option to override
489 with `Provisioning.Agent`).
490
491-#### __Provisioning.UseCloudInit__ (*removed in 2.2.45*)
492+#### __Provisioning.MonitorHostName__
493+
494+_Type: Boolean_
495+_Default: n_
496+
497+Monitor host name changes and publish changes via DHCP requests.
498+
499+#### __Provisioning.MonitorHostNamePeriod__
500+
501+_Type: Integer_
502+_Default: 30_
503+
504+How often to monitor host name changes (in seconds). This setting is ignored if
505+MonitorHostName is not set.
506+
507+#### __Provisioning.UseCloudInit__
508
509 _Type: Boolean_
510 _Default: n_
511@@ -397,7 +432,7 @@ system swap space.
512 _Type: Boolean_
513 _Default: n_
514
515-If set, the swap file (/swapfile) is mounted as an encrypted filesystem.
516+If set, the swap file (/swapfile) is mounted as an encrypted filesystem (flag supported only on FreeBSD).
517
518 #### __ResourceDisk.SwapSizeMB__
519
520@@ -414,6 +449,25 @@ _Default: n_
521 If set, log verbosity is boosted. Waagent logs to /var/log/waagent.log and
522 leverages the system logrotate functionality to rotate logs.
523
524+
525+#### __Logs.Collect__
526+
527+_Type: Boolean_
528+_Default: y_
529+
530+If set, agent logs will be periodically collected and uploaded to a secure location for improved supportability.
531+
532+NOTE: This feature relies on the agent's resource usage features (cgroups); this flag has no effect on distros where those features are not supported.
533+
534+#### __Logs.CollectPeriod__
535+
536+_Type: Integer_
537+_Default: 3600_
538+
539+This configures how frequently to collect and upload logs. Default is each hour.
540+
541+NOTE: This only takes effect if the Logs.Collect option is enabled.
542+
543 #### __OS.AllowHTTP__
544
545 _Type: Boolean_
546@@ -442,6 +496,14 @@ OpenSSL commands. This signals OpenSSL to use any installed FIPS-compliant libra
547 Note that the agent itself has no FIPS-specific code. _If no FIPS-compliant certificates are
548 installed, then enabling this option will cause all OpenSSL commands to fail._
549
550+#### __OS.MonitorDhcpClientRestartPeriod__
551+
552+_Type: Integer_
553+_Default: 30_
554+
555+The agent monitors restarts of the DHCP client and restores network rules when they happen. This
556+setting determines how often (in seconds) to monitor for restarts.
557+
558 #### __OS.RootDeviceScsiTimeout__
559
560 _Type: Integer_
561@@ -450,6 +512,14 @@ _Default: 300_
562 This configures the SCSI timeout in seconds on the root device. If not set, the
563 system defaults are used.
564
565+#### __OS.RootDeviceScsiTimeoutPeriod__
566+
567+_Type: Integer_
568+_Default: 30_
569+
570+How often to set the SCSI timeout on the root device (in seconds). This setting is
571+ignored if RootDeviceScsiTimeout is not set.
572+
573 #### __OS.OpensslPath__
574
575 _Type: String_
576@@ -458,6 +528,13 @@ _Default: None_
577 This can be used to specify an alternate path for the openssl binary to use for
578 cryptographic operations.
579
580+#### __OS.RemovePersistentNetRulesPeriod__
581+_Type: Integer_
582+_Default: 30_
583+
584+How often (in seconds) to remove the udev rules for persistent network interface names (75-persistent-net-generator.rules
585+and /etc/udev/rules.d/70-persistent-net.rules).
586+
587 #### __OS.SshClientAliveInterval__
588
589 _Type: Integer_
590diff --git a/SECURITY.md b/SECURITY.md
591new file mode 100644
592index 0000000..e138ec5
593--- /dev/null
594+++ b/SECURITY.md
595@@ -0,0 +1,41 @@
596+<!-- BEGIN MICROSOFT SECURITY.MD V0.0.8 BLOCK -->
597+
598+## Security
599+
600+Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/).
601+
602+If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/opensource/security/definition), please report it to us as described below.
603+
604+## Reporting Security Issues
605+
606+**Please do not report security vulnerabilities through public GitHub issues.**
607+
608+Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/opensource/security/create-report).
609+
610+If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey).
611+
612+You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc).
613+
614+Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:
615+
616+ * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
617+ * Full paths of source file(s) related to the manifestation of the issue
618+ * The location of the affected source code (tag/branch/commit or direct URL)
619+ * Any special configuration required to reproduce the issue
620+ * Step-by-step instructions to reproduce the issue
621+ * Proof-of-concept or exploit code (if possible)
622+ * Impact of the issue, including how an attacker might exploit the issue
623+
624+This information will help us triage your report more quickly.
625+
626+If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/opensource/security/bounty) page for more details about our active programs.
627+
628+## Preferred Languages
629+
630+We prefer all communications to be in English.
631+
632+## Policy
633+
634+Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/opensource/security/cvd).
635+
636+<!-- END MICROSOFT SECURITY.MD BLOCK -->
637diff --git a/azurelinuxagent/agent.py b/azurelinuxagent/agent.py
638index 6e65084..8c30348 100644
639--- a/azurelinuxagent/agent.py
640+++ b/azurelinuxagent/agent.py
641@@ -24,21 +24,47 @@ Module agent
642 from __future__ import print_function
643
644 import os
645-import sys
646 import re
647 import subprocess
648+import sys
649 import threading
650-import traceback
651+from azurelinuxagent.common import cgroupconfigurator, logcollector
652+from azurelinuxagent.common.cgroupapi import SystemdCgroupsApi
653
654-import azurelinuxagent.common.logger as logger
655-import azurelinuxagent.common.event as event
656 import azurelinuxagent.common.conf as conf
657-from azurelinuxagent.common.version import AGENT_NAME, AGENT_LONG_VERSION, \
658- DISTRO_NAME, DISTRO_VERSION, \
659- PY_VERSION_MAJOR, PY_VERSION_MINOR, \
660- PY_VERSION_MICRO, GOAL_STATE_AGENT_VERSION
661+import azurelinuxagent.common.event as event
662+import azurelinuxagent.common.logger as logger
663+from azurelinuxagent.common.future import ustr
664+from azurelinuxagent.common.logcollector import LogCollector, OUTPUT_RESULTS_FILE_PATH
665 from azurelinuxagent.common.osutil import get_osutil
666-from azurelinuxagent.common.utils import fileutil
667+from azurelinuxagent.common.utils import fileutil, textutil
668+from azurelinuxagent.common.utils.flexible_version import FlexibleVersion
669+from azurelinuxagent.common.utils.networkutil import AddFirewallRules
670+from azurelinuxagent.common.version import AGENT_NAME, AGENT_LONG_VERSION, AGENT_VERSION, \
671+ DISTRO_NAME, DISTRO_VERSION, \
672+ PY_VERSION_MAJOR, PY_VERSION_MINOR, \
673+ PY_VERSION_MICRO, GOAL_STATE_AGENT_VERSION, \
674+ get_daemon_version, set_daemon_version
675+from azurelinuxagent.ga.collect_logs import CollectLogsHandler, get_log_collector_monitor_handler
676+from azurelinuxagent.pa.provision.default import ProvisionHandler
677+
678+
679+class AgentCommands(object):
680+ """
681+ This is the list of all commands that the Linux Guest Agent supports
682+ """
683+ DeprovisionUser = "deprovision+user"
684+ Deprovision = "deprovision"
685+ Daemon = "daemon"
686+ Start = "start"
687+ RegisterService = "register-service"
688+ RunExthandlers = "run-exthandlers"
689+ Version = "version"
690+ ShowConfig = "show-configuration"
691+ Help = "help"
692+ CollectLogs = "collect-logs"
693+ SetupFirewall = "setup-firewall"
694+ Provision = "provision"
695
696
697 class Agent(object):
698@@ -49,24 +75,24 @@ class Agent(object):
699 self.conf_file_path = conf_file_path
700 self.osutil = get_osutil()
701
702- #Init stdout log
703+ # Init stdout log
704 level = logger.LogLevel.VERBOSE if verbose else logger.LogLevel.INFO
705 logger.add_logger_appender(logger.AppenderType.STDOUT, level)
706
707- #Init config
708+ # Init config
709 conf_file_path = self.conf_file_path \
710 if self.conf_file_path is not None \
711 else self.osutil.get_agent_conf_file_path()
712 conf.load_conf_from_file(conf_file_path)
713
714- #Init log
715+ # Init log
716 verbose = verbose or conf.get_logs_verbose()
717 level = logger.LogLevel.VERBOSE if verbose else logger.LogLevel.INFO
718- logger.add_logger_appender(logger.AppenderType.FILE, level,
719- path="/var/log/waagent.log")
720- if conf.get_logs_console():
721- logger.add_logger_appender(logger.AppenderType.CONSOLE, level,
722- path="/dev/console")
723+ logger.add_logger_appender(logger.AppenderType.FILE, level, path=conf.get_agent_log_file())
724+
725+ # echo the log to /dev/console if the machine will be provisioned
726+ if conf.get_logs_console() and not ProvisionHandler.is_provisioned():
727+ self.__add_console_appender(level)
728
729 if event.send_logs_to_telemetry():
730 logger.add_logger_appender(logger.AppenderType.TELEMETRY,
731@@ -84,22 +110,30 @@ class Agent(object):
732 "Exception occurred while creating extension "
733 "log directory {0}: {1}".format(ext_log_dir, e))
734
735- #Init event reporter
736+ # Init event reporter
737+ # Note that the reporter is not fully initialized here yet. Some telemetry fields are filled with data
738+ # originating from the goal state or IMDS, which requires a WireProtocol instance. Once a protocol
739+ # has been established, those fields must be explicitly initialized using
740+ # initialize_event_logger_vminfo_common_parameters(). Any events created before that initialization
741+ # will contain dummy values on those fields.
742 event.init_event_status(conf.get_lib_dir())
743- event_dir = os.path.join(conf.get_lib_dir(), "events")
744+ event_dir = os.path.join(conf.get_lib_dir(), event.EVENTS_DIRECTORY)
745 event.init_event_logger(event_dir)
746 event.enable_unhandled_err_dump("WALA")
747
748+ def __add_console_appender(self, level):
749+ logger.add_logger_appender(logger.AppenderType.CONSOLE, level, path="/dev/console")
750+
751 def daemon(self):
752 """
753 Run agent daemon
754 """
755+ set_daemon_version(AGENT_VERSION)
756 logger.set_prefix("Daemon")
757 threading.current_thread().setName("Daemon")
758 child_args = None \
759 if self.conf_file_path is None \
760 else "-configuration-path:{0}".format(self.conf_file_path)
761-
762 from azurelinuxagent.daemon import get_daemon_handler
763 daemon_handler = get_daemon_handler()
764 daemon_handler.run(child_args=child_args)
765@@ -137,6 +171,21 @@ class Agent(object):
766 """
767 logger.set_prefix("ExtHandler")
768 threading.current_thread().setName("ExtHandler")
769+
770+ #
771+ # Agents < 2.2.53 used to echo the log to the console. Since the extension handler could have been started by
772+ # one of those daemons, output a message indicating that output to the console will stop, otherwise users
773+ # may think that the agent died if they noticed that output to the console stops abruptly.
774+ #
775+ # Feel free to remove this code if telemetry shows there are no more agents < 2.2.53 in the field.
776+ #
777+ if conf.get_logs_console() and get_daemon_version() < FlexibleVersion("2.2.53"):
778+ self.__add_console_appender(logger.LogLevel.INFO)
779+ try:
780+ logger.info(u"The agent will now check for updates and then will process extensions. Output to /dev/console will be suspended during those operations.")
781+ finally:
782+ logger.disable_console_output()
783+
784 from azurelinuxagent.ga.update import get_update_handler
785 update_handler = get_update_handler()
786 update_handler.run(debug)
787@@ -146,91 +195,175 @@ class Agent(object):
788 for k in sorted(configuration.keys()):
789 print("{0} = {1}".format(k, configuration[k]))
790
791+ def collect_logs(self, is_full_mode):
792+ logger.set_prefix("LogCollector")
793+
794+ if is_full_mode:
795+ logger.info("Running log collector mode full")
796+ else:
797+ logger.info("Running log collector mode normal")
798+
799+ # Check the cgroups unit
800+ cpu_cgroup_path, memory_cgroup_path, log_collector_monitor = None, None, None
801+ if CollectLogsHandler.should_validate_cgroups():
802+ cgroups_api = SystemdCgroupsApi()
803+ cpu_cgroup_path, memory_cgroup_path = cgroups_api.get_process_cgroup_paths("self")
804
805-def main(args=[]):
806+ cpu_slice_matches = (cgroupconfigurator.LOGCOLLECTOR_SLICE in cpu_cgroup_path)
807+ memory_slice_matches = (cgroupconfigurator.LOGCOLLECTOR_SLICE in memory_cgroup_path)
808+
809+ if not cpu_slice_matches or not memory_slice_matches:
810+ logger.info("The Log Collector process is not in the proper cgroups:")
811+ if not cpu_slice_matches:
812+ logger.info("\tunexpected cpu slice")
813+ if not memory_slice_matches:
814+ logger.info("\tunexpected memory slice")
815+
816+ sys.exit(logcollector.INVALID_CGROUPS_ERRCODE)
817+
818+ try:
819+ log_collector = LogCollector(is_full_mode, cpu_cgroup_path, memory_cgroup_path)
820+ log_collector_monitor = get_log_collector_monitor_handler(log_collector.cgroups)
821+ log_collector_monitor.run()
822+ archive = log_collector.collect_logs_and_get_archive()
823+ logger.info("Log collection successfully completed. Archive can be found at {0} "
824+ "and detailed log output can be found at {1}".format(archive, OUTPUT_RESULTS_FILE_PATH))
825+ except Exception as e:
826+ logger.error("Log collection completed unsuccessfully. Error: {0}".format(ustr(e)))
827+ logger.info("Detailed log output can be found at {0}".format(OUTPUT_RESULTS_FILE_PATH))
828+ sys.exit(1)
829+ finally:
830+ if log_collector_monitor is not None:
831+ log_collector_monitor.stop()
832+
833+ @staticmethod
834+ def setup_firewall(firewall_metadata):
835+
836+ print("Setting up firewall for the WALinux Agent with args: {0}".format(firewall_metadata))
837+ try:
838+ AddFirewallRules.add_iptables_rules(firewall_metadata['wait'], firewall_metadata['dst_ip'],
839+ firewall_metadata['uid'])
840+ print("Successfully set the firewall rules")
841+ except Exception as error:
842+ print("Unable to add firewall rules. Error: {0}".format(ustr(error)))
843+ sys.exit(1)
844+
845+
846+def main(args=None):
847 """
848 Parse command line arguments, exit with usage() on error.
849 Invoke different methods according to different command
850 """
851+ if args is None:
852+ args = []
853 if len(args) <= 0:
854 args = sys.argv[1:]
855- command, force, verbose, debug, conf_file_path = parse_args(args)
856- if command == "version":
857+ command, force, verbose, debug, conf_file_path, log_collector_full_mode, firewall_metadata = parse_args(args)
858+ if command == AgentCommands.Version:
859 version()
860- elif command == "help":
861+ elif command == AgentCommands.Help:
862 print(usage())
863- elif command == "start":
864+ elif command == AgentCommands.Start:
865 start(conf_file_path=conf_file_path)
866 else:
867 try:
868 agent = Agent(verbose, conf_file_path=conf_file_path)
869- if command == "deprovision+user":
870+ if command == AgentCommands.DeprovisionUser:
871 agent.deprovision(force, deluser=True)
872- elif command == "deprovision":
873+ elif command == AgentCommands.Deprovision:
874 agent.deprovision(force, deluser=False)
875- elif command == "provision":
876+ elif command == AgentCommands.Provision:
877 agent.provision()
878- elif command == "register-service":
879+ elif command == AgentCommands.RegisterService:
880 agent.register_service()
881- elif command == "daemon":
882+ elif command == AgentCommands.Daemon:
883 agent.daemon()
884- elif command == "run-exthandlers":
885+ elif command == AgentCommands.RunExthandlers:
886 agent.run_exthandlers(debug)
887- elif command == "show-configuration":
888+ elif command == AgentCommands.ShowConfig:
889 agent.show_configuration()
890- except Exception:
891+ elif command == AgentCommands.CollectLogs:
892+ agent.collect_logs(log_collector_full_mode)
893+ elif command == AgentCommands.SetupFirewall:
894+ agent.setup_firewall(firewall_metadata)
895+ except Exception as e:
896 logger.error(u"Failed to run '{0}': {1}",
897 command,
898- traceback.format_exc())
899+ textutil.format_exception(e))
900+
901
902 def parse_args(sys_args):
903 """
904 Parse command line arguments
905 """
906- cmd = "help"
907+ cmd = AgentCommands.Help
908 force = False
909 verbose = False
910 debug = False
911 conf_file_path = None
912- for a in sys_args:
913- m = re.match("^(?:[-/]*)configuration-path:([\w/\.\-_]+)", a)
914+ log_collector_full_mode = False
915+ firewall_metadata = {
916+ "dst_ip": None,
917+ "uid": None,
918+ "wait": ""
919+ }
920+
921+ regex_cmd_format = "^([-/]*){0}"
922+
923+ for arg in sys_args:
924+ if arg == "":
925+ # Don't parse an empty parameter
926+ continue
927+ m = re.match("^(?:[-/]*)configuration-path:([\w/\.\-_]+)", arg) # pylint: disable=W1401
928 if not m is None:
929 conf_file_path = m.group(1)
930 if not os.path.exists(conf_file_path):
931 print("Error: Configuration file {0} does not exist".format(
932 conf_file_path), file=sys.stderr)
933- usage()
934+ print(usage())
935 sys.exit(1)
936-
937- elif re.match("^([-/]*)deprovision\\+user", a):
938- cmd = "deprovision+user"
939- elif re.match("^([-/]*)deprovision", a):
940- cmd = "deprovision"
941- elif re.match("^([-/]*)daemon", a):
942- cmd = "daemon"
943- elif re.match("^([-/]*)start", a):
944- cmd = "start"
945- elif re.match("^([-/]*)register-service", a):
946- cmd = "register-service"
947- elif re.match("^([-/]*)run-exthandlers", a):
948- cmd = "run-exthandlers"
949- elif re.match("^([-/]*)version", a):
950- cmd = "version"
951- elif re.match("^([-/]*)verbose", a):
952+ elif re.match("^([-/]*)deprovision\\+user", arg):
953+ cmd = AgentCommands.DeprovisionUser
954+ elif re.match(regex_cmd_format.format(AgentCommands.Deprovision), arg):
955+ cmd = AgentCommands.Deprovision
956+ elif re.match(regex_cmd_format.format(AgentCommands.Daemon), arg):
957+ cmd = AgentCommands.Daemon
958+ elif re.match(regex_cmd_format.format(AgentCommands.Start), arg):
959+ cmd = AgentCommands.Start
960+ elif re.match(regex_cmd_format.format(AgentCommands.RegisterService), arg):
961+ cmd = AgentCommands.RegisterService
962+ elif re.match(regex_cmd_format.format(AgentCommands.RunExthandlers), arg):
963+ cmd = AgentCommands.RunExthandlers
964+ elif re.match(regex_cmd_format.format(AgentCommands.Version), arg):
965+ cmd = AgentCommands.Version
966+ elif re.match(regex_cmd_format.format("verbose"), arg):
967 verbose = True
968- elif re.match("^([-/]*)debug", a):
969+ elif re.match(regex_cmd_format.format("debug"), arg):
970 debug = True
971- elif re.match("^([-/]*)force", a):
972+ elif re.match(regex_cmd_format.format("force"), arg):
973 force = True
974- elif re.match("^([-/]*)show-configuration", a):
975- cmd = "show-configuration"
976- elif re.match("^([-/]*)(help|usage|\\?)", a):
977- cmd = "help"
978+ elif re.match(regex_cmd_format.format(AgentCommands.ShowConfig), arg):
979+ cmd = AgentCommands.ShowConfig
980+ elif re.match("^([-/]*)(help|usage|\\?)", arg):
981+ cmd = AgentCommands.Help
982+ elif re.match(regex_cmd_format.format(AgentCommands.CollectLogs), arg):
983+ cmd = AgentCommands.CollectLogs
984+ elif re.match(regex_cmd_format.format("full"), arg):
985+ log_collector_full_mode = True
986+ elif re.match(regex_cmd_format.format(AgentCommands.SetupFirewall), arg):
987+ cmd = AgentCommands.SetupFirewall
988+ elif re.match(regex_cmd_format.format("dst_ip=(?P<dst_ip>[\\d.]{7,})"), arg):
989+ firewall_metadata['dst_ip'] = re.match(regex_cmd_format.format("dst_ip=(?P<dst_ip>[\\d.]{7,})"), arg).group(
990+ 'dst_ip')
991+ elif re.match(regex_cmd_format.format("uid=(?P<uid>[\\d]+)"), arg):
992+ firewall_metadata['uid'] = re.match(regex_cmd_format.format("uid=(?P<uid>[\\d]+)"), arg).group('uid')
993+ elif re.match(regex_cmd_format.format("(w|wait)$"), arg):
994+ firewall_metadata['wait'] = "-w"
995 else:
996- cmd = "help"
997+ cmd = AgentCommands.Help
998 break
999
1000- return cmd, force, verbose, debug, conf_file_path
1001+ return cmd, force, verbose, debug, conf_file_path, log_collector_full_mode, firewall_metadata
1002
1003
1004 def version():
1005@@ -245,29 +378,33 @@ def version():
1006 PY_VERSION_MICRO))
1007 print("Goal state agent: {0}".format(GOAL_STATE_AGENT_VERSION))
1008
1009+
1010 def usage():
1011 """
1012 Return agent usage message
1013 """
1014 s = "\n"
1015 s += ("usage: {0} [-verbose] [-force] [-help] "
1016- "-configuration-path:<path to configuration file>"
1017+ "-configuration-path:<path to configuration file>"
1018 "-deprovision[+user]|-register-service|-version|-daemon|-start|"
1019- "-run-exthandlers|-show-configuration]"
1020+ "-run-exthandlers|-show-configuration|-collect-logs [-full]|-setup-firewall [-dst_ip=<IP> -uid=<UID> [-w/--wait]]"
1021 "").format(sys.argv[0])
1022 s += "\n"
1023 return s
1024
1025+
1026 def start(conf_file_path=None):
1027 """
1028 Start agent daemon in a background process and set stdout/stderr to
1029 /dev/null
1030 """
1031- devnull = open(os.devnull, 'w')
1032 args = [sys.argv[0], '-daemon']
1033 if conf_file_path is not None:
1034 args.append('-configuration-path:{0}'.format(conf_file_path))
1035- subprocess.Popen(args, stdout=devnull, stderr=devnull)
1036+
1037+ with open(os.devnull, 'w') as devnull:
1038+ subprocess.Popen(args, stdout=devnull, stderr=devnull)
1039+
1040
1041 if __name__ == '__main__' :
1042 main()
1043diff --git a/azurelinuxagent/common/AgentGlobals.py b/azurelinuxagent/common/AgentGlobals.py
1044new file mode 100644
1045index 0000000..dbfda92
1046--- /dev/null
1047+++ b/azurelinuxagent/common/AgentGlobals.py
1048@@ -0,0 +1,39 @@
1049+# Microsoft Azure Linux Agent
1050+#
1051+# Copyright 2020 Microsoft Corporation
1052+#
1053+# Licensed under the Apache License, Version 2.0 (the "License");
1054+# you may not use this file except in compliance with the License.
1055+# You may obtain a copy of the License at
1056+#
1057+# http://www.apache.org/licenses/LICENSE-2.0
1058+#
1059+# Unless required by applicable law or agreed to in writing, software
1060+# distributed under the License is distributed on an "AS IS" BASIS,
1061+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1062+# See the License for the specific language governing permissions and
1063+# limitations under the License.
1064+#
1065+# Requires Python 2.6+ and Openssl 1.0+
1066+
1067+
1068+class AgentGlobals(object):
1069+ """
1070+ This class is used for setting AgentGlobals which can be used all throughout the Agent.
1071+ """
1072+
1073+ GUID_ZERO = "00000000-0000-0000-0000-000000000000"
1074+
1075+ #
1076+ # Some modules (e.g. telemetry) require an up-to-date container ID. We update this variable each time we
1077+ # fetch the goal state.
1078+ #
1079+ _container_id = GUID_ZERO
1080+
1081+ @staticmethod
1082+ def get_container_id():
1083+ return AgentGlobals._container_id
1084+
1085+ @staticmethod
1086+ def update_container_id(container_id):
1087+ AgentGlobals._container_id = container_id
1088diff --git a/azurelinuxagent/common/agent_supported_feature.py b/azurelinuxagent/common/agent_supported_feature.py
1089new file mode 100644
1090index 0000000..d7f93e2
1091--- /dev/null
1092+++ b/azurelinuxagent/common/agent_supported_feature.py
1093@@ -0,0 +1,122 @@
1094+# Copyright 2018 Microsoft Corporation
1095+#
1096+# Licensed under the Apache License, Version 2.0 (the "License");
1097+# you may not use this file except in compliance with the License.
1098+# You may obtain a copy of the License at
1099+#
1100+# http://www.apache.org/licenses/LICENSE-2.0
1101+#
1102+# Unless required by applicable law or agreed to in writing, software
1103+# distributed under the License is distributed on an "AS IS" BASIS,
1104+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1105+# See the License for the specific language governing permissions and
1106+# limitations under the License.
1107+#
1108+# Requires Python 2.6+ and Openssl 1.0+
1109+#
1110+
1111+
1112+class SupportedFeatureNames(object):
1113+ """
1114+ Enum for defining the Feature Names for all features that the agent supports
1115+ """
1116+ MultiConfig = "MultipleExtensionsPerHandler"
1117+ ExtensionTelemetryPipeline = "ExtensionTelemetryPipeline"
1118+ FastTrack = "FastTrack"
1119+
1120+
1121+class AgentSupportedFeature(object):
1122+ """
1123+ Interface for defining all features that the Linux Guest Agent supports and for reporting whether they are supported back to CRP
1124+ """
1125+
1126+ def __init__(self, name, version="1.0", supported=False):
1127+ self.__name = name
1128+ self.__version = version
1129+ self.__supported = supported
1130+
1131+ @property
1132+ def name(self):
1133+ return self.__name
1134+
1135+ @property
1136+ def version(self):
1137+ return self.__version
1138+
1139+ @property
1140+ def is_supported(self):
1141+ return self.__supported
1142+
1143+
1144+class _MultiConfigFeature(AgentSupportedFeature):
1145+
1146+ __NAME = SupportedFeatureNames.MultiConfig
1147+ __VERSION = "1.0"
1148+ __SUPPORTED = True
1149+
1150+ def __init__(self):
1151+ super(_MultiConfigFeature, self).__init__(name=_MultiConfigFeature.__NAME,
1152+ version=_MultiConfigFeature.__VERSION,
1153+ supported=_MultiConfigFeature.__SUPPORTED)
1154+
1155+
1156+class _ETPFeature(AgentSupportedFeature):
1157+
1158+ __NAME = SupportedFeatureNames.ExtensionTelemetryPipeline
1159+ __VERSION = "1.0"
1160+ __SUPPORTED = True
1161+
1162+ def __init__(self):
1163+ super(_ETPFeature, self).__init__(name=self.__NAME,
1164+ version=self.__VERSION,
1165+ supported=self.__SUPPORTED)
1166+
1167+
1168+# This is the list of features that Agent supports and we advertise to CRP
1169+__CRP_ADVERTISED_FEATURES = {
1170+ SupportedFeatureNames.MultiConfig: _MultiConfigFeature()
1171+}
1172+
1173+
1174+# This is the list of features that Agent supports and we advertise to Extensions
1175+__EXTENSION_ADVERTISED_FEATURES = {
1176+ SupportedFeatureNames.ExtensionTelemetryPipeline: _ETPFeature()
1177+}
1178+
1179+
1180+def get_supported_feature_by_name(feature_name):
1181+ if feature_name in __CRP_ADVERTISED_FEATURES:
1182+ return __CRP_ADVERTISED_FEATURES[feature_name]
1183+
1184+ if feature_name in __EXTENSION_ADVERTISED_FEATURES:
1185+ return __EXTENSION_ADVERTISED_FEATURES[feature_name]
1186+
1187+ raise NotImplementedError("Feature with Name: {0} not found".format(feature_name))
1188+
1189+
1190+def get_agent_supported_features_list_for_crp():
1191+ """
1192+ List of features that the GuestAgent currently supports (like FastTrack, MultiConfig, etc).
1193+ We need to send this list as part of Status reporting to inform CRP of all the features the agent supports.
1194+ :return: Dict containing all CRP supported features with the key as their names and the AgentFeature object as
1195+ the value if they are supported by the Agent
1196+ Eg: {
1197+ MultipleExtensionsPerHandler: _MultiConfigFeature()
1198+ }
1199+ """
1200+
1201+ return dict((name, feature) for name, feature in __CRP_ADVERTISED_FEATURES.items() if feature.is_supported)
1202+
1203+
1204+def get_agent_supported_features_list_for_extensions():
1205+ """
1206+ List of features that the GuestAgent currently supports (like Extension Telemetry Pipeline, etc) needed by Extensions.
1207+ We need to send this list as environment variables when calling extension commands to inform Extensions of all the
1208+ features the agent supports.
1209+ :return: Dict containing all Extension supported features with the key as their names and the AgentFeature object as
1210+ the value if the feature is supported by the Agent.
1211+ Eg: {
1212+ SupportedFeatureNames.ExtensionTelemetryPipeline: _ETPFeature()
1213+ }
1214+ """
1215+ return dict((name, feature) for name, feature in __EXTENSION_ADVERTISED_FEATURES.items() if feature.is_supported)
1216diff --git a/azurelinuxagent/common/cgroup.py b/azurelinuxagent/common/cgroup.py
1217index 2ad70c1..b2bf32f 100644
1218--- a/azurelinuxagent/common/cgroup.py
1219+++ b/azurelinuxagent/common/cgroup.py
1220@@ -13,47 +13,94 @@
1221 # limitations under the License.
1222 #
1223 # Requires Python 2.6+ and Openssl 1.0+
1224+
1225 import errno
1226 import os
1227 import re
1228+from datetime import timedelta
1229
1230-from azurelinuxagent.common import logger
1231+from azurelinuxagent.common import logger, conf
1232 from azurelinuxagent.common.exception import CGroupsException
1233 from azurelinuxagent.common.future import ustr
1234 from azurelinuxagent.common.osutil import get_osutil
1235 from azurelinuxagent.common.utils import fileutil
1236
1237-re_user_system_times = re.compile(r'user (\d+)\nsystem (\d+)\n')
1238+_REPORT_EVERY_HOUR = timedelta(hours=1)
1239+_DEFAULT_REPORT_PERIOD = timedelta(seconds=conf.get_cgroup_check_period())
1240
1241+AGENT_NAME_TELEMETRY = "walinuxagent.service" # Name used for telemetry; it needs to be consistent even if the name of the service changes
1242+AGENT_LOG_COLLECTOR = "azure-walinuxagent-logcollector"
1243
1244-class CGroupContollers(object):
1245- CPU = "cpu"
1246- MEMORY = "memory"
1247
1248+class CounterNotFound(Exception):
1249+ pass
1250
1251-class CGroup(object):
1252- @staticmethod
1253- def create(cgroup_path, controller, extension_name):
1254- """
1255- Factory method to create the correct CGroup.
1256- """
1257- if controller == CGroupContollers.CPU:
1258- return CpuCgroup(extension_name, cgroup_path)
1259- if controller == CGroupContollers.MEMORY:
1260- return MemoryCgroup(extension_name, cgroup_path)
1261- raise CGroupsException('CGroup controller {0} is not supported'.format(controller))
1262
1263- def __init__(self, name, cgroup_path, controller_type):
1264+class MetricValue(object):
1265+
1266+ """
1267+ Class for defining all the required metric fields to send telemetry.
1268+ """
1269+
1270+ def __init__(self, category, counter, instance, value, report_period=_DEFAULT_REPORT_PERIOD):
1271+ self._category = category
1272+ self._counter = counter
1273+ self._instance = instance
1274+ self._value = value
1275+ self._report_period = report_period
1276+
1277+ @property
1278+ def category(self):
1279+ return self._category
1280+
1281+ @property
1282+ def counter(self):
1283+ return self._counter
1284+
1285+ @property
1286+ def instance(self):
1287+ return self._instance
1288+
1289+ @property
1290+ def value(self):
1291+ return self._value
1292+
1293+ @property
1294+ def report_period(self):
1295+ return self._report_period
1296+
1297+
1298+class MetricsCategory(object):
1299+ MEMORY_CATEGORY = "Memory"
1300+ CPU_CATEGORY = "CPU"
1301+
1302+
1303+class MetricsCounter(object):
1304+ PROCESSOR_PERCENT_TIME = "% Processor Time"
1305+ TOTAL_MEM_USAGE = "Total Memory Usage"
1306+ MAX_MEM_USAGE = "Max Memory Usage"
1307+ THROTTLED_TIME = "Throttled Time"
1308+ SWAP_MEM_USAGE = "Swap Memory Usage"
1309+ AVAILABLE_MEM = "Available MBytes"
1310+ USED_MEM = "Used MBytes"
1311+
1312+
1313+re_user_system_times = re.compile(r'user (\d+)\nsystem (\d+)\n')
1314+
1315+
1316+class CGroup(object):
1317+ def __init__(self, name, cgroup_path):
1318 """
1319 Initialize _data collection for the Memory controller
1320 :param: name: Name of the CGroup
1321 :param: cgroup_path: Path of the controller
1322- :param: controller_type:
1323 :return:
1324 """
1325 self.name = name
1326 self.path = cgroup_path
1327- self.controller = controller_type
1328+
1329+ def __str__(self):
1330+ return "{0} [{1}]".format(self.name, self.path)
1331
1332 def _get_cgroup_file(self, file_name):
1333 return os.path.join(self.path, file_name)
1334@@ -89,7 +136,7 @@ class CGroup(object):
1335 logger.error("File {0} is empty but should not be".format(parameter_filename))
1336 raise CGroupsException("File {0} is empty but should not be".format(parameter_filename))
1337 except Exception as e:
1338- if isinstance(e, (IOError, OSError)) and e.errno == errno.ENOENT:
1339+ if isinstance(e, (IOError, OSError)) and e.errno == errno.ENOENT: # pylint: disable=E1101
1340 raise e
1341 parameter_filename = self._get_cgroup_file(parameter_name)
1342 raise CGroupsException("Exception while attempting to read {0}".format(parameter_filename), e)
1343@@ -114,42 +161,26 @@ class CGroup(object):
1344 ' Internal error: {1}'.format(self.path, ustr(e)))
1345 return False
1346
1347- def get_tracked_processes(self):
1348+ def get_tracked_metrics(self, **_):
1349 """
1350- :return: List of Str (Pids). Will return an empty string if we couldn't fetch any tracked processes.
1351+ Retrieves the current value of the metrics tracked for this cgroup and returns them as an array.
1352+
1353+ Note: The Agent won't track the metrics if the current cpu ticks are less than the previous value; in that case an empty array is returned.
1354 """
1355- procs = []
1356- try:
1357- procs = self._get_parameters("cgroup.procs")
1358- except (IOError, OSError) as e:
1359- if e.errno == errno.ENOENT:
1360- # only suppressing file not found exceptions.
1361- pass
1362- else:
1363- logger.periodic_warn(logger.EVERY_HALF_HOUR,
1364- 'Could not get list of procs from "cgroup.procs" file in the cgroup: {0}.'
1365- ' Internal error: {1}'.format(self.path, ustr(e)))
1366- except CGroupsException as e:
1367- logger.periodic_warn(logger.EVERY_HALF_HOUR,
1368- 'Could not get list of tasks from "cgroup.procs" file in the cgroup: {0}.'
1369- ' Internal error: {1}'.format(self.path, ustr(e)))
1370- return procs
1371+ raise NotImplementedError()
1372
1373
1374 class CpuCgroup(CGroup):
1375 def __init__(self, name, cgroup_path):
1376- super(CpuCgroup, self).__init__(name, cgroup_path, CGroupContollers.CPU)
1377+ super(CpuCgroup, self).__init__(name, cgroup_path)
1378
1379 self._osutil = get_osutil()
1380 self._previous_cgroup_cpu = None
1381 self._previous_system_cpu = None
1382 self._current_cgroup_cpu = None
1383 self._current_system_cpu = None
1384-
1385- def __str__(self):
1386- return "cgroup: Name: {0}, cgroup_path: {1}; Controller: {2}".format(
1387- self.name, self.path, self.controller
1388- )
1389+ self._previous_throttled_time = None
1390+ self._current_throttled_time = None
1391
1392 def _get_cpu_ticks(self, allow_no_such_file_or_directory_error=False):
1393 """
1394@@ -159,24 +190,54 @@ class CpuCgroup(CGroup):
1395 returns 0; this is useful when the function can be called before the cgroup has been created.
1396 """
1397 try:
1398- cpu_stat = self._get_file_contents('cpuacct.stat')
1399+ cpuacct_stat = self._get_file_contents('cpuacct.stat')
1400 except Exception as e:
1401- if not isinstance(e, (IOError, OSError)) or e.errno != errno.ENOENT:
1402+ if not isinstance(e, (IOError, OSError)) or e.errno != errno.ENOENT: # pylint: disable=E1101
1403 raise CGroupsException("Failed to read cpuacct.stat: {0}".format(ustr(e)))
1404 if not allow_no_such_file_or_directory_error:
1405 raise e
1406- cpu_stat = None
1407+ cpuacct_stat = None
1408
1409 cpu_ticks = 0
1410
1411- if cpu_stat is not None:
1412- match = re_user_system_times.match(cpu_stat)
1413+ if cpuacct_stat is not None:
1414+ #
1415+ # Sample file:
1416+ # # cat /sys/fs/cgroup/cpuacct/azure.slice/walinuxagent.service/cpuacct.stat
1417+ # user 10190
1418+ # system 3160
1419+ #
1420+ match = re_user_system_times.match(cpuacct_stat)
1421 if not match:
1422- raise CGroupsException("The contents of {0} are invalid: {1}".format(self._get_cgroup_file('cpuacct.stat'), cpu_stat))
1423+ raise CGroupsException(
1424+ "The contents of {0} are invalid: {1}".format(self._get_cgroup_file('cpuacct.stat'), cpuacct_stat))
1425 cpu_ticks = int(match.groups()[0]) + int(match.groups()[1])
1426
1427 return cpu_ticks
1428
1429+ def get_throttled_time(self):
1430+ try:
1431+ with open(os.path.join(self.path, 'cpu.stat')) as cpu_stat:
1432+ #
1433+ # Sample file:
1434+ #
1435+ # # cat /sys/fs/cgroup/cpuacct/azure.slice/walinuxagent.service/cpu.stat
1436+ # nr_periods 51660
1437+ # nr_throttled 19461
1438+ # throttled_time 1529590856339
1439+ #
1440+ for line in cpu_stat:
1441+ match = re.match(r'throttled_time\s+(\d+)', line)
1442+ if match is not None:
1443+ return int(match.groups()[0])
1444+ raise Exception("Cannot find throttled_time")
1445+ except (IOError, OSError) as e:
1446+ if e.errno == errno.ENOENT:
1447+ return 0
1448+ raise CGroupsException("Failed to read cpu.stat: {0}".format(ustr(e)))
1449+ except Exception as e:
1450+ raise CGroupsException("Failed to read cpu.stat: {0}".format(ustr(e)))
1451+
1452 def _cpu_usage_initialized(self):
1453 return self._current_cgroup_cpu is not None and self._current_system_cpu is not None
1454
1455@@ -188,13 +249,14 @@ class CpuCgroup(CGroup):
1456 raise CGroupsException("initialize_cpu_usage() should be invoked only once")
1457 self._current_cgroup_cpu = self._get_cpu_ticks(allow_no_such_file_or_directory_error=True)
1458 self._current_system_cpu = self._osutil.get_total_cpu_ticks_since_boot()
1459+ self._current_throttled_time = self.get_throttled_time()
1460
1461 def get_cpu_usage(self):
1462 """
1463 Computes the CPU used by the cgroup since the last call to this function.
1464
1465- The usage is measured as a percentage of utilization of all cores in the system. For example,
1466- using 1 core at 100% on a 4-core system would be reported as 25%.
1467+ The usage is measured as a percentage of utilization of 1 core in the system. For example,
1468+ using 1 core all of the time on a 4-core system would be reported as 100%.
1469
1470 NOTE: initialize_cpu_usage() must be invoked before calling get_cpu_usage()
1471 """
1472@@ -209,53 +271,122 @@ class CpuCgroup(CGroup):
1473 cgroup_delta = self._current_cgroup_cpu - self._previous_cgroup_cpu
1474 system_delta = max(1, self._current_system_cpu - self._previous_system_cpu)
1475
1476- return round(100.0 * float(cgroup_delta) / float(system_delta), 3)
1477+ return round(100.0 * self._osutil.get_processor_cores() * float(cgroup_delta) / float(system_delta), 3)
1478+
1479+ def get_cpu_throttled_time(self, read_previous_throttled_time=True):
1480+ """
1481+ Computes the throttled time (in seconds) since the last call to this function.
1482+ NOTE: initialize_cpu_usage() must be invoked before calling this function
1483+ Computes only the current throttled time if read_previous_throttled_time is set to False
1484+ """
1485+ if not read_previous_throttled_time:
1486+ return float(self.get_throttled_time() / 1E9)
1487+
1488+ if not self._cpu_usage_initialized():
1489+ raise CGroupsException(
1490+ "initialize_cpu_usage() must be invoked before the first call to get_throttled_time()")
1491+
1492+ self._previous_throttled_time = self._current_throttled_time
1493+ self._current_throttled_time = self.get_throttled_time()
1494+
1495+ return float(self._current_throttled_time - self._previous_throttled_time) / 1E9
1496+
1497+ def get_tracked_metrics(self, **kwargs):
1498+ tracked = []
1499+ cpu_usage = self.get_cpu_usage()
1500+ if cpu_usage >= float(0):
1501+ tracked.append(
1502+ MetricValue(MetricsCategory.CPU_CATEGORY, MetricsCounter.PROCESSOR_PERCENT_TIME, self.name, cpu_usage))
1503+
1504+ if 'track_throttled_time' in kwargs and kwargs['track_throttled_time']:
1505+ throttled_time = self.get_cpu_throttled_time()
1506+ if cpu_usage >= float(0) and throttled_time >= float(0):
1507+ tracked.append(
1508+ MetricValue(MetricsCategory.CPU_CATEGORY, MetricsCounter.THROTTLED_TIME, self.name, throttled_time))
1509+
1510+ return tracked
1511
1512
1513 class MemoryCgroup(CGroup):
1514 def __init__(self, name, cgroup_path):
1515+ super(MemoryCgroup, self).__init__(name, cgroup_path)
1516+
1517+ self._counter_not_found_error_count = 0
1518+
1519+ def _get_memory_stat_counter(self, counter_name):
1520+ try:
1521+ with open(os.path.join(self.path, 'memory.stat')) as memory_stat:
1522+ # cat /sys/fs/cgroup/memory/azure.slice/memory.stat
1523+ # cache 67178496
1524+ # rss 42340352
1525+ # rss_huge 6291456
1526+ # swap 0
1527+ for line in memory_stat:
1528+ re_memory_counter = r'{0}\s+(\d+)'.format(counter_name)
1529+ match = re.match(re_memory_counter, line)
1530+ if match is not None:
1531+ return int(match.groups()[0])
1532+ except (IOError, OSError) as e:
1533+ if e.errno == errno.ENOENT:
1534+ raise
1535+ raise CGroupsException("Failed to read memory.stat: {0}".format(ustr(e)))
1536+ except Exception as e:
1537+ raise CGroupsException("Failed to read memory.stat: {0}".format(ustr(e)))
1538+
1539+ raise CounterNotFound("Cannot find counter: {0}".format(counter_name))
1540+
1541+ def get_memory_usage(self):
1542 """
1543- Initialize _data collection for the Memory controller
1544+ Collect RSS+CACHE from memory.stat cgroup.
1545
1546- :return: MemoryCgroup
1547+ :return: Memory usage in bytes
1548+ :rtype: int
1549 """
1550- super(MemoryCgroup, self).__init__(name, cgroup_path, CGroupContollers.MEMORY)
1551
1552- def __str__(self):
1553- return "cgroup: Name: {0}, cgroup_path: {1}; Controller: {2}".format(
1554- self.name, self.path, self.controller
1555- )
1556+ cache = self._get_memory_stat_counter("cache")
1557+ rss = self._get_memory_stat_counter("rss")
1558+ return cache + rss
1559
1560- def get_memory_usage(self):
1561+ def try_swap_memory_usage(self):
1562 """
1563- Collect memory.usage_in_bytes from the cgroup.
1564+ Collect SWAP from memory.stat cgroup.
1565
1566 :return: Memory usage in bytes
1567 :rtype: int
1568+ Note: stat file is the only place to get the SWAP since other swap related file memory.memsw.usage_in_bytes is for total Memory+SWAP.
1569 """
1570- usage = None
1571 try:
1572- usage = self._get_parameters('memory.usage_in_bytes', first_line_only=True)
1573- except Exception as e:
1574- if isinstance(e, (IOError, OSError)) and e.errno == errno.ENOENT:
1575- raise
1576- raise CGroupsException("Exception while attempting to read {0}".format("memory.usage_in_bytes"), e)
1577-
1578- return int(usage)
1579+ return self._get_memory_stat_counter("swap")
1580+ except CounterNotFound as e:
1581+ if self._counter_not_found_error_count < 1:
1582+ logger.periodic_info(logger.EVERY_HALF_HOUR,
1583+ '{0} from "memory.stat" file in the cgroup: {1}---[Note: This log for informational purpose only and can be ignored]'.format(ustr(e), self.path))
1584+ self._counter_not_found_error_count += 1
1585+ return 0
1586
1587 def get_max_memory_usage(self):
1588 """
1589- Collect memory.usage_in_bytes from the cgroup.
1590+ Collect memory.max_usage_in_bytes from the cgroup.
1591
1592 :return: Memory usage in bytes
1593 :rtype: int
1594 """
1595- usage = None
1596+ usage = 0
1597 try:
1598- usage = self._get_parameters('memory.max_usage_in_bytes', first_line_only=True)
1599+ usage = int(self._get_parameters('memory.max_usage_in_bytes', first_line_only=True))
1600 except Exception as e:
1601- if isinstance(e, (IOError, OSError)) and e.errno == errno.ENOENT:
1602+ if isinstance(e, (IOError, OSError)) and e.errno == errno.ENOENT: # pylint: disable=E1101
1603 raise
1604- raise CGroupsException("Exception while attempting to read {0}".format("memory.usage_in_bytes"), e)
1605-
1606- return int(usage)
1607+ raise CGroupsException("Exception while attempting to read {0}".format("memory.max_usage_in_bytes"), e)
1608+
1609+ return usage
1610+
1611+ def get_tracked_metrics(self, **_):
1612+ return [
1613+ MetricValue(MetricsCategory.MEMORY_CATEGORY, MetricsCounter.TOTAL_MEM_USAGE, self.name,
1614+ self.get_memory_usage()),
1615+ MetricValue(MetricsCategory.MEMORY_CATEGORY, MetricsCounter.MAX_MEM_USAGE, self.name,
1616+ self.get_max_memory_usage(), _REPORT_EVERY_HOUR),
1617+ MetricValue(MetricsCategory.MEMORY_CATEGORY, MetricsCounter.SWAP_MEM_USAGE, self.name,
1618+ self.try_swap_memory_usage(), _REPORT_EVERY_HOUR)
1619+ ]
1620diff --git a/azurelinuxagent/common/cgroupapi.py b/azurelinuxagent/common/cgroupapi.py
1621index c671a2e..ca0ef3b 100644
1622--- a/azurelinuxagent/common/cgroupapi.py
1623+++ b/azurelinuxagent/common/cgroupapi.py
1624@@ -1,3 +1,4 @@
1625+# -*- coding: utf-8 -*-
1626 # Copyright 2018 Microsoft Corporation
1627 #
1628 # Licensed under the Apache License, Version 2.0 (the "License");
1629@@ -14,102 +15,65 @@
1630 #
1631 # Requires Python 2.6+ and Openssl 1.0+
1632
1633-import errno
1634 import os
1635+import re
1636 import shutil
1637 import subprocess
1638+import threading
1639 import uuid
1640
1641 from azurelinuxagent.common import logger
1642-from azurelinuxagent.common.cgroup import CGroup
1643+from azurelinuxagent.common.cgroup import CpuCgroup, MemoryCgroup
1644 from azurelinuxagent.common.cgroupstelemetry import CGroupsTelemetry
1645 from azurelinuxagent.common.conf import get_agent_pid_file_path
1646-from azurelinuxagent.common.event import add_event, WALAEventOperation
1647 from azurelinuxagent.common.exception import CGroupsException, ExtensionErrorCodes, ExtensionError, \
1648 ExtensionOperationError
1649 from azurelinuxagent.common.future import ustr
1650+from azurelinuxagent.common.osutil import systemd
1651 from azurelinuxagent.common.utils import fileutil, shellutil
1652-from azurelinuxagent.common.utils.extensionprocessutil import handle_process_completion, read_output
1653-from azurelinuxagent.common.version import AGENT_NAME, CURRENT_VERSION
1654+from azurelinuxagent.common.utils.extensionprocessutil import handle_process_completion, read_output, \
1655+ TELEMETRY_MESSAGE_MAX_LEN
1656+from azurelinuxagent.common.utils.flexible_version import FlexibleVersion
1657+from azurelinuxagent.common.version import get_distro
1658
1659 CGROUPS_FILE_SYSTEM_ROOT = '/sys/fs/cgroup'
1660 CGROUP_CONTROLLERS = ["cpu", "memory"]
1661-VM_AGENT_CGROUP_NAME = "walinuxagent.service"
1662-EXTENSIONS_ROOT_CGROUP_NAME = "walinuxagent.extensions"
1663-UNIT_FILES_FILE_SYSTEM_PATH = "/etc/systemd/system"
1664+EXTENSION_SLICE_PREFIX = "azure-vmextensions"
1665
1666
1667-class CGroupsApi(object):
1668+class SystemdRunError(CGroupsException):
1669 """
1670- Interface for the cgroups API
1671+ Raised when systemd-run fails
1672 """
1673- def create_agent_cgroups(self):
1674- raise NotImplementedError()
1675-
1676- def create_extension_cgroups_root(self):
1677- raise NotImplementedError()
1678-
1679- def create_extension_cgroups(self, extension_name):
1680- raise NotImplementedError()
1681-
1682- def remove_extension_cgroups(self, extension_name):
1683- raise NotImplementedError()
1684
1685- def get_extension_cgroups(self, extension_name):
1686- raise NotImplementedError()
1687+ def __init__(self, msg=None):
1688+ super(SystemdRunError, self).__init__(msg)
1689
1690- def start_extension_command(self, extension_name, command, timeout, shell, cwd, env, stdout, stderr, error_code):
1691- raise NotImplementedError()
1692
1693- def cleanup_legacy_cgroups(self):
1694- raise NotImplementedError()
1695+class CGroupsApi(object):
1696+ @staticmethod
1697+ def cgroups_supported():
1698+ distro_info = get_distro()
1699+ distro_name = distro_info[0]
1700+ try:
1701+ distro_version = FlexibleVersion(distro_info[1])
1702+ except ValueError:
1703+ return False
1704+ return distro_name.lower() == 'ubuntu' and distro_version.major >= 16
1705
1706 @staticmethod
1707 def track_cgroups(extension_cgroups):
1708 try:
1709 for cgroup in extension_cgroups:
1710 CGroupsTelemetry.track_cgroup(cgroup)
1711- except Exception as e:
1712+ except Exception as exception:
1713 logger.warn("Cannot add cgroup '{0}' to tracking list; resource usage will not be tracked. "
1714- "Error: {1}".format(cgroup.path, ustr(e)))
1715+ "Error: {1}".format(cgroup.path, ustr(exception)))
1716
1717 @staticmethod
1718- def _get_extension_cgroup_name(extension_name):
1719- # Since '-' is used as a separator in systemd unit names, we replace it with '_' to prevent side-effects.
1720- return extension_name.replace('-', '_')
1721-
1722- @staticmethod
1723- def create():
1724- """
1725- Factory method to create the correct API for the current platform
1726- """
1727- return SystemdCgroupsApi() if CGroupsApi._is_systemd() else FileSystemCgroupsApi()
1728-
1729- @staticmethod
1730- def _is_systemd():
1731- """
1732- Determine if systemd is managing system services; the implementation follows the same strategy as, for example,
1733- sd_booted() in libsystemd, or /usr/sbin/service
1734- """
1735- return os.path.exists('/run/systemd/system/')
1736-
1737- @staticmethod
1738- def _foreach_controller(operation, message):
1739- """
1740- Executes the given operation on all controllers that need to be tracked; outputs 'message' if the controller
1741- is not mounted or if an error occurs in the operation
1742- :return: Returns a list of error messages or an empty list if no errors occurred
1743- """
1744- mounted_controllers = os.listdir(CGROUPS_FILE_SYSTEM_ROOT)
1745-
1746- for controller in CGROUP_CONTROLLERS:
1747- try:
1748- if controller not in mounted_controllers:
1749- logger.warn('Cgroup controller "{0}" is not mounted. {1}', controller, message)
1750- else:
1751- operation(controller)
1752- except Exception as e:
1753- logger.warn('Error in cgroup controller "{0}": {1}. {2}', controller, ustr(e), message)
1754+ def get_processes_in_cgroup(cgroup_path):
1755+ with open(os.path.join(cgroup_path, "cgroup.procs"), "r") as cgroup_procs:
1756+ return [int(pid) for pid in cgroup_procs.read().split()]
1757
1758 @staticmethod
1759 def _foreach_legacy_cgroup(operation):
1760@@ -138,429 +102,250 @@ class CGroupsApi(object):
1761
1762 if os.path.exists(procs_file):
1763 procs_file_contents = fileutil.read_file(procs_file).strip()
1764- daemon_pid = fileutil.read_file(get_agent_pid_file_path()).strip()
1765+ daemon_pid = CGroupsApi.get_daemon_pid()
1766
1767- if daemon_pid in procs_file_contents:
1768+ if ustr(daemon_pid) in procs_file_contents:
1769 operation(controller, daemon_pid)
1770 finally:
1771 for _, cgroup in legacy_cgroups:
1772 logger.info('Removing {0}', cgroup)
1773 shutil.rmtree(cgroup, ignore_errors=True)
1774+ return len(legacy_cgroups)
1775
1776-
1777-class FileSystemCgroupsApi(CGroupsApi):
1778- """
1779- Cgroups interface using the cgroups file system directly
1780- """
1781 @staticmethod
1782- def _try_mkdir(path):
1783- """
1784- Try to create a directory, recursively. If it already exists as such, do nothing. Raise the appropriate
1785- exception should an error occur.
1786-
1787- :param path: str
1788- """
1789- if not os.path.isdir(path):
1790- try:
1791- os.makedirs(path, 0o755)
1792- except OSError as e:
1793- if e.errno == errno.EEXIST:
1794- if not os.path.isdir(path):
1795- raise CGroupsException("Create directory for cgroup {0}: normal file already exists with that name".format(path))
1796- else:
1797- pass # There was a race to create the directory, but it's there now, and that's fine
1798- elif e.errno == errno.EACCES:
1799- # This is unexpected, as the agent runs as root
1800- raise CGroupsException("Create directory for cgroup {0}: permission denied".format(path))
1801- else:
1802- raise
1803-
1804- @staticmethod
1805- def _get_agent_cgroup_path(controller):
1806- return os.path.join(CGROUPS_FILE_SYSTEM_ROOT, controller, VM_AGENT_CGROUP_NAME)
1807-
1808- @staticmethod
1809- def _get_extension_cgroups_root_path(controller):
1810- return os.path.join(CGROUPS_FILE_SYSTEM_ROOT, controller, EXTENSIONS_ROOT_CGROUP_NAME)
1811-
1812- def _get_extension_cgroup_path(self, controller, extension_name):
1813- extensions_root = self._get_extension_cgroups_root_path(controller)
1814-
1815- if not os.path.exists(extensions_root):
1816- logger.warn("Root directory {0} does not exist.".format(extensions_root))
1817-
1818- cgroup_name = self._get_extension_cgroup_name(extension_name)
1819+ def get_daemon_pid():
1820+ return int(fileutil.read_file(get_agent_pid_file_path()).strip())
1821
1822- return os.path.join(extensions_root, cgroup_name)
1823
1824- def _create_extension_cgroup(self, controller, extension_name):
1825- return CGroup.create(self._get_extension_cgroup_path(controller, extension_name), controller, extension_name)
1826+class SystemdCgroupsApi(CGroupsApi):
1827+ """
1828+ Cgroups interface via systemd
1829+ """
1830
1831- @staticmethod
1832- def _add_process_to_cgroup(pid, cgroup_path):
1833- tasks_file = os.path.join(cgroup_path, 'cgroup.procs')
1834- fileutil.append_file(tasks_file, "{0}\n".format(pid))
1835- logger.info("Added PID {0} to cgroup {1}".format(pid, cgroup_path))
1836+ def __init__(self):
1837+ self._cgroup_mountpoints = None
1838+ self._agent_unit_name = None
1839+ self._systemd_run_commands = []
1840+ self._systemd_run_commands_lock = threading.RLock()
1841
1842- def cleanup_legacy_cgroups(self):
1843+ def get_systemd_run_commands(self):
1844 """
1845- Previous versions of the daemon (2.2.31-2.2.40) wrote their PID to /sys/fs/cgroup/{cpu,memory}/WALinuxAgent/WALinuxAgent;
1846- starting from version 2.2.41 we track the agent service in walinuxagent.service instead of WALinuxAgent/WALinuxAgent. This
1847- method moves the daemon's PID from the legacy cgroups to the newer cgroups.
1848+ Returns a list of the systemd-run commands currently running (given as PIDs)
1849 """
1850- def move_daemon_pid(controller, daemon_pid):
1851- new_path = FileSystemCgroupsApi._get_agent_cgroup_path(controller)
1852- logger.info("Writing daemon's PID ({0}) to {1}", daemon_pid, new_path)
1853- fileutil.append_file(os.path.join(new_path, "cgroup.procs"), daemon_pid)
1854- msg = "Moved daemon's PID from legacy cgroup to {0}".format(new_path)
1855- add_event(AGENT_NAME, version=CURRENT_VERSION, op=WALAEventOperation.CGroupsCleanUp, is_success=True, message=msg)
1856+ with self._systemd_run_commands_lock:
1857+ return self._systemd_run_commands[:]
1858
1859- CGroupsApi._foreach_legacy_cgroup(move_daemon_pid)
1860-
1861- def create_agent_cgroups(self):
1862+ def get_cgroup_mount_points(self):
1863 """
1864- Creates a cgroup for the VM Agent in each of the controllers we are tracking; returns the created cgroups.
1865+ Returns a tuple with the mount points for the cpu and memory controllers; the values can be None
1866+ if the corresponding controller is not mounted
1867 """
1868- cgroups = []
1869-
1870- pid = int(os.getpid())
1871-
1872- def create_cgroup(controller):
1873- path = FileSystemCgroupsApi._get_agent_cgroup_path(controller)
1874-
1875- if not os.path.isdir(path):
1876- FileSystemCgroupsApi._try_mkdir(path)
1877- logger.info("Created cgroup {0}".format(path))
1878-
1879- self._add_process_to_cgroup(pid, path)
1880-
1881- cgroups.append(CGroup.create(path, controller, VM_AGENT_CGROUP_NAME))
1882-
1883- self._foreach_controller(create_cgroup, 'Failed to create a cgroup for the VM Agent; resource usage will not be tracked')
1884-
1885- if len(cgroups) == 0:
1886- raise CGroupsException("Failed to create any cgroup for the VM Agent")
1887+ # the output of mount is similar to
1888+ # $ mount -t cgroup
1889+ # cgroup on /sys/fs/cgroup/systemd type cgroup (rw,nosuid,nodev,noexec,relatime,xattr,name=systemd)
1890+ # cgroup on /sys/fs/cgroup/cpu,cpuacct type cgroup (rw,nosuid,nodev,noexec,relatime,cpu,cpuacct)
1891+ # cgroup on /sys/fs/cgroup/memory type cgroup (rw,nosuid,nodev,noexec,relatime,memory)
1892+ # etc
1893+ #
1894+ if self._cgroup_mountpoints is None:
1895+ cpu = None
1896+ memory = None
1897+ for line in shellutil.run_command(['mount', '-t', 'cgroup']).splitlines():
1898+ match = re.search(r'on\s+(?P<path>/\S+(memory|cpuacct))\s', line)
1899+ if match is not None:
1900+ path = match.group('path')
1901+ if 'cpuacct' in path:
1902+ cpu = path
1903+ else:
1904+ memory = path
1905+ self._cgroup_mountpoints = {'cpu': cpu, 'memory': memory}
1906
1907- return cgroups
1908+ return self._cgroup_mountpoints['cpu'], self._cgroup_mountpoints['memory']
1909
1910- def create_extension_cgroups_root(self):
1911+ @staticmethod
1912+ def get_process_cgroup_relative_paths(process_id):
1913 """
1914- Creates the directory within the cgroups file system that will contain the cgroups for the extensions.
1915+ Returns a tuple with the path of the cpu and memory cgroups for the given process (relative to the mount point of the corresponding
1916+ controller).
1917+ The 'process_id' can be a numeric PID or the string "self" for the current process.
1918+ The values returned can be None if the process is not in a cgroup for that controller (e.g. the controller is not mounted).
1919 """
1920- def create_cgroup(controller):
1921- path = self._get_extension_cgroups_root_path(controller)
1922-
1923- if not os.path.isdir(path):
1924- FileSystemCgroupsApi._try_mkdir(path)
1925- logger.info("Created {0}".format(path))
1926+ # The contents of the file are similar to
1927+ # # cat /proc/1218/cgroup
1928+ # 10:memory:/system.slice/walinuxagent.service
1929+ # 3:cpu,cpuacct:/system.slice/walinuxagent.service
1930+ # etc
1931+ cpu_path = None
1932+ memory_path = None
1933+ for line in fileutil.read_file("/proc/{0}/cgroup".format(process_id)).splitlines():
1934+ match = re.match(r'\d+:(?P<controller>(memory|.*cpuacct.*)):(?P<path>.+)', line)
1935+ if match is not None:
1936+ controller = match.group('controller')
1937+ path = match.group('path').lstrip('/') if match.group('path') != '/' else None
1938+ if controller == 'memory':
1939+ memory_path = path
1940+ else:
1941+ cpu_path = path
1942
1943- self._foreach_controller(create_cgroup, 'Failed to create a root cgroup for extensions')
1944+ return cpu_path, memory_path
1945
1946- def create_extension_cgroups(self, extension_name):
1947+ def get_process_cgroup_paths(self, process_id):
1948 """
1949- Creates a cgroup for the given extension in each of the controllers we are tracking; returns the created cgroups.
1950+ Returns a tuple with the path of the cpu and memory cgroups for the given process. The 'process_id' can be a numeric PID or the string "self" for the current process.
1951+ The values returned can be None if the process is not in a cgroup for that controller (e.g. the controller is not mounted).
1952 """
1953- cgroups = []
1954-
1955- def create_cgroup(controller):
1956- cgroup = self._create_extension_cgroup(controller, extension_name)
1957+ cpu_cgroup_relative_path, memory_cgroup_relative_path = self.get_process_cgroup_relative_paths(process_id)
1958
1959- if not os.path.isdir(cgroup.path):
1960- FileSystemCgroupsApi._try_mkdir(cgroup.path)
1961- logger.info("Created cgroup {0}".format(cgroup.path))
1962+ cpu_mount_point, memory_mount_point = self.get_cgroup_mount_points()
1963
1964- cgroups.append(cgroup)
1965+ cpu_cgroup_path = os.path.join(cpu_mount_point, cpu_cgroup_relative_path) \
1966+ if cpu_mount_point is not None and cpu_cgroup_relative_path is not None else None
1967
1968- self._foreach_controller(create_cgroup, 'Failed to create a cgroup for extension {0}'.format(extension_name))
1969+ memory_cgroup_path = os.path.join(memory_mount_point, memory_cgroup_relative_path) \
1970+ if memory_mount_point is not None and memory_cgroup_relative_path is not None else None
1971
1972- return cgroups
1973+ return cpu_cgroup_path, memory_cgroup_path
1974
1975- def remove_extension_cgroups(self, extension_name):
1976+ def get_unit_cgroup_paths(self, unit_name):
1977 """
1978- Deletes the cgroups for the given extension.
1979+ Returns a tuple with the path of the cpu and memory cgroups for the given unit.
1980+ The values returned can be None if the controller is not mounted.
1981+ Ex: ControlGroup=/azure.slice/walinuxagent.service
1982+ controlgroup_path[1:] = azure.slice/walinuxagent.service
1983 """
1984- def remove_cgroup(controller):
1985- path = self._get_extension_cgroup_path(controller, extension_name)
1986-
1987- if os.path.exists(path):
1988- try:
1989- os.rmdir(path)
1990- logger.info('Deleted cgroup "{0}".'.format(path))
1991- except OSError as exception:
1992- if exception.errno == 16: # [Errno 16] Device or resource busy
1993- logger.warn('CGroup "{0}" still has active tasks; will not remove it.'.format(path))
1994-
1995- self._foreach_controller(remove_cgroup, 'Failed to delete cgroups for extension {0}'.format(extension_name))
1996-
1997- def get_extension_cgroups(self, extension_name):
1998- """
1999- Returns the cgroups for the given extension.
2000- """
2001-
2002- cgroups = []
2003+ controlgroup_path = systemd.get_unit_property(unit_name, "ControlGroup")
2004+ cpu_mount_point, memory_mount_point = self.get_cgroup_mount_points()
2005
2006- def get_cgroup(controller):
2007- cgroup = self._create_extension_cgroup(controller, extension_name)
2008- cgroups.append(cgroup)
2009+ cpu_cgroup_path = os.path.join(cpu_mount_point, controlgroup_path[1:]) \
2010+ if cpu_mount_point is not None else None
2011
2012- self._foreach_controller(get_cgroup, 'Failed to retrieve cgroups for extension {0}'.format(extension_name))
2013+ memory_cgroup_path = os.path.join(memory_mount_point, controlgroup_path[1:]) \
2014+ if memory_mount_point is not None else None
2015
2016- return cgroups
2017+ return cpu_cgroup_path, memory_cgroup_path
2018
2019- def start_extension_command(self, extension_name, command, timeout, shell, cwd, env, stdout, stderr,
2020- error_code=ExtensionErrorCodes.PluginUnknownFailure):
2021+ @staticmethod
2022+ def get_cgroup2_controllers():
2023 """
2024- Starts a command (install/enable/etc) for an extension and adds the command's PID to the extension's cgroup
2025- :param extension_name: The extension executing the command
2026- :param command: The command to invoke
2027- :param timeout: Number of seconds to wait for command completion
2028- :param cwd: The working directory for the command
2029- :param env: The environment to pass to the command's process
2030- :param stdout: File object to redirect stdout to
2031- :param stderr: File object to redirect stderr to
2032- :param error_code: Extension error code to raise in case of error
2033+ Returns a tuple with the mount point for the cgroups v2 controllers, and the currently mounted controllers;
2034+ either value can be None if cgroups v2 or its controllers are not mounted
2035 """
2036- try:
2037- extension_cgroups = self.create_extension_cgroups(extension_name)
2038- except Exception as exception:
2039- extension_cgroups = []
2040- logger.warn("Failed to create cgroups for extension '{0}'; resource usage will not be tracked. "
2041- "Error: {1}".format(extension_name, ustr(exception)))
2042-
2043- def pre_exec_function():
2044- os.setsid()
2045-
2046- try:
2047- pid = os.getpid()
2048-
2049- for cgroup in extension_cgroups:
2050- try:
2051- self._add_process_to_cgroup(pid, cgroup.path)
2052- except Exception as exception:
2053- logger.warn("Failed to add PID {0} to the cgroups for extension '{1}'. "
2054- "Resource usage will not be tracked. Error: {2}".format(pid,
2055- extension_name,
2056- ustr(exception)))
2057- except Exception as e:
2058- logger.warn("Failed to add extension {0} to its cgroup. Resource usage will not be tracked. "
2059- "Error: {1}".format(extension_name, ustr(e)))
2060-
2061- process = subprocess.Popen(command,
2062- shell=shell,
2063- cwd=cwd,
2064- env=env,
2065- stdout=stdout,
2066- stderr=stderr,
2067- preexec_fn=pre_exec_function)
2068-
2069- self.track_cgroups(extension_cgroups)
2070- process_output = handle_process_completion(process=process,
2071- command=command,
2072- timeout=timeout,
2073- stdout=stdout,
2074- stderr=stderr,
2075- error_code=error_code)
2076-
2077- return extension_cgroups, process_output
2078-
2079-
2080-class SystemdCgroupsApi(CGroupsApi):
2081- """
2082- Cgroups interface via systemd
2083- """
2084-
2085- @staticmethod
2086- def create_and_start_unit(unit_filename, unit_contents):
2087- try:
2088- unit_path = os.path.join(UNIT_FILES_FILE_SYSTEM_PATH, unit_filename)
2089- fileutil.write_file(unit_path, unit_contents)
2090- shellutil.run_command(["systemctl", "daemon-reload"])
2091- shellutil.run_command(["systemctl", "start", unit_filename])
2092- except Exception as e:
2093- raise CGroupsException("Failed to create and start {0}. Error: {1}".format(unit_filename, ustr(e)))
2094+ # the output of mount is similar to
2095+ # $ mount -t cgroup2
2096+ # cgroup2 on /sys/fs/cgroup/unified type cgroup2 (rw,nosuid,nodev,noexec,relatime,nsdelegate)
2097+ #
2098+ for line in shellutil.run_command(['mount', '-t', 'cgroup2']).splitlines():
2099+ match = re.search(r'on\s+(?P<path>/\S+)\s', line)
2100+ if match is not None:
2101+ mount_point = match.group('path')
2102+ controllers = None
2103+ controllers_file = os.path.join(mount_point, 'cgroup.controllers')
2104+ if os.path.exists(controllers_file):
2105+ controllers = fileutil.read_file(controllers_file)
2106+ return mount_point, controllers
2107+ return None, None
2108
2109 @staticmethod
2110- def _get_extensions_slice_root_name():
2111- return "system-{0}.slice".format(EXTENSIONS_ROOT_CGROUP_NAME)
2112-
2113- def _get_extension_slice_name(self, extension_name):
2114- return "system-{0}-{1}.slice".format(EXTENSIONS_ROOT_CGROUP_NAME, self._get_extension_cgroup_name(extension_name))
2115-
2116- def create_agent_cgroups(self):
2117- try:
2118- cgroup_unit = None
2119- cgroup_paths = fileutil.read_file("/proc/self/cgroup")
2120- for entry in cgroup_paths.splitlines():
2121- fields = entry.split(':')
2122- if fields[1] == "name=systemd":
2123- cgroup_unit = fields[2].lstrip(os.path.sep)
2124-
2125- cpu_cgroup_path = os.path.join(CGROUPS_FILE_SYSTEM_ROOT, 'cpu', cgroup_unit)
2126- memory_cgroup_path = os.path.join(CGROUPS_FILE_SYSTEM_ROOT, 'memory', cgroup_unit)
2127-
2128- return [CGroup.create(cpu_cgroup_path, 'cpu', VM_AGENT_CGROUP_NAME),
2129- CGroup.create(memory_cgroup_path, 'memory', VM_AGENT_CGROUP_NAME)]
2130- except Exception as e:
2131- raise CGroupsException("Failed to get paths of agent's cgroups. Error: {0}".format(ustr(e)))
2132-
2133- def create_extension_cgroups_root(self):
2134- unit_contents = """
2135-[Unit]
2136-Description=Slice for walinuxagent extensions
2137-DefaultDependencies=no
2138-Before=slices.target
2139-Requires=system.slice
2140-After=system.slice"""
2141- unit_filename = self._get_extensions_slice_root_name()
2142- self.create_and_start_unit(unit_filename, unit_contents)
2143- logger.info("Created slice for walinuxagent extensions {0}".format(unit_filename))
2144-
2145- def create_extension_cgroups(self, extension_name):
2146- # TODO: The slice created by this function is not used currently. We need to create the extension scopes within
2147- # this slice and use the slice to monitor the cgroups. Also see comment in get_extension_cgroups.
2148- # the slice.
2149- unit_contents = """
2150-[Unit]
2151-Description=Slice for extension {0}
2152-DefaultDependencies=no
2153-Before=slices.target
2154-Requires=system-{1}.slice
2155-After=system-{1}.slice""".format(extension_name, EXTENSIONS_ROOT_CGROUP_NAME)
2156- unit_filename = self._get_extension_slice_name(extension_name)
2157- self.create_and_start_unit(unit_filename, unit_contents)
2158- logger.info("Created slice for {0}".format(unit_filename))
2159-
2160- return self.get_extension_cgroups(extension_name)
2161-
2162- def remove_extension_cgroups(self, extension_name):
2163- # For transient units, cgroups are released automatically when the unit stops, so it is sufficient
2164- # to call stop on them. Persistent cgroups are released when the unit is disabled and its configuration
2165- # file is deleted.
2166- # The assumption is that this method is called after the extension has been uninstalled. For now, since
2167- # we're running extensions within transient scopes which clean up after they finish running, no removal
2168- # of units is needed. In the future, when the extension is running under its own slice,
2169- # the following clean up is needed.
2170- unit_filename = self._get_extension_slice_name(extension_name)
2171- try:
2172- unit_path = os.path.join(UNIT_FILES_FILE_SYSTEM_PATH, unit_filename)
2173- shellutil.run_command(["systemctl", "stop", unit_filename])
2174- fileutil.rm_files(unit_path)
2175- shellutil.run_command(["systemctl", "daemon-reload"])
2176- except Exception as e:
2177- raise CGroupsException("Failed to remove {0}. Error: {1}".format(unit_filename, ustr(e)))
2178-
2179- def get_extension_cgroups(self, extension_name):
2180- # TODO: The slice returned by this function is not used currently. We need to create the extension scopes within
2181- # this slice and use the slice to monitor the cgroups. Also see comment in create_extension_cgroups.
2182- slice_name = self._get_extension_cgroup_name(extension_name)
2183-
2184- cgroups = []
2185-
2186- def create_cgroup(controller):
2187- cpu_cgroup_path = os.path.join(CGROUPS_FILE_SYSTEM_ROOT, controller, 'system.slice', slice_name)
2188- cgroups.append(CGroup.create(cpu_cgroup_path, controller, extension_name))
2189-
2190- self._foreach_controller(create_cgroup, 'Cannot retrieve cgroup for extension {0}; resource usage will not be tracked.'.format(extension_name))
2191-
2192- return cgroups
2193+ def _is_systemd_failure(scope_name, stderr):
2194+ stderr.seek(0)
2195+ stderr = ustr(stderr.read(TELEMETRY_MESSAGE_MAX_LEN), encoding='utf-8', errors='backslashreplace')
2196+ unit_not_found = "Unit {0} not found.".format(scope_name)
2197+ return unit_not_found in stderr or scope_name not in stderr
2198
2199 @staticmethod
2200- def _is_systemd_failure(scope_name, process_output):
2201- unit_not_found = "Unit {0} not found.".format(scope_name)
2202- return unit_not_found in process_output or scope_name not in process_output
2203+ def get_extension_slice_name(extension_name, old_slice=False):
2204+ # The old slice makes it difficult for users to override the limits, because they need to place drop-in files on every upgrade if the extension slice is different for each version.
2205+ # The old slice name includes the version: <HandlerName>.<ExtensionName>-<HandlerVersion>
2206+ # The new slice name omits the version: <HandlerName>.<ExtensionName>
2207+ if not old_slice:
2208+ extension_name = extension_name.rsplit("-", 1)[0]
2209+ # Since '-' is used as a separator in systemd unit names, we replace it with '_' to prevent side-effects.
2210+ return EXTENSION_SLICE_PREFIX + "-" + extension_name.replace('-', '_') + ".slice"
2211
2212- def start_extension_command(self, extension_name, command, timeout, shell, cwd, env, stdout, stderr,
2213+ def start_extension_command(self, extension_name, command, cmd_name, timeout, shell, cwd, env, stdout, stderr,
2214 error_code=ExtensionErrorCodes.PluginUnknownFailure):
2215- scope_name = "{0}_{1}".format(self._get_extension_cgroup_name(extension_name), uuid.uuid4())
2216-
2217- process = subprocess.Popen(
2218- "systemd-run --unit={0} --scope {1}".format(scope_name, command),
2219- shell=shell,
2220- cwd=cwd,
2221- stdout=stdout,
2222- stderr=stderr,
2223- env=env,
2224- preexec_fn=os.setsid)
2225+ scope = "{0}_{1}".format(cmd_name, uuid.uuid4())
2226+ extension_slice_name = self.get_extension_slice_name(extension_name)
2227+ with self._systemd_run_commands_lock:
2228+ process = subprocess.Popen( # pylint: disable=W1509
2229+ # Some distros (e.g. Ubuntu 20) have CPU and memory accounting enabled by default, which creates nested cgroups under the extension slice.
2230+ # Disabling CPU and memory accounting for the scope prevents those nested cgroups from being created, so that all the counters are present in the extension's cgroup,
2231+ # since the slice unit file is configured with accounting enabled.
2232+ "systemd-run --property=CPUAccounting=no --property=MemoryAccounting=no --unit={0} --scope --slice={1} {2}".format(scope, extension_slice_name, command),
2233+ shell=shell,
2234+ cwd=cwd,
2235+ stdout=stdout,
2236+ stderr=stderr,
2237+ env=env,
2238+ preexec_fn=os.setsid)
2239+
2240+ # We start systemd-run with shell == True so process.pid is the shell's pid, not the pid for systemd-run
2241+ self._systemd_run_commands.append(process.pid)
2242+
2243+ scope_name = scope + '.scope'
2244+
2245+ logger.info("Started extension in unit '{0}'", scope_name)
2246+
2247+ cpu_cgroup = None
2248+ try:
2249+ cgroup_relative_path = os.path.join('azure.slice/azure-vmextensions.slice', extension_slice_name)
2250
2251- logger.info("Started extension using scope '{0}'", scope_name)
2252- extension_cgroups = []
2253+ cpu_cgroup_mountpoint, memory_cgroup_mountpoint = self.get_cgroup_mount_points()
2254
2255- def create_cgroup(controller):
2256- cgroup_path = os.path.join(CGROUPS_FILE_SYSTEM_ROOT, controller, 'system.slice', scope_name + ".scope")
2257- extension_cgroups.append(CGroup.create(cgroup_path, controller, extension_name))
2258+ if cpu_cgroup_mountpoint is None:
2259+ logger.info("The CPU controller is not mounted; will not track resource usage")
2260+ else:
2261+ cpu_cgroup_path = os.path.join(cpu_cgroup_mountpoint, cgroup_relative_path)
2262+ cpu_cgroup = CpuCgroup(extension_name, cpu_cgroup_path)
2263+ CGroupsTelemetry.track_cgroup(cpu_cgroup)
2264
2265- self._foreach_controller(create_cgroup, 'Cannot create cgroup for extension {0}; '
2266- 'resource usage will not be tracked.'.format(extension_name))
2267- self.track_cgroups(extension_cgroups)
2268+ if memory_cgroup_mountpoint is None:
2269+ logger.info("The Memory controller is not mounted; will not track resource usage")
2270+ else:
2271+ memory_cgroup_path = os.path.join(memory_cgroup_mountpoint, cgroup_relative_path)
2272+ memory_cgroup = MemoryCgroup(extension_name, memory_cgroup_path)
2273+ CGroupsTelemetry.track_cgroup(memory_cgroup)
2274+
2275+ except IOError as e:
2276+ if e.errno == 2: # 'No such file or directory'
2277+ logger.info("The extension command already completed; will not track resource usage")
2278+ logger.info("Failed to start tracking resource usage for the extension: {0}", ustr(e))
2279+ except Exception as e:
2280+ logger.info("Failed to start tracking resource usage for the extension: {0}", ustr(e))
2281
2282 # Wait for process completion or timeout
2283 try:
2284- process_output = handle_process_completion(process=process,
2285- command=command,
2286- timeout=timeout,
2287- stdout=stdout,
2288- stderr=stderr,
2289- error_code=error_code)
2290+ return handle_process_completion(process=process, command=command, timeout=timeout, stdout=stdout,
2291+ stderr=stderr, error_code=error_code, cpu_cgroup=cpu_cgroup)
2292 except ExtensionError as e:
2293 # The extension didn't terminate successfully. Determine whether it was due to systemd errors or
2294 # extension errors.
2295- process_output = read_output(stdout, stderr)
2296- systemd_failure = self._is_systemd_failure(scope_name, process_output)
2297-
2298- if not systemd_failure:
2299+ if not self._is_systemd_failure(scope, stderr):
2300 # There was an extension error; it either timed out or returned a non-zero exit code. Re-raise the error
2301 raise
2302+
2303+ # There was an issue with systemd-run. We need to log it and retry the extension without systemd.
2304+ process_output = read_output(stdout, stderr)
2305+ # Reset the stdout and stderr
2306+ stdout.truncate(0)
2307+ stderr.truncate(0)
2308+
2309+ if isinstance(e, ExtensionOperationError):
2310+ # no-member: Instance of 'ExtensionError' has no 'exit_code' member (no-member) - Disabled: e is actually an ExtensionOperationError
2311+ err_msg = 'Systemd process exited with code %s and output %s' % (
2312+ e.exit_code, process_output) # pylint: disable=no-member
2313 else:
2314- # There was an issue with systemd-run. We need to log it and retry the extension without systemd.
2315- err_msg = 'Systemd process exited with code %s and output %s' % (e.exit_code, process_output) \
2316- if isinstance(e, ExtensionOperationError) else "Systemd timed-out, output: %s" % process_output
2317- event_msg = 'Failed to run systemd-run for unit {0}.scope. ' \
2318- 'Will retry invoking the extension without systemd. ' \
2319- 'Systemd-run error: {1}'.format(scope_name, err_msg)
2320- add_event(AGENT_NAME,
2321- version=CURRENT_VERSION,
2322- op=WALAEventOperation.InvokeCommandUsingSystemd,
2323- is_success=False,
2324- log_event=False,
2325- message=event_msg)
2326- logger.warn(event_msg)
2327-
2328- # Reset the stdout and stderr
2329- stdout.truncate(0)
2330- stderr.truncate(0)
2331-
2332- # Try invoking the process again, this time without systemd-run
2333- logger.info('Extension invocation using systemd failed, falling back to regular invocation '
2334- 'without cgroups tracking.')
2335- process = subprocess.Popen(command,
2336- shell=shell,
2337- cwd=cwd,
2338- env=env,
2339- stdout=stdout,
2340- stderr=stderr,
2341- preexec_fn=os.setsid)
2342-
2343- process_output = handle_process_completion(process=process,
2344- command=command,
2345- timeout=timeout,
2346- stdout=stdout,
2347- stderr=stderr,
2348- error_code=error_code)
2349-
2350- return [], process_output
2351-
2352- # The process terminated in time and successfully
2353- return extension_cgroups, process_output
2354+ err_msg = "Systemd timed-out, output: %s" % process_output
2355+ raise SystemdRunError(err_msg)
2356+ finally:
2357+ with self._systemd_run_commands_lock:
2358+ self._systemd_run_commands.remove(process.pid)
2359
2360 def cleanup_legacy_cgroups(self):
2361 """
2362 Previous versions of the daemon (2.2.31-2.2.40) wrote their PID to /sys/fs/cgroup/{cpu,memory}/WALinuxAgent/WALinuxAgent;
2363 starting from version 2.2.41 we track the agent service in walinuxagent.service instead of WALinuxAgent/WALinuxAgent. If
2364- we find that any of the legacy groups include the PID of the daemon then we disable data collection for this instance
2365- (under systemd, moving PIDs across the cgroup file system can produce unpredictable results)
2366+ we find that any of the legacy groups include the PID of the daemon then we need to disable data collection for this
2367+ instance (under systemd, moving PIDs across the cgroup file system can produce unpredictable results)
2368 """
2369- def report_error(_, daemon_pid):
2370- raise CGroupsException(
2371- "The daemon's PID ({0}) was already added to the legacy cgroup; this invalidates resource usage data.".format(daemon_pid))
2372-
2373- CGroupsApi._foreach_legacy_cgroup(report_error)
2374+ return CGroupsApi._foreach_legacy_cgroup(lambda *_: None)
2375diff --git a/azurelinuxagent/common/cgroupconfigurator.py b/azurelinuxagent/common/cgroupconfigurator.py
2376index ea6983f..767786f 100644
2377--- a/azurelinuxagent/common/cgroupconfigurator.py
2378+++ b/azurelinuxagent/common/cgroupconfigurator.py
2379@@ -1,3 +1,4 @@
2380+# -*- encoding: utf-8 -*-
2381 # Copyright 2018 Microsoft Corporation
2382 #
2383 # Licensed under the Apache License, Version 2.0 (the "License");
2384@@ -13,157 +14,870 @@
2385 # limitations under the License.
2386 #
2387 # Requires Python 2.6+ and Openssl 1.0+
2388-
2389+import glob
2390+import json
2391 import os
2392+import re
2393 import subprocess
2394+import threading
2395
2396+from azurelinuxagent.common import conf
2397 from azurelinuxagent.common import logger
2398-from azurelinuxagent.common.cgroupapi import CGroupsApi
2399+from azurelinuxagent.common.cgroup import CpuCgroup, AGENT_NAME_TELEMETRY, MetricsCounter, MemoryCgroup
2400+from azurelinuxagent.common.cgroupapi import CGroupsApi, SystemdCgroupsApi, SystemdRunError, EXTENSION_SLICE_PREFIX
2401 from azurelinuxagent.common.cgroupstelemetry import CGroupsTelemetry
2402-from azurelinuxagent.common.exception import CGroupsException, ExtensionErrorCodes
2403+from azurelinuxagent.common.exception import ExtensionErrorCodes, CGroupsException, AgentMemoryExceededException
2404 from azurelinuxagent.common.future import ustr
2405-from azurelinuxagent.common.osutil import get_osutil
2406+from azurelinuxagent.common.osutil import get_osutil, systemd
2407+from azurelinuxagent.common.version import get_distro
2408+from azurelinuxagent.common.utils import shellutil, fileutil
2409 from azurelinuxagent.common.utils.extensionprocessutil import handle_process_completion
2410-from azurelinuxagent.common.version import AGENT_NAME, CURRENT_VERSION
2411 from azurelinuxagent.common.event import add_event, WALAEventOperation
2412
2413+AZURE_SLICE = "azure.slice"
2414+_AZURE_SLICE_CONTENTS = """
2415+[Unit]
2416+Description=Slice for Azure VM Agent and Extensions
2417+DefaultDependencies=no
2418+Before=slices.target
2419+"""
2420+_VMEXTENSIONS_SLICE = EXTENSION_SLICE_PREFIX + ".slice"
2421+_AZURE_VMEXTENSIONS_SLICE = AZURE_SLICE + "/" + _VMEXTENSIONS_SLICE
2422+_VMEXTENSIONS_SLICE_CONTENTS = """
2423+[Unit]
2424+Description=Slice for Azure VM Extensions
2425+DefaultDependencies=no
2426+Before=slices.target
2427+[Slice]
2428+CPUAccounting=yes
2429+MemoryAccounting=yes
2430+"""
2431+_EXTENSION_SLICE_CONTENTS = """
2432+[Unit]
2433+Description=Slice for Azure VM extension {extension_name}
2434+DefaultDependencies=no
2435+Before=slices.target
2436+[Slice]
2437+CPUAccounting=yes
2438+CPUQuota={cpu_quota}
2439+MemoryAccounting=yes
2440+"""
2441+LOGCOLLECTOR_SLICE = "azure-walinuxagent-logcollector.slice"
2442+# More info on resource limits properties in systemd here:
2443+# https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/7/html/resource_management_guide/sec-modifying_control_groups
2444+_LOGCOLLECTOR_SLICE_CONTENTS_FMT = """
2445+[Unit]
2446+Description=Slice for Azure VM Agent Periodic Log Collector
2447+DefaultDependencies=no
2448+Before=slices.target
2449+[Slice]
2450+CPUAccounting=yes
2451+CPUQuota={cpu_quota}
2452+MemoryAccounting=yes
2453+"""
2454+_LOGCOLLECTOR_CPU_QUOTA = "5%"
2455+LOGCOLLECTOR_MEMORY_LIMIT = 30 * 1024 ** 2 # 30 MB
2456+
2457+_AGENT_DROP_IN_FILE_SLICE = "10-Slice.conf"
2458+_AGENT_DROP_IN_FILE_SLICE_CONTENTS = """
2459+# This drop-in unit file was created by the Azure VM Agent.
2460+# Do not edit.
2461+[Service]
2462+Slice=azure.slice
2463+"""
2464+_DROP_IN_FILE_CPU_ACCOUNTING = "11-CPUAccounting.conf"
2465+_DROP_IN_FILE_CPU_ACCOUNTING_CONTENTS = """
2466+# This drop-in unit file was created by the Azure VM Agent.
2467+# Do not edit.
2468+[Service]
2469+CPUAccounting=yes
2470+"""
2471+_DROP_IN_FILE_CPU_QUOTA = "12-CPUQuota.conf"
2472+_DROP_IN_FILE_CPU_QUOTA_CONTENTS_FORMAT = """
2473+# This drop-in unit file was created by the Azure VM Agent.
2474+# Do not edit.
2475+[Service]
2476+CPUQuota={0}
2477+"""
2478+_DROP_IN_FILE_MEMORY_ACCOUNTING = "13-MemoryAccounting.conf"
2479+_DROP_IN_FILE_MEMORY_ACCOUNTING_CONTENTS = """
2480+# This drop-in unit file was created by the Azure VM Agent.
2481+# Do not edit.
2482+[Service]
2483+MemoryAccounting=yes
2484+"""
2485+
2486+
2487+class DisableCgroups(object):
2488+ ALL = "all"
2489+ AGENT = "agent"
2490+ EXTENSIONS = "extensions"
2491+
2492+
2493+def _log_cgroup_info(format_string, *args):
2494+ message = format_string.format(*args)
2495+ logger.info("[CGI] " + message)
2496+ add_event(op=WALAEventOperation.CGroupsInfo, message=message)
2497+
2498+
2499+def _log_cgroup_warning(format_string, *args):
2500+ message = format_string.format(*args)
2501+ logger.info("[CGW] " + message) # log as INFO for now, in the future it should be logged as WARNING
2502+ add_event(op=WALAEventOperation.CGroupsInfo, message=message, is_success=False, log_event=False)
2503+
2504
2505 class CGroupConfigurator(object):
2506 """
2507 This class implements the high-level operations on CGroups (e.g. initialization, creation, etc)
2508
2509- NOTE: with the exception of start_extension_command, none of the methods in this class raise exceptions (cgroup operations should not block extensions)
2510+ NOTE: with the exception of start_extension_command, none of the methods in this class
2511+ raise exceptions (cgroup operations should not block extensions)
2512 """
2513- class __impl(object):
2514+
2515+ class _Impl(object):
2516 def __init__(self):
2517+ self._initialized = False
2518+ self._cgroups_supported = False
2519+ self._agent_cgroups_enabled = False
2520+ self._extensions_cgroups_enabled = False
2521+ self._cgroups_api = None
2522+ self._agent_cpu_cgroup_path = None
2523+ self._agent_memory_cgroup_path = None
2524+ self._agent_memory_cgroup = None
2525+ self._check_cgroups_lock = threading.RLock() # Protects check_cgroups, which is called from the Monitor thread and the main loop.
2526+
2527+ def initialize(self):
2528+ try:
2529+ if self._initialized:
2530+ return
2531+ # This check is to reset the quotas if agent goes from cgroup supported to unsupported distros later in time.
2532+ if not CGroupsApi.cgroups_supported():
2533+ agent_drop_in_path = systemd.get_agent_drop_in_path()
2534+ try:
2535+ if os.path.exists(agent_drop_in_path) and os.path.isdir(agent_drop_in_path):
2536+ files_to_cleanup = []
2537+ agent_drop_in_file_slice = os.path.join(agent_drop_in_path, _AGENT_DROP_IN_FILE_SLICE)
2538+ agent_drop_in_file_cpu_accounting = os.path.join(agent_drop_in_path,
2539+ _DROP_IN_FILE_CPU_ACCOUNTING)
2540+ agent_drop_in_file_memory_accounting = os.path.join(agent_drop_in_path,
2541+ _DROP_IN_FILE_MEMORY_ACCOUNTING)
2542+ agent_drop_in_file_cpu_quota = os.path.join(agent_drop_in_path, _DROP_IN_FILE_CPU_QUOTA)
2543+ files_to_cleanup.extend([agent_drop_in_file_slice, agent_drop_in_file_cpu_accounting,
2544+ agent_drop_in_file_memory_accounting, agent_drop_in_file_cpu_quota])
2545+ self.__cleanup_all_files(files_to_cleanup)
2546+ self.__reload_systemd_config()
2547+ logger.info("Agent reset the quotas if distro: {0} goes from supported to unsupported list", get_distro())
2548+ except Exception as err:
2549+ logger.warn("Unable to delete Agent drop-in files while resetting the quotas: {0}".format(err))
2550+
2551+ # check whether cgroup monitoring is supported on the current distro
2552+ self._cgroups_supported = CGroupsApi.cgroups_supported()
2553+ if not self._cgroups_supported:
2554+ logger.info("Cgroup monitoring is not supported on {0}", get_distro())
2555+ return
2556+
2557+ # check that systemd is detected correctly
2558+ self._cgroups_api = SystemdCgroupsApi()
2559+ if not systemd.is_systemd():
2560+ _log_cgroup_warning("systemd was not detected on {0}", get_distro())
2561+ return
2562+
2563+ _log_cgroup_info("systemd version: {0}", systemd.get_version())
2564+
2565+ # This is temporarily disabled while we analyze telemetry. Likely it will be removed.
2566+ # self.__collect_azure_unit_telemetry()
2567+ # self.__collect_agent_unit_files_telemetry()
2568+
2569+ if not self.__check_no_legacy_cgroups():
2570+ return
2571+
2572+ agent_unit_name = systemd.get_agent_unit_name()
2573+ agent_slice = systemd.get_unit_property(agent_unit_name, "Slice")
2574+ if agent_slice not in (AZURE_SLICE, "system.slice"):
2575+ _log_cgroup_warning("The agent is within an unexpected slice: {0}", agent_slice)
2576+ return
2577+
2578+ self.__setup_azure_slice()
2579+
2580+ cpu_controller_root, memory_controller_root = self.__get_cgroup_controllers()
2581+ self._agent_cpu_cgroup_path, self._agent_memory_cgroup_path = self.__get_agent_cgroups(agent_slice,
2582+ cpu_controller_root,
2583+ memory_controller_root)
2584+
2585+ if self._agent_cpu_cgroup_path is not None or self._agent_memory_cgroup_path is not None:
2586+ self.enable()
2587+
2588+ if self._agent_cpu_cgroup_path is not None:
2589+ _log_cgroup_info("Agent CPU cgroup: {0}", self._agent_cpu_cgroup_path)
2590+ self.__set_cpu_quota(conf.get_agent_cpu_quota())
2591+ CGroupsTelemetry.track_cgroup(CpuCgroup(AGENT_NAME_TELEMETRY, self._agent_cpu_cgroup_path))
2592+
2593+ if self._agent_memory_cgroup_path is not None:
2594+ _log_cgroup_info("Agent Memory cgroup: {0}", self._agent_memory_cgroup_path)
2595+ self._agent_memory_cgroup = MemoryCgroup(AGENT_NAME_TELEMETRY, self._agent_memory_cgroup_path)
2596+ CGroupsTelemetry.track_cgroup(self._agent_memory_cgroup)
2597+
2598+ _log_cgroup_info('Agent cgroups enabled: {0}', self._agent_cgroups_enabled)
2599+
2600+ except Exception as exception:
2601+ _log_cgroup_warning("Error initializing cgroups: {0}", ustr(exception))
2602+ finally:
2603+ self._initialized = True
2604+
2605+ @staticmethod
2606+ def __collect_azure_unit_telemetry():
2607+ azure_units = []
2608+
2609+ try:
2610+ units = shellutil.run_command(['systemctl', 'list-units', 'azure*', '-all'])
2611+ for line in units.split('\n'):
2612+ match = re.match(r'\s?(azure[^\s]*)\s?', line, re.IGNORECASE)
2613+ if match is not None:
2614+ azure_units.append((match.group(1), line))
2615+ except shellutil.CommandError as command_error:
2616+ _log_cgroup_warning("Failed to list systemd units: {0}", ustr(command_error))
2617+
2618+ for unit_name, unit_description in azure_units:
2619+ unit_slice = "Unknown"
2620+ try:
2621+ unit_slice = systemd.get_unit_property(unit_name, "Slice")
2622+ except Exception as exception:
2623+ _log_cgroup_warning("Failed to query Slice for {0}: {1}", unit_name, ustr(exception))
2624+
2625+ _log_cgroup_info("Found an Azure unit under slice {0}: {1}", unit_slice, unit_description)
2626+
2627+ if len(azure_units) == 0:
2628+ try:
2629+ cgroups = shellutil.run_command('systemd-cgls')
2630+ for line in cgroups.split('\n'):
2631+ if re.match(r'[^\x00-\xff]+azure\.slice\s*', line, re.UNICODE):
2632+ logger.info(ustr("Found a cgroup for azure.slice\n{0}").format(cgroups))
2633+ # Don't add the output of systemd-cgls to the telemetry, since currently it does not support Unicode
2634+ add_event(op=WALAEventOperation.CGroupsInfo, message="Found a cgroup for azure.slice")
2635+ except shellutil.CommandError as command_error:
2636+ _log_cgroup_warning("Failed to list systemd units: {0}", ustr(command_error))
2637+
2638+ @staticmethod
2639+ def __collect_agent_unit_files_telemetry():
2640+ agent_unit_files = []
2641+ agent_service_name = get_osutil().get_service_name()
2642+ try:
2643+ fragment_path = systemd.get_unit_property(agent_service_name, "FragmentPath")
2644+ if fragment_path != systemd.get_agent_unit_file():
2645+ agent_unit_files.append(fragment_path)
2646+ except Exception as exception:
2647+ _log_cgroup_warning("Failed to query the agent's FragmentPath: {0}", ustr(exception))
2648+
2649+ try:
2650+ drop_in_paths = systemd.get_unit_property(agent_service_name, "DropInPaths")
2651+ for path in drop_in_paths.split():
2652+ agent_unit_files.append(path)
2653+ except Exception as exception:
2654+ _log_cgroup_warning("Failed to query the agent's DropInPaths: {0}", ustr(exception))
2655+
2656+ for unit_file in agent_unit_files:
2657+ try:
2658+ with open(unit_file, "r") as file_object:
2659+ _log_cgroup_info("Found a custom unit file for the agent: {0}\n{1}", unit_file,
2660+ file_object.read())
2661+ except Exception as exception:
2662+ _log_cgroup_warning("Can't read {0}: {1}", unit_file, ustr(exception))
2663+
2664+ def __check_no_legacy_cgroups(self):
2665+ """
2666+ Older versions of the daemon (2.2.31-2.2.40) wrote their PID to /sys/fs/cgroup/{cpu,memory}/WALinuxAgent/WALinuxAgent. When running
2667+ under systemd this could produce invalid resource usage data. Cgroups should not be enabled under this condition.
2668+ """
2669+ legacy_cgroups = self._cgroups_api.cleanup_legacy_cgroups()
2670+ if legacy_cgroups > 0:
2671+ _log_cgroup_warning("The daemon's PID was added to a legacy cgroup; will not monitor resource usage.")
2672+ return False
2673+ return True
2674+
2675+ def __get_cgroup_controllers(self):
2676+ #
2677+ # check v1 controllers
2678+ #
2679+ cpu_controller_root, memory_controller_root = self._cgroups_api.get_cgroup_mount_points()
2680+
2681+ if cpu_controller_root is not None:
2682+ logger.info("The CPU cgroup controller is mounted at {0}", cpu_controller_root)
2683+ else:
2684+ _log_cgroup_warning("The CPU cgroup controller is not mounted")
2685+
2686+ if memory_controller_root is not None:
2687+ logger.info("The memory cgroup controller is mounted at {0}", memory_controller_root)
2688+ else:
2689+ _log_cgroup_warning("The memory cgroup controller is not mounted")
2690+
2691+ #
2692+ # check v2 controllers
2693+ #
2694+ cgroup2_mount_point, cgroup2_controllers = self._cgroups_api.get_cgroup2_controllers()
2695+ if cgroup2_mount_point is not None:
2696+ _log_cgroup_info("cgroups v2 mounted at {0}. Controllers: [{1}]", cgroup2_mount_point,
2697+ cgroup2_controllers)
2698+
2699+ return cpu_controller_root, memory_controller_root
2700+
2701+ @staticmethod
2702+ def __setup_azure_slice():
2703 """
2704- Ensures the cgroups file system is mounted and selects the correct API to interact with it
2705+ The agent creates "azure.slice" for use by extensions and the agent. The agent runs under "azure.slice" directly and each
2706+ extension runs under its own slice ("Microsoft.CPlat.Extension.slice" in the example below). All the slices for
2707+ extensions are grouped under "vmextensions.slice".
2708+
2709+ Example: -.slice
2710+ ├─user.slice
2711+ ├─system.slice
2712+ └─azure.slice
2713+ ├─walinuxagent.service
2714+ │ ├─5759 /usr/bin/python3 -u /usr/sbin/waagent -daemon
2715+ │ └─5764 python3 -u bin/WALinuxAgent-2.2.53-py2.7.egg -run-exthandlers
2716+ └─azure-vmextensions.slice
2717+ └─Microsoft.CPlat.Extension.slice
2718+ └─5894 /usr/bin/python3 /var/lib/waagent/Microsoft.CPlat.Extension-1.0.0.0/enable.py
2719+
2720+ This method ensures that the "azure" and "vmextensions" slices are created. Setup should create those slices
2721+ under /lib/systemd/system; but if they do not exist, __ensure_azure_slices_exist will create them.
2722+
2723+ It also creates drop-in files to set the agent's Slice and CPUAccounting if they have not been
2724+ set up in the agent's unit file.
2725+
2726+ Lastly, the method also cleans up unit files left over from previous versions of the agent.
2727 """
2728- osutil = get_osutil()
2729
2730- self._cgroups_supported = osutil.is_cgroups_supported()
2731+ # Older agents used to create this slice, but it was never used. Cleanup the file.
2732+ CGroupConfigurator._Impl.__cleanup_unit_file("/etc/systemd/system/system-walinuxagent.extensions.slice")
2733+
2734+ unit_file_install_path = systemd.get_unit_file_install_path()
2735+ azure_slice = os.path.join(unit_file_install_path, AZURE_SLICE)
2736+ vmextensions_slice = os.path.join(unit_file_install_path, _VMEXTENSIONS_SLICE)
2737+ logcollector_slice = os.path.join(unit_file_install_path, LOGCOLLECTOR_SLICE)
2738+ agent_unit_file = systemd.get_agent_unit_file()
2739+ agent_drop_in_path = systemd.get_agent_drop_in_path()
2740+ agent_drop_in_file_slice = os.path.join(agent_drop_in_path, _AGENT_DROP_IN_FILE_SLICE)
2741+ agent_drop_in_file_cpu_accounting = os.path.join(agent_drop_in_path, _DROP_IN_FILE_CPU_ACCOUNTING)
2742+ agent_drop_in_file_memory_accounting = os.path.join(agent_drop_in_path, _DROP_IN_FILE_MEMORY_ACCOUNTING)
2743+
2744+ files_to_create = []
2745+
2746+ if not os.path.exists(azure_slice):
2747+ files_to_create.append((azure_slice, _AZURE_SLICE_CONTENTS))
2748+
2749+ if not os.path.exists(vmextensions_slice):
2750+ files_to_create.append((vmextensions_slice, _VMEXTENSIONS_SLICE_CONTENTS))
2751+
2752+ # Update log collector slice contents
2753+ slice_contents = _LOGCOLLECTOR_SLICE_CONTENTS_FMT.format(cpu_quota=_LOGCOLLECTOR_CPU_QUOTA)
2754+ files_to_create.append((logcollector_slice, slice_contents))
2755+
2756+ if fileutil.findre_in_file(agent_unit_file, r"Slice=") is not None:
2757+ CGroupConfigurator._Impl.__cleanup_unit_file(agent_drop_in_file_slice)
2758+ else:
2759+ if not os.path.exists(agent_drop_in_file_slice):
2760+ files_to_create.append((agent_drop_in_file_slice, _AGENT_DROP_IN_FILE_SLICE_CONTENTS))
2761+
2762+ if fileutil.findre_in_file(agent_unit_file, r"CPUAccounting=") is not None:
2763+ CGroupConfigurator._Impl.__cleanup_unit_file(agent_drop_in_file_cpu_accounting)
2764+ else:
2765+ if not os.path.exists(agent_drop_in_file_cpu_accounting):
2766+ files_to_create.append((agent_drop_in_file_cpu_accounting, _DROP_IN_FILE_CPU_ACCOUNTING_CONTENTS))
2767+
2768+ if fileutil.findre_in_file(agent_unit_file, r"MemoryAccounting=") is not None:
2769+ CGroupConfigurator._Impl.__cleanup_unit_file(agent_drop_in_file_memory_accounting)
2770+ else:
2771+ if not os.path.exists(agent_drop_in_file_memory_accounting):
2772+ files_to_create.append(
2773+ (agent_drop_in_file_memory_accounting, _DROP_IN_FILE_MEMORY_ACCOUNTING_CONTENTS))
2774+
2775+ if len(files_to_create) > 0:
2776+ # create the unit files, but if 1 fails remove all and return
2777+ try:
2778+ for path, contents in files_to_create:
2779+ CGroupConfigurator._Impl.__create_unit_file(path, contents)
2780+ except Exception as exception:
2781+ _log_cgroup_warning("Failed to create unit files for the azure slice: {0}", ustr(exception))
2782+ for unit_file in files_to_create:
2783+ CGroupConfigurator._Impl.__cleanup_unit_file(unit_file)
2784+ return
2785+
2786+ CGroupConfigurator._Impl.__reload_systemd_config()
2787
2788- if self._cgroups_supported:
2789- self._enabled = True
2790+ @staticmethod
2791+ def __reload_systemd_config():
2792+ # reload the systemd configuration; the new slices will be used once the agent's service restarts
2793+ try:
2794+ logger.info("Executing systemctl daemon-reload...")
2795+ shellutil.run_command(["systemctl", "daemon-reload"])
2796+ except Exception as exception:
2797+ _log_cgroup_warning("daemon-reload failed (create azure slice): {0}", ustr(exception))
2798+
2799+ @staticmethod
2800+ def __create_unit_file(path, contents):
2801+ parent, _ = os.path.split(path)
2802+ if not os.path.exists(parent):
2803+ fileutil.mkdir(parent, mode=0o755)
2804+ exists = os.path.exists(path)
2805+ fileutil.write_file(path, contents)
2806+ _log_cgroup_info("{0} {1}", "Updated" if exists else "Created", path)
2807+
2808+ @staticmethod
2809+ def __cleanup_unit_file(path):
2810+ if os.path.exists(path):
2811 try:
2812- osutil.mount_cgroups()
2813- self._cgroups_api = CGroupsApi.create()
2814- status = "The cgroup filesystem is ready to use"
2815- except Exception as e:
2816- status = ustr(e)
2817- self._enabled = False
2818+ os.remove(path)
2819+ _log_cgroup_info("Removed {0}", path)
2820+ except Exception as exception:
2821+ _log_cgroup_warning("Failed to remove {0}: {1}", path, ustr(exception))
2822+
2823+ @staticmethod
2824+ def __cleanup_all_files(files_to_cleanup):
2825+ for path in files_to_cleanup:
2826+ if os.path.exists(path):
2827+ try:
2828+ os.remove(path)
2829+ _log_cgroup_info("Removed {0}", path)
2830+ except Exception as exception:
2831+ _log_cgroup_warning("Failed to remove {0}: {1}", path, ustr(exception))
2832+
2833+ @staticmethod
2834+ def __create_all_files(files_to_create):
2835+ # create the unit files, but if 1 fails remove all and return
2836+ try:
2837+ for path, contents in files_to_create:
2838+ CGroupConfigurator._Impl.__create_unit_file(path, contents)
2839+ except Exception as exception:
2840+ _log_cgroup_warning("Failed to create unit files : {0}", ustr(exception))
2841+ for unit_file in files_to_create:
2842+ CGroupConfigurator._Impl.__cleanup_unit_file(unit_file)
2843+ return
2844+
2845+ def is_extension_resource_limits_setup_completed(self, extension_name, cpu_quota=None):
2846+ unit_file_install_path = systemd.get_unit_file_install_path()
2847+ old_extension_slice_path = os.path.join(unit_file_install_path, SystemdCgroupsApi.get_extension_slice_name(extension_name, old_slice=True))
2848+ # clean up the old slice from the disk
2849+ if os.path.exists(old_extension_slice_path):
2850+ CGroupConfigurator._Impl.__cleanup_unit_file(old_extension_slice_path)
2851+
2852+ extension_slice_path = os.path.join(unit_file_install_path,
2853+ SystemdCgroupsApi.get_extension_slice_name(extension_name))
2854+ cpu_quota = str(
2855+ cpu_quota) + "%" if cpu_quota is not None else "" # setting an empty value resets to the default (infinity)
2856+ slice_contents = _EXTENSION_SLICE_CONTENTS.format(extension_name=extension_name,
2857+ cpu_quota=cpu_quota)
2858+ if os.path.exists(extension_slice_path):
2859+ with open(extension_slice_path, "r") as file_:
2860+ if file_.read() == slice_contents:
2861+ return True
2862+ return False
2863+
2864+ def __get_agent_cgroups(self, agent_slice, cpu_controller_root, memory_controller_root):
2865+ agent_unit_name = systemd.get_agent_unit_name()
2866+
2867+ expected_relative_path = os.path.join(agent_slice, agent_unit_name)
2868+ cpu_cgroup_relative_path, memory_cgroup_relative_path = self._cgroups_api.get_process_cgroup_relative_paths(
2869+ "self")
2870+
2871+ if cpu_cgroup_relative_path is None:
2872+ _log_cgroup_warning("The agent's process is not within a CPU cgroup")
2873+ else:
2874+ if cpu_cgroup_relative_path == expected_relative_path:
2875+ _log_cgroup_info('CPUAccounting: {0}', systemd.get_unit_property(agent_unit_name, "CPUAccounting"))
2876+ _log_cgroup_info('CPUQuota: {0}', systemd.get_unit_property(agent_unit_name, "CPUQuotaPerSecUSec"))
2877+ else:
2878+ _log_cgroup_warning(
2879+ "The Agent is not in the expected CPU cgroup; will not enable monitoring. Cgroup:[{0}] Expected:[{1}]",
2880+ cpu_cgroup_relative_path,
2881+ expected_relative_path)
2882+ cpu_cgroup_relative_path = None # Set the path to None to prevent monitoring
2883+
2884+ if memory_cgroup_relative_path is None:
2885+ _log_cgroup_warning("The agent's process is not within a memory cgroup")
2886 else:
2887- self._enabled = False
2888- self._cgroups_api = None
2889- status = "Cgroups are not supported by the platform"
2890+ if memory_cgroup_relative_path == expected_relative_path:
2891+ memory_accounting = systemd.get_unit_property(agent_unit_name, "MemoryAccounting")
2892+ _log_cgroup_info('MemoryAccounting: {0}', memory_accounting)
2893+ else:
2894+ _log_cgroup_info(
2895+ "The Agent is not in the expected memory cgroup; will not enable monitoring. CGroup:[{0}] Expected:[{1}]",
2896+ memory_cgroup_relative_path,
2897+ expected_relative_path)
2898+ memory_cgroup_relative_path = None # Set the path to None to prevent monitoring
2899
2900- logger.info("CGroups Status: {0}".format(status))
2901+ if cpu_controller_root is not None and cpu_cgroup_relative_path is not None:
2902+ agent_cpu_cgroup_path = os.path.join(cpu_controller_root, cpu_cgroup_relative_path)
2903+ else:
2904+ agent_cpu_cgroup_path = None
2905+
2906+ if memory_controller_root is not None and memory_cgroup_relative_path is not None:
2907+ agent_memory_cgroup_path = os.path.join(memory_controller_root, memory_cgroup_relative_path)
2908+ else:
2909+ agent_memory_cgroup_path = None
2910
2911- add_event(
2912- AGENT_NAME,
2913- version=CURRENT_VERSION,
2914- op=WALAEventOperation.InitializeCGroups,
2915- is_success=self._enabled,
2916- message=status,
2917- log_event=False)
2918+ return agent_cpu_cgroup_path, agent_memory_cgroup_path
2919+
2920+ def supported(self):
2921+ return self._cgroups_supported
2922
2923 def enabled(self):
2924- return self._enabled
2925+ return self._agent_cgroups_enabled or self._extensions_cgroups_enabled
2926+
2927+ def agent_enabled(self):
2928+ return self._agent_cgroups_enabled
2929+
2930+ def extensions_enabled(self):
2931+ return self._extensions_cgroups_enabled
2932
2933 def enable(self):
2934- if not self._cgroups_supported:
2935- raise CGroupsException("cgroups are not supported on the current platform")
2936+ if not self.supported():
2937+ raise CGroupsException(
2938+ "Attempted to enable cgroups, but they are not supported on the current platform")
2939+ self._agent_cgroups_enabled = True
2940+ self._extensions_cgroups_enabled = True
2941
2942- self._enabled = True
2943+ def disable(self, reason, disable_cgroups):
2944+ if disable_cgroups == DisableCgroups.ALL: # disable all
2945+ # Reset quotas
2946+ self.__reset_agent_cpu_quota()
2947+ extension_services = self.get_extension_services_list()
2948+ for extension in extension_services:
2949+ logger.info("Resetting extension : {0} and it's services: {1} CPUQuota".format(extension, extension_services[extension]))
2950+ self.__reset_extension_cpu_quota(extension_name=extension)
2951+ self.__reset_extension_services_cpu_quota(extension_services[extension])
2952+ self.__reload_systemd_config()
2953
2954- def disable(self):
2955- self._enabled = False
2956- CGroupsTelemetry.reset()
2957+ CGroupsTelemetry.reset()
2958+ self._agent_cgroups_enabled = False
2959+ self._extensions_cgroups_enabled = False
2960+ elif disable_cgroups == DisableCgroups.AGENT: # disable agent
2961+ self._agent_cgroups_enabled = False
2962+ self.__reset_agent_cpu_quota()
2963+ CGroupsTelemetry.stop_tracking(CpuCgroup(AGENT_NAME_TELEMETRY, self._agent_cpu_cgroup_path))
2964
2965- def _invoke_cgroup_operation(self, operation, error_message, on_error=None):
2966+ message = "[CGW] Disabling resource usage monitoring. Reason: {0}".format(reason)
2967+ logger.info(message) # log as INFO for now, in the future it should be logged as WARNING
2968+ add_event(op=WALAEventOperation.CGroupsDisabled, message=message, is_success=False, log_event=False)
2969+
2970+ @staticmethod
2971+ def __set_cpu_quota(quota):
2972 """
2973- Ensures the given operation is invoked only if cgroups are enabled and traps any errors on the operation.
2974+ Sets the agent's CPU quota to the given percentage (100% == 1 CPU)
2975+
2976+ NOTE: This is done using a dropin file in the default dropin directory; any local overrides on the VM will take precedence
2977+ over this setting.
2978 """
2979- if not self.enabled():
2980- return
2981+ quota_percentage = "{0}%".format(quota)
2982+ _log_cgroup_info("Ensuring the agent's CPUQuota is {0}", quota_percentage)
2983+ if CGroupConfigurator._Impl.__try_set_cpu_quota(quota_percentage):
2984+ CGroupsTelemetry.set_track_throttled_time(True)
2985+
2986+ @staticmethod
2987+ def __reset_agent_cpu_quota():
2988+ """
2989+ Removes any CPUQuota on the agent
2990
2991+ NOTE: This resets the quota on the agent's default dropin file; any local overrides on the VM will take precedence
2992+ over this setting.
2993+ """
2994+ logger.info("Resetting agent's CPUQuota")
2995+ if CGroupConfigurator._Impl.__try_set_cpu_quota(''): # setting an empty value resets to the default (infinity)
2996+ _log_cgroup_info('CPUQuota: {0}',
2997+ systemd.get_unit_property(systemd.get_agent_unit_name(), "CPUQuotaPerSecUSec"))
2998+
2999+ @staticmethod
3000+ def __try_set_cpu_quota(quota):
3001 try:
3002- return operation()
3003- except Exception as e:
3004- logger.warn("{0} Error: {1}".format(error_message, ustr(e)))
3005- if on_error is not None:
3006- try:
3007- on_error(e)
3008- except Exception as ex:
3009- logger.warn("CGroupConfigurator._invoke_cgroup_operation: {0}".format(ustr(e)))
3010+ drop_in_file = os.path.join(systemd.get_agent_drop_in_path(), _DROP_IN_FILE_CPU_QUOTA)
3011+ contents = _DROP_IN_FILE_CPU_QUOTA_CONTENTS_FORMAT.format(quota)
3012+ if os.path.exists(drop_in_file):
3013+ with open(drop_in_file, "r") as file_:
3014+ if file_.read() == contents:
3015+ return True # no need to update the file; return here to avoid doing a daemon-reload
3016+ CGroupConfigurator._Impl.__create_unit_file(drop_in_file, contents)
3017+ except Exception as exception:
3018+ _log_cgroup_warning('Failed to set CPUQuota: {0}', ustr(exception))
3019+ return False
3020+ try:
3021+ logger.info("Executing systemctl daemon-reload...")
3022+ shellutil.run_command(["systemctl", "daemon-reload"])
3023+ except Exception as exception:
3024+ _log_cgroup_warning("daemon-reload failed (set quota): {0}", ustr(exception))
3025+ return False
3026+ return True
3027+
3028+ def check_cgroups(self, cgroup_metrics):
3029+ self._check_cgroups_lock.acquire()
3030+ try:
3031+ if not self.enabled():
3032+ return
3033+
3034+ errors = []
3035+
3036+ process_check_success = False
3037+ try:
3038+ self._check_processes_in_agent_cgroup()
3039+ process_check_success = True
3040+ except CGroupsException as exception:
3041+ errors.append(exception)
3042
3043- def create_agent_cgroups(self, track_cgroups):
3044+ quota_check_success = False
3045+ try:
3046+ if cgroup_metrics:
3047+ self._check_agent_throttled_time(cgroup_metrics)
3048+ quota_check_success = True
3049+ except CGroupsException as exception:
3050+ errors.append(exception)
3051+
3052+ reason = "Check on cgroups failed:\n{0}".format("\n".join([ustr(e) for e in errors]))
3053+
3054+ if not process_check_success and conf.get_cgroup_disable_on_process_check_failure():
3055+ self.disable(reason, DisableCgroups.ALL)
3056+
3057+ if not quota_check_success and conf.get_cgroup_disable_on_quota_check_failure():
3058+ self.disable(reason, DisableCgroups.AGENT)
3059+ finally:
3060+ self._check_cgroups_lock.release()
3061+
3062+ def _check_processes_in_agent_cgroup(self):
3063 """
3064- Creates and returns the cgroups needed to track the VM Agent
3065+ Verifies that the agent's cgroup includes only the current process, its parent, commands started using shellutil and instances of systemd-run
3066+ (those processes correspond, respectively, to the extension handler, the daemon, commands started by the extension handler, and the systemd-run
3067+ commands used to start extensions on their own cgroup).
3068+ Other processes started by the agent (e.g. extensions) and processes not started by the agent (e.g. services installed by extensions) are reported
3069+ as unexpected, since they should belong to their own cgroup.
3070+
3071+ Raises a CGroupsException if the check fails
3072 """
3073- def __impl():
3074- cgroups = self._cgroups_api.create_agent_cgroups()
3075+ unexpected = []
3076+ agent_cgroup_proc_names = []
3077+ try:
3078+ daemon = os.getppid()
3079+ extension_handler = os.getpid()
3080+ agent_commands = set()
3081+ agent_commands.update(shellutil.get_running_commands())
3082+ systemd_run_commands = set()
3083+ systemd_run_commands.update(self._cgroups_api.get_systemd_run_commands())
3084+ agent_cgroup = CGroupsApi.get_processes_in_cgroup(self._agent_cpu_cgroup_path)
3085+ # get the running commands again in case new commands started or completed while we were fetching the processes in the cgroup;
3086+ agent_commands.update(shellutil.get_running_commands())
3087+ systemd_run_commands.update(self._cgroups_api.get_systemd_run_commands())
3088
3089- if track_cgroups:
3090- for cgroup in cgroups:
3091- CGroupsTelemetry.track_cgroup(cgroup)
3092+ for process in agent_cgroup:
3093+ agent_cgroup_proc_names.append(self.__format_process(process))
3094+ # Note that the agent uses systemd-run to start extensions; systemd-run belongs to the agent cgroup, though the extensions don't.
3095+ if process in (daemon, extension_handler) or process in systemd_run_commands:
3096+ continue
3097+ # check shell systemd_run process if above process check didn't catch it
3098+ if self._check_systemd_run_process(process):
3099+ continue
3100+ # systemd_run_commands contains the shell that started systemd-run, so we also need to check for the parent
3101+ if self._get_parent(process) in systemd_run_commands and self._get_command(
3102+ process) == 'systemd-run':
3103+ continue
3104+ # check if the process is a command started by the agent or a descendant of one of those commands
3105+ current = process
3106+ while current != 0 and current not in agent_commands:
3107+ current = self._get_parent(current)
3108+ # Verify if Process started by agent based on the marker found in process environment or process is in Zombie state.
3109+ # If so, consider it as valid process in agent cgroup.
3110+ if current == 0 and not (self.__is_process_descendant_of_the_agent(process) or self.__is_zombie_process(process)):
3111+ unexpected.append(self.__format_process(process))
3112+ if len(unexpected) >= 5: # collect just a small sample
3113+ break
3114+ except Exception as exception:
3115+ _log_cgroup_warning("Error checking the processes in the agent's cgroup: {0}".format(ustr(exception)))
3116
3117- return cgroups
3118+ if len(unexpected) > 0:
3119+ self._report_agent_cgroups_procs(agent_cgroup_proc_names, unexpected)
3120+ raise CGroupsException("The agent's cgroup includes unexpected processes: {0}".format(unexpected))
3121
3122- self._invoke_cgroup_operation(__impl, "Failed to create a cgroup for the VM Agent; resource usage for the Agent will not be tracked.")
3123+ @staticmethod
3124+ def _get_command(pid):
3125+ try:
3126+ with open('/proc/{0}/comm'.format(pid), "r") as file_:
3127+ comm = file_.read()
3128+ if comm and comm[-1] == '\x00': # if null-terminated, remove the null
3129+ comm = comm[:-1]
3130+ return comm.rstrip()
3131+ except Exception:
3132+ return "UNKNOWN"
3133
3134- def cleanup_legacy_cgroups(self):
3135- def __impl():
3136- self._cgroups_api.cleanup_legacy_cgroups()
3137+ @staticmethod
3138+ def __format_process(pid):
3139+ """
3140+ Formats the given PID as a string containing the PID and the corresponding command line truncated to 64 chars
3141+ """
3142+ try:
3143+ cmdline = '/proc/{0}/cmdline'.format(pid)
3144+ if os.path.exists(cmdline):
3145+ with open(cmdline, "r") as cmdline_file:
3146+ return "[PID: {0}] {1:64.64}".format(pid, cmdline_file.read())
3147+ except Exception:
3148+ pass
3149+ return "[PID: {0}] UNKNOWN".format(pid)
3150
3151- message = 'Failed to process legacy cgroups. Collection of resource usage data will be disabled.'
3152+ @staticmethod
3153+ def __is_process_descendant_of_the_agent(pid):
3154+ """
3155+ Returns True if the process is descendant of the agent by looking at the env flag(AZURE_GUEST_AGENT_PARENT_PROCESS_NAME)
3156+ that we set when the process starts otherwise False.
3157+ """
3158+ try:
3159+ env = '/proc/{0}/environ'.format(pid)
3160+ if os.path.exists(env):
3161+ with open(env, "r") as env_file:
3162+ environ = env_file.read()
3163+ if environ and environ[-1] == '\x00':
3164+ environ = environ[:-1]
3165+ return "{0}={1}".format(shellutil.PARENT_PROCESS_NAME, shellutil.AZURE_GUEST_AGENT) in environ
3166+ except Exception:
3167+ pass
3168+ return False
3169
3170- def disable_cgroups(exception):
3171- self.disable()
3172- add_event(
3173- AGENT_NAME,
3174- version=CURRENT_VERSION,
3175- op=WALAEventOperation.CGroupsCleanUp,
3176- is_success=False,
3177- log_event=False,
3178- message='{0} {1}'.format(message, ustr(exception)))
3179+ @staticmethod
3180+ def __is_zombie_process(pid):
3181+ """
3182+ Returns True if process is in Zombie state otherwise False.
3183
3184- self._invoke_cgroup_operation(__impl, message, on_error=disable_cgroups)
3185+ Ex: cat /proc/18171/stat
3186+ 18171 (python3) S 18103 18103 18103 0 -1 4194624 57736 64902 0 3
3187+ """
3188+ try:
3189+ stat = '/proc/{0}/stat'.format(pid)
3190+ if os.path.exists(stat):
3191+ with open(stat, "r") as stat_file:
3192+ return stat_file.read().split()[2] == 'Z'
3193+ except Exception:
3194+ pass
3195+ return False
3196
3197- def create_extension_cgroups_root(self):
3198+ @staticmethod
3199+ def _check_systemd_run_process(process):
3200 """
3201- Creates the container (directory/cgroup) that includes the cgroups for all extensions (/sys/fs/cgroup/*/walinuxagent.extensions)
3202+ Returns True if process is shell systemd-run process started by agent otherwise False.
3203+
3204+ Ex: sh,7345 -c systemd-run --unit=enable_7c5cab19-eb79-4661-95d9-9e5091bd5ae0 --scope --slice=azure-vmextensions-Microsoft.OSTCExtensions.VMAccessForLinux_1.5.11.slice /var/lib/waagent/Microsoft.OSTCExtensions.VMAccessForLinux-1.5.11/processes.sh
3205 """
3206- def __impl():
3207- self._cgroups_api.create_extension_cgroups_root()
3208+ try:
3209+ process_name = "UNKNOWN"
3210+ cmdline = '/proc/{0}/cmdline'.format(process)
3211+ if os.path.exists(cmdline):
3212+ with open(cmdline, "r") as cmdline_file:
3213+ process_name = "{0}".format(cmdline_file.read())
3214+ match = re.search(r'systemd-run.*--unit=.*--scope.*--slice=azure-vmextensions.*', process_name)
3215+ if match is not None:
3216+ return True
3217+ except Exception:
3218+ pass
3219+ return False
3220+
3221+ @staticmethod
3222+ def _report_agent_cgroups_procs(agent_cgroup_proc_names, unexpected):
3223+ for proc_name in unexpected:
3224+ if 'UNKNOWN' in proc_name:
3225+ msg = "Agent includes following processes when UNKNOWN process found: {0}".format("\n".join([ustr(proc) for proc in agent_cgroup_proc_names]))
3226+ add_event(op=WALAEventOperation.CGroupsInfo, message=msg)
3227
3228- self._invoke_cgroup_operation(__impl, "Failed to create a root cgroup for extensions; resource usage for extensions will not be tracked.")
3229+ @staticmethod
3230+ def _check_agent_throttled_time(cgroup_metrics):
3231+ for metric in cgroup_metrics:
3232+ if metric.instance == AGENT_NAME_TELEMETRY and metric.counter == MetricsCounter.THROTTLED_TIME:
3233+ if metric.value > conf.get_agent_cpu_throttled_time_threshold():
3234+ raise CGroupsException("The agent has been throttled for {0} seconds".format(metric.value))
3235
3236- def create_extension_cgroups(self, name):
3237+ def check_agent_memory_usage(self):
3238+ if self.enabled() and self._agent_memory_cgroup:
3239+ metrics = self._agent_memory_cgroup.get_tracked_metrics()
3240+ current_usage = 0
3241+ for metric in metrics:
3242+ if metric.counter == MetricsCounter.TOTAL_MEM_USAGE:
3243+ current_usage += metric.value
3244+ elif metric.counter == MetricsCounter.SWAP_MEM_USAGE:
3245+ current_usage += metric.value
3246+
3247+ if current_usage > conf.get_agent_memory_quota():
3248+ raise AgentMemoryExceededException("The agent memory limit {0} bytes exceeded. The current reported usage is {1} bytes.".format(conf.get_agent_memory_quota(), current_usage))
3249+
3250+ @staticmethod
3251+ def _get_parent(pid):
3252 """
3253- Creates and returns the cgroups for the given extension
3254+ Returns the parent of the given process. If the parent cannot be determined returns 0 (which is the PID for the scheduler)
3255 """
3256- def __impl():
3257- return self._cgroups_api.create_extension_cgroups(name)
3258+ try:
3259+ stat = '/proc/{0}/stat'.format(pid)
3260+ if os.path.exists(stat):
3261+ with open(stat, "r") as stat_file:
3262+ return int(stat_file.read().split()[3])
3263+ except Exception:
3264+ pass
3265+ return 0
3266
3267- return self._invoke_cgroup_operation(__impl, "Failed to create a cgroup for extension '{0}'; resource usage will not be tracked.".format(name))
3268+ def start_tracking_unit_cgroups(self, unit_name):
3269+ """
3270+ TODO: Start tracking Memory Cgroups
3271+ """
3272+ try:
3273+ cpu_cgroup_path, memory_cgroup_path = self._cgroups_api.get_unit_cgroup_paths(unit_name)
3274+
3275+ if cpu_cgroup_path is None:
3276+ logger.info("The CPU controller is not mounted; will not track resource usage")
3277+ else:
3278+ CGroupsTelemetry.track_cgroup(CpuCgroup(unit_name, cpu_cgroup_path))
3279+
3280+ if memory_cgroup_path is None:
3281+ logger.info("The Memory controller is not mounted; will not track resource usage")
3282+ else:
3283+ CGroupsTelemetry.track_cgroup(MemoryCgroup(unit_name, memory_cgroup_path))
3284+
3285+ except Exception as exception:
3286+ logger.info("Failed to start tracking resource usage for the extension: {0}", ustr(exception))
3287
3288- def remove_extension_cgroups(self, name):
3289+ def stop_tracking_unit_cgroups(self, unit_name):
3290 """
3291- Deletes the cgroup for the given extension
3292+ TODO: remove Memory cgroups from tracked list.
3293 """
3294- def __impl():
3295- cgroups = self._cgroups_api.remove_extension_cgroups(name)
3296- return cgroups
3297+ try:
3298+ cpu_cgroup_path, memory_cgroup_path = self._cgroups_api.get_unit_cgroup_paths(unit_name)
3299+
3300+ if cpu_cgroup_path is not None:
3301+ CGroupsTelemetry.stop_tracking(CpuCgroup(unit_name, cpu_cgroup_path))
3302+
3303+ if memory_cgroup_path is not None:
3304+ CGroupsTelemetry.stop_tracking(MemoryCgroup(unit_name, memory_cgroup_path))
3305
3306- self._invoke_cgroup_operation(__impl, "Failed to delete cgroups for extension '{0}'.".format(name))
3307+ except Exception as exception:
3308+ logger.info("Failed to stop tracking resource usage for the extension service: {0}", ustr(exception))
3309
3310- def start_extension_command(self, extension_name, command, timeout, shell, cwd, env, stdout, stderr,
3311+ def stop_tracking_extension_cgroups(self, extension_name):
3312+ """
3313+ TODO: remove extension Memory cgroups from tracked list
3314+ """
3315+ try:
3316+ extension_slice_name = SystemdCgroupsApi.get_extension_slice_name(extension_name)
3317+ cgroup_relative_path = os.path.join(_AZURE_VMEXTENSIONS_SLICE,
3318+ extension_slice_name)
3319+
3320+ cpu_cgroup_mountpoint, memory_cgroup_mountpoint = self._cgroups_api.get_cgroup_mount_points()
3321+ cpu_cgroup_path = os.path.join(cpu_cgroup_mountpoint, cgroup_relative_path)
3322+ memory_cgroup_path = os.path.join(memory_cgroup_mountpoint, cgroup_relative_path)
3323+
3324+ if cpu_cgroup_path is not None:
3325+ CGroupsTelemetry.stop_tracking(CpuCgroup(extension_name, cpu_cgroup_path))
3326+
3327+ if memory_cgroup_path is not None:
3328+ CGroupsTelemetry.stop_tracking(MemoryCgroup(extension_name, memory_cgroup_path))
3329+
3330+ except Exception as exception:
3331+ logger.info("Failed to stop tracking resource usage for the extension service: {0}", ustr(exception))
3332+
3333+ def start_extension_command(self, extension_name, command, cmd_name, timeout, shell, cwd, env, stdout, stderr,
3334 error_code=ExtensionErrorCodes.PluginUnknownFailure):
3335 """
3336 Starts a command (install/enable/etc) for an extension and adds the command's PID to the extension's cgroup
3337 :param extension_name: The extension executing the command
3338 :param command: The command to invoke
3339+ :param cmd_name: The type of the command(enable, install, etc.)
3340 :param timeout: Number of seconds to wait for command completion
3341 :param cwd: The working directory for the command
3342 :param env: The environment to pass to the command's process
3343@@ -172,39 +886,207 @@ class CGroupConfigurator(object):
3344 :param stderr: File object to redirect stderr to
3345 :param error_code: Extension error code to raise in case of error
3346 """
3347- if not self.enabled():
3348- process = subprocess.Popen(command,
3349- shell=shell,
3350- cwd=cwd,
3351- env=env,
3352- stdout=stdout,
3353- stderr=stderr,
3354- preexec_fn=os.setsid)
3355-
3356- process_output = handle_process_completion(process=process,
3357- command=command,
3358- timeout=timeout,
3359- stdout=stdout,
3360- stderr=stderr,
3361- error_code=error_code)
3362- else:
3363- extension_cgroups, process_output = self._cgroups_api.start_extension_command(extension_name,
3364- command,
3365- timeout,
3366- shell=shell,
3367- cwd=cwd,
3368- env=env,
3369- stdout=stdout,
3370- stderr=stderr,
3371- error_code=error_code)
3372-
3373- return process_output
3374-
3375- # unique instance for the singleton (TODO: find a better pattern for a singleton)
3376+ if self.enabled():
3377+ try:
3378+ return self._cgroups_api.start_extension_command(extension_name, command, cmd_name, timeout,
3379+ shell=shell, cwd=cwd, env=env, stdout=stdout,
3380+ stderr=stderr, error_code=error_code)
3381+ except SystemdRunError as exception:
3382+ reason = 'Failed to start {0} using systemd-run, will try invoking the extension directly. Error: {1}'.format(
3383+ extension_name, ustr(exception))
3384+ self.disable(reason, DisableCgroups.ALL)
3385+ # fall-through and re-invoke the extension
3386+
3387+ # subprocess-popen-preexec-fn<W1509> Disabled: code is not multi-threaded
3388+ process = subprocess.Popen(command, shell=shell, cwd=cwd, env=env, stdout=stdout, stderr=stderr, preexec_fn=os.setsid) # pylint: disable=W1509
3389+ return handle_process_completion(process=process, command=command, timeout=timeout, stdout=stdout, stderr=stderr, error_code=error_code)
3390+
3391+ def __reset_extension_cpu_quota(self, extension_name):
3392+ """
3393+ Removes any CPUQuota on the extension
3394+
3395+ NOTE: This resets the quota on the extension's slice; any local overrides on the VM will take precedence
3396+ over this setting.
3397+ """
3398+ if self.enabled():
3399+ self.setup_extension_slice(extension_name, cpu_quota=None)
3400+
3401+ def setup_extension_slice(self, extension_name, cpu_quota):
3402+ """
3403+ Each extension runs under its own slice (Ex "Microsoft.CPlat.Extension.slice"). All the slices for
3404+ extensions are grouped under "azure-vmextensions.slice".
3405+
3406+ This method ensures that the extension slice is created. Setup should create it
3407+ under /lib/systemd/system if it does not exist.
3408+ TODO: set memory quotas
3409+ """
3410+ if self.enabled():
3411+ unit_file_install_path = systemd.get_unit_file_install_path()
3412+ extension_slice_path = os.path.join(unit_file_install_path,
3413+ SystemdCgroupsApi.get_extension_slice_name(extension_name))
3414+ try:
3415+ cpu_quota = str(cpu_quota) + "%" if cpu_quota is not None else "" # setting an empty value resets to the default (infinity)
3416+ if cpu_quota == "":
3417+ _log_cgroup_info("CPUQuota not set for {0}", extension_name)
3418+ else:
3419+ _log_cgroup_info("Ensuring the {0}'s CPUQuota is {1}", extension_name, cpu_quota)
3420+ slice_contents = _EXTENSION_SLICE_CONTENTS.format(extension_name=extension_name,
3421+ cpu_quota=cpu_quota)
3422+ CGroupConfigurator._Impl.__create_unit_file(extension_slice_path, slice_contents)
3423+ except Exception as exception:
3424+ _log_cgroup_warning("Failed to set the extension {0} slice and quotas: {1}", extension_name,
3425+ ustr(exception))
3426+ CGroupConfigurator._Impl.__cleanup_unit_file(extension_slice_path)
3427+
3428+ def remove_extension_slice(self, extension_name):
3429+ """
3430+ This method ensures that the extension slice gets removed from /lib/systemd/system if it exists.
3431+ Lastly, stop the unit. This ensures the cleanup of the /sys/fs/cgroup controller paths.
3432+ """
3433+ if self.enabled():
3434+ unit_file_install_path = systemd.get_unit_file_install_path()
3435+ extension_slice_name = SystemdCgroupsApi.get_extension_slice_name(extension_name)
3436+ extension_slice_path = os.path.join(unit_file_install_path, extension_slice_name)
3437+ if os.path.exists(extension_slice_path):
3438+ self.stop_tracking_extension_cgroups(extension_name)
3439+ CGroupConfigurator._Impl.__cleanup_unit_file(extension_slice_path)
3440+
3441+ def set_extension_services_cpu_memory_quota(self, services_list):
3442+ """
3443+ Each extension service will have a name, systemd path and its quotas.
3444+ This method ensures that drop-in files are created under service.d folder if quotas given.
3445+ ex: /lib/systemd/system/extension.service.d/11-CPUAccounting.conf
3446+ TODO: set memory quotas
3447+ """
3448+ if self.enabled() and services_list is not None:
3449+ for service in services_list:
3450+ service_name = service.get('name', None)
3451+ unit_file_path = systemd.get_unit_file_install_path()
3452+ if service_name is not None and unit_file_path is not None:
3453+ files_to_create = []
3454+ drop_in_path = os.path.join(unit_file_path, "{0}.d".format(service_name))
3455+ drop_in_file_cpu_accounting = os.path.join(drop_in_path,
3456+ _DROP_IN_FILE_CPU_ACCOUNTING)
3457+ files_to_create.append((drop_in_file_cpu_accounting, _DROP_IN_FILE_CPU_ACCOUNTING_CONTENTS))
3458+ drop_in_file_memory_accounting = os.path.join(drop_in_path,
3459+ _DROP_IN_FILE_MEMORY_ACCOUNTING)
3460+ files_to_create.append(
3461+ (drop_in_file_memory_accounting, _DROP_IN_FILE_MEMORY_ACCOUNTING_CONTENTS))
3462+
3463+ cpu_quota = service.get('cpuQuotaPercentage', None)
3464+ if cpu_quota is not None:
3465+ cpu_quota = str(cpu_quota) + "%"
3466+ _log_cgroup_info("Ensuring the {0}'s CPUQuota is {1}", service_name, cpu_quota)
3467+ drop_in_file_cpu_quota = os.path.join(drop_in_path, _DROP_IN_FILE_CPU_QUOTA)
3468+ cpu_quota_contents = _DROP_IN_FILE_CPU_QUOTA_CONTENTS_FORMAT.format(cpu_quota)
3469+ files_to_create.append((drop_in_file_cpu_quota, cpu_quota_contents))
3470+
3471+ self.__create_all_files(files_to_create)
3472+ self.__reload_systemd_config()
3473+
3474+ def __reset_extension_services_cpu_quota(self, services_list):
3475+ """
3476+ Removes any CPUQuota on the extension service
3477+
3478+ NOTE: This resets the quota on the extension service's default dropin file; any local overrides on the VM will take precedence
3479+ over this setting.
3480+ """
3481+ if self.enabled() and services_list is not None:
3482+ service_name = None
3483+ try:
3484+ for service in services_list:
3485+ service_name = service.get('name', None)
3486+ unit_file_path = systemd.get_unit_file_install_path()
3487+ if service_name is not None and unit_file_path is not None:
3488+ files_to_create = []
3489+ drop_in_path = os.path.join(unit_file_path, "{0}.d".format(service_name))
3490+ cpu_quota = "" # setting an empty value resets to the default (infinity)
3491+ drop_in_file_cpu_quota = os.path.join(drop_in_path, _DROP_IN_FILE_CPU_QUOTA)
3492+ cpu_quota_contents = _DROP_IN_FILE_CPU_QUOTA_CONTENTS_FORMAT.format(cpu_quota)
3493+ if os.path.exists(drop_in_file_cpu_quota):
3494+ with open(drop_in_file_cpu_quota, "r") as file_:
3495+ if file_.read() == cpu_quota_contents:
3496+ return
3497+ files_to_create.append((drop_in_file_cpu_quota, cpu_quota_contents))
3498+ self.__create_all_files(files_to_create)
3499+ except Exception as exception:
3500+ _log_cgroup_warning('Failed to reset CPUQuota for {0} : {1}', service_name, ustr(exception))
3501+
3502+ def remove_extension_services_drop_in_files(self, services_list):
3503+ """
3504+ Remove the dropin files from service .d folder for the given service
3505+ """
3506+ if services_list is not None:
3507+ for service in services_list:
3508+ service_name = service.get('name', None)
3509+ unit_file_path = systemd.get_unit_file_install_path()
3510+ if service_name is not None and unit_file_path is not None:
3511+ files_to_cleanup = []
3512+ drop_in_path = os.path.join(unit_file_path, "{0}.d".format(service_name))
3513+ drop_in_file_cpu_accounting = os.path.join(drop_in_path,
3514+ _DROP_IN_FILE_CPU_ACCOUNTING)
3515+ files_to_cleanup.append(drop_in_file_cpu_accounting)
3516+ drop_in_file_memory_accounting = os.path.join(drop_in_path,
3517+ _DROP_IN_FILE_MEMORY_ACCOUNTING)
3518+ files_to_cleanup.append(drop_in_file_memory_accounting)
3519+ cpu_quota = service.get('cpuQuotaPercentage', None)
3520+ if cpu_quota is not None:
3521+ drop_in_file_cpu_quota = os.path.join(drop_in_path, _DROP_IN_FILE_CPU_QUOTA)
3522+ files_to_cleanup.append(drop_in_file_cpu_quota)
3523+
3524+ CGroupConfigurator._Impl.__cleanup_all_files(files_to_cleanup)
3525+ _log_cgroup_info("Drop in files removed for {0}".format(service_name))
3526+
3527+ def stop_tracking_extension_services_cgroups(self, services_list):
3528+ """
3529+ Remove the cgroup entry from the tracked groups to stop tracking.
3530+ """
3531+ if self.enabled() and services_list is not None:
3532+ for service in services_list:
3533+ service_name = service.get('name', None)
3534+ if service_name is not None:
3535+ self.stop_tracking_unit_cgroups(service_name)
3536+
3537+ def start_tracking_extension_services_cgroups(self, services_list):
3538+ """
3539+ Add the cgroup entry to start tracking the services cgroups.
3540+ """
3541+ if self.enabled() and services_list is not None:
3542+ for service in services_list:
3543+ service_name = service.get('name', None)
3544+ if service_name is not None:
3545+ self.start_tracking_unit_cgroups(service_name)
3546+
3547+ @staticmethod
3548+ def get_extension_services_list():
3549+ """
3550+ ResourceLimits for extensions are coming from <extName>/HandlerManifest.json file.
3551+ Use this pattern to determine all the installed extension HandlerManifest files and
3552+ read the extension services if ResourceLimits are present.
3553+ """
3554+ extensions_services = {}
3555+ for manifest_path in glob.iglob(os.path.join(conf.get_lib_dir(), "*/HandlerManifest.json")):
3556+ match = re.search("(?P<extname>[\\w+\\.-]+).HandlerManifest\\.json", manifest_path)
3557+ if match is not None:
3558+ extensions_name = match.group('extname')
3559+ if not extensions_name.startswith('WALinuxAgent'):
3560+ try:
3561+ data = json.loads(fileutil.read_file(manifest_path))
3562+ resource_limits = data[0].get('resourceLimits', None)
3563+ services = resource_limits.get('services') if resource_limits else None
3564+ extensions_services[extensions_name] = services
3565+ except (IOError, OSError) as e:
3566+ _log_cgroup_warning(
3567+ 'Failed to load manifest file ({0}): {1}'.format(manifest_path, e.strerror))
3568+ except ValueError:
3569+ _log_cgroup_warning('Malformed manifest file ({0}).'.format(manifest_path))
3570+ return extensions_services
3571+
3572+ # unique instance for the singleton
3573 _instance = None
3574
3575 @staticmethod
3576 def get_instance():
3577 if CGroupConfigurator._instance is None:
3578- CGroupConfigurator._instance = CGroupConfigurator.__impl()
3579+ CGroupConfigurator._instance = CGroupConfigurator._Impl()
3580 return CGroupConfigurator._instance
3581diff --git a/azurelinuxagent/common/cgroupstelemetry.py b/azurelinuxagent/common/cgroupstelemetry.py
3582index 4bbcba1..7b6bba0 100644
3583--- a/azurelinuxagent/common/cgroupstelemetry.py
3584+++ b/azurelinuxagent/common/cgroupstelemetry.py
3585@@ -15,101 +15,26 @@
3586 # Requires Python 2.6+ and Openssl 1.0+
3587 import errno
3588 import threading
3589-from collections import namedtuple
3590-from datetime import datetime as dt
3591
3592 from azurelinuxagent.common import logger
3593-from azurelinuxagent.common.cgroup import CpuCgroup, CGroupContollers
3594-from azurelinuxagent.common.exception import CGroupsException
3595+from azurelinuxagent.common.cgroup import CpuCgroup
3596 from azurelinuxagent.common.future import ustr
3597-from azurelinuxagent.common.logger import EVERY_SIX_HOURS
3598-from azurelinuxagent.common.resourceusage import MemoryResourceUsage, ProcessInfo
3599-
3600-MetricValue = namedtuple('Metric', ['category', 'counter', 'instance', 'value'])
3601-StatmMetricValue = namedtuple('StatmMetricValue', ['pid_name_cmdline', 'resource_metric'])
3602-
3603-DELIM = " | "
3604-DEFAULT_PROCESS_NAME = "NO_PROCESS_FOUND"
3605-DEFAULT_PROCESS_COMMANDLINE = "NO_CMDLINE_FOUND"
3606-
3607-
3608-class MetricsCategory(object):
3609- MEMORY_CATEGORY = "Memory"
3610- PROCESS_CATEGORY = "Process"
3611-
3612-
3613-class MetricsCounter(object):
3614- PROCESSOR_PERCENT_TIME = "% Processor Time"
3615- TOTAL_MEM_USAGE = "Total Memory Usage"
3616- MAX_MEM_USAGE = "Max Memory Usage"
3617- MEM_USED_BY_PROCESS = "Memory Used by Process"
3618
3619
3620 class CGroupsTelemetry(object):
3621 """
3622 """
3623- _tracked = []
3624- _cgroup_metrics = {}
3625+ _tracked = {}
3626+ _track_throttled_time = False
3627 _rlock = threading.RLock()
3628
3629 @staticmethod
3630- def get_process_info_summary(process_id):
3631- process_cmdline = DEFAULT_PROCESS_COMMANDLINE
3632- process_name = DEFAULT_PROCESS_NAME
3633-
3634- # The ProcessName and ProcessCommandLine can generate Exception if the file /proc/<pid>/{comm,cmdline} cease to
3635- # exist; eg: the process can die, or finish. Which is why we need Default Names, in case we fail to fetch the
3636- # details from those files.
3637- try:
3638- process_cmdline = ProcessInfo.get_proc_cmdline(process_id) if not None else DEFAULT_PROCESS_COMMANDLINE
3639- except Exception as e:
3640- logger.periodic_info(EVERY_SIX_HOURS, "[PERIODIC] {0}", ustr(e))
3641-
3642- try:
3643- process_name = ProcessInfo.get_proc_name(process_id) if not None else DEFAULT_PROCESS_NAME
3644- except Exception as e:
3645- logger.periodic_info(EVERY_SIX_HOURS, "[PERIODIC] {0}", ustr(e))
3646-
3647- return process_id + DELIM + process_name + DELIM + process_cmdline
3648+ def set_track_throttled_time(value):
3649+ CGroupsTelemetry._track_throttled_time = value
3650
3651 @staticmethod
3652- def _get_metrics_list(metric):
3653- return [metric.average(), metric.min(), metric.max(), metric.median(), metric.count(),
3654- metric.first_poll_time(), metric.last_poll_time()]
3655-
3656- @staticmethod
3657- def _process_cgroup_metric(cgroup_metrics):
3658- memory_usage = cgroup_metrics.get_memory_metrics()
3659- max_memory_usage = cgroup_metrics.get_max_memory_metrics()
3660- cpu_usage = cgroup_metrics.get_cpu_metrics()
3661- memory_usage_per_process = cgroup_metrics.get_proc_statm_memory_metrics()
3662-
3663- processed_extension = {}
3664-
3665- if cpu_usage.count() > 0:
3666- processed_extension["cpu"] = {"cur_cpu": CGroupsTelemetry._get_metrics_list(cpu_usage)}
3667-
3668- if memory_usage.count() > 0:
3669- if "memory" in processed_extension:
3670- processed_extension["memory"]["cur_mem"] = CGroupsTelemetry._get_metrics_list(memory_usage)
3671- else:
3672- processed_extension["memory"] = {"cur_mem": CGroupsTelemetry._get_metrics_list(memory_usage)}
3673-
3674- if max_memory_usage.count() > 0:
3675- if "memory" in processed_extension:
3676- processed_extension["memory"]["max_mem"] = CGroupsTelemetry._get_metrics_list(max_memory_usage)
3677- else:
3678- processed_extension["memory"] = {"max_mem": CGroupsTelemetry._get_metrics_list(max_memory_usage)}
3679-
3680- for pid_process_memory in memory_usage_per_process:
3681- if "proc_statm_memory" in processed_extension:
3682- processed_extension["proc_statm_memory"][pid_process_memory.pid_name_cmdline] = \
3683- CGroupsTelemetry._get_metrics_list(pid_process_memory.resource_metric)
3684- else:
3685- processed_extension["proc_statm_memory"] = {pid_process_memory.pid_name_cmdline:
3686- CGroupsTelemetry._get_metrics_list(pid_process_memory.resource_metric)}
3687-
3688- return processed_extension
3689+ def get_track_throttled_time():
3690+ return CGroupsTelemetry._track_throttled_time
3691
3692 @staticmethod
3693 def track_cgroup(cgroup):
3694@@ -122,221 +47,56 @@ class CGroupsTelemetry(object):
3695
3696 with CGroupsTelemetry._rlock:
3697 if not CGroupsTelemetry.is_tracked(cgroup.path):
3698- CGroupsTelemetry._tracked.append(cgroup)
3699- logger.info("Started tracking new cgroup: {0}, path: {1}".format(cgroup.name, cgroup.path))
3700+ CGroupsTelemetry._tracked[cgroup.path] = cgroup
3701+ logger.info("Started tracking cgroup {0}", cgroup)
3702
3703 @staticmethod
3704 def is_tracked(path):
3705 """
3706 Returns true if the given item is in the list of tracked items
3707- O(n) operation. But limited to few cgroup objects we have.
3708+ O(1) operation.
3709 """
3710 with CGroupsTelemetry._rlock:
3711- for cgroup in CGroupsTelemetry._tracked:
3712- if path == cgroup.path:
3713- return True
3714+ if path in CGroupsTelemetry._tracked:
3715+ return True
3716
3717 return False
3718
3719 @staticmethod
3720 def stop_tracking(cgroup):
3721 """
3722- Stop tracking the cgroups for the given name
3723+ Stop tracking the cgroups for the given path
3724 """
3725 with CGroupsTelemetry._rlock:
3726- CGroupsTelemetry._tracked.remove(cgroup)
3727- logger.info("Stopped tracking cgroup: {0}, path: {1}".format(cgroup.name, cgroup.path))
3728-
3729- @staticmethod
3730- def report_all_tracked():
3731- """
3732- The report_all_tracked's purpose is to collect the data from the tracked cgroups and process the metric into a
3733- data structure by _process_cgroup_metric. The perf metric is added into the data structure and returned to the
3734- caller.
3735-
3736- The report_all_tracked would be removed soon - in favor of sending report_metric directly, when polling the data
3737- from tracked groups.
3738-
3739- :return collected_metrics: dictionary of cgroups metrics.
3740- """
3741- collected_metrics = {}
3742-
3743- for name, cgroup_metrics in CGroupsTelemetry._cgroup_metrics.items():
3744- perf_metric = CGroupsTelemetry._process_cgroup_metric(cgroup_metrics)
3745-
3746- if perf_metric:
3747- collected_metrics[name] = perf_metric
3748-
3749- cgroup_metrics.clear()
3750-
3751- # Doing cleanup after the metrics have already been collected.
3752- for key in [key for key in CGroupsTelemetry._cgroup_metrics if
3753- CGroupsTelemetry._cgroup_metrics[key].marked_for_delete]:
3754- del CGroupsTelemetry._cgroup_metrics[key]
3755-
3756- return collected_metrics
3757+ if cgroup.path in CGroupsTelemetry._tracked:
3758+ CGroupsTelemetry._tracked.pop(cgroup.path)
3759+ logger.info("Stopped tracking cgroup {0}", cgroup)
3760
3761 @staticmethod
3762 def poll_all_tracked():
3763 metrics = []
3764-
3765+ inactive_cgroups = []
3766 with CGroupsTelemetry._rlock:
3767- for cgroup in CGroupsTelemetry._tracked[:]:
3768- if cgroup.name not in CGroupsTelemetry._cgroup_metrics:
3769- CGroupsTelemetry._cgroup_metrics[cgroup.name] = CgroupMetrics()
3770+ for cgroup in CGroupsTelemetry._tracked.values():
3771 try:
3772- if cgroup.controller == CGroupContollers.CPU:
3773- current_cpu_usage = cgroup.get_cpu_usage()
3774- CGroupsTelemetry._cgroup_metrics[cgroup.name].add_cpu_usage(current_cpu_usage)
3775- metrics.append(MetricValue(MetricsCategory.PROCESS_CATEGORY, MetricsCounter.
3776- PROCESSOR_PERCENT_TIME, cgroup.name, current_cpu_usage))
3777- elif cgroup.controller == CGroupContollers.MEMORY:
3778- current_memory_usage = cgroup.get_memory_usage()
3779- CGroupsTelemetry._cgroup_metrics[cgroup.name].add_memory_usage(current_memory_usage)
3780- metrics.append(MetricValue(MetricsCategory.MEMORY_CATEGORY, MetricsCounter.
3781- TOTAL_MEM_USAGE, cgroup.name, current_memory_usage))
3782-
3783- max_memory_usage = cgroup.get_max_memory_usage()
3784- CGroupsTelemetry._cgroup_metrics[cgroup.name].add_max_memory_usage(max_memory_usage)
3785- metrics.append(MetricValue(MetricsCategory.MEMORY_CATEGORY, MetricsCounter.MAX_MEM_USAGE,
3786- cgroup.name, max_memory_usage))
3787-
3788- pids = cgroup.get_tracked_processes()
3789- for pid in pids:
3790- try:
3791- mem_usage_from_procstatm = MemoryResourceUsage.get_memory_usage_from_proc_statm(pid)
3792- metrics.append(MetricValue(MetricsCategory.MEMORY_CATEGORY, MetricsCounter.
3793- MEM_USED_BY_PROCESS, CGroupsTelemetry.get_process_info_summary(pid),
3794- mem_usage_from_procstatm))
3795- CGroupsTelemetry._cgroup_metrics[cgroup.name].add_proc_statm_memory(
3796- CGroupsTelemetry.get_process_info_summary(pid), mem_usage_from_procstatm)
3797- except Exception as e:
3798- if not isinstance(e, (IOError, OSError)) or e.errno != errno.ENOENT:
3799- logger.periodic_warn(logger.EVERY_HOUR, "[PERIODIC] Could not collect proc_statm "
3800- "for pid {0}. Error : {1}", pid, ustr(e))
3801- else:
3802- raise CGroupsException('CGroup controller {0} is not supported for cgroup {1}'.format(
3803- cgroup.controller, cgroup.name))
3804+ metrics.extend(cgroup.get_tracked_metrics(track_throttled_time=CGroupsTelemetry._track_throttled_time))
3805 except Exception as e:
3806 # There can be scenarios when the CGroup has been deleted by the time we are fetching the values
3807 # from it. This would raise IOError with file entry not found (ERRNO: 2). We do not want to log
3808 # every occurrences of such case as it would be very verbose. We do want to log all the other
3809 # exceptions which could occur, which is why we do a periodic log for all the other errors.
3810- if not isinstance(e, (IOError, OSError)) or e.errno != errno.ENOENT:
3811+ if not isinstance(e, (IOError, OSError)) or e.errno != errno.ENOENT: # pylint: disable=E1101
3812 logger.periodic_warn(logger.EVERY_HOUR, '[PERIODIC] Could not collect metrics for cgroup '
3813 '{0}. Error : {1}'.format(cgroup.name, ustr(e)))
3814 if not cgroup.is_active():
3815- CGroupsTelemetry.stop_tracking(cgroup)
3816- CGroupsTelemetry._cgroup_metrics[cgroup.name].marked_for_delete = True
3817+ inactive_cgroups.append(cgroup)
3818+ for inactive_cgroup in inactive_cgroups:
3819+ CGroupsTelemetry.stop_tracking(inactive_cgroup)
3820
3821 return metrics
3822
3823 @staticmethod
3824- def prune_all_tracked():
3825- with CGroupsTelemetry._rlock:
3826- for cgroup in CGroupsTelemetry._tracked[:]:
3827- if not cgroup.is_active():
3828- CGroupsTelemetry.stop_tracking(cgroup)
3829-
3830- @staticmethod
3831 def reset():
3832 with CGroupsTelemetry._rlock:
3833- CGroupsTelemetry._tracked *= 0 # emptying the list
3834- CGroupsTelemetry._cgroup_metrics = {}
3835-
3836-
3837-class CgroupMetrics(object):
3838- def __init__(self):
3839- self._memory_usage = Metric()
3840- self._max_memory_usage = Metric()
3841- self._cpu_usage = Metric()
3842- self._proc_statm_mem = {}
3843-
3844- self.marked_for_delete = False
3845-
3846- def add_memory_usage(self, usage):
3847- if not self.marked_for_delete:
3848- self._memory_usage.append(usage)
3849-
3850- def add_max_memory_usage(self, usage):
3851- if not self.marked_for_delete:
3852- self._max_memory_usage.append(usage)
3853-
3854- def add_cpu_usage(self, usage):
3855- if not self.marked_for_delete:
3856- self._cpu_usage.append(usage)
3857-
3858- def add_proc_statm_memory(self, pid, usage):
3859- if not self.marked_for_delete:
3860- if pid not in self._proc_statm_mem:
3861- self._proc_statm_mem[pid] = Metric()
3862- self._proc_statm_mem[pid].append(usage)
3863-
3864- def get_memory_metrics(self):
3865- return self._memory_usage
3866-
3867- def get_max_memory_metrics(self):
3868- return self._max_memory_usage
3869-
3870- def get_cpu_metrics(self):
3871- return self._cpu_usage
3872-
3873- def get_proc_statm_memory_metrics(self):
3874- """
3875- :return: StatmMetricValue tuples of pid and metric
3876- """
3877- return [StatmMetricValue(pid_name_cmdline, metric) for pid_name_cmdline, metric in self._proc_statm_mem.items()]
3878-
3879- def clear(self):
3880- self._memory_usage.clear()
3881- self._max_memory_usage.clear()
3882- self._cpu_usage.clear()
3883- self._proc_statm_mem.clear()
3884-
3885-
3886-class Metric(object):
3887- def __init__(self):
3888- self._data = []
3889- self._first_poll_time = None
3890- self._last_poll_time = None
3891-
3892- def append(self, data):
3893- if not self._first_poll_time:
3894- # We only want to do it first time.
3895- self._first_poll_time = dt.utcnow()
3896-
3897- self._data.append(data)
3898- self._last_poll_time = dt.utcnow()
3899-
3900- def clear(self):
3901- self._first_poll_time = None
3902- self._last_poll_time = None
3903- self._data *= 0
3904-
3905- def average(self):
3906- return float(sum(self._data)) / float(len(self._data)) if self._data else None
3907-
3908- def max(self):
3909- return max(self._data) if self._data else None
3910-
3911- def min(self):
3912- return min(self._data) if self._data else None
3913-
3914- def median(self):
3915- data = sorted(self._data)
3916- l_len = len(data)
3917- if l_len < 1:
3918- return None
3919- if l_len % 2 == 0:
3920- return (data[int((l_len - 1) / 2)] + data[int((l_len + 1) / 2)]) / 2.0
3921- else:
3922- return data[int((l_len - 1) / 2)]
3923-
3924- def count(self):
3925- return len(self._data)
3926-
3927- def first_poll_time(self):
3928- return str(self._first_poll_time)
3929-
3930- def last_poll_time(self):
3931- return str(self._last_poll_time)
3932+ CGroupsTelemetry._tracked.clear() # emptying the dictionary
3933+ CGroupsTelemetry._track_throttled_time = False
3934diff --git a/azurelinuxagent/common/conf.py b/azurelinuxagent/common/conf.py
3935index bfc61f0..46765ea 100644
3936--- a/azurelinuxagent/common/conf.py
3937+++ b/azurelinuxagent/common/conf.py
3938@@ -19,11 +19,11 @@
3939
3940 """
3941 Module conf loads and parses configuration file
3942-"""
3943+""" # pylint: disable=W0105
3944 import os
3945 import os.path
3946
3947-import azurelinuxagent.common.utils.fileutil as fileutil
3948+from azurelinuxagent.common.utils.fileutil import read_file #pylint: disable=R0401
3949 from azurelinuxagent.common.exception import AgentConfigError
3950
3951 DISABLE_AGENT_FILE = 'disable_agent'
3952@@ -49,25 +49,43 @@ class ConfigurationProvider(object):
3953 value = parts[1].split('#')[0].strip("\" ").strip()
3954 self.values[key] = value if value != "None" else None
3955
3956- def get(self, key, default_val):
3957+ @staticmethod
3958+ def _get_default(default):
3959+ if hasattr(default, '__call__'):
3960+ return default()
3961+ return default
3962+
3963+ def get(self, key, default_value):
3964+ """
3965+ Retrieves a string parameter by key and returns its value. If not found returns the default value,
3966+ or if the default value is a callable returns the result of invoking the callable.
3967+ """
3968 val = self.values.get(key)
3969- return val if val is not None else default_val
3970+ return val if val is not None else self._get_default(default_value)
3971
3972- def get_switch(self, key, default_val):
3973+ def get_switch(self, key, default_value):
3974+ """
3975+ Retrieves a switch parameter by key and returns its value as a boolean. If not found returns the default value,
3976+ or if the default value is a callable returns the result of invoking the callable.
3977+ """
3978 val = self.values.get(key)
3979 if val is not None and val.lower() == 'y':
3980 return True
3981 elif val is not None and val.lower() == 'n':
3982 return False
3983- return default_val
3984+ return self._get_default(default_value)
3985
3986- def get_int(self, key, default_val):
3987+ def get_int(self, key, default_value):
3988+ """
3989+ Retrieves an int parameter by key and returns its value. If not found returns the default value,
3990+ or if the default value is a callable returns the result of invoking the callable.
3991+ """
3992 try:
3993 return int(self.values.get(key))
3994 except TypeError:
3995- return default_val
3996+ return self._get_default(default_value)
3997 except ValueError:
3998- return default_val
3999+ return self._get_default(default_value)
4000
4001
4002 __conf__ = ConfigurationProvider()
4003@@ -81,7 +99,7 @@ def load_conf_from_file(conf_file_path, conf=__conf__):
4004 raise AgentConfigError(("Missing configuration in {0}"
4005 "").format(conf_file_path))
4006 try:
4007- content = fileutil.read_file(conf_file_path)
4008+ content = read_file(conf_file_path)
4009 conf.load(content)
4010 except IOError as err:
4011 raise AgentConfigError(("Failed to load conf file:{0}, {1}"
4012@@ -97,6 +115,7 @@ __SWITCH_OPTIONS__ = {
4013 "OS.CheckRdmaDriver": False,
4014 "Logs.Verbose": False,
4015 "Logs.Console": True,
4016+ "Logs.Collect": True,
4017 "Extensions.Enabled": True,
4018 "Provisioning.AllowResetSysUser": False,
4019 "Provisioning.RegenerateSshHostKeyPair": False,
4020@@ -110,7 +129,16 @@ __SWITCH_OPTIONS__ = {
4021 "ResourceDisk.EnableSwapEncryption": False,
4022 "AutoUpdate.Enabled": True,
4023 "EnableOverProvisioning": True,
4024- "CGroups.EnforceLimits": False,
4025+ #
4026+ # "Debug" options are experimental and may be removed in later
4027+ # versions of the Agent.
4028+ #
4029+ "Debug.CgroupLogMetrics": False,
4030+ "Debug.CgroupDisableOnProcessCheckFailure": True,
4031+ "Debug.CgroupDisableOnQuotaCheckFailure": True,
4032+ "Debug.EnableAgentMemoryUsageCheck": False,
4033+ "Debug.EnableFastTrack": True,
4034+ "Debug.EnableGAVersioning": False
4035 }
4036
4037
4038@@ -133,16 +161,37 @@ __STRING_OPTIONS__ = {
4039 "ResourceDisk.MountOptions": None,
4040 "ResourceDisk.Filesystem": "ext3",
4041 "AutoUpdate.GAFamily": "Prod",
4042- "CGroups.Excluded": "customscript,runcommand",
4043+ "Debug.CgroupMonitorExpiryTime": "2022-03-31",
4044+ "Debug.CgroupMonitorExtensionName": "Microsoft.Azure.Monitor.AzureMonitorLinuxAgent",
4045 }
4046
4047
4048 __INTEGER_OPTIONS__ = {
4049+ "Extensions.GoalStatePeriod": 6,
4050+ "Extensions.InitialGoalStatePeriod": 6,
4051+ "OS.EnableFirewallPeriod": 300,
4052+ "OS.RemovePersistentNetRulesPeriod": 30,
4053+ "OS.RootDeviceScsiTimeoutPeriod": 30,
4054+ "OS.MonitorDhcpClientRestartPeriod": 30,
4055 "OS.SshClientAliveInterval": 180,
4056+ "Provisioning.MonitorHostNamePeriod": 30,
4057 "Provisioning.PasswordCryptSaltLength": 10,
4058 "HttpProxy.Port": None,
4059 "ResourceDisk.SwapSizeMB": 0,
4060- "Autoupdate.Frequency": 3600
4061+ "Autoupdate.Frequency": 3600,
4062+ "Logs.CollectPeriod": 3600,
4063+ #
4064+ # "Debug" options are experimental and may be removed in later
4065+ # versions of the Agent.
4066+ #
4067+ "Debug.CgroupCheckPeriod": 300,
4068+ "Debug.AgentCpuQuota": 50,
4069+ "Debug.AgentCpuThrottledTimeThreshold": 120,
4070+ "Debug.AgentMemoryQuota": 30 * 1024 ** 2,
4071+ "Debug.EtpCollectionPeriod": 300,
4072+ "Debug.AutoUpdateHotfixFrequency": 14400,
4073+ "Debug.AutoUpdateNormalFrequency": 86400,
4074+ "Debug.FirewallRulesLogPeriod": 86400
4075 }
4076
4077
4078@@ -160,10 +209,40 @@ def get_configuration(conf=__conf__):
4079 return options
4080
4081
4082+def get_default_value(option):
4083+ if option in __STRING_OPTIONS__:
4084+ return __STRING_OPTIONS__[option]
4085+ raise ValueError("{0} is not a valid configuration parameter.".format(option))
4086+
4087+
4088+def get_int_default_value(option):
4089+ if option in __INTEGER_OPTIONS__:
4090+ return int(__INTEGER_OPTIONS__[option])
4091+ raise ValueError("{0} is not a valid configuration parameter.".format(option))
4092+
4093+
4094+def get_switch_default_value(option):
4095+ if option in __SWITCH_OPTIONS__:
4096+ return __SWITCH_OPTIONS__[option]
4097+ raise ValueError("{0} is not a valid configuration parameter.".format(option))
4098+
4099+
4100 def enable_firewall(conf=__conf__):
4101 return conf.get_switch("OS.EnableFirewall", False)
4102
4103
4104+def get_enable_firewall_period(conf=__conf__):
4105+ return conf.get_int("OS.EnableFirewallPeriod", 300)
4106+
4107+
4108+def get_remove_persistent_net_rules_period(conf=__conf__):
4109+ return conf.get_int("OS.RemovePersistentNetRulesPeriod", 30)
4110+
4111+
4112+def get_monitor_dhcp_client_restart_period(conf=__conf__):
4113+ return conf.get_int("OS.MonitorDhcpClientRestartPeriod", 30)
4114+
4115+
4116 def enable_rdma(conf=__conf__):
4117 return conf.get_switch("OS.EnableRDMA", False) or \
4118 conf.get_switch("OS.UpdateRdmaDriver", False) or \
4119@@ -186,11 +265,20 @@ def get_logs_console(conf=__conf__):
4120 return conf.get_switch("Logs.Console", True)
4121
4122
4123+def get_collect_logs(conf=__conf__):
4124+ return conf.get_switch("Logs.Collect", True)
4125+
4126+
4127+def get_collect_logs_period(conf=__conf__):
4128+ return conf.get_int("Logs.CollectPeriod", 3600)
4129+
4130+
4131 def get_lib_dir(conf=__conf__):
4132 return conf.get("Lib.Dir", "/var/lib/waagent")
4133
4134
4135 def get_published_hostname(conf=__conf__):
4136+ # Some applications rely on this file; do not remove this setting
4137 return os.path.join(get_lib_dir(conf), 'published_hostname')
4138
4139
4140@@ -206,6 +294,10 @@ def get_ext_log_dir(conf=__conf__):
4141 return conf.get("Extension.LogDir", "/var/log/azure")
4142
4143
4144+def get_agent_log_file():
4145+ return "/var/log/waagent.log"
4146+
4147+
4148 def get_fips_enabled(conf=__conf__):
4149 return conf.get_switch("OS.EnableFIPS", False)
4150
4151@@ -244,18 +336,22 @@ def get_ssh_key_glob(conf=__conf__):
4152
4153 def get_ssh_key_private_path(conf=__conf__):
4154 return os.path.join(get_ssh_dir(conf),
4155- 'ssh_host_{0}_key'.format(get_ssh_host_keypair_type(conf)))
4156+ 'ssh_host_{0}_key'.format(get_ssh_host_keypair_type(conf)))
4157
4158
4159 def get_ssh_key_public_path(conf=__conf__):
4160 return os.path.join(get_ssh_dir(conf),
4161- 'ssh_host_{0}_key.pub'.format(get_ssh_host_keypair_type(conf)))
4162+ 'ssh_host_{0}_key.pub'.format(get_ssh_host_keypair_type(conf)))
4163
4164
4165 def get_root_device_scsi_timeout(conf=__conf__):
4166 return conf.get("OS.RootDeviceScsiTimeout", None)
4167
4168
4169+def get_root_device_scsi_timeout_period(conf=__conf__):
4170+ return conf.get_int("OS.RootDeviceScsiTimeoutPeriod", 30)
4171+
4172+
4173 def get_ssh_host_keypair_type(conf=__conf__):
4174 keypair_type = conf.get("Provisioning.SshHostKeyPairType", "rsa")
4175 if keypair_type == "auto":
4176@@ -275,6 +371,14 @@ def get_extensions_enabled(conf=__conf__):
4177 return conf.get_switch("Extensions.Enabled", True)
4178
4179
4180+def get_goal_state_period(conf=__conf__):
4181+ return conf.get_int("Extensions.GoalStatePeriod", 6)
4182+
4183+
4184+def get_initial_goal_state_period(conf=__conf__):
4185+ return conf.get_int("Extensions.InitialGoalStatePeriod", default_value=lambda: get_goal_state_period(conf=conf))
4186+
4187+
4188 def get_allow_reset_sys_user(conf=__conf__):
4189 return conf.get_switch("Provisioning.AllowResetSysUser", False)
4190
4191@@ -322,6 +426,10 @@ def get_monitor_hostname(conf=__conf__):
4192 return conf.get_switch("Provisioning.MonitorHostName", False)
4193
4194
4195+def get_monitor_hostname_period(conf=__conf__):
4196+ return conf.get_int("Provisioning.MonitorHostNamePeriod", 30)
4197+
4198+
4199 def get_httpproxy_host(conf=__conf__):
4200 return conf.get("HttpProxy.Host", None)
4201
4202@@ -340,10 +448,12 @@ def get_resourcedisk_format(conf=__conf__):
4203
4204 def get_resourcedisk_enable_swap(conf=__conf__):
4205 return conf.get_switch("ResourceDisk.EnableSwap", False)
4206-
4207+
4208+
4209 def get_resourcedisk_enable_swap_encryption(conf=__conf__):
4210 return conf.get_switch("ResourceDisk.EnableSwapEncryption", False)
4211
4212+
4213 def get_resourcedisk_mountpoint(conf=__conf__):
4214 return conf.get("ResourceDisk.MountPoint", "/mnt/resource")
4215
4216@@ -384,10 +494,151 @@ def get_disable_agent_file_path(conf=__conf__):
4217 return os.path.join(get_lib_dir(conf), DISABLE_AGENT_FILE)
4218
4219
4220-def get_cgroups_enforce_limits(conf=__conf__):
4221- return conf.get_switch("CGroups.EnforceLimits", False)
4222+def get_cgroups_enabled(conf=__conf__):
4223+ return conf.get_switch("CGroups.Enabled", True)
4224+
4225+
4226+def get_monitor_network_configuration_changes(conf=__conf__):
4227+ return conf.get_switch("Monitor.NetworkConfigurationChanges", False)
4228+
4229+
4230+def get_cgroup_check_period(conf=__conf__):
4231+ """
4232+ How often to perform checks on cgroups (are the processes in the cgroups as expected,
4233+ has the agent exceeded its quota, etc)
4234+
4235+ NOTE: This option is experimental and may be removed in later versions of the Agent.
4236+ """
4237+ return conf.get_int("Debug.CgroupCheckPeriod", 300)
4238+
4239
4240+def get_cgroup_log_metrics(conf=__conf__):
4241+ """
4242+ If True, resource usage metrics are written to the local log
4243+
4244+ NOTE: This option is experimental and may be removed in later versions of the Agent.
4245+ """
4246+ return conf.get_switch("Debug.CgroupLogMetrics", False)
4247+
4248+
4249+def get_cgroup_disable_on_process_check_failure(conf=__conf__):
4250+ """
4251+ If True, cgroups will be disabled if the process check fails
4252+
4253+ NOTE: This option is experimental and may be removed in later versions of the Agent.
4254+ """
4255+ return conf.get_switch("Debug.CgroupDisableOnProcessCheckFailure", True)
4256+
4257+
4258+def get_cgroup_disable_on_quota_check_failure(conf=__conf__):
4259+ """
4260+ If True, cgroups will be disabled if the CPU quota check fails
4261+
4262+ NOTE: This option is experimental and may be removed in later versions of the Agent.
4263+ """
4264+ return conf.get_switch("Debug.CgroupDisableOnQuotaCheckFailure", True)
4265+
4266+
4267+def get_agent_cpu_quota(conf=__conf__):
4268+ """
4269+ CPU quota for the agent as a percentage of 1 CPU (100% == 1 CPU)
4270
4271-def get_cgroups_excluded(conf=__conf__):
4272- excluded_value = conf.get("CGroups.Excluded", "customscript, runcommand")
4273- return [s for s in [i.strip().lower() for i in excluded_value.split(',')] if len(s) > 0] if excluded_value else []
4274+ NOTE: This option is experimental and may be removed in later versions of the Agent.
4275+ """
4276+ return conf.get_int("Debug.AgentCpuQuota", 50)
4277+
4278+
4279+def get_agent_cpu_throttled_time_threshold(conf=__conf__):
4280+ """
4281+ Throttled time threshold for agent cpu in seconds.
4282+
4283+ NOTE: This option is experimental and may be removed in later versions of the Agent.
4284+ """
4285+ return conf.get_int("Debug.AgentCpuThrottledTimeThreshold", 120)
4286+
4287+
4288+def get_agent_memory_quota(conf=__conf__):
4289+ """
4290+ Memory quota for the agent in bytes.
4291+
4292+ NOTE: This option is experimental and may be removed in later versions of the Agent.
4293+ """
4294+ return conf.get_int("Debug.AgentMemoryQuota", 30 * 1024 ** 2)
4295+
4296+
4297+def get_enable_agent_memory_usage_check(conf=__conf__):
4298+ """
4299+ If True, the agent checks its memory usage.
4300+
4301+ NOTE: This option is experimental and may be removed in later versions of the Agent.
4302+ """
4303+ return conf.get_switch("Debug.EnableAgentMemoryUsageCheck", False)
4304+
4305+
4306+def get_cgroup_monitor_expiry_time(conf=__conf__):
4307+ """
4308+ cgroups monitoring for pilot extensions disabled after expiry time
4309+
4310+ NOTE: This option is experimental and may be removed in later versions of the Agent.
4311+ """
4312+ return conf.get("Debug.CgroupMonitorExpiryTime", "2022-03-31")
4313+
4314+
4315+def get_cgroup_monitor_extension_name (conf=__conf__):
4316+ """
4317+ cgroups monitoring extension name
4318+
4319+ NOTE: This option is experimental and may be removed in later versions of the Agent.
4320+ """
4321+ return conf.get("Debug.CgroupMonitorExtensionName", "Microsoft.Azure.Monitor.AzureMonitorLinuxAgent")
4322+
4323+
4324+def get_enable_fast_track(conf=__conf__):
4325+ """
4326+ If True, the agent uses FastTrack when retrieving goal states
4327+
4328+ NOTE: This option is experimental and may be removed in later versions of the Agent.
4329+ """
4330+ return conf.get_switch("Debug.EnableFastTrack", True)
4331+
4332+
4333+def get_etp_collection_period(conf=__conf__):
4334+ """
4335+ Determines the frequency to perform ETP collection on extensions telemetry events.
4336+ NOTE: This option is experimental and may be removed in later versions of the Agent.
4337+ """
4338+ return conf.get_int("Debug.EtpCollectionPeriod", 300)
4339+
4340+
4341+def get_hotfix_upgrade_frequency(conf=__conf__):
4342+ """
4343+ Determines the frequency to check for Hotfix upgrades (<Patch>.<Build> version changed in new upgrades).
4344+ NOTE: This option is experimental and may be removed in later versions of the Agent.
4345+ """
4346+ return conf.get_int("Debug.AutoUpdateHotfixFrequency", 4 * 60 * 60)
4347+
4348+
4349+def get_normal_upgrade_frequency(conf=__conf__):
4350+ """
4351+ Determines the frequency to check for Normal upgrades (<Major>.<Minor> version changed in new upgrades).
4352+ NOTE: This option is experimental and may be removed in later versions of the Agent.
4353+ """
4354+ return conf.get_int("Debug.AutoUpdateNormalFrequency", 24 * 60 * 60)
4355+
4356+
4357+def get_enable_ga_versioning(conf=__conf__):
4358+ """
4359+ If True, the agent uses GA Versioning for auto-updating the agent vs automatically auto-updating to the highest version.
4360+
4361+ NOTE: This option is experimental and may be removed in later versions of the Agent.
4362+ """
4363+ return conf.get_switch("Debug.EnableGAVersioning", False)
4364+
4365+
4366+def get_firewall_rules_log_period(conf=__conf__):
4367+ """
4368+ Determine the frequency to perform the periodic operation of logging firewall rules.
4369+
4370+ NOTE: This option is experimental and may be removed in later versions of the Agent.
4371+ """
4372+ return conf.get_int("Debug.FirewallRulesLogPeriod", 86400)
4373diff --git a/azurelinuxagent/common/datacontract.py b/azurelinuxagent/common/datacontract.py
4374index c69bebc..b6d1f3c 100644
4375--- a/azurelinuxagent/common/datacontract.py
4376+++ b/azurelinuxagent/common/datacontract.py
4377@@ -20,9 +20,11 @@
4378 from azurelinuxagent.common.exception import ProtocolError
4379 import azurelinuxagent.common.logger as logger
4380
4381+# pylint: disable=W0105
4382 """
4383 Base class for data contracts between guest and host and utilities to manipulate the properties in those contracts
4384-"""
4385+"""
4386+# pylint: enable=W0105
4387
4388
4389 class DataContract(object):
4390@@ -30,7 +32,7 @@ class DataContract(object):
4391
4392
4393 class DataContractList(list):
4394- def __init__(self, item_cls):
4395+ def __init__(self, item_cls): # pylint: disable=W0231
4396 self.item_cls = item_cls
4397
4398
4399diff --git a/azurelinuxagent/common/dhcp.py b/azurelinuxagent/common/dhcp.py
4400index 5974965..3db58d3 100644
4401--- a/azurelinuxagent/common/dhcp.py
4402+++ b/azurelinuxagent/common/dhcp.py
4403@@ -14,23 +14,23 @@
4404 #
4405 # Requires Python 2.6+ and Openssl 1.0+
4406
4407+import array
4408 import os
4409 import socket
4410-import array
4411 import time
4412+
4413 import azurelinuxagent.common.logger as logger
4414+from azurelinuxagent.common.exception import DhcpError
4415+from azurelinuxagent.common.osutil import get_osutil
4416+from azurelinuxagent.common.utils.restutil import KNOWN_WIRESERVER_IP
4417 from azurelinuxagent.common.utils.textutil import hex_dump, hex_dump2, \
4418 hex_dump3, \
4419 compare_bytes, str_to_ord, \
4420 unpack_big_endian, \
4421 int_to_ip4_addr
4422-from azurelinuxagent.common.exception import DhcpError
4423-from azurelinuxagent.common.osutil import get_osutil
4424-
4425
4426 # the kernel routing table representation of 168.63.129.16
4427 KNOWN_WIRESERVER_IP_ENTRY = '10813FA8'
4428-from azurelinuxagent.common.utils.restutil import KNOWN_WIRESERVER_IP
4429
4430
4431 def get_dhcp_handler():
4432@@ -86,7 +86,7 @@ class DhcpHandler(object):
4433 logger.info("Test for route to {0}".format(KNOWN_WIRESERVER_IP))
4434 try:
4435 route_table = self.osutil.read_route_table()
4436- if any([(KNOWN_WIRESERVER_IP_ENTRY in route) for route in route_table]):
4437+ if any((KNOWN_WIRESERVER_IP_ENTRY in route) for route in route_table):
4438 # reset self.gateway and self.routes
4439 # we do not need to alter the routing table
4440 self.endpoint = KNOWN_WIRESERVER_IP
4441@@ -100,7 +100,7 @@ class DhcpHandler(object):
4442 logger.error(
4443 "Could not determine whether route exists to {0}: {1}".format(
4444 KNOWN_WIRESERVER_IP, e))
4445-
4446+
4447 return route_exists
4448
4449 @property
4450@@ -116,7 +116,7 @@ class DhcpHandler(object):
4451 exists = False
4452
4453 logger.info("Checking for dhcp lease cache")
4454- cached_endpoint = self.osutil.get_dhcp_lease_endpoint()
4455+ cached_endpoint = self.osutil.get_dhcp_lease_endpoint() # pylint: disable=E1128
4456 if cached_endpoint is not None:
4457 self.endpoint = cached_endpoint
4458 exists = True
4459@@ -157,11 +157,14 @@ class DhcpHandler(object):
4460 self.endpoint = KNOWN_WIRESERVER_IP
4461 return
4462
4463+ # pylint: disable=W0105
4464 """
4465 Build dhcp request with mac addr
4466 Configure route to allow dhcp traffic
4467 Stop dhcp service if necessary
4468 """
4469+ # pylint: enable=W0105
4470+
4471 logger.info("Send dhcp request")
4472 mac_addr = self.osutil.get_mac_addr()
4473
4474@@ -194,7 +197,7 @@ class DhcpHandler(object):
4475 self.endpoint, self.gateway, self.routes = parse_dhcp_resp(resp)
4476
4477
4478-def validate_dhcp_resp(request, response):
4479+def validate_dhcp_resp(request, response): # pylint: disable=R1710
4480 bytes_recv = len(response)
4481 if bytes_recv < 0xF6:
4482 logger.error("HandleDhcpResponse: Too few bytes received:{0}",
4483@@ -228,7 +231,7 @@ def validate_dhcp_resp(request, response):
4484 "doesn't match the request")
4485
4486
4487-def parse_route(response, option, i, length, bytes_recv):
4488+def parse_route(response, option, i, length, bytes_recv): # pylint: disable=W0613
4489 # http://msdn.microsoft.com/en-us/library/cc227282%28PROT.10%29.aspx
4490 logger.verbose("Routes at offset: {0} with length:{1}", hex(i),
4491 hex(length))
4492@@ -386,7 +389,7 @@ def build_dhcp_request(mac_addr, request_broadcast):
4493 # set broadcast flag to true to request the dhcp server
4494 # to respond to a boradcast address,
4495 # this is useful when user dhclient fails.
4496- request[0x0A] = 0x80;
4497+ request[0x0A] = 0x80
4498
4499 # fill in ClientHardwareAddress
4500 for a in range(0, 6):
4501diff --git a/azurelinuxagent/common/event.py b/azurelinuxagent/common/event.py
4502index 5274b80..1f903a9 100644
4503--- a/azurelinuxagent/common/event.py
4504+++ b/azurelinuxagent/common/event.py
4505@@ -18,30 +18,35 @@
4506 import atexit
4507 import json
4508 import os
4509+import platform
4510 import re
4511 import sys
4512 import threading
4513 import time
4514 import traceback
4515-from collections import namedtuple
4516 from datetime import datetime
4517
4518 import azurelinuxagent.common.conf as conf
4519 import azurelinuxagent.common.logger as logger
4520-from azurelinuxagent.common.exception import EventError
4521-from azurelinuxagent.common.future import ustr, OrderedDict
4522-from azurelinuxagent.common.datacontract import get_properties, DataContractList
4523-from azurelinuxagent.common.telemetryevent import TelemetryEventParam, TelemetryEvent
4524+from azurelinuxagent.common.AgentGlobals import AgentGlobals
4525+from azurelinuxagent.common.exception import EventError, OSUtilError
4526+from azurelinuxagent.common.future import ustr
4527+from azurelinuxagent.common.datacontract import get_properties, set_properties
4528+from azurelinuxagent.common.osutil import get_osutil
4529+from azurelinuxagent.common.telemetryevent import TelemetryEventParam, TelemetryEvent, CommonTelemetryEventSchema, \
4530+ GuestAgentGenericLogsSchema, GuestAgentExtensionEventsSchema, GuestAgentPerfCounterEventsSchema
4531 from azurelinuxagent.common.utils import fileutil, textutil
4532-from azurelinuxagent.common.version import CURRENT_VERSION, CURRENT_AGENT
4533+from azurelinuxagent.common.utils.textutil import parse_doc, findall, find, getattrib, str_to_encoded_ustr
4534+from azurelinuxagent.common.version import CURRENT_VERSION, CURRENT_AGENT, AGENT_NAME, DISTRO_NAME, DISTRO_VERSION, DISTRO_CODE_NAME, AGENT_EXECUTION_MODE
4535+from azurelinuxagent.common.protocol.imds import get_imds_client
4536+
4537+EVENTS_DIRECTORY = "events"
4538
4539 _EVENT_MSG = "Event: name={0}, op={1}, message={2}, duration={3}"
4540 TELEMETRY_EVENT_PROVIDER_ID = "69B669B9-4AF8-4C50-BDC4-6006FA76E975"
4541+TELEMETRY_EVENT_EVENT_ID = 1
4542 TELEMETRY_METRICS_EVENT_ID = 4
4543
4544-# Store the last retrieved container id as an environment variable to be shared between threads for telemetry purposes
4545-CONTAINER_ID_ENV_VARIABLE = "AZURE_GUEST_AGENT_CONTAINER_ID"
4546-
4547 TELEMETRY_LOG_PROVIDER_ID = "FFF0196F-EE4C-4EAF-9AA5-776F622DEB4F"
4548 TELEMETRY_LOG_EVENT_ID = 7
4549
4550@@ -53,33 +58,39 @@ SEND_LOGS_TO_TELEMETRY = False
4551
4552 MAX_NUMBER_OF_EVENTS = 1000
4553
4554+AGENT_EVENT_FILE_EXTENSION = '.waagent.tld'
4555+EVENT_FILE_REGEX = re.compile(r'(?P<agent_event>\.waagent)?\.tld$')
4556
4557 def send_logs_to_telemetry():
4558 return SEND_LOGS_TO_TELEMETRY
4559
4560
4561-def get_container_id_from_env():
4562- return os.environ.get(CONTAINER_ID_ENV_VARIABLE, "UNINITIALIZED")
4563-
4564-
4565 class WALAEventOperation:
4566 ActivateResourceDisk = "ActivateResourceDisk"
4567 AgentBlacklisted = "AgentBlacklisted"
4568 AgentEnabled = "AgentEnabled"
4569+ AgentMemory = "AgentMemory"
4570+ AgentUpgrade = "AgentUpgrade"
4571 ArtifactsProfileBlob = "ArtifactsProfileBlob"
4572- AutoUpdate = "AutoUpdate"
4573- CustomData = "CustomData"
4574 CGroupsCleanUp = "CGroupsCleanUp"
4575- CGroupsLimitsCrossed = "CGroupsLimitsCrossed"
4576- ExtensionMetricsData = "ExtensionMetricsData"
4577+ CGroupsDisabled = "CGroupsDisabled"
4578+ CGroupsInfo = "CGroupsInfo"
4579+ CollectEventErrors = "CollectEventErrors"
4580+ CollectEventUnicodeErrors = "CollectEventUnicodeErrors"
4581+ ConfigurationChange = "ConfigurationChange"
4582+ CustomData = "CustomData"
4583+ DefaultChannelChange = "DefaultChannelChange"
4584 Deploy = "Deploy"
4585 Disable = "Disable"
4586 Downgrade = "Downgrade"
4587 Download = "Download"
4588 Enable = "Enable"
4589 ExtensionProcessing = "ExtensionProcessing"
4590+ ExtensionTelemetryEventProcessing = "ExtensionTelemetryEventProcessing"
4591+ FetchGoalState = "FetchGoalState"
4592 Firewall = "Firewall"
4593- GetArtifactExtended = "GetArtifactExtended"
4594+ GoalState = "GoalState"
4595+ GoalStateUnsupportedFeatures = "GoalStateUnsupportedFeatures"
4596 HealthCheck = "HealthCheck"
4597 HealthObservation = "HealthObservation"
4598 HeartBeat = "HeartBeat"
4599@@ -87,29 +98,35 @@ class WALAEventOperation:
4600 HostPluginHeartbeat = "HostPluginHeartbeat"
4601 HostPluginHeartbeatExtended = "HostPluginHeartbeatExtended"
4602 HttpErrors = "HttpErrors"
4603+ HttpGet = "HttpGet"
4604 ImdsHeartbeat = "ImdsHeartbeat"
4605 Install = "Install"
4606- InitializeCGroups = "InitializeCGroups"
4607 InitializeHostPlugin = "InitializeHostPlugin"
4608- InvokeCommandUsingSystemd = "InvokeCommandUsingSystemd"
4609 Log = "Log"
4610+ LogCollection = "LogCollection"
4611 OSInfo = "OSInfo"
4612 Partition = "Partition"
4613- ProcessGoalState = "ProcessGoalState"
4614+ PersistFirewallRules = "PersistFirewallRules"
4615+ PluginSettingsVersionMismatch = "PluginSettingsVersionMismatch"
4616+ InvalidExtensionConfig = "InvalidExtensionConfig"
4617 Provision = "Provision"
4618 ProvisionGuestAgent = "ProvisionGuestAgent"
4619 RemoteAccessHandling = "RemoteAccessHandling"
4620+ ReportEventErrors = "ReportEventErrors"
4621+ ReportEventUnicodeErrors = "ReportEventUnicodeErrors"
4622 ReportStatus = "ReportStatus"
4623 ReportStatusExtended = "ReportStatusExtended"
4624 Restart = "Restart"
4625 SequenceNumberMismatch = "SequenceNumberMismatch"
4626 SetCGroupsLimits = "SetCGroupsLimits"
4627 SkipUpdate = "SkipUpdate"
4628+ StatusProcessing = "StatusProcessing"
4629 UnhandledError = "UnhandledError"
4630 UnInstall = "UnInstall"
4631 Unknown = "Unknown"
4632- Upgrade = "Upgrade"
4633 Update = "Update"
4634+ VmSettings = "VmSettings"
4635+ VmSettingsSummary = "VmSettingsSummary"
4636
4637
4638 SHOULD_ENCODE_MESSAGE_LEN = 80
4639@@ -173,11 +190,57 @@ class EventStatus(object):
4640
4641 __event_status__ = EventStatus()
4642 __event_status_operations__ = [
4643- WALAEventOperation.AutoUpdate,
4644 WALAEventOperation.ReportStatus
4645 ]
4646
4647
4648+def parse_json_event(data_str):
4649+ data = json.loads(data_str)
4650+ event = TelemetryEvent()
4651+ set_properties("TelemetryEvent", event, data)
4652+ event.file_type = "json"
4653+ return event
4654+
4655+
4656+def parse_event(data_str):
4657+ try:
4658+ try:
4659+ return parse_json_event(data_str)
4660+ except ValueError:
4661+ return parse_xml_event(data_str)
4662+ except Exception as e:
4663+ raise EventError("Error parsing event: {0}".format(ustr(e)))
4664+
4665+
4666+def parse_xml_param(param_node):
4667+ name = getattrib(param_node, "Name")
4668+ value_str = getattrib(param_node, "Value")
4669+ attr_type = getattrib(param_node, "T")
4670+ value = value_str
4671+ if attr_type == 'mt:uint64':
4672+ value = int(value_str)
4673+ elif attr_type == 'mt:bool':
4674+ value = bool(value_str)
4675+ elif attr_type == 'mt:float64':
4676+ value = float(value_str)
4677+ return TelemetryEventParam(name, value)
4678+
4679+
4680+def parse_xml_event(data_str):
4681+ try:
4682+ xml_doc = parse_doc(data_str)
4683+ event_id = getattrib(find(xml_doc, "Event"), 'id')
4684+ provider_id = getattrib(find(xml_doc, "Provider"), 'id')
4685+ event = TelemetryEvent(event_id, provider_id)
4686+ param_nodes = findall(xml_doc, 'Param')
4687+ for param_node in param_nodes:
4688+ event.parameters.append(parse_xml_param(param_node))
4689+ event.file_type = "xml"
4690+ return event
4691+ except Exception as e:
4692+ raise ValueError(ustr(e))
4693+
4694+
4695 def _encode_message(op, message):
4696 """
4697 Gzip and base64 encode a message based on the operation.
4698@@ -214,20 +277,158 @@ def _encode_message(op, message):
4699
4700
4701 def _log_event(name, op, message, duration, is_success=True):
4702- global _EVENT_MSG
4703+ global _EVENT_MSG # pylint: disable=W0603
4704
4705- message = _encode_message(op, message)
4706 if not is_success:
4707 logger.error(_EVENT_MSG, name, op, message, duration)
4708 else:
4709 logger.info(_EVENT_MSG, name, op, message, duration)
4710
4711
4712+class CollectOrReportEventDebugInfo(object):
4713+ """
4714+ This class is used for capturing and reporting debug info that is captured during event collection and
4715+ reporting to wireserver.
4716+ It captures the count of unicode errors and any unexpected errors and also a subset of errors with stacks to help
4717+ with debugging any potential issues.
4718+ """
4719+ __MAX_ERRORS_TO_REPORT = 5
4720+ OP_REPORT = "Report"
4721+ OP_COLLECT = "Collect"
4722+
4723+ def __init__(self, operation=OP_REPORT):
4724+ self.__unicode_error_count = 0
4725+ self.__unicode_errors = set()
4726+ self.__op_error_count = 0
4727+ self.__op_errors = set()
4728+
4729+ if operation == self.OP_REPORT:
4730+ self.__unicode_error_event = WALAEventOperation.ReportEventUnicodeErrors
4731+ self.__op_errors_event = WALAEventOperation.ReportEventErrors
4732+ elif operation == self.OP_COLLECT:
4733+ self.__unicode_error_event = WALAEventOperation.CollectEventUnicodeErrors
4734+ self.__op_errors_event = WALAEventOperation.CollectEventErrors
4735+
4736+ def report_debug_info(self):
4737+
4738+ def report_dropped_events_error(count, errors, operation_name):
4739+ err_msg_format = "DroppedEventsCount: {0}\nReasons (first {1} errors): {2}"
4740+ if count > 0:
4741+ add_event(op=operation_name,
4742+ message=err_msg_format.format(count, CollectOrReportEventDebugInfo.__MAX_ERRORS_TO_REPORT, ', '.join(errors)),
4743+ is_success=False)
4744+
4745+ report_dropped_events_error(self.__op_error_count, self.__op_errors, self.__op_errors_event)
4746+ report_dropped_events_error(self.__unicode_error_count, self.__unicode_errors, self.__unicode_error_event)
4747+
4748+ @staticmethod
4749+ def _update_errors_and_get_count(error_count, errors, error):
4750+ error_count += 1
4751+ if len(errors) < CollectOrReportEventDebugInfo.__MAX_ERRORS_TO_REPORT:
4752+ errors.add("{0}: {1}".format(ustr(error), traceback.format_exc()))
4753+ return error_count
4754+
4755+ def update_unicode_error(self, unicode_err):
4756+ self.__unicode_error_count = self._update_errors_and_get_count(self.__unicode_error_count, self.__unicode_errors,
4757+ unicode_err)
4758+
4759+ def update_op_error(self, op_err):
4760+ self.__op_error_count = self._update_errors_and_get_count(self.__op_error_count, self.__op_errors, op_err)
4761+
4762+
4763 class EventLogger(object):
4764 def __init__(self):
4765 self.event_dir = None
4766 self.periodic_events = {}
4767
4768+ #
4769+ # All events should have these parameters.
4770+ #
4771+ # The first set comes from the current OS and is initialized here. These values don't change during
4772+ # the agent's lifetime.
4773+ #
4774+ # The next two sets come from the goal state and IMDS and must be explicitly initialized using
4775+ # initialize_vminfo_common_parameters() once a protocol for communication with the host has been
4776+ # created. Their values don't change during the agent's lifetime. Note that we initialize these
4777+ # parameters here using dummy values (*_UNINITIALIZED) since events sent to the host should always
4778+ # match the schema defined for them in the telemetry pipeline.
4779+ #
4780+ # There is another set of common parameters that must be computed at the time the event is created
4781+ # (e.g. the timestamp and the container ID); those are added to events (along with the parameters
4782+ # below) in _add_common_event_parameters()
4783+ #
4784+ # Note that different kinds of events may also include other parameters; those are added by the
4785+ # corresponding add_* method (e.g. add_metric for performance metrics).
4786+ #
4787+ self._common_parameters = []
4788+
4789+ # Parameters from OS
4790+ osutil = get_osutil()
4791+ self._common_parameters.append(TelemetryEventParam(CommonTelemetryEventSchema.OSVersion, EventLogger._get_os_version()))
4792+ self._common_parameters.append(TelemetryEventParam(CommonTelemetryEventSchema.ExecutionMode, AGENT_EXECUTION_MODE))
4793+ self._common_parameters.append(TelemetryEventParam(CommonTelemetryEventSchema.RAM, int(EventLogger._get_ram(osutil))))
4794+ self._common_parameters.append(TelemetryEventParam(CommonTelemetryEventSchema.Processors, int(EventLogger._get_processors(osutil))))
4795+
4796+ # Parameters from goal state
4797+ self._common_parameters.append(TelemetryEventParam(CommonTelemetryEventSchema.TenantName, "TenantName_UNINITIALIZED"))
4798+ self._common_parameters.append(TelemetryEventParam(CommonTelemetryEventSchema.RoleName, "RoleName_UNINITIALIZED"))
4799+ self._common_parameters.append(TelemetryEventParam(CommonTelemetryEventSchema.RoleInstanceName, "RoleInstanceName_UNINITIALIZED"))
4800+ #
4801+ # Parameters from IMDS
4802+ self._common_parameters.append(TelemetryEventParam(CommonTelemetryEventSchema.Location, "Location_UNINITIALIZED"))
4803+ self._common_parameters.append(TelemetryEventParam(CommonTelemetryEventSchema.SubscriptionId, "SubscriptionId_UNINITIALIZED"))
4804+ self._common_parameters.append(TelemetryEventParam(CommonTelemetryEventSchema.ResourceGroupName, "ResourceGroupName_UNINITIALIZED"))
4805+ self._common_parameters.append(TelemetryEventParam(CommonTelemetryEventSchema.VMId, "VMId_UNINITIALIZED"))
4806+ self._common_parameters.append(TelemetryEventParam(CommonTelemetryEventSchema.ImageOrigin, 0))
4807+
4808+ @staticmethod
4809+ def _get_os_version():
4810+ return "{0}:{1}-{2}-{3}:{4}".format(platform.system(), DISTRO_NAME, DISTRO_VERSION, DISTRO_CODE_NAME, platform.release())
4811+
4812+ @staticmethod
4813+ def _get_ram(osutil):
4814+ try:
4815+ return osutil.get_total_mem()
4816+ except OSUtilError as e:
4817+ logger.warn("Failed to get RAM info; will be missing from telemetry: {0}", ustr(e))
4818+ return 0
4819+
4820+ @staticmethod
4821+ def _get_processors(osutil):
4822+ try:
4823+ return osutil.get_processor_cores()
4824+ except OSUtilError as e:
4825+ logger.warn("Failed to get Processors info; will be missing from telemetry: {0}", ustr(e))
4826+ return 0
4827+
4828+ def initialize_vminfo_common_parameters(self, protocol):
4829+ """
4830+ Initializes the common parameters that come from the goal state and IMDS
4831+ """
4832+ # create an index of the event parameters for faster updates
4833+ parameters = {}
4834+ for p in self._common_parameters:
4835+ parameters[p.name] = p
4836+
4837+ try:
4838+ vminfo = protocol.get_vminfo()
4839+ parameters[CommonTelemetryEventSchema.TenantName].value = vminfo.tenantName
4840+ parameters[CommonTelemetryEventSchema.RoleName].value = vminfo.roleName
4841+ parameters[CommonTelemetryEventSchema.RoleInstanceName].value = vminfo.roleInstanceName
4842+ except Exception as e:
4843+ logger.warn("Failed to get VM info from goal state; will be missing from telemetry: {0}", ustr(e))
4844+
4845+ try:
4846+ imds_client = get_imds_client(protocol.get_endpoint())
4847+ imds_info = imds_client.get_compute()
4848+ parameters[CommonTelemetryEventSchema.Location].value = imds_info.location
4849+ parameters[CommonTelemetryEventSchema.SubscriptionId].value = imds_info.subscriptionId
4850+ parameters[CommonTelemetryEventSchema.ResourceGroupName].value = imds_info.resourceGroupName
4851+ parameters[CommonTelemetryEventSchema.VMId].value = imds_info.vmId
4852+ parameters[CommonTelemetryEventSchema.ImageOrigin].value = int(imds_info.image_origin)
4853+ except Exception as e:
4854+ logger.warn("Failed to get IMDS info; will be missing from telemetry: {0}", ustr(e))
4855+
4856 def save_event(self, data):
4857 if self.event_dir is None:
4858 logger.warn("Cannot save event -- Event reporter is not initialized.")
4859@@ -258,7 +459,7 @@ class EventLogger(object):
4860 try:
4861 with open(filename + ".tmp", 'wb+') as hfile:
4862 hfile.write(data.encode("utf-8"))
4863- os.rename(filename + ".tmp", filename + ".tld")
4864+ os.rename(filename + ".tmp", filename + AGENT_EVENT_FILE_EXTENSION)
4865 except (IOError, OSError) as e:
4866 msg = "Failed to write events to file: {0}".format(e)
4867 raise EventError(msg)
4868@@ -271,38 +472,31 @@ class EventLogger(object):
4869 (self.periodic_events[h] + delta) <= datetime.now()
4870
4871 def add_periodic(self, delta, name, op=WALAEventOperation.Unknown, is_success=True, duration=0,
4872- version=str(CURRENT_VERSION), message="", evt_type="", is_internal=False, log_event=True,
4873- force=False):
4874+ version=str(CURRENT_VERSION), message="", log_event=True, force=False):
4875 h = hash(name + op + ustr(is_success) + message)
4876
4877 if force or self.is_period_elapsed(delta, h):
4878 self.add_event(name, op=op, is_success=is_success, duration=duration,
4879- version=version, message=message, evt_type=evt_type,
4880- is_internal=is_internal, log_event=log_event)
4881+ version=version, message=message, log_event=log_event)
4882 self.periodic_events[h] = datetime.now()
4883
4884 def add_event(self, name, op=WALAEventOperation.Unknown, is_success=True, duration=0, version=str(CURRENT_VERSION),
4885- message="", evt_type="", is_internal=False, log_event=True):
4886+ message="", log_event=True):
4887
4888 if (not is_success) and log_event:
4889 _log_event(name, op, message, duration, is_success=is_success)
4890
4891- self._add_event(duration, evt_type, is_internal, is_success, message, name, op, version, event_id=1)
4892-
4893- def _add_event(self, duration, evt_type, is_internal, is_success, message, name, op, version, event_id):
4894- event = TelemetryEvent(event_id, TELEMETRY_EVENT_PROVIDER_ID)
4895-
4896- event.parameters.append(TelemetryEventParam('Name', str(name)))
4897- event.parameters.append(TelemetryEventParam('Version', str(version)))
4898- event.parameters.append(TelemetryEventParam('IsInternal', bool(is_internal)))
4899- event.parameters.append(TelemetryEventParam('Operation', str(op)))
4900- event.parameters.append(TelemetryEventParam('OperationSuccess', bool(is_success)))
4901- event.parameters.append(TelemetryEventParam('Message', str(message)))
4902- event.parameters.append(TelemetryEventParam('Duration', int(duration)))
4903- event.parameters.append(TelemetryEventParam('ExtensionType', str(evt_type)))
4904+ event = TelemetryEvent(TELEMETRY_EVENT_EVENT_ID, TELEMETRY_EVENT_PROVIDER_ID)
4905+ event.parameters.append(TelemetryEventParam(GuestAgentExtensionEventsSchema.Name, str_to_encoded_ustr(name)))
4906+ event.parameters.append(TelemetryEventParam(GuestAgentExtensionEventsSchema.Version, str_to_encoded_ustr(version)))
4907+ event.parameters.append(TelemetryEventParam(GuestAgentExtensionEventsSchema.Operation, str_to_encoded_ustr(op)))
4908+ event.parameters.append(TelemetryEventParam(GuestAgentExtensionEventsSchema.OperationSuccess, bool(is_success)))
4909+ event.parameters.append(TelemetryEventParam(GuestAgentExtensionEventsSchema.Message, str_to_encoded_ustr(message)))
4910+ event.parameters.append(TelemetryEventParam(GuestAgentExtensionEventsSchema.Duration, int(duration)))
4911+ self.add_common_event_parameters(event, datetime.utcnow())
4912
4913- event.parameters = self.add_default_parameters_to_event(event.parameters)
4914 data = get_properties(event)
4915+
4916 try:
4917 self.save_event(json.dumps(data))
4918 except EventError as e:
4919@@ -310,13 +504,13 @@ class EventLogger(object):
4920
4921 def add_log_event(self, level, message):
4922 event = TelemetryEvent(TELEMETRY_LOG_EVENT_ID, TELEMETRY_LOG_PROVIDER_ID)
4923- event.parameters.append(TelemetryEventParam('EventName', WALAEventOperation.Log))
4924- event.parameters.append(TelemetryEventParam('CapabilityUsed', logger.LogLevel.STRINGS[level]))
4925- event.parameters.append(TelemetryEventParam('Context1', self._clean_up_message(message)))
4926- event.parameters.append(TelemetryEventParam('Context2', ''))
4927- event.parameters.append(TelemetryEventParam('Context3', ''))
4928+ event.parameters.append(TelemetryEventParam(GuestAgentGenericLogsSchema.EventName, WALAEventOperation.Log))
4929+ event.parameters.append(TelemetryEventParam(GuestAgentGenericLogsSchema.CapabilityUsed, logger.LogLevel.STRINGS[level]))
4930+ event.parameters.append(TelemetryEventParam(GuestAgentGenericLogsSchema.Context1, str_to_encoded_ustr(self._clean_up_message(message))))
4931+ event.parameters.append(TelemetryEventParam(GuestAgentGenericLogsSchema.Context2, datetime.utcnow().strftime(logger.Logger.LogTimeFormatInUTC)))
4932+ event.parameters.append(TelemetryEventParam(GuestAgentGenericLogsSchema.Context3, ''))
4933+ self.add_common_event_parameters(event, datetime.utcnow())
4934
4935- event.parameters = self.add_default_parameters_to_event(event.parameters)
4936 data = get_properties(event)
4937 try:
4938 self.save_event(json.dumps(data))
4939@@ -334,17 +528,16 @@ class EventLogger(object):
4940 :param bool log_event: If true, log the collected metric in the agent log
4941 """
4942 if log_event:
4943- from azurelinuxagent.common.version import AGENT_NAME
4944 message = "Metric {0}/{1} [{2}] = {3}".format(category, counter, instance, value)
4945 _log_event(AGENT_NAME, "METRIC", message, 0)
4946
4947 event = TelemetryEvent(TELEMETRY_METRICS_EVENT_ID, TELEMETRY_EVENT_PROVIDER_ID)
4948- event.parameters.append(TelemetryEventParam('Category', str(category)))
4949- event.parameters.append(TelemetryEventParam('Counter', str(counter)))
4950- event.parameters.append(TelemetryEventParam('Instance', str(instance)))
4951- event.parameters.append(TelemetryEventParam('Value', float(value)))
4952+ event.parameters.append(TelemetryEventParam(GuestAgentPerfCounterEventsSchema.Category, str_to_encoded_ustr(category)))
4953+ event.parameters.append(TelemetryEventParam(GuestAgentPerfCounterEventsSchema.Counter, str_to_encoded_ustr(counter)))
4954+ event.parameters.append(TelemetryEventParam(GuestAgentPerfCounterEventsSchema.Instance, str_to_encoded_ustr(instance)))
4955+ event.parameters.append(TelemetryEventParam(GuestAgentPerfCounterEventsSchema.Value, float(value)))
4956+ self.add_common_event_parameters(event, datetime.utcnow())
4957
4958- event.parameters = self.add_default_parameters_to_event(event.parameters)
4959 data = get_properties(event)
4960 try:
4961 self.save_event(json.dumps(data))
4962@@ -392,56 +585,34 @@ class EventLogger(object):
4963 else:
4964 return message
4965
4966- @staticmethod
4967- def add_default_parameters_to_event(event_parameters, set_values_for_agent=True):
4968+ def add_common_event_parameters(self, event, event_timestamp):
4969 """
4970- Default fields are only populated by Agent and not the extension. Agent will fill up any event if they don't
4971- have the default params. Example: GAVersion and ContainerId are populated for agent events on the fly,
4972- but not for extension events. Add it if it's missing.
4973-
4974- We write the GAVersion here rather than add it in azurelinuxagent.ga.monitor.MonitorHandler.add_sysinfo
4975- as there could be a possibility of events being sent with newer version of the agent, rather than the agent
4976- version generating the event.
4977- # Old behavior example: V1 writes the event on the disk and finds an update immediately, and updates. Now the
4978- new monitor thread would pick up the events from the disk and send it with the CURRENT_AGENT, which would have
4979- newer version of the agent. This causes confusion.
4980-
4981- ContainerId can change due to live migration and we want to preserve the container Id of the container writing
4982- the event, rather than sending the event.
4983- OpcodeName - This is used as the actual time of event generation.
4984-
4985- :param event_parameters: List of parameters of the event.
4986- :param set_values_for_agent: Need default values populated or not. Extensions need only GAVersion and
4987- ContainerId to be populated and others should be
4988- :return: Event with default parameters populated (either values for agent or extension)
4989+ This method is called for all events and ensures all telemetry fields are added before the event is sent out.
4990+ Note that the event timestamp is saved in the OpcodeName field.
4991 """
4992- DefaultParameter = namedtuple('DefaultParameter', ['name', 'value'])
4993- default_parameters = [DefaultParameter("GAVersion", CURRENT_AGENT),
4994- DefaultParameter('ContainerId', get_container_id_from_env()),
4995- DefaultParameter('OpcodeName', datetime.utcnow().__str__() if set_values_for_agent else ""),
4996- DefaultParameter('EventTid', threading.current_thread().ident if set_values_for_agent else 0),
4997- DefaultParameter('EventPid', os.getpid() if set_values_for_agent else 0),
4998- DefaultParameter("TaskName", threading.current_thread().getName() if set_values_for_agent else ""),
4999- DefaultParameter("KeywordName", '')]
5000-
The diff has been truncated for viewing.

Subscribers

People subscribed via source and target branches