Merge ~calvinmwadime/ubuntu/+source/walinuxagent:mantic-merge-2.9.1.1 into ubuntu/+source/walinuxagent:ubuntu/mantic-devel

Proposed by Calvin Mwadime Makokha
Status: Superseded
Proposed branch: ~calvinmwadime/ubuntu/+source/walinuxagent:mantic-merge-2.9.1.1
Merge into: ubuntu/+source/walinuxagent:ubuntu/mantic-devel
Diff against target: 74801 lines (+43588/-14724)
404 files modified
.github/PULL_REQUEST_TEMPLATE.md (+1/-2)
.github/codecov.yml (+2/-0)
.github/workflows/ci_pr.yml (+128/-0)
.gitignore (+1/-2)
CODEOWNERS (+3/-1)
README.md (+110/-33)
SECURITY.md (+41/-0)
azurelinuxagent/agent.py (+203/-66)
azurelinuxagent/common/AgentGlobals.py (+39/-0)
azurelinuxagent/common/agent_supported_feature.py (+122/-0)
azurelinuxagent/common/cgroup.py (+208/-77)
azurelinuxagent/common/cgroupapi.py (+214/-429)
azurelinuxagent/common/cgroupconfigurator.py (+1000/-118)
azurelinuxagent/common/cgroupstelemetry.py (+25/-265)
azurelinuxagent/common/conf.py (+272/-21)
azurelinuxagent/common/datacontract.py (+4/-2)
azurelinuxagent/common/dhcp.py (+14/-11)
azurelinuxagent/common/event.py (+286/-112)
azurelinuxagent/common/exception.py (+72/-18)
azurelinuxagent/common/future.py (+91/-15)
azurelinuxagent/common/interfaces.py (+49/-0)
azurelinuxagent/common/logcollector.py (+401/-0)
azurelinuxagent/common/logcollector_manifests.py (+122/-0)
azurelinuxagent/common/logger.py (+35/-7)
azurelinuxagent/common/osutil/alpine.py (+2/-2)
azurelinuxagent/common/osutil/arch.py (+9/-2)
azurelinuxagent/common/osutil/bigip.py (+31/-30)
azurelinuxagent/common/osutil/clearlinux.py (+28/-16)
azurelinuxagent/common/osutil/coreos.py (+9/-3)
azurelinuxagent/common/osutil/debian.py (+13/-13)
azurelinuxagent/common/osutil/default.py (+441/-315)
azurelinuxagent/common/osutil/devuan.py (+52/-0)
azurelinuxagent/common/osutil/factory.py (+60/-33)
azurelinuxagent/common/osutil/fedora.py (+77/-0)
azurelinuxagent/common/osutil/freebsd.py (+43/-33)
azurelinuxagent/common/osutil/gaia.py (+29/-17)
azurelinuxagent/common/osutil/iosxe.py (+19/-7)
azurelinuxagent/common/osutil/mariner.py (+69/-0)
azurelinuxagent/common/osutil/nsbsd.py (+28/-26)
azurelinuxagent/common/osutil/openbsd.py (+16/-19)
azurelinuxagent/common/osutil/openwrt.py (+14/-13)
azurelinuxagent/common/osutil/photonos.py (+65/-0)
azurelinuxagent/common/osutil/redhat.py (+45/-16)
azurelinuxagent/common/osutil/suse.py (+93/-36)
azurelinuxagent/common/osutil/systemd.py (+86/-0)
azurelinuxagent/common/osutil/ubuntu.py (+32/-16)
azurelinuxagent/common/persist_firewall_rules.py (+338/-0)
azurelinuxagent/common/protocol/__init__.py (+0/-5)
azurelinuxagent/common/protocol/extensions_goal_state.py (+244/-0)
azurelinuxagent/common/protocol/extensions_goal_state_factory.py (+36/-0)
azurelinuxagent/common/protocol/extensions_goal_state_from_extensions_config.py (+571/-0)
azurelinuxagent/common/protocol/extensions_goal_state_from_vm_settings.py (+583/-0)
azurelinuxagent/common/protocol/goal_state.py (+705/-0)
azurelinuxagent/common/protocol/hostplugin.py (+371/-38)
azurelinuxagent/common/protocol/imds.py (+30/-14)
azurelinuxagent/common/protocol/metadata_server_migration_util.py (+79/-0)
azurelinuxagent/common/protocol/ovfenv.py (+7/-6)
azurelinuxagent/common/protocol/restapi.py (+165/-91)
azurelinuxagent/common/protocol/util.py (+109/-159)
azurelinuxagent/common/protocol/wire.py (+516/-1095)
azurelinuxagent/common/rdma.py (+199/-56)
azurelinuxagent/common/singletonperthread.py (+30/-0)
azurelinuxagent/common/telemetryevent.py (+72/-3)
azurelinuxagent/common/utils/archive.py (+204/-111)
azurelinuxagent/common/utils/cryptutil.py (+28/-25)
azurelinuxagent/common/utils/extensionprocessutil.py (+31/-7)
azurelinuxagent/common/utils/fileutil.py (+9/-7)
azurelinuxagent/common/utils/flexible_version.py (+29/-9)
azurelinuxagent/common/utils/networkutil.py (+172/-3)
azurelinuxagent/common/utils/restutil.py (+153/-56)
azurelinuxagent/common/utils/shellutil.py (+260/-57)
azurelinuxagent/common/utils/textutil.py (+63/-10)
azurelinuxagent/common/utils/timeutil.py (+39/-0)
azurelinuxagent/common/version.py (+102/-31)
azurelinuxagent/daemon/main.py (+36/-22)
azurelinuxagent/daemon/resourcedisk/default.py (+13/-12)
azurelinuxagent/daemon/resourcedisk/factory.py (+3/-7)
azurelinuxagent/daemon/resourcedisk/freebsd.py (+1/-1)
azurelinuxagent/daemon/resourcedisk/openwrt.py (+2/-2)
azurelinuxagent/daemon/scvmm.py (+3/-3)
azurelinuxagent/ga/collect_logs.py (+353/-0)
azurelinuxagent/ga/collect_telemetry_events.py (+586/-0)
azurelinuxagent/ga/env.py (+177/-123)
azurelinuxagent/ga/exthandlers.py (+1493/-645)
azurelinuxagent/ga/monitor.py (+249/-450)
azurelinuxagent/ga/periodic_operation.py (+81/-0)
azurelinuxagent/ga/remoteaccess.py (+81/-90)
azurelinuxagent/ga/send_telemetry_events.py (+164/-0)
azurelinuxagent/ga/update.py (+1089/-380)
azurelinuxagent/pa/deprovision/arch.py (+1/-1)
azurelinuxagent/pa/deprovision/clearlinux.py (+4/-2)
azurelinuxagent/pa/deprovision/coreos.py (+1/-1)
azurelinuxagent/pa/deprovision/default.py (+55/-17)
azurelinuxagent/pa/deprovision/factory.py (+5/-8)
azurelinuxagent/pa/deprovision/ubuntu.py (+2/-2)
azurelinuxagent/pa/provision/cloudinit.py (+33/-91)
azurelinuxagent/pa/provision/cloudinitdetect.py (+72/-0)
azurelinuxagent/pa/provision/default.py (+28/-42)
azurelinuxagent/pa/provision/factory.py (+3/-3)
azurelinuxagent/pa/rdma/centos.py (+6/-6)
azurelinuxagent/pa/rdma/factory.py (+9/-7)
azurelinuxagent/pa/rdma/suse.py (+12/-3)
azurelinuxagent/pa/rdma/ubuntu.py (+14/-14)
bin/py3/waagent (+53/-0)
bin/waagent (+5/-1)
bin/waagent2.0 (+5/-1)
ci/2.7.pylintrc (+42/-0)
ci/3.6.pylintrc (+40/-0)
ci/nosetests.sh (+25/-0)
config/66-azure-storage.rules (+23/-17)
config/alpine/waagent.conf (+4/-11)
config/arch/waagent.conf (+4/-6)
config/bigip/waagent.conf (+3/-10)
config/clearlinux/waagent.conf (+3/-5)
config/coreos/waagent.conf (+4/-11)
config/debian/waagent.conf (+10/-11)
config/devuan/waagent.conf (+130/-0)
config/freebsd/waagent.conf (+6/-13)
config/gaia/waagent.conf (+4/-6)
config/iosxe/waagent.conf (+4/-6)
config/mariner/waagent.conf (+88/-0)
config/nsbsd/waagent.conf (+4/-6)
config/openbsd/waagent.conf (+4/-6)
config/photonos/waagent.conf (+80/-0)
config/suse/waagent.conf (+12/-8)
config/ubuntu/waagent.conf (+10/-11)
config/waagent.conf (+26/-10)
debian/changelog (+21/-0)
debian/control (+0/-1)
debian/docs (+0/-1)
debian/install (+1/-1)
debian/patches/add_manpage.patch (+471/-0)
debian/patches/disable_udev_overrides.patch (+17/-3)
debian/patches/fix_cgroup_v2_mounting_and_systemd_process.patch (+164/-0)
debian/patches/fix_systemd_networkd_lease_file_path (+83/-0)
debian/patches/series (+4/-2)
debian/patches/sru_v2_9_1_1.patch (+233/-0)
debian/rules (+3/-8)
debian/walinuxagent.manpages (+1/-0)
debian/watch (+2/-2)
dev/null (+0/-29)
init/azure-vmextensions.slice (+7/-0)
init/azure.slice (+4/-0)
init/devuan/default/walinuxagent (+2/-0)
init/devuan/walinuxagent (+344/-0)
init/mariner/waagent.service (+16/-0)
init/photonos/waagent.service (+16/-0)
init/redhat/py2/waagent.service (+19/-0)
init/redhat/waagent.service (+19/-0)
init/sles/waagent.service (+16/-0)
init/ubuntu/walinuxagent.service (+3/-0)
makepkg.py (+66/-51)
setup.py (+143/-60)
test-requirements.txt (+19/-3)
tests/common/dhcp/test_dhcp.py (+27/-14)
tests/common/mock_cgroup_environment.py (+122/-0)
tests/common/mock_command.py (+17/-0)
tests/common/mock_environment.py (+168/-0)
tests/common/osutil/test_alpine.py (+3/-2)
tests/common/osutil/test_arch.py (+3/-2)
tests/common/osutil/test_bigip.py (+20/-21)
tests/common/osutil/test_clearlinux.py (+3/-2)
tests/common/osutil/test_coreos.py (+3/-2)
tests/common/osutil/test_default.py (+429/-316)
tests/common/osutil/test_default_osutil.py (+3/-162)
tests/common/osutil/test_factory.py (+144/-69)
tests/common/osutil/test_freebsd.py (+8/-7)
tests/common/osutil/test_nsbsd.py (+12/-11)
tests/common/osutil/test_openbsd.py (+3/-2)
tests/common/osutil/test_openwrt.py (+3/-2)
tests/common/osutil/test_photonos.py (+37/-0)
tests/common/osutil/test_redhat.py (+3/-2)
tests/common/osutil/test_suse.py (+3/-2)
tests/common/osutil/test_ubuntu.py (+1/-1)
tests/common/test_agent_supported_feature.py (+55/-0)
tests/common/test_cgroupapi.py (+130/-548)
tests/common/test_cgroupconfigurator.py (+973/-261)
tests/common/test_cgroups.py (+62/-82)
tests/common/test_cgroupstelemetry.py (+120/-406)
tests/common/test_conf.py (+28/-53)
tests/common/test_errorstate.py (+2/-1)
tests/common/test_event.py (+583/-314)
tests/common/test_logcollector.py (+477/-0)
tests/common/test_logger.py (+45/-45)
tests/common/test_persist_firewall_rules.py (+416/-0)
tests/common/test_singletonperthread.py (+164/-0)
tests/common/test_telemetryevent.py (+20/-19)
tests/common/test_version.py (+80/-36)
tests/daemon/test_daemon.py (+15/-14)
tests/daemon/test_resourcedisk.py (+5/-5)
tests/data/cgroups/cpu.stat (+3/-0)
tests/data/cgroups/cpu.stat_t0 (+3/-0)
tests/data/cgroups/cpu.stat_t1 (+3/-0)
tests/data/cgroups/cpuacct.stat (+2/-0)
tests/data/cgroups/memory_mount/memory.stat (+36/-0)
tests/data/cgroups/missing_memory_counters/memory.stat (+34/-0)
tests/data/cgroups/proc_pid_cgroup (+13/-0)
tests/data/cgroups/proc_self_cgroup (+13/-0)
tests/data/cgroups/sys_fs_cgroup_unified_cgroup.controllers (+7/-0)
tests/data/cloud-init/set-hostname (+4/-0)
tests/data/events/custom_script_1.tld (+30/-0)
tests/data/events/custom_script_2.tld (+30/-0)
tests/data/events/custom_script_extra_parameters.tld (+66/-0)
tests/data/events/custom_script_invalid_json.tld (+30/-0)
tests/data/events/custom_script_no_read_access.tld (+30/-0)
tests/data/events/custom_script_nonascii_characters.tld (+30/-0)
tests/data/events/event_with_callstack.waagent.tld (+1/-0)
tests/data/events/extension_events/different_cases/1591918616.json (+22/-0)
tests/data/events/extension_events/empty_message/1592350454.json (+24/-0)
tests/data/events/extension_events/extra_parameters/1592273009.json (+35/-0)
tests/data/events/extension_events/int_type/1519934744.json (+10/-0)
tests/data/events/extension_events/large_messages/1591921510.json (+12/-0)
tests/data/events/extension_events/malformed_files/1592008079.json (+13/-0)
tests/data/events/extension_events/malformed_files/1594857360.tld (+11/-0)
tests/data/events/extension_events/malformed_files/bad_json_files/1591816395.json (+3/-0)
tests/data/events/extension_events/malformed_files/bad_name_file.json (+24/-0)
tests/data/events/extension_events/missing_parameters/1592273793.json (+74/-0)
tests/data/events/extension_events/mix_files/1591835369.json (+3/-0)
tests/data/events/extension_events/mix_files/1591835848.json (+85/-0)
tests/data/events/extension_events/mix_files/1591835859.json (+11/-0)
tests/data/events/extension_events/special_chars/1591918939.json (+10/-0)
tests/data/events/extension_events/well_formed_files/1591905451.json (+82/-0)
tests/data/events/extension_events/well_formed_files/1592355539.json (+72/-0)
tests/data/events/extension_events/well_formed_files/9999999999.json (+82/-0)
tests/data/events/legacy_agent.tld (+66/-0)
tests/data/events/legacy_agent_no_timestamp.tld (+62/-0)
tests/data/ext/event_from_agent.json (+119/-1)
tests/data/ext/event_from_extension.xml (+9/-6)
tests/data/ext/sample-status-invalid-format-emptykey-line7.json (+37/-0)
tests/data/ext/sample-status-invalid-json-format.json (+37/-0)
tests/data/ext/sample-status-invalid-status-no-status-status-key.json (+35/-0)
tests/data/ext/sample-status-very-large-multiple-substatuses.json (+408/-0)
tests/data/ext/sample-status-very-large.json (+39/-0)
tests/data/ext/sample-status.json (+36/-0)
tests/data/ext/sample_ext-1.3.0/python.sh (+11/-0)
tests/data/ext/sample_ext-1.3.0/sample.py (+82/-23)
tests/data/hostgaplugin/ext_conf-empty_depends_on.xml (+56/-0)
tests/data/hostgaplugin/ext_conf-invalid_blob_type.xml (+94/-0)
tests/data/hostgaplugin/ext_conf-no_status_upload_blob.xml (+39/-0)
tests/data/hostgaplugin/ext_conf-requested_version.xml (+148/-0)
tests/data/hostgaplugin/ext_conf.xml (+146/-0)
tests/data/hostgaplugin/in_vm_artifacts_profile.json (+1/-0)
tests/data/hostgaplugin/vm_settings-difference_in_required_features.json (+201/-0)
tests/data/hostgaplugin/vm_settings-empty_depends_on.json (+69/-0)
tests/data/hostgaplugin/vm_settings-fabric-no_thumbprints.json (+192/-0)
tests/data/hostgaplugin/vm_settings-invalid_blob_type.json (+104/-0)
tests/data/hostgaplugin/vm_settings-missing_cert.json (+68/-0)
tests/data/hostgaplugin/vm_settings-no_manifests.json (+73/-0)
tests/data/hostgaplugin/vm_settings-no_status_upload_blob.json (+66/-0)
tests/data/hostgaplugin/vm_settings-out-of-sync.json (+66/-0)
tests/data/hostgaplugin/vm_settings-parse_error.json (+72/-0)
tests/data/hostgaplugin/vm_settings-requested_version.json (+141/-0)
tests/data/hostgaplugin/vm_settings-unsupported_version.json (+72/-0)
tests/data/hostgaplugin/vm_settings.json (+201/-0)
tests/data/init/azure-vmextensions.slice (+6/-0)
tests/data/init/azure-walinuxagent-logcollector.slice (+9/-0)
tests/data/init/azure.slice (+4/-0)
tests/data/init/walinuxagent.service (+23/-0)
tests/data/init/walinuxagent.service.previous (+20/-0)
tests/data/init/walinuxagent.service_system-slice (+23/-0)
tests/data/test_waagent.conf (+6/-5)
tests/data/wire/certs-2.xml (+85/-0)
tests/data/wire/certs.xml (+80/-76)
tests/data/wire/certs_no_format_specified.xml (+78/-74)
tests/data/wire/ext_conf-no_gs_metadata.xml (+27/-0)
tests/data/wire/ext_conf.xml (+7/-5)
tests/data/wire/ext_conf_additional_locations.xml (+34/-0)
tests/data/wire/ext_conf_aks_extension.xml (+70/-0)
tests/data/wire/ext_conf_autoupgrade.xml (+9/-7)
tests/data/wire/ext_conf_autoupgrade_internalversion.xml (+9/-7)
tests/data/wire/ext_conf_dependencies_with_empty_settings.xml (+33/-0)
tests/data/wire/ext_conf_in_vm_artifacts_profile.xml (+29/-0)
tests/data/wire/ext_conf_in_vm_empty_artifacts_profile.xml (+29/-0)
tests/data/wire/ext_conf_in_vm_metadata.xml (+29/-0)
tests/data/wire/ext_conf_internalversion.xml (+9/-7)
tests/data/wire/ext_conf_invalid_and_valid_handlers.xml (+35/-0)
tests/data/wire/ext_conf_invalid_vm_metadata.xml (+29/-0)
tests/data/wire/ext_conf_missing_family.xml (+15/-14)
tests/data/wire/ext_conf_missing_requested_version.xml (+39/-0)
tests/data/wire/ext_conf_multiple_extensions.xml (+13/-32)
tests/data/wire/ext_conf_no_extensions-block_blob.xml (+13/-0)
tests/data/wire/ext_conf_no_extensions-no_status_blob.xml (+12/-0)
tests/data/wire/ext_conf_no_extensions-page_blob.xml (+25/-0)
tests/data/wire/ext_conf_no_public.xml (+25/-24)
tests/data/wire/ext_conf_no_settings.xml (+24/-23)
tests/data/wire/ext_conf_requested_version.xml (+29/-0)
tests/data/wire/ext_conf_required_features.xml (+41/-0)
tests/data/wire/ext_conf_sequencing.xml (+9/-7)
tests/data/wire/ext_conf_settings_case_mismatch.xml (+57/-0)
tests/data/wire/ext_conf_upgradeguid.xml (+7/-5)
tests/data/wire/ga_manifest.xml (+10/-31)
tests/data/wire/ga_manifest_no_upgrade.xml (+21/-21)
tests/data/wire/goal_state.xml (+7/-7)
tests/data/wire/goal_state_no_certs.xml (+27/-0)
tests/data/wire/goal_state_no_ext.xml (+6/-5)
tests/data/wire/goal_state_noop.xml (+14/-0)
tests/data/wire/goal_state_remote_access.xml (+9/-8)
tests/data/wire/in_vm_artifacts_profile.json (+1/-0)
tests/data/wire/invalid_config/ext_conf_multiple_depends_on_for_single_handler.xml (+45/-0)
tests/data/wire/invalid_config/ext_conf_multiple_runtime_settings_same_plugin.xml (+31/-0)
tests/data/wire/invalid_config/ext_conf_multiple_settings_for_same_handler.xml (+33/-0)
tests/data/wire/invalid_config/ext_conf_plugin_settings_version_mismatch.xml (+31/-0)
tests/data/wire/invalid_config/ext_conf_single_and_multi_config_settings_same_plugin.xml (+31/-0)
tests/data/wire/manifest.xml (+16/-16)
tests/data/wire/manifest_deletion.xml (+1/-1)
tests/data/wire/multi-config/ext_conf_mc_disabled_extensions.xml (+84/-0)
tests/data/wire/multi-config/ext_conf_mc_update_extensions.xml (+75/-0)
tests/data/wire/multi-config/ext_conf_multi_config_no_dependencies.xml (+75/-0)
tests/data/wire/multi-config/ext_conf_with_disabled_multi_config.xml (+129/-0)
tests/data/wire/multi-config/ext_conf_with_multi_config.xml (+131/-0)
tests/data/wire/multi-config/ext_conf_with_multi_config_dependencies.xml (+99/-0)
tests/data/wire/trans_cert (+17/-17)
tests/data/wire/trans_prv (+26/-26)
tests/data/wire/trans_pub (+7/-7)
tests/distro/test_resourceDisk.py (+1/-1)
tests/distro/test_scvmm.py (+6/-5)
tests/ga/extension_emulator.py (+373/-0)
tests/ga/mocks.py (+119/-0)
tests/ga/test_collect_logs.py (+239/-0)
tests/ga/test_collect_telemetry_events.py (+576/-0)
tests/ga/test_env.py (+50/-50)
tests/ga/test_extension.py (+2088/-1355)
tests/ga/test_exthandlers.py (+283/-108)
tests/ga/test_exthandlers_download_extension.py (+116/-58)
tests/ga/test_exthandlers_exthandlerinstance.py (+10/-12)
tests/ga/test_monitor.py (+155/-1141)
tests/ga/test_multi_config_extension.py (+1229/-0)
tests/ga/test_periodic_operation.py (+156/-0)
tests/ga/test_remoteaccess.py (+41/-49)
tests/ga/test_remoteaccess_handler.py (+429/-446)
tests/ga/test_report_status.py (+119/-0)
tests/ga/test_send_telemetry_events.py (+430/-0)
tests/ga/test_update.py (+1913/-733)
tests/pa/test_deprovision.py (+4/-4)
tests/pa/test_provision.py (+34/-27)
tests/protocol/HttpRequestPredicates.py (+101/-0)
tests/protocol/mocks.py (+167/-0)
tests/protocol/mockwiredata.py (+260/-47)
tests/protocol/test_datacontract.py (+5/-5)
tests/protocol/test_extensions_goal_state_from_extensions_config.py (+62/-0)
tests/protocol/test_extensions_goal_state_from_vm_settings.py (+156/-0)
tests/protocol/test_goal_state.py (+545/-0)
tests/protocol/test_healthservice.py (+1/-1)
tests/protocol/test_hostplugin.py (+620/-445)
tests/protocol/test_image_info_matcher.py (+2/-1)
tests/protocol/test_imds.py (+49/-46)
tests/protocol/test_metadata_server_migration_util.py (+134/-0)
tests/protocol/test_protocol_util.py (+208/-85)
tests/protocol/test_wire.py (+864/-934)
tests/test_agent.py (+154/-17)
tests/tools.py (+87/-86)
tests/utils/cgroups_tools.py (+1/-2)
tests/utils/event_logger_tools.py (+65/-0)
tests/utils/miscellaneous_tools.py (+62/-0)
tests/utils/test_archive.py (+123/-179)
tests/utils/test_crypt_util.py (+2/-7)
tests/utils/test_extension_process_util.py (+103/-54)
tests/utils/test_file_util.py (+19/-20)
tests/utils/test_flexible_version.py (+21/-19)
tests/utils/test_network_util.py (+36/-1)
tests/utils/test_rest_util.py (+74/-55)
tests/utils/test_shell_util.py (+337/-70)
tests/utils/test_text_util.py (+32/-14)
tests_e2e/orchestrator/docker/Dockerfile (+85/-0)
tests_e2e/orchestrator/lib/agent_junit.py (+66/-0)
tests_e2e/orchestrator/lib/agent_test_loader.py (+257/-0)
tests_e2e/orchestrator/lib/agent_test_suite.py (+645/-0)
tests_e2e/orchestrator/lib/agent_test_suite_combinator.py (+249/-0)
tests_e2e/orchestrator/runbook.yml (+142/-0)
tests_e2e/orchestrator/sample_runbooks/existing_vm.yml (+143/-0)
tests_e2e/orchestrator/sample_runbooks/local_machine/hello_world.py (+32/-0)
tests_e2e/orchestrator/sample_runbooks/local_machine/local.yml (+32/-0)
tests_e2e/orchestrator/scripts/check-agent-log.py (+49/-0)
tests_e2e/orchestrator/scripts/collect-logs (+34/-0)
tests_e2e/orchestrator/scripts/get-agent-bin-path (+56/-0)
tests_e2e/orchestrator/scripts/get-agent-modules-path (+37/-0)
tests_e2e/orchestrator/scripts/get-agent-python (+59/-0)
tests_e2e/orchestrator/scripts/install-agent (+137/-0)
tests_e2e/orchestrator/scripts/install-tools (+135/-0)
tests_e2e/orchestrator/scripts/uncompress.py (+33/-0)
tests_e2e/orchestrator/scripts/unzip.py (+36/-0)
tests_e2e/pipeline/pipeline-cleanup.yml (+58/-0)
tests_e2e/pipeline/pipeline.yml (+119/-0)
tests_e2e/pipeline/scripts/execute_tests.sh (+120/-0)
tests_e2e/test_suites/agent_bvt.yml (+8/-0)
tests_e2e/test_suites/fail.yml (+5/-0)
tests_e2e/test_suites/images.yml (+94/-0)
tests_e2e/test_suites/pass.yml (+4/-0)
tests_e2e/tests/bvts/extension_operations.py (+94/-0)
tests_e2e/tests/bvts/run_command.py (+94/-0)
tests_e2e/tests/bvts/vm_access.py (+79/-0)
tests_e2e/tests/error_test.py (+32/-0)
tests_e2e/tests/fail_test.py (+33/-0)
tests_e2e/tests/lib/agent_log.py (+446/-0)
tests_e2e/tests/lib/agent_test.py (+66/-0)
tests_e2e/tests/lib/agent_test_context.py (+164/-0)
tests_e2e/tests/lib/identifiers.py (+63/-0)
tests_e2e/tests/lib/logging.py (+155/-0)
tests_e2e/tests/lib/retry.py (+59/-0)
tests_e2e/tests/lib/shell.py (+56/-0)
tests_e2e/tests/lib/ssh_client.py (+85/-0)
tests_e2e/tests/lib/virtual_machine.py (+143/-0)
tests_e2e/tests/lib/vm_extension.py (+239/-0)
tests_e2e/tests/pass_test.py (+33/-0)
Reviewer Review Type Date Requested Status
Lucas Kanashiro Pending
git-ubuntu import Pending
Review via email: mp+452920@code.launchpad.net

This proposal has been superseded by a proposal from 2023-10-31.

Commit message

added sru tests

added manpages

debian/patches: update cgroup logic to include v2

fix_systemd_networkd_lease_file_path.patch: patch osutil/ubuntu.py

debian/control: Remove isc-dhcp-client from Depends

Remove upstart config

debian/rules: stop installing SysV init files and stop running tests

disable_udev_overrides.patch: patch setup.py

debian/patches: Remove deprecated patches

debian/install: add new udev rules

debian/docs: Remove Changelog

debian/watch: Fix package version regex pattern

New upstream version 2.9.1

To post a comment you must log in.
Revision history for this message
Bryce Harrington (bryce) wrote :

Hi Calvin, it's a bit late in the release for merges, did you have an MRE or SRU bug to accompany this?

Also, I'm not spotting changes to debian/changelog, was that intentional to be excluded?

Revision history for this message
Calvin Mwadime Makokha (calvinmwadime) wrote :

Hi Bryce,

> Hi Calvin, it's a bit late in the release for merges, did you have an MRE or
> SRU bug to accompany this?
After discussion with Utkarsh, it was decided we shall revisit this MP later since I was late to raise it.

> Also, I'm not spotting changes to debian/changelog, was that intentional to be
> excluded?
There are changes made to it. I think the diff display might have been truncated; however, you can see the file in the list of modified files above.

Revision history for this message
Lucas Kanashiro (lucaskanashiro) wrote :

Hi Calvin,

Since you decided to work on this later, I'd ask you to check if there is a way to remove the review slot for ubuntu-sponsors. Otherwise, it will be kept in the general sponsorship queue and patch pilots will keep revisiting this.

Revision history for this message
Calvin Mwadime Makokha (calvinmwadime) wrote :

Hi Lucas, which is the best way to do so? I have changed its status from `needs review` to `work in progress`; will that help?

Revision history for this message
Calvin Mwadime Makokha (calvinmwadime) wrote :

Hi Lucas, there has been a change in urgency for this merge. I would like to request a review of this merge request. Changing the status to `needs review`.

8a67b96... by Calvin Mwadime Makokha

fix_systemd_networkd_lease_file_path.patch: patch osutil/ubuntu.py

Use systemd_networkd for newer ubuntu servers

7d40672... by Calvin Mwadime Makokha

changelog: update version

0b1a8de... by Calvin Mwadime Makokha

debian/patches: update cgroup logic to include v2

Properly handle systemd using cgroup v2

c7bd079... by Calvin Mwadime Makokha

changelog update

dbbfc63... by Calvin Mwadime Makokha

Add manpages

fbe4dd5... by Calvin Mwadime Makokha

changelog update

1d131ad... by Calvin Mwadime Makokha

Added sru tests

9c007ee... by Calvin Mwadime Makokha

changelog update

Unmerged commits

9c007ee... by Calvin Mwadime Makokha

changelog update

1d131ad... by Calvin Mwadime Makokha

Added sru tests

fbe4dd5... by Calvin Mwadime Makokha

changelog update

dbbfc63... by Calvin Mwadime Makokha

Add manpages

c7bd079... by Calvin Mwadime Makokha

changelog update

0b1a8de... by Calvin Mwadime Makokha

debian/patches: update cgroup logic to include v2

Properly handle systemd using cgroup v2

7d40672... by Calvin Mwadime Makokha

changelog: update version

8a67b96... by Calvin Mwadime Makokha

fix_systemd_networkd_lease_file_path.patch: patch osutil/ubuntu.py

Use systemd_networkd for newer ubuntu servers

95c00e8... by Calvin Mwadime Makokha

debian/control: Remove isc-dhcp-client from Depends

9f935bf... by Calvin Mwadime Makokha

Remove upstart config

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
diff --git a/.flake8 b/.flake8
deleted file mode 100644
index 63303c3..0000000
--- a/.flake8
+++ /dev/null
@@ -1,32 +0,0 @@
1#
2# The project did not use flake8 since inception so there are a number
3# of time-consuming flake8-identified improvements that are just a lot
4# of busy work. Each of these should be disabled and code cleaned up.
5#
6# W503: Line break occurred before a binary operator
7# W504: Line break occurred after a binary operator
8# E126: Continuation line over-indented for hanging indent
9# E127: Continuation line over-indented for visual indent
10# E128: Continuation line under-indented for visual indent
11# E201: Whitespace after '('
12# E202: Whitespace before ')'
13# E203: Whitespace before ':'
14# E221: Multiple spaces before operator
15# E225: Missing whitespace around operator
16# E226: Missing whitespace around arithmetic operator
17# E231: Missing whitespace after ',', ';', or ':'
18# E261: At least two spaces before inline comment
19# E265: Block comment should start with '# '
20# E302: Expected 2 blank lines, found 0
21# E501: Line too long (xx > yy characters)
22# E502: The backslash is redundant between brackets
23# F401: Module imported but unused
24# F403: 'from module import *' used; unable to detect undefined names
25# F405: Name may be undefined, or defined from star imports: module
26#
27
28[flake8]
29ignore = W503,W504,E126,E127,E128,E201,E202,E203,E221,E225,E226,E231,E261,E265,E302,E501,E502,F401,F403,F405
30exclude = .git,__pycache__,docs/source/conf.py,old,build,dist,tests
31max-complexity = 30
32max-line-length = 120
33\ No newline at end of file0\ No newline at end of file
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
index edfa1e6..fdcc07c 100644
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -14,9 +14,8 @@ This will expedite the process of getting your pull request merged and avoid ext
14### PR information14### PR information
15- [ ] The title of the PR is clear and informative.15- [ ] The title of the PR is clear and informative.
16- [ ] There are a small number of commits, each of which has an informative message. This means that previously merged commits do not appear in the history of the PR. For information on cleaning up the commits in your pull request, [see this page](https://github.com/Azure/azure-powershell/blob/master/documentation/development-docs/cleaning-up-commits.md).16- [ ] There are a small number of commits, each of which has an informative message. This means that previously merged commits do not appear in the history of the PR. For information on cleaning up the commits in your pull request, [see this page](https://github.com/Azure/azure-powershell/blob/master/documentation/development-docs/cleaning-up-commits.md).
17- [ ] Except for special cases involving multiple contributors, the PR is started from a fork of the main repository, not a branch.
18- [ ] If applicable, the PR references the bug/issue that it fixes in the description.17- [ ] If applicable, the PR references the bug/issue that it fixes in the description.
19- [ ] New Unit tests were added for the changes made and Travis.CI is passing.18- [ ] New Unit tests were added for the changes made
2019
21### Quality of Code and Contribution Guidelines20### Quality of Code and Contribution Guidelines
22- [ ] I have read the [contribution guidelines](https://github.com/Azure/WALinuxAgent/blob/master/.github/CONTRIBUTING.md).21- [ ] I have read the [contribution guidelines](https://github.com/Azure/WALinuxAgent/blob/master/.github/CONTRIBUTING.md).
23\ No newline at end of file22\ No newline at end of file
diff --git a/.github/codecov.yml b/.github/codecov.yml
new file mode 100644
index 0000000..77707aa
--- /dev/null
+++ b/.github/codecov.yml
@@ -0,0 +1,2 @@
1github_checks:
2 annotations: false
diff --git a/.github/workflows/ci_pr.yml b/.github/workflows/ci_pr.yml
new file mode 100644
index 0000000..e559268
--- /dev/null
+++ b/.github/workflows/ci_pr.yml
@@ -0,0 +1,128 @@
1name: CI Unit tests
2
3on:
4 push:
5 branches: [ "*" ]
6 pull_request:
7 branches: [ "*" ]
8 workflow_dispatch:
9
10jobs:
11 test-legacy-python-versions:
12
13 strategy:
14 fail-fast: false
15 matrix:
16 include:
17 - python-version: 2.6
18 - python-version: 3.4
19
20 name: "Python ${{ matrix.python-version }} Unit Tests"
21 runs-on: ubuntu-20.04
22 container:
23 image: ubuntu:16.04
24 volumes:
25 - /home/waagent:/home/waagent
26 defaults:
27 run:
28 shell: bash -l {0}
29
30 env:
31 NOSEOPTS: "--verbose"
32
33 steps:
34 - uses: actions/checkout@v3
35
36 - name: Install Python ${{ matrix.python-version }}
37 run: |
38 apt-get update
39 apt-get install -y curl bzip2 sudo python3
40 curl https://dcrdata.blob.core.windows.net/python/python-${{ matrix.python-version }}.tar.bz2 -o python-${{ matrix.python-version }}.tar.bz2
41 sudo tar xjvf python-${{ matrix.python-version }}.tar.bz2 --directory /
42
43 - name: Test with nosetests
44 run: |
45 if [[ ${{ matrix.python-version }} == 2.6 ]]; then
46 source /home/waagent/virtualenv/python2.6.9/bin/activate
47 else
48 source /home/waagent/virtualenv/python3.4.8/bin/activate
49 fi
50 ./ci/nosetests.sh
51 exit $?
52
53 test-current-python-versions:
54
55 strategy:
56 fail-fast: false
57 matrix:
58 include:
59
60 - python-version: 2.7
61 PYLINTOPTS: "--rcfile=ci/2.7.pylintrc --ignore=tests_e2e,makepkg.py"
62
63 - python-version: 3.5
64 PYLINTOPTS: "--rcfile=ci/3.6.pylintrc --ignore=tests_e2e,makepkg.py"
65
66 - python-version: 3.6
67 PYLINTOPTS: "--rcfile=ci/3.6.pylintrc --ignore=tests_e2e"
68
69 - python-version: 3.7
70 PYLINTOPTS: "--rcfile=ci/3.6.pylintrc --ignore=tests_e2e"
71
72 - python-version: 3.8
73 PYLINTOPTS: "--rcfile=ci/3.6.pylintrc --ignore=tests_e2e"
74
75 - python-version: 3.9
76 PYLINTOPTS: "--rcfile=ci/3.6.pylintrc"
77 additional-nose-opts: "--with-coverage --cover-erase --cover-inclusive --cover-branches --cover-package=azurelinuxagent"
78
79 name: "Python ${{ matrix.python-version }} Unit Tests"
80 runs-on: ubuntu-20.04
81
82 env:
83 PYLINTOPTS: ${{ matrix.PYLINTOPTS }}
84 PYLINTFILES: "azurelinuxagent setup.py makepkg.py tests tests_e2e"
85 NOSEOPTS: "--with-timer ${{ matrix.additional-nose-opts }}"
86 PYTHON_VERSION: ${{ matrix.python-version }}
87
88 steps:
89
90 - name: Checkout WALinuxAgent repo
91 uses: actions/checkout@v3
92
93 - name: Setup Python ${{ matrix.python-version }}
94 uses: actions/setup-python@v4
95 with:
96 python-version: ${{ matrix.python-version }}
97
98 - name: Install dependencies
99 id: install-dependencies
100 run: |
101 sudo env "PATH=$PATH" python -m pip install --upgrade pip
102 sudo env "PATH=$PATH" pip install -r requirements.txt
103 sudo env "PATH=$PATH" pip install -r test-requirements.txt
104
105 - name: Run pylint
106 run: |
107 pylint $PYLINTOPTS --jobs=0 $PYLINTFILES
108
109 - name: Test with nosetests
110 if: success() || (failure() && steps.install-dependencies.outcome == 'success')
111 run: |
112 ./ci/nosetests.sh
113 exit $?
114
115 - name: Compile Coverage
116 if: matrix.python-version == 3.9
117 run: |
118 echo looking for coverage files :
119 ls -alh | grep -i coverage
120 sudo env "PATH=$PATH" coverage combine coverage.*.data
121 sudo env "PATH=$PATH" coverage xml
122 sudo env "PATH=$PATH" coverage report
123
124 - name: Upload Coverage
125 if: matrix.python-version == 3.9
126 uses: codecov/codecov-action@v2
127 with:
128 file: ./coverage.xml
0\ No newline at end of file129\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index 0a31340..fd64d33 100644
--- a/.gitignore
+++ b/.gitignore
@@ -17,8 +17,6 @@ develop-eggs/
17dist/17dist/
18downloads/18downloads/
19eggs/19eggs/
20lib/
21lib64/
22parts/20parts/
23sdist/21sdist/
24var/22var/
@@ -92,3 +90,4 @@ ENV/
9290
93# pyenv91# pyenv
94.python-version92.python-version
93.vscode/
diff --git a/.travis.yml b/.travis.yml
95deleted file mode 10064494deleted file mode 100644
index fa672d3..0000000
--- a/.travis.yml
+++ /dev/null
@@ -1,43 +0,0 @@
1---
2os: linux
3dist: xenial
4language: python
5env:
6 - NOSEOPTS="--verbose" SETUPOPTS=""
7 # Add SETUPOPTS="check flake8" to enable flake8 checks
8
9matrix:
10 # exclude the default "python" build - we're being specific here...
11 exclude:
12 - python:
13 env:
14 - NOSEOPTS="" SETUPOPTS="check flake8"
15
16 include:
17 - python: 2.6
18 dist: trusty
19 env:
20 - NOSEOPTS="--verbose" SETUPOPTS=""
21 - python: 2.7
22 - python: 3.4
23 - python: 3.6
24 - python: 3.7
25 env:
26 - >-
27 NOSEOPTS="--verbose --with-coverage --cover-inclusive
28 --cover-min-percentage=60 --cover-branches
29 --cover-package=azurelinuxagent --cover-xml"
30 SETUPOPTS=""
31
32install:
33 - pip install -r requirements.txt
34 - pip install -r test-requirements.txt
35
36script:
37 # future: - pylint setup.py makepkg.py azurelinuxagent/
38 - nosetests $NOSEOPTS --attr '!requires_sudo' tests
39 - sudo env "PATH=$PATH" nosetests $NOSEOPTS --verbose --attr 'requires_sudo' tests
40 - if [ ! -z "$SETUPOPTS" ]; then /usr/bin/env python setup.py $SETUPOPTS; fi
41
42after_success:
43 - if [[ $TRAVIS_PYTHON_VERSION == 3.7 ]]; then codecov; fi
44\ No newline at end of file0\ No newline at end of file
diff --git a/CODEOWNERS b/CODEOWNERS
index a8f2d5d..8707e60 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -1,3 +1,4 @@
11
1# See https://help.github.com/articles/about-codeowners/2# See https://help.github.com/articles/about-codeowners/
2# for more info about CODEOWNERS file3# for more info about CODEOWNERS file
34
@@ -9,6 +10,7 @@
9# when there are requests for changes in the provisioning agent. For any10# when there are requests for changes in the provisioning agent. For any
10# questions, please feel free to reach out to thstring@microsoft.com.11# questions, please feel free to reach out to thstring@microsoft.com.
11/azurelinuxagent/pa/ @trstringer @anhvoms12/azurelinuxagent/pa/ @trstringer @anhvoms
13/tests/pa/ @trstringer @anhvoms
1214
13#15#
14# RDMA16# RDMA
@@ -19,4 +21,4 @@
19#21#
20# Linux Agent team22# Linux Agent team
21#23#
22* @narrieta @vrdmr @pgombar @larohra24* @narrieta @ZhidongPeng @nagworld9 @maddieford
diff --git a/Changelog b/Changelog
23deleted file mode 10064425deleted file mode 100644
index da68890..0000000
--- a/Changelog
+++ /dev/null
@@ -1,38 +0,0 @@
1WALinuxAgent Changelog
2|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
3
4Refer to releases WALinuxAgent release page: https://github.com/Azure/WALinuxAgent/releases for detailed changelog after v2.2.0
5
612 August 2016, v2.1.6
7 . Improved RDMA support
8 . Extension state migration
9 . Alpine Linux support
10 . Fixes for #347, #351, #353
11
1215 July 2016, v2.1.5
13 . Goal state processing extension
14 . Multi-nic improvements
15 . Bug fixes for #145, #141, #133, #116, #187, #169, #104, #127, #163,
16 #190, #185, #174
17
1809 Mar 2016, WALinuxAgent 2.1.4
19 . Add support for FreeBSD
20 . Fix a bug for internal extension version resolving
21
2229 Jan 2016, WALinuxAgent 2.1.3
23 . Fixed endpoint probing for Azure Stack
24 . Multiple fixes for extension handling
25
2607 Dec 2015, WALinuxAgent 2.1.2
27 . Multiple fixes for extension handling and provisioning
28
2907 Aug 2015, WALinuxAgent 2.1.1
30 . Support python3
31 . Fixed bugs for metadata protocol
32 . Fixed a few pylint warnings
33 . Enabled travis-ci
34
35|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
3601 Jul 2015, WALinuxAgent 2.1.0
37 . Divide waagent into different modules
38
diff --git a/README.md b/README.md
index 0069d46..ae6a851 100644
--- a/README.md
+++ b/README.md
@@ -1,31 +1,15 @@
1
1# Microsoft Azure Linux Agent2# Microsoft Azure Linux Agent
23
3## Develop branch status4## Linux distributions support
45
5[![Travis CI](https://travis-ci.org/Azure/WALinuxAgent.svg?branch=develop)](https://travis-ci.org/Azure/WALinuxAgent/branches)6Our daily automation tests most of the [Linux distributions supported by Azure](https://docs.microsoft.com/en-us/azure/virtual-machines/linux/endorsed-distros); the Agent can be
6[![CodeCov](https://codecov.io/gh/Azure/WALinusAgent/branch/develop/graph/badge.svg)](https://codecov.io/gh/Azure/WALinuxAgent/branch/develop)7used on other distributions as well, but development, testing and support for those are done by the open source community.
78
8Each badge below represents our basic validation tests for an image, which are executed several times each day. These include provisioning, user account, disk, extension and networking scenarios.9Testing is done using the develop branch, which can be unstable. For a stable build please use the master branch instead.
910
10Note: These badges represent testing to our develop branch which might not be stable. For a stable build please use master branch instead. 11[![CodeCov](https://codecov.io/gh/Azure/WALinuxAgent/branch/develop/graph/badge.svg)](https://codecov.io/gh/Azure/WALinuxAgent/branch/develop)
1112
12Image | Status |
13------|--------|
14Canonical UbuntuServer 14.04.5-LTS|![badge](https://dcrbadges.blob.core.windows.net/scenarios/Canonical_UbuntuServer_14.04.5-LTS__agent--bvt.svg)
15Canonical UbuntuServer 14.04.5-DAILY-LTS|![badge](https://dcrbadges.blob.core.windows.net/scenarios/Canonical_UbuntuServer_14.04.5-DAILY-LTS__agent--bvt.svg)
16Canonical UbuntuServer 16.04-LTS|![badge](https://dcrbadges.blob.core.windows.net/scenarios/Canonical_UbuntuServer_16.04-LTS__agent--bvt.svg)
17Canonical UbuntuServer 16.04-DAILY-LTS|![badge](https://dcrbadges.blob.core.windows.net/scenarios/Canonical_UbuntuServer_16.04-DAILY-LTS__agent--bvt.svg)
18Canonical UbuntuServer 18.04-LTS|![badge](https://dcrbadges.blob.core.windows.net/scenarios/Canonical_UbuntuServer_18.04-LTS__agent--bvt.svg)
19Canonical UbuntuServer 18.04-DAILY-LTS|![badge](https://dcrbadges.blob.core.windows.net/scenarios/Canonical_UbuntuServer_18.04-DAILY-LTS__agent--bvt.svg)
20Credativ Debian 8|![badge](https://dcrbadges.blob.core.windows.net/scenarios/Credativ_Debian_8__agent--bvt.svg)
21Credativ Debian 8-DAILY|![badge](https://dcrbadges.blob.core.windows.net/scenarios/Credativ_Debian_8-DAILY__agent--bvt.svg)
22Credativ Debian 9|![badge](https://dcrbadges.blob.core.windows.net/scenarios/Credativ_Debian_9__agent--bvt.svg)
23Credativ Debian 9-DAILY|![badge](https://dcrbadges.blob.core.windows.net/scenarios/Credativ_Debian_9-DAILY__agent--bvt.svg)
24OpenLogic CentOS 6.9|![badge](https://dcrbadges.blob.core.windows.net/scenarios/OpenLogic_CentOS_6.9__agent--bvt.svg)
25OpenLogic CentOS 7.4|![badge](https://dcrbadges.blob.core.windows.net/scenarios/OpenLogic_CentOS_7.4__agent--bvt.svg)
26RedHat RHEL 6.9|![badge](https://dcrbadges.blob.core.windows.net/scenarios/RedHat_RHEL_6.9__agent--bvt.svg)
27RedHat RHEL 7-RAW|![badge](https://dcrbadges.blob.core.windows.net/scenarios/RedHat_RHEL_7-RAW__agent--bvt.svg)
28SUSE SLES 12-SP3|![badge](https://dcrbadges.blob.core.windows.net/scenarios/SUSE_SLES_12-SP3__agent--bvt.svg)
2913
30## Introduction14## Introduction
3115
@@ -49,7 +33,6 @@ functionality for Linux IaaS deployments:
4933
50* Kernel34* Kernel
51 * Configure virtual NUMA (disable for kernel <2.6.37)35 * Configure virtual NUMA (disable for kernel <2.6.37)
52 * Consume Hyper-V entropy for /dev/random
53 * Configure SCSI timeouts for the root device (which could be remote)36 * Configure SCSI timeouts for the root device (which could be remote)
5437
55* Diagnostics38* Diagnostics
@@ -79,13 +62,15 @@ The agent will use an HTTP proxy if provided via the `http_proxy` (for `http` re
79`https_proxy` (for `https` requests) environment variables. The `HttpProxy.Host` and62`https_proxy` (for `https` requests) environment variables. The `HttpProxy.Host` and
80`HttpProxy.Port` configuration variables (see below), if used, will override the environment63`HttpProxy.Port` configuration variables (see below), if used, will override the environment
81settings. Due to limitations of Python, the agent *does not* support HTTP proxies requiring64settings. Due to limitations of Python, the agent *does not* support HTTP proxies requiring
82authentication.65authentication. Note that when the agent service is managed by systemd, environment variables
66such as `http_proxy` and `https_proxy` should be defined using one of the mechanisms provided by
67systemd (e.g. by using Environment or EnvironmentFile in the service file).
8368
84## Requirements69## Requirements
8570
86The following systems have been tested and are known to work with the Azure71The following systems have been tested and are known to work with the Azure
87Linux Agent. Please note that this list may differ from the official list72Linux Agent. Please note that this list may differ from the official list
88of supported systems on the Microsoft Azure Platform as described [here](http://support.microsoft.com/kb/2805216).73of supported systems on the Microsoft Azure Platform as described [here](https://docs.microsoft.com/en-us/azure/virtual-machines/linux/endorsed-distros).
8974
90Waagent depends on some system packages in order to function properly:75Waagent depends on some system packages in order to function properly:
9176
@@ -109,6 +94,12 @@ For more advanced installation options, such as installing to custom locations o
109 sudo python setup.py install --register-service94 sudo python setup.py install --register-service
110```95```
11196
97For Python 3, use:
98
99```bash
100 sudo python3 setup.py install --register-service
101```
102
112You can view more installation options by running:103You can view more installation options by running:
113104
114```bash105```bash
@@ -177,6 +168,8 @@ For CoreOS, use:
177168
178`-start`: Run waagent as a background process169`-start`: Run waagent as a background process
179170
171`-collect-logs [-full]`: Runs the log collector utility that collects relevant agent logs for debugging and stores them in the agent folder on disk. Exact location will be shown when run. Use flag `-full` for more exhaustive log collection.
172
180## Configuration173## Configuration
181174
182A configuration file (/etc/waagent.conf) controls the actions of waagent. Blank lines and lines whose first character is a `#` are ignored (end-of-line comments are *not* supported).175A configuration file (/etc/waagent.conf) controls the actions of waagent. Blank lines and lines whose first character is a `#` are ignored (end-of-line comments are *not* supported).
@@ -185,6 +178,7 @@ A sample configuration file is shown below:
185178
186```yml179```yml
187Extensions.Enabled=y180Extensions.Enabled=y
181Extensions.GoalStatePeriod=6
188Provisioning.Agent=auto182Provisioning.Agent=auto
189Provisioning.DeleteRootPassword=n183Provisioning.DeleteRootPassword=n
190Provisioning.RegenerateSshHostKeyPair=y184Provisioning.RegenerateSshHostKeyPair=y
@@ -202,6 +196,8 @@ ResourceDisk.EnableSwap=n
202ResourceDisk.EnableSwapEncryption=n196ResourceDisk.EnableSwapEncryption=n
203ResourceDisk.SwapSizeMB=0197ResourceDisk.SwapSizeMB=0
204Logs.Verbose=n198Logs.Verbose=n
199Logs.Collect=y
200Logs.CollectPeriod=3600
205OS.AllowHTTP=n201OS.AllowHTTP=n
206OS.RootDeviceScsiTimeout=300202OS.RootDeviceScsiTimeout=300
207OS.EnableFIPS=n203OS.EnableFIPS=n
@@ -210,8 +206,6 @@ OS.SshClientAliveInterval=180
210OS.SshDir=/etc/ssh206OS.SshDir=/etc/ssh
211HttpProxy.Host=None207HttpProxy.Host=None
212HttpProxy.Port=None208HttpProxy.Port=None
213CGroups.EnforceLimits=y
214CGroups.Excluded=customscript,runcommand
215```209```
216210
217The various configuration options are described in detail below. Configuration211The various configuration options are described in detail below. Configuration
@@ -238,6 +232,32 @@ without the agent. In order to do that, the `provisionVMAgent` flag must be set
238provisioning time, via whichever API is being used. We will provide more details on232provisioning time, via whichever API is being used. We will provide more details on
239this on our wiki when it is generally available. 233this on our wiki when it is generally available.
240234
235#### __Extensions.GoalStatePeriod__
236
237_Type: Integer_
238_Default: 6_
239
240How often to poll for new goal states (in seconds) and report the status of the VM
241and extensions. Goal states describe the desired state of the extensions on the VM.
242
243_Note_: setting up this parameter to more than a few minutes can make the state of
244the VM be reported as unresponsive/unavailable on the Azure portal. Also, this
245setting affects how fast the agent starts executing extensions.
246
247#### __AutoUpdate.Enabled__
248
249_Type: Boolean_
250_Default: y_
251
252Enables auto-update of the Extension Handler. The Extension Handler is responsible
253for managing extensions and reporting VM status. The core functionality of the agent
254is contained in the Extension Handler, and we encourage users to enable this option
255in order to maintain an up to date version.
256
257On most distros the default value is 'y'.
258
259For more information on the agent version, see our [FAQ](https://github.com/Azure/WALinuxAgent/wiki/FAQ#what-does-goal-state-agent-mean-in-waagent---version-output).
260
241#### __Provisioning.Agent__261#### __Provisioning.Agent__
242262
243_Type: String_263_Type: String_
@@ -261,7 +281,22 @@ _Note_: This configuration option has been removed and has no effect. waagent
261now auto-detects cloud-init as a provisioning agent (with an option to override281now auto-detects cloud-init as a provisioning agent (with an option to override
262with `Provisioning.Agent`).282with `Provisioning.Agent`).
263283
264#### __Provisioning.UseCloudInit__ (*removed in 2.2.45*)284#### __Provisioning.MonitorHostName__
285
286_Type: Boolean_
287_Default: n_
288
289Monitor host name changes and publish changes via DHCP requests.
290
291#### __Provisioning.MonitorHostNamePeriod__
292
293_Type: Integer_
294_Default: 30_
295
296How often to monitor host name changes (in seconds). This setting is ignored if
297MonitorHostName is not set.
298
299#### __Provisioning.UseCloudInit__
265300
266_Type: Boolean_ 301_Type: Boolean_
267_Default: n_302_Default: n_
@@ -397,7 +432,7 @@ system swap space.
397_Type: Boolean_ 432_Type: Boolean_
398_Default: n_433_Default: n_
399434
400If set, the swap file (/swapfile) is mounted as an encrypted filesystem.435If set, the swap file (/swapfile) is mounted as an encrypted filesystem (flag supported only on FreeBSD.)
401436
402#### __ResourceDisk.SwapSizeMB__437#### __ResourceDisk.SwapSizeMB__
403438
@@ -414,6 +449,25 @@ _Default: n_
414If set, log verbosity is boosted. Waagent logs to /var/log/waagent.log and449If set, log verbosity is boosted. Waagent logs to /var/log/waagent.log and
415leverages the system logrotate functionality to rotate logs.450leverages the system logrotate functionality to rotate logs.
416451
452
453#### __Logs.Collect__
454
455_Type: Boolean_
456_Default: y_
457
458If set, agent logs will be periodically collected and uploaded to a secure location for improved supportability.
459
460NOTE: This feature relies on the agent's resource usage features (cgroups); this flag will not take effect on any distro not supported.
461
462#### __Logs.CollectPeriod__
463
464_Type: Integer_
465_Default: 3600_
466
467This configures how frequently to collect and upload logs. Default is each hour.
468
469NOTE: This only takes effect if the Logs.Collect option is enabled.
470
417#### __OS.AllowHTTP__471#### __OS.AllowHTTP__
418472
419_Type: Boolean_ 473_Type: Boolean_
@@ -442,6 +496,14 @@ OpenSSL commands. This signals OpenSSL to use any installed FIPS-compliant libra
442Note that the agent itself has no FIPS-specific code. _If no FIPS-compliant certificates are496Note that the agent itself has no FIPS-specific code. _If no FIPS-compliant certificates are
443installed, then enabling this option will cause all OpenSSL commands to fail._497installed, then enabling this option will cause all OpenSSL commands to fail._
444498
499#### __OS.MonitorDhcpClientRestartPeriod__
500
501_Type: Integer_
502_Default: 30_
503
504The agent monitors restarts of the DHCP client and restores network rules when they happen. This
505setting determines how often (in seconds) to monitor for restarts.
506
445#### __OS.RootDeviceScsiTimeout__507#### __OS.RootDeviceScsiTimeout__
446508
447_Type: Integer_ 509_Type: Integer_
@@ -450,6 +512,14 @@ _Default: 300_
450This configures the SCSI timeout in seconds on the root device. If not set, the512This configures the SCSI timeout in seconds on the root device. If not set, the
451system defaults are used.513system defaults are used.
452514
515#### __OS.RootDeviceScsiTimeoutPeriod__
516
517_Type: Integer_
518_Default: 30_
519
520How often to set the SCSI timeout on the root device (in seconds). This setting is
521ignored if RootDeviceScsiTimeout is not set.
522
453#### __OS.OpensslPath__523#### __OS.OpensslPath__
454524
455_Type: String_ 525_Type: String_
@@ -458,6 +528,13 @@ _Default: None_
458This can be used to specify an alternate path for the openssl binary to use for528This can be used to specify an alternate path for the openssl binary to use for
459cryptographic operations.529cryptographic operations.
460530
531#### __OS.RemovePersistentNetRulesPeriod__
532_Type: Integer_
533_Default: 30_
534
535How often to remove the udev rules for persistent network interface names (75-persistent-net-generator.rules
536and /etc/udev/rules.d/70-persistent-net.rules) (in seconds).
537
461#### __OS.SshClientAliveInterval__538#### __OS.SshClientAliveInterval__
462539
463_Type: Integer_ 540_Type: Integer_
diff --git a/SECURITY.md b/SECURITY.md
464new file mode 100644541new file mode 100644
index 0000000..e138ec5
--- /dev/null
+++ b/SECURITY.md
@@ -0,0 +1,41 @@
1<!-- BEGIN MICROSOFT SECURITY.MD V0.0.8 BLOCK -->
2
3## Security
4
5Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/).
6
7If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/opensource/security/definition), please report it to us as described below.
8
9## Reporting Security Issues
10
11**Please do not report security vulnerabilities through public GitHub issues.**
12
13Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/opensource/security/create-report).
14
15If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey).
16
17You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc).
18
19Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:
20
21 * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
22 * Full paths of source file(s) related to the manifestation of the issue
23 * The location of the affected source code (tag/branch/commit or direct URL)
24 * Any special configuration required to reproduce the issue
25 * Step-by-step instructions to reproduce the issue
26 * Proof-of-concept or exploit code (if possible)
27 * Impact of the issue, including how an attacker might exploit the issue
28
29This information will help us triage your report more quickly.
30
31If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/opensource/security/bounty) page for more details about our active programs.
32
33## Preferred Languages
34
35We prefer all communications to be in English.
36
37## Policy
38
39Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/opensource/security/cvd).
40
41<!-- END MICROSOFT SECURITY.MD BLOCK -->
diff --git a/azurelinuxagent/agent.py b/azurelinuxagent/agent.py
index 6e65084..8c30348 100644
--- a/azurelinuxagent/agent.py
+++ b/azurelinuxagent/agent.py
@@ -24,21 +24,47 @@ Module agent
24from __future__ import print_function24from __future__ import print_function
2525
26import os26import os
27import sys
28import re27import re
29import subprocess28import subprocess
29import sys
30import threading30import threading
31import traceback31from azurelinuxagent.common import cgroupconfigurator, logcollector
32from azurelinuxagent.common.cgroupapi import SystemdCgroupsApi
3233
33import azurelinuxagent.common.logger as logger
34import azurelinuxagent.common.event as event
35import azurelinuxagent.common.conf as conf34import azurelinuxagent.common.conf as conf
36from azurelinuxagent.common.version import AGENT_NAME, AGENT_LONG_VERSION, \35import azurelinuxagent.common.event as event
37 DISTRO_NAME, DISTRO_VERSION, \36import azurelinuxagent.common.logger as logger
38 PY_VERSION_MAJOR, PY_VERSION_MINOR, \37from azurelinuxagent.common.future import ustr
39 PY_VERSION_MICRO, GOAL_STATE_AGENT_VERSION38from azurelinuxagent.common.logcollector import LogCollector, OUTPUT_RESULTS_FILE_PATH
40from azurelinuxagent.common.osutil import get_osutil39from azurelinuxagent.common.osutil import get_osutil
41from azurelinuxagent.common.utils import fileutil40from azurelinuxagent.common.utils import fileutil, textutil
41from azurelinuxagent.common.utils.flexible_version import FlexibleVersion
42from azurelinuxagent.common.utils.networkutil import AddFirewallRules
43from azurelinuxagent.common.version import AGENT_NAME, AGENT_LONG_VERSION, AGENT_VERSION, \
44 DISTRO_NAME, DISTRO_VERSION, \
45 PY_VERSION_MAJOR, PY_VERSION_MINOR, \
46 PY_VERSION_MICRO, GOAL_STATE_AGENT_VERSION, \
47 get_daemon_version, set_daemon_version
48from azurelinuxagent.ga.collect_logs import CollectLogsHandler, get_log_collector_monitor_handler
49from azurelinuxagent.pa.provision.default import ProvisionHandler
50
51
52class AgentCommands(object):
53 """
54 This is the list of all commands that the Linux Guest Agent supports
55 """
56 DeprovisionUser = "deprovision+user"
57 Deprovision = "deprovision"
58 Daemon = "daemon"
59 Start = "start"
60 RegisterService = "register-service"
61 RunExthandlers = "run-exthandlers"
62 Version = "version"
63 ShowConfig = "show-configuration"
64 Help = "help"
65 CollectLogs = "collect-logs"
66 SetupFirewall = "setup-firewall"
67 Provision = "provision"
4268
4369
44class Agent(object):70class Agent(object):
@@ -49,24 +75,24 @@ class Agent(object):
49 self.conf_file_path = conf_file_path75 self.conf_file_path = conf_file_path
50 self.osutil = get_osutil()76 self.osutil = get_osutil()
5177
52 #Init stdout log78 # Init stdout log
53 level = logger.LogLevel.VERBOSE if verbose else logger.LogLevel.INFO79 level = logger.LogLevel.VERBOSE if verbose else logger.LogLevel.INFO
54 logger.add_logger_appender(logger.AppenderType.STDOUT, level)80 logger.add_logger_appender(logger.AppenderType.STDOUT, level)
5581
56 #Init config82 # Init config
57 conf_file_path = self.conf_file_path \83 conf_file_path = self.conf_file_path \
58 if self.conf_file_path is not None \84 if self.conf_file_path is not None \
59 else self.osutil.get_agent_conf_file_path()85 else self.osutil.get_agent_conf_file_path()
60 conf.load_conf_from_file(conf_file_path)86 conf.load_conf_from_file(conf_file_path)
6187
62 #Init log88 # Init log
63 verbose = verbose or conf.get_logs_verbose()89 verbose = verbose or conf.get_logs_verbose()
64 level = logger.LogLevel.VERBOSE if verbose else logger.LogLevel.INFO90 level = logger.LogLevel.VERBOSE if verbose else logger.LogLevel.INFO
65 logger.add_logger_appender(logger.AppenderType.FILE, level,91 logger.add_logger_appender(logger.AppenderType.FILE, level, path=conf.get_agent_log_file())
66 path="/var/log/waagent.log")92
67 if conf.get_logs_console():93 # echo the log to /dev/console if the machine will be provisioned
68 logger.add_logger_appender(logger.AppenderType.CONSOLE, level,94 if conf.get_logs_console() and not ProvisionHandler.is_provisioned():
69 path="/dev/console")95 self.__add_console_appender(level)
7096
71 if event.send_logs_to_telemetry():97 if event.send_logs_to_telemetry():
72 logger.add_logger_appender(logger.AppenderType.TELEMETRY,98 logger.add_logger_appender(logger.AppenderType.TELEMETRY,
@@ -84,22 +110,30 @@ class Agent(object):
84 "Exception occurred while creating extension "110 "Exception occurred while creating extension "
85 "log directory {0}: {1}".format(ext_log_dir, e))111 "log directory {0}: {1}".format(ext_log_dir, e))
86112
87 #Init event reporter113 # Init event reporter
114 # Note that the reporter is not fully initialized here yet. Some telemetry fields are filled with data
115 # originating from the goal state or IMDS, which requires a WireProtocol instance. Once a protocol
116 # has been established, those fields must be explicitly initialized using
117 # initialize_event_logger_vminfo_common_parameters(). Any events created before that initialization
118 # will contain dummy values on those fields.
88 event.init_event_status(conf.get_lib_dir())119 event.init_event_status(conf.get_lib_dir())
89 event_dir = os.path.join(conf.get_lib_dir(), "events")120 event_dir = os.path.join(conf.get_lib_dir(), event.EVENTS_DIRECTORY)
90 event.init_event_logger(event_dir)121 event.init_event_logger(event_dir)
91 event.enable_unhandled_err_dump("WALA")122 event.enable_unhandled_err_dump("WALA")
92123
124 def __add_console_appender(self, level):
125 logger.add_logger_appender(logger.AppenderType.CONSOLE, level, path="/dev/console")
126
93 def daemon(self):127 def daemon(self):
94 """128 """
95 Run agent daemon129 Run agent daemon
96 """130 """
131 set_daemon_version(AGENT_VERSION)
97 logger.set_prefix("Daemon")132 logger.set_prefix("Daemon")
98 threading.current_thread().setName("Daemon")133 threading.current_thread().setName("Daemon")
99 child_args = None \134 child_args = None \
100 if self.conf_file_path is None \135 if self.conf_file_path is None \
101 else "-configuration-path:{0}".format(self.conf_file_path)136 else "-configuration-path:{0}".format(self.conf_file_path)
102
103 from azurelinuxagent.daemon import get_daemon_handler137 from azurelinuxagent.daemon import get_daemon_handler
104 daemon_handler = get_daemon_handler()138 daemon_handler = get_daemon_handler()
105 daemon_handler.run(child_args=child_args)139 daemon_handler.run(child_args=child_args)
@@ -137,6 +171,21 @@ class Agent(object):
137 """171 """
138 logger.set_prefix("ExtHandler")172 logger.set_prefix("ExtHandler")
139 threading.current_thread().setName("ExtHandler")173 threading.current_thread().setName("ExtHandler")
174
175 #
176 # Agents < 2.2.53 used to echo the log to the console. Since the extension handler could have been started by
177 # one of those daemons, output a message indicating that output to the console will stop, otherwise users
178 # may think that the agent died if they noticed that output to the console stops abruptly.
179 #
180 # Feel free to remove this code if telemetry shows there are no more agents < 2.2.53 in the field.
181 #
182 if conf.get_logs_console() and get_daemon_version() < FlexibleVersion("2.2.53"):
183 self.__add_console_appender(logger.LogLevel.INFO)
184 try:
185 logger.info(u"The agent will now check for updates and then will process extensions. Output to /dev/console will be suspended during those operations.")
186 finally:
187 logger.disable_console_output()
188
140 from azurelinuxagent.ga.update import get_update_handler189 from azurelinuxagent.ga.update import get_update_handler
141 update_handler = get_update_handler()190 update_handler = get_update_handler()
142 update_handler.run(debug)191 update_handler.run(debug)
@@ -146,91 +195,175 @@ class Agent(object):
146 for k in sorted(configuration.keys()):195 for k in sorted(configuration.keys()):
147 print("{0} = {1}".format(k, configuration[k]))196 print("{0} = {1}".format(k, configuration[k]))
148197
198 def collect_logs(self, is_full_mode):
199 logger.set_prefix("LogCollector")
200
201 if is_full_mode:
202 logger.info("Running log collector mode full")
203 else:
204 logger.info("Running log collector mode normal")
205
206 # Check the cgroups unit
207 cpu_cgroup_path, memory_cgroup_path, log_collector_monitor = None, None, None
208 if CollectLogsHandler.should_validate_cgroups():
209 cgroups_api = SystemdCgroupsApi()
210 cpu_cgroup_path, memory_cgroup_path = cgroups_api.get_process_cgroup_paths("self")
149211
150def main(args=[]):212 cpu_slice_matches = (cgroupconfigurator.LOGCOLLECTOR_SLICE in cpu_cgroup_path)
213 memory_slice_matches = (cgroupconfigurator.LOGCOLLECTOR_SLICE in memory_cgroup_path)
214
215 if not cpu_slice_matches or not memory_slice_matches:
216 logger.info("The Log Collector process is not in the proper cgroups:")
217 if not cpu_slice_matches:
218 logger.info("\tunexpected cpu slice")
219 if not memory_slice_matches:
220 logger.info("\tunexpected memory slice")
221
222 sys.exit(logcollector.INVALID_CGROUPS_ERRCODE)
223
224 try:
225 log_collector = LogCollector(is_full_mode, cpu_cgroup_path, memory_cgroup_path)
226 log_collector_monitor = get_log_collector_monitor_handler(log_collector.cgroups)
227 log_collector_monitor.run()
228 archive = log_collector.collect_logs_and_get_archive()
229 logger.info("Log collection successfully completed. Archive can be found at {0} "
230 "and detailed log output can be found at {1}".format(archive, OUTPUT_RESULTS_FILE_PATH))
231 except Exception as e:
232 logger.error("Log collection completed unsuccessfully. Error: {0}".format(ustr(e)))
233 logger.info("Detailed log output can be found at {0}".format(OUTPUT_RESULTS_FILE_PATH))
234 sys.exit(1)
235 finally:
236 if log_collector_monitor is not None:
237 log_collector_monitor.stop()
238
239 @staticmethod
240 def setup_firewall(firewall_metadata):
241
242 print("Setting up firewall for the WALinux Agent with args: {0}".format(firewall_metadata))
243 try:
244 AddFirewallRules.add_iptables_rules(firewall_metadata['wait'], firewall_metadata['dst_ip'],
245 firewall_metadata['uid'])
246 print("Successfully set the firewall rules")
247 except Exception as error:
248 print("Unable to add firewall rules. Error: {0}".format(ustr(error)))
249 sys.exit(1)
250
251
252def main(args=None):
151 """253 """
152 Parse command line arguments, exit with usage() on error.254 Parse command line arguments, exit with usage() on error.
153 Invoke different methods according to different command255 Invoke different methods according to different command
154 """256 """
257 if args is None:
258 args = []
155 if len(args) <= 0:259 if len(args) <= 0:
156 args = sys.argv[1:]260 args = sys.argv[1:]
157 command, force, verbose, debug, conf_file_path = parse_args(args)261 command, force, verbose, debug, conf_file_path, log_collector_full_mode, firewall_metadata = parse_args(args)
158 if command == "version":262 if command == AgentCommands.Version:
159 version()263 version()
160 elif command == "help":264 elif command == AgentCommands.Help:
161 print(usage())265 print(usage())
162 elif command == "start":266 elif command == AgentCommands.Start:
163 start(conf_file_path=conf_file_path)267 start(conf_file_path=conf_file_path)
164 else:268 else:
165 try:269 try:
166 agent = Agent(verbose, conf_file_path=conf_file_path)270 agent = Agent(verbose, conf_file_path=conf_file_path)
167 if command == "deprovision+user":271 if command == AgentCommands.DeprovisionUser:
168 agent.deprovision(force, deluser=True)272 agent.deprovision(force, deluser=True)
169 elif command == "deprovision":273 elif command == AgentCommands.Deprovision:
170 agent.deprovision(force, deluser=False)274 agent.deprovision(force, deluser=False)
171 elif command == "provision":275 elif command == AgentCommands.Provision:
172 agent.provision()276 agent.provision()
173 elif command == "register-service":277 elif command == AgentCommands.RegisterService:
174 agent.register_service()278 agent.register_service()
175 elif command == "daemon":279 elif command == AgentCommands.Daemon:
176 agent.daemon()280 agent.daemon()
177 elif command == "run-exthandlers":281 elif command == AgentCommands.RunExthandlers:
178 agent.run_exthandlers(debug)282 agent.run_exthandlers(debug)
179 elif command == "show-configuration":283 elif command == AgentCommands.ShowConfig:
180 agent.show_configuration()284 agent.show_configuration()
181 except Exception:285 elif command == AgentCommands.CollectLogs:
286 agent.collect_logs(log_collector_full_mode)
287 elif command == AgentCommands.SetupFirewall:
288 agent.setup_firewall(firewall_metadata)
289 except Exception as e:
182 logger.error(u"Failed to run '{0}': {1}",290 logger.error(u"Failed to run '{0}': {1}",
183 command,291 command,
184 traceback.format_exc())292 textutil.format_exception(e))
293
185294
186def parse_args(sys_args):295def parse_args(sys_args):
187 """296 """
188 Parse command line arguments297 Parse command line arguments
189 """298 """
190 cmd = "help"299 cmd = AgentCommands.Help
191 force = False300 force = False
192 verbose = False301 verbose = False
193 debug = False302 debug = False
194 conf_file_path = None303 conf_file_path = None
195 for a in sys_args:304 log_collector_full_mode = False
196 m = re.match("^(?:[-/]*)configuration-path:([\w/\.\-_]+)", a)305 firewall_metadata = {
306 "dst_ip": None,
307 "uid": None,
308 "wait": ""
309 }
310
311 regex_cmd_format = "^([-/]*){0}"
312
313 for arg in sys_args:
314 if arg == "":
315 # Don't parse an empty parameter
316 continue
317 m = re.match("^(?:[-/]*)configuration-path:([\w/\.\-_]+)", arg) # pylint: disable=W1401
197 if not m is None:318 if not m is None:
198 conf_file_path = m.group(1)319 conf_file_path = m.group(1)
199 if not os.path.exists(conf_file_path):320 if not os.path.exists(conf_file_path):
200 print("Error: Configuration file {0} does not exist".format(321 print("Error: Configuration file {0} does not exist".format(
201 conf_file_path), file=sys.stderr)322 conf_file_path), file=sys.stderr)
202 usage()323 print(usage())
203 sys.exit(1)324 sys.exit(1)
204 325 elif re.match("^([-/]*)deprovision\\+user", arg):
205 elif re.match("^([-/]*)deprovision\\+user", a):326 cmd = AgentCommands.DeprovisionUser
206 cmd = "deprovision+user"327 elif re.match(regex_cmd_format.format(AgentCommands.Deprovision), arg):
207 elif re.match("^([-/]*)deprovision", a):328 cmd = AgentCommands.Deprovision
208 cmd = "deprovision"329 elif re.match(regex_cmd_format.format(AgentCommands.Daemon), arg):
209 elif re.match("^([-/]*)daemon", a):330 cmd = AgentCommands.Daemon
210 cmd = "daemon"331 elif re.match(regex_cmd_format.format(AgentCommands.Start), arg):
211 elif re.match("^([-/]*)start", a):332 cmd = AgentCommands.Start
212 cmd = "start"333 elif re.match(regex_cmd_format.format(AgentCommands.RegisterService), arg):
213 elif re.match("^([-/]*)register-service", a):334 cmd = AgentCommands.RegisterService
214 cmd = "register-service"335 elif re.match(regex_cmd_format.format(AgentCommands.RunExthandlers), arg):
215 elif re.match("^([-/]*)run-exthandlers", a):336 cmd = AgentCommands.RunExthandlers
216 cmd = "run-exthandlers"337 elif re.match(regex_cmd_format.format(AgentCommands.Version), arg):
217 elif re.match("^([-/]*)version", a):338 cmd = AgentCommands.Version
218 cmd = "version"339 elif re.match(regex_cmd_format.format("verbose"), arg):
219 elif re.match("^([-/]*)verbose", a):
220 verbose = True340 verbose = True
221 elif re.match("^([-/]*)debug", a):341 elif re.match(regex_cmd_format.format("debug"), arg):
222 debug = True342 debug = True
223 elif re.match("^([-/]*)force", a):343 elif re.match(regex_cmd_format.format("force"), arg):
224 force = True344 force = True
225 elif re.match("^([-/]*)show-configuration", a):345 elif re.match(regex_cmd_format.format(AgentCommands.ShowConfig), arg):
226 cmd = "show-configuration"346 cmd = AgentCommands.ShowConfig
227 elif re.match("^([-/]*)(help|usage|\\?)", a):347 elif re.match("^([-/]*)(help|usage|\\?)", arg):
228 cmd = "help"348 cmd = AgentCommands.Help
349 elif re.match(regex_cmd_format.format(AgentCommands.CollectLogs), arg):
350 cmd = AgentCommands.CollectLogs
351 elif re.match(regex_cmd_format.format("full"), arg):
352 log_collector_full_mode = True
353 elif re.match(regex_cmd_format.format(AgentCommands.SetupFirewall), arg):
354 cmd = AgentCommands.SetupFirewall
355 elif re.match(regex_cmd_format.format("dst_ip=(?P<dst_ip>[\\d.]{7,})"), arg):
356 firewall_metadata['dst_ip'] = re.match(regex_cmd_format.format("dst_ip=(?P<dst_ip>[\\d.]{7,})"), arg).group(
357 'dst_ip')
358 elif re.match(regex_cmd_format.format("uid=(?P<uid>[\\d]+)"), arg):
359 firewall_metadata['uid'] = re.match(regex_cmd_format.format("uid=(?P<uid>[\\d]+)"), arg).group('uid')
360 elif re.match(regex_cmd_format.format("(w|wait)$"), arg):
361 firewall_metadata['wait'] = "-w"
229 else:362 else:
230 cmd = "help"363 cmd = AgentCommands.Help
231 break364 break
232365
233 return cmd, force, verbose, debug, conf_file_path366 return cmd, force, verbose, debug, conf_file_path, log_collector_full_mode, firewall_metadata
234367
235368
236def version():369def version():
@@ -245,29 +378,33 @@ def version():
245 PY_VERSION_MICRO))378 PY_VERSION_MICRO))
246 print("Goal state agent: {0}".format(GOAL_STATE_AGENT_VERSION))379 print("Goal state agent: {0}".format(GOAL_STATE_AGENT_VERSION))
247380
381
248def usage():382def usage():
249 """383 """
250 Return agent usage message384 Return agent usage message
251 """385 """
252 s = "\n"386 s = "\n"
253 s += ("usage: {0} [-verbose] [-force] [-help] "387 s += ("usage: {0} [-verbose] [-force] [-help] "
254 "-configuration-path:<path to configuration file>"388 "-configuration-path:<path to configuration file>"
255 "-deprovision[+user]|-register-service|-version|-daemon|-start|"389 "-deprovision[+user]|-register-service|-version|-daemon|-start|"
256 "-run-exthandlers|-show-configuration]"390 "-run-exthandlers|-show-configuration|-collect-logs [-full]|-setup-firewall [-dst_ip=<IP> -uid=<UID> [-w/--wait]]"
257 "").format(sys.argv[0])391 "").format(sys.argv[0])
258 s += "\n"392 s += "\n"
259 return s393 return s
260394
395
261def start(conf_file_path=None):396def start(conf_file_path=None):
262 """397 """
263 Start agent daemon in a background process and set stdout/stderr to398 Start agent daemon in a background process and set stdout/stderr to
264 /dev/null399 /dev/null
265 """400 """
266 devnull = open(os.devnull, 'w')
267 args = [sys.argv[0], '-daemon']401 args = [sys.argv[0], '-daemon']
268 if conf_file_path is not None:402 if conf_file_path is not None:
269 args.append('-configuration-path:{0}'.format(conf_file_path))403 args.append('-configuration-path:{0}'.format(conf_file_path))
270 subprocess.Popen(args, stdout=devnull, stderr=devnull)404
405 with open(os.devnull, 'w') as devnull:
406 subprocess.Popen(args, stdout=devnull, stderr=devnull)
407
271408
272if __name__ == '__main__' :409if __name__ == '__main__' :
273 main()410 main()
diff --git a/azurelinuxagent/common/AgentGlobals.py b/azurelinuxagent/common/AgentGlobals.py
274new file mode 100644411new file mode 100644
index 0000000..dbfda92
--- /dev/null
+++ b/azurelinuxagent/common/AgentGlobals.py
@@ -0,0 +1,39 @@
1# Microsoft Azure Linux Agent
2#
3# Copyright 2020 Microsoft Corporation
4#
5# Licensed under the Apache License, Version 2.0 (the "License");
6# you may not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9# http://www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an "AS IS" BASIS,
13# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
16#
17# Requires Python 2.6+ and Openssl 1.0+
18
19
20class AgentGlobals(object):
21 """
22 This class is used for setting AgentGlobals which can be used all throughout the Agent.
23 """
24
25 GUID_ZERO = "00000000-0000-0000-0000-000000000000"
26
27 #
28 # Some modules (e.g. telemetry) require an up-to-date container ID. We update this variable each time we
29 # fetch the goal state.
30 #
31 _container_id = GUID_ZERO
32
33 @staticmethod
34 def get_container_id():
35 return AgentGlobals._container_id
36
37 @staticmethod
38 def update_container_id(container_id):
39 AgentGlobals._container_id = container_id
diff --git a/azurelinuxagent/common/agent_supported_feature.py b/azurelinuxagent/common/agent_supported_feature.py
0new file mode 10064440new file mode 100644
index 0000000..d7f93e2
--- /dev/null
+++ b/azurelinuxagent/common/agent_supported_feature.py
@@ -0,0 +1,122 @@
1# Copyright 2018 Microsoft Corporation
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7# http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14#
15# Requires Python 2.6+ and Openssl 1.0+
16#
17
18
19class SupportedFeatureNames(object):
20 """
21 Enum for defining the Feature Names for all features that we the agent supports
22 """
23 MultiConfig = "MultipleExtensionsPerHandler"
24 ExtensionTelemetryPipeline = "ExtensionTelemetryPipeline"
25 FastTrack = "FastTrack"
26
27
28class AgentSupportedFeature(object):
29 """
30 Interface for defining all features that the Linux Guest Agent supports and reports their if supported back to CRP
31 """
32
33 def __init__(self, name, version="1.0", supported=False):
34 self.__name = name
35 self.__version = version
36 self.__supported = supported
37
38 @property
39 def name(self):
40 return self.__name
41
42 @property
43 def version(self):
44 return self.__version
45
46 @property
47 def is_supported(self):
48 return self.__supported
49
50
51class _MultiConfigFeature(AgentSupportedFeature):
52
53 __NAME = SupportedFeatureNames.MultiConfig
54 __VERSION = "1.0"
55 __SUPPORTED = True
56
57 def __init__(self):
58 super(_MultiConfigFeature, self).__init__(name=_MultiConfigFeature.__NAME,
59 version=_MultiConfigFeature.__VERSION,
60 supported=_MultiConfigFeature.__SUPPORTED)
61
62
63class _ETPFeature(AgentSupportedFeature):
64
65 __NAME = SupportedFeatureNames.ExtensionTelemetryPipeline
66 __VERSION = "1.0"
67 __SUPPORTED = True
68
69 def __init__(self):
70 super(_ETPFeature, self).__init__(name=self.__NAME,
71 version=self.__VERSION,
72 supported=self.__SUPPORTED)
73
74
75# This is the list of features that Agent supports and we advertise to CRP
76__CRP_ADVERTISED_FEATURES = {
77 SupportedFeatureNames.MultiConfig: _MultiConfigFeature()
78}
79
80
81# This is the list of features that Agent supports and we advertise to Extensions
82__EXTENSION_ADVERTISED_FEATURES = {
83 SupportedFeatureNames.ExtensionTelemetryPipeline: _ETPFeature()
84}
85
86
87def get_supported_feature_by_name(feature_name):
88 if feature_name in __CRP_ADVERTISED_FEATURES:
89 return __CRP_ADVERTISED_FEATURES[feature_name]
90
91 if feature_name in __EXTENSION_ADVERTISED_FEATURES:
92 return __EXTENSION_ADVERTISED_FEATURES[feature_name]
93
94 raise NotImplementedError("Feature with Name: {0} not found".format(feature_name))
95
96
97def get_agent_supported_features_list_for_crp():
98 """
99 List of features that the GuestAgent currently supports (like FastTrack, MultiConfig, etc).
100 We need to send this list as part of Status reporting to inform CRP of all the features the agent supports.
101 :return: Dict containing all CRP supported features with the key as their names and the AgentFeature object as
102 the value if they are supported by the Agent
103 Eg: {
104 MultipleExtensionsPerHandler: _MultiConfigFeature()
105 }
106 """
107
108 return dict((name, feature) for name, feature in __CRP_ADVERTISED_FEATURES.items() if feature.is_supported)
109
110
111def get_agent_supported_features_list_for_extensions():
112 """
113 List of features that the GuestAgent currently supports (like Extension Telemetry Pipeline, etc) needed by Extensions.
114 We need to send this list as environment variables when calling extension commands to inform Extensions of all the
115 features the agent supports.
116 :return: Dict containing all Extension supported features with the key as their names and the AgentFeature object as
117 the value if the feature is supported by the Agent.
118 Eg: {
119 CRPSupportedFeatureNames.ExtensionTelemetryPipeline: _ETPFeature()
120 }
121 """
122 return dict((name, feature) for name, feature in __EXTENSION_ADVERTISED_FEATURES.items() if feature.is_supported)
diff --git a/azurelinuxagent/common/cgroup.py b/azurelinuxagent/common/cgroup.py
index 2ad70c1..b2bf32f 100644
--- a/azurelinuxagent/common/cgroup.py
+++ b/azurelinuxagent/common/cgroup.py
@@ -13,47 +13,94 @@
13# limitations under the License.13# limitations under the License.
14#14#
15# Requires Python 2.6+ and Openssl 1.0+15# Requires Python 2.6+ and Openssl 1.0+
16
16import errno17import errno
17import os18import os
18import re19import re
20from datetime import timedelta
1921
20from azurelinuxagent.common import logger22from azurelinuxagent.common import logger, conf
21from azurelinuxagent.common.exception import CGroupsException23from azurelinuxagent.common.exception import CGroupsException
22from azurelinuxagent.common.future import ustr24from azurelinuxagent.common.future import ustr
23from azurelinuxagent.common.osutil import get_osutil25from azurelinuxagent.common.osutil import get_osutil
24from azurelinuxagent.common.utils import fileutil26from azurelinuxagent.common.utils import fileutil
2527
26re_user_system_times = re.compile(r'user (\d+)\nsystem (\d+)\n')28_REPORT_EVERY_HOUR = timedelta(hours=1)
29_DEFAULT_REPORT_PERIOD = timedelta(seconds=conf.get_cgroup_check_period())
2730
31AGENT_NAME_TELEMETRY = "walinuxagent.service" # Name used for telemetry; it needs to be consistent even if the name of the service changes
32AGENT_LOG_COLLECTOR = "azure-walinuxagent-logcollector"
2833
29class CGroupContollers(object):
30 CPU = "cpu"
31 MEMORY = "memory"
3234
35class CounterNotFound(Exception):
36 pass
3337
34class CGroup(object):
35 @staticmethod
36 def create(cgroup_path, controller, extension_name):
37 """
38 Factory method to create the correct CGroup.
39 """
40 if controller == CGroupContollers.CPU:
41 return CpuCgroup(extension_name, cgroup_path)
42 if controller == CGroupContollers.MEMORY:
43 return MemoryCgroup(extension_name, cgroup_path)
44 raise CGroupsException('CGroup controller {0} is not supported'.format(controller))
4538
46 def __init__(self, name, cgroup_path, controller_type):39class MetricValue(object):
40
41 """
42 Class for defining all the required metric fields to send telemetry.
43 """
44
45 def __init__(self, category, counter, instance, value, report_period=_DEFAULT_REPORT_PERIOD):
46 self._category = category
47 self._counter = counter
48 self._instance = instance
49 self._value = value
50 self._report_period = report_period
51
52 @property
53 def category(self):
54 return self._category
55
56 @property
57 def counter(self):
58 return self._counter
59
60 @property
61 def instance(self):
62 return self._instance
63
64 @property
65 def value(self):
66 return self._value
67
68 @property
69 def report_period(self):
70 return self._report_period
71
72
73class MetricsCategory(object):
74 MEMORY_CATEGORY = "Memory"
75 CPU_CATEGORY = "CPU"
76
77
78class MetricsCounter(object):
79 PROCESSOR_PERCENT_TIME = "% Processor Time"
80 TOTAL_MEM_USAGE = "Total Memory Usage"
81 MAX_MEM_USAGE = "Max Memory Usage"
82 THROTTLED_TIME = "Throttled Time"
83 SWAP_MEM_USAGE = "Swap Memory Usage"
84 AVAILABLE_MEM = "Available MBytes"
85 USED_MEM = "Used MBytes"
86
87
88re_user_system_times = re.compile(r'user (\d+)\nsystem (\d+)\n')
89
90
91class CGroup(object):
92 def __init__(self, name, cgroup_path):
47 """93 """
48 Initialize _data collection for the Memory controller94 Initialize _data collection for the Memory controller
49 :param: name: Name of the CGroup95 :param: name: Name of the CGroup
50 :param: cgroup_path: Path of the controller96 :param: cgroup_path: Path of the controller
51 :param: controller_type:
52 :return:97 :return:
53 """98 """
54 self.name = name99 self.name = name
55 self.path = cgroup_path100 self.path = cgroup_path
56 self.controller = controller_type101
102 def __str__(self):
103 return "{0} [{1}]".format(self.name, self.path)
57104
58 def _get_cgroup_file(self, file_name):105 def _get_cgroup_file(self, file_name):
59 return os.path.join(self.path, file_name)106 return os.path.join(self.path, file_name)
@@ -89,7 +136,7 @@ class CGroup(object):
89 logger.error("File {0} is empty but should not be".format(parameter_filename))136 logger.error("File {0} is empty but should not be".format(parameter_filename))
90 raise CGroupsException("File {0} is empty but should not be".format(parameter_filename))137 raise CGroupsException("File {0} is empty but should not be".format(parameter_filename))
91 except Exception as e:138 except Exception as e:
92 if isinstance(e, (IOError, OSError)) and e.errno == errno.ENOENT:139 if isinstance(e, (IOError, OSError)) and e.errno == errno.ENOENT: # pylint: disable=E1101
93 raise e140 raise e
94 parameter_filename = self._get_cgroup_file(parameter_name)141 parameter_filename = self._get_cgroup_file(parameter_name)
95 raise CGroupsException("Exception while attempting to read {0}".format(parameter_filename), e)142 raise CGroupsException("Exception while attempting to read {0}".format(parameter_filename), e)
@@ -114,42 +161,26 @@ class CGroup(object):
114 ' Internal error: {1}'.format(self.path, ustr(e)))161 ' Internal error: {1}'.format(self.path, ustr(e)))
115 return False162 return False
116163
117 def get_tracked_processes(self):164 def get_tracked_metrics(self, **_):
118 """165 """
119 :return: List of Str (Pids). Will return an empty string if we couldn't fetch any tracked processes.166 Retrieves the current value of the metrics tracked for this cgroup and returns them as an array.
167
168 Note: Agent won't track the metrics if the current cpu ticks less than previous value and returns empty array.
120 """169 """
121 procs = []170 raise NotImplementedError()
122 try:
123 procs = self._get_parameters("cgroup.procs")
124 except (IOError, OSError) as e:
125 if e.errno == errno.ENOENT:
126 # only suppressing file not found exceptions.
127 pass
128 else:
129 logger.periodic_warn(logger.EVERY_HALF_HOUR,
130 'Could not get list of procs from "cgroup.procs" file in the cgroup: {0}.'
131 ' Internal error: {1}'.format(self.path, ustr(e)))
132 except CGroupsException as e:
133 logger.periodic_warn(logger.EVERY_HALF_HOUR,
134 'Could not get list of tasks from "cgroup.procs" file in the cgroup: {0}.'
135 ' Internal error: {1}'.format(self.path, ustr(e)))
136 return procs
137171
138172
139class CpuCgroup(CGroup):173class CpuCgroup(CGroup):
140 def __init__(self, name, cgroup_path):174 def __init__(self, name, cgroup_path):
141 super(CpuCgroup, self).__init__(name, cgroup_path, CGroupContollers.CPU)175 super(CpuCgroup, self).__init__(name, cgroup_path)
142176
143 self._osutil = get_osutil()177 self._osutil = get_osutil()
144 self._previous_cgroup_cpu = None178 self._previous_cgroup_cpu = None
145 self._previous_system_cpu = None179 self._previous_system_cpu = None
146 self._current_cgroup_cpu = None180 self._current_cgroup_cpu = None
147 self._current_system_cpu = None181 self._current_system_cpu = None
148182 self._previous_throttled_time = None
149 def __str__(self):183 self._current_throttled_time = None
150 return "cgroup: Name: {0}, cgroup_path: {1}; Controller: {2}".format(
151 self.name, self.path, self.controller
152 )
153184
154 def _get_cpu_ticks(self, allow_no_such_file_or_directory_error=False):185 def _get_cpu_ticks(self, allow_no_such_file_or_directory_error=False):
155 """186 """
@@ -159,24 +190,54 @@ class CpuCgroup(CGroup):
159 returns 0; this is useful when the function can be called before the cgroup has been created.190 returns 0; this is useful when the function can be called before the cgroup has been created.
160 """191 """
161 try:192 try:
162 cpu_stat = self._get_file_contents('cpuacct.stat')193 cpuacct_stat = self._get_file_contents('cpuacct.stat')
163 except Exception as e:194 except Exception as e:
164 if not isinstance(e, (IOError, OSError)) or e.errno != errno.ENOENT:195 if not isinstance(e, (IOError, OSError)) or e.errno != errno.ENOENT: # pylint: disable=E1101
165 raise CGroupsException("Failed to read cpuacct.stat: {0}".format(ustr(e)))196 raise CGroupsException("Failed to read cpuacct.stat: {0}".format(ustr(e)))
166 if not allow_no_such_file_or_directory_error:197 if not allow_no_such_file_or_directory_error:
167 raise e198 raise e
168 cpu_stat = None199 cpuacct_stat = None
169200
170 cpu_ticks = 0201 cpu_ticks = 0
171202
172 if cpu_stat is not None:203 if cpuacct_stat is not None:
173 match = re_user_system_times.match(cpu_stat)204 #
205 # Sample file:
206 # # cat /sys/fs/cgroup/cpuacct/azure.slice/walinuxagent.service/cpuacct.stat
207 # user 10190
208 # system 3160
209 #
210 match = re_user_system_times.match(cpuacct_stat)
174 if not match:211 if not match:
175 raise CGroupsException("The contents of {0} are invalid: {1}".format(self._get_cgroup_file('cpuacct.stat'), cpu_stat))212 raise CGroupsException(
213 "The contents of {0} are invalid: {1}".format(self._get_cgroup_file('cpuacct.stat'), cpuacct_stat))
176 cpu_ticks = int(match.groups()[0]) + int(match.groups()[1])214 cpu_ticks = int(match.groups()[0]) + int(match.groups()[1])
177215
178 return cpu_ticks216 return cpu_ticks
179217
218 def get_throttled_time(self):
219 try:
220 with open(os.path.join(self.path, 'cpu.stat')) as cpu_stat:
221 #
222 # Sample file:
223 #
224 # # cat /sys/fs/cgroup/cpuacct/azure.slice/walinuxagent.service/cpu.stat
225 # nr_periods 51660
226 # nr_throttled 19461
227 # throttled_time 1529590856339
228 #
229 for line in cpu_stat:
230 match = re.match(r'throttled_time\s+(\d+)', line)
231 if match is not None:
232 return int(match.groups()[0])
233 raise Exception("Cannot find throttled_time")
234 except (IOError, OSError) as e:
235 if e.errno == errno.ENOENT:
236 return 0
237 raise CGroupsException("Failed to read cpu.stat: {0}".format(ustr(e)))
238 except Exception as e:
239 raise CGroupsException("Failed to read cpu.stat: {0}".format(ustr(e)))
240
180 def _cpu_usage_initialized(self):241 def _cpu_usage_initialized(self):
181 return self._current_cgroup_cpu is not None and self._current_system_cpu is not None242 return self._current_cgroup_cpu is not None and self._current_system_cpu is not None
182243
@@ -188,13 +249,14 @@ class CpuCgroup(CGroup):
188 raise CGroupsException("initialize_cpu_usage() should be invoked only once")249 raise CGroupsException("initialize_cpu_usage() should be invoked only once")
189 self._current_cgroup_cpu = self._get_cpu_ticks(allow_no_such_file_or_directory_error=True)250 self._current_cgroup_cpu = self._get_cpu_ticks(allow_no_such_file_or_directory_error=True)
190 self._current_system_cpu = self._osutil.get_total_cpu_ticks_since_boot()251 self._current_system_cpu = self._osutil.get_total_cpu_ticks_since_boot()
252 self._current_throttled_time = self.get_throttled_time()
191253
192 def get_cpu_usage(self):254 def get_cpu_usage(self):
193 """255 """
194 Computes the CPU used by the cgroup since the last call to this function.256 Computes the CPU used by the cgroup since the last call to this function.
195257
196 The usage is measured as a percentage of utilization of all cores in the system. For example,258 The usage is measured as a percentage of utilization of 1 core in the system. For example,
197 using 1 core at 100% on a 4-core system would be reported as 25%.259 using 1 core all of the time on a 4-core system would be reported as 100%.
198260
199 NOTE: initialize_cpu_usage() must be invoked before calling get_cpu_usage()261 NOTE: initialize_cpu_usage() must be invoked before calling get_cpu_usage()
200 """262 """
@@ -209,53 +271,122 @@ class CpuCgroup(CGroup):
209 cgroup_delta = self._current_cgroup_cpu - self._previous_cgroup_cpu271 cgroup_delta = self._current_cgroup_cpu - self._previous_cgroup_cpu
210 system_delta = max(1, self._current_system_cpu - self._previous_system_cpu)272 system_delta = max(1, self._current_system_cpu - self._previous_system_cpu)
211273
212 return round(100.0 * float(cgroup_delta) / float(system_delta), 3)274 return round(100.0 * self._osutil.get_processor_cores() * float(cgroup_delta) / float(system_delta), 3)
275
276 def get_cpu_throttled_time(self, read_previous_throttled_time=True):
277 """
278 Computes the throttled time (in seconds) since the last call to this function.
279 NOTE: initialize_cpu_usage() must be invoked before calling this function
280 Compute only current throttled time if read_previous_throttled_time set to False
281 """
282 if not read_previous_throttled_time:
283 return float(self.get_throttled_time() / 1E9)
284
285 if not self._cpu_usage_initialized():
286 raise CGroupsException(
287 "initialize_cpu_usage() must be invoked before the first call to get_throttled_time()")
288
289 self._previous_throttled_time = self._current_throttled_time
290 self._current_throttled_time = self.get_throttled_time()
291
292 return float(self._current_throttled_time - self._previous_throttled_time) / 1E9
293
294 def get_tracked_metrics(self, **kwargs):
295 tracked = []
296 cpu_usage = self.get_cpu_usage()
297 if cpu_usage >= float(0):
298 tracked.append(
299 MetricValue(MetricsCategory.CPU_CATEGORY, MetricsCounter.PROCESSOR_PERCENT_TIME, self.name, cpu_usage))
300
301 if 'track_throttled_time' in kwargs and kwargs['track_throttled_time']:
302 throttled_time = self.get_cpu_throttled_time()
303 if cpu_usage >= float(0) and throttled_time >= float(0):
304 tracked.append(
305 MetricValue(MetricsCategory.CPU_CATEGORY, MetricsCounter.THROTTLED_TIME, self.name, throttled_time))
306
307 return tracked
213308
214309
215class MemoryCgroup(CGroup):310class MemoryCgroup(CGroup):
216 def __init__(self, name, cgroup_path):311 def __init__(self, name, cgroup_path):
312 super(MemoryCgroup, self).__init__(name, cgroup_path)
313
314 self._counter_not_found_error_count = 0
315
316 def _get_memory_stat_counter(self, counter_name):
317 try:
318 with open(os.path.join(self.path, 'memory.stat')) as memory_stat:
319 # cat /sys/fs/cgroup/memory/azure.slice/memory.stat
320 # cache 67178496
321 # rss 42340352
322 # rss_huge 6291456
323 # swap 0
324 for line in memory_stat:
325 re_memory_counter = r'{0}\s+(\d+)'.format(counter_name)
326 match = re.match(re_memory_counter, line)
327 if match is not None:
328 return int(match.groups()[0])
329 except (IOError, OSError) as e:
330 if e.errno == errno.ENOENT:
331 raise
332 raise CGroupsException("Failed to read memory.stat: {0}".format(ustr(e)))
333 except Exception as e:
334 raise CGroupsException("Failed to read memory.stat: {0}".format(ustr(e)))
335
336 raise CounterNotFound("Cannot find counter: {0}".format(counter_name))
337
338 def get_memory_usage(self):
217 """339 """
218 Initialize _data collection for the Memory controller340 Collect RSS+CACHE from memory.stat cgroup.
219341
220 :return: MemoryCgroup342 :return: Memory usage in bytes
343 :rtype: int
221 """344 """
222 super(MemoryCgroup, self).__init__(name, cgroup_path, CGroupContollers.MEMORY)
223345
224 def __str__(self):346 cache = self._get_memory_stat_counter("cache")
225 return "cgroup: Name: {0}, cgroup_path: {1}; Controller: {2}".format(347 rss = self._get_memory_stat_counter("rss")
226 self.name, self.path, self.controller348 return cache + rss
227 )
228349
229 def get_memory_usage(self):350 def try_swap_memory_usage(self):
230 """351 """
231 Collect memory.usage_in_bytes from the cgroup.352 Collect SWAP from memory.stat cgroup.
232353
233 :return: Memory usage in bytes354 :return: Memory usage in bytes
234 :rtype: int355 :rtype: int
356 Note: stat file is the only place to get the SWAP since other swap related file memory.memsw.usage_in_bytes is for total Memory+SWAP.
235 """357 """
236 usage = None
237 try:358 try:
238 usage = self._get_parameters('memory.usage_in_bytes', first_line_only=True)359 return self._get_memory_stat_counter("swap")
239 except Exception as e:360 except CounterNotFound as e:
240 if isinstance(e, (IOError, OSError)) and e.errno == errno.ENOENT:361 if self._counter_not_found_error_count < 1:
241 raise362 logger.periodic_info(logger.EVERY_HALF_HOUR,
242 raise CGroupsException("Exception while attempting to read {0}".format("memory.usage_in_bytes"), e)363 '{0} from "memory.stat" file in the cgroup: {1}---[Note: This log for informational purpose only and can be ignored]'.format(ustr(e), self.path))
243364 self._counter_not_found_error_count += 1
244 return int(usage)365 return 0
245366
246 def get_max_memory_usage(self):367 def get_max_memory_usage(self):
247 """368 """
248 Collect memory.usage_in_bytes from the cgroup.369 Collect memory.max_usage_in_bytes from the cgroup.
249370
250 :return: Memory usage in bytes371 :return: Memory usage in bytes
251 :rtype: int372 :rtype: int
252 """373 """
253 usage = None374 usage = 0
254 try:375 try:
255 usage = self._get_parameters('memory.max_usage_in_bytes', first_line_only=True)376 usage = int(self._get_parameters('memory.max_usage_in_bytes', first_line_only=True))
256 except Exception as e:377 except Exception as e:
257 if isinstance(e, (IOError, OSError)) and e.errno == errno.ENOENT:378 if isinstance(e, (IOError, OSError)) and e.errno == errno.ENOENT: # pylint: disable=E1101
258 raise379 raise
259 raise CGroupsException("Exception while attempting to read {0}".format("memory.usage_in_bytes"), e)380 raise CGroupsException("Exception while attempting to read {0}".format("memory.max_usage_in_bytes"), e)
260381
261 return int(usage)382 return usage
383
384 def get_tracked_metrics(self, **_):
385 return [
386 MetricValue(MetricsCategory.MEMORY_CATEGORY, MetricsCounter.TOTAL_MEM_USAGE, self.name,
387 self.get_memory_usage()),
388 MetricValue(MetricsCategory.MEMORY_CATEGORY, MetricsCounter.MAX_MEM_USAGE, self.name,
389 self.get_max_memory_usage(), _REPORT_EVERY_HOUR),
390 MetricValue(MetricsCategory.MEMORY_CATEGORY, MetricsCounter.SWAP_MEM_USAGE, self.name,
391 self.try_swap_memory_usage(), _REPORT_EVERY_HOUR)
392 ]
diff --git a/azurelinuxagent/common/cgroupapi.py b/azurelinuxagent/common/cgroupapi.py
index c671a2e..ca0ef3b 100644
--- a/azurelinuxagent/common/cgroupapi.py
+++ b/azurelinuxagent/common/cgroupapi.py
@@ -1,3 +1,4 @@
1# -*- coding: utf-8 -*-
1# Copyright 2018 Microsoft Corporation2# Copyright 2018 Microsoft Corporation
2#3#
3# Licensed under the Apache License, Version 2.0 (the "License");4# Licensed under the Apache License, Version 2.0 (the "License");
@@ -14,102 +15,65 @@
14#15#
15# Requires Python 2.6+ and Openssl 1.0+16# Requires Python 2.6+ and Openssl 1.0+
1617
17import errno
18import os18import os
19import re
19import shutil20import shutil
20import subprocess21import subprocess
22import threading
21import uuid23import uuid
2224
23from azurelinuxagent.common import logger25from azurelinuxagent.common import logger
24from azurelinuxagent.common.cgroup import CGroup26from azurelinuxagent.common.cgroup import CpuCgroup, MemoryCgroup
25from azurelinuxagent.common.cgroupstelemetry import CGroupsTelemetry27from azurelinuxagent.common.cgroupstelemetry import CGroupsTelemetry
26from azurelinuxagent.common.conf import get_agent_pid_file_path28from azurelinuxagent.common.conf import get_agent_pid_file_path
27from azurelinuxagent.common.event import add_event, WALAEventOperation
28from azurelinuxagent.common.exception import CGroupsException, ExtensionErrorCodes, ExtensionError, \29from azurelinuxagent.common.exception import CGroupsException, ExtensionErrorCodes, ExtensionError, \
29 ExtensionOperationError30 ExtensionOperationError
30from azurelinuxagent.common.future import ustr31from azurelinuxagent.common.future import ustr
32from azurelinuxagent.common.osutil import systemd
31from azurelinuxagent.common.utils import fileutil, shellutil33from azurelinuxagent.common.utils import fileutil, shellutil
32from azurelinuxagent.common.utils.extensionprocessutil import handle_process_completion, read_output34from azurelinuxagent.common.utils.extensionprocessutil import handle_process_completion, read_output, \
33from azurelinuxagent.common.version import AGENT_NAME, CURRENT_VERSION35 TELEMETRY_MESSAGE_MAX_LEN
36from azurelinuxagent.common.utils.flexible_version import FlexibleVersion
37from azurelinuxagent.common.version import get_distro
3438
35CGROUPS_FILE_SYSTEM_ROOT = '/sys/fs/cgroup'39CGROUPS_FILE_SYSTEM_ROOT = '/sys/fs/cgroup'
36CGROUP_CONTROLLERS = ["cpu", "memory"]40CGROUP_CONTROLLERS = ["cpu", "memory"]
37VM_AGENT_CGROUP_NAME = "walinuxagent.service"41EXTENSION_SLICE_PREFIX = "azure-vmextensions"
38EXTENSIONS_ROOT_CGROUP_NAME = "walinuxagent.extensions"
39UNIT_FILES_FILE_SYSTEM_PATH = "/etc/systemd/system"
4042
4143
42class CGroupsApi(object):44class SystemdRunError(CGroupsException):
43 """45 """
44 Interface for the cgroups API46 Raised when systemd-run fails
45 """47 """
46 def create_agent_cgroups(self):
47 raise NotImplementedError()
48
49 def create_extension_cgroups_root(self):
50 raise NotImplementedError()
51
52 def create_extension_cgroups(self, extension_name):
53 raise NotImplementedError()
54
55 def remove_extension_cgroups(self, extension_name):
56 raise NotImplementedError()
5748
58 def get_extension_cgroups(self, extension_name):49 def __init__(self, msg=None):
59 raise NotImplementedError()50 super(SystemdRunError, self).__init__(msg)
6051
61 def start_extension_command(self, extension_name, command, timeout, shell, cwd, env, stdout, stderr, error_code):
62 raise NotImplementedError()
6352
64 def cleanup_legacy_cgroups(self):53class CGroupsApi(object):
65 raise NotImplementedError()54 @staticmethod
55 def cgroups_supported():
56 distro_info = get_distro()
57 distro_name = distro_info[0]
58 try:
59 distro_version = FlexibleVersion(distro_info[1])
60 except ValueError:
61 return False
62 return distro_name.lower() == 'ubuntu' and distro_version.major >= 16
6663
67 @staticmethod64 @staticmethod
68 def track_cgroups(extension_cgroups):65 def track_cgroups(extension_cgroups):
69 try:66 try:
70 for cgroup in extension_cgroups:67 for cgroup in extension_cgroups:
71 CGroupsTelemetry.track_cgroup(cgroup)68 CGroupsTelemetry.track_cgroup(cgroup)
72 except Exception as e:69 except Exception as exception:
73 logger.warn("Cannot add cgroup '{0}' to tracking list; resource usage will not be tracked. "70 logger.warn("Cannot add cgroup '{0}' to tracking list; resource usage will not be tracked. "
74 "Error: {1}".format(cgroup.path, ustr(e)))71 "Error: {1}".format(cgroup.path, ustr(exception)))
7572
76 @staticmethod73 @staticmethod
77 def _get_extension_cgroup_name(extension_name):74 def get_processes_in_cgroup(cgroup_path):
78 # Since '-' is used as a separator in systemd unit names, we replace it with '_' to prevent side-effects.75 with open(os.path.join(cgroup_path, "cgroup.procs"), "r") as cgroup_procs:
79 return extension_name.replace('-', '_')76 return [int(pid) for pid in cgroup_procs.read().split()]
80
81 @staticmethod
82 def create():
83 """
84 Factory method to create the correct API for the current platform
85 """
86 return SystemdCgroupsApi() if CGroupsApi._is_systemd() else FileSystemCgroupsApi()
87
88 @staticmethod
89 def _is_systemd():
90 """
91 Determine if systemd is managing system services; the implementation follows the same strategy as, for example,
92 sd_booted() in libsystemd, or /usr/sbin/service
93 """
94 return os.path.exists('/run/systemd/system/')
95
96 @staticmethod
97 def _foreach_controller(operation, message):
98 """
99 Executes the given operation on all controllers that need to be tracked; outputs 'message' if the controller
100 is not mounted or if an error occurs in the operation
101 :return: Returns a list of error messages or an empty list if no errors occurred
102 """
103 mounted_controllers = os.listdir(CGROUPS_FILE_SYSTEM_ROOT)
104
105 for controller in CGROUP_CONTROLLERS:
106 try:
107 if controller not in mounted_controllers:
108 logger.warn('Cgroup controller "{0}" is not mounted. {1}', controller, message)
109 else:
110 operation(controller)
111 except Exception as e:
112 logger.warn('Error in cgroup controller "{0}": {1}. {2}', controller, ustr(e), message)
11377
114 @staticmethod78 @staticmethod
115 def _foreach_legacy_cgroup(operation):79 def _foreach_legacy_cgroup(operation):
@@ -138,429 +102,250 @@ class CGroupsApi(object):
138102
139 if os.path.exists(procs_file):103 if os.path.exists(procs_file):
140 procs_file_contents = fileutil.read_file(procs_file).strip()104 procs_file_contents = fileutil.read_file(procs_file).strip()
141 daemon_pid = fileutil.read_file(get_agent_pid_file_path()).strip()105 daemon_pid = CGroupsApi.get_daemon_pid()
142106
143 if daemon_pid in procs_file_contents:107 if ustr(daemon_pid) in procs_file_contents:
144 operation(controller, daemon_pid)108 operation(controller, daemon_pid)
145 finally:109 finally:
146 for _, cgroup in legacy_cgroups:110 for _, cgroup in legacy_cgroups:
147 logger.info('Removing {0}', cgroup)111 logger.info('Removing {0}', cgroup)
148 shutil.rmtree(cgroup, ignore_errors=True)112 shutil.rmtree(cgroup, ignore_errors=True)
113 return len(legacy_cgroups)
149114
150
151class FileSystemCgroupsApi(CGroupsApi):
152 """
153 Cgroups interface using the cgroups file system directly
154 """
155 @staticmethod115 @staticmethod
156 def _try_mkdir(path):116 def get_daemon_pid():
157 """117 return int(fileutil.read_file(get_agent_pid_file_path()).strip())
158 Try to create a directory, recursively. If it already exists as such, do nothing. Raise the appropriate
159 exception should an error occur.
160
161 :param path: str
162 """
163 if not os.path.isdir(path):
164 try:
165 os.makedirs(path, 0o755)
166 except OSError as e:
167 if e.errno == errno.EEXIST:
168 if not os.path.isdir(path):
169 raise CGroupsException("Create directory for cgroup {0}: normal file already exists with that name".format(path))
170 else:
171 pass # There was a race to create the directory, but it's there now, and that's fine
172 elif e.errno == errno.EACCES:
173 # This is unexpected, as the agent runs as root
174 raise CGroupsException("Create directory for cgroup {0}: permission denied".format(path))
175 else:
176 raise
177
178 @staticmethod
179 def _get_agent_cgroup_path(controller):
180 return os.path.join(CGROUPS_FILE_SYSTEM_ROOT, controller, VM_AGENT_CGROUP_NAME)
181
182 @staticmethod
183 def _get_extension_cgroups_root_path(controller):
184 return os.path.join(CGROUPS_FILE_SYSTEM_ROOT, controller, EXTENSIONS_ROOT_CGROUP_NAME)
185
186 def _get_extension_cgroup_path(self, controller, extension_name):
187 extensions_root = self._get_extension_cgroups_root_path(controller)
188
189 if not os.path.exists(extensions_root):
190 logger.warn("Root directory {0} does not exist.".format(extensions_root))
191
192 cgroup_name = self._get_extension_cgroup_name(extension_name)
193118
194 return os.path.join(extensions_root, cgroup_name)
195119
196 def _create_extension_cgroup(self, controller, extension_name):120class SystemdCgroupsApi(CGroupsApi):
197 return CGroup.create(self._get_extension_cgroup_path(controller, extension_name), controller, extension_name)121 """
122 Cgroups interface via systemd
123 """
198124
199 @staticmethod125 def __init__(self):
200 def _add_process_to_cgroup(pid, cgroup_path):126 self._cgroup_mountpoints = None
201 tasks_file = os.path.join(cgroup_path, 'cgroup.procs')127 self._agent_unit_name = None
202 fileutil.append_file(tasks_file, "{0}\n".format(pid))128 self._systemd_run_commands = []
203 logger.info("Added PID {0} to cgroup {1}".format(pid, cgroup_path))129 self._systemd_run_commands_lock = threading.RLock()
204130
205 def cleanup_legacy_cgroups(self):131 def get_systemd_run_commands(self):
206 """132 """
207 Previous versions of the daemon (2.2.31-2.2.40) wrote their PID to /sys/fs/cgroup/{cpu,memory}/WALinuxAgent/WALinuxAgent;133 Returns a list of the systemd-run commands currently running (given as PIDs)
208 starting from version 2.2.41 we track the agent service in walinuxagent.service instead of WALinuxAgent/WALinuxAgent. This
209 method moves the daemon's PID from the legacy cgroups to the newer cgroups.
210 """134 """
211 def move_daemon_pid(controller, daemon_pid):135 with self._systemd_run_commands_lock:
212 new_path = FileSystemCgroupsApi._get_agent_cgroup_path(controller)136 return self._systemd_run_commands[:]
213 logger.info("Writing daemon's PID ({0}) to {1}", daemon_pid, new_path)
214 fileutil.append_file(os.path.join(new_path, "cgroup.procs"), daemon_pid)
215 msg = "Moved daemon's PID from legacy cgroup to {0}".format(new_path)
216 add_event(AGENT_NAME, version=CURRENT_VERSION, op=WALAEventOperation.CGroupsCleanUp, is_success=True, message=msg)
217137
218 CGroupsApi._foreach_legacy_cgroup(move_daemon_pid)138 def get_cgroup_mount_points(self):
219
220 def create_agent_cgroups(self):
221 """139 """
222 Creates a cgroup for the VM Agent in each of the controllers we are tracking; returns the created cgroups.140 Returns a tuple with the mount points for the cpu and memory controllers; the values can be None
141 if the corresponding controller is not mounted
223 """142 """
224 cgroups = []143 # the output of mount is similar to
225144 # $ mount -t cgroup
226 pid = int(os.getpid())145 # cgroup on /sys/fs/cgroup/systemd type cgroup (rw,nosuid,nodev,noexec,relatime,xattr,name=systemd)
227146 # cgroup on /sys/fs/cgroup/cpu,cpuacct type cgroup (rw,nosuid,nodev,noexec,relatime,cpu,cpuacct)
228 def create_cgroup(controller):147 # cgroup on /sys/fs/cgroup/memory type cgroup (rw,nosuid,nodev,noexec,relatime,memory)
229 path = FileSystemCgroupsApi._get_agent_cgroup_path(controller)148 # etc
230149 #
231 if not os.path.isdir(path):150 if self._cgroup_mountpoints is None:
232 FileSystemCgroupsApi._try_mkdir(path)151 cpu = None
233 logger.info("Created cgroup {0}".format(path))152 memory = None
234153 for line in shellutil.run_command(['mount', '-t', 'cgroup']).splitlines():
235 self._add_process_to_cgroup(pid, path)154 match = re.search(r'on\s+(?P<path>/\S+(memory|cpuacct))\s', line)
236155 if match is not None:
237 cgroups.append(CGroup.create(path, controller, VM_AGENT_CGROUP_NAME))156 path = match.group('path')
238157 if 'cpuacct' in path:
239 self._foreach_controller(create_cgroup, 'Failed to create a cgroup for the VM Agent; resource usage will not be tracked')158 cpu = path
240159 else:
241 if len(cgroups) == 0:160 memory = path
242 raise CGroupsException("Failed to create any cgroup for the VM Agent")161 self._cgroup_mountpoints = {'cpu': cpu, 'memory': memory}
243162
244 return cgroups163 return self._cgroup_mountpoints['cpu'], self._cgroup_mountpoints['memory']
245164
246 def create_extension_cgroups_root(self):165 @staticmethod
166 def get_process_cgroup_relative_paths(process_id):
247 """167 """
248 Creates the directory within the cgroups file system that will contain the cgroups for the extensions.168 Returns a tuple with the path of the cpu and memory cgroups for the given process (relative to the mount point of the corresponding
169 controller).
170 The 'process_id' can be a numeric PID or the string "self" for the current process.
171 The values returned can be None if the process is not in a cgroup for that controller (e.g. the controller is not mounted).
249 """172 """
250 def create_cgroup(controller):173 # The contents of the file are similar to
251 path = self._get_extension_cgroups_root_path(controller)174 # # cat /proc/1218/cgroup
252175 # 10:memory:/system.slice/walinuxagent.service
253 if not os.path.isdir(path):176 # 3:cpu,cpuacct:/system.slice/walinuxagent.service
254 FileSystemCgroupsApi._try_mkdir(path)177 # etc
255 logger.info("Created {0}".format(path))178 cpu_path = None
179 memory_path = None
180 for line in fileutil.read_file("/proc/{0}/cgroup".format(process_id)).splitlines():
181 match = re.match(r'\d+:(?P<controller>(memory|.*cpuacct.*)):(?P<path>.+)', line)
182 if match is not None:
183 controller = match.group('controller')
184 path = match.group('path').lstrip('/') if match.group('path') != '/' else None
185 if controller == 'memory':
186 memory_path = path
187 else:
188 cpu_path = path
256189
257 self._foreach_controller(create_cgroup, 'Failed to create a root cgroup for extensions')190 return cpu_path, memory_path
258191
259 def create_extension_cgroups(self, extension_name):192 def get_process_cgroup_paths(self, process_id):
260 """193 """
261 Creates a cgroup for the given extension in each of the controllers we are tracking; returns the created cgroups.194 Returns a tuple with the path of the cpu and memory cgroups for the given process. The 'process_id' can be a numeric PID or the string "self" for the current process.
195 The values returned can be None if the process is not in a cgroup for that controller (e.g. the controller is not mounted).
262 """196 """
263 cgroups = []197 cpu_cgroup_relative_path, memory_cgroup_relative_path = self.get_process_cgroup_relative_paths(process_id)
264
265 def create_cgroup(controller):
266 cgroup = self._create_extension_cgroup(controller, extension_name)
267198
268 if not os.path.isdir(cgroup.path):199 cpu_mount_point, memory_mount_point = self.get_cgroup_mount_points()
269 FileSystemCgroupsApi._try_mkdir(cgroup.path)
270 logger.info("Created cgroup {0}".format(cgroup.path))
271200
272 cgroups.append(cgroup)201 cpu_cgroup_path = os.path.join(cpu_mount_point, cpu_cgroup_relative_path) \
202 if cpu_mount_point is not None and cpu_cgroup_relative_path is not None else None
273203
274 self._foreach_controller(create_cgroup, 'Failed to create a cgroup for extension {0}'.format(extension_name))204 memory_cgroup_path = os.path.join(memory_mount_point, memory_cgroup_relative_path) \
205 if memory_mount_point is not None and memory_cgroup_relative_path is not None else None
275206
276 return cgroups207 return cpu_cgroup_path, memory_cgroup_path
277208
278 def remove_extension_cgroups(self, extension_name):209 def get_unit_cgroup_paths(self, unit_name):
279 """210 """
280 Deletes the cgroups for the given extension.211 Returns a tuple with the path of the cpu and memory cgroups for the given unit.
212 The values returned can be None if the controller is not mounted.
213 Ex: ControlGroup=/azure.slice/walinuxagent.service
214 controlgroup_path[1:] = azure.slice/walinuxagent.service
281 """215 """
282 def remove_cgroup(controller):216 controlgroup_path = systemd.get_unit_property(unit_name, "ControlGroup")
283 path = self._get_extension_cgroup_path(controller, extension_name)217 cpu_mount_point, memory_mount_point = self.get_cgroup_mount_points()
284
285 if os.path.exists(path):
286 try:
287 os.rmdir(path)
288 logger.info('Deleted cgroup "{0}".'.format(path))
289 except OSError as exception:
290 if exception.errno == 16: # [Errno 16] Device or resource busy
291 logger.warn('CGroup "{0}" still has active tasks; will not remove it.'.format(path))
292
293 self._foreach_controller(remove_cgroup, 'Failed to delete cgroups for extension {0}'.format(extension_name))
294
295 def get_extension_cgroups(self, extension_name):
296 """
297 Returns the cgroups for the given extension.
298 """
299
300 cgroups = []
301218
302 def get_cgroup(controller):219 cpu_cgroup_path = os.path.join(cpu_mount_point, controlgroup_path[1:]) \
303 cgroup = self._create_extension_cgroup(controller, extension_name)220 if cpu_mount_point is not None else None
304 cgroups.append(cgroup)
305221
306 self._foreach_controller(get_cgroup, 'Failed to retrieve cgroups for extension {0}'.format(extension_name))222 memory_cgroup_path = os.path.join(memory_mount_point, controlgroup_path[1:]) \
223 if memory_mount_point is not None else None
307224
308 return cgroups225 return cpu_cgroup_path, memory_cgroup_path
309226
310 def start_extension_command(self, extension_name, command, timeout, shell, cwd, env, stdout, stderr,227 @staticmethod
311 error_code=ExtensionErrorCodes.PluginUnknownFailure):228 def get_cgroup2_controllers():
312 """229 """
313 Starts a command (install/enable/etc) for an extension and adds the command's PID to the extension's cgroup230 Returns a tuple with the mount point for the cgroups v2 controllers, and the currently mounted controllers;
314 :param extension_name: The extension executing the command231 either value can be None if cgroups v2 or its controllers are not mounted
315 :param command: The command to invoke
316 :param timeout: Number of seconds to wait for command completion
317 :param cwd: The working directory for the command
318 :param env: The environment to pass to the command's process
319 :param stdout: File object to redirect stdout to
320 :param stderr: File object to redirect stderr to
321 :param error_code: Extension error code to raise in case of error
322 """232 """
323 try:233 # the output of mount is similar to
324 extension_cgroups = self.create_extension_cgroups(extension_name)234 # $ mount -t cgroup2
325 except Exception as exception:235 # cgroup2 on /sys/fs/cgroup/unified type cgroup2 (rw,nosuid,nodev,noexec,relatime,nsdelegate)
326 extension_cgroups = []236 #
327 logger.warn("Failed to create cgroups for extension '{0}'; resource usage will not be tracked. "237 for line in shellutil.run_command(['mount', '-t', 'cgroup2']).splitlines():
328 "Error: {1}".format(extension_name, ustr(exception)))238 match = re.search(r'on\s+(?P<path>/\S+)\s', line)
329239 if match is not None:
330 def pre_exec_function():240 mount_point = match.group('path')
331 os.setsid()241 controllers = None
332242 controllers_file = os.path.join(mount_point, 'cgroup.controllers')
333 try:243 if os.path.exists(controllers_file):
334 pid = os.getpid()244 controllers = fileutil.read_file(controllers_file)
335245 return mount_point, controllers
336 for cgroup in extension_cgroups:246 return None, None
337 try:
338 self._add_process_to_cgroup(pid, cgroup.path)
339 except Exception as exception:
340 logger.warn("Failed to add PID {0} to the cgroups for extension '{1}'. "
341 "Resource usage will not be tracked. Error: {2}".format(pid,
342 extension_name,
343 ustr(exception)))
344 except Exception as e:
345 logger.warn("Failed to add extension {0} to its cgroup. Resource usage will not be tracked. "
346 "Error: {1}".format(extension_name, ustr(e)))
347
348 process = subprocess.Popen(command,
349 shell=shell,
350 cwd=cwd,
351 env=env,
352 stdout=stdout,
353 stderr=stderr,
354 preexec_fn=pre_exec_function)
355
356 self.track_cgroups(extension_cgroups)
357 process_output = handle_process_completion(process=process,
358 command=command,
359 timeout=timeout,
360 stdout=stdout,
361 stderr=stderr,
362 error_code=error_code)
363
364 return extension_cgroups, process_output
365
366
367class SystemdCgroupsApi(CGroupsApi):
368 """
369 Cgroups interface via systemd
370 """
371
372 @staticmethod
373 def create_and_start_unit(unit_filename, unit_contents):
374 try:
375 unit_path = os.path.join(UNIT_FILES_FILE_SYSTEM_PATH, unit_filename)
376 fileutil.write_file(unit_path, unit_contents)
377 shellutil.run_command(["systemctl", "daemon-reload"])
378 shellutil.run_command(["systemctl", "start", unit_filename])
379 except Exception as e:
380 raise CGroupsException("Failed to create and start {0}. Error: {1}".format(unit_filename, ustr(e)))
381247
382 @staticmethod248 @staticmethod
383 def _get_extensions_slice_root_name():249 def _is_systemd_failure(scope_name, stderr):
384 return "system-{0}.slice".format(EXTENSIONS_ROOT_CGROUP_NAME)250 stderr.seek(0)
385251 stderr = ustr(stderr.read(TELEMETRY_MESSAGE_MAX_LEN), encoding='utf-8', errors='backslashreplace')
386 def _get_extension_slice_name(self, extension_name):252 unit_not_found = "Unit {0} not found.".format(scope_name)
387 return "system-{0}-{1}.slice".format(EXTENSIONS_ROOT_CGROUP_NAME, self._get_extension_cgroup_name(extension_name))253 return unit_not_found in stderr or scope_name not in stderr
388
389 def create_agent_cgroups(self):
390 try:
391 cgroup_unit = None
392 cgroup_paths = fileutil.read_file("/proc/self/cgroup")
393 for entry in cgroup_paths.splitlines():
394 fields = entry.split(':')
395 if fields[1] == "name=systemd":
396 cgroup_unit = fields[2].lstrip(os.path.sep)
397
398 cpu_cgroup_path = os.path.join(CGROUPS_FILE_SYSTEM_ROOT, 'cpu', cgroup_unit)
399 memory_cgroup_path = os.path.join(CGROUPS_FILE_SYSTEM_ROOT, 'memory', cgroup_unit)
400
401 return [CGroup.create(cpu_cgroup_path, 'cpu', VM_AGENT_CGROUP_NAME),
402 CGroup.create(memory_cgroup_path, 'memory', VM_AGENT_CGROUP_NAME)]
403 except Exception as e:
404 raise CGroupsException("Failed to get paths of agent's cgroups. Error: {0}".format(ustr(e)))
405
406 def create_extension_cgroups_root(self):
407 unit_contents = """
408[Unit]
409Description=Slice for walinuxagent extensions
410DefaultDependencies=no
411Before=slices.target
412Requires=system.slice
413After=system.slice"""
414 unit_filename = self._get_extensions_slice_root_name()
415 self.create_and_start_unit(unit_filename, unit_contents)
416 logger.info("Created slice for walinuxagent extensions {0}".format(unit_filename))
417
418 def create_extension_cgroups(self, extension_name):
419 # TODO: The slice created by this function is not used currently. We need to create the extension scopes within
420 # this slice and use the slice to monitor the cgroups. Also see comment in get_extension_cgroups.
421 # the slice.
422 unit_contents = """
423[Unit]
424Description=Slice for extension {0}
425DefaultDependencies=no
426Before=slices.target
427Requires=system-{1}.slice
428After=system-{1}.slice""".format(extension_name, EXTENSIONS_ROOT_CGROUP_NAME)
429 unit_filename = self._get_extension_slice_name(extension_name)
430 self.create_and_start_unit(unit_filename, unit_contents)
431 logger.info("Created slice for {0}".format(unit_filename))
432
433 return self.get_extension_cgroups(extension_name)
434
435 def remove_extension_cgroups(self, extension_name):
436 # For transient units, cgroups are released automatically when the unit stops, so it is sufficient
437 # to call stop on them. Persistent cgroups are released when the unit is disabled and its configuration
438 # file is deleted.
439 # The assumption is that this method is called after the extension has been uninstalled. For now, since
440 # we're running extensions within transient scopes which clean up after they finish running, no removal
441 # of units is needed. In the future, when the extension is running under its own slice,
442 # the following clean up is needed.
443 unit_filename = self._get_extension_slice_name(extension_name)
444 try:
445 unit_path = os.path.join(UNIT_FILES_FILE_SYSTEM_PATH, unit_filename)
446 shellutil.run_command(["systemctl", "stop", unit_filename])
447 fileutil.rm_files(unit_path)
448 shellutil.run_command(["systemctl", "daemon-reload"])
449 except Exception as e:
450 raise CGroupsException("Failed to remove {0}. Error: {1}".format(unit_filename, ustr(e)))
451
452 def get_extension_cgroups(self, extension_name):
453 # TODO: The slice returned by this function is not used currently. We need to create the extension scopes within
454 # this slice and use the slice to monitor the cgroups. Also see comment in create_extension_cgroups.
455 slice_name = self._get_extension_cgroup_name(extension_name)
456
457 cgroups = []
458
459 def create_cgroup(controller):
460 cpu_cgroup_path = os.path.join(CGROUPS_FILE_SYSTEM_ROOT, controller, 'system.slice', slice_name)
461 cgroups.append(CGroup.create(cpu_cgroup_path, controller, extension_name))
462
463 self._foreach_controller(create_cgroup, 'Cannot retrieve cgroup for extension {0}; resource usage will not be tracked.'.format(extension_name))
464
465 return cgroups
466254
467 @staticmethod255 @staticmethod
468 def _is_systemd_failure(scope_name, process_output):256 def get_extension_slice_name(extension_name, old_slice=False):
469 unit_not_found = "Unit {0} not found.".format(scope_name)257 # The old slice makes it difficult for user to override the limits because they need to place drop-in files on every upgrade if extension slice is different for each version.
470 return unit_not_found in process_output or scope_name not in process_output258 # old slice includes <HandlerName>.<ExtensionName>-<HandlerVersion>
259 # new slice without version <HandlerName>.<ExtensionName>
260 if not old_slice:
261 extension_name = extension_name.rsplit("-", 1)[0]
262 # Since '-' is used as a separator in systemd unit names, we replace it with '_' to prevent side-effects.
263 return EXTENSION_SLICE_PREFIX + "-" + extension_name.replace('-', '_') + ".slice"
471264
472 def start_extension_command(self, extension_name, command, timeout, shell, cwd, env, stdout, stderr,265 def start_extension_command(self, extension_name, command, cmd_name, timeout, shell, cwd, env, stdout, stderr,
473 error_code=ExtensionErrorCodes.PluginUnknownFailure):266 error_code=ExtensionErrorCodes.PluginUnknownFailure):
474 scope_name = "{0}_{1}".format(self._get_extension_cgroup_name(extension_name), uuid.uuid4())267 scope = "{0}_{1}".format(cmd_name, uuid.uuid4())
475268 extension_slice_name = self.get_extension_slice_name(extension_name)
476 process = subprocess.Popen(269 with self._systemd_run_commands_lock:
477 "systemd-run --unit={0} --scope {1}".format(scope_name, command),270 process = subprocess.Popen( # pylint: disable=W1509
478 shell=shell,271 # Some distros like ubuntu20 by default cpu and memory accounting enabled. Thus create nested cgroups under the extension slice
479 cwd=cwd,272 # So disabling CPU and Memory accounting prevents from creating nested cgroups, so that all the counters will be present in extension Cgroup
480 stdout=stdout,273 # since slice unit file configured with accounting enabled.
481 stderr=stderr,274 "systemd-run --property=CPUAccounting=no --property=MemoryAccounting=no --unit={0} --scope --slice={1} {2}".format(scope, extension_slice_name, command),
482 env=env,275 shell=shell,
483 preexec_fn=os.setsid)276 cwd=cwd,
277 stdout=stdout,
278 stderr=stderr,
279 env=env,
280 preexec_fn=os.setsid)
281
282 # We start systemd-run with shell == True so process.pid is the shell's pid, not the pid for systemd-run
283 self._systemd_run_commands.append(process.pid)
284
285 scope_name = scope + '.scope'
286
287 logger.info("Started extension in unit '{0}'", scope_name)
288
289 cpu_cgroup = None
290 try:
291 cgroup_relative_path = os.path.join('azure.slice/azure-vmextensions.slice', extension_slice_name)
484292
485 logger.info("Started extension using scope '{0}'", scope_name)293 cpu_cgroup_mountpoint, memory_cgroup_mountpoint = self.get_cgroup_mount_points()
486 extension_cgroups = []
487294
488 def create_cgroup(controller):295 if cpu_cgroup_mountpoint is None:
489 cgroup_path = os.path.join(CGROUPS_FILE_SYSTEM_ROOT, controller, 'system.slice', scope_name + ".scope")296 logger.info("The CPU controller is not mounted; will not track resource usage")
490 extension_cgroups.append(CGroup.create(cgroup_path, controller, extension_name))297 else:
298 cpu_cgroup_path = os.path.join(cpu_cgroup_mountpoint, cgroup_relative_path)
299 cpu_cgroup = CpuCgroup(extension_name, cpu_cgroup_path)
300 CGroupsTelemetry.track_cgroup(cpu_cgroup)
491301
492 self._foreach_controller(create_cgroup, 'Cannot create cgroup for extension {0}; '302 if memory_cgroup_mountpoint is None:
493 'resource usage will not be tracked.'.format(extension_name))303 logger.info("The Memory controller is not mounted; will not track resource usage")
494 self.track_cgroups(extension_cgroups)304 else:
305 memory_cgroup_path = os.path.join(memory_cgroup_mountpoint, cgroup_relative_path)
306 memory_cgroup = MemoryCgroup(extension_name, memory_cgroup_path)
307 CGroupsTelemetry.track_cgroup(memory_cgroup)
308
309 except IOError as e:
310 if e.errno == 2: # 'No such file or directory'
311 logger.info("The extension command already completed; will not track resource usage")
312 logger.info("Failed to start tracking resource usage for the extension: {0}", ustr(e))
313 except Exception as e:
314 logger.info("Failed to start tracking resource usage for the extension: {0}", ustr(e))
495315
496 # Wait for process completion or timeout316 # Wait for process completion or timeout
497 try:317 try:
498 process_output = handle_process_completion(process=process,318 return handle_process_completion(process=process, command=command, timeout=timeout, stdout=stdout,
499 command=command,319 stderr=stderr, error_code=error_code, cpu_cgroup=cpu_cgroup)
500 timeout=timeout,
501 stdout=stdout,
502 stderr=stderr,
503 error_code=error_code)
504 except ExtensionError as e:320 except ExtensionError as e:
505 # The extension didn't terminate successfully. Determine whether it was due to systemd errors or321 # The extension didn't terminate successfully. Determine whether it was due to systemd errors or
506 # extension errors.322 # extension errors.
507 process_output = read_output(stdout, stderr)323 if not self._is_systemd_failure(scope, stderr):
508 systemd_failure = self._is_systemd_failure(scope_name, process_output)
509
510 if not systemd_failure:
511 # There was an extension error; it either timed out or returned a non-zero exit code. Re-raise the error324 # There was an extension error; it either timed out or returned a non-zero exit code. Re-raise the error
512 raise325 raise
326
327 # There was an issue with systemd-run. We need to log it and retry the extension without systemd.
328 process_output = read_output(stdout, stderr)
329 # Reset the stdout and stderr
330 stdout.truncate(0)
331 stderr.truncate(0)
332
333 if isinstance(e, ExtensionOperationError):
334 # no-member: Instance of 'ExtensionError' has no 'exit_code' member (no-member) - Disabled: e is actually an ExtensionOperationError
335 err_msg = 'Systemd process exited with code %s and output %s' % (
336 e.exit_code, process_output) # pylint: disable=no-member
513 else:337 else:
514 # There was an issue with systemd-run. We need to log it and retry the extension without systemd.338 err_msg = "Systemd timed-out, output: %s" % process_output
515 err_msg = 'Systemd process exited with code %s and output %s' % (e.exit_code, process_output) \339 raise SystemdRunError(err_msg)
516 if isinstance(e, ExtensionOperationError) else "Systemd timed-out, output: %s" % process_output340 finally:
517 event_msg = 'Failed to run systemd-run for unit {0}.scope. ' \341 with self._systemd_run_commands_lock:
518 'Will retry invoking the extension without systemd. ' \342 self._systemd_run_commands.remove(process.pid)
519 'Systemd-run error: {1}'.format(scope_name, err_msg)
520 add_event(AGENT_NAME,
521 version=CURRENT_VERSION,
522 op=WALAEventOperation.InvokeCommandUsingSystemd,
523 is_success=False,
524 log_event=False,
525 message=event_msg)
526 logger.warn(event_msg)
527
528 # Reset the stdout and stderr
529 stdout.truncate(0)
530 stderr.truncate(0)
531
532 # Try invoking the process again, this time without systemd-run
533 logger.info('Extension invocation using systemd failed, falling back to regular invocation '
534 'without cgroups tracking.')
535 process = subprocess.Popen(command,
536 shell=shell,
537 cwd=cwd,
538 env=env,
539 stdout=stdout,
540 stderr=stderr,
541 preexec_fn=os.setsid)
542
543 process_output = handle_process_completion(process=process,
544 command=command,
545 timeout=timeout,
546 stdout=stdout,
547 stderr=stderr,
548 error_code=error_code)
549
550 return [], process_output
551
552 # The process terminated in time and successfully
553 return extension_cgroups, process_output
554343
555 def cleanup_legacy_cgroups(self):344 def cleanup_legacy_cgroups(self):
556 """345 """
557 Previous versions of the daemon (2.2.31-2.2.40) wrote their PID to /sys/fs/cgroup/{cpu,memory}/WALinuxAgent/WALinuxAgent;346 Previous versions of the daemon (2.2.31-2.2.40) wrote their PID to /sys/fs/cgroup/{cpu,memory}/WALinuxAgent/WALinuxAgent;
558 starting from version 2.2.41 we track the agent service in walinuxagent.service instead of WALinuxAgent/WALinuxAgent. If347 starting from version 2.2.41 we track the agent service in walinuxagent.service instead of WALinuxAgent/WALinuxAgent. If
559 we find that any of the legacy groups include the PID of the daemon then we disable data collection for this instance348 we find that any of the legacy groups include the PID of the daemon then we need to disable data collection for this
560 (under systemd, moving PIDs across the cgroup file system can produce unpredictable results)349 instance (under systemd, moving PIDs across the cgroup file system can produce unpredictable results)
561 """350 """
562 def report_error(_, daemon_pid):351 return CGroupsApi._foreach_legacy_cgroup(lambda *_: None)
563 raise CGroupsException(
564 "The daemon's PID ({0}) was already added to the legacy cgroup; this invalidates resource usage data.".format(daemon_pid))
565
566 CGroupsApi._foreach_legacy_cgroup(report_error)
diff --git a/azurelinuxagent/common/cgroupconfigurator.py b/azurelinuxagent/common/cgroupconfigurator.py
index ea6983f..767786f 100644
--- a/azurelinuxagent/common/cgroupconfigurator.py
+++ b/azurelinuxagent/common/cgroupconfigurator.py
@@ -1,3 +1,4 @@
1# -*- encoding: utf-8 -*-
1# Copyright 2018 Microsoft Corporation2# Copyright 2018 Microsoft Corporation
2#3#
3# Licensed under the Apache License, Version 2.0 (the "License");4# Licensed under the Apache License, Version 2.0 (the "License");
@@ -13,157 +14,870 @@
13# limitations under the License.14# limitations under the License.
14#15#
15# Requires Python 2.6+ and Openssl 1.0+16# Requires Python 2.6+ and Openssl 1.0+
1617import glob
18import json
17import os19import os
20import re
18import subprocess21import subprocess
22import threading
1923
24from azurelinuxagent.common import conf
20from azurelinuxagent.common import logger25from azurelinuxagent.common import logger
21from azurelinuxagent.common.cgroupapi import CGroupsApi26from azurelinuxagent.common.cgroup import CpuCgroup, AGENT_NAME_TELEMETRY, MetricsCounter, MemoryCgroup
27from azurelinuxagent.common.cgroupapi import CGroupsApi, SystemdCgroupsApi, SystemdRunError, EXTENSION_SLICE_PREFIX
22from azurelinuxagent.common.cgroupstelemetry import CGroupsTelemetry28from azurelinuxagent.common.cgroupstelemetry import CGroupsTelemetry
23from azurelinuxagent.common.exception import CGroupsException, ExtensionErrorCodes29from azurelinuxagent.common.exception import ExtensionErrorCodes, CGroupsException, AgentMemoryExceededException
24from azurelinuxagent.common.future import ustr30from azurelinuxagent.common.future import ustr
25from azurelinuxagent.common.osutil import get_osutil31from azurelinuxagent.common.osutil import get_osutil, systemd
32from azurelinuxagent.common.version import get_distro
33from azurelinuxagent.common.utils import shellutil, fileutil
26from azurelinuxagent.common.utils.extensionprocessutil import handle_process_completion34from azurelinuxagent.common.utils.extensionprocessutil import handle_process_completion
27from azurelinuxagent.common.version import AGENT_NAME, CURRENT_VERSION
28from azurelinuxagent.common.event import add_event, WALAEventOperation35from azurelinuxagent.common.event import add_event, WALAEventOperation
2936
37AZURE_SLICE = "azure.slice"
38_AZURE_SLICE_CONTENTS = """
39[Unit]
40Description=Slice for Azure VM Agent and Extensions
41DefaultDependencies=no
42Before=slices.target
43"""
44_VMEXTENSIONS_SLICE = EXTENSION_SLICE_PREFIX + ".slice"
45_AZURE_VMEXTENSIONS_SLICE = AZURE_SLICE + "/" + _VMEXTENSIONS_SLICE
46_VMEXTENSIONS_SLICE_CONTENTS = """
47[Unit]
48Description=Slice for Azure VM Extensions
49DefaultDependencies=no
50Before=slices.target
51[Slice]
52CPUAccounting=yes
53MemoryAccounting=yes
54"""
55_EXTENSION_SLICE_CONTENTS = """
56[Unit]
57Description=Slice for Azure VM extension {extension_name}
58DefaultDependencies=no
59Before=slices.target
60[Slice]
61CPUAccounting=yes
62CPUQuota={cpu_quota}
63MemoryAccounting=yes
64"""
65LOGCOLLECTOR_SLICE = "azure-walinuxagent-logcollector.slice"
66# More info on resource limits properties in systemd here:
67# https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/7/html/resource_management_guide/sec-modifying_control_groups
68_LOGCOLLECTOR_SLICE_CONTENTS_FMT = """
69[Unit]
70Description=Slice for Azure VM Agent Periodic Log Collector
71DefaultDependencies=no
72Before=slices.target
73[Slice]
74CPUAccounting=yes
75CPUQuota={cpu_quota}
76MemoryAccounting=yes
77"""
78_LOGCOLLECTOR_CPU_QUOTA = "5%"
79LOGCOLLECTOR_MEMORY_LIMIT = 30 * 1024 ** 2 # 30Mb
80
81_AGENT_DROP_IN_FILE_SLICE = "10-Slice.conf"
82_AGENT_DROP_IN_FILE_SLICE_CONTENTS = """
83# This drop-in unit file was created by the Azure VM Agent.
84# Do not edit.
85[Service]
86Slice=azure.slice
87"""
88_DROP_IN_FILE_CPU_ACCOUNTING = "11-CPUAccounting.conf"
89_DROP_IN_FILE_CPU_ACCOUNTING_CONTENTS = """
90# This drop-in unit file was created by the Azure VM Agent.
91# Do not edit.
92[Service]
93CPUAccounting=yes
94"""
95_DROP_IN_FILE_CPU_QUOTA = "12-CPUQuota.conf"
96_DROP_IN_FILE_CPU_QUOTA_CONTENTS_FORMAT = """
97# This drop-in unit file was created by the Azure VM Agent.
98# Do not edit.
99[Service]
100CPUQuota={0}
101"""
102_DROP_IN_FILE_MEMORY_ACCOUNTING = "13-MemoryAccounting.conf"
103_DROP_IN_FILE_MEMORY_ACCOUNTING_CONTENTS = """
104# This drop-in unit file was created by the Azure VM Agent.
105# Do not edit.
106[Service]
107MemoryAccounting=yes
108"""
109
110
class DisableCgroups(object):
    # Selectors describing which part of cgroup monitoring should be turned off; they are
    # compared against the disable_cgroups argument of CGroupConfigurator's disable().
    ALL = "all"  # agent and extensions
    AGENT = "agent"  # only the agent
    EXTENSIONS = "extensions"  # NOTE(review): presumably only the extensions; not handled by the disable() code visible here
115
116
def _log_cgroup_info(format_string, *args):
    """
    Formats format_string with args (str.format style), writes the result to the agent log
    prefixed with "[CGI] " and reports it as a CGroupsInfo telemetry event.
    """
    text = format_string.format(*args)
    log_line = "[CGI] {0}".format(text)
    logger.info(log_line)
    add_event(message=text, op=WALAEventOperation.CGroupsInfo)
121
122
def _log_cgroup_warning(format_string, *args):
    """
    Formats format_string with args (str.format style), writes the result to the agent log
    prefixed with "[CGW] " and reports it as a failed CGroupsInfo telemetry event
    (is_success=False, log_event=False).
    """
    text = format_string.format(*args)
    log_line = "[CGW] {0}".format(text)
    # log as INFO for now, in the future it should be logged as WARNING
    logger.info(log_line)
    add_event(message=text, op=WALAEventOperation.CGroupsInfo, log_event=False, is_success=False)
127
30128
31class CGroupConfigurator(object):129class CGroupConfigurator(object):
32 """130 """
33 This class implements the high-level operations on CGroups (e.g. initialization, creation, etc)131 This class implements the high-level operations on CGroups (e.g. initialization, creation, etc)
34132
35 NOTE: with the exception of start_extension_command, none of the methods in this class raise exceptions (cgroup operations should not block extensions)133 NOTE: with the exception of start_extension_command, none of the methods in this class
134 raise exceptions (cgroup operations should not block extensions)
36 """135 """
37 class __impl(object):136
137 class _Impl(object):
38 def __init__(self):138 def __init__(self):
139 self._initialized = False
140 self._cgroups_supported = False
141 self._agent_cgroups_enabled = False
142 self._extensions_cgroups_enabled = False
143 self._cgroups_api = None
144 self._agent_cpu_cgroup_path = None
145 self._agent_memory_cgroup_path = None
146 self._agent_memory_cgroup = None
147 self._check_cgroups_lock = threading.RLock() # Protect the check_cgroups which is called from Monitor thread and main loop.
148
        def initialize(self):
            """
            One-time setup of cgroup monitoring for the agent.

            The method is a no-op after the first call (self._initialized is set in the finally clause,
            whether or not the setup succeeded). Any exception is logged as a cgroup warning and swallowed.

            Steps, in order:
              * if cgroups are not supported, remove the agent's drop-in files (slice, CPU/memory accounting,
                CPU quota) and reload systemd, then return;
              * return early if systemd is not detected, if legacy cgroups are found, or if the agent's
                unit is in a slice other than azure.slice/system.slice;
              * set up the azure slice, locate the agent's CPU and memory cgroups, enable monitoring if at
                least one of them was found, apply the configured CPU quota and start tracking the cgroups.
            """
            try:
                if self._initialized:
                    return
                # This check is to reset the quotas if agent goes from cgroup supported to unsupported distros later in time.
                if not CGroupsApi.cgroups_supported():
                    agent_drop_in_path = systemd.get_agent_drop_in_path()
                    try:
                        if os.path.exists(agent_drop_in_path) and os.path.isdir(agent_drop_in_path):
                            files_to_cleanup = []
                            agent_drop_in_file_slice = os.path.join(agent_drop_in_path, _AGENT_DROP_IN_FILE_SLICE)
                            agent_drop_in_file_cpu_accounting = os.path.join(agent_drop_in_path,
                                                                             _DROP_IN_FILE_CPU_ACCOUNTING)
                            agent_drop_in_file_memory_accounting = os.path.join(agent_drop_in_path,
                                                                                _DROP_IN_FILE_MEMORY_ACCOUNTING)
                            agent_drop_in_file_cpu_quota = os.path.join(agent_drop_in_path, _DROP_IN_FILE_CPU_QUOTA)
                            files_to_cleanup.extend([agent_drop_in_file_slice, agent_drop_in_file_cpu_accounting,
                                                     agent_drop_in_file_memory_accounting, agent_drop_in_file_cpu_quota])
                            self.__cleanup_all_files(files_to_cleanup)
                            self.__reload_systemd_config()
                            logger.info("Agent reset the quotas if distro: {0} goes from supported to unsupported list", get_distro())
                    except Exception as err:
                        # best effort: a failed cleanup must not prevent the rest of the initialization
                        logger.warn("Unable to delete Agent drop-in files while resetting the quotas: {0}".format(err))

                # check whether cgroup monitoring is supported on the current distro
                self._cgroups_supported = CGroupsApi.cgroups_supported()
                if not self._cgroups_supported:
                    logger.info("Cgroup monitoring is not supported on {0}", get_distro())
                    return

                # check that systemd is detected correctly
                self._cgroups_api = SystemdCgroupsApi()
                if not systemd.is_systemd():
                    _log_cgroup_warning("systemd was not detected on {0}", get_distro())
                    return

                _log_cgroup_info("systemd version: {0}", systemd.get_version())

                # This is temporarily disabled while we analyze telemetry. Likely it will be removed.
                # self.__collect_azure_unit_telemetry()
                # self.__collect_agent_unit_files_telemetry()

                if not self.__check_no_legacy_cgroups():
                    return

                agent_unit_name = systemd.get_agent_unit_name()
                agent_slice = systemd.get_unit_property(agent_unit_name, "Slice")
                if agent_slice not in (AZURE_SLICE, "system.slice"):
                    _log_cgroup_warning("The agent is within an unexpected slice: {0}", agent_slice)
                    return

                self.__setup_azure_slice()

                cpu_controller_root, memory_controller_root = self.__get_cgroup_controllers()
                self._agent_cpu_cgroup_path, self._agent_memory_cgroup_path = self.__get_agent_cgroups(agent_slice,
                                                                                                       cpu_controller_root,
                                                                                                       memory_controller_root)

                # monitoring is enabled as soon as at least one of the two agent cgroups was found
                if self._agent_cpu_cgroup_path is not None or self._agent_memory_cgroup_path is not None:
                    self.enable()

                if self._agent_cpu_cgroup_path is not None:
                    _log_cgroup_info("Agent CPU cgroup: {0}", self._agent_cpu_cgroup_path)
                    self.__set_cpu_quota(conf.get_agent_cpu_quota())
                    CGroupsTelemetry.track_cgroup(CpuCgroup(AGENT_NAME_TELEMETRY, self._agent_cpu_cgroup_path))

                if self._agent_memory_cgroup_path is not None:
                    _log_cgroup_info("Agent Memory cgroup: {0}", self._agent_memory_cgroup_path)
                    self._agent_memory_cgroup = MemoryCgroup(AGENT_NAME_TELEMETRY, self._agent_memory_cgroup_path)
                    CGroupsTelemetry.track_cgroup(self._agent_memory_cgroup)

                _log_cgroup_info('Agent cgroups enabled: {0}', self._agent_cgroups_enabled)

            except Exception as exception:
                _log_cgroup_warning("Error initializing cgroups: {0}", ustr(exception))
            finally:
                # mark as initialized even on failure/early return so that the setup is not attempted again
                self._initialized = True
226
        @staticmethod
        def __collect_azure_unit_telemetry():
            """
            Logs (as cgroup info events) the systemd units whose name starts with "azure", together with
            the slice each of them belongs to. If no such unit is listed, looks for an "azure.slice" entry
            in the output of systemd-cgls instead.

            Failures of the underlying commands are logged as cgroup warnings; they are not raised.
            """
            azure_units = []

            try:
                units = shellutil.run_command(['systemctl', 'list-units', 'azure*', '-all'])
                for line in units.split('\n'):
                    # keep the unit name (first token starting with "azure") and the full line describing it
                    match = re.match(r'\s?(azure[^\s]*)\s?', line, re.IGNORECASE)
                    if match is not None:
                        azure_units.append((match.group(1), line))
            except shellutil.CommandError as command_error:
                _log_cgroup_warning("Failed to list systemd units: {0}", ustr(command_error))

            for unit_name, unit_description in azure_units:
                unit_slice = "Unknown"  # reported when the Slice property cannot be queried
                try:
                    unit_slice = systemd.get_unit_property(unit_name, "Slice")
                except Exception as exception:
                    _log_cgroup_warning("Failed to query Slice for {0}: {1}", unit_name, ustr(exception))

                _log_cgroup_info("Found an Azure unit under slice {0}: {1}", unit_slice, unit_description)

            if len(azure_units) == 0:
                try:
                    cgroups = shellutil.run_command('systemd-cgls')
                    for line in cgroups.split('\n'):
                        # the line must start with characters outside \x00-\xff (NOTE(review): presumably the
                        # tree-drawing characters printed by systemd-cgls) followed by "azure.slice"
                        if re.match(r'[^\x00-\xff]+azure\.slice\s*', line, re.UNICODE):
                            logger.info(ustr("Found a cgroup for azure.slice\n{0}").format(cgroups))
                            # Don't add the output of systemd-cgls to the telemetry, since currently it does not support Unicode
                            add_event(op=WALAEventOperation.CGroupsInfo, message="Found a cgroup for azure.slice")
                except shellutil.CommandError as command_error:
                    _log_cgroup_warning("Failed to list systemd units: {0}", ustr(command_error))
259
        @staticmethod
        def __collect_agent_unit_files_telemetry():
            """
            Logs (as cgroup info events) the contents of the unit files that customize the agent's service:
            the unit's FragmentPath when it differs from systemd.get_agent_unit_file(), plus every file
            listed in the unit's DropInPaths property.

            Failures to query the properties or to read a file are logged as cgroup warnings; they are
            not raised.
            """
            agent_unit_files = []
            agent_service_name = get_osutil().get_service_name()
            try:
                fragment_path = systemd.get_unit_property(agent_service_name, "FragmentPath")
                # only a unit file at a location other than the expected one is considered custom
                if fragment_path != systemd.get_agent_unit_file():
                    agent_unit_files.append(fragment_path)
            except Exception as exception:
                _log_cgroup_warning("Failed to query the agent's FragmentPath: {0}", ustr(exception))

            try:
                drop_in_paths = systemd.get_unit_property(agent_service_name, "DropInPaths")
                # the property value is split on whitespace, one path per token
                for path in drop_in_paths.split():
                    agent_unit_files.append(path)
            except Exception as exception:
                _log_cgroup_warning("Failed to query the agent's DropInPaths: {0}", ustr(exception))

            for unit_file in agent_unit_files:
                try:
                    with open(unit_file, "r") as file_object:
                        _log_cgroup_info("Found a custom unit file for the agent: {0}\n{1}", unit_file,
                                         file_object.read())
                except Exception as exception:
                    _log_cgroup_warning("Can't read {0}: {1}", unit_file, ustr(exception))
285
286 def __check_no_legacy_cgroups(self):
287 """
288 Older versions of the daemon (2.2.31-2.2.40) wrote their PID to /sys/fs/cgroup/{cpu,memory}/WALinuxAgent/WALinuxAgent. When running
289 under systemd this could produce invalid resource usage data. Cgroups should not be enabled under this condition.
290 """
291 legacy_cgroups = self._cgroups_api.cleanup_legacy_cgroups()
292 if legacy_cgroups > 0:
293 _log_cgroup_warning("The daemon's PID was added to a legacy cgroup; will not monitor resource usage.")
294 return False
295 return True
296
297 def __get_cgroup_controllers(self):
298 #
299 # check v1 controllers
300 #
301 cpu_controller_root, memory_controller_root = self._cgroups_api.get_cgroup_mount_points()
302
303 if cpu_controller_root is not None:
304 logger.info("The CPU cgroup controller is mounted at {0}", cpu_controller_root)
305 else:
306 _log_cgroup_warning("The CPU cgroup controller is not mounted")
307
308 if memory_controller_root is not None:
309 logger.info("The memory cgroup controller is mounted at {0}", memory_controller_root)
310 else:
311 _log_cgroup_warning("The memory cgroup controller is not mounted")
312
313 #
314 # check v2 controllers
315 #
316 cgroup2_mount_point, cgroup2_controllers = self._cgroups_api.get_cgroup2_controllers()
317 if cgroup2_mount_point is not None:
318 _log_cgroup_info("cgroups v2 mounted at {0}. Controllers: [{1}]", cgroup2_mount_point,
319 cgroup2_controllers)
320
321 return cpu_controller_root, memory_controller_root
322
323 @staticmethod
324 def __setup_azure_slice():
39 """325 """
40 Ensures the cgroups file system is mounted and selects the correct API to interact with it326 The agent creates "azure.slice" for use by extensions and the agent. The agent runs under "azure.slice" directly and each
327 extension runs under its own slice ("Microsoft.CPlat.Extension.slice" in the example below). All the slices for
328 extensions are grouped under "vmextensions.slice".
329
330 Example: -.slice
331 ├─user.slice
332 ├─system.slice
333 └─azure.slice
334 ├─walinuxagent.service
335 │ ├─5759 /usr/bin/python3 -u /usr/sbin/waagent -daemon
336 │ └─5764 python3 -u bin/WALinuxAgent-2.2.53-py2.7.egg -run-exthandlers
337 └─azure-vmextensions.slice
338 └─Microsoft.CPlat.Extension.slice
339 └─5894 /usr/bin/python3 /var/lib/waagent/Microsoft.CPlat.Extension-1.0.0.0/enable.py
340
341 This method ensures that the "azure" and "vmextensions" slices are created. Setup should create those slices
342 under /lib/systemd/system; but if they do not exist, __ensure_azure_slices_exist will create them.
343
344 It also creates drop-in files to set the agent's Slice and CPUAccounting if they have not been
345 set up in the agent's unit file.
346
347 Lastly, the method also cleans up unit files left over from previous versions of the agent.
41 """348 """
42 osutil = get_osutil()
43349
44 self._cgroups_supported = osutil.is_cgroups_supported()350 # Older agents used to create this slice, but it was never used. Cleanup the file.
351 CGroupConfigurator._Impl.__cleanup_unit_file("/etc/systemd/system/system-walinuxagent.extensions.slice")
352
353 unit_file_install_path = systemd.get_unit_file_install_path()
354 azure_slice = os.path.join(unit_file_install_path, AZURE_SLICE)
355 vmextensions_slice = os.path.join(unit_file_install_path, _VMEXTENSIONS_SLICE)
356 logcollector_slice = os.path.join(unit_file_install_path, LOGCOLLECTOR_SLICE)
357 agent_unit_file = systemd.get_agent_unit_file()
358 agent_drop_in_path = systemd.get_agent_drop_in_path()
359 agent_drop_in_file_slice = os.path.join(agent_drop_in_path, _AGENT_DROP_IN_FILE_SLICE)
360 agent_drop_in_file_cpu_accounting = os.path.join(agent_drop_in_path, _DROP_IN_FILE_CPU_ACCOUNTING)
361 agent_drop_in_file_memory_accounting = os.path.join(agent_drop_in_path, _DROP_IN_FILE_MEMORY_ACCOUNTING)
362
363 files_to_create = []
364
365 if not os.path.exists(azure_slice):
366 files_to_create.append((azure_slice, _AZURE_SLICE_CONTENTS))
367
368 if not os.path.exists(vmextensions_slice):
369 files_to_create.append((vmextensions_slice, _VMEXTENSIONS_SLICE_CONTENTS))
370
371 # Update log collector slice contents
372 slice_contents = _LOGCOLLECTOR_SLICE_CONTENTS_FMT.format(cpu_quota=_LOGCOLLECTOR_CPU_QUOTA)
373 files_to_create.append((logcollector_slice, slice_contents))
374
375 if fileutil.findre_in_file(agent_unit_file, r"Slice=") is not None:
376 CGroupConfigurator._Impl.__cleanup_unit_file(agent_drop_in_file_slice)
377 else:
378 if not os.path.exists(agent_drop_in_file_slice):
379 files_to_create.append((agent_drop_in_file_slice, _AGENT_DROP_IN_FILE_SLICE_CONTENTS))
380
381 if fileutil.findre_in_file(agent_unit_file, r"CPUAccounting=") is not None:
382 CGroupConfigurator._Impl.__cleanup_unit_file(agent_drop_in_file_cpu_accounting)
383 else:
384 if not os.path.exists(agent_drop_in_file_cpu_accounting):
385 files_to_create.append((agent_drop_in_file_cpu_accounting, _DROP_IN_FILE_CPU_ACCOUNTING_CONTENTS))
386
387 if fileutil.findre_in_file(agent_unit_file, r"MemoryAccounting=") is not None:
388 CGroupConfigurator._Impl.__cleanup_unit_file(agent_drop_in_file_memory_accounting)
389 else:
390 if not os.path.exists(agent_drop_in_file_memory_accounting):
391 files_to_create.append(
392 (agent_drop_in_file_memory_accounting, _DROP_IN_FILE_MEMORY_ACCOUNTING_CONTENTS))
393
394 if len(files_to_create) > 0:
395 # create the unit files, but if 1 fails remove all and return
396 try:
397 for path, contents in files_to_create:
398 CGroupConfigurator._Impl.__create_unit_file(path, contents)
399 except Exception as exception:
400 _log_cgroup_warning("Failed to create unit files for the azure slice: {0}", ustr(exception))
401 for unit_file in files_to_create:
402 CGroupConfigurator._Impl.__cleanup_unit_file(unit_file)
403 return
404
405 CGroupConfigurator._Impl.__reload_systemd_config()
45406
46 if self._cgroups_supported:407 @staticmethod
47 self._enabled = True408 def __reload_systemd_config():
409 # reload the systemd configuration; the new slices will be used once the agent's service restarts
410 try:
411 logger.info("Executing systemctl daemon-reload...")
412 shellutil.run_command(["systemctl", "daemon-reload"])
413 except Exception as exception:
414 _log_cgroup_warning("daemon-reload failed (create azure slice): {0}", ustr(exception))
415
416 @staticmethod
417 def __create_unit_file(path, contents):
418 parent, _ = os.path.split(path)
419 if not os.path.exists(parent):
420 fileutil.mkdir(parent, mode=0o755)
421 exists = os.path.exists(path)
422 fileutil.write_file(path, contents)
423 _log_cgroup_info("{0} {1}", "Updated" if exists else "Created", path)
424
425 @staticmethod
426 def __cleanup_unit_file(path):
427 if os.path.exists(path):
48 try:428 try:
49 osutil.mount_cgroups()429 os.remove(path)
50 self._cgroups_api = CGroupsApi.create()430 _log_cgroup_info("Removed {0}", path)
51 status = "The cgroup filesystem is ready to use"431 except Exception as exception:
52 except Exception as e:432 _log_cgroup_warning("Failed to remove {0}: {1}", path, ustr(exception))
53 status = ustr(e)433
54 self._enabled = False434 @staticmethod
435 def __cleanup_all_files(files_to_cleanup):
436 for path in files_to_cleanup:
437 if os.path.exists(path):
438 try:
439 os.remove(path)
440 _log_cgroup_info("Removed {0}", path)
441 except Exception as exception:
442 _log_cgroup_warning("Failed to remove {0}: {1}", path, ustr(exception))
443
444 @staticmethod
445 def __create_all_files(files_to_create):
446 # create the unit files, but if 1 fails remove all and return
447 try:
448 for path, contents in files_to_create:
449 CGroupConfigurator._Impl.__create_unit_file(path, contents)
450 except Exception as exception:
451 _log_cgroup_warning("Failed to create unit files : {0}", ustr(exception))
452 for unit_file in files_to_create:
453 CGroupConfigurator._Impl.__cleanup_unit_file(unit_file)
454 return
455
456 def is_extension_resource_limits_setup_completed(self, extension_name, cpu_quota=None):
457 unit_file_install_path = systemd.get_unit_file_install_path()
458 old_extension_slice_path = os.path.join(unit_file_install_path, SystemdCgroupsApi.get_extension_slice_name(extension_name, old_slice=True))
459 # clean up the old slice from the disk
460 if os.path.exists(old_extension_slice_path):
461 CGroupConfigurator._Impl.__cleanup_unit_file(old_extension_slice_path)
462
463 extension_slice_path = os.path.join(unit_file_install_path,
464 SystemdCgroupsApi.get_extension_slice_name(extension_name))
465 cpu_quota = str(
466 cpu_quota) + "%" if cpu_quota is not None else "" # setting an empty value resets to the default (infinity)
467 slice_contents = _EXTENSION_SLICE_CONTENTS.format(extension_name=extension_name,
468 cpu_quota=cpu_quota)
469 if os.path.exists(extension_slice_path):
470 with open(extension_slice_path, "r") as file_:
471 if file_.read() == slice_contents:
472 return True
473 return False
474
475 def __get_agent_cgroups(self, agent_slice, cpu_controller_root, memory_controller_root):
476 agent_unit_name = systemd.get_agent_unit_name()
477
478 expected_relative_path = os.path.join(agent_slice, agent_unit_name)
479 cpu_cgroup_relative_path, memory_cgroup_relative_path = self._cgroups_api.get_process_cgroup_relative_paths(
480 "self")
481
482 if cpu_cgroup_relative_path is None:
483 _log_cgroup_warning("The agent's process is not within a CPU cgroup")
484 else:
485 if cpu_cgroup_relative_path == expected_relative_path:
486 _log_cgroup_info('CPUAccounting: {0}', systemd.get_unit_property(agent_unit_name, "CPUAccounting"))
487 _log_cgroup_info('CPUQuota: {0}', systemd.get_unit_property(agent_unit_name, "CPUQuotaPerSecUSec"))
488 else:
489 _log_cgroup_warning(
490 "The Agent is not in the expected CPU cgroup; will not enable monitoring. Cgroup:[{0}] Expected:[{1}]",
491 cpu_cgroup_relative_path,
492 expected_relative_path)
493 cpu_cgroup_relative_path = None # Set the path to None to prevent monitoring
494
495 if memory_cgroup_relative_path is None:
496 _log_cgroup_warning("The agent's process is not within a memory cgroup")
55 else:497 else:
56 self._enabled = False498 if memory_cgroup_relative_path == expected_relative_path:
57 self._cgroups_api = None499 memory_accounting = systemd.get_unit_property(agent_unit_name, "MemoryAccounting")
58 status = "Cgroups are not supported by the platform"500 _log_cgroup_info('MemoryAccounting: {0}', memory_accounting)
501 else:
502 _log_cgroup_info(
503 "The Agent is not in the expected memory cgroup; will not enable monitoring. CGroup:[{0}] Expected:[{1}]",
504 memory_cgroup_relative_path,
505 expected_relative_path)
506 memory_cgroup_relative_path = None # Set the path to None to prevent monitoring
59507
60 logger.info("CGroups Status: {0}".format(status))508 if cpu_controller_root is not None and cpu_cgroup_relative_path is not None:
509 agent_cpu_cgroup_path = os.path.join(cpu_controller_root, cpu_cgroup_relative_path)
510 else:
511 agent_cpu_cgroup_path = None
512
513 if memory_controller_root is not None and memory_cgroup_relative_path is not None:
514 agent_memory_cgroup_path = os.path.join(memory_controller_root, memory_cgroup_relative_path)
515 else:
516 agent_memory_cgroup_path = None
61517
62 add_event(518 return agent_cpu_cgroup_path, agent_memory_cgroup_path
63 AGENT_NAME,519
64 version=CURRENT_VERSION,520 def supported(self):
65 op=WALAEventOperation.InitializeCGroups,521 return self._cgroups_supported
66 is_success=self._enabled,
67 message=status,
68 log_event=False)
69522
70 def enabled(self):523 def enabled(self):
71 return self._enabled524 return self._agent_cgroups_enabled or self._extensions_cgroups_enabled
525
        def agent_enabled(self):
            """
            Returns the current value of the flag that tracks whether cgroups are enabled for the agent.
            """
            return self._agent_cgroups_enabled
528
        def extensions_enabled(self):
            """
            Returns the current value of the flag that tracks whether cgroups are enabled for extensions.
            """
            return self._extensions_cgroups_enabled
72531
73 def enable(self):532 def enable(self):
74 if not self._cgroups_supported:533 if not self.supported():
75 raise CGroupsException("cgroups are not supported on the current platform")534 raise CGroupsException(
535 "Attempted to enable cgroups, but they are not supported on the current platform")
536 self._agent_cgroups_enabled = True
537 self._extensions_cgroups_enabled = True
76538
77 self._enabled = True539 def disable(self, reason, disable_cgroups):
540 if disable_cgroups == DisableCgroups.ALL: # disable all
541 # Reset quotas
542 self.__reset_agent_cpu_quota()
543 extension_services = self.get_extension_services_list()
544 for extension in extension_services:
545 logger.info("Resetting extension : {0} and it's services: {1} CPUQuota".format(extension, extension_services[extension]))
546 self.__reset_extension_cpu_quota(extension_name=extension)
547 self.__reset_extension_services_cpu_quota(extension_services[extension])
548 self.__reload_systemd_config()
78549
79 def disable(self):550 CGroupsTelemetry.reset()
80 self._enabled = False551 self._agent_cgroups_enabled = False
81 CGroupsTelemetry.reset()552 self._extensions_cgroups_enabled = False
553 elif disable_cgroups == DisableCgroups.AGENT: # disable agent
554 self._agent_cgroups_enabled = False
555 self.__reset_agent_cpu_quota()
556 CGroupsTelemetry.stop_tracking(CpuCgroup(AGENT_NAME_TELEMETRY, self._agent_cpu_cgroup_path))
82557
83 def _invoke_cgroup_operation(self, operation, error_message, on_error=None):558 message = "[CGW] Disabling resource usage monitoring. Reason: {0}".format(reason)
559 logger.info(message) # log as INFO for now, in the future it should be logged as WARNING
560 add_event(op=WALAEventOperation.CGroupsDisabled, message=message, is_success=False, log_event=False)
561
562 @staticmethod
563 def __set_cpu_quota(quota):
84 """564 """
85 Ensures the given operation is invoked only if cgroups are enabled and traps any errors on the operation.565 Sets the agent's CPU quota to the given percentage (100% == 1 CPU)
566
567 NOTE: This is done using a dropin file in the default dropin directory; any local overrides on the VM will take precedence
568 over this setting.
86 """569 """
87 if not self.enabled():570 quota_percentage = "{0}%".format(quota)
88 return571 _log_cgroup_info("Ensuring the agent's CPUQuota is {0}", quota_percentage)
572 if CGroupConfigurator._Impl.__try_set_cpu_quota(quota_percentage):
573 CGroupsTelemetry.set_track_throttled_time(True)
574
575 @staticmethod
576 def __reset_agent_cpu_quota():
577 """
578 Removes any CPUQuota on the agent
89579
580 NOTE: This resets the quota on the agent's default dropin file; any local overrides on the VM will take precedence
581 over this setting.
582 """
583 logger.info("Resetting agent's CPUQuota")
584 if CGroupConfigurator._Impl.__try_set_cpu_quota(''): # setting an empty value resets to the default (infinity)
585 _log_cgroup_info('CPUQuota: {0}',
586 systemd.get_unit_property(systemd.get_agent_unit_name(), "CPUQuotaPerSecUSec"))
587
588 @staticmethod
589 def __try_set_cpu_quota(quota):
90 try:590 try:
91 return operation()591 drop_in_file = os.path.join(systemd.get_agent_drop_in_path(), _DROP_IN_FILE_CPU_QUOTA)
92 except Exception as e:592 contents = _DROP_IN_FILE_CPU_QUOTA_CONTENTS_FORMAT.format(quota)
93 logger.warn("{0} Error: {1}".format(error_message, ustr(e)))593 if os.path.exists(drop_in_file):
94 if on_error is not None:594 with open(drop_in_file, "r") as file_:
95 try:595 if file_.read() == contents:
96 on_error(e)596 return True # no need to update the file; return here to avoid doing a daemon-reload
97 except Exception as ex:597 CGroupConfigurator._Impl.__create_unit_file(drop_in_file, contents)
98 logger.warn("CGroupConfigurator._invoke_cgroup_operation: {0}".format(ustr(e)))598 except Exception as exception:
599 _log_cgroup_warning('Failed to set CPUQuota: {0}', ustr(exception))
600 return False
601 try:
602 logger.info("Executing systemctl daemon-reload...")
603 shellutil.run_command(["systemctl", "daemon-reload"])
604 except Exception as exception:
605 _log_cgroup_warning("daemon-reload failed (set quota): {0}", ustr(exception))
606 return False
607 return True
608
609 def check_cgroups(self, cgroup_metrics):
610 self._check_cgroups_lock.acquire()
611 try:
612 if not self.enabled():
613 return
614
615 errors = []
616
617 process_check_success = False
618 try:
619 self._check_processes_in_agent_cgroup()
620 process_check_success = True
621 except CGroupsException as exception:
622 errors.append(exception)
99623
100 def create_agent_cgroups(self, track_cgroups):624 quota_check_success = False
625 try:
626 if cgroup_metrics:
627 self._check_agent_throttled_time(cgroup_metrics)
628 quota_check_success = True
629 except CGroupsException as exception:
630 errors.append(exception)
631
632 reason = "Check on cgroups failed:\n{0}".format("\n".join([ustr(e) for e in errors]))
633
634 if not process_check_success and conf.get_cgroup_disable_on_process_check_failure():
635 self.disable(reason, DisableCgroups.ALL)
636
637 if not quota_check_success and conf.get_cgroup_disable_on_quota_check_failure():
638 self.disable(reason, DisableCgroups.AGENT)
639 finally:
640 self._check_cgroups_lock.release()
641
642 def _check_processes_in_agent_cgroup(self):
101 """643 """
102 Creates and returns the cgroups needed to track the VM Agent644 Verifies that the agent's cgroup includes only the current process, its parent, commands started using shellutil and instances of systemd-run
645 (those processes correspond, respectively, to the extension handler, the daemon, commands started by the extension handler, and the systemd-run
646 commands used to start extensions on their own cgroup).
647 Other processes started by the agent (e.g. extensions) and processes not started by the agent (e.g. services installed by extensions) are reported
648 as unexpected, since they should belong to their own cgroup.
649
650 Raises a CGroupsException if the check fails
103 """651 """
104 def __impl():652 unexpected = []
105 cgroups = self._cgroups_api.create_agent_cgroups()653 agent_cgroup_proc_names = []
654 try:
655 daemon = os.getppid()
656 extension_handler = os.getpid()
657 agent_commands = set()
658 agent_commands.update(shellutil.get_running_commands())
659 systemd_run_commands = set()
660 systemd_run_commands.update(self._cgroups_api.get_systemd_run_commands())
661 agent_cgroup = CGroupsApi.get_processes_in_cgroup(self._agent_cpu_cgroup_path)
662 # get the running commands again in case new commands started or completed while we were fetching the processes in the cgroup;
663 agent_commands.update(shellutil.get_running_commands())
664 systemd_run_commands.update(self._cgroups_api.get_systemd_run_commands())
106665
107 if track_cgroups:666 for process in agent_cgroup:
108 for cgroup in cgroups:667 agent_cgroup_proc_names.append(self.__format_process(process))
109 CGroupsTelemetry.track_cgroup(cgroup)668 # Note that the agent uses systemd-run to start extensions; systemd-run belongs to the agent cgroup, though the extensions don't.
669 if process in (daemon, extension_handler) or process in systemd_run_commands:
670 continue
671 # check shell systemd_run process if above process check didn't catch it
672 if self._check_systemd_run_process(process):
673 continue
674 # systemd_run_commands contains the shell that started systemd-run, so we also need to check for the parent
675 if self._get_parent(process) in systemd_run_commands and self._get_command(
676 process) == 'systemd-run':
677 continue
678 # check if the process is a command started by the agent or a descendant of one of those commands
679 current = process
680 while current != 0 and current not in agent_commands:
681 current = self._get_parent(current)
682 # Verify if Process started by agent based on the marker found in process environment or process is in Zombie state.
683 # If so, consider it as valid process in agent cgroup.
684 if current == 0 and not (self.__is_process_descendant_of_the_agent(process) or self.__is_zombie_process(process)):
685 unexpected.append(self.__format_process(process))
686 if len(unexpected) >= 5: # collect just a small sample
687 break
688 except Exception as exception:
689 _log_cgroup_warning("Error checking the processes in the agent's cgroup: {0}".format(ustr(exception)))
110690
111 return cgroups691 if len(unexpected) > 0:
692 self._report_agent_cgroups_procs(agent_cgroup_proc_names, unexpected)
693 raise CGroupsException("The agent's cgroup includes unexpected processes: {0}".format(unexpected))
112694
113 self._invoke_cgroup_operation(__impl, "Failed to create a cgroup for the VM Agent; resource usage for the Agent will not be tracked.")695 @staticmethod
696 def _get_command(pid):
697 try:
698 with open('/proc/{0}/comm'.format(pid), "r") as file_:
699 comm = file_.read()
700 if comm and comm[-1] == '\x00': # if null-terminated, remove the null
701 comm = comm[:-1]
702 return comm.rstrip()
703 except Exception:
704 return "UNKNOWN"
114705
115 def cleanup_legacy_cgroups(self):706 @staticmethod
116 def __impl():707 def __format_process(pid):
117 self._cgroups_api.cleanup_legacy_cgroups()708 """
709 Formats the given PID as a string containing the PID and the corresponding command line truncated to 64 chars
710 """
711 try:
712 cmdline = '/proc/{0}/cmdline'.format(pid)
713 if os.path.exists(cmdline):
714 with open(cmdline, "r") as cmdline_file:
715 return "[PID: {0}] {1:64.64}".format(pid, cmdline_file.read())
716 except Exception:
717 pass
718 return "[PID: {0}] UNKNOWN".format(pid)
118719
119 message = 'Failed to process legacy cgroups. Collection of resource usage data will be disabled.'720 @staticmethod
721 def __is_process_descendant_of_the_agent(pid):
722 """
723 Returns True if the process is descendant of the agent by looking at the env flag(AZURE_GUEST_AGENT_PARENT_PROCESS_NAME)
724 that we set when the process starts otherwise False.
725 """
726 try:
727 env = '/proc/{0}/environ'.format(pid)
728 if os.path.exists(env):
729 with open(env, "r") as env_file:
730 environ = env_file.read()
731 if environ and environ[-1] == '\x00':
732 environ = environ[:-1]
733 return "{0}={1}".format(shellutil.PARENT_PROCESS_NAME, shellutil.AZURE_GUEST_AGENT) in environ
734 except Exception:
735 pass
736 return False
120737
121 def disable_cgroups(exception):738 @staticmethod
122 self.disable()739 def __is_zombie_process(pid):
123 add_event(740 """
124 AGENT_NAME,741 Returns True if process is in Zombie state otherwise False.
125 version=CURRENT_VERSION,
126 op=WALAEventOperation.CGroupsCleanUp,
127 is_success=False,
128 log_event=False,
129 message='{0} {1}'.format(message, ustr(exception)))
130742
131 self._invoke_cgroup_operation(__impl, message, on_error=disable_cgroups)743 Ex: cat /proc/18171/stat
744 18171 (python3) S 18103 18103 18103 0 -1 4194624 57736 64902 0 3
745 """
746 try:
747 stat = '/proc/{0}/stat'.format(pid)
748 if os.path.exists(stat):
749 with open(stat, "r") as stat_file:
750 return stat_file.read().split()[2] == 'Z'
751 except Exception:
752 pass
753 return False
132754
133 def create_extension_cgroups_root(self):755 @staticmethod
756 def _check_systemd_run_process(process):
134 """757 """
135 Creates the container (directory/cgroup) that includes the cgroups for all extensions (/sys/fs/cgroup/*/walinuxagent.extensions)758 Returns True if process is shell systemd-run process started by agent otherwise False.
759
760 Ex: sh,7345 -c systemd-run --unit=enable_7c5cab19-eb79-4661-95d9-9e5091bd5ae0 --scope --slice=azure-vmextensions-Microsoft.OSTCExtensions.VMAccessForLinux_1.5.11.slice /var/lib/waagent/Microsoft.OSTCExtensions.VMAccessForLinux-1.5.11/processes.sh
136 """761 """
137 def __impl():762 try:
138 self._cgroups_api.create_extension_cgroups_root()763 process_name = "UNKNOWN"
764 cmdline = '/proc/{0}/cmdline'.format(process)
765 if os.path.exists(cmdline):
766 with open(cmdline, "r") as cmdline_file:
767 process_name = "{0}".format(cmdline_file.read())
768 match = re.search(r'systemd-run.*--unit=.*--scope.*--slice=azure-vmextensions.*', process_name)
769 if match is not None:
770 return True
771 except Exception:
772 pass
773 return False
774
775 @staticmethod
776 def _report_agent_cgroups_procs(agent_cgroup_proc_names, unexpected):
777 for proc_name in unexpected:
778 if 'UNKNOWN' in proc_name:
779 msg = "Agent includes following processes when UNKNOWN process found: {0}".format("\n".join([ustr(proc) for proc in agent_cgroup_proc_names]))
780 add_event(op=WALAEventOperation.CGroupsInfo, message=msg)
139781
140 self._invoke_cgroup_operation(__impl, "Failed to create a root cgroup for extensions; resource usage for extensions will not be tracked.")782 @staticmethod
783 def _check_agent_throttled_time(cgroup_metrics):
784 for metric in cgroup_metrics:
785 if metric.instance == AGENT_NAME_TELEMETRY and metric.counter == MetricsCounter.THROTTLED_TIME:
786 if metric.value > conf.get_agent_cpu_throttled_time_threshold():
787 raise CGroupsException("The agent has been throttled for {0} seconds".format(metric.value))
141788
142 def create_extension_cgroups(self, name):789 def check_agent_memory_usage(self):
790 if self.enabled() and self._agent_memory_cgroup:
791 metrics = self._agent_memory_cgroup.get_tracked_metrics()
792 current_usage = 0
793 for metric in metrics:
794 if metric.counter == MetricsCounter.TOTAL_MEM_USAGE:
795 current_usage += metric.value
796 elif metric.counter == MetricsCounter.SWAP_MEM_USAGE:
797 current_usage += metric.value
798
799 if current_usage > conf.get_agent_memory_quota():
800 raise AgentMemoryExceededException("The agent memory limit {0} bytes exceeded. The current reported usage is {1} bytes.".format(conf.get_agent_memory_quota(), current_usage))
801
802 @staticmethod
803 def _get_parent(pid):
143 """804 """
144 Creates and returns the cgroups for the given extension805 Returns the parent of the given process. If the parent cannot be determined returns 0 (which is the PID for the scheduler)
145 """806 """
146 def __impl():807 try:
147 return self._cgroups_api.create_extension_cgroups(name)808 stat = '/proc/{0}/stat'.format(pid)
809 if os.path.exists(stat):
810 with open(stat, "r") as stat_file:
811 return int(stat_file.read().split()[3])
812 except Exception:
813 pass
814 return 0
148815
149 return self._invoke_cgroup_operation(__impl, "Failed to create a cgroup for extension '{0}'; resource usage will not be tracked.".format(name))816 def start_tracking_unit_cgroups(self, unit_name):
817 """
818 TODO: Start tracking Memory Cgroups
819 """
820 try:
821 cpu_cgroup_path, memory_cgroup_path = self._cgroups_api.get_unit_cgroup_paths(unit_name)
822
823 if cpu_cgroup_path is None:
824 logger.info("The CPU controller is not mounted; will not track resource usage")
825 else:
826 CGroupsTelemetry.track_cgroup(CpuCgroup(unit_name, cpu_cgroup_path))
827
828 if memory_cgroup_path is None:
829 logger.info("The Memory controller is not mounted; will not track resource usage")
830 else:
831 CGroupsTelemetry.track_cgroup(MemoryCgroup(unit_name, memory_cgroup_path))
832
833 except Exception as exception:
834 logger.info("Failed to start tracking resource usage for the extension: {0}", ustr(exception))
150835
151 def remove_extension_cgroups(self, name):836 def stop_tracking_unit_cgroups(self, unit_name):
152 """837 """
153 Deletes the cgroup for the given extension838 TODO: remove Memory cgroups from tracked list.
154 """839 """
155 def __impl():840 try:
156 cgroups = self._cgroups_api.remove_extension_cgroups(name)841 cpu_cgroup_path, memory_cgroup_path = self._cgroups_api.get_unit_cgroup_paths(unit_name)
157 return cgroups842
843 if cpu_cgroup_path is not None:
844 CGroupsTelemetry.stop_tracking(CpuCgroup(unit_name, cpu_cgroup_path))
845
846 if memory_cgroup_path is not None:
847 CGroupsTelemetry.stop_tracking(MemoryCgroup(unit_name, memory_cgroup_path))
158848
159 self._invoke_cgroup_operation(__impl, "Failed to delete cgroups for extension '{0}'.".format(name))849 except Exception as exception:
850 logger.info("Failed to stop tracking resource usage for the extension service: {0}", ustr(exception))
160851
161 def start_extension_command(self, extension_name, command, timeout, shell, cwd, env, stdout, stderr,852 def stop_tracking_extension_cgroups(self, extension_name):
853 """
854 TODO: remove extension Memory cgroups from tracked list
855 """
856 try:
857 extension_slice_name = SystemdCgroupsApi.get_extension_slice_name(extension_name)
858 cgroup_relative_path = os.path.join(_AZURE_VMEXTENSIONS_SLICE,
859 extension_slice_name)
860
861 cpu_cgroup_mountpoint, memory_cgroup_mountpoint = self._cgroups_api.get_cgroup_mount_points()
862 cpu_cgroup_path = os.path.join(cpu_cgroup_mountpoint, cgroup_relative_path)
863 memory_cgroup_path = os.path.join(memory_cgroup_mountpoint, cgroup_relative_path)
864
865 if cpu_cgroup_path is not None:
866 CGroupsTelemetry.stop_tracking(CpuCgroup(extension_name, cpu_cgroup_path))
867
868 if memory_cgroup_path is not None:
869 CGroupsTelemetry.stop_tracking(MemoryCgroup(extension_name, memory_cgroup_path))
870
871 except Exception as exception:
872 logger.info("Failed to stop tracking resource usage for the extension service: {0}", ustr(exception))
873
874 def start_extension_command(self, extension_name, command, cmd_name, timeout, shell, cwd, env, stdout, stderr,
162 error_code=ExtensionErrorCodes.PluginUnknownFailure):875 error_code=ExtensionErrorCodes.PluginUnknownFailure):
163 """876 """
164 Starts a command (install/enable/etc) for an extension and adds the command's PID to the extension's cgroup877 Starts a command (install/enable/etc) for an extension and adds the command's PID to the extension's cgroup
165 :param extension_name: The extension executing the command878 :param extension_name: The extension executing the command
166 :param command: The command to invoke879 :param command: The command to invoke
880 :param cmd_name: The type of the command(enable, install, etc.)
167 :param timeout: Number of seconds to wait for command completion881 :param timeout: Number of seconds to wait for command completion
168 :param cwd: The working directory for the command882 :param cwd: The working directory for the command
169 :param env: The environment to pass to the command's process883 :param env: The environment to pass to the command's process
@@ -172,39 +886,207 @@ class CGroupConfigurator(object):
172 :param stderr: File object to redirect stderr to886 :param stderr: File object to redirect stderr to
173 :param error_code: Extension error code to raise in case of error887 :param error_code: Extension error code to raise in case of error
174 """888 """
175 if not self.enabled():889 if self.enabled():
176 process = subprocess.Popen(command,890 try:
177 shell=shell,891 return self._cgroups_api.start_extension_command(extension_name, command, cmd_name, timeout,
178 cwd=cwd,892 shell=shell, cwd=cwd, env=env, stdout=stdout,
179 env=env,893 stderr=stderr, error_code=error_code)
180 stdout=stdout,894 except SystemdRunError as exception:
181 stderr=stderr,895 reason = 'Failed to start {0} using systemd-run, will try invoking the extension directly. Error: {1}'.format(
182 preexec_fn=os.setsid)896 extension_name, ustr(exception))
183897 self.disable(reason, DisableCgroups.ALL)
184 process_output = handle_process_completion(process=process,898 # fall-through and re-invoke the extension
185 command=command,899
186 timeout=timeout,900 # subprocess-popen-preexec-fn<W1509> Disabled: code is not multi-threaded
187 stdout=stdout,901 process = subprocess.Popen(command, shell=shell, cwd=cwd, env=env, stdout=stdout, stderr=stderr, preexec_fn=os.setsid) # pylint: disable=W1509
188 stderr=stderr,902 return handle_process_completion(process=process, command=command, timeout=timeout, stdout=stdout, stderr=stderr, error_code=error_code)
189 error_code=error_code)903
190 else:904 def __reset_extension_cpu_quota(self, extension_name):
191 extension_cgroups, process_output = self._cgroups_api.start_extension_command(extension_name,905 """
192 command,906 Removes any CPUQuota on the extension
193 timeout,907
194 shell=shell,908 NOTE: This resets the quota on the extension's slice; any local overrides on the VM will take precedence
195 cwd=cwd,909 over this setting.
196 env=env,910 """
197 stdout=stdout,911 if self.enabled():
198 stderr=stderr,912 self.setup_extension_slice(extension_name, cpu_quota=None)
199 error_code=error_code)913
200914 def setup_extension_slice(self, extension_name, cpu_quota):
201 return process_output915 """
202916 Each extension runs under its own slice (Ex "Microsoft.CPlat.Extension.slice"). All the slices for
203 # unique instance for the singleton (TODO: find a better pattern for a singleton)917 extensions are grouped under "azure-vmextensions.slice.
918
919 This method ensures that the extension slice is created. Setup should create
920 under /lib/systemd/system if it is not exist.
921 TODO: set memory quotas
922 """
923 if self.enabled():
924 unit_file_install_path = systemd.get_unit_file_install_path()
925 extension_slice_path = os.path.join(unit_file_install_path,
926 SystemdCgroupsApi.get_extension_slice_name(extension_name))
927 try:
928 cpu_quota = str(cpu_quota) + "%" if cpu_quota is not None else "" # setting an empty value resets to the default (infinity)
929 if cpu_quota == "":
930 _log_cgroup_info("CPUQuota not set for {0}", extension_name)
931 else:
932 _log_cgroup_info("Ensuring the {0}'s CPUQuota is {1}", extension_name, cpu_quota)
933 slice_contents = _EXTENSION_SLICE_CONTENTS.format(extension_name=extension_name,
934 cpu_quota=cpu_quota)
935 CGroupConfigurator._Impl.__create_unit_file(extension_slice_path, slice_contents)
936 except Exception as exception:
937 _log_cgroup_warning("Failed to set the extension {0} slice and quotas: {1}", extension_name,
938 ustr(exception))
939 CGroupConfigurator._Impl.__cleanup_unit_file(extension_slice_path)
940
941 def remove_extension_slice(self, extension_name):
942 """
943 This method ensures that the extension slice gets removed from /lib/systemd/system if it exist
944 Lastly stop the unit. This would ensure the cleanup the /sys/fs/cgroup controller paths
945 """
946 if self.enabled():
947 unit_file_install_path = systemd.get_unit_file_install_path()
948 extension_slice_name = SystemdCgroupsApi.get_extension_slice_name(extension_name)
949 extension_slice_path = os.path.join(unit_file_install_path, extension_slice_name)
950 if os.path.exists(extension_slice_path):
951 self.stop_tracking_extension_cgroups(extension_name)
952 CGroupConfigurator._Impl.__cleanup_unit_file(extension_slice_path)
953
954 def set_extension_services_cpu_memory_quota(self, services_list):
955 """
956 Each extension service will have name, systemd path and it's quotas.
957 This method ensures that drop-in files are created under service.d folder if quotas given.
958 ex: /lib/systemd/system/extension.service.d/11-CPUAccounting.conf
959 TODO: set memory quotas
960 """
961 if self.enabled() and services_list is not None:
962 for service in services_list:
963 service_name = service.get('name', None)
964 unit_file_path = systemd.get_unit_file_install_path()
965 if service_name is not None and unit_file_path is not None:
966 files_to_create = []
967 drop_in_path = os.path.join(unit_file_path, "{0}.d".format(service_name))
968 drop_in_file_cpu_accounting = os.path.join(drop_in_path,
969 _DROP_IN_FILE_CPU_ACCOUNTING)
970 files_to_create.append((drop_in_file_cpu_accounting, _DROP_IN_FILE_CPU_ACCOUNTING_CONTENTS))
971 drop_in_file_memory_accounting = os.path.join(drop_in_path,
972 _DROP_IN_FILE_MEMORY_ACCOUNTING)
973 files_to_create.append(
974 (drop_in_file_memory_accounting, _DROP_IN_FILE_MEMORY_ACCOUNTING_CONTENTS))
975
976 cpu_quota = service.get('cpuQuotaPercentage', None)
977 if cpu_quota is not None:
978 cpu_quota = str(cpu_quota) + "%"
979 _log_cgroup_info("Ensuring the {0}'s CPUQuota is {1}", service_name, cpu_quota)
980 drop_in_file_cpu_quota = os.path.join(drop_in_path, _DROP_IN_FILE_CPU_QUOTA)
981 cpu_quota_contents = _DROP_IN_FILE_CPU_QUOTA_CONTENTS_FORMAT.format(cpu_quota)
982 files_to_create.append((drop_in_file_cpu_quota, cpu_quota_contents))
983
984 self.__create_all_files(files_to_create)
985 self.__reload_systemd_config()
986
987 def __reset_extension_services_cpu_quota(self, services_list):
988 """
989 Removes any CPUQuota on the extension service
990
991 NOTE: This resets the quota on the extension service's default dropin file; any local overrides on the VM will take precedence
992 over this setting.
993 """
994 if self.enabled() and services_list is not None:
995 service_name = None
996 try:
997 for service in services_list:
998 service_name = service.get('name', None)
999 unit_file_path = systemd.get_unit_file_install_path()
1000 if service_name is not None and unit_file_path is not None:
1001 files_to_create = []
1002 drop_in_path = os.path.join(unit_file_path, "{0}.d".format(service_name))
1003 cpu_quota = "" # setting an empty value resets to the default (infinity)
1004 drop_in_file_cpu_quota = os.path.join(drop_in_path, _DROP_IN_FILE_CPU_QUOTA)
1005 cpu_quota_contents = _DROP_IN_FILE_CPU_QUOTA_CONTENTS_FORMAT.format(cpu_quota)
1006 if os.path.exists(drop_in_file_cpu_quota):
1007 with open(drop_in_file_cpu_quota, "r") as file_:
1008 if file_.read() == cpu_quota_contents:
1009 return
1010 files_to_create.append((drop_in_file_cpu_quota, cpu_quota_contents))
1011 self.__create_all_files(files_to_create)
1012 except Exception as exception:
1013 _log_cgroup_warning('Failed to reset CPUQuota for {0} : {1}', service_name, ustr(exception))
1014
1015 def remove_extension_services_drop_in_files(self, services_list):
1016 """
1017 Remove the dropin files from service .d folder for the given service
1018 """
1019 if services_list is not None:
1020 for service in services_list:
1021 service_name = service.get('name', None)
1022 unit_file_path = systemd.get_unit_file_install_path()
1023 if service_name is not None and unit_file_path is not None:
1024 files_to_cleanup = []
1025 drop_in_path = os.path.join(unit_file_path, "{0}.d".format(service_name))
1026 drop_in_file_cpu_accounting = os.path.join(drop_in_path,
1027 _DROP_IN_FILE_CPU_ACCOUNTING)
1028 files_to_cleanup.append(drop_in_file_cpu_accounting)
1029 drop_in_file_memory_accounting = os.path.join(drop_in_path,
1030 _DROP_IN_FILE_MEMORY_ACCOUNTING)
1031 files_to_cleanup.append(drop_in_file_memory_accounting)
1032 cpu_quota = service.get('cpuQuotaPercentage', None)
1033 if cpu_quota is not None:
1034 drop_in_file_cpu_quota = os.path.join(drop_in_path, _DROP_IN_FILE_CPU_QUOTA)
1035 files_to_cleanup.append(drop_in_file_cpu_quota)
1036
1037 CGroupConfigurator._Impl.__cleanup_all_files(files_to_cleanup)
1038 _log_cgroup_info("Drop in files removed for {0}".format(service_name))
1039
1040 def stop_tracking_extension_services_cgroups(self, services_list):
1041 """
1042 Remove the cgroup entry from the tracked groups to stop tracking.
1043 """
1044 if self.enabled() and services_list is not None:
1045 for service in services_list:
1046 service_name = service.get('name', None)
1047 if service_name is not None:
1048 self.stop_tracking_unit_cgroups(service_name)
1049
1050 def start_tracking_extension_services_cgroups(self, services_list):
1051 """
1052 Add the cgroup entry to start tracking the services cgroups.
1053 """
1054 if self.enabled() and services_list is not None:
1055 for service in services_list:
1056 service_name = service.get('name', None)
1057 if service_name is not None:
1058 self.start_tracking_unit_cgroups(service_name)
1059
1060 @staticmethod
1061 def get_extension_services_list():
1062 """
1063 ResourceLimits for extensions are coming from <extName>/HandlerManifest.json file.
1064 Use this pattern to determine all the installed extension HandlerManifest files and
1065 read the extension services if ResourceLimits are present.
1066 """
1067 extensions_services = {}
1068 for manifest_path in glob.iglob(os.path.join(conf.get_lib_dir(), "*/HandlerManifest.json")):
1069 match = re.search("(?P<extname>[\\w+\\.-]+).HandlerManifest\\.json", manifest_path)
1070 if match is not None:
1071 extensions_name = match.group('extname')
1072 if not extensions_name.startswith('WALinuxAgent'):
1073 try:
1074 data = json.loads(fileutil.read_file(manifest_path))
1075 resource_limits = data[0].get('resourceLimits', None)
1076 services = resource_limits.get('services') if resource_limits else None
1077 extensions_services[extensions_name] = services
1078 except (IOError, OSError) as e:
1079 _log_cgroup_warning(
1080 'Failed to load manifest file ({0}): {1}'.format(manifest_path, e.strerror))
1081 except ValueError:
1082 _log_cgroup_warning('Malformed manifest file ({0}).'.format(manifest_path))
1083 return extensions_services
1084
1085 # unique instance for the singleton
204 _instance = None1086 _instance = None
2051087
206 @staticmethod1088 @staticmethod
207 def get_instance():1089 def get_instance():
208 if CGroupConfigurator._instance is None:1090 if CGroupConfigurator._instance is None:
209 CGroupConfigurator._instance = CGroupConfigurator.__impl()1091 CGroupConfigurator._instance = CGroupConfigurator._Impl()
210 return CGroupConfigurator._instance1092 return CGroupConfigurator._instance
diff --git a/azurelinuxagent/common/cgroupstelemetry.py b/azurelinuxagent/common/cgroupstelemetry.py
index 4bbcba1..7b6bba0 100644
--- a/azurelinuxagent/common/cgroupstelemetry.py
+++ b/azurelinuxagent/common/cgroupstelemetry.py
@@ -15,101 +15,26 @@
15# Requires Python 2.6+ and Openssl 1.0+15# Requires Python 2.6+ and Openssl 1.0+
16import errno16import errno
17import threading17import threading
18from collections import namedtuple
19from datetime import datetime as dt
2018
21from azurelinuxagent.common import logger19from azurelinuxagent.common import logger
22from azurelinuxagent.common.cgroup import CpuCgroup, CGroupContollers20from azurelinuxagent.common.cgroup import CpuCgroup
23from azurelinuxagent.common.exception import CGroupsException
24from azurelinuxagent.common.future import ustr21from azurelinuxagent.common.future import ustr
25from azurelinuxagent.common.logger import EVERY_SIX_HOURS
26from azurelinuxagent.common.resourceusage import MemoryResourceUsage, ProcessInfo
27
28MetricValue = namedtuple('Metric', ['category', 'counter', 'instance', 'value'])
29StatmMetricValue = namedtuple('StatmMetricValue', ['pid_name_cmdline', 'resource_metric'])
30
31DELIM = " | "
32DEFAULT_PROCESS_NAME = "NO_PROCESS_FOUND"
33DEFAULT_PROCESS_COMMANDLINE = "NO_CMDLINE_FOUND"
34
35
36class MetricsCategory(object):
37 MEMORY_CATEGORY = "Memory"
38 PROCESS_CATEGORY = "Process"
39
40
41class MetricsCounter(object):
42 PROCESSOR_PERCENT_TIME = "% Processor Time"
43 TOTAL_MEM_USAGE = "Total Memory Usage"
44 MAX_MEM_USAGE = "Max Memory Usage"
45 MEM_USED_BY_PROCESS = "Memory Used by Process"
4622
4723
48class CGroupsTelemetry(object):24class CGroupsTelemetry(object):
49 """25 """
50 """26 """
51 _tracked = []27 _tracked = {}
52 _cgroup_metrics = {}28 _track_throttled_time = False
53 _rlock = threading.RLock()29 _rlock = threading.RLock()
5430
55 @staticmethod31 @staticmethod
56 def get_process_info_summary(process_id):32 def set_track_throttled_time(value):
57 process_cmdline = DEFAULT_PROCESS_COMMANDLINE33 CGroupsTelemetry._track_throttled_time = value
58 process_name = DEFAULT_PROCESS_NAME
59
60 # The ProcessName and ProcessCommandLine can generate Exception if the file /proc/<pid>/{comm,cmdline} cease to
61 # exist; eg: the process can die, or finish. Which is why we need Default Names, in case we fail to fetch the
62 # details from those files.
63 try:
64 process_cmdline = ProcessInfo.get_proc_cmdline(process_id) if not None else DEFAULT_PROCESS_COMMANDLINE
65 except Exception as e:
66 logger.periodic_info(EVERY_SIX_HOURS, "[PERIODIC] {0}", ustr(e))
67
68 try:
69 process_name = ProcessInfo.get_proc_name(process_id) if not None else DEFAULT_PROCESS_NAME
70 except Exception as e:
71 logger.periodic_info(EVERY_SIX_HOURS, "[PERIODIC] {0}", ustr(e))
72
73 return process_id + DELIM + process_name + DELIM + process_cmdline
7434
75 @staticmethod35 @staticmethod
76 def _get_metrics_list(metric):36 def get_track_throttled_time():
77 return [metric.average(), metric.min(), metric.max(), metric.median(), metric.count(),37 return CGroupsTelemetry._track_throttled_time
78 metric.first_poll_time(), metric.last_poll_time()]
79
80 @staticmethod
81 def _process_cgroup_metric(cgroup_metrics):
82 memory_usage = cgroup_metrics.get_memory_metrics()
83 max_memory_usage = cgroup_metrics.get_max_memory_metrics()
84 cpu_usage = cgroup_metrics.get_cpu_metrics()
85 memory_usage_per_process = cgroup_metrics.get_proc_statm_memory_metrics()
86
87 processed_extension = {}
88
89 if cpu_usage.count() > 0:
90 processed_extension["cpu"] = {"cur_cpu": CGroupsTelemetry._get_metrics_list(cpu_usage)}
91
92 if memory_usage.count() > 0:
93 if "memory" in processed_extension:
94 processed_extension["memory"]["cur_mem"] = CGroupsTelemetry._get_metrics_list(memory_usage)
95 else:
96 processed_extension["memory"] = {"cur_mem": CGroupsTelemetry._get_metrics_list(memory_usage)}
97
98 if max_memory_usage.count() > 0:
99 if "memory" in processed_extension:
100 processed_extension["memory"]["max_mem"] = CGroupsTelemetry._get_metrics_list(max_memory_usage)
101 else:
102 processed_extension["memory"] = {"max_mem": CGroupsTelemetry._get_metrics_list(max_memory_usage)}
103
104 for pid_process_memory in memory_usage_per_process:
105 if "proc_statm_memory" in processed_extension:
106 processed_extension["proc_statm_memory"][pid_process_memory.pid_name_cmdline] = \
107 CGroupsTelemetry._get_metrics_list(pid_process_memory.resource_metric)
108 else:
109 processed_extension["proc_statm_memory"] = {pid_process_memory.pid_name_cmdline:
110 CGroupsTelemetry._get_metrics_list(pid_process_memory.resource_metric)}
111
112 return processed_extension
11338
114 @staticmethod39 @staticmethod
115 def track_cgroup(cgroup):40 def track_cgroup(cgroup):
@@ -122,221 +47,56 @@ class CGroupsTelemetry(object):
12247
123 with CGroupsTelemetry._rlock:48 with CGroupsTelemetry._rlock:
124 if not CGroupsTelemetry.is_tracked(cgroup.path):49 if not CGroupsTelemetry.is_tracked(cgroup.path):
125 CGroupsTelemetry._tracked.append(cgroup)50 CGroupsTelemetry._tracked[cgroup.path] = cgroup
126 logger.info("Started tracking new cgroup: {0}, path: {1}".format(cgroup.name, cgroup.path))51 logger.info("Started tracking cgroup {0}", cgroup)
12752
128 @staticmethod53 @staticmethod
129 def is_tracked(path):54 def is_tracked(path):
130 """55 """
131 Returns true if the given item is in the list of tracked items56 Returns true if the given item is in the list of tracked items
132 O(n) operation. But limited to few cgroup objects we have.57 O(1) operation.
133 """58 """
134 with CGroupsTelemetry._rlock:59 with CGroupsTelemetry._rlock:
135 for cgroup in CGroupsTelemetry._tracked:60 if path in CGroupsTelemetry._tracked:
136 if path == cgroup.path:61 return True
137 return True
13862
139 return False63 return False
14064
141 @staticmethod65 @staticmethod
142 def stop_tracking(cgroup):66 def stop_tracking(cgroup):
143 """67 """
144 Stop tracking the cgroups for the given name68 Stop tracking the cgroups for the given path
145 """69 """
146 with CGroupsTelemetry._rlock:70 with CGroupsTelemetry._rlock:
147 CGroupsTelemetry._tracked.remove(cgroup)71 if cgroup.path in CGroupsTelemetry._tracked:
148 logger.info("Stopped tracking cgroup: {0}, path: {1}".format(cgroup.name, cgroup.path))72 CGroupsTelemetry._tracked.pop(cgroup.path)
14973 logger.info("Stopped tracking cgroup {0}", cgroup)
150 @staticmethod
151 def report_all_tracked():
152 """
153 The report_all_tracked's purpose is to collect the data from the tracked cgroups and process the metric into a
154 data structure by _process_cgroup_metric. The perf metric is added into the data structure and returned to the
155 caller.
156
157 The report_all_tracked would be removed soon - in favor of sending report_metric directly, when polling the data
158 from tracked groups.
159
160 :return collected_metrics: dictionary of cgroups metrics.
161 """
162 collected_metrics = {}
163
164 for name, cgroup_metrics in CGroupsTelemetry._cgroup_metrics.items():
165 perf_metric = CGroupsTelemetry._process_cgroup_metric(cgroup_metrics)
166
167 if perf_metric:
168 collected_metrics[name] = perf_metric
169
170 cgroup_metrics.clear()
171
172 # Doing cleanup after the metrics have already been collected.
173 for key in [key for key in CGroupsTelemetry._cgroup_metrics if
174 CGroupsTelemetry._cgroup_metrics[key].marked_for_delete]:
175 del CGroupsTelemetry._cgroup_metrics[key]
176
177 return collected_metrics
17874
179 @staticmethod75 @staticmethod
180 def poll_all_tracked():76 def poll_all_tracked():
181 metrics = []77 metrics = []
18278 inactive_cgroups = []
183 with CGroupsTelemetry._rlock:79 with CGroupsTelemetry._rlock:
184 for cgroup in CGroupsTelemetry._tracked[:]:80 for cgroup in CGroupsTelemetry._tracked.values():
185 if cgroup.name not in CGroupsTelemetry._cgroup_metrics:
186 CGroupsTelemetry._cgroup_metrics[cgroup.name] = CgroupMetrics()
187 try:81 try:
188 if cgroup.controller == CGroupContollers.CPU:82 metrics.extend(cgroup.get_tracked_metrics(track_throttled_time=CGroupsTelemetry._track_throttled_time))
189 current_cpu_usage = cgroup.get_cpu_usage()
190 CGroupsTelemetry._cgroup_metrics[cgroup.name].add_cpu_usage(current_cpu_usage)
191 metrics.append(MetricValue(MetricsCategory.PROCESS_CATEGORY, MetricsCounter.
192 PROCESSOR_PERCENT_TIME, cgroup.name, current_cpu_usage))
193 elif cgroup.controller == CGroupContollers.MEMORY:
194 current_memory_usage = cgroup.get_memory_usage()
195 CGroupsTelemetry._cgroup_metrics[cgroup.name].add_memory_usage(current_memory_usage)
196 metrics.append(MetricValue(MetricsCategory.MEMORY_CATEGORY, MetricsCounter.
197 TOTAL_MEM_USAGE, cgroup.name, current_memory_usage))
198
199 max_memory_usage = cgroup.get_max_memory_usage()
200 CGroupsTelemetry._cgroup_metrics[cgroup.name].add_max_memory_usage(max_memory_usage)
201 metrics.append(MetricValue(MetricsCategory.MEMORY_CATEGORY, MetricsCounter.MAX_MEM_USAGE,
202 cgroup.name, max_memory_usage))
203
204 pids = cgroup.get_tracked_processes()
205 for pid in pids:
206 try:
207 mem_usage_from_procstatm = MemoryResourceUsage.get_memory_usage_from_proc_statm(pid)
208 metrics.append(MetricValue(MetricsCategory.MEMORY_CATEGORY, MetricsCounter.
209 MEM_USED_BY_PROCESS, CGroupsTelemetry.get_process_info_summary(pid),
210 mem_usage_from_procstatm))
211 CGroupsTelemetry._cgroup_metrics[cgroup.name].add_proc_statm_memory(
212 CGroupsTelemetry.get_process_info_summary(pid), mem_usage_from_procstatm)
213 except Exception as e:
214 if not isinstance(e, (IOError, OSError)) or e.errno != errno.ENOENT:
215 logger.periodic_warn(logger.EVERY_HOUR, "[PERIODIC] Could not collect proc_statm "
216 "for pid {0}. Error : {1}", pid, ustr(e))
217 else:
218 raise CGroupsException('CGroup controller {0} is not supported for cgroup {1}'.format(
219 cgroup.controller, cgroup.name))
220 except Exception as e:83 except Exception as e:
221 # There can be scenarios when the CGroup has been deleted by the time we are fetching the values84 # There can be scenarios when the CGroup has been deleted by the time we are fetching the values
222 # from it. This would raise IOError with file entry not found (ERRNO: 2). We do not want to log85 # from it. This would raise IOError with file entry not found (ERRNO: 2). We do not want to log
223 # every occurrences of such case as it would be very verbose. We do want to log all the other86 # every occurrences of such case as it would be very verbose. We do want to log all the other
224 # exceptions which could occur, which is why we do a periodic log for all the other errors.87 # exceptions which could occur, which is why we do a periodic log for all the other errors.
225 if not isinstance(e, (IOError, OSError)) or e.errno != errno.ENOENT:88 if not isinstance(e, (IOError, OSError)) or e.errno != errno.ENOENT: # pylint: disable=E1101
226 logger.periodic_warn(logger.EVERY_HOUR, '[PERIODIC] Could not collect metrics for cgroup '89 logger.periodic_warn(logger.EVERY_HOUR, '[PERIODIC] Could not collect metrics for cgroup '
227 '{0}. Error : {1}'.format(cgroup.name, ustr(e)))90 '{0}. Error : {1}'.format(cgroup.name, ustr(e)))
228 if not cgroup.is_active():91 if not cgroup.is_active():
229 CGroupsTelemetry.stop_tracking(cgroup)92 inactive_cgroups.append(cgroup)
230 CGroupsTelemetry._cgroup_metrics[cgroup.name].marked_for_delete = True93 for inactive_cgroup in inactive_cgroups:
94 CGroupsTelemetry.stop_tracking(inactive_cgroup)
23195
232 return metrics96 return metrics
23397
234 @staticmethod98 @staticmethod
235 def prune_all_tracked():
236 with CGroupsTelemetry._rlock:
237 for cgroup in CGroupsTelemetry._tracked[:]:
238 if not cgroup.is_active():
239 CGroupsTelemetry.stop_tracking(cgroup)
240
241 @staticmethod
242 def reset():99 def reset():
243 with CGroupsTelemetry._rlock:100 with CGroupsTelemetry._rlock:
244 CGroupsTelemetry._tracked *= 0 # emptying the list101 CGroupsTelemetry._tracked.clear() # emptying the dictionary
245 CGroupsTelemetry._cgroup_metrics = {}102 CGroupsTelemetry._track_throttled_time = False
246
247
248class CgroupMetrics(object):
249 def __init__(self):
250 self._memory_usage = Metric()
251 self._max_memory_usage = Metric()
252 self._cpu_usage = Metric()
253 self._proc_statm_mem = {}
254
255 self.marked_for_delete = False
256
257 def add_memory_usage(self, usage):
258 if not self.marked_for_delete:
259 self._memory_usage.append(usage)
260
261 def add_max_memory_usage(self, usage):
262 if not self.marked_for_delete:
263 self._max_memory_usage.append(usage)
264
265 def add_cpu_usage(self, usage):
266 if not self.marked_for_delete:
267 self._cpu_usage.append(usage)
268
269 def add_proc_statm_memory(self, pid, usage):
270 if not self.marked_for_delete:
271 if pid not in self._proc_statm_mem:
272 self._proc_statm_mem[pid] = Metric()
273 self._proc_statm_mem[pid].append(usage)
274
275 def get_memory_metrics(self):
276 return self._memory_usage
277
278 def get_max_memory_metrics(self):
279 return self._max_memory_usage
280
281 def get_cpu_metrics(self):
282 return self._cpu_usage
283
284 def get_proc_statm_memory_metrics(self):
285 """
286 :return: StatmMetricValue tuples of pid and metric
287 """
288 return [StatmMetricValue(pid_name_cmdline, metric) for pid_name_cmdline, metric in self._proc_statm_mem.items()]
289
290 def clear(self):
291 self._memory_usage.clear()
292 self._max_memory_usage.clear()
293 self._cpu_usage.clear()
294 self._proc_statm_mem.clear()
295
296
297class Metric(object):
298 def __init__(self):
299 self._data = []
300 self._first_poll_time = None
301 self._last_poll_time = None
302
303 def append(self, data):
304 if not self._first_poll_time:
305 # We only want to do it first time.
306 self._first_poll_time = dt.utcnow()
307
308 self._data.append(data)
309 self._last_poll_time = dt.utcnow()
310
311 def clear(self):
312 self._first_poll_time = None
313 self._last_poll_time = None
314 self._data *= 0
315
316 def average(self):
317 return float(sum(self._data)) / float(len(self._data)) if self._data else None
318
319 def max(self):
320 return max(self._data) if self._data else None
321
322 def min(self):
323 return min(self._data) if self._data else None
324
325 def median(self):
326 data = sorted(self._data)
327 l_len = len(data)
328 if l_len < 1:
329 return None
330 if l_len % 2 == 0:
331 return (data[int((l_len - 1) / 2)] + data[int((l_len + 1) / 2)]) / 2.0
332 else:
333 return data[int((l_len - 1) / 2)]
334
335 def count(self):
336 return len(self._data)
337
338 def first_poll_time(self):
339 return str(self._first_poll_time)
340
341 def last_poll_time(self):
342 return str(self._last_poll_time)
diff --git a/azurelinuxagent/common/conf.py b/azurelinuxagent/common/conf.py
index bfc61f0..46765ea 100644
--- a/azurelinuxagent/common/conf.py
+++ b/azurelinuxagent/common/conf.py
@@ -19,11 +19,11 @@
1919
20"""20"""
21Module conf loads and parses configuration file21Module conf loads and parses configuration file
22"""22""" # pylint: disable=W0105
23import os23import os
24import os.path24import os.path
2525
26import azurelinuxagent.common.utils.fileutil as fileutil26from azurelinuxagent.common.utils.fileutil import read_file #pylint: disable=R0401
27from azurelinuxagent.common.exception import AgentConfigError27from azurelinuxagent.common.exception import AgentConfigError
2828
29DISABLE_AGENT_FILE = 'disable_agent'29DISABLE_AGENT_FILE = 'disable_agent'
@@ -49,25 +49,43 @@ class ConfigurationProvider(object):
49 value = parts[1].split('#')[0].strip("\" ").strip()49 value = parts[1].split('#')[0].strip("\" ").strip()
50 self.values[key] = value if value != "None" else None50 self.values[key] = value if value != "None" else None
5151
52 def get(self, key, default_val):52 @staticmethod
53 def _get_default(default):
54 if hasattr(default, '__call__'):
55 return default()
56 return default
57
58 def get(self, key, default_value):
59 """
60 Retrieves a string parameter by key and returns its value. If not found returns the default value,
61 or if the default value is a callable returns the result of invoking the callable.
62 """
53 val = self.values.get(key)63 val = self.values.get(key)
54 return val if val is not None else default_val64 return val if val is not None else self._get_default(default_value)
5565
56 def get_switch(self, key, default_val):66 def get_switch(self, key, default_value):
67 """
68 Retrieves a switch parameter by key and returns its value as a boolean. If not found returns the default value,
69 or if the default value is a callable returns the result of invoking the callable.
70 """
57 val = self.values.get(key)71 val = self.values.get(key)
58 if val is not None and val.lower() == 'y':72 if val is not None and val.lower() == 'y':
59 return True73 return True
60 elif val is not None and val.lower() == 'n':74 elif val is not None and val.lower() == 'n':
61 return False75 return False
62 return default_val76 return self._get_default(default_value)
6377
64 def get_int(self, key, default_val):78 def get_int(self, key, default_value):
79 """
80 Retrieves an int parameter by key and returns its value. If not found returns the default value,
81 or if the default value is a callable returns the result of invoking the callable.
82 """
65 try:83 try:
66 return int(self.values.get(key))84 return int(self.values.get(key))
67 except TypeError:85 except TypeError:
68 return default_val86 return self._get_default(default_value)
69 except ValueError:87 except ValueError:
70 return default_val88 return self._get_default(default_value)
7189
7290
73__conf__ = ConfigurationProvider()91__conf__ = ConfigurationProvider()
@@ -81,7 +99,7 @@ def load_conf_from_file(conf_file_path, conf=__conf__):
81 raise AgentConfigError(("Missing configuration in {0}"99 raise AgentConfigError(("Missing configuration in {0}"
82 "").format(conf_file_path))100 "").format(conf_file_path))
83 try:101 try:
84 content = fileutil.read_file(conf_file_path)102 content = read_file(conf_file_path)
85 conf.load(content)103 conf.load(content)
86 except IOError as err:104 except IOError as err:
87 raise AgentConfigError(("Failed to load conf file:{0}, {1}"105 raise AgentConfigError(("Failed to load conf file:{0}, {1}"
@@ -97,6 +115,7 @@ __SWITCH_OPTIONS__ = {
97 "OS.CheckRdmaDriver": False,115 "OS.CheckRdmaDriver": False,
98 "Logs.Verbose": False,116 "Logs.Verbose": False,
99 "Logs.Console": True,117 "Logs.Console": True,
118 "Logs.Collect": True,
100 "Extensions.Enabled": True,119 "Extensions.Enabled": True,
101 "Provisioning.AllowResetSysUser": False,120 "Provisioning.AllowResetSysUser": False,
102 "Provisioning.RegenerateSshHostKeyPair": False,121 "Provisioning.RegenerateSshHostKeyPair": False,
@@ -110,7 +129,16 @@ __SWITCH_OPTIONS__ = {
110 "ResourceDisk.EnableSwapEncryption": False,129 "ResourceDisk.EnableSwapEncryption": False,
111 "AutoUpdate.Enabled": True,130 "AutoUpdate.Enabled": True,
112 "EnableOverProvisioning": True,131 "EnableOverProvisioning": True,
113 "CGroups.EnforceLimits": False,132 #
133 # "Debug" options are experimental and may be removed in later
134 # versions of the Agent.
135 #
136 "Debug.CgroupLogMetrics": False,
137 "Debug.CgroupDisableOnProcessCheckFailure": True,
138 "Debug.CgroupDisableOnQuotaCheckFailure": True,
139 "Debug.EnableAgentMemoryUsageCheck": False,
140 "Debug.EnableFastTrack": True,
141 "Debug.EnableGAVersioning": False
114}142}
115143
116144
@@ -133,16 +161,37 @@ __STRING_OPTIONS__ = {
133 "ResourceDisk.MountOptions": None,161 "ResourceDisk.MountOptions": None,
134 "ResourceDisk.Filesystem": "ext3",162 "ResourceDisk.Filesystem": "ext3",
135 "AutoUpdate.GAFamily": "Prod",163 "AutoUpdate.GAFamily": "Prod",
136 "CGroups.Excluded": "customscript,runcommand",164 "Debug.CgroupMonitorExpiryTime": "2022-03-31",
165 "Debug.CgroupMonitorExtensionName": "Microsoft.Azure.Monitor.AzureMonitorLinuxAgent",
137}166}
138167
139168
140__INTEGER_OPTIONS__ = {169__INTEGER_OPTIONS__ = {
170 "Extensions.GoalStatePeriod": 6,
171 "Extensions.InitialGoalStatePeriod": 6,
172 "OS.EnableFirewallPeriod": 300,
173 "OS.RemovePersistentNetRulesPeriod": 30,
174 "OS.RootDeviceScsiTimeoutPeriod": 30,
175 "OS.MonitorDhcpClientRestartPeriod": 30,
141 "OS.SshClientAliveInterval": 180,176 "OS.SshClientAliveInterval": 180,
177 "Provisioning.MonitorHostNamePeriod": 30,
142 "Provisioning.PasswordCryptSaltLength": 10,178 "Provisioning.PasswordCryptSaltLength": 10,
143 "HttpProxy.Port": None,179 "HttpProxy.Port": None,
144 "ResourceDisk.SwapSizeMB": 0,180 "ResourceDisk.SwapSizeMB": 0,
145 "Autoupdate.Frequency": 3600181 "Autoupdate.Frequency": 3600,
182 "Logs.CollectPeriod": 3600,
183 #
184 # "Debug" options are experimental and may be removed in later
185 # versions of the Agent.
186 #
187 "Debug.CgroupCheckPeriod": 300,
188 "Debug.AgentCpuQuota": 50,
189 "Debug.AgentCpuThrottledTimeThreshold": 120,
190 "Debug.AgentMemoryQuota": 30 * 1024 ** 2,
191 "Debug.EtpCollectionPeriod": 300,
192 "Debug.AutoUpdateHotfixFrequency": 14400,
193 "Debug.AutoUpdateNormalFrequency": 86400,
194 "Debug.FirewallRulesLogPeriod": 86400
146}195}
147196
148197
@@ -160,10 +209,40 @@ def get_configuration(conf=__conf__):
160 return options209 return options
161210
162211
212def get_default_value(option):
213 if option in __STRING_OPTIONS__:
214 return __STRING_OPTIONS__[option]
215 raise ValueError("{0} is not a valid configuration parameter.".format(option))
216
217
218def get_int_default_value(option):
219 if option in __INTEGER_OPTIONS__:
220 return int(__INTEGER_OPTIONS__[option])
221 raise ValueError("{0} is not a valid configuration parameter.".format(option))
222
223
224def get_switch_default_value(option):
225 if option in __SWITCH_OPTIONS__:
226 return __SWITCH_OPTIONS__[option]
227 raise ValueError("{0} is not a valid configuration parameter.".format(option))
228
229
163def enable_firewall(conf=__conf__):230def enable_firewall(conf=__conf__):
164 return conf.get_switch("OS.EnableFirewall", False)231 return conf.get_switch("OS.EnableFirewall", False)
165232
166233
234def get_enable_firewall_period(conf=__conf__):
235 return conf.get_int("OS.EnableFirewallPeriod", 300)
236
237
238def get_remove_persistent_net_rules_period(conf=__conf__):
239 return conf.get_int("OS.RemovePersistentNetRulesPeriod", 30)
240
241
242def get_monitor_dhcp_client_restart_period(conf=__conf__):
243 return conf.get_int("OS.MonitorDhcpClientRestartPeriod", 30)
244
245
167def enable_rdma(conf=__conf__):246def enable_rdma(conf=__conf__):
168 return conf.get_switch("OS.EnableRDMA", False) or \247 return conf.get_switch("OS.EnableRDMA", False) or \
169 conf.get_switch("OS.UpdateRdmaDriver", False) or \248 conf.get_switch("OS.UpdateRdmaDriver", False) or \
@@ -186,11 +265,20 @@ def get_logs_console(conf=__conf__):
186 return conf.get_switch("Logs.Console", True)265 return conf.get_switch("Logs.Console", True)
187266
188267
268def get_collect_logs(conf=__conf__):
269 return conf.get_switch("Logs.Collect", True)
270
271
272def get_collect_logs_period(conf=__conf__):
273 return conf.get_int("Logs.CollectPeriod", 3600)
274
275
189def get_lib_dir(conf=__conf__):276def get_lib_dir(conf=__conf__):
190 return conf.get("Lib.Dir", "/var/lib/waagent")277 return conf.get("Lib.Dir", "/var/lib/waagent")
191278
192279
193def get_published_hostname(conf=__conf__):280def get_published_hostname(conf=__conf__):
281 # Some applications rely on this file; do not remove this setting
194 return os.path.join(get_lib_dir(conf), 'published_hostname')282 return os.path.join(get_lib_dir(conf), 'published_hostname')
195283
196284
@@ -206,6 +294,10 @@ def get_ext_log_dir(conf=__conf__):
206 return conf.get("Extension.LogDir", "/var/log/azure")294 return conf.get("Extension.LogDir", "/var/log/azure")
207295
208296
297def get_agent_log_file():
298 return "/var/log/waagent.log"
299
300
209def get_fips_enabled(conf=__conf__):301def get_fips_enabled(conf=__conf__):
210 return conf.get_switch("OS.EnableFIPS", False)302 return conf.get_switch("OS.EnableFIPS", False)
211303
@@ -244,18 +336,22 @@ def get_ssh_key_glob(conf=__conf__):
244336
245def get_ssh_key_private_path(conf=__conf__):337def get_ssh_key_private_path(conf=__conf__):
246 return os.path.join(get_ssh_dir(conf),338 return os.path.join(get_ssh_dir(conf),
247 'ssh_host_{0}_key'.format(get_ssh_host_keypair_type(conf)))339 'ssh_host_{0}_key'.format(get_ssh_host_keypair_type(conf)))
248340
249341
250def get_ssh_key_public_path(conf=__conf__):342def get_ssh_key_public_path(conf=__conf__):
251 return os.path.join(get_ssh_dir(conf),343 return os.path.join(get_ssh_dir(conf),
252 'ssh_host_{0}_key.pub'.format(get_ssh_host_keypair_type(conf)))344 'ssh_host_{0}_key.pub'.format(get_ssh_host_keypair_type(conf)))
253345
254346
255def get_root_device_scsi_timeout(conf=__conf__):347def get_root_device_scsi_timeout(conf=__conf__):
256 return conf.get("OS.RootDeviceScsiTimeout", None)348 return conf.get("OS.RootDeviceScsiTimeout", None)
257349
258350
351def get_root_device_scsi_timeout_period(conf=__conf__):
352 return conf.get_int("OS.RootDeviceScsiTimeoutPeriod", 30)
353
354
259def get_ssh_host_keypair_type(conf=__conf__):355def get_ssh_host_keypair_type(conf=__conf__):
260 keypair_type = conf.get("Provisioning.SshHostKeyPairType", "rsa")356 keypair_type = conf.get("Provisioning.SshHostKeyPairType", "rsa")
261 if keypair_type == "auto":357 if keypair_type == "auto":
@@ -275,6 +371,14 @@ def get_extensions_enabled(conf=__conf__):
275 return conf.get_switch("Extensions.Enabled", True)371 return conf.get_switch("Extensions.Enabled", True)
276372
277373
374def get_goal_state_period(conf=__conf__):
375 return conf.get_int("Extensions.GoalStatePeriod", 6)
376
377
378def get_initial_goal_state_period(conf=__conf__):
379 return conf.get_int("Extensions.InitialGoalStatePeriod", default_value=lambda: get_goal_state_period(conf=conf))
380
381
278def get_allow_reset_sys_user(conf=__conf__):382def get_allow_reset_sys_user(conf=__conf__):
279 return conf.get_switch("Provisioning.AllowResetSysUser", False)383 return conf.get_switch("Provisioning.AllowResetSysUser", False)
280384
@@ -322,6 +426,10 @@ def get_monitor_hostname(conf=__conf__):
322 return conf.get_switch("Provisioning.MonitorHostName", False)426 return conf.get_switch("Provisioning.MonitorHostName", False)
323427
324428
429def get_monitor_hostname_period(conf=__conf__):
430 return conf.get_int("Provisioning.MonitorHostNamePeriod", 30)
431
432
325def get_httpproxy_host(conf=__conf__):433def get_httpproxy_host(conf=__conf__):
326 return conf.get("HttpProxy.Host", None)434 return conf.get("HttpProxy.Host", None)
327435
@@ -340,10 +448,12 @@ def get_resourcedisk_format(conf=__conf__):
340448
341def get_resourcedisk_enable_swap(conf=__conf__):449def get_resourcedisk_enable_swap(conf=__conf__):
342 return conf.get_switch("ResourceDisk.EnableSwap", False)450 return conf.get_switch("ResourceDisk.EnableSwap", False)
343 451
452
344def get_resourcedisk_enable_swap_encryption(conf=__conf__):453def get_resourcedisk_enable_swap_encryption(conf=__conf__):
345 return conf.get_switch("ResourceDisk.EnableSwapEncryption", False)454 return conf.get_switch("ResourceDisk.EnableSwapEncryption", False)
346455
456
347def get_resourcedisk_mountpoint(conf=__conf__):457def get_resourcedisk_mountpoint(conf=__conf__):
348 return conf.get("ResourceDisk.MountPoint", "/mnt/resource")458 return conf.get("ResourceDisk.MountPoint", "/mnt/resource")
349459
@@ -384,10 +494,151 @@ def get_disable_agent_file_path(conf=__conf__):
384 return os.path.join(get_lib_dir(conf), DISABLE_AGENT_FILE)494 return os.path.join(get_lib_dir(conf), DISABLE_AGENT_FILE)
385495
386496
387def get_cgroups_enforce_limits(conf=__conf__):497def get_cgroups_enabled(conf=__conf__):
388 return conf.get_switch("CGroups.EnforceLimits", False)498 return conf.get_switch("CGroups.Enabled", True)
499
500
501def get_monitor_network_configuration_changes(conf=__conf__):
502 return conf.get_switch("Monitor.NetworkConfigurationChanges", False)
503
504
505def get_cgroup_check_period(conf=__conf__):
506 """
507 How often to perform checks on cgroups (are the processes in the cgroups as expected,
508 has the agent exceeded its quota, etc)
509
510 NOTE: This option is experimental and may be removed in later versions of the Agent.
511 """
512 return conf.get_int("Debug.CgroupCheckPeriod", 300)
513
389514
515def get_cgroup_log_metrics(conf=__conf__):
516 """
517 If True, resource usage metrics are written to the local log
518
519 NOTE: This option is experimental and may be removed in later versions of the Agent.
520 """
521 return conf.get_switch("Debug.CgroupLogMetrics", False)
522
523
524def get_cgroup_disable_on_process_check_failure(conf=__conf__):
525 """
526 If True, cgroups will be disabled if the process check fails
527
528 NOTE: This option is experimental and may be removed in later versions of the Agent.
529 """
530 return conf.get_switch("Debug.CgroupDisableOnProcessCheckFailure", True)
531
532
533def get_cgroup_disable_on_quota_check_failure(conf=__conf__):
534 """
535 If True, cgroups will be disabled if the CPU quota check fails
536
537 NOTE: This option is experimental and may be removed in later versions of the Agent.
538 """
539 return conf.get_switch("Debug.CgroupDisableOnQuotaCheckFailure", True)
540
541
542def get_agent_cpu_quota(conf=__conf__):
543 """
544 CPU quota for the agent as a percentage of 1 CPU (100% == 1 CPU)
390545
391def get_cgroups_excluded(conf=__conf__):546 NOTE: This option is experimental and may be removed in later versions of the Agent.
392 excluded_value = conf.get("CGroups.Excluded", "customscript, runcommand")547 """
393 return [s for s in [i.strip().lower() for i in excluded_value.split(',')] if len(s) > 0] if excluded_value else []548 return conf.get_int("Debug.AgentCpuQuota", 50)
549
550
551def get_agent_cpu_throttled_time_threshold(conf=__conf__):
552 """
553 Throttled time threshold for agent cpu in seconds.
554
555 NOTE: This option is experimental and may be removed in later versions of the Agent.
556 """
557 return conf.get_int("Debug.AgentCpuThrottledTimeThreshold", 120)
558
559
560def get_agent_memory_quota(conf=__conf__):
561 """
562 Memory quota for the agent in bytes.
563
564 NOTE: This option is experimental and may be removed in later versions of the Agent.
565 """
566 return conf.get_int("Debug.AgentMemoryQuota", 30 * 1024 ** 2)
567
568
569def get_enable_agent_memory_usage_check(conf=__conf__):
570 """
571 If True, Agent checks it's Memory usage.
572
573 NOTE: This option is experimental and may be removed in later versions of the Agent.
574 """
575 return conf.get_switch("Debug.EnableAgentMemoryUsageCheck", False)
576
577
578def get_cgroup_monitor_expiry_time(conf=__conf__):
579 """
580 cgroups monitoring for pilot extensions disabled after expiry time
581
582 NOTE: This option is experimental and may be removed in later versions of the Agent.
583 """
584 return conf.get("Debug.CgroupMonitorExpiryTime", "2022-03-31")
585
586
587def get_cgroup_monitor_extension_name (conf=__conf__):
588 """
589 cgroups monitoring extension name
590
591 NOTE: This option is experimental and may be removed in later versions of the Agent.
592 """
593 return conf.get("Debug.CgroupMonitorExtensionName", "Microsoft.Azure.Monitor.AzureMonitorLinuxAgent")
594
595
596def get_enable_fast_track(conf=__conf__):
597 """
598 If True, the agent use FastTrack when retrieving goal states
599
600 NOTE: This option is experimental and may be removed in later versions of the Agent.
601 """
602 return conf.get_switch("Debug.EnableFastTrack", True)
603
604
605def get_etp_collection_period(conf=__conf__):
606 """
607 Determines the frequency to perform ETP collection on extensions telemetry events.
608 NOTE: This option is experimental and may be removed in later versions of the Agent.
609 """
610 return conf.get_int("Debug.EtpCollectionPeriod", 300)
611
612
613def get_hotfix_upgrade_frequency(conf=__conf__):
614 """
615 Determines the frequency to check for Hotfix upgrades (<Patch>.<Build> version changed in new upgrades).
616 NOTE: This option is experimental and may be removed in later versions of the Agent.
617 """
618 return conf.get_int("Debug.AutoUpdateHotfixFrequency", 4 * 60 * 60)
619
620
621def get_normal_upgrade_frequency(conf=__conf__):
622 """
623 Determines the frequency to check for Normal upgrades (<Major>.<Minor> version changed in new upgrades).
624 NOTE: This option is experimental and may be removed in later versions of the Agent.
625 """
626 return conf.get_int("Debug.AutoUpdateNormalFrequency", 24 * 60 * 60)
627
628
629def get_enable_ga_versioning(conf=__conf__):
630 """
631 If True, the agent uses GA Versioning for auto-updating the agent vs automatically auto-updating to the highest version.
632
633 NOTE: This option is experimental and may be removed in later versions of the Agent.
634 """
635 return conf.get_switch("Debug.EnableGAVersioning", False)
636
637
638def get_firewall_rules_log_period(conf=__conf__):
639 """
640 Determine the frequency to perform the periodic operation of logging firewall rules.
641
642 NOTE: This option is experimental and may be removed in later versions of the Agent.
643 """
644 return conf.get_int("Debug.FirewallRulesLogPeriod", 86400)
diff --git a/azurelinuxagent/common/datacontract.py b/azurelinuxagent/common/datacontract.py
index c69bebc..b6d1f3c 100644
--- a/azurelinuxagent/common/datacontract.py
+++ b/azurelinuxagent/common/datacontract.py
@@ -20,9 +20,11 @@
20from azurelinuxagent.common.exception import ProtocolError20from azurelinuxagent.common.exception import ProtocolError
21import azurelinuxagent.common.logger as logger21import azurelinuxagent.common.logger as logger
2222
23# pylint: disable=W0105
23"""24"""
24Base class for data contracts between guest and host and utilities to manipulate the properties in those contracts25Base class for data contracts between guest and host and utilities to manipulate the properties in those contracts
25"""26"""
27# pylint: enable=W0105
2628
2729
28class DataContract(object):30class DataContract(object):
@@ -30,7 +32,7 @@ class DataContract(object):
3032
3133
32class DataContractList(list):34class DataContractList(list):
33 def __init__(self, item_cls):35 def __init__(self, item_cls): # pylint: disable=W0231
34 self.item_cls = item_cls36 self.item_cls = item_cls
3537
3638
diff --git a/azurelinuxagent/common/dhcp.py b/azurelinuxagent/common/dhcp.py
index 5974965..3db58d3 100644
--- a/azurelinuxagent/common/dhcp.py
+++ b/azurelinuxagent/common/dhcp.py
@@ -14,23 +14,23 @@
14#14#
15# Requires Python 2.6+ and Openssl 1.0+15# Requires Python 2.6+ and Openssl 1.0+
1616
17import array
17import os18import os
18import socket19import socket
19import array
20import time20import time
21
21import azurelinuxagent.common.logger as logger22import azurelinuxagent.common.logger as logger
23from azurelinuxagent.common.exception import DhcpError
24from azurelinuxagent.common.osutil import get_osutil
25from azurelinuxagent.common.utils.restutil import KNOWN_WIRESERVER_IP
22from azurelinuxagent.common.utils.textutil import hex_dump, hex_dump2, \26from azurelinuxagent.common.utils.textutil import hex_dump, hex_dump2, \
23 hex_dump3, \27 hex_dump3, \
24 compare_bytes, str_to_ord, \28 compare_bytes, str_to_ord, \
25 unpack_big_endian, \29 unpack_big_endian, \
26 int_to_ip4_addr30 int_to_ip4_addr
27from azurelinuxagent.common.exception import DhcpError
28from azurelinuxagent.common.osutil import get_osutil
29
3031
31# the kernel routing table representation of 168.63.129.1632# the kernel routing table representation of 168.63.129.16
32KNOWN_WIRESERVER_IP_ENTRY = '10813FA8'33KNOWN_WIRESERVER_IP_ENTRY = '10813FA8'
33from azurelinuxagent.common.utils.restutil import KNOWN_WIRESERVER_IP
3434
3535
36def get_dhcp_handler():36def get_dhcp_handler():
@@ -86,7 +86,7 @@ class DhcpHandler(object):
86 logger.info("Test for route to {0}".format(KNOWN_WIRESERVER_IP))86 logger.info("Test for route to {0}".format(KNOWN_WIRESERVER_IP))
87 try:87 try:
88 route_table = self.osutil.read_route_table()88 route_table = self.osutil.read_route_table()
89 if any([(KNOWN_WIRESERVER_IP_ENTRY in route) for route in route_table]):89 if any((KNOWN_WIRESERVER_IP_ENTRY in route) for route in route_table):
90 # reset self.gateway and self.routes90 # reset self.gateway and self.routes
91 # we do not need to alter the routing table91 # we do not need to alter the routing table
92 self.endpoint = KNOWN_WIRESERVER_IP92 self.endpoint = KNOWN_WIRESERVER_IP
@@ -100,7 +100,7 @@ class DhcpHandler(object):
100 logger.error(100 logger.error(
101 "Could not determine whether route exists to {0}: {1}".format(101 "Could not determine whether route exists to {0}: {1}".format(
102 KNOWN_WIRESERVER_IP, e))102 KNOWN_WIRESERVER_IP, e))
103 103
104 return route_exists104 return route_exists
105105
106 @property106 @property
@@ -116,7 +116,7 @@ class DhcpHandler(object):
116 exists = False116 exists = False
117117
118 logger.info("Checking for dhcp lease cache")118 logger.info("Checking for dhcp lease cache")
119 cached_endpoint = self.osutil.get_dhcp_lease_endpoint()119 cached_endpoint = self.osutil.get_dhcp_lease_endpoint() # pylint: disable=E1128
120 if cached_endpoint is not None:120 if cached_endpoint is not None:
121 self.endpoint = cached_endpoint121 self.endpoint = cached_endpoint
122 exists = True122 exists = True
@@ -157,11 +157,14 @@ class DhcpHandler(object):
157 self.endpoint = KNOWN_WIRESERVER_IP157 self.endpoint = KNOWN_WIRESERVER_IP
158 return158 return
159159
160 # pylint: disable=W0105
160 """161 """
161 Build dhcp request with mac addr162 Build dhcp request with mac addr
162 Configure route to allow dhcp traffic163 Configure route to allow dhcp traffic
163 Stop dhcp service if necessary164 Stop dhcp service if necessary
164 """165 """
166 # pylint: enable=W0105
167
165 logger.info("Send dhcp request")168 logger.info("Send dhcp request")
166 mac_addr = self.osutil.get_mac_addr()169 mac_addr = self.osutil.get_mac_addr()
167170
@@ -194,7 +197,7 @@ class DhcpHandler(object):
194 self.endpoint, self.gateway, self.routes = parse_dhcp_resp(resp)197 self.endpoint, self.gateway, self.routes = parse_dhcp_resp(resp)
195198
196199
197def validate_dhcp_resp(request, response):200def validate_dhcp_resp(request, response): # pylint: disable=R1710
198 bytes_recv = len(response)201 bytes_recv = len(response)
199 if bytes_recv < 0xF6:202 if bytes_recv < 0xF6:
200 logger.error("HandleDhcpResponse: Too few bytes received:{0}",203 logger.error("HandleDhcpResponse: Too few bytes received:{0}",
@@ -228,7 +231,7 @@ def validate_dhcp_resp(request, response):
228 "doesn't match the request")231 "doesn't match the request")
229232
230233
231def parse_route(response, option, i, length, bytes_recv):234def parse_route(response, option, i, length, bytes_recv): # pylint: disable=W0613
232 # http://msdn.microsoft.com/en-us/library/cc227282%28PROT.10%29.aspx235 # http://msdn.microsoft.com/en-us/library/cc227282%28PROT.10%29.aspx
233 logger.verbose("Routes at offset: {0} with length:{1}", hex(i),236 logger.verbose("Routes at offset: {0} with length:{1}", hex(i),
234 hex(length))237 hex(length))
@@ -386,7 +389,7 @@ def build_dhcp_request(mac_addr, request_broadcast):
386 # set broadcast flag to true to request the dhcp server389 # set broadcast flag to true to request the dhcp server
387 # to respond to a boradcast address,390 # to respond to a boradcast address,
388 # this is useful when user dhclient fails.391 # this is useful when user dhclient fails.
389 request[0x0A] = 0x80;392 request[0x0A] = 0x80
390393
391 # fill in ClientHardwareAddress394 # fill in ClientHardwareAddress
392 for a in range(0, 6):395 for a in range(0, 6):
diff --git a/azurelinuxagent/common/event.py b/azurelinuxagent/common/event.py
index 5274b80..1f903a9 100644
--- a/azurelinuxagent/common/event.py
+++ b/azurelinuxagent/common/event.py
@@ -18,30 +18,35 @@
18import atexit18import atexit
19import json19import json
20import os20import os
21import platform
21import re22import re
22import sys23import sys
23import threading24import threading
24import time25import time
25import traceback26import traceback
26from collections import namedtuple
27from datetime import datetime27from datetime import datetime
2828
29import azurelinuxagent.common.conf as conf29import azurelinuxagent.common.conf as conf
30import azurelinuxagent.common.logger as logger30import azurelinuxagent.common.logger as logger
31from azurelinuxagent.common.exception import EventError31from azurelinuxagent.common.AgentGlobals import AgentGlobals
32from azurelinuxagent.common.future import ustr, OrderedDict32from azurelinuxagent.common.exception import EventError, OSUtilError
33from azurelinuxagent.common.datacontract import get_properties, DataContractList33from azurelinuxagent.common.future import ustr
34from azurelinuxagent.common.telemetryevent import TelemetryEventParam, TelemetryEvent34from azurelinuxagent.common.datacontract import get_properties, set_properties
35from azurelinuxagent.common.osutil import get_osutil
36from azurelinuxagent.common.telemetryevent import TelemetryEventParam, TelemetryEvent, CommonTelemetryEventSchema, \
37 GuestAgentGenericLogsSchema, GuestAgentExtensionEventsSchema, GuestAgentPerfCounterEventsSchema
35from azurelinuxagent.common.utils import fileutil, textutil38from azurelinuxagent.common.utils import fileutil, textutil
36from azurelinuxagent.common.version import CURRENT_VERSION, CURRENT_AGENT39from azurelinuxagent.common.utils.textutil import parse_doc, findall, find, getattrib, str_to_encoded_ustr
40from azurelinuxagent.common.version import CURRENT_VERSION, CURRENT_AGENT, AGENT_NAME, DISTRO_NAME, DISTRO_VERSION, DISTRO_CODE_NAME, AGENT_EXECUTION_MODE
41from azurelinuxagent.common.protocol.imds import get_imds_client
42
43EVENTS_DIRECTORY = "events"
3744
38_EVENT_MSG = "Event: name={0}, op={1}, message={2}, duration={3}"45_EVENT_MSG = "Event: name={0}, op={1}, message={2}, duration={3}"
39TELEMETRY_EVENT_PROVIDER_ID = "69B669B9-4AF8-4C50-BDC4-6006FA76E975"46TELEMETRY_EVENT_PROVIDER_ID = "69B669B9-4AF8-4C50-BDC4-6006FA76E975"
47TELEMETRY_EVENT_EVENT_ID = 1
40TELEMETRY_METRICS_EVENT_ID = 448TELEMETRY_METRICS_EVENT_ID = 4
4149
42# Store the last retrieved container id as an environment variable to be shared between threads for telemetry purposes
43CONTAINER_ID_ENV_VARIABLE = "AZURE_GUEST_AGENT_CONTAINER_ID"
44
45TELEMETRY_LOG_PROVIDER_ID = "FFF0196F-EE4C-4EAF-9AA5-776F622DEB4F"50TELEMETRY_LOG_PROVIDER_ID = "FFF0196F-EE4C-4EAF-9AA5-776F622DEB4F"
46TELEMETRY_LOG_EVENT_ID = 751TELEMETRY_LOG_EVENT_ID = 7
4752
@@ -53,33 +58,39 @@ SEND_LOGS_TO_TELEMETRY = False
5358
54MAX_NUMBER_OF_EVENTS = 100059MAX_NUMBER_OF_EVENTS = 1000
5560
61AGENT_EVENT_FILE_EXTENSION = '.waagent.tld'
62EVENT_FILE_REGEX = re.compile(r'(?P<agent_event>\.waagent)?\.tld$')
5663
57def send_logs_to_telemetry():64def send_logs_to_telemetry():
58 return SEND_LOGS_TO_TELEMETRY65 return SEND_LOGS_TO_TELEMETRY
5966
6067
61def get_container_id_from_env():
62 return os.environ.get(CONTAINER_ID_ENV_VARIABLE, "UNINITIALIZED")
63
64
65class WALAEventOperation:68class WALAEventOperation:
66 ActivateResourceDisk = "ActivateResourceDisk"69 ActivateResourceDisk = "ActivateResourceDisk"
67 AgentBlacklisted = "AgentBlacklisted"70 AgentBlacklisted = "AgentBlacklisted"
68 AgentEnabled = "AgentEnabled"71 AgentEnabled = "AgentEnabled"
72 AgentMemory = "AgentMemory"
73 AgentUpgrade = "AgentUpgrade"
69 ArtifactsProfileBlob = "ArtifactsProfileBlob"74 ArtifactsProfileBlob = "ArtifactsProfileBlob"
70 AutoUpdate = "AutoUpdate"
71 CustomData = "CustomData"
72 CGroupsCleanUp = "CGroupsCleanUp"75 CGroupsCleanUp = "CGroupsCleanUp"
73 CGroupsLimitsCrossed = "CGroupsLimitsCrossed"76 CGroupsDisabled = "CGroupsDisabled"
74 ExtensionMetricsData = "ExtensionMetricsData"77 CGroupsInfo = "CGroupsInfo"
78 CollectEventErrors = "CollectEventErrors"
79 CollectEventUnicodeErrors = "CollectEventUnicodeErrors"
80 ConfigurationChange = "ConfigurationChange"
81 CustomData = "CustomData"
82 DefaultChannelChange = "DefaultChannelChange"
75 Deploy = "Deploy"83 Deploy = "Deploy"
76 Disable = "Disable"84 Disable = "Disable"
77 Downgrade = "Downgrade"85 Downgrade = "Downgrade"
78 Download = "Download"86 Download = "Download"
79 Enable = "Enable"87 Enable = "Enable"
80 ExtensionProcessing = "ExtensionProcessing"88 ExtensionProcessing = "ExtensionProcessing"
89 ExtensionTelemetryEventProcessing = "ExtensionTelemetryEventProcessing"
90 FetchGoalState = "FetchGoalState"
81 Firewall = "Firewall"91 Firewall = "Firewall"
82 GetArtifactExtended = "GetArtifactExtended"92 GoalState = "GoalState"
93 GoalStateUnsupportedFeatures = "GoalStateUnsupportedFeatures"
83 HealthCheck = "HealthCheck"94 HealthCheck = "HealthCheck"
84 HealthObservation = "HealthObservation"95 HealthObservation = "HealthObservation"
85 HeartBeat = "HeartBeat"96 HeartBeat = "HeartBeat"
@@ -87,29 +98,35 @@ class WALAEventOperation:
87 HostPluginHeartbeat = "HostPluginHeartbeat"98 HostPluginHeartbeat = "HostPluginHeartbeat"
88 HostPluginHeartbeatExtended = "HostPluginHeartbeatExtended"99 HostPluginHeartbeatExtended = "HostPluginHeartbeatExtended"
89 HttpErrors = "HttpErrors"100 HttpErrors = "HttpErrors"
101 HttpGet = "HttpGet"
90 ImdsHeartbeat = "ImdsHeartbeat"102 ImdsHeartbeat = "ImdsHeartbeat"
91 Install = "Install"103 Install = "Install"
92 InitializeCGroups = "InitializeCGroups"
93 InitializeHostPlugin = "InitializeHostPlugin"104 InitializeHostPlugin = "InitializeHostPlugin"
94 InvokeCommandUsingSystemd = "InvokeCommandUsingSystemd"
95 Log = "Log"105 Log = "Log"
106 LogCollection = "LogCollection"
96 OSInfo = "OSInfo"107 OSInfo = "OSInfo"
97 Partition = "Partition"108 Partition = "Partition"
98 ProcessGoalState = "ProcessGoalState"109 PersistFirewallRules = "PersistFirewallRules"
110 PluginSettingsVersionMismatch = "PluginSettingsVersionMismatch"
111 InvalidExtensionConfig = "InvalidExtensionConfig"
99 Provision = "Provision"112 Provision = "Provision"
100 ProvisionGuestAgent = "ProvisionGuestAgent"113 ProvisionGuestAgent = "ProvisionGuestAgent"
101 RemoteAccessHandling = "RemoteAccessHandling"114 RemoteAccessHandling = "RemoteAccessHandling"
115 ReportEventErrors = "ReportEventErrors"
116 ReportEventUnicodeErrors = "ReportEventUnicodeErrors"
102 ReportStatus = "ReportStatus"117 ReportStatus = "ReportStatus"
103 ReportStatusExtended = "ReportStatusExtended"118 ReportStatusExtended = "ReportStatusExtended"
104 Restart = "Restart"119 Restart = "Restart"
105 SequenceNumberMismatch = "SequenceNumberMismatch"120 SequenceNumberMismatch = "SequenceNumberMismatch"
106 SetCGroupsLimits = "SetCGroupsLimits"121 SetCGroupsLimits = "SetCGroupsLimits"
107 SkipUpdate = "SkipUpdate"122 SkipUpdate = "SkipUpdate"
123 StatusProcessing = "StatusProcessing"
108 UnhandledError = "UnhandledError"124 UnhandledError = "UnhandledError"
109 UnInstall = "UnInstall"125 UnInstall = "UnInstall"
110 Unknown = "Unknown"126 Unknown = "Unknown"
111 Upgrade = "Upgrade"
112 Update = "Update"127 Update = "Update"
128 VmSettings = "VmSettings"
129 VmSettingsSummary = "VmSettingsSummary"
113130
114131
115SHOULD_ENCODE_MESSAGE_LEN = 80132SHOULD_ENCODE_MESSAGE_LEN = 80
@@ -173,11 +190,57 @@ class EventStatus(object):
173190
174__event_status__ = EventStatus()191__event_status__ = EventStatus()
175__event_status_operations__ = [192__event_status_operations__ = [
176 WALAEventOperation.AutoUpdate,
177 WALAEventOperation.ReportStatus193 WALAEventOperation.ReportStatus
178 ]194 ]
179195
180196
197def parse_json_event(data_str):
198 data = json.loads(data_str)
199 event = TelemetryEvent()
200 set_properties("TelemetryEvent", event, data)
201 event.file_type = "json"
202 return event
203
204
205def parse_event(data_str):
206 try:
207 try:
208 return parse_json_event(data_str)
209 except ValueError:
210 return parse_xml_event(data_str)
211 except Exception as e:
212 raise EventError("Error parsing event: {0}".format(ustr(e)))
213
214
215def parse_xml_param(param_node):
216 name = getattrib(param_node, "Name")
217 value_str = getattrib(param_node, "Value")
218 attr_type = getattrib(param_node, "T")
219 value = value_str
220 if attr_type == 'mt:uint64':
221 value = int(value_str)
222 elif attr_type == 'mt:bool':
223 value = bool(value_str)
224 elif attr_type == 'mt:float64':
225 value = float(value_str)
226 return TelemetryEventParam(name, value)
227
228
229def parse_xml_event(data_str):
230 try:
231 xml_doc = parse_doc(data_str)
232 event_id = getattrib(find(xml_doc, "Event"), 'id')
233 provider_id = getattrib(find(xml_doc, "Provider"), 'id')
234 event = TelemetryEvent(event_id, provider_id)
235 param_nodes = findall(xml_doc, 'Param')
236 for param_node in param_nodes:
237 event.parameters.append(parse_xml_param(param_node))
238 event.file_type = "xml"
239 return event
240 except Exception as e:
241 raise ValueError(ustr(e))
242
243
181def _encode_message(op, message):244def _encode_message(op, message):
182 """245 """
183 Gzip and base64 encode a message based on the operation.246 Gzip and base64 encode a message based on the operation.
@@ -214,20 +277,158 @@ def _encode_message(op, message):
214277
215278
216def _log_event(name, op, message, duration, is_success=True):279def _log_event(name, op, message, duration, is_success=True):
217 global _EVENT_MSG280 global _EVENT_MSG # pylint: disable=W0603
218281
219 message = _encode_message(op, message)
220 if not is_success:282 if not is_success:
221 logger.error(_EVENT_MSG, name, op, message, duration)283 logger.error(_EVENT_MSG, name, op, message, duration)
222 else:284 else:
223 logger.info(_EVENT_MSG, name, op, message, duration)285 logger.info(_EVENT_MSG, name, op, message, duration)
224286
225287
288class CollectOrReportEventDebugInfo(object):
289 """
290 This class is used for capturing and reporting debug info that is captured during event collection and
291 reporting to wireserver.
292 It captures the count of unicode errors and any unexpected errors and also a subset of errors with stacks to help
293 with debugging any potential issues.
294 """
295 __MAX_ERRORS_TO_REPORT = 5
296 OP_REPORT = "Report"
297 OP_COLLECT = "Collect"
298
299 def __init__(self, operation=OP_REPORT):
300 self.__unicode_error_count = 0
301 self.__unicode_errors = set()
302 self.__op_error_count = 0
303 self.__op_errors = set()
304
305 if operation == self.OP_REPORT:
306 self.__unicode_error_event = WALAEventOperation.ReportEventUnicodeErrors
307 self.__op_errors_event = WALAEventOperation.ReportEventErrors
308 elif operation == self.OP_COLLECT:
309 self.__unicode_error_event = WALAEventOperation.CollectEventUnicodeErrors
310 self.__op_errors_event = WALAEventOperation.CollectEventErrors
311
312 def report_debug_info(self):
313
314 def report_dropped_events_error(count, errors, operation_name):
315 err_msg_format = "DroppedEventsCount: {0}\nReasons (first {1} errors): {2}"
316 if count > 0:
317 add_event(op=operation_name,
318 message=err_msg_format.format(count, CollectOrReportEventDebugInfo.__MAX_ERRORS_TO_REPORT, ', '.join(errors)),
319 is_success=False)
320
321 report_dropped_events_error(self.__op_error_count, self.__op_errors, self.__op_errors_event)
322 report_dropped_events_error(self.__unicode_error_count, self.__unicode_errors, self.__unicode_error_event)
323
324 @staticmethod
325 def _update_errors_and_get_count(error_count, errors, error):
326 error_count += 1
327 if len(errors) < CollectOrReportEventDebugInfo.__MAX_ERRORS_TO_REPORT:
328 errors.add("{0}: {1}".format(ustr(error), traceback.format_exc()))
329 return error_count
330
331 def update_unicode_error(self, unicode_err):
332 self.__unicode_error_count = self._update_errors_and_get_count(self.__unicode_error_count, self.__unicode_errors,
333 unicode_err)
334
335 def update_op_error(self, op_err):
336 self.__op_error_count = self._update_errors_and_get_count(self.__op_error_count, self.__op_errors, op_err)
337
338
226class EventLogger(object):339class EventLogger(object):
227 def __init__(self):340 def __init__(self):
228 self.event_dir = None341 self.event_dir = None
229 self.periodic_events = {}342 self.periodic_events = {}
230343
344 #
345 # All events should have these parameters.
346 #
347 # The first set comes from the current OS and is initialized here. These values don't change during
348 # the agent's lifetime.
349 #
350 # The next two sets come from the goal state and IMDS and must be explicitly initialized using
351 # initialize_vminfo_common_parameters() once a protocol for communication with the host has been
352 # created. Their values don't change during the agent's lifetime. Note that we initialize these
353 # parameters here using dummy values (*_UNINITIALIZED) since events sent to the host should always
354 # match the schema defined for them in the telemetry pipeline.
355 #
356 # There is another set of common parameters that must be computed at the time the event is created
357 # (e.g. the timestamp and the container ID); those are added to events (along with the parameters
358 # below) in _add_common_event_parameters()
359 #
360 # Note that different kinds of events may also include other parameters; those are added by the
361 # corresponding add_* method (e.g. add_metric for performance metrics).
362 #
363 self._common_parameters = []
364
365 # Parameters from OS
366 osutil = get_osutil()
367 self._common_parameters.append(TelemetryEventParam(CommonTelemetryEventSchema.OSVersion, EventLogger._get_os_version()))
368 self._common_parameters.append(TelemetryEventParam(CommonTelemetryEventSchema.ExecutionMode, AGENT_EXECUTION_MODE))
369 self._common_parameters.append(TelemetryEventParam(CommonTelemetryEventSchema.RAM, int(EventLogger._get_ram(osutil))))
370 self._common_parameters.append(TelemetryEventParam(CommonTelemetryEventSchema.Processors, int(EventLogger._get_processors(osutil))))
371
372 # Parameters from goal state
373 self._common_parameters.append(TelemetryEventParam(CommonTelemetryEventSchema.TenantName, "TenantName_UNINITIALIZED"))
374 self._common_parameters.append(TelemetryEventParam(CommonTelemetryEventSchema.RoleName, "RoleName_UNINITIALIZED"))
375 self._common_parameters.append(TelemetryEventParam(CommonTelemetryEventSchema.RoleInstanceName, "RoleInstanceName_UNINITIALIZED"))
376 #
377 # # Parameters from IMDS
378 self._common_parameters.append(TelemetryEventParam(CommonTelemetryEventSchema.Location, "Location_UNINITIALIZED"))
379 self._common_parameters.append(TelemetryEventParam(CommonTelemetryEventSchema.SubscriptionId, "SubscriptionId_UNINITIALIZED"))
380 self._common_parameters.append(TelemetryEventParam(CommonTelemetryEventSchema.ResourceGroupName, "ResourceGroupName_UNINITIALIZED"))
381 self._common_parameters.append(TelemetryEventParam(CommonTelemetryEventSchema.VMId, "VMId_UNINITIALIZED"))
382 self._common_parameters.append(TelemetryEventParam(CommonTelemetryEventSchema.ImageOrigin, 0))
383
384 @staticmethod
385 def _get_os_version():
386 return "{0}:{1}-{2}-{3}:{4}".format(platform.system(), DISTRO_NAME, DISTRO_VERSION, DISTRO_CODE_NAME, platform.release())
387
388 @staticmethod
389 def _get_ram(osutil):
390 try:
391 return osutil.get_total_mem()
392 except OSUtilError as e:
393 logger.warn("Failed to get RAM info; will be missing from telemetry: {0}", ustr(e))
394 return 0
395
396 @staticmethod
397 def _get_processors(osutil):
398 try:
399 return osutil.get_processor_cores()
400 except OSUtilError as e:
401 logger.warn("Failed to get Processors info; will be missing from telemetry: {0}", ustr(e))
402 return 0
403
404 def initialize_vminfo_common_parameters(self, protocol):
405 """
406 Initializes the common parameters that come from the goal state and IMDS
407 """
408 # create an index of the event parameters for faster updates
409 parameters = {}
410 for p in self._common_parameters:
411 parameters[p.name] = p
412
413 try:
414 vminfo = protocol.get_vminfo()
415 parameters[CommonTelemetryEventSchema.TenantName].value = vminfo.tenantName
416 parameters[CommonTelemetryEventSchema.RoleName].value = vminfo.roleName
417 parameters[CommonTelemetryEventSchema.RoleInstanceName].value = vminfo.roleInstanceName
418 except Exception as e:
419 logger.warn("Failed to get VM info from goal state; will be missing from telemetry: {0}", ustr(e))
420
421 try:
422 imds_client = get_imds_client(protocol.get_endpoint())
423 imds_info = imds_client.get_compute()
424 parameters[CommonTelemetryEventSchema.Location].value = imds_info.location
425 parameters[CommonTelemetryEventSchema.SubscriptionId].value = imds_info.subscriptionId
426 parameters[CommonTelemetryEventSchema.ResourceGroupName].value = imds_info.resourceGroupName
427 parameters[CommonTelemetryEventSchema.VMId].value = imds_info.vmId
428 parameters[CommonTelemetryEventSchema.ImageOrigin].value = int(imds_info.image_origin)
429 except Exception as e:
430 logger.warn("Failed to get IMDS info; will be missing from telemetry: {0}", ustr(e))
431
231 def save_event(self, data):432 def save_event(self, data):
232 if self.event_dir is None:433 if self.event_dir is None:
233 logger.warn("Cannot save event -- Event reporter is not initialized.")434 logger.warn("Cannot save event -- Event reporter is not initialized.")
@@ -258,7 +459,7 @@ class EventLogger(object):
258 try:459 try:
259 with open(filename + ".tmp", 'wb+') as hfile:460 with open(filename + ".tmp", 'wb+') as hfile:
260 hfile.write(data.encode("utf-8"))461 hfile.write(data.encode("utf-8"))
261 os.rename(filename + ".tmp", filename + ".tld")462 os.rename(filename + ".tmp", filename + AGENT_EVENT_FILE_EXTENSION)
262 except (IOError, OSError) as e:463 except (IOError, OSError) as e:
263 msg = "Failed to write events to file: {0}".format(e)464 msg = "Failed to write events to file: {0}".format(e)
264 raise EventError(msg)465 raise EventError(msg)
@@ -271,38 +472,31 @@ class EventLogger(object):
271 (self.periodic_events[h] + delta) <= datetime.now()472 (self.periodic_events[h] + delta) <= datetime.now()
272473
273 def add_periodic(self, delta, name, op=WALAEventOperation.Unknown, is_success=True, duration=0,474 def add_periodic(self, delta, name, op=WALAEventOperation.Unknown, is_success=True, duration=0,
274 version=str(CURRENT_VERSION), message="", evt_type="", is_internal=False, log_event=True,475 version=str(CURRENT_VERSION), message="", log_event=True, force=False):
275 force=False):
276 h = hash(name + op + ustr(is_success) + message)476 h = hash(name + op + ustr(is_success) + message)
277477
278 if force or self.is_period_elapsed(delta, h):478 if force or self.is_period_elapsed(delta, h):
279 self.add_event(name, op=op, is_success=is_success, duration=duration,479 self.add_event(name, op=op, is_success=is_success, duration=duration,
280 version=version, message=message, evt_type=evt_type,480 version=version, message=message, log_event=log_event)
281 is_internal=is_internal, log_event=log_event)
282 self.periodic_events[h] = datetime.now()481 self.periodic_events[h] = datetime.now()
283482
284 def add_event(self, name, op=WALAEventOperation.Unknown, is_success=True, duration=0, version=str(CURRENT_VERSION),483 def add_event(self, name, op=WALAEventOperation.Unknown, is_success=True, duration=0, version=str(CURRENT_VERSION),
285 message="", evt_type="", is_internal=False, log_event=True):484 message="", log_event=True):
286485
287 if (not is_success) and log_event:486 if (not is_success) and log_event:
288 _log_event(name, op, message, duration, is_success=is_success)487 _log_event(name, op, message, duration, is_success=is_success)
289488
290 self._add_event(duration, evt_type, is_internal, is_success, message, name, op, version, event_id=1)489 event = TelemetryEvent(TELEMETRY_EVENT_EVENT_ID, TELEMETRY_EVENT_PROVIDER_ID)
291490 event.parameters.append(TelemetryEventParam(GuestAgentExtensionEventsSchema.Name, str_to_encoded_ustr(name)))
292 def _add_event(self, duration, evt_type, is_internal, is_success, message, name, op, version, event_id):491 event.parameters.append(TelemetryEventParam(GuestAgentExtensionEventsSchema.Version, str_to_encoded_ustr(version)))
293 event = TelemetryEvent(event_id, TELEMETRY_EVENT_PROVIDER_ID)492 event.parameters.append(TelemetryEventParam(GuestAgentExtensionEventsSchema.Operation, str_to_encoded_ustr(op)))
294493 event.parameters.append(TelemetryEventParam(GuestAgentExtensionEventsSchema.OperationSuccess, bool(is_success)))
295 event.parameters.append(TelemetryEventParam('Name', str(name)))494 event.parameters.append(TelemetryEventParam(GuestAgentExtensionEventsSchema.Message, str_to_encoded_ustr(message)))
296 event.parameters.append(TelemetryEventParam('Version', str(version)))495 event.parameters.append(TelemetryEventParam(GuestAgentExtensionEventsSchema.Duration, int(duration)))
297 event.parameters.append(TelemetryEventParam('IsInternal', bool(is_internal)))496 self.add_common_event_parameters(event, datetime.utcnow())
298 event.parameters.append(TelemetryEventParam('Operation', str(op)))
299 event.parameters.append(TelemetryEventParam('OperationSuccess', bool(is_success)))
300 event.parameters.append(TelemetryEventParam('Message', str(message)))
301 event.parameters.append(TelemetryEventParam('Duration', int(duration)))
302 event.parameters.append(TelemetryEventParam('ExtensionType', str(evt_type)))
303497
304 event.parameters = self.add_default_parameters_to_event(event.parameters)
305 data = get_properties(event)498 data = get_properties(event)
499
306 try:500 try:
307 self.save_event(json.dumps(data))501 self.save_event(json.dumps(data))
308 except EventError as e:502 except EventError as e:
@@ -310,13 +504,13 @@ class EventLogger(object):
310504
311 def add_log_event(self, level, message):505 def add_log_event(self, level, message):
312 event = TelemetryEvent(TELEMETRY_LOG_EVENT_ID, TELEMETRY_LOG_PROVIDER_ID)506 event = TelemetryEvent(TELEMETRY_LOG_EVENT_ID, TELEMETRY_LOG_PROVIDER_ID)
313 event.parameters.append(TelemetryEventParam('EventName', WALAEventOperation.Log))507 event.parameters.append(TelemetryEventParam(GuestAgentGenericLogsSchema.EventName, WALAEventOperation.Log))
314 event.parameters.append(TelemetryEventParam('CapabilityUsed', logger.LogLevel.STRINGS[level]))508 event.parameters.append(TelemetryEventParam(GuestAgentGenericLogsSchema.CapabilityUsed, logger.LogLevel.STRINGS[level]))
315 event.parameters.append(TelemetryEventParam('Context1', self._clean_up_message(message)))509 event.parameters.append(TelemetryEventParam(GuestAgentGenericLogsSchema.Context1, str_to_encoded_ustr(self._clean_up_message(message))))
316 event.parameters.append(TelemetryEventParam('Context2', ''))510 event.parameters.append(TelemetryEventParam(GuestAgentGenericLogsSchema.Context2, datetime.utcnow().strftime(logger.Logger.LogTimeFormatInUTC)))
317 event.parameters.append(TelemetryEventParam('Context3', ''))511 event.parameters.append(TelemetryEventParam(GuestAgentGenericLogsSchema.Context3, ''))
512 self.add_common_event_parameters(event, datetime.utcnow())
318513
319 event.parameters = self.add_default_parameters_to_event(event.parameters)
320 data = get_properties(event)514 data = get_properties(event)
321 try:515 try:
322 self.save_event(json.dumps(data))516 self.save_event(json.dumps(data))
@@ -334,17 +528,16 @@ class EventLogger(object):
334 :param bool log_event: If true, log the collected metric in the agent log528 :param bool log_event: If true, log the collected metric in the agent log
335 """529 """
336 if log_event:530 if log_event:
337 from azurelinuxagent.common.version import AGENT_NAME
338 message = "Metric {0}/{1} [{2}] = {3}".format(category, counter, instance, value)531 message = "Metric {0}/{1} [{2}] = {3}".format(category, counter, instance, value)
339 _log_event(AGENT_NAME, "METRIC", message, 0)532 _log_event(AGENT_NAME, "METRIC", message, 0)
340533
341 event = TelemetryEvent(TELEMETRY_METRICS_EVENT_ID, TELEMETRY_EVENT_PROVIDER_ID)534 event = TelemetryEvent(TELEMETRY_METRICS_EVENT_ID, TELEMETRY_EVENT_PROVIDER_ID)
342 event.parameters.append(TelemetryEventParam('Category', str(category)))535 event.parameters.append(TelemetryEventParam(GuestAgentPerfCounterEventsSchema.Category, str_to_encoded_ustr(category)))
343 event.parameters.append(TelemetryEventParam('Counter', str(counter)))536 event.parameters.append(TelemetryEventParam(GuestAgentPerfCounterEventsSchema.Counter, str_to_encoded_ustr(counter)))
344 event.parameters.append(TelemetryEventParam('Instance', str(instance)))537 event.parameters.append(TelemetryEventParam(GuestAgentPerfCounterEventsSchema.Instance, str_to_encoded_ustr(instance)))
345 event.parameters.append(TelemetryEventParam('Value', float(value)))538 event.parameters.append(TelemetryEventParam(GuestAgentPerfCounterEventsSchema.Value, float(value)))
539 self.add_common_event_parameters(event, datetime.utcnow())
346540
347 event.parameters = self.add_default_parameters_to_event(event.parameters)
348 data = get_properties(event)541 data = get_properties(event)
349 try:542 try:
350 self.save_event(json.dumps(data))543 self.save_event(json.dumps(data))
@@ -392,56 +585,34 @@ class EventLogger(object):
392 else:585 else:
393 return message586 return message
394587
395 @staticmethod588 def add_common_event_parameters(self, event, event_timestamp):
396 def add_default_parameters_to_event(event_parameters, set_values_for_agent=True):
397 """589 """
398 Default fields are only populated by Agent and not the extension. Agent will fill up any event if they don't590 This method is called for all events and ensures all telemetry fields are added before the event is sent out.
399 have the default params. Example: GAVersion and ContainerId are populated for agent events on the fly,591 Note that the event timestamp is saved in the OpcodeName field.
400 but not for extension events. Add it if it's missing.
401
402 We write the GAVersion here rather than add it in azurelinuxagent.ga.monitor.MonitorHandler.add_sysinfo
403 as there could be a possibility of events being sent with newer version of the agent, rather than the agent
404 version generating the event.
405 # Old behavior example: V1 writes the event on the disk and finds an update immediately, and updates. Now the
406 new monitor thread would pick up the events from the disk and send it with the CURRENT_AGENT, which would have
407 newer version of the agent. This causes confusion.
408
409 ContainerId can change due to live migration and we want to preserve the container Id of the container writing
410 the event, rather than sending the event.
411 OpcodeName - This is used as the actual time of event generation.
412
413 :param event_parameters: List of parameters of the event.
414 :param set_values_for_agent: Need default values populated or not. Extensions need only GAVersion and
415 ContainerId to be populated and others should be
416 :return: Event with default parameters populated (either values for agent or extension)
417 """592 """
418 DefaultParameter = namedtuple('DefaultParameter', ['name', 'value'])
419 default_parameters = [DefaultParameter("GAVersion", CURRENT_AGENT),
420 DefaultParameter('ContainerId', get_container_id_from_env()),
421 DefaultParameter('OpcodeName', datetime.utcnow().__str__() if set_values_for_agent else ""),
422 DefaultParameter('EventTid', threading.current_thread().ident if set_values_for_agent else 0),
423 DefaultParameter('EventPid', os.getpid() if set_values_for_agent else 0),
424 DefaultParameter("TaskName", threading.current_thread().getName() if set_values_for_agent else ""),
425 DefaultParameter("KeywordName", '')]
426
The diff has been truncated for viewing.

Subscribers

People subscribed via source and target branches