Merge ~calvinmwadime/ubuntu/+source/walinuxagent:mantic-merge-2.9.1.1 into ubuntu/+source/walinuxagent:ubuntu/mantic-devel
- Git
- lp:~calvinmwadime/ubuntu/+source/walinuxagent
- mantic-merge-2.9.1.1
- Merge into ubuntu/mantic-devel
Status: | Superseded | ||||
---|---|---|---|---|---|
Proposed branch: | ~calvinmwadime/ubuntu/+source/walinuxagent:mantic-merge-2.9.1.1 | ||||
Merge into: | ubuntu/+source/walinuxagent:ubuntu/mantic-devel | ||||
Diff against target: |
74801 lines (+43588/-14724) 404 files modified
.github/PULL_REQUEST_TEMPLATE.md (+1/-2) .github/codecov.yml (+2/-0) .github/workflows/ci_pr.yml (+128/-0) .gitignore (+1/-2) CODEOWNERS (+3/-1) README.md (+110/-33) SECURITY.md (+41/-0) azurelinuxagent/agent.py (+203/-66) azurelinuxagent/common/AgentGlobals.py (+39/-0) azurelinuxagent/common/agent_supported_feature.py (+122/-0) azurelinuxagent/common/cgroup.py (+208/-77) azurelinuxagent/common/cgroupapi.py (+214/-429) azurelinuxagent/common/cgroupconfigurator.py (+1000/-118) azurelinuxagent/common/cgroupstelemetry.py (+25/-265) azurelinuxagent/common/conf.py (+272/-21) azurelinuxagent/common/datacontract.py (+4/-2) azurelinuxagent/common/dhcp.py (+14/-11) azurelinuxagent/common/event.py (+286/-112) azurelinuxagent/common/exception.py (+72/-18) azurelinuxagent/common/future.py (+91/-15) azurelinuxagent/common/interfaces.py (+49/-0) azurelinuxagent/common/logcollector.py (+401/-0) azurelinuxagent/common/logcollector_manifests.py (+122/-0) azurelinuxagent/common/logger.py (+35/-7) azurelinuxagent/common/osutil/alpine.py (+2/-2) azurelinuxagent/common/osutil/arch.py (+9/-2) azurelinuxagent/common/osutil/bigip.py (+31/-30) azurelinuxagent/common/osutil/clearlinux.py (+28/-16) azurelinuxagent/common/osutil/coreos.py (+9/-3) azurelinuxagent/common/osutil/debian.py (+13/-13) azurelinuxagent/common/osutil/default.py (+441/-315) azurelinuxagent/common/osutil/devuan.py (+52/-0) azurelinuxagent/common/osutil/factory.py (+60/-33) azurelinuxagent/common/osutil/fedora.py (+77/-0) azurelinuxagent/common/osutil/freebsd.py (+43/-33) azurelinuxagent/common/osutil/gaia.py (+29/-17) azurelinuxagent/common/osutil/iosxe.py (+19/-7) azurelinuxagent/common/osutil/mariner.py (+69/-0) azurelinuxagent/common/osutil/nsbsd.py (+28/-26) azurelinuxagent/common/osutil/openbsd.py (+16/-19) azurelinuxagent/common/osutil/openwrt.py (+14/-13) azurelinuxagent/common/osutil/photonos.py (+65/-0) azurelinuxagent/common/osutil/redhat.py (+45/-16) azurelinuxagent/common/osutil/suse.py (+93/-36) 
azurelinuxagent/common/osutil/systemd.py (+86/-0) azurelinuxagent/common/osutil/ubuntu.py (+32/-16) azurelinuxagent/common/persist_firewall_rules.py (+338/-0) azurelinuxagent/common/protocol/__init__.py (+0/-5) azurelinuxagent/common/protocol/extensions_goal_state.py (+244/-0) azurelinuxagent/common/protocol/extensions_goal_state_factory.py (+36/-0) azurelinuxagent/common/protocol/extensions_goal_state_from_extensions_config.py (+571/-0) azurelinuxagent/common/protocol/extensions_goal_state_from_vm_settings.py (+583/-0) azurelinuxagent/common/protocol/goal_state.py (+705/-0) azurelinuxagent/common/protocol/hostplugin.py (+371/-38) azurelinuxagent/common/protocol/imds.py (+30/-14) azurelinuxagent/common/protocol/metadata_server_migration_util.py (+79/-0) azurelinuxagent/common/protocol/ovfenv.py (+7/-6) azurelinuxagent/common/protocol/restapi.py (+165/-91) azurelinuxagent/common/protocol/util.py (+109/-159) azurelinuxagent/common/protocol/wire.py (+516/-1095) azurelinuxagent/common/rdma.py (+199/-56) azurelinuxagent/common/singletonperthread.py (+30/-0) azurelinuxagent/common/telemetryevent.py (+72/-3) azurelinuxagent/common/utils/archive.py (+204/-111) azurelinuxagent/common/utils/cryptutil.py (+28/-25) azurelinuxagent/common/utils/extensionprocessutil.py (+31/-7) azurelinuxagent/common/utils/fileutil.py (+9/-7) azurelinuxagent/common/utils/flexible_version.py (+29/-9) azurelinuxagent/common/utils/networkutil.py (+172/-3) azurelinuxagent/common/utils/restutil.py (+153/-56) azurelinuxagent/common/utils/shellutil.py (+260/-57) azurelinuxagent/common/utils/textutil.py (+63/-10) azurelinuxagent/common/utils/timeutil.py (+39/-0) azurelinuxagent/common/version.py (+102/-31) azurelinuxagent/daemon/main.py (+36/-22) azurelinuxagent/daemon/resourcedisk/default.py (+13/-12) azurelinuxagent/daemon/resourcedisk/factory.py (+3/-7) azurelinuxagent/daemon/resourcedisk/freebsd.py (+1/-1) azurelinuxagent/daemon/resourcedisk/openwrt.py (+2/-2) azurelinuxagent/daemon/scvmm.py (+3/-3) 
azurelinuxagent/ga/collect_logs.py (+353/-0) azurelinuxagent/ga/collect_telemetry_events.py (+586/-0) azurelinuxagent/ga/env.py (+177/-123) azurelinuxagent/ga/exthandlers.py (+1493/-645) azurelinuxagent/ga/monitor.py (+249/-450) azurelinuxagent/ga/periodic_operation.py (+81/-0) azurelinuxagent/ga/remoteaccess.py (+81/-90) azurelinuxagent/ga/send_telemetry_events.py (+164/-0) azurelinuxagent/ga/update.py (+1089/-380) azurelinuxagent/pa/deprovision/arch.py (+1/-1) azurelinuxagent/pa/deprovision/clearlinux.py (+4/-2) azurelinuxagent/pa/deprovision/coreos.py (+1/-1) azurelinuxagent/pa/deprovision/default.py (+55/-17) azurelinuxagent/pa/deprovision/factory.py (+5/-8) azurelinuxagent/pa/deprovision/ubuntu.py (+2/-2) azurelinuxagent/pa/provision/cloudinit.py (+33/-91) azurelinuxagent/pa/provision/cloudinitdetect.py (+72/-0) azurelinuxagent/pa/provision/default.py (+28/-42) azurelinuxagent/pa/provision/factory.py (+3/-3) azurelinuxagent/pa/rdma/centos.py (+6/-6) azurelinuxagent/pa/rdma/factory.py (+9/-7) azurelinuxagent/pa/rdma/suse.py (+12/-3) azurelinuxagent/pa/rdma/ubuntu.py (+14/-14) bin/py3/waagent (+53/-0) bin/waagent (+5/-1) bin/waagent2.0 (+5/-1) ci/2.7.pylintrc (+42/-0) ci/3.6.pylintrc (+40/-0) ci/nosetests.sh (+25/-0) config/66-azure-storage.rules (+23/-17) config/alpine/waagent.conf (+4/-11) config/arch/waagent.conf (+4/-6) config/bigip/waagent.conf (+3/-10) config/clearlinux/waagent.conf (+3/-5) config/coreos/waagent.conf (+4/-11) config/debian/waagent.conf (+10/-11) config/devuan/waagent.conf (+130/-0) config/freebsd/waagent.conf (+6/-13) config/gaia/waagent.conf (+4/-6) config/iosxe/waagent.conf (+4/-6) config/mariner/waagent.conf (+88/-0) config/nsbsd/waagent.conf (+4/-6) config/openbsd/waagent.conf (+4/-6) config/photonos/waagent.conf (+80/-0) config/suse/waagent.conf (+12/-8) config/ubuntu/waagent.conf (+10/-11) config/waagent.conf (+26/-10) debian/changelog (+21/-0) debian/control (+0/-1) debian/docs (+0/-1) debian/install (+1/-1) 
debian/patches/add_manpage.patch (+471/-0) debian/patches/disable_udev_overrides.patch (+17/-3) debian/patches/fix_cgroup_v2_mounting_and_systemd_process.patch (+164/-0) debian/patches/fix_systemd_networkd_lease_file_path (+83/-0) debian/patches/series (+4/-2) debian/patches/sru_v2_9_1_1.patch (+233/-0) debian/rules (+3/-8) debian/walinuxagent.manpages (+1/-0) debian/watch (+2/-2) dev/null (+0/-29) init/azure-vmextensions.slice (+7/-0) init/azure.slice (+4/-0) init/devuan/default/walinuxagent (+2/-0) init/devuan/walinuxagent (+344/-0) init/mariner/waagent.service (+16/-0) init/photonos/waagent.service (+16/-0) init/redhat/py2/waagent.service (+19/-0) init/redhat/waagent.service (+19/-0) init/sles/waagent.service (+16/-0) init/ubuntu/walinuxagent.service (+3/-0) makepkg.py (+66/-51) setup.py (+143/-60) test-requirements.txt (+19/-3) tests/common/dhcp/test_dhcp.py (+27/-14) tests/common/mock_cgroup_environment.py (+122/-0) tests/common/mock_command.py (+17/-0) tests/common/mock_environment.py (+168/-0) tests/common/osutil/test_alpine.py (+3/-2) tests/common/osutil/test_arch.py (+3/-2) tests/common/osutil/test_bigip.py (+20/-21) tests/common/osutil/test_clearlinux.py (+3/-2) tests/common/osutil/test_coreos.py (+3/-2) tests/common/osutil/test_default.py (+429/-316) tests/common/osutil/test_default_osutil.py (+3/-162) tests/common/osutil/test_factory.py (+144/-69) tests/common/osutil/test_freebsd.py (+8/-7) tests/common/osutil/test_nsbsd.py (+12/-11) tests/common/osutil/test_openbsd.py (+3/-2) tests/common/osutil/test_openwrt.py (+3/-2) tests/common/osutil/test_photonos.py (+37/-0) tests/common/osutil/test_redhat.py (+3/-2) tests/common/osutil/test_suse.py (+3/-2) tests/common/osutil/test_ubuntu.py (+1/-1) tests/common/test_agent_supported_feature.py (+55/-0) tests/common/test_cgroupapi.py (+130/-548) tests/common/test_cgroupconfigurator.py (+973/-261) tests/common/test_cgroups.py (+62/-82) tests/common/test_cgroupstelemetry.py (+120/-406) tests/common/test_conf.py 
(+28/-53) tests/common/test_errorstate.py (+2/-1) tests/common/test_event.py (+583/-314) tests/common/test_logcollector.py (+477/-0) tests/common/test_logger.py (+45/-45) tests/common/test_persist_firewall_rules.py (+416/-0) tests/common/test_singletonperthread.py (+164/-0) tests/common/test_telemetryevent.py (+20/-19) tests/common/test_version.py (+80/-36) tests/daemon/test_daemon.py (+15/-14) tests/daemon/test_resourcedisk.py (+5/-5) tests/data/cgroups/cpu.stat (+3/-0) tests/data/cgroups/cpu.stat_t0 (+3/-0) tests/data/cgroups/cpu.stat_t1 (+3/-0) tests/data/cgroups/cpuacct.stat (+2/-0) tests/data/cgroups/memory_mount/memory.stat (+36/-0) tests/data/cgroups/missing_memory_counters/memory.stat (+34/-0) tests/data/cgroups/proc_pid_cgroup (+13/-0) tests/data/cgroups/proc_self_cgroup (+13/-0) tests/data/cgroups/sys_fs_cgroup_unified_cgroup.controllers (+7/-0) tests/data/cloud-init/set-hostname (+4/-0) tests/data/events/custom_script_1.tld (+30/-0) tests/data/events/custom_script_2.tld (+30/-0) tests/data/events/custom_script_extra_parameters.tld (+66/-0) tests/data/events/custom_script_invalid_json.tld (+30/-0) tests/data/events/custom_script_no_read_access.tld (+30/-0) tests/data/events/custom_script_nonascii_characters.tld (+30/-0) tests/data/events/event_with_callstack.waagent.tld (+1/-0) tests/data/events/extension_events/different_cases/1591918616.json (+22/-0) tests/data/events/extension_events/empty_message/1592350454.json (+24/-0) tests/data/events/extension_events/extra_parameters/1592273009.json (+35/-0) tests/data/events/extension_events/int_type/1519934744.json (+10/-0) tests/data/events/extension_events/large_messages/1591921510.json (+12/-0) tests/data/events/extension_events/malformed_files/1592008079.json (+13/-0) tests/data/events/extension_events/malformed_files/1594857360.tld (+11/-0) tests/data/events/extension_events/malformed_files/bad_json_files/1591816395.json (+3/-0) tests/data/events/extension_events/malformed_files/bad_name_file.json (+24/-0) 
tests/data/events/extension_events/missing_parameters/1592273793.json (+74/-0) tests/data/events/extension_events/mix_files/1591835369.json (+3/-0) tests/data/events/extension_events/mix_files/1591835848.json (+85/-0) tests/data/events/extension_events/mix_files/1591835859.json (+11/-0) tests/data/events/extension_events/special_chars/1591918939.json (+10/-0) tests/data/events/extension_events/well_formed_files/1591905451.json (+82/-0) tests/data/events/extension_events/well_formed_files/1592355539.json (+72/-0) tests/data/events/extension_events/well_formed_files/9999999999.json (+82/-0) tests/data/events/legacy_agent.tld (+66/-0) tests/data/events/legacy_agent_no_timestamp.tld (+62/-0) tests/data/ext/event_from_agent.json (+119/-1) tests/data/ext/event_from_extension.xml (+9/-6) tests/data/ext/sample-status-invalid-format-emptykey-line7.json (+37/-0) tests/data/ext/sample-status-invalid-json-format.json (+37/-0) tests/data/ext/sample-status-invalid-status-no-status-status-key.json (+35/-0) tests/data/ext/sample-status-very-large-multiple-substatuses.json (+408/-0) tests/data/ext/sample-status-very-large.json (+39/-0) tests/data/ext/sample-status.json (+36/-0) tests/data/ext/sample_ext-1.3.0/python.sh (+11/-0) tests/data/ext/sample_ext-1.3.0/sample.py (+82/-23) tests/data/hostgaplugin/ext_conf-empty_depends_on.xml (+56/-0) tests/data/hostgaplugin/ext_conf-invalid_blob_type.xml (+94/-0) tests/data/hostgaplugin/ext_conf-no_status_upload_blob.xml (+39/-0) tests/data/hostgaplugin/ext_conf-requested_version.xml (+148/-0) tests/data/hostgaplugin/ext_conf.xml (+146/-0) tests/data/hostgaplugin/in_vm_artifacts_profile.json (+1/-0) tests/data/hostgaplugin/vm_settings-difference_in_required_features.json (+201/-0) tests/data/hostgaplugin/vm_settings-empty_depends_on.json (+69/-0) tests/data/hostgaplugin/vm_settings-fabric-no_thumbprints.json (+192/-0) tests/data/hostgaplugin/vm_settings-invalid_blob_type.json (+104/-0) tests/data/hostgaplugin/vm_settings-missing_cert.json 
(+68/-0) tests/data/hostgaplugin/vm_settings-no_manifests.json (+73/-0) tests/data/hostgaplugin/vm_settings-no_status_upload_blob.json (+66/-0) tests/data/hostgaplugin/vm_settings-out-of-sync.json (+66/-0) tests/data/hostgaplugin/vm_settings-parse_error.json (+72/-0) tests/data/hostgaplugin/vm_settings-requested_version.json (+141/-0) tests/data/hostgaplugin/vm_settings-unsupported_version.json (+72/-0) tests/data/hostgaplugin/vm_settings.json (+201/-0) tests/data/init/azure-vmextensions.slice (+6/-0) tests/data/init/azure-walinuxagent-logcollector.slice (+9/-0) tests/data/init/azure.slice (+4/-0) tests/data/init/walinuxagent.service (+23/-0) tests/data/init/walinuxagent.service.previous (+20/-0) tests/data/init/walinuxagent.service_system-slice (+23/-0) tests/data/test_waagent.conf (+6/-5) tests/data/wire/certs-2.xml (+85/-0) tests/data/wire/certs.xml (+80/-76) tests/data/wire/certs_no_format_specified.xml (+78/-74) tests/data/wire/ext_conf-no_gs_metadata.xml (+27/-0) tests/data/wire/ext_conf.xml (+7/-5) tests/data/wire/ext_conf_additional_locations.xml (+34/-0) tests/data/wire/ext_conf_aks_extension.xml (+70/-0) tests/data/wire/ext_conf_autoupgrade.xml (+9/-7) tests/data/wire/ext_conf_autoupgrade_internalversion.xml (+9/-7) tests/data/wire/ext_conf_dependencies_with_empty_settings.xml (+33/-0) tests/data/wire/ext_conf_in_vm_artifacts_profile.xml (+29/-0) tests/data/wire/ext_conf_in_vm_empty_artifacts_profile.xml (+29/-0) tests/data/wire/ext_conf_in_vm_metadata.xml (+29/-0) tests/data/wire/ext_conf_internalversion.xml (+9/-7) tests/data/wire/ext_conf_invalid_and_valid_handlers.xml (+35/-0) tests/data/wire/ext_conf_invalid_vm_metadata.xml (+29/-0) tests/data/wire/ext_conf_missing_family.xml (+15/-14) tests/data/wire/ext_conf_missing_requested_version.xml (+39/-0) tests/data/wire/ext_conf_multiple_extensions.xml (+13/-32) tests/data/wire/ext_conf_no_extensions-block_blob.xml (+13/-0) tests/data/wire/ext_conf_no_extensions-no_status_blob.xml (+12/-0) 
tests/data/wire/ext_conf_no_extensions-page_blob.xml (+25/-0) tests/data/wire/ext_conf_no_public.xml (+25/-24) tests/data/wire/ext_conf_no_settings.xml (+24/-23) tests/data/wire/ext_conf_requested_version.xml (+29/-0) tests/data/wire/ext_conf_required_features.xml (+41/-0) tests/data/wire/ext_conf_sequencing.xml (+9/-7) tests/data/wire/ext_conf_settings_case_mismatch.xml (+57/-0) tests/data/wire/ext_conf_upgradeguid.xml (+7/-5) tests/data/wire/ga_manifest.xml (+10/-31) tests/data/wire/ga_manifest_no_upgrade.xml (+21/-21) tests/data/wire/goal_state.xml (+7/-7) tests/data/wire/goal_state_no_certs.xml (+27/-0) tests/data/wire/goal_state_no_ext.xml (+6/-5) tests/data/wire/goal_state_noop.xml (+14/-0) tests/data/wire/goal_state_remote_access.xml (+9/-8) tests/data/wire/in_vm_artifacts_profile.json (+1/-0) tests/data/wire/invalid_config/ext_conf_multiple_depends_on_for_single_handler.xml (+45/-0) tests/data/wire/invalid_config/ext_conf_multiple_runtime_settings_same_plugin.xml (+31/-0) tests/data/wire/invalid_config/ext_conf_multiple_settings_for_same_handler.xml (+33/-0) tests/data/wire/invalid_config/ext_conf_plugin_settings_version_mismatch.xml (+31/-0) tests/data/wire/invalid_config/ext_conf_single_and_multi_config_settings_same_plugin.xml (+31/-0) tests/data/wire/manifest.xml (+16/-16) tests/data/wire/manifest_deletion.xml (+1/-1) tests/data/wire/multi-config/ext_conf_mc_disabled_extensions.xml (+84/-0) tests/data/wire/multi-config/ext_conf_mc_update_extensions.xml (+75/-0) tests/data/wire/multi-config/ext_conf_multi_config_no_dependencies.xml (+75/-0) tests/data/wire/multi-config/ext_conf_with_disabled_multi_config.xml (+129/-0) tests/data/wire/multi-config/ext_conf_with_multi_config.xml (+131/-0) tests/data/wire/multi-config/ext_conf_with_multi_config_dependencies.xml (+99/-0) tests/data/wire/trans_cert (+17/-17) tests/data/wire/trans_prv (+26/-26) tests/data/wire/trans_pub (+7/-7) tests/distro/test_resourceDisk.py (+1/-1) tests/distro/test_scvmm.py (+6/-5) 
tests/ga/extension_emulator.py (+373/-0) tests/ga/mocks.py (+119/-0) tests/ga/test_collect_logs.py (+239/-0) tests/ga/test_collect_telemetry_events.py (+576/-0) tests/ga/test_env.py (+50/-50) tests/ga/test_extension.py (+2088/-1355) tests/ga/test_exthandlers.py (+283/-108) tests/ga/test_exthandlers_download_extension.py (+116/-58) tests/ga/test_exthandlers_exthandlerinstance.py (+10/-12) tests/ga/test_monitor.py (+155/-1141) tests/ga/test_multi_config_extension.py (+1229/-0) tests/ga/test_periodic_operation.py (+156/-0) tests/ga/test_remoteaccess.py (+41/-49) tests/ga/test_remoteaccess_handler.py (+429/-446) tests/ga/test_report_status.py (+119/-0) tests/ga/test_send_telemetry_events.py (+430/-0) tests/ga/test_update.py (+1913/-733) tests/pa/test_deprovision.py (+4/-4) tests/pa/test_provision.py (+34/-27) tests/protocol/HttpRequestPredicates.py (+101/-0) tests/protocol/mocks.py (+167/-0) tests/protocol/mockwiredata.py (+260/-47) tests/protocol/test_datacontract.py (+5/-5) tests/protocol/test_extensions_goal_state_from_extensions_config.py (+62/-0) tests/protocol/test_extensions_goal_state_from_vm_settings.py (+156/-0) tests/protocol/test_goal_state.py (+545/-0) tests/protocol/test_healthservice.py (+1/-1) tests/protocol/test_hostplugin.py (+620/-445) tests/protocol/test_image_info_matcher.py (+2/-1) tests/protocol/test_imds.py (+49/-46) tests/protocol/test_metadata_server_migration_util.py (+134/-0) tests/protocol/test_protocol_util.py (+208/-85) tests/protocol/test_wire.py (+864/-934) tests/test_agent.py (+154/-17) tests/tools.py (+87/-86) tests/utils/cgroups_tools.py (+1/-2) tests/utils/event_logger_tools.py (+65/-0) tests/utils/miscellaneous_tools.py (+62/-0) tests/utils/test_archive.py (+123/-179) tests/utils/test_crypt_util.py (+2/-7) tests/utils/test_extension_process_util.py (+103/-54) tests/utils/test_file_util.py (+19/-20) tests/utils/test_flexible_version.py (+21/-19) tests/utils/test_network_util.py (+36/-1) tests/utils/test_rest_util.py (+74/-55) 
tests/utils/test_shell_util.py (+337/-70) tests/utils/test_text_util.py (+32/-14) tests_e2e/orchestrator/docker/Dockerfile (+85/-0) tests_e2e/orchestrator/lib/agent_junit.py (+66/-0) tests_e2e/orchestrator/lib/agent_test_loader.py (+257/-0) tests_e2e/orchestrator/lib/agent_test_suite.py (+645/-0) tests_e2e/orchestrator/lib/agent_test_suite_combinator.py (+249/-0) tests_e2e/orchestrator/runbook.yml (+142/-0) tests_e2e/orchestrator/sample_runbooks/existing_vm.yml (+143/-0) tests_e2e/orchestrator/sample_runbooks/local_machine/hello_world.py (+32/-0) tests_e2e/orchestrator/sample_runbooks/local_machine/local.yml (+32/-0) tests_e2e/orchestrator/scripts/check-agent-log.py (+49/-0) tests_e2e/orchestrator/scripts/collect-logs (+34/-0) tests_e2e/orchestrator/scripts/get-agent-bin-path (+56/-0) tests_e2e/orchestrator/scripts/get-agent-modules-path (+37/-0) tests_e2e/orchestrator/scripts/get-agent-python (+59/-0) tests_e2e/orchestrator/scripts/install-agent (+137/-0) tests_e2e/orchestrator/scripts/install-tools (+135/-0) tests_e2e/orchestrator/scripts/uncompress.py (+33/-0) tests_e2e/orchestrator/scripts/unzip.py (+36/-0) tests_e2e/pipeline/pipeline-cleanup.yml (+58/-0) tests_e2e/pipeline/pipeline.yml (+119/-0) tests_e2e/pipeline/scripts/execute_tests.sh (+120/-0) tests_e2e/test_suites/agent_bvt.yml (+8/-0) tests_e2e/test_suites/fail.yml (+5/-0) tests_e2e/test_suites/images.yml (+94/-0) tests_e2e/test_suites/pass.yml (+4/-0) tests_e2e/tests/bvts/extension_operations.py (+94/-0) tests_e2e/tests/bvts/run_command.py (+94/-0) tests_e2e/tests/bvts/vm_access.py (+79/-0) tests_e2e/tests/error_test.py (+32/-0) tests_e2e/tests/fail_test.py (+33/-0) tests_e2e/tests/lib/agent_log.py (+446/-0) tests_e2e/tests/lib/agent_test.py (+66/-0) tests_e2e/tests/lib/agent_test_context.py (+164/-0) tests_e2e/tests/lib/identifiers.py (+63/-0) tests_e2e/tests/lib/logging.py (+155/-0) tests_e2e/tests/lib/retry.py (+59/-0) tests_e2e/tests/lib/shell.py (+56/-0) tests_e2e/tests/lib/ssh_client.py (+85/-0) 
tests_e2e/tests/lib/virtual_machine.py (+143/-0) tests_e2e/tests/lib/vm_extension.py (+239/-0) tests_e2e/tests/pass_test.py (+33/-0) |
||||
Related bugs: |
|
Reviewer | Review Type | Date Requested | Status |
---|---|---|---|
Lucas Kanashiro | Pending | ||
git-ubuntu import | Pending | ||
Review via email: mp+452920@code.launchpad.net |
This proposal has been superseded by a proposal from 2023-10-31.
Commit message
added sru tests
added manpages
debian/patches: update cgroup logic to include v2
fix_systemd_networkd_lease_file_path.patch
debian/control: Remove isc-dhcp-client from Depends
Remove upstart config
debian/rules: stop installing SysV init files and stop running tests
disable_udev_overrides.patch
debian/patches: Remove deprecated patches
debian/install: add new udev rules
debian/docs: Remove Changelog
debian/watch: Fix package version regex pattern
New upstream version 2.9.1
Description of the change
Bryce Harrington (bryce) wrote : | # |
Calvin Mwadime Makokha (calvinmwadime) wrote : | # |
Hi Bryce,
> Hi Calvin, it's a bit late in the release for merges, did you have an MRE or
> SRU bug to accompany this?
After discussion with Utkarsh, it was decided we shall revisit this MP later since I was late to raise it.
> Also, I'm not spotting changes to debian/changelog, was that intentional to be
> excluded?
There are changes made to it. I think the diff display might have been truncated; however, you can see the file listed above among the modified files.
Lucas Kanashiro (lucaskanashiro) wrote : | # |
Hi Calvin,
Since you decided to work on this later, I'd ask you to check if there is a way to remove the review slot for ubuntu-sponsors. Otherwise, it will be kept in the general sponsorship queue and patch pilots will keep revisiting this.
Calvin Mwadime Makokha (calvinmwadime) wrote : | # |
Hi Lucas, what is the best way to do so? I have changed its status from `needs review` to `work in progress` — will that help?
Calvin Mwadime Makokha (calvinmwadime) wrote : | # |
Hi Lucas, there has been a change in urgency for this merge. I would request a review on this merge request. Changing the status to `needs review`.
- 8a67b96... by Calvin Mwadime Makokha
-
fix_systemd_networkd_lease_file_path.patch: patch osutil/ubuntu.py to use systemd_networkd for newer Ubuntu servers
- 7d40672... by Calvin Mwadime Makokha
-
changelog: update version
- 0b1a8de... by Calvin Mwadime Makokha
-
debian/patches: update cgroup logic to include v2
Properly handle systemd using cgroup v2
- c7bd079... by Calvin Mwadime Makokha
-
changelog update
- dbbfc63... by Calvin Mwadime Makokha
-
Add manpages
- fbe4dd5... by Calvin Mwadime Makokha
-
changelog update
- 1d131ad... by Calvin Mwadime Makokha
-
Added sru tests
- 9c007ee... by Calvin Mwadime Makokha
-
changelog update
Unmerged commits
- 9c007ee... by Calvin Mwadime Makokha
-
changelog update
- 1d131ad... by Calvin Mwadime Makokha
-
Added sru tests
- fbe4dd5... by Calvin Mwadime Makokha
-
changelog update
- dbbfc63... by Calvin Mwadime Makokha
-
Add manpages
- c7bd079... by Calvin Mwadime Makokha
-
changelog update
- 0b1a8de... by Calvin Mwadime Makokha
-
debian/patches: update cgroup logic to include v2
Properly handle systemd using cgroup v2
- 7d40672... by Calvin Mwadime Makokha
-
changelog: update version
- 8a67b96... by Calvin Mwadime Makokha
-
fix_systemd_networkd_lease_file_path.patch: patch osutil/ubuntu.py to use systemd_networkd for newer Ubuntu servers
- 95c00e8... by Calvin Mwadime Makokha
-
debian/control: Remove isc-dhcp-client from Depends
- 9f935bf... by Calvin Mwadime Makokha
-
Remove upstart config
Preview Diff
1 | diff --git a/.flake8 b/.flake8 |
2 | deleted file mode 100644 |
3 | index 63303c3..0000000 |
4 | --- a/.flake8 |
5 | +++ /dev/null |
6 | @@ -1,32 +0,0 @@ |
7 | -# |
8 | -# The project did not use flake8 since inception so there are a number |
9 | -# of time-consuming flake8-identified improvements that are just a lot |
10 | -# of busy work. Each of these should be disabled and code cleaned up. |
11 | -# |
12 | -# W503: Line break occurred before a binary operator |
13 | -# W504: Line break occurred after a binary operator |
14 | -# E126: Continuation line over-indented for hanging indent |
15 | -# E127: Continuation line over-indented for visual indent |
16 | -# E128: Continuation line under-indented for visual indent |
17 | -# E201: Whitespace after '(' |
18 | -# E202: Whitespace before ')' |
19 | -# E203: Whitespace before ':' |
20 | -# E221: Multiple spaces before operator |
21 | -# E225: Missing whitespace around operator |
22 | -# E226: Missing whitespace around arithmetic operator |
23 | -# E231: Missing whitespace after ',', ';', or ':' |
24 | -# E261: At least two spaces before inline comment |
25 | -# E265: Block comment should start with '# ' |
26 | -# E302: Expected 2 blank lines, found 0 |
27 | -# E501: Line too long (xx > yy characters) |
28 | -# E502: The backslash is redundant between brackets |
29 | -# F401: Module imported but unused |
30 | -# F403: 'from module import *' used; unable to detect undefined names |
31 | -# F405: Name may be undefined, or defined from star imports: module |
32 | -# |
33 | - |
34 | -[flake8] |
35 | -ignore = W503,W504,E126,E127,E128,E201,E202,E203,E221,E225,E226,E231,E261,E265,E302,E501,E502,F401,F403,F405 |
36 | -exclude = .git,__pycache__,docs/source/conf.py,old,build,dist,tests |
37 | -max-complexity = 30 |
38 | -max-line-length = 120 |
39 | \ No newline at end of file |
40 | diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md |
41 | index edfa1e6..fdcc07c 100644 |
42 | --- a/.github/PULL_REQUEST_TEMPLATE.md |
43 | +++ b/.github/PULL_REQUEST_TEMPLATE.md |
44 | @@ -14,9 +14,8 @@ This will expedite the process of getting your pull request merged and avoid ext |
45 | ### PR information |
46 | - [ ] The title of the PR is clear and informative. |
47 | - [ ] There are a small number of commits, each of which has an informative message. This means that previously merged commits do not appear in the history of the PR. For information on cleaning up the commits in your pull request, [see this page](https://github.com/Azure/azure-powershell/blob/master/documentation/development-docs/cleaning-up-commits.md). |
48 | -- [ ] Except for special cases involving multiple contributors, the PR is started from a fork of the main repository, not a branch. |
49 | - [ ] If applicable, the PR references the bug/issue that it fixes in the description. |
50 | -- [ ] New Unit tests were added for the changes made and Travis.CI is passing. |
51 | +- [ ] New Unit tests were added for the changes made |
52 | |
53 | ### Quality of Code and Contribution Guidelines |
54 | - [ ] I have read the [contribution guidelines](https://github.com/Azure/WALinuxAgent/blob/master/.github/CONTRIBUTING.md). |
55 | \ No newline at end of file |
56 | diff --git a/.github/codecov.yml b/.github/codecov.yml |
57 | new file mode 100644 |
58 | index 0000000..77707aa |
59 | --- /dev/null |
60 | +++ b/.github/codecov.yml |
61 | @@ -0,0 +1,2 @@ |
62 | +github_checks: |
63 | + annotations: false |
64 | diff --git a/.github/workflows/ci_pr.yml b/.github/workflows/ci_pr.yml |
65 | new file mode 100644 |
66 | index 0000000..e559268 |
67 | --- /dev/null |
68 | +++ b/.github/workflows/ci_pr.yml |
69 | @@ -0,0 +1,128 @@ |
70 | +name: CI Unit tests |
71 | + |
72 | +on: |
73 | + push: |
74 | + branches: [ "*" ] |
75 | + pull_request: |
76 | + branches: [ "*" ] |
77 | + workflow_dispatch: |
78 | + |
79 | +jobs: |
80 | + test-legacy-python-versions: |
81 | + |
82 | + strategy: |
83 | + fail-fast: false |
84 | + matrix: |
85 | + include: |
86 | + - python-version: 2.6 |
87 | + - python-version: 3.4 |
88 | + |
89 | + name: "Python ${{ matrix.python-version }} Unit Tests" |
90 | + runs-on: ubuntu-20.04 |
91 | + container: |
92 | + image: ubuntu:16.04 |
93 | + volumes: |
94 | + - /home/waagent:/home/waagent |
95 | + defaults: |
96 | + run: |
97 | + shell: bash -l {0} |
98 | + |
99 | + env: |
100 | + NOSEOPTS: "--verbose" |
101 | + |
102 | + steps: |
103 | + - uses: actions/checkout@v3 |
104 | + |
105 | + - name: Install Python ${{ matrix.python-version }} |
106 | + run: | |
107 | + apt-get update |
108 | + apt-get install -y curl bzip2 sudo python3 |
109 | + curl https://dcrdata.blob.core.windows.net/python/python-${{ matrix.python-version }}.tar.bz2 -o python-${{ matrix.python-version }}.tar.bz2 |
110 | + sudo tar xjvf python-${{ matrix.python-version }}.tar.bz2 --directory / |
111 | + |
112 | + - name: Test with nosetests |
113 | + run: | |
114 | + if [[ ${{ matrix.python-version }} == 2.6 ]]; then |
115 | + source /home/waagent/virtualenv/python2.6.9/bin/activate |
116 | + else |
117 | + source /home/waagent/virtualenv/python3.4.8/bin/activate |
118 | + fi |
119 | + ./ci/nosetests.sh |
120 | + exit $? |
121 | + |
122 | + test-current-python-versions: |
123 | + |
124 | + strategy: |
125 | + fail-fast: false |
126 | + matrix: |
127 | + include: |
128 | + |
129 | + - python-version: 2.7 |
130 | + PYLINTOPTS: "--rcfile=ci/2.7.pylintrc --ignore=tests_e2e,makepkg.py" |
131 | + |
132 | + - python-version: 3.5 |
133 | + PYLINTOPTS: "--rcfile=ci/3.6.pylintrc --ignore=tests_e2e,makepkg.py" |
134 | + |
135 | + - python-version: 3.6 |
136 | + PYLINTOPTS: "--rcfile=ci/3.6.pylintrc --ignore=tests_e2e" |
137 | + |
138 | + - python-version: 3.7 |
139 | + PYLINTOPTS: "--rcfile=ci/3.6.pylintrc --ignore=tests_e2e" |
140 | + |
141 | + - python-version: 3.8 |
142 | + PYLINTOPTS: "--rcfile=ci/3.6.pylintrc --ignore=tests_e2e" |
143 | + |
144 | + - python-version: 3.9 |
145 | + PYLINTOPTS: "--rcfile=ci/3.6.pylintrc" |
146 | + additional-nose-opts: "--with-coverage --cover-erase --cover-inclusive --cover-branches --cover-package=azurelinuxagent" |
147 | + |
148 | + name: "Python ${{ matrix.python-version }} Unit Tests" |
149 | + runs-on: ubuntu-20.04 |
150 | + |
151 | + env: |
152 | + PYLINTOPTS: ${{ matrix.PYLINTOPTS }} |
153 | + PYLINTFILES: "azurelinuxagent setup.py makepkg.py tests tests_e2e" |
154 | + NOSEOPTS: "--with-timer ${{ matrix.additional-nose-opts }}" |
155 | + PYTHON_VERSION: ${{ matrix.python-version }} |
156 | + |
157 | + steps: |
158 | + |
159 | + - name: Checkout WALinuxAgent repo |
160 | + uses: actions/checkout@v3 |
161 | + |
162 | + - name: Setup Python ${{ matrix.python-version }} |
163 | + uses: actions/setup-python@v4 |
164 | + with: |
165 | + python-version: ${{ matrix.python-version }} |
166 | + |
167 | + - name: Install dependencies |
168 | + id: install-dependencies |
169 | + run: | |
170 | + sudo env "PATH=$PATH" python -m pip install --upgrade pip |
171 | + sudo env "PATH=$PATH" pip install -r requirements.txt |
172 | + sudo env "PATH=$PATH" pip install -r test-requirements.txt |
173 | + |
174 | + - name: Run pylint |
175 | + run: | |
176 | + pylint $PYLINTOPTS --jobs=0 $PYLINTFILES |
177 | + |
178 | + - name: Test with nosetests |
179 | + if: success() || (failure() && steps.install-dependencies.outcome == 'success') |
180 | + run: | |
181 | + ./ci/nosetests.sh |
182 | + exit $? |
183 | + |
184 | + - name: Compile Coverage |
185 | + if: matrix.python-version == 3.9 |
186 | + run: | |
187 | + echo looking for coverage files : |
188 | + ls -alh | grep -i coverage |
189 | + sudo env "PATH=$PATH" coverage combine coverage.*.data |
190 | + sudo env "PATH=$PATH" coverage xml |
191 | + sudo env "PATH=$PATH" coverage report |
192 | + |
193 | + - name: Upload Coverage |
194 | + if: matrix.python-version == 3.9 |
195 | + uses: codecov/codecov-action@v2 |
196 | + with: |
197 | + file: ./coverage.xml |
198 | \ No newline at end of file |
199 | diff --git a/.gitignore b/.gitignore |
200 | index 0a31340..fd64d33 100644 |
201 | --- a/.gitignore |
202 | +++ b/.gitignore |
203 | @@ -17,8 +17,6 @@ develop-eggs/ |
204 | dist/ |
205 | downloads/ |
206 | eggs/ |
207 | -lib/ |
208 | -lib64/ |
209 | parts/ |
210 | sdist/ |
211 | var/ |
212 | @@ -92,3 +90,4 @@ ENV/ |
213 | |
214 | # pyenv |
215 | .python-version |
216 | +.vscode/ |
217 | diff --git a/.travis.yml b/.travis.yml |
218 | deleted file mode 100644 |
219 | index fa672d3..0000000 |
220 | --- a/.travis.yml |
221 | +++ /dev/null |
222 | @@ -1,43 +0,0 @@ |
223 | ---- |
224 | -os: linux |
225 | -dist: xenial |
226 | -language: python |
227 | -env: |
228 | - - NOSEOPTS="--verbose" SETUPOPTS="" |
229 | - # Add SETUPOPTS="check flake8" to enable flake8 checks |
230 | - |
231 | -matrix: |
232 | - # exclude the default "python" build - we're being specific here... |
233 | - exclude: |
234 | - - python: |
235 | - env: |
236 | - - NOSEOPTS="" SETUPOPTS="check flake8" |
237 | - |
238 | - include: |
239 | - - python: 2.6 |
240 | - dist: trusty |
241 | - env: |
242 | - - NOSEOPTS="--verbose" SETUPOPTS="" |
243 | - - python: 2.7 |
244 | - - python: 3.4 |
245 | - - python: 3.6 |
246 | - - python: 3.7 |
247 | - env: |
248 | - - >- |
249 | - NOSEOPTS="--verbose --with-coverage --cover-inclusive |
250 | - --cover-min-percentage=60 --cover-branches |
251 | - --cover-package=azurelinuxagent --cover-xml" |
252 | - SETUPOPTS="" |
253 | - |
254 | -install: |
255 | - - pip install -r requirements.txt |
256 | - - pip install -r test-requirements.txt |
257 | - |
258 | -script: |
259 | - # future: - pylint setup.py makepkg.py azurelinuxagent/ |
260 | - - nosetests $NOSEOPTS --attr '!requires_sudo' tests |
261 | - - sudo env "PATH=$PATH" nosetests $NOSEOPTS --verbose --attr 'requires_sudo' tests |
262 | - - if [ ! -z "$SETUPOPTS" ]; then /usr/bin/env python setup.py $SETUPOPTS; fi |
263 | - |
264 | -after_success: |
265 | - - if [[ $TRAVIS_PYTHON_VERSION == 3.7 ]]; then codecov; fi |
266 | \ No newline at end of file |
267 | diff --git a/CODEOWNERS b/CODEOWNERS |
268 | index a8f2d5d..8707e60 100644 |
269 | --- a/CODEOWNERS |
270 | +++ b/CODEOWNERS |
271 | @@ -1,3 +1,4 @@ |
272 | +1 |
273 | # See https://help.github.com/articles/about-codeowners/ |
274 | # for more info about CODEOWNERS file |
275 | |
276 | @@ -9,6 +10,7 @@ |
277 | # when there are requests for changes in the provisioning agent. For any |
278 | # questions, please feel free to reach out to thstring@microsoft.com. |
279 | /azurelinuxagent/pa/ @trstringer @anhvoms |
280 | +/tests/pa/ @trstringer @anhvoms |
281 | |
282 | # |
283 | # RDMA |
284 | @@ -19,4 +21,4 @@ |
285 | # |
286 | # Linux Agent team |
287 | # |
288 | -* @narrieta @vrdmr @pgombar @larohra |
289 | +* @narrieta @ZhidongPeng @nagworld9 @maddieford |
290 | diff --git a/Changelog b/Changelog |
291 | deleted file mode 100644 |
292 | index da68890..0000000 |
293 | --- a/Changelog |
294 | +++ /dev/null |
295 | @@ -1,38 +0,0 @@ |
296 | -WALinuxAgent Changelog |
297 | -||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| |
298 | - |
299 | -Refer to releases WALinuxAgent release page: https://github.com/Azure/WALinuxAgent/releases for detailed changelog after v2.2.0 |
300 | - |
301 | -12 August 2016, v2.1.6 |
302 | - . Improved RDMA support |
303 | - . Extension state migration |
304 | - . Alpine Linux support |
305 | - . Fixes for #347, #351, #353 |
306 | - |
307 | -15 July 2016, v2.1.5 |
308 | - . Goal state processing extension |
309 | - . Multi-nic improvements |
310 | - . Bug fixes for #145, #141, #133, #116, #187, #169, #104, #127, #163, |
311 | - #190, #185, #174 |
312 | - |
313 | -09 Mar 2016, WALinuxAgent 2.1.4 |
314 | - . Add support for FreeBSD |
315 | - . Fix a bug for internal extension version resolving |
316 | - |
317 | -29 Jan 2016, WALinuxAgent 2.1.3 |
318 | - . Fixed endpoint probing for Azure Stack |
319 | - . Multiple fixes for extension handling |
320 | - |
321 | -07 Dec 2015, WALinuxAgent 2.1.2 |
322 | - . Multiple fixes for extension handling and provisioning |
323 | - |
324 | -07 Aug 2015, WALinuxAgent 2.1.1 |
325 | - . Support python3 |
326 | - . Fixed bugs for metadata protocol |
327 | - . Fixed a few pylint warnings |
328 | - . Enabled travis-ci |
329 | - |
330 | -||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| |
331 | -01 Jul 2015, WALinuxAgent 2.1.0 |
332 | - . Divide waagent into different modules |
333 | - |
334 | diff --git a/README.md b/README.md |
335 | index 0069d46..ae6a851 100644 |
336 | --- a/README.md |
337 | +++ b/README.md |
338 | @@ -1,31 +1,15 @@ |
339 | + |
340 | # Microsoft Azure Linux Agent |
341 | |
342 | -## Develop branch status |
343 | - |
344 | -[![Travis CI](https://travis-ci.org/Azure/WALinuxAgent.svg?branch=develop)](https://travis-ci.org/Azure/WALinuxAgent/branches) |
345 | -[![CodeCov](https://codecov.io/gh/Azure/WALinusAgent/branch/develop/graph/badge.svg)](https://codecov.io/gh/Azure/WALinuxAgent/branch/develop) |
346 | - |
347 | -Each badge below represents our basic validation tests for an image, which are executed several times each day. These include provisioning, user account, disk, extension and networking scenarios. |
348 | - |
349 | -Note: These badges represent testing to our develop branch which might not be stable. For a stable build please use master branch instead. |
350 | - |
351 | -Image | Status | |
352 | -------|--------| |
353 | -Canonical UbuntuServer 14.04.5-LTS|![badge](https://dcrbadges.blob.core.windows.net/scenarios/Canonical_UbuntuServer_14.04.5-LTS__agent--bvt.svg) |
354 | -Canonical UbuntuServer 14.04.5-DAILY-LTS|![badge](https://dcrbadges.blob.core.windows.net/scenarios/Canonical_UbuntuServer_14.04.5-DAILY-LTS__agent--bvt.svg) |
355 | -Canonical UbuntuServer 16.04-LTS|![badge](https://dcrbadges.blob.core.windows.net/scenarios/Canonical_UbuntuServer_16.04-LTS__agent--bvt.svg) |
356 | -Canonical UbuntuServer 16.04-DAILY-LTS|![badge](https://dcrbadges.blob.core.windows.net/scenarios/Canonical_UbuntuServer_16.04-DAILY-LTS__agent--bvt.svg) |
357 | -Canonical UbuntuServer 18.04-LTS|![badge](https://dcrbadges.blob.core.windows.net/scenarios/Canonical_UbuntuServer_18.04-LTS__agent--bvt.svg) |
358 | -Canonical UbuntuServer 18.04-DAILY-LTS|![badge](https://dcrbadges.blob.core.windows.net/scenarios/Canonical_UbuntuServer_18.04-DAILY-LTS__agent--bvt.svg) |
359 | -Credativ Debian 8|![badge](https://dcrbadges.blob.core.windows.net/scenarios/Credativ_Debian_8__agent--bvt.svg) |
360 | -Credativ Debian 8-DAILY|![badge](https://dcrbadges.blob.core.windows.net/scenarios/Credativ_Debian_8-DAILY__agent--bvt.svg) |
361 | -Credativ Debian 9|![badge](https://dcrbadges.blob.core.windows.net/scenarios/Credativ_Debian_9__agent--bvt.svg) |
362 | -Credativ Debian 9-DAILY|![badge](https://dcrbadges.blob.core.windows.net/scenarios/Credativ_Debian_9-DAILY__agent--bvt.svg) |
363 | -OpenLogic CentOS 6.9|![badge](https://dcrbadges.blob.core.windows.net/scenarios/OpenLogic_CentOS_6.9__agent--bvt.svg) |
364 | -OpenLogic CentOS 7.4|![badge](https://dcrbadges.blob.core.windows.net/scenarios/OpenLogic_CentOS_7.4__agent--bvt.svg) |
365 | -RedHat RHEL 6.9|![badge](https://dcrbadges.blob.core.windows.net/scenarios/RedHat_RHEL_6.9__agent--bvt.svg) |
366 | -RedHat RHEL 7-RAW|![badge](https://dcrbadges.blob.core.windows.net/scenarios/RedHat_RHEL_7-RAW__agent--bvt.svg) |
367 | -SUSE SLES 12-SP3|![badge](https://dcrbadges.blob.core.windows.net/scenarios/SUSE_SLES_12-SP3__agent--bvt.svg) |
368 | +## Linux distributions support |
369 | + |
370 | +Our daily automation tests most of the [Linux distributions supported by Azure](https://docs.microsoft.com/en-us/azure/virtual-machines/linux/endorsed-distros); the Agent can be |
371 | +used on other distributions as well, but development, testing and support for those are done by the open source community. |
372 | + |
373 | +Testing is done using the develop branch, which can be unstable. For a stable build please use the master branch instead. |
374 | + |
375 | +[![CodeCov](https://codecov.io/gh/Azure/WALinuxAgent/branch/develop/graph/badge.svg)](https://codecov.io/gh/Azure/WALinuxAgent/branch/develop) |
376 | + |
377 | |
378 | ## Introduction |
379 | |
380 | @@ -49,7 +33,6 @@ functionality for Linux IaaS deployments: |
381 | |
382 | * Kernel |
383 | * Configure virtual NUMA (disable for kernel <2.6.37) |
384 | - * Consume Hyper-V entropy for /dev/random |
385 | * Configure SCSI timeouts for the root device (which could be remote) |
386 | |
387 | * Diagnostics |
388 | @@ -79,13 +62,15 @@ The agent will use an HTTP proxy if provided via the `http_proxy` (for `http` re |
389 | `https_proxy` (for `https` requests) environment variables. The `HttpProxy.Host` and |
390 | `HttpProxy.Port` configuration variables (see below), if used, will override the environment |
391 | settings. Due to limitations of Python, the agent *does not* support HTTP proxies requiring |
392 | -authentication. |
393 | +authentication. Note that when the agent service is managed by systemd, environment variables |
394 | +such as `http_proxy` and `https_proxy` should be defined using one the mechanisms provided by |
395 | +systemd (e.g. by using Environment or EnvironmentFile in the service file). |
396 | |
397 | ## Requirements |
398 | |
399 | The following systems have been tested and are known to work with the Azure |
400 | Linux Agent. Please note that this list may differ from the official list |
401 | -of supported systems on the Microsoft Azure Platform as described [here](http://support.microsoft.com/kb/2805216). |
402 | +of supported systems on the Microsoft Azure Platform as described [here](https://docs.microsoft.com/en-us/azure/virtual-machines/linux/endorsed-distros). |
403 | |
404 | Waagent depends on some system packages in order to function properly: |
405 | |
406 | @@ -109,6 +94,12 @@ For more advanced installation options, such as installing to custom locations o |
407 | sudo python setup.py install --register-service |
408 | ``` |
409 | |
410 | +For Python 3, use: |
411 | + |
412 | +```bash |
413 | + sudo python3 setup.py install --register-service |
414 | +``` |
415 | + |
416 | You can view more installation options by running: |
417 | |
418 | ```bash |
419 | @@ -177,6 +168,8 @@ For CoreOS, use: |
420 | |
421 | `-start`: Run waagent as a background process |
422 | |
423 | +`-collect-logs [-full]`: Runs the log collector utility that collects relevant agent logs for debugging and stores them in the agent folder on disk. Exact location will be shown when run. Use flag `-full` for more exhaustive log collection. |
424 | + |
425 | ## Configuration |
426 | |
427 | A configuration file (/etc/waagent.conf) controls the actions of waagent. Blank lines and lines whose first character is a `#` are ignored (end-of-line comments are *not* supported). |
428 | @@ -185,6 +178,7 @@ A sample configuration file is shown below: |
429 | |
430 | ```yml |
431 | Extensions.Enabled=y |
432 | +Extensions.GoalStatePeriod=6 |
433 | Provisioning.Agent=auto |
434 | Provisioning.DeleteRootPassword=n |
435 | Provisioning.RegenerateSshHostKeyPair=y |
436 | @@ -202,6 +196,8 @@ ResourceDisk.EnableSwap=n |
437 | ResourceDisk.EnableSwapEncryption=n |
438 | ResourceDisk.SwapSizeMB=0 |
439 | Logs.Verbose=n |
440 | +Logs.Collect=y |
441 | +Logs.CollectPeriod=3600 |
442 | OS.AllowHTTP=n |
443 | OS.RootDeviceScsiTimeout=300 |
444 | OS.EnableFIPS=n |
445 | @@ -210,8 +206,6 @@ OS.SshClientAliveInterval=180 |
446 | OS.SshDir=/etc/ssh |
447 | HttpProxy.Host=None |
448 | HttpProxy.Port=None |
449 | -CGroups.EnforceLimits=y |
450 | -CGroups.Excluded=customscript,runcommand |
451 | ``` |
452 | |
453 | The various configuration options are described in detail below. Configuration |
454 | @@ -238,6 +232,32 @@ without the agent. In order to do that, the `provisionVMAgent` flag must be set |
455 | provisioning time, via whichever API is being used. We will provide more details on |
456 | this on our wiki when it is generally available. |
457 | |
458 | +#### __Extensions.GoalStatePeriod__ |
459 | + |
460 | +_Type: Integer_ |
461 | +_Default: 6_ |
462 | + |
463 | +How often to poll for new goal states (in seconds) and report the status of the VM |
464 | +and extensions. Goal states describe the desired state of the extensions on the VM. |
465 | + |
466 | +_Note_: setting up this parameter to more than a few minutes can make the state of |
467 | +the VM be reported as unresponsive/unavailable on the Azure portal. Also, this |
468 | +setting affects how fast the agent starts executing extensions. |
469 | + |
470 | +#### __AutoUpdate.Enabled__ |
471 | + |
472 | +_Type: Boolean_ |
473 | +_Default: y_ |
474 | + |
475 | +Enables auto-update of the Extension Handler. The Extension Handler is responsible |
476 | +for managing extensions and reporting VM status. The core functionality of the agent |
477 | +is contained in the Extension Handler, and we encourage users to enable this option |
478 | +in order to maintain an up to date version. |
479 | + |
480 | +On most distros the default value is 'y'. |
481 | + |
482 | +For more information on the agent version, see our [FAQ](https://github.com/Azure/WALinuxAgent/wiki/FAQ#what-does-goal-state-agent-mean-in-waagent---version-output). |
483 | + |
484 | #### __Provisioning.Agent__ |
485 | |
486 | _Type: String_ |
487 | @@ -261,7 +281,22 @@ _Note_: This configuration option has been removed and has no effect. waagent |
488 | now auto-detects cloud-init as a provisioning agent (with an option to override |
489 | with `Provisioning.Agent`). |
490 | |
491 | -#### __Provisioning.UseCloudInit__ (*removed in 2.2.45*) |
492 | +#### __Provisioning.MonitorHostName__ |
493 | + |
494 | +_Type: Boolean_ |
495 | +_Default: n_ |
496 | + |
497 | +Monitor host name changes and publish changes via DHCP requests. |
498 | + |
499 | +#### __Provisioning.MonitorHostNamePeriod__ |
500 | + |
501 | +_Type: Integer_ |
502 | +_Default: 30_ |
503 | + |
504 | +How often to monitor host name changes (in seconds). This setting is ignored if |
505 | +MonitorHostName is not set. |
506 | + |
507 | +#### __Provisioning.UseCloudInit__ |
508 | |
509 | _Type: Boolean_ |
510 | _Default: n_ |
511 | @@ -397,7 +432,7 @@ system swap space. |
512 | _Type: Boolean_ |
513 | _Default: n_ |
514 | |
515 | -If set, the swap file (/swapfile) is mounted as an encrypted filesystem. |
516 | +If set, the swap file (/swapfile) is mounted as an encrypted filesystem (flag supported only on FreeBSD.) |
517 | |
518 | #### __ResourceDisk.SwapSizeMB__ |
519 | |
520 | @@ -414,6 +449,25 @@ _Default: n_ |
521 | If set, log verbosity is boosted. Waagent logs to /var/log/waagent.log and |
522 | leverages the system logrotate functionality to rotate logs. |
523 | |
524 | + |
525 | +#### __Logs.Collect__ |
526 | + |
527 | +_Type: Boolean_ |
528 | +_Default: y_ |
529 | + |
530 | +If set, agent logs will be periodically collected and uploaded to a secure location for improved supportability. |
531 | + |
532 | +NOTE: This feature relies on the agent's resource usage features (cgroups); this flag will not take effect on any distro not supported. |
533 | + |
534 | +#### __Logs.CollectPeriod__ |
535 | + |
536 | +_Type: Integer_ |
537 | +_Default: 3600_ |
538 | + |
539 | +This configures how frequently to collect and upload logs. Default is each hour. |
540 | + |
541 | +NOTE: This only takes effect if the Logs.Collect option is enabled. |
542 | + |
543 | #### __OS.AllowHTTP__ |
544 | |
545 | _Type: Boolean_ |
546 | @@ -442,6 +496,14 @@ OpenSSL commands. This signals OpenSSL to use any installed FIPS-compliant libra |
547 | Note that the agent itself has no FIPS-specific code. _If no FIPS-compliant certificates are |
548 | installed, then enabling this option will cause all OpenSSL commands to fail._ |
549 | |
550 | +#### __OS.MonitorDhcpClientRestartPeriod__ |
551 | + |
552 | +_Type: Integer_ |
553 | +_Default: 30_ |
554 | + |
555 | +The agent monitor restarts of the DHCP client and restores network rules when it happens. This |
556 | +setting determines how often (in seconds) to monitor for restarts. |
557 | + |
558 | #### __OS.RootDeviceScsiTimeout__ |
559 | |
560 | _Type: Integer_ |
561 | @@ -450,6 +512,14 @@ _Default: 300_ |
562 | This configures the SCSI timeout in seconds on the root device. If not set, the |
563 | system defaults are used. |
564 | |
565 | +#### __OS.RootDeviceScsiTimeoutPeriod__ |
566 | + |
567 | +_Type: Integer_ |
568 | +_Default: 30_ |
569 | + |
570 | +How often to set the SCSI timeout on the root device (in seconds). This setting is |
571 | +ignored if RootDeviceScsiTimeout is not set. |
572 | + |
573 | #### __OS.OpensslPath__ |
574 | |
575 | _Type: String_ |
576 | @@ -458,6 +528,13 @@ _Default: None_ |
577 | This can be used to specify an alternate path for the openssl binary to use for |
578 | cryptographic operations. |
579 | |
580 | +#### __OS.RemovePersistentNetRulesPeriod__ |
581 | +_Type: Integer_ |
582 | +_Default: 30_ |
583 | + |
584 | +How often to remove the udev rules for persistent network interface names (75-persistent-net-generator.rules |
585 | +and /etc/udev/rules.d/70-persistent-net.rules) (in seconds) |
586 | + |
587 | #### __OS.SshClientAliveInterval__ |
588 | |
589 | _Type: Integer_ |
590 | diff --git a/SECURITY.md b/SECURITY.md |
591 | new file mode 100644 |
592 | index 0000000..e138ec5 |
593 | --- /dev/null |
594 | +++ b/SECURITY.md |
595 | @@ -0,0 +1,41 @@ |
596 | +<!-- BEGIN MICROSOFT SECURITY.MD V0.0.8 BLOCK --> |
597 | + |
598 | +## Security |
599 | + |
600 | +Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). |
601 | + |
602 | +If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/opensource/security/definition), please report it to us as described below. |
603 | + |
604 | +## Reporting Security Issues |
605 | + |
606 | +**Please do not report security vulnerabilities through public GitHub issues.** |
607 | + |
608 | +Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/opensource/security/create-report). |
609 | + |
610 | +If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey). |
611 | + |
612 | +You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc). |
613 | + |
614 | +Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: |
615 | + |
616 | + * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) |
617 | + * Full paths of source file(s) related to the manifestation of the issue |
618 | + * The location of the affected source code (tag/branch/commit or direct URL) |
619 | + * Any special configuration required to reproduce the issue |
620 | + * Step-by-step instructions to reproduce the issue |
621 | + * Proof-of-concept or exploit code (if possible) |
622 | + * Impact of the issue, including how an attacker might exploit the issue |
623 | + |
624 | +This information will help us triage your report more quickly. |
625 | + |
626 | +If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/opensource/security/bounty) page for more details about our active programs. |
627 | + |
628 | +## Preferred Languages |
629 | + |
630 | +We prefer all communications to be in English. |
631 | + |
632 | +## Policy |
633 | + |
634 | +Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/opensource/security/cvd). |
635 | + |
636 | +<!-- END MICROSOFT SECURITY.MD BLOCK --> |
637 | diff --git a/azurelinuxagent/agent.py b/azurelinuxagent/agent.py |
638 | index 6e65084..8c30348 100644 |
639 | --- a/azurelinuxagent/agent.py |
640 | +++ b/azurelinuxagent/agent.py |
641 | @@ -24,21 +24,47 @@ Module agent |
642 | from __future__ import print_function |
643 | |
644 | import os |
645 | -import sys |
646 | import re |
647 | import subprocess |
648 | +import sys |
649 | import threading |
650 | -import traceback |
651 | +from azurelinuxagent.common import cgroupconfigurator, logcollector |
652 | +from azurelinuxagent.common.cgroupapi import SystemdCgroupsApi |
653 | |
654 | -import azurelinuxagent.common.logger as logger |
655 | -import azurelinuxagent.common.event as event |
656 | import azurelinuxagent.common.conf as conf |
657 | -from azurelinuxagent.common.version import AGENT_NAME, AGENT_LONG_VERSION, \ |
658 | - DISTRO_NAME, DISTRO_VERSION, \ |
659 | - PY_VERSION_MAJOR, PY_VERSION_MINOR, \ |
660 | - PY_VERSION_MICRO, GOAL_STATE_AGENT_VERSION |
661 | +import azurelinuxagent.common.event as event |
662 | +import azurelinuxagent.common.logger as logger |
663 | +from azurelinuxagent.common.future import ustr |
664 | +from azurelinuxagent.common.logcollector import LogCollector, OUTPUT_RESULTS_FILE_PATH |
665 | from azurelinuxagent.common.osutil import get_osutil |
666 | -from azurelinuxagent.common.utils import fileutil |
667 | +from azurelinuxagent.common.utils import fileutil, textutil |
668 | +from azurelinuxagent.common.utils.flexible_version import FlexibleVersion |
669 | +from azurelinuxagent.common.utils.networkutil import AddFirewallRules |
670 | +from azurelinuxagent.common.version import AGENT_NAME, AGENT_LONG_VERSION, AGENT_VERSION, \ |
671 | + DISTRO_NAME, DISTRO_VERSION, \ |
672 | + PY_VERSION_MAJOR, PY_VERSION_MINOR, \ |
673 | + PY_VERSION_MICRO, GOAL_STATE_AGENT_VERSION, \ |
674 | + get_daemon_version, set_daemon_version |
675 | +from azurelinuxagent.ga.collect_logs import CollectLogsHandler, get_log_collector_monitor_handler |
676 | +from azurelinuxagent.pa.provision.default import ProvisionHandler |
677 | + |
678 | + |
679 | +class AgentCommands(object): |
680 | + """ |
681 | + This is the list of all commands that the Linux Guest Agent supports |
682 | + """ |
683 | + DeprovisionUser = "deprovision+user" |
684 | + Deprovision = "deprovision" |
685 | + Daemon = "daemon" |
686 | + Start = "start" |
687 | + RegisterService = "register-service" |
688 | + RunExthandlers = "run-exthandlers" |
689 | + Version = "version" |
690 | + ShowConfig = "show-configuration" |
691 | + Help = "help" |
692 | + CollectLogs = "collect-logs" |
693 | + SetupFirewall = "setup-firewall" |
694 | + Provision = "provision" |
695 | |
696 | |
697 | class Agent(object): |
698 | @@ -49,24 +75,24 @@ class Agent(object): |
699 | self.conf_file_path = conf_file_path |
700 | self.osutil = get_osutil() |
701 | |
702 | - #Init stdout log |
703 | + # Init stdout log |
704 | level = logger.LogLevel.VERBOSE if verbose else logger.LogLevel.INFO |
705 | logger.add_logger_appender(logger.AppenderType.STDOUT, level) |
706 | |
707 | - #Init config |
708 | + # Init config |
709 | conf_file_path = self.conf_file_path \ |
710 | if self.conf_file_path is not None \ |
711 | else self.osutil.get_agent_conf_file_path() |
712 | conf.load_conf_from_file(conf_file_path) |
713 | |
714 | - #Init log |
715 | + # Init log |
716 | verbose = verbose or conf.get_logs_verbose() |
717 | level = logger.LogLevel.VERBOSE if verbose else logger.LogLevel.INFO |
718 | - logger.add_logger_appender(logger.AppenderType.FILE, level, |
719 | - path="/var/log/waagent.log") |
720 | - if conf.get_logs_console(): |
721 | - logger.add_logger_appender(logger.AppenderType.CONSOLE, level, |
722 | - path="/dev/console") |
723 | + logger.add_logger_appender(logger.AppenderType.FILE, level, path=conf.get_agent_log_file()) |
724 | + |
725 | + # echo the log to /dev/console if the machine will be provisioned |
726 | + if conf.get_logs_console() and not ProvisionHandler.is_provisioned(): |
727 | + self.__add_console_appender(level) |
728 | |
729 | if event.send_logs_to_telemetry(): |
730 | logger.add_logger_appender(logger.AppenderType.TELEMETRY, |
731 | @@ -84,22 +110,30 @@ class Agent(object): |
732 | "Exception occurred while creating extension " |
733 | "log directory {0}: {1}".format(ext_log_dir, e)) |
734 | |
735 | - #Init event reporter |
736 | + # Init event reporter |
737 | + # Note that the reporter is not fully initialized here yet. Some telemetry fields are filled with data |
738 | + # originating from the goal state or IMDS, which requires a WireProtocol instance. Once a protocol |
739 | + # has been established, those fields must be explicitly initialized using |
740 | + # initialize_event_logger_vminfo_common_parameters(). Any events created before that initialization |
741 | + # will contain dummy values on those fields. |
742 | event.init_event_status(conf.get_lib_dir()) |
743 | - event_dir = os.path.join(conf.get_lib_dir(), "events") |
744 | + event_dir = os.path.join(conf.get_lib_dir(), event.EVENTS_DIRECTORY) |
745 | event.init_event_logger(event_dir) |
746 | event.enable_unhandled_err_dump("WALA") |
747 | |
748 | + def __add_console_appender(self, level): |
749 | + logger.add_logger_appender(logger.AppenderType.CONSOLE, level, path="/dev/console") |
750 | + |
751 | def daemon(self): |
752 | """ |
753 | Run agent daemon |
754 | """ |
755 | + set_daemon_version(AGENT_VERSION) |
756 | logger.set_prefix("Daemon") |
757 | threading.current_thread().setName("Daemon") |
758 | child_args = None \ |
759 | if self.conf_file_path is None \ |
760 | else "-configuration-path:{0}".format(self.conf_file_path) |
761 | - |
762 | from azurelinuxagent.daemon import get_daemon_handler |
763 | daemon_handler = get_daemon_handler() |
764 | daemon_handler.run(child_args=child_args) |
765 | @@ -137,6 +171,21 @@ class Agent(object): |
766 | """ |
767 | logger.set_prefix("ExtHandler") |
768 | threading.current_thread().setName("ExtHandler") |
769 | + |
770 | + # |
771 | + # Agents < 2.2.53 used to echo the log to the console. Since the extension handler could have been started by |
772 | + # one of those daemons, output a message indicating that output to the console will stop, otherwise users |
773 | + # may think that the agent died if they noticed that output to the console stops abruptly. |
774 | + # |
775 | + # Feel free to remove this code if telemetry shows there are no more agents <= 2.2.53 in the field. |
776 | + # |
777 | + if conf.get_logs_console() and get_daemon_version() < FlexibleVersion("2.2.53"): |
778 | + self.__add_console_appender(logger.LogLevel.INFO) |
779 | + try: |
780 | + logger.info(u"The agent will now check for updates and then will process extensions. Output to /dev/console will be suspended during those operations.") |
781 | + finally: |
782 | + logger.disable_console_output() |
783 | + |
784 | from azurelinuxagent.ga.update import get_update_handler |
785 | update_handler = get_update_handler() |
786 | update_handler.run(debug) |
787 | @@ -146,91 +195,175 @@ class Agent(object): |
788 | for k in sorted(configuration.keys()): |
789 | print("{0} = {1}".format(k, configuration[k])) |
790 | |
791 | + def collect_logs(self, is_full_mode): |
792 | + logger.set_prefix("LogCollector") |
793 | + |
794 | + if is_full_mode: |
795 | + logger.info("Running log collector mode full") |
796 | + else: |
797 | + logger.info("Running log collector mode normal") |
798 | + |
799 | + # Check the cgroups unit |
800 | + cpu_cgroup_path, memory_cgroup_path, log_collector_monitor = None, None, None |
801 | + if CollectLogsHandler.should_validate_cgroups(): |
802 | + cgroups_api = SystemdCgroupsApi() |
803 | + cpu_cgroup_path, memory_cgroup_path = cgroups_api.get_process_cgroup_paths("self") |
804 | |
805 | -def main(args=[]): |
806 | + cpu_slice_matches = (cgroupconfigurator.LOGCOLLECTOR_SLICE in cpu_cgroup_path) |
807 | + memory_slice_matches = (cgroupconfigurator.LOGCOLLECTOR_SLICE in memory_cgroup_path) |
808 | + |
809 | + if not cpu_slice_matches or not memory_slice_matches: |
810 | + logger.info("The Log Collector process is not in the proper cgroups:") |
811 | + if not cpu_slice_matches: |
812 | + logger.info("\tunexpected cpu slice") |
813 | + if not memory_slice_matches: |
814 | + logger.info("\tunexpected memory slice") |
815 | + |
816 | + sys.exit(logcollector.INVALID_CGROUPS_ERRCODE) |
817 | + |
818 | + try: |
819 | + log_collector = LogCollector(is_full_mode, cpu_cgroup_path, memory_cgroup_path) |
820 | + log_collector_monitor = get_log_collector_monitor_handler(log_collector.cgroups) |
821 | + log_collector_monitor.run() |
822 | + archive = log_collector.collect_logs_and_get_archive() |
823 | + logger.info("Log collection successfully completed. Archive can be found at {0} " |
824 | + "and detailed log output can be found at {1}".format(archive, OUTPUT_RESULTS_FILE_PATH)) |
825 | + except Exception as e: |
826 | + logger.error("Log collection completed unsuccessfully. Error: {0}".format(ustr(e))) |
827 | + logger.info("Detailed log output can be found at {0}".format(OUTPUT_RESULTS_FILE_PATH)) |
828 | + sys.exit(1) |
829 | + finally: |
830 | + if log_collector_monitor is not None: |
831 | + log_collector_monitor.stop() |
832 | + |
833 | + @staticmethod |
834 | + def setup_firewall(firewall_metadata): |
835 | + |
836 | + print("Setting up firewall for the WALinux Agent with args: {0}".format(firewall_metadata)) |
837 | + try: |
838 | + AddFirewallRules.add_iptables_rules(firewall_metadata['wait'], firewall_metadata['dst_ip'], |
839 | + firewall_metadata['uid']) |
840 | + print("Successfully set the firewall rules") |
841 | + except Exception as error: |
842 | + print("Unable to add firewall rules. Error: {0}".format(ustr(error))) |
843 | + sys.exit(1) |
844 | + |
845 | + |
846 | +def main(args=None): |
847 | """ |
848 | Parse command line arguments, exit with usage() on error. |
849 | Invoke different methods according to different command |
850 | """ |
851 | + if args is None: |
852 | + args = [] |
853 | if len(args) <= 0: |
854 | args = sys.argv[1:] |
855 | - command, force, verbose, debug, conf_file_path = parse_args(args) |
856 | - if command == "version": |
857 | + command, force, verbose, debug, conf_file_path, log_collector_full_mode, firewall_metadata = parse_args(args) |
858 | + if command == AgentCommands.Version: |
859 | version() |
860 | - elif command == "help": |
861 | + elif command == AgentCommands.Help: |
862 | print(usage()) |
863 | - elif command == "start": |
864 | + elif command == AgentCommands.Start: |
865 | start(conf_file_path=conf_file_path) |
866 | else: |
867 | try: |
868 | agent = Agent(verbose, conf_file_path=conf_file_path) |
869 | - if command == "deprovision+user": |
870 | + if command == AgentCommands.DeprovisionUser: |
871 | agent.deprovision(force, deluser=True) |
872 | - elif command == "deprovision": |
873 | + elif command == AgentCommands.Deprovision: |
874 | agent.deprovision(force, deluser=False) |
875 | - elif command == "provision": |
876 | + elif command == AgentCommands.Provision: |
877 | agent.provision() |
878 | - elif command == "register-service": |
879 | + elif command == AgentCommands.RegisterService: |
880 | agent.register_service() |
881 | - elif command == "daemon": |
882 | + elif command == AgentCommands.Daemon: |
883 | agent.daemon() |
884 | - elif command == "run-exthandlers": |
885 | + elif command == AgentCommands.RunExthandlers: |
886 | agent.run_exthandlers(debug) |
887 | - elif command == "show-configuration": |
888 | + elif command == AgentCommands.ShowConfig: |
889 | agent.show_configuration() |
890 | - except Exception: |
891 | + elif command == AgentCommands.CollectLogs: |
892 | + agent.collect_logs(log_collector_full_mode) |
893 | + elif command == AgentCommands.SetupFirewall: |
894 | + agent.setup_firewall(firewall_metadata) |
895 | + except Exception as e: |
896 | logger.error(u"Failed to run '{0}': {1}", |
897 | command, |
898 | - traceback.format_exc()) |
899 | + textutil.format_exception(e)) |
900 | + |
901 | |
902 | def parse_args(sys_args): |
903 | """ |
904 | Parse command line arguments |
905 | """ |
906 | - cmd = "help" |
907 | + cmd = AgentCommands.Help |
908 | force = False |
909 | verbose = False |
910 | debug = False |
911 | conf_file_path = None |
912 | - for a in sys_args: |
913 | - m = re.match("^(?:[-/]*)configuration-path:([\w/\.\-_]+)", a) |
914 | + log_collector_full_mode = False |
915 | + firewall_metadata = { |
916 | + "dst_ip": None, |
917 | + "uid": None, |
918 | + "wait": "" |
919 | + } |
920 | + |
921 | + regex_cmd_format = "^([-/]*){0}" |
922 | + |
923 | + for arg in sys_args: |
924 | + if arg == "": |
925 | + # Don't parse an empty parameter |
926 | + continue |
927 | + m = re.match("^(?:[-/]*)configuration-path:([\w/\.\-_]+)", arg) # pylint: disable=W1401 |
928 | if not m is None: |
929 | conf_file_path = m.group(1) |
930 | if not os.path.exists(conf_file_path): |
931 | print("Error: Configuration file {0} does not exist".format( |
932 | conf_file_path), file=sys.stderr) |
933 | - usage() |
934 | + print(usage()) |
935 | sys.exit(1) |
936 | - |
937 | - elif re.match("^([-/]*)deprovision\\+user", a): |
938 | - cmd = "deprovision+user" |
939 | - elif re.match("^([-/]*)deprovision", a): |
940 | - cmd = "deprovision" |
941 | - elif re.match("^([-/]*)daemon", a): |
942 | - cmd = "daemon" |
943 | - elif re.match("^([-/]*)start", a): |
944 | - cmd = "start" |
945 | - elif re.match("^([-/]*)register-service", a): |
946 | - cmd = "register-service" |
947 | - elif re.match("^([-/]*)run-exthandlers", a): |
948 | - cmd = "run-exthandlers" |
949 | - elif re.match("^([-/]*)version", a): |
950 | - cmd = "version" |
951 | - elif re.match("^([-/]*)verbose", a): |
952 | + elif re.match("^([-/]*)deprovision\\+user", arg): |
953 | + cmd = AgentCommands.DeprovisionUser |
954 | + elif re.match(regex_cmd_format.format(AgentCommands.Deprovision), arg): |
955 | + cmd = AgentCommands.Deprovision |
956 | + elif re.match(regex_cmd_format.format(AgentCommands.Daemon), arg): |
957 | + cmd = AgentCommands.Daemon |
958 | + elif re.match(regex_cmd_format.format(AgentCommands.Start), arg): |
959 | + cmd = AgentCommands.Start |
960 | + elif re.match(regex_cmd_format.format(AgentCommands.RegisterService), arg): |
961 | + cmd = AgentCommands.RegisterService |
962 | + elif re.match(regex_cmd_format.format(AgentCommands.RunExthandlers), arg): |
963 | + cmd = AgentCommands.RunExthandlers |
964 | + elif re.match(regex_cmd_format.format(AgentCommands.Version), arg): |
965 | + cmd = AgentCommands.Version |
966 | + elif re.match(regex_cmd_format.format("verbose"), arg): |
967 | verbose = True |
968 | - elif re.match("^([-/]*)debug", a): |
969 | + elif re.match(regex_cmd_format.format("debug"), arg): |
970 | debug = True |
971 | - elif re.match("^([-/]*)force", a): |
972 | + elif re.match(regex_cmd_format.format("force"), arg): |
973 | force = True |
974 | - elif re.match("^([-/]*)show-configuration", a): |
975 | - cmd = "show-configuration" |
976 | - elif re.match("^([-/]*)(help|usage|\\?)", a): |
977 | - cmd = "help" |
978 | + elif re.match(regex_cmd_format.format(AgentCommands.ShowConfig), arg): |
979 | + cmd = AgentCommands.ShowConfig |
980 | + elif re.match("^([-/]*)(help|usage|\\?)", arg): |
981 | + cmd = AgentCommands.Help |
982 | + elif re.match(regex_cmd_format.format(AgentCommands.CollectLogs), arg): |
983 | + cmd = AgentCommands.CollectLogs |
984 | + elif re.match(regex_cmd_format.format("full"), arg): |
985 | + log_collector_full_mode = True |
986 | + elif re.match(regex_cmd_format.format(AgentCommands.SetupFirewall), arg): |
987 | + cmd = AgentCommands.SetupFirewall |
988 | + elif re.match(regex_cmd_format.format("dst_ip=(?P<dst_ip>[\\d.]{7,})"), arg): |
989 | + firewall_metadata['dst_ip'] = re.match(regex_cmd_format.format("dst_ip=(?P<dst_ip>[\\d.]{7,})"), arg).group( |
990 | + 'dst_ip') |
991 | + elif re.match(regex_cmd_format.format("uid=(?P<uid>[\\d]+)"), arg): |
992 | + firewall_metadata['uid'] = re.match(regex_cmd_format.format("uid=(?P<uid>[\\d]+)"), arg).group('uid') |
993 | + elif re.match(regex_cmd_format.format("(w|wait)$"), arg): |
994 | + firewall_metadata['wait'] = "-w" |
995 | else: |
996 | - cmd = "help" |
997 | + cmd = AgentCommands.Help |
998 | break |
999 | |
1000 | - return cmd, force, verbose, debug, conf_file_path |
1001 | + return cmd, force, verbose, debug, conf_file_path, log_collector_full_mode, firewall_metadata |
1002 | |
1003 | |
1004 | def version(): |
1005 | @@ -245,29 +378,33 @@ def version(): |
1006 | PY_VERSION_MICRO)) |
1007 | print("Goal state agent: {0}".format(GOAL_STATE_AGENT_VERSION)) |
1008 | |
1009 | + |
1010 | def usage(): |
1011 | """ |
1012 | Return agent usage message |
1013 | """ |
1014 | s = "\n" |
1015 | s += ("usage: {0} [-verbose] [-force] [-help] " |
1016 | - "-configuration-path:<path to configuration file>" |
1017 | + "-configuration-path:<path to configuration file>" |
1018 | "-deprovision[+user]|-register-service|-version|-daemon|-start|" |
1019 | - "-run-exthandlers|-show-configuration]" |
1020 | + "-run-exthandlers|-show-configuration|-collect-logs [-full]|-setup-firewall [-dst_ip=<IP> -uid=<UID> [-w/--wait]]" |
1021 | "").format(sys.argv[0]) |
1022 | s += "\n" |
1023 | return s |
1024 | |
1025 | + |
1026 | def start(conf_file_path=None): |
1027 | """ |
1028 | Start agent daemon in a background process and set stdout/stderr to |
1029 | /dev/null |
1030 | """ |
1031 | - devnull = open(os.devnull, 'w') |
1032 | args = [sys.argv[0], '-daemon'] |
1033 | if conf_file_path is not None: |
1034 | args.append('-configuration-path:{0}'.format(conf_file_path)) |
1035 | - subprocess.Popen(args, stdout=devnull, stderr=devnull) |
1036 | + |
1037 | + with open(os.devnull, 'w') as devnull: |
1038 | + subprocess.Popen(args, stdout=devnull, stderr=devnull) |
1039 | + |
1040 | |
1041 | if __name__ == '__main__' : |
1042 | main() |
1043 | diff --git a/azurelinuxagent/common/AgentGlobals.py b/azurelinuxagent/common/AgentGlobals.py |
1044 | new file mode 100644 |
1045 | index 0000000..dbfda92 |
1046 | --- /dev/null |
1047 | +++ b/azurelinuxagent/common/AgentGlobals.py |
1048 | @@ -0,0 +1,39 @@ |
1049 | +# Microsoft Azure Linux Agent |
1050 | +# |
1051 | +# Copyright 2020 Microsoft Corporation |
1052 | +# |
1053 | +# Licensed under the Apache License, Version 2.0 (the "License"); |
1054 | +# you may not use this file except in compliance with the License. |
1055 | +# You may obtain a copy of the License at |
1056 | +# |
1057 | +# http://www.apache.org/licenses/LICENSE-2.0 |
1058 | +# |
1059 | +# Unless required by applicable law or agreed to in writing, software |
1060 | +# distributed under the License is distributed on an "AS IS" BASIS, |
1061 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
1062 | +# See the License for the specific language governing permissions and |
1063 | +# limitations under the License. |
1064 | +# |
1065 | +# Requires Python 2.6+ and Openssl 1.0+ |
1066 | + |
1067 | + |
1068 | +class AgentGlobals(object): |
1069 | + """ |
1070 | + This class is used for setting AgentGlobals which can be used all throughout the Agent. |
1071 | + """ |
1072 | + |
1073 | + GUID_ZERO = "00000000-0000-0000-0000-000000000000" |
1074 | + |
1075 | + # |
1076 | + # Some modules (e.g. telemetry) require an up-to-date container ID. We update this variable each time we |
1077 | + # fetch the goal state. |
1078 | + # |
1079 | + _container_id = GUID_ZERO |
1080 | + |
1081 | + @staticmethod |
1082 | + def get_container_id(): |
1083 | + return AgentGlobals._container_id |
1084 | + |
1085 | + @staticmethod |
1086 | + def update_container_id(container_id): |
1087 | + AgentGlobals._container_id = container_id |
1088 | diff --git a/azurelinuxagent/common/agent_supported_feature.py b/azurelinuxagent/common/agent_supported_feature.py |
1089 | new file mode 100644 |
1090 | index 0000000..d7f93e2 |
1091 | --- /dev/null |
1092 | +++ b/azurelinuxagent/common/agent_supported_feature.py |
1093 | @@ -0,0 +1,122 @@ |
1094 | +# Copyright 2018 Microsoft Corporation |
1095 | +# |
1096 | +# Licensed under the Apache License, Version 2.0 (the "License"); |
1097 | +# you may not use this file except in compliance with the License. |
1098 | +# You may obtain a copy of the License at |
1099 | +# |
1100 | +# http://www.apache.org/licenses/LICENSE-2.0 |
1101 | +# |
1102 | +# Unless required by applicable law or agreed to in writing, software |
1103 | +# distributed under the License is distributed on an "AS IS" BASIS, |
1104 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
1105 | +# See the License for the specific language governing permissions and |
1106 | +# limitations under the License. |
1107 | +# |
1108 | +# Requires Python 2.6+ and Openssl 1.0+ |
1109 | +# |
1110 | + |
1111 | + |
1112 | +class SupportedFeatureNames(object): |
1113 | + """ |
1114 | + Enum for defining the Feature Names for all features that we the agent supports |
1115 | + """ |
1116 | + MultiConfig = "MultipleExtensionsPerHandler" |
1117 | + ExtensionTelemetryPipeline = "ExtensionTelemetryPipeline" |
1118 | + FastTrack = "FastTrack" |
1119 | + |
1120 | + |
1121 | +class AgentSupportedFeature(object): |
1122 | + """ |
1123 | + Interface for defining all features that the Linux Guest Agent supports and reports their if supported back to CRP |
1124 | + """ |
1125 | + |
1126 | + def __init__(self, name, version="1.0", supported=False): |
1127 | + self.__name = name |
1128 | + self.__version = version |
1129 | + self.__supported = supported |
1130 | + |
1131 | + @property |
1132 | + def name(self): |
1133 | + return self.__name |
1134 | + |
1135 | + @property |
1136 | + def version(self): |
1137 | + return self.__version |
1138 | + |
1139 | + @property |
1140 | + def is_supported(self): |
1141 | + return self.__supported |
1142 | + |
1143 | + |
1144 | +class _MultiConfigFeature(AgentSupportedFeature): |
1145 | + |
1146 | + __NAME = SupportedFeatureNames.MultiConfig |
1147 | + __VERSION = "1.0" |
1148 | + __SUPPORTED = True |
1149 | + |
1150 | + def __init__(self): |
1151 | + super(_MultiConfigFeature, self).__init__(name=_MultiConfigFeature.__NAME, |
1152 | + version=_MultiConfigFeature.__VERSION, |
1153 | + supported=_MultiConfigFeature.__SUPPORTED) |
1154 | + |
1155 | + |
1156 | +class _ETPFeature(AgentSupportedFeature): |
1157 | + |
1158 | + __NAME = SupportedFeatureNames.ExtensionTelemetryPipeline |
1159 | + __VERSION = "1.0" |
1160 | + __SUPPORTED = True |
1161 | + |
1162 | + def __init__(self): |
1163 | + super(_ETPFeature, self).__init__(name=self.__NAME, |
1164 | + version=self.__VERSION, |
1165 | + supported=self.__SUPPORTED) |
1166 | + |
1167 | + |
1168 | +# This is the list of features that Agent supports and we advertise to CRP |
1169 | +__CRP_ADVERTISED_FEATURES = { |
1170 | + SupportedFeatureNames.MultiConfig: _MultiConfigFeature() |
1171 | +} |
1172 | + |
1173 | + |
1174 | +# This is the list of features that Agent supports and we advertise to Extensions |
1175 | +__EXTENSION_ADVERTISED_FEATURES = { |
1176 | + SupportedFeatureNames.ExtensionTelemetryPipeline: _ETPFeature() |
1177 | +} |
1178 | + |
1179 | + |
1180 | +def get_supported_feature_by_name(feature_name): |
1181 | + if feature_name in __CRP_ADVERTISED_FEATURES: |
1182 | + return __CRP_ADVERTISED_FEATURES[feature_name] |
1183 | + |
1184 | + if feature_name in __EXTENSION_ADVERTISED_FEATURES: |
1185 | + return __EXTENSION_ADVERTISED_FEATURES[feature_name] |
1186 | + |
1187 | + raise NotImplementedError("Feature with Name: {0} not found".format(feature_name)) |
1188 | + |
1189 | + |
1190 | +def get_agent_supported_features_list_for_crp(): |
1191 | + """ |
1192 | + List of features that the GuestAgent currently supports (like FastTrack, MultiConfig, etc). |
1193 | + We need to send this list as part of Status reporting to inform CRP of all the features the agent supports. |
1194 | + :return: Dict containing all CRP supported features with the key as their names and the AgentFeature object as |
1195 | + the value if they are supported by the Agent |
1196 | + Eg: { |
1197 | + MultipleExtensionsPerHandler: _MultiConfigFeature() |
1198 | + } |
1199 | + """ |
1200 | + |
1201 | + return dict((name, feature) for name, feature in __CRP_ADVERTISED_FEATURES.items() if feature.is_supported) |
1202 | + |
1203 | + |
1204 | +def get_agent_supported_features_list_for_extensions(): |
1205 | + """ |
1206 | + List of features that the GuestAgent currently supports (like Extension Telemetry Pipeline, etc) needed by Extensions. |
1207 | + We need to send this list as environment variables when calling extension commands to inform Extensions of all the |
1208 | + features the agent supports. |
1209 | + :return: Dict containing all Extension supported features with the key as their names and the AgentFeature object as |
1210 | + the value if the feature is supported by the Agent. |
1211 | + Eg: { |
1212 | + CRPSupportedFeatureNames.ExtensionTelemetryPipeline: _ETPFeature() |
1213 | + } |
1214 | + """ |
1215 | + return dict((name, feature) for name, feature in __EXTENSION_ADVERTISED_FEATURES.items() if feature.is_supported) |
1216 | diff --git a/azurelinuxagent/common/cgroup.py b/azurelinuxagent/common/cgroup.py |
1217 | index 2ad70c1..b2bf32f 100644 |
1218 | --- a/azurelinuxagent/common/cgroup.py |
1219 | +++ b/azurelinuxagent/common/cgroup.py |
1220 | @@ -13,47 +13,94 @@ |
1221 | # limitations under the License. |
1222 | # |
1223 | # Requires Python 2.6+ and Openssl 1.0+ |
1224 | + |
1225 | import errno |
1226 | import os |
1227 | import re |
1228 | +from datetime import timedelta |
1229 | |
1230 | -from azurelinuxagent.common import logger |
1231 | +from azurelinuxagent.common import logger, conf |
1232 | from azurelinuxagent.common.exception import CGroupsException |
1233 | from azurelinuxagent.common.future import ustr |
1234 | from azurelinuxagent.common.osutil import get_osutil |
1235 | from azurelinuxagent.common.utils import fileutil |
1236 | |
1237 | -re_user_system_times = re.compile(r'user (\d+)\nsystem (\d+)\n') |
1238 | +_REPORT_EVERY_HOUR = timedelta(hours=1) |
1239 | +_DEFAULT_REPORT_PERIOD = timedelta(seconds=conf.get_cgroup_check_period()) |
1240 | |
1241 | +AGENT_NAME_TELEMETRY = "walinuxagent.service" # Name used for telemetry; it needs to be consistent even if the name of the service changes |
1242 | +AGENT_LOG_COLLECTOR = "azure-walinuxagent-logcollector" |
1243 | |
1244 | -class CGroupContollers(object): |
1245 | - CPU = "cpu" |
1246 | - MEMORY = "memory" |
1247 | |
1248 | +class CounterNotFound(Exception): |
1249 | + pass |
1250 | |
1251 | -class CGroup(object): |
1252 | - @staticmethod |
1253 | - def create(cgroup_path, controller, extension_name): |
1254 | - """ |
1255 | - Factory method to create the correct CGroup. |
1256 | - """ |
1257 | - if controller == CGroupContollers.CPU: |
1258 | - return CpuCgroup(extension_name, cgroup_path) |
1259 | - if controller == CGroupContollers.MEMORY: |
1260 | - return MemoryCgroup(extension_name, cgroup_path) |
1261 | - raise CGroupsException('CGroup controller {0} is not supported'.format(controller)) |
1262 | |
1263 | - def __init__(self, name, cgroup_path, controller_type): |
1264 | +class MetricValue(object): |
1265 | + |
1266 | + """ |
1267 | + Class for defining all the required metric fields to send telemetry. |
1268 | + """ |
1269 | + |
1270 | + def __init__(self, category, counter, instance, value, report_period=_DEFAULT_REPORT_PERIOD): |
1271 | + self._category = category |
1272 | + self._counter = counter |
1273 | + self._instance = instance |
1274 | + self._value = value |
1275 | + self._report_period = report_period |
1276 | + |
1277 | + @property |
1278 | + def category(self): |
1279 | + return self._category |
1280 | + |
1281 | + @property |
1282 | + def counter(self): |
1283 | + return self._counter |
1284 | + |
1285 | + @property |
1286 | + def instance(self): |
1287 | + return self._instance |
1288 | + |
1289 | + @property |
1290 | + def value(self): |
1291 | + return self._value |
1292 | + |
1293 | + @property |
1294 | + def report_period(self): |
1295 | + return self._report_period |
1296 | + |
1297 | + |
1298 | +class MetricsCategory(object): |
1299 | + MEMORY_CATEGORY = "Memory" |
1300 | + CPU_CATEGORY = "CPU" |
1301 | + |
1302 | + |
1303 | +class MetricsCounter(object): |
1304 | + PROCESSOR_PERCENT_TIME = "% Processor Time" |
1305 | + TOTAL_MEM_USAGE = "Total Memory Usage" |
1306 | + MAX_MEM_USAGE = "Max Memory Usage" |
1307 | + THROTTLED_TIME = "Throttled Time" |
1308 | + SWAP_MEM_USAGE = "Swap Memory Usage" |
1309 | + AVAILABLE_MEM = "Available MBytes" |
1310 | + USED_MEM = "Used MBytes" |
1311 | + |
1312 | + |
1313 | +re_user_system_times = re.compile(r'user (\d+)\nsystem (\d+)\n') |
1314 | + |
1315 | + |
1316 | +class CGroup(object): |
1317 | + def __init__(self, name, cgroup_path): |
1318 | """ |
1319 | Initialize _data collection for the Memory controller |
1320 | :param: name: Name of the CGroup |
1321 | :param: cgroup_path: Path of the controller |
1322 | - :param: controller_type: |
1323 | :return: |
1324 | """ |
1325 | self.name = name |
1326 | self.path = cgroup_path |
1327 | - self.controller = controller_type |
1328 | + |
1329 | + def __str__(self): |
1330 | + return "{0} [{1}]".format(self.name, self.path) |
1331 | |
1332 | def _get_cgroup_file(self, file_name): |
1333 | return os.path.join(self.path, file_name) |
1334 | @@ -89,7 +136,7 @@ class CGroup(object): |
1335 | logger.error("File {0} is empty but should not be".format(parameter_filename)) |
1336 | raise CGroupsException("File {0} is empty but should not be".format(parameter_filename)) |
1337 | except Exception as e: |
1338 | - if isinstance(e, (IOError, OSError)) and e.errno == errno.ENOENT: |
1339 | + if isinstance(e, (IOError, OSError)) and e.errno == errno.ENOENT: # pylint: disable=E1101 |
1340 | raise e |
1341 | parameter_filename = self._get_cgroup_file(parameter_name) |
1342 | raise CGroupsException("Exception while attempting to read {0}".format(parameter_filename), e) |
1343 | @@ -114,42 +161,26 @@ class CGroup(object): |
1344 | ' Internal error: {1}'.format(self.path, ustr(e))) |
1345 | return False |
1346 | |
1347 | - def get_tracked_processes(self): |
1348 | + def get_tracked_metrics(self, **_): |
1349 | """ |
1350 | - :return: List of Str (Pids). Will return an empty string if we couldn't fetch any tracked processes. |
1351 | + Retrieves the current value of the metrics tracked for this cgroup and returns them as an array. |
1352 | + |
1353 | + Note: Agent won't track the metrics if the current cpu ticks less than previous value and returns empty array. |
1354 | """ |
1355 | - procs = [] |
1356 | - try: |
1357 | - procs = self._get_parameters("cgroup.procs") |
1358 | - except (IOError, OSError) as e: |
1359 | - if e.errno == errno.ENOENT: |
1360 | - # only suppressing file not found exceptions. |
1361 | - pass |
1362 | - else: |
1363 | - logger.periodic_warn(logger.EVERY_HALF_HOUR, |
1364 | - 'Could not get list of procs from "cgroup.procs" file in the cgroup: {0}.' |
1365 | - ' Internal error: {1}'.format(self.path, ustr(e))) |
1366 | - except CGroupsException as e: |
1367 | - logger.periodic_warn(logger.EVERY_HALF_HOUR, |
1368 | - 'Could not get list of tasks from "cgroup.procs" file in the cgroup: {0}.' |
1369 | - ' Internal error: {1}'.format(self.path, ustr(e))) |
1370 | - return procs |
1371 | + raise NotImplementedError() |
1372 | |
1373 | |
1374 | class CpuCgroup(CGroup): |
1375 | def __init__(self, name, cgroup_path): |
1376 | - super(CpuCgroup, self).__init__(name, cgroup_path, CGroupContollers.CPU) |
1377 | + super(CpuCgroup, self).__init__(name, cgroup_path) |
1378 | |
1379 | self._osutil = get_osutil() |
1380 | self._previous_cgroup_cpu = None |
1381 | self._previous_system_cpu = None |
1382 | self._current_cgroup_cpu = None |
1383 | self._current_system_cpu = None |
1384 | - |
1385 | - def __str__(self): |
1386 | - return "cgroup: Name: {0}, cgroup_path: {1}; Controller: {2}".format( |
1387 | - self.name, self.path, self.controller |
1388 | - ) |
1389 | + self._previous_throttled_time = None |
1390 | + self._current_throttled_time = None |
1391 | |
1392 | def _get_cpu_ticks(self, allow_no_such_file_or_directory_error=False): |
1393 | """ |
1394 | @@ -159,24 +190,54 @@ class CpuCgroup(CGroup): |
1395 | returns 0; this is useful when the function can be called before the cgroup has been created. |
1396 | """ |
1397 | try: |
1398 | - cpu_stat = self._get_file_contents('cpuacct.stat') |
1399 | + cpuacct_stat = self._get_file_contents('cpuacct.stat') |
1400 | except Exception as e: |
1401 | - if not isinstance(e, (IOError, OSError)) or e.errno != errno.ENOENT: |
1402 | + if not isinstance(e, (IOError, OSError)) or e.errno != errno.ENOENT: # pylint: disable=E1101 |
1403 | raise CGroupsException("Failed to read cpuacct.stat: {0}".format(ustr(e))) |
1404 | if not allow_no_such_file_or_directory_error: |
1405 | raise e |
1406 | - cpu_stat = None |
1407 | + cpuacct_stat = None |
1408 | |
1409 | cpu_ticks = 0 |
1410 | |
1411 | - if cpu_stat is not None: |
1412 | - match = re_user_system_times.match(cpu_stat) |
1413 | + if cpuacct_stat is not None: |
1414 | + # |
1415 | + # Sample file: |
1416 | + # # cat /sys/fs/cgroup/cpuacct/azure.slice/walinuxagent.service/cpuacct.stat |
1417 | + # user 10190 |
1418 | + # system 3160 |
1419 | + # |
1420 | + match = re_user_system_times.match(cpuacct_stat) |
1421 | if not match: |
1422 | - raise CGroupsException("The contents of {0} are invalid: {1}".format(self._get_cgroup_file('cpuacct.stat'), cpu_stat)) |
1423 | + raise CGroupsException( |
1424 | + "The contents of {0} are invalid: {1}".format(self._get_cgroup_file('cpuacct.stat'), cpuacct_stat)) |
1425 | cpu_ticks = int(match.groups()[0]) + int(match.groups()[1]) |
1426 | |
1427 | return cpu_ticks |
1428 | |
1429 | + def get_throttled_time(self): |
1430 | + try: |
1431 | + with open(os.path.join(self.path, 'cpu.stat')) as cpu_stat: |
1432 | + # |
1433 | + # Sample file: |
1434 | + # |
1435 | + # # cat /sys/fs/cgroup/cpuacct/azure.slice/walinuxagent.service/cpu.stat |
1436 | + # nr_periods 51660 |
1437 | + # nr_throttled 19461 |
1438 | + # throttled_time 1529590856339 |
1439 | + # |
1440 | + for line in cpu_stat: |
1441 | + match = re.match(r'throttled_time\s+(\d+)', line) |
1442 | + if match is not None: |
1443 | + return int(match.groups()[0]) |
1444 | + raise Exception("Cannot find throttled_time") |
1445 | + except (IOError, OSError) as e: |
1446 | + if e.errno == errno.ENOENT: |
1447 | + return 0 |
1448 | + raise CGroupsException("Failed to read cpu.stat: {0}".format(ustr(e))) |
1449 | + except Exception as e: |
1450 | + raise CGroupsException("Failed to read cpu.stat: {0}".format(ustr(e))) |
1451 | + |
1452 | def _cpu_usage_initialized(self): |
1453 | return self._current_cgroup_cpu is not None and self._current_system_cpu is not None |
1454 | |
1455 | @@ -188,13 +249,14 @@ class CpuCgroup(CGroup): |
1456 | raise CGroupsException("initialize_cpu_usage() should be invoked only once") |
1457 | self._current_cgroup_cpu = self._get_cpu_ticks(allow_no_such_file_or_directory_error=True) |
1458 | self._current_system_cpu = self._osutil.get_total_cpu_ticks_since_boot() |
1459 | + self._current_throttled_time = self.get_throttled_time() |
1460 | |
1461 | def get_cpu_usage(self): |
1462 | """ |
1463 | Computes the CPU used by the cgroup since the last call to this function. |
1464 | |
1465 | - The usage is measured as a percentage of utilization of all cores in the system. For example, |
1466 | - using 1 core at 100% on a 4-core system would be reported as 25%. |
1467 | + The usage is measured as a percentage of utilization of 1 core in the system. For example, |
1468 | + using 1 core all of the time on a 4-core system would be reported as 100%. |
1469 | |
1470 | NOTE: initialize_cpu_usage() must be invoked before calling get_cpu_usage() |
1471 | """ |
1472 | @@ -209,53 +271,122 @@ class CpuCgroup(CGroup): |
1473 | cgroup_delta = self._current_cgroup_cpu - self._previous_cgroup_cpu |
1474 | system_delta = max(1, self._current_system_cpu - self._previous_system_cpu) |
1475 | |
1476 | - return round(100.0 * float(cgroup_delta) / float(system_delta), 3) |
1477 | + return round(100.0 * self._osutil.get_processor_cores() * float(cgroup_delta) / float(system_delta), 3) |
1478 | + |
1479 | + def get_cpu_throttled_time(self, read_previous_throttled_time=True): |
1480 | + """ |
1481 | + Computes the throttled time (in seconds) since the last call to this function. |
1482 | + NOTE: initialize_cpu_usage() must be invoked before calling this function |
1483 | + Compute only current throttled time if read_previous_throttled_time set to False |
1484 | + """ |
1485 | + if not read_previous_throttled_time: |
1486 | + return float(self.get_throttled_time() / 1E9) |
1487 | + |
1488 | + if not self._cpu_usage_initialized(): |
1489 | + raise CGroupsException( |
1490 | + "initialize_cpu_usage() must be invoked before the first call to get_throttled_time()") |
1491 | + |
1492 | + self._previous_throttled_time = self._current_throttled_time |
1493 | + self._current_throttled_time = self.get_throttled_time() |
1494 | + |
1495 | + return float(self._current_throttled_time - self._previous_throttled_time) / 1E9 |
1496 | + |
1497 | + def get_tracked_metrics(self, **kwargs): |
1498 | + tracked = [] |
1499 | + cpu_usage = self.get_cpu_usage() |
1500 | + if cpu_usage >= float(0): |
1501 | + tracked.append( |
1502 | + MetricValue(MetricsCategory.CPU_CATEGORY, MetricsCounter.PROCESSOR_PERCENT_TIME, self.name, cpu_usage)) |
1503 | + |
1504 | + if 'track_throttled_time' in kwargs and kwargs['track_throttled_time']: |
1505 | + throttled_time = self.get_cpu_throttled_time() |
1506 | + if cpu_usage >= float(0) and throttled_time >= float(0): |
1507 | + tracked.append( |
1508 | + MetricValue(MetricsCategory.CPU_CATEGORY, MetricsCounter.THROTTLED_TIME, self.name, throttled_time)) |
1509 | + |
1510 | + return tracked |
1511 | |
1512 | |
1513 | class MemoryCgroup(CGroup): |
1514 | def __init__(self, name, cgroup_path): |
1515 | + super(MemoryCgroup, self).__init__(name, cgroup_path) |
1516 | + |
1517 | + self._counter_not_found_error_count = 0 |
1518 | + |
1519 | + def _get_memory_stat_counter(self, counter_name): |
1520 | + try: |
1521 | + with open(os.path.join(self.path, 'memory.stat')) as memory_stat: |
1522 | + # cat /sys/fs/cgroup/memory/azure.slice/memory.stat |
1523 | + # cache 67178496 |
1524 | + # rss 42340352 |
1525 | + # rss_huge 6291456 |
1526 | + # swap 0 |
1527 | + for line in memory_stat: |
1528 | + re_memory_counter = r'{0}\s+(\d+)'.format(counter_name) |
1529 | + match = re.match(re_memory_counter, line) |
1530 | + if match is not None: |
1531 | + return int(match.groups()[0]) |
1532 | + except (IOError, OSError) as e: |
1533 | + if e.errno == errno.ENOENT: |
1534 | + raise |
1535 | + raise CGroupsException("Failed to read memory.stat: {0}".format(ustr(e))) |
1536 | + except Exception as e: |
1537 | + raise CGroupsException("Failed to read memory.stat: {0}".format(ustr(e))) |
1538 | + |
1539 | + raise CounterNotFound("Cannot find counter: {0}".format(counter_name)) |
1540 | + |
1541 | + def get_memory_usage(self): |
1542 | """ |
1543 | - Initialize _data collection for the Memory controller |
1544 | + Collect RSS+CACHE from memory.stat cgroup. |
1545 | |
1546 | - :return: MemoryCgroup |
1547 | + :return: Memory usage in bytes |
1548 | + :rtype: int |
1549 | """ |
1550 | - super(MemoryCgroup, self).__init__(name, cgroup_path, CGroupContollers.MEMORY) |
1551 | |
1552 | - def __str__(self): |
1553 | - return "cgroup: Name: {0}, cgroup_path: {1}; Controller: {2}".format( |
1554 | - self.name, self.path, self.controller |
1555 | - ) |
1556 | + cache = self._get_memory_stat_counter("cache") |
1557 | + rss = self._get_memory_stat_counter("rss") |
1558 | + return cache + rss |
1559 | |
1560 | - def get_memory_usage(self): |
1561 | + def try_swap_memory_usage(self): |
1562 | """ |
1563 | - Collect memory.usage_in_bytes from the cgroup. |
1564 | + Collect SWAP from memory.stat cgroup. |
1565 | |
1566 | :return: Memory usage in bytes |
1567 | :rtype: int |
1568 | + Note: stat file is the only place to get the SWAP since other swap related file memory.memsw.usage_in_bytes is for total Memory+SWAP. |
1569 | """ |
1570 | - usage = None |
1571 | try: |
1572 | - usage = self._get_parameters('memory.usage_in_bytes', first_line_only=True) |
1573 | - except Exception as e: |
1574 | - if isinstance(e, (IOError, OSError)) and e.errno == errno.ENOENT: |
1575 | - raise |
1576 | - raise CGroupsException("Exception while attempting to read {0}".format("memory.usage_in_bytes"), e) |
1577 | - |
1578 | - return int(usage) |
1579 | + return self._get_memory_stat_counter("swap") |
1580 | + except CounterNotFound as e: |
1581 | + if self._counter_not_found_error_count < 1: |
1582 | + logger.periodic_info(logger.EVERY_HALF_HOUR, |
1583 | + '{0} from "memory.stat" file in the cgroup: {1}---[Note: This log for informational purpose only and can be ignored]'.format(ustr(e), self.path)) |
1584 | + self._counter_not_found_error_count += 1 |
1585 | + return 0 |
1586 | |
1587 | def get_max_memory_usage(self): |
1588 | """ |
1589 | - Collect memory.usage_in_bytes from the cgroup. |
1590 | + Collect memory.max_usage_in_bytes from the cgroup. |
1591 | |
1592 | :return: Memory usage in bytes |
1593 | :rtype: int |
1594 | """ |
1595 | - usage = None |
1596 | + usage = 0 |
1597 | try: |
1598 | - usage = self._get_parameters('memory.max_usage_in_bytes', first_line_only=True) |
1599 | + usage = int(self._get_parameters('memory.max_usage_in_bytes', first_line_only=True)) |
1600 | except Exception as e: |
1601 | - if isinstance(e, (IOError, OSError)) and e.errno == errno.ENOENT: |
1602 | + if isinstance(e, (IOError, OSError)) and e.errno == errno.ENOENT: # pylint: disable=E1101 |
1603 | raise |
1604 | - raise CGroupsException("Exception while attempting to read {0}".format("memory.usage_in_bytes"), e) |
1605 | - |
1606 | - return int(usage) |
1607 | + raise CGroupsException("Exception while attempting to read {0}".format("memory.max_usage_in_bytes"), e) |
1608 | + |
1609 | + return usage |
1610 | + |
1611 | + def get_tracked_metrics(self, **_): |
1612 | + return [ |
1613 | + MetricValue(MetricsCategory.MEMORY_CATEGORY, MetricsCounter.TOTAL_MEM_USAGE, self.name, |
1614 | + self.get_memory_usage()), |
1615 | + MetricValue(MetricsCategory.MEMORY_CATEGORY, MetricsCounter.MAX_MEM_USAGE, self.name, |
1616 | + self.get_max_memory_usage(), _REPORT_EVERY_HOUR), |
1617 | + MetricValue(MetricsCategory.MEMORY_CATEGORY, MetricsCounter.SWAP_MEM_USAGE, self.name, |
1618 | + self.try_swap_memory_usage(), _REPORT_EVERY_HOUR) |
1619 | + ] |
1620 | diff --git a/azurelinuxagent/common/cgroupapi.py b/azurelinuxagent/common/cgroupapi.py |
1621 | index c671a2e..ca0ef3b 100644 |
1622 | --- a/azurelinuxagent/common/cgroupapi.py |
1623 | +++ b/azurelinuxagent/common/cgroupapi.py |
1624 | @@ -1,3 +1,4 @@ |
1625 | +# -*- coding: utf-8 -*- |
1626 | # Copyright 2018 Microsoft Corporation |
1627 | # |
1628 | # Licensed under the Apache License, Version 2.0 (the "License"); |
1629 | @@ -14,102 +15,65 @@ |
1630 | # |
1631 | # Requires Python 2.6+ and Openssl 1.0+ |
1632 | |
1633 | -import errno |
1634 | import os |
1635 | +import re |
1636 | import shutil |
1637 | import subprocess |
1638 | +import threading |
1639 | import uuid |
1640 | |
1641 | from azurelinuxagent.common import logger |
1642 | -from azurelinuxagent.common.cgroup import CGroup |
1643 | +from azurelinuxagent.common.cgroup import CpuCgroup, MemoryCgroup |
1644 | from azurelinuxagent.common.cgroupstelemetry import CGroupsTelemetry |
1645 | from azurelinuxagent.common.conf import get_agent_pid_file_path |
1646 | -from azurelinuxagent.common.event import add_event, WALAEventOperation |
1647 | from azurelinuxagent.common.exception import CGroupsException, ExtensionErrorCodes, ExtensionError, \ |
1648 | ExtensionOperationError |
1649 | from azurelinuxagent.common.future import ustr |
1650 | +from azurelinuxagent.common.osutil import systemd |
1651 | from azurelinuxagent.common.utils import fileutil, shellutil |
1652 | -from azurelinuxagent.common.utils.extensionprocessutil import handle_process_completion, read_output |
1653 | -from azurelinuxagent.common.version import AGENT_NAME, CURRENT_VERSION |
1654 | +from azurelinuxagent.common.utils.extensionprocessutil import handle_process_completion, read_output, \ |
1655 | + TELEMETRY_MESSAGE_MAX_LEN |
1656 | +from azurelinuxagent.common.utils.flexible_version import FlexibleVersion |
1657 | +from azurelinuxagent.common.version import get_distro |
1658 | |
1659 | CGROUPS_FILE_SYSTEM_ROOT = '/sys/fs/cgroup' |
1660 | CGROUP_CONTROLLERS = ["cpu", "memory"] |
1661 | -VM_AGENT_CGROUP_NAME = "walinuxagent.service" |
1662 | -EXTENSIONS_ROOT_CGROUP_NAME = "walinuxagent.extensions" |
1663 | -UNIT_FILES_FILE_SYSTEM_PATH = "/etc/systemd/system" |
1664 | +EXTENSION_SLICE_PREFIX = "azure-vmextensions" |
1665 | |
1666 | |
1667 | -class CGroupsApi(object): |
1668 | +class SystemdRunError(CGroupsException): |
1669 | """ |
1670 | - Interface for the cgroups API |
1671 | + Raised when systemd-run fails |
1672 | """ |
1673 | - def create_agent_cgroups(self): |
1674 | - raise NotImplementedError() |
1675 | - |
1676 | - def create_extension_cgroups_root(self): |
1677 | - raise NotImplementedError() |
1678 | - |
1679 | - def create_extension_cgroups(self, extension_name): |
1680 | - raise NotImplementedError() |
1681 | - |
1682 | - def remove_extension_cgroups(self, extension_name): |
1683 | - raise NotImplementedError() |
1684 | |
1685 | - def get_extension_cgroups(self, extension_name): |
1686 | - raise NotImplementedError() |
1687 | + def __init__(self, msg=None): |
1688 | + super(SystemdRunError, self).__init__(msg) |
1689 | |
1690 | - def start_extension_command(self, extension_name, command, timeout, shell, cwd, env, stdout, stderr, error_code): |
1691 | - raise NotImplementedError() |
1692 | |
1693 | - def cleanup_legacy_cgroups(self): |
1694 | - raise NotImplementedError() |
1695 | +class CGroupsApi(object): |
1696 | + @staticmethod |
1697 | + def cgroups_supported(): |
1698 | + distro_info = get_distro() |
1699 | + distro_name = distro_info[0] |
1700 | + try: |
1701 | + distro_version = FlexibleVersion(distro_info[1]) |
1702 | + except ValueError: |
1703 | + return False |
1704 | + return distro_name.lower() == 'ubuntu' and distro_version.major >= 16 |
1705 | |
1706 | @staticmethod |
1707 | def track_cgroups(extension_cgroups): |
1708 | try: |
1709 | for cgroup in extension_cgroups: |
1710 | CGroupsTelemetry.track_cgroup(cgroup) |
1711 | - except Exception as e: |
1712 | + except Exception as exception: |
1713 | logger.warn("Cannot add cgroup '{0}' to tracking list; resource usage will not be tracked. " |
1714 | - "Error: {1}".format(cgroup.path, ustr(e))) |
1715 | + "Error: {1}".format(cgroup.path, ustr(exception))) |
1716 | |
1717 | @staticmethod |
1718 | - def _get_extension_cgroup_name(extension_name): |
1719 | - # Since '-' is used as a separator in systemd unit names, we replace it with '_' to prevent side-effects. |
1720 | - return extension_name.replace('-', '_') |
1721 | - |
1722 | - @staticmethod |
1723 | - def create(): |
1724 | - """ |
1725 | - Factory method to create the correct API for the current platform |
1726 | - """ |
1727 | - return SystemdCgroupsApi() if CGroupsApi._is_systemd() else FileSystemCgroupsApi() |
1728 | - |
1729 | - @staticmethod |
1730 | - def _is_systemd(): |
1731 | - """ |
1732 | - Determine if systemd is managing system services; the implementation follows the same strategy as, for example, |
1733 | - sd_booted() in libsystemd, or /usr/sbin/service |
1734 | - """ |
1735 | - return os.path.exists('/run/systemd/system/') |
1736 | - |
1737 | - @staticmethod |
1738 | - def _foreach_controller(operation, message): |
1739 | - """ |
1740 | - Executes the given operation on all controllers that need to be tracked; outputs 'message' if the controller |
1741 | - is not mounted or if an error occurs in the operation |
1742 | - :return: Returns a list of error messages or an empty list if no errors occurred |
1743 | - """ |
1744 | - mounted_controllers = os.listdir(CGROUPS_FILE_SYSTEM_ROOT) |
1745 | - |
1746 | - for controller in CGROUP_CONTROLLERS: |
1747 | - try: |
1748 | - if controller not in mounted_controllers: |
1749 | - logger.warn('Cgroup controller "{0}" is not mounted. {1}', controller, message) |
1750 | - else: |
1751 | - operation(controller) |
1752 | - except Exception as e: |
1753 | - logger.warn('Error in cgroup controller "{0}": {1}. {2}', controller, ustr(e), message) |
1754 | + def get_processes_in_cgroup(cgroup_path): |
1755 | + with open(os.path.join(cgroup_path, "cgroup.procs"), "r") as cgroup_procs: |
1756 | + return [int(pid) for pid in cgroup_procs.read().split()] |
1757 | |
1758 | @staticmethod |
1759 | def _foreach_legacy_cgroup(operation): |
1760 | @@ -138,429 +102,250 @@ class CGroupsApi(object): |
1761 | |
1762 | if os.path.exists(procs_file): |
1763 | procs_file_contents = fileutil.read_file(procs_file).strip() |
1764 | - daemon_pid = fileutil.read_file(get_agent_pid_file_path()).strip() |
1765 | + daemon_pid = CGroupsApi.get_daemon_pid() |
1766 | |
1767 | - if daemon_pid in procs_file_contents: |
1768 | + if ustr(daemon_pid) in procs_file_contents: |
1769 | operation(controller, daemon_pid) |
1770 | finally: |
1771 | for _, cgroup in legacy_cgroups: |
1772 | logger.info('Removing {0}', cgroup) |
1773 | shutil.rmtree(cgroup, ignore_errors=True) |
1774 | + return len(legacy_cgroups) |
1775 | |
1776 | - |
1777 | -class FileSystemCgroupsApi(CGroupsApi): |
1778 | - """ |
1779 | - Cgroups interface using the cgroups file system directly |
1780 | - """ |
1781 | @staticmethod |
1782 | - def _try_mkdir(path): |
1783 | - """ |
1784 | - Try to create a directory, recursively. If it already exists as such, do nothing. Raise the appropriate |
1785 | - exception should an error occur. |
1786 | - |
1787 | - :param path: str |
1788 | - """ |
1789 | - if not os.path.isdir(path): |
1790 | - try: |
1791 | - os.makedirs(path, 0o755) |
1792 | - except OSError as e: |
1793 | - if e.errno == errno.EEXIST: |
1794 | - if not os.path.isdir(path): |
1795 | - raise CGroupsException("Create directory for cgroup {0}: normal file already exists with that name".format(path)) |
1796 | - else: |
1797 | - pass # There was a race to create the directory, but it's there now, and that's fine |
1798 | - elif e.errno == errno.EACCES: |
1799 | - # This is unexpected, as the agent runs as root |
1800 | - raise CGroupsException("Create directory for cgroup {0}: permission denied".format(path)) |
1801 | - else: |
1802 | - raise |
1803 | - |
1804 | - @staticmethod |
1805 | - def _get_agent_cgroup_path(controller): |
1806 | - return os.path.join(CGROUPS_FILE_SYSTEM_ROOT, controller, VM_AGENT_CGROUP_NAME) |
1807 | - |
1808 | - @staticmethod |
1809 | - def _get_extension_cgroups_root_path(controller): |
1810 | - return os.path.join(CGROUPS_FILE_SYSTEM_ROOT, controller, EXTENSIONS_ROOT_CGROUP_NAME) |
1811 | - |
1812 | - def _get_extension_cgroup_path(self, controller, extension_name): |
1813 | - extensions_root = self._get_extension_cgroups_root_path(controller) |
1814 | - |
1815 | - if not os.path.exists(extensions_root): |
1816 | - logger.warn("Root directory {0} does not exist.".format(extensions_root)) |
1817 | - |
1818 | - cgroup_name = self._get_extension_cgroup_name(extension_name) |
1819 | + def get_daemon_pid(): |
1820 | + return int(fileutil.read_file(get_agent_pid_file_path()).strip()) |
1821 | |
1822 | - return os.path.join(extensions_root, cgroup_name) |
1823 | |
1824 | - def _create_extension_cgroup(self, controller, extension_name): |
1825 | - return CGroup.create(self._get_extension_cgroup_path(controller, extension_name), controller, extension_name) |
1826 | +class SystemdCgroupsApi(CGroupsApi): |
1827 | + """ |
1828 | + Cgroups interface via systemd |
1829 | + """ |
1830 | |
1831 | - @staticmethod |
1832 | - def _add_process_to_cgroup(pid, cgroup_path): |
1833 | - tasks_file = os.path.join(cgroup_path, 'cgroup.procs') |
1834 | - fileutil.append_file(tasks_file, "{0}\n".format(pid)) |
1835 | - logger.info("Added PID {0} to cgroup {1}".format(pid, cgroup_path)) |
1836 | + def __init__(self): |
1837 | + self._cgroup_mountpoints = None |
1838 | + self._agent_unit_name = None |
1839 | + self._systemd_run_commands = [] |
1840 | + self._systemd_run_commands_lock = threading.RLock() |
1841 | |
1842 | - def cleanup_legacy_cgroups(self): |
1843 | + def get_systemd_run_commands(self): |
1844 | """ |
1845 | - Previous versions of the daemon (2.2.31-2.2.40) wrote their PID to /sys/fs/cgroup/{cpu,memory}/WALinuxAgent/WALinuxAgent; |
1846 | - starting from version 2.2.41 we track the agent service in walinuxagent.service instead of WALinuxAgent/WALinuxAgent. This |
1847 | - method moves the daemon's PID from the legacy cgroups to the newer cgroups. |
1848 | + Returns a list of the systemd-run commands currently running (given as PIDs) |
1849 | """ |
1850 | - def move_daemon_pid(controller, daemon_pid): |
1851 | - new_path = FileSystemCgroupsApi._get_agent_cgroup_path(controller) |
1852 | - logger.info("Writing daemon's PID ({0}) to {1}", daemon_pid, new_path) |
1853 | - fileutil.append_file(os.path.join(new_path, "cgroup.procs"), daemon_pid) |
1854 | - msg = "Moved daemon's PID from legacy cgroup to {0}".format(new_path) |
1855 | - add_event(AGENT_NAME, version=CURRENT_VERSION, op=WALAEventOperation.CGroupsCleanUp, is_success=True, message=msg) |
1856 | + with self._systemd_run_commands_lock: |
1857 | + return self._systemd_run_commands[:] |
1858 | |
1859 | - CGroupsApi._foreach_legacy_cgroup(move_daemon_pid) |
1860 | - |
1861 | - def create_agent_cgroups(self): |
1862 | + def get_cgroup_mount_points(self): |
1863 | """ |
1864 | - Creates a cgroup for the VM Agent in each of the controllers we are tracking; returns the created cgroups. |
1865 | + Returns a tuple with the mount points for the cpu and memory controllers; the values can be None |
1866 | + if the corresponding controller is not mounted |
1867 | """ |
1868 | - cgroups = [] |
1869 | - |
1870 | - pid = int(os.getpid()) |
1871 | - |
1872 | - def create_cgroup(controller): |
1873 | - path = FileSystemCgroupsApi._get_agent_cgroup_path(controller) |
1874 | - |
1875 | - if not os.path.isdir(path): |
1876 | - FileSystemCgroupsApi._try_mkdir(path) |
1877 | - logger.info("Created cgroup {0}".format(path)) |
1878 | - |
1879 | - self._add_process_to_cgroup(pid, path) |
1880 | - |
1881 | - cgroups.append(CGroup.create(path, controller, VM_AGENT_CGROUP_NAME)) |
1882 | - |
1883 | - self._foreach_controller(create_cgroup, 'Failed to create a cgroup for the VM Agent; resource usage will not be tracked') |
1884 | - |
1885 | - if len(cgroups) == 0: |
1886 | - raise CGroupsException("Failed to create any cgroup for the VM Agent") |
1887 | + # the output of mount is similar to |
1888 | + # $ mount -t cgroup |
1889 | + # cgroup on /sys/fs/cgroup/systemd type cgroup (rw,nosuid,nodev,noexec,relatime,xattr,name=systemd) |
1890 | + # cgroup on /sys/fs/cgroup/cpu,cpuacct type cgroup (rw,nosuid,nodev,noexec,relatime,cpu,cpuacct) |
1891 | + # cgroup on /sys/fs/cgroup/memory type cgroup (rw,nosuid,nodev,noexec,relatime,memory) |
1892 | + # etc |
1893 | + # |
1894 | + if self._cgroup_mountpoints is None: |
1895 | + cpu = None |
1896 | + memory = None |
1897 | + for line in shellutil.run_command(['mount', '-t', 'cgroup']).splitlines(): |
1898 | + match = re.search(r'on\s+(?P<path>/\S+(memory|cpuacct))\s', line) |
1899 | + if match is not None: |
1900 | + path = match.group('path') |
1901 | + if 'cpuacct' in path: |
1902 | + cpu = path |
1903 | + else: |
1904 | + memory = path |
1905 | + self._cgroup_mountpoints = {'cpu': cpu, 'memory': memory} |
1906 | |
1907 | - return cgroups |
1908 | + return self._cgroup_mountpoints['cpu'], self._cgroup_mountpoints['memory'] |
1909 | |
1910 | - def create_extension_cgroups_root(self): |
1911 | + @staticmethod |
1912 | + def get_process_cgroup_relative_paths(process_id): |
1913 | """ |
1914 | - Creates the directory within the cgroups file system that will contain the cgroups for the extensions. |
1915 | + Returns a tuple with the path of the cpu and memory cgroups for the given process (relative to the mount point of the corresponding |
1916 | + controller). |
1917 | + The 'process_id' can be a numeric PID or the string "self" for the current process. |
1918 | + The values returned can be None if the process is not in a cgroup for that controller (e.g. the controller is not mounted). |
1919 | """ |
1920 | - def create_cgroup(controller): |
1921 | - path = self._get_extension_cgroups_root_path(controller) |
1922 | - |
1923 | - if not os.path.isdir(path): |
1924 | - FileSystemCgroupsApi._try_mkdir(path) |
1925 | - logger.info("Created {0}".format(path)) |
1926 | + # The contents of the file are similar to |
1927 | + # # cat /proc/1218/cgroup |
1928 | + # 10:memory:/system.slice/walinuxagent.service |
1929 | + # 3:cpu,cpuacct:/system.slice/walinuxagent.service |
1930 | + # etc |
1931 | + cpu_path = None |
1932 | + memory_path = None |
1933 | + for line in fileutil.read_file("/proc/{0}/cgroup".format(process_id)).splitlines(): |
1934 | + match = re.match(r'\d+:(?P<controller>(memory|.*cpuacct.*)):(?P<path>.+)', line) |
1935 | + if match is not None: |
1936 | + controller = match.group('controller') |
1937 | + path = match.group('path').lstrip('/') if match.group('path') != '/' else None |
1938 | + if controller == 'memory': |
1939 | + memory_path = path |
1940 | + else: |
1941 | + cpu_path = path |
1942 | |
1943 | - self._foreach_controller(create_cgroup, 'Failed to create a root cgroup for extensions') |
1944 | + return cpu_path, memory_path |
1945 | |
1946 | - def create_extension_cgroups(self, extension_name): |
1947 | + def get_process_cgroup_paths(self, process_id): |
1948 | """ |
1949 | - Creates a cgroup for the given extension in each of the controllers we are tracking; returns the created cgroups. |
1950 | + Returns a tuple with the path of the cpu and memory cgroups for the given process. The 'process_id' can be a numeric PID or the string "self" for the current process. |
1951 | + The values returned can be None if the process is not in a cgroup for that controller (e.g. the controller is not mounted). |
1952 | """ |
1953 | - cgroups = [] |
1954 | - |
1955 | - def create_cgroup(controller): |
1956 | - cgroup = self._create_extension_cgroup(controller, extension_name) |
1957 | + cpu_cgroup_relative_path, memory_cgroup_relative_path = self.get_process_cgroup_relative_paths(process_id) |
1958 | |
1959 | - if not os.path.isdir(cgroup.path): |
1960 | - FileSystemCgroupsApi._try_mkdir(cgroup.path) |
1961 | - logger.info("Created cgroup {0}".format(cgroup.path)) |
1962 | + cpu_mount_point, memory_mount_point = self.get_cgroup_mount_points() |
1963 | |
1964 | - cgroups.append(cgroup) |
1965 | + cpu_cgroup_path = os.path.join(cpu_mount_point, cpu_cgroup_relative_path) \ |
1966 | + if cpu_mount_point is not None and cpu_cgroup_relative_path is not None else None |
1967 | |
1968 | - self._foreach_controller(create_cgroup, 'Failed to create a cgroup for extension {0}'.format(extension_name)) |
1969 | + memory_cgroup_path = os.path.join(memory_mount_point, memory_cgroup_relative_path) \ |
1970 | + if memory_mount_point is not None and memory_cgroup_relative_path is not None else None |
1971 | |
1972 | - return cgroups |
1973 | + return cpu_cgroup_path, memory_cgroup_path |
1974 | |
1975 | - def remove_extension_cgroups(self, extension_name): |
1976 | + def get_unit_cgroup_paths(self, unit_name): |
1977 | """ |
1978 | - Deletes the cgroups for the given extension. |
1979 | + Returns a tuple with the path of the cpu and memory cgroups for the given unit. |
1980 | + The values returned can be None if the controller is not mounted. |
1981 | + Ex: ControlGroup=/azure.slice/walinuxagent.service |
1982 | + controlgroup_path[1:] = azure.slice/walinuxagent.service |
1983 | """ |
1984 | - def remove_cgroup(controller): |
1985 | - path = self._get_extension_cgroup_path(controller, extension_name) |
1986 | - |
1987 | - if os.path.exists(path): |
1988 | - try: |
1989 | - os.rmdir(path) |
1990 | - logger.info('Deleted cgroup "{0}".'.format(path)) |
1991 | - except OSError as exception: |
1992 | - if exception.errno == 16: # [Errno 16] Device or resource busy |
1993 | - logger.warn('CGroup "{0}" still has active tasks; will not remove it.'.format(path)) |
1994 | - |
1995 | - self._foreach_controller(remove_cgroup, 'Failed to delete cgroups for extension {0}'.format(extension_name)) |
1996 | - |
1997 | - def get_extension_cgroups(self, extension_name): |
1998 | - """ |
1999 | - Returns the cgroups for the given extension. |
2000 | - """ |
2001 | - |
2002 | - cgroups = [] |
2003 | + controlgroup_path = systemd.get_unit_property(unit_name, "ControlGroup") |
2004 | + cpu_mount_point, memory_mount_point = self.get_cgroup_mount_points() |
2005 | |
2006 | - def get_cgroup(controller): |
2007 | - cgroup = self._create_extension_cgroup(controller, extension_name) |
2008 | - cgroups.append(cgroup) |
2009 | + cpu_cgroup_path = os.path.join(cpu_mount_point, controlgroup_path[1:]) \ |
2010 | + if cpu_mount_point is not None else None |
2011 | |
2012 | - self._foreach_controller(get_cgroup, 'Failed to retrieve cgroups for extension {0}'.format(extension_name)) |
2013 | + memory_cgroup_path = os.path.join(memory_mount_point, controlgroup_path[1:]) \ |
2014 | + if memory_mount_point is not None else None |
2015 | |
2016 | - return cgroups |
2017 | + return cpu_cgroup_path, memory_cgroup_path |
2018 | |
2019 | - def start_extension_command(self, extension_name, command, timeout, shell, cwd, env, stdout, stderr, |
2020 | - error_code=ExtensionErrorCodes.PluginUnknownFailure): |
2021 | + @staticmethod |
2022 | + def get_cgroup2_controllers(): |
2023 | """ |
2024 | - Starts a command (install/enable/etc) for an extension and adds the command's PID to the extension's cgroup |
2025 | - :param extension_name: The extension executing the command |
2026 | - :param command: The command to invoke |
2027 | - :param timeout: Number of seconds to wait for command completion |
2028 | - :param cwd: The working directory for the command |
2029 | - :param env: The environment to pass to the command's process |
2030 | - :param stdout: File object to redirect stdout to |
2031 | - :param stderr: File object to redirect stderr to |
2032 | - :param error_code: Extension error code to raise in case of error |
2033 | + Returns a tuple with the mount point for the cgroups v2 controllers, and the currently mounted controllers; |
2034 | + either value can be None if cgroups v2 or its controllers are not mounted |
2035 | """ |
2036 | - try: |
2037 | - extension_cgroups = self.create_extension_cgroups(extension_name) |
2038 | - except Exception as exception: |
2039 | - extension_cgroups = [] |
2040 | - logger.warn("Failed to create cgroups for extension '{0}'; resource usage will not be tracked. " |
2041 | - "Error: {1}".format(extension_name, ustr(exception))) |
2042 | - |
2043 | - def pre_exec_function(): |
2044 | - os.setsid() |
2045 | - |
2046 | - try: |
2047 | - pid = os.getpid() |
2048 | - |
2049 | - for cgroup in extension_cgroups: |
2050 | - try: |
2051 | - self._add_process_to_cgroup(pid, cgroup.path) |
2052 | - except Exception as exception: |
2053 | - logger.warn("Failed to add PID {0} to the cgroups for extension '{1}'. " |
2054 | - "Resource usage will not be tracked. Error: {2}".format(pid, |
2055 | - extension_name, |
2056 | - ustr(exception))) |
2057 | - except Exception as e: |
2058 | - logger.warn("Failed to add extension {0} to its cgroup. Resource usage will not be tracked. " |
2059 | - "Error: {1}".format(extension_name, ustr(e))) |
2060 | - |
2061 | - process = subprocess.Popen(command, |
2062 | - shell=shell, |
2063 | - cwd=cwd, |
2064 | - env=env, |
2065 | - stdout=stdout, |
2066 | - stderr=stderr, |
2067 | - preexec_fn=pre_exec_function) |
2068 | - |
2069 | - self.track_cgroups(extension_cgroups) |
2070 | - process_output = handle_process_completion(process=process, |
2071 | - command=command, |
2072 | - timeout=timeout, |
2073 | - stdout=stdout, |
2074 | - stderr=stderr, |
2075 | - error_code=error_code) |
2076 | - |
2077 | - return extension_cgroups, process_output |
2078 | - |
2079 | - |
2080 | -class SystemdCgroupsApi(CGroupsApi): |
2081 | - """ |
2082 | - Cgroups interface via systemd |
2083 | - """ |
2084 | - |
2085 | - @staticmethod |
2086 | - def create_and_start_unit(unit_filename, unit_contents): |
2087 | - try: |
2088 | - unit_path = os.path.join(UNIT_FILES_FILE_SYSTEM_PATH, unit_filename) |
2089 | - fileutil.write_file(unit_path, unit_contents) |
2090 | - shellutil.run_command(["systemctl", "daemon-reload"]) |
2091 | - shellutil.run_command(["systemctl", "start", unit_filename]) |
2092 | - except Exception as e: |
2093 | - raise CGroupsException("Failed to create and start {0}. Error: {1}".format(unit_filename, ustr(e))) |
2094 | + # the output of mount is similar to |
2095 | + # $ mount -t cgroup2 |
2096 | + # cgroup2 on /sys/fs/cgroup/unified type cgroup2 (rw,nosuid,nodev,noexec,relatime,nsdelegate) |
2097 | + # |
2098 | + for line in shellutil.run_command(['mount', '-t', 'cgroup2']).splitlines(): |
2099 | + match = re.search(r'on\s+(?P<path>/\S+)\s', line) |
2100 | + if match is not None: |
2101 | + mount_point = match.group('path') |
2102 | + controllers = None |
2103 | + controllers_file = os.path.join(mount_point, 'cgroup.controllers') |
2104 | + if os.path.exists(controllers_file): |
2105 | + controllers = fileutil.read_file(controllers_file) |
2106 | + return mount_point, controllers |
2107 | + return None, None |
2108 | |
2109 | @staticmethod |
2110 | - def _get_extensions_slice_root_name(): |
2111 | - return "system-{0}.slice".format(EXTENSIONS_ROOT_CGROUP_NAME) |
2112 | - |
2113 | - def _get_extension_slice_name(self, extension_name): |
2114 | - return "system-{0}-{1}.slice".format(EXTENSIONS_ROOT_CGROUP_NAME, self._get_extension_cgroup_name(extension_name)) |
2115 | - |
2116 | - def create_agent_cgroups(self): |
2117 | - try: |
2118 | - cgroup_unit = None |
2119 | - cgroup_paths = fileutil.read_file("/proc/self/cgroup") |
2120 | - for entry in cgroup_paths.splitlines(): |
2121 | - fields = entry.split(':') |
2122 | - if fields[1] == "name=systemd": |
2123 | - cgroup_unit = fields[2].lstrip(os.path.sep) |
2124 | - |
2125 | - cpu_cgroup_path = os.path.join(CGROUPS_FILE_SYSTEM_ROOT, 'cpu', cgroup_unit) |
2126 | - memory_cgroup_path = os.path.join(CGROUPS_FILE_SYSTEM_ROOT, 'memory', cgroup_unit) |
2127 | - |
2128 | - return [CGroup.create(cpu_cgroup_path, 'cpu', VM_AGENT_CGROUP_NAME), |
2129 | - CGroup.create(memory_cgroup_path, 'memory', VM_AGENT_CGROUP_NAME)] |
2130 | - except Exception as e: |
2131 | - raise CGroupsException("Failed to get paths of agent's cgroups. Error: {0}".format(ustr(e))) |
2132 | - |
2133 | - def create_extension_cgroups_root(self): |
2134 | - unit_contents = """ |
2135 | -[Unit] |
2136 | -Description=Slice for walinuxagent extensions |
2137 | -DefaultDependencies=no |
2138 | -Before=slices.target |
2139 | -Requires=system.slice |
2140 | -After=system.slice""" |
2141 | - unit_filename = self._get_extensions_slice_root_name() |
2142 | - self.create_and_start_unit(unit_filename, unit_contents) |
2143 | - logger.info("Created slice for walinuxagent extensions {0}".format(unit_filename)) |
2144 | - |
2145 | - def create_extension_cgroups(self, extension_name): |
2146 | - # TODO: The slice created by this function is not used currently. We need to create the extension scopes within |
2147 | - # this slice and use the slice to monitor the cgroups. Also see comment in get_extension_cgroups. |
2148 | - # the slice. |
2149 | - unit_contents = """ |
2150 | -[Unit] |
2151 | -Description=Slice for extension {0} |
2152 | -DefaultDependencies=no |
2153 | -Before=slices.target |
2154 | -Requires=system-{1}.slice |
2155 | -After=system-{1}.slice""".format(extension_name, EXTENSIONS_ROOT_CGROUP_NAME) |
2156 | - unit_filename = self._get_extension_slice_name(extension_name) |
2157 | - self.create_and_start_unit(unit_filename, unit_contents) |
2158 | - logger.info("Created slice for {0}".format(unit_filename)) |
2159 | - |
2160 | - return self.get_extension_cgroups(extension_name) |
2161 | - |
2162 | - def remove_extension_cgroups(self, extension_name): |
2163 | - # For transient units, cgroups are released automatically when the unit stops, so it is sufficient |
2164 | - # to call stop on them. Persistent cgroups are released when the unit is disabled and its configuration |
2165 | - # file is deleted. |
2166 | - # The assumption is that this method is called after the extension has been uninstalled. For now, since |
2167 | - # we're running extensions within transient scopes which clean up after they finish running, no removal |
2168 | - # of units is needed. In the future, when the extension is running under its own slice, |
2169 | - # the following clean up is needed. |
2170 | - unit_filename = self._get_extension_slice_name(extension_name) |
2171 | - try: |
2172 | - unit_path = os.path.join(UNIT_FILES_FILE_SYSTEM_PATH, unit_filename) |
2173 | - shellutil.run_command(["systemctl", "stop", unit_filename]) |
2174 | - fileutil.rm_files(unit_path) |
2175 | - shellutil.run_command(["systemctl", "daemon-reload"]) |
2176 | - except Exception as e: |
2177 | - raise CGroupsException("Failed to remove {0}. Error: {1}".format(unit_filename, ustr(e))) |
2178 | - |
2179 | - def get_extension_cgroups(self, extension_name): |
2180 | - # TODO: The slice returned by this function is not used currently. We need to create the extension scopes within |
2181 | - # this slice and use the slice to monitor the cgroups. Also see comment in create_extension_cgroups. |
2182 | - slice_name = self._get_extension_cgroup_name(extension_name) |
2183 | - |
2184 | - cgroups = [] |
2185 | - |
2186 | - def create_cgroup(controller): |
2187 | - cpu_cgroup_path = os.path.join(CGROUPS_FILE_SYSTEM_ROOT, controller, 'system.slice', slice_name) |
2188 | - cgroups.append(CGroup.create(cpu_cgroup_path, controller, extension_name)) |
2189 | - |
2190 | - self._foreach_controller(create_cgroup, 'Cannot retrieve cgroup for extension {0}; resource usage will not be tracked.'.format(extension_name)) |
2191 | - |
2192 | - return cgroups |
2193 | + def _is_systemd_failure(scope_name, stderr): |
2194 | + stderr.seek(0) |
2195 | + stderr = ustr(stderr.read(TELEMETRY_MESSAGE_MAX_LEN), encoding='utf-8', errors='backslashreplace') |
2196 | + unit_not_found = "Unit {0} not found.".format(scope_name) |
2197 | + return unit_not_found in stderr or scope_name not in stderr |
2198 | |
2199 | @staticmethod |
2200 | - def _is_systemd_failure(scope_name, process_output): |
2201 | - unit_not_found = "Unit {0} not found.".format(scope_name) |
2202 | - return unit_not_found in process_output or scope_name not in process_output |
2203 | + def get_extension_slice_name(extension_name, old_slice=False): |
2204 | + # The old slice makes it difficult for user to override the limits because they need to place drop-in files on every upgrade if extension slice is different for each version. |
2205 | + # old slice includes <HandlerName>.<ExtensionName>-<HandlerVersion> |
2206 | + # new slice without version <HandlerName>.<ExtensionName> |
2207 | + if not old_slice: |
2208 | + extension_name = extension_name.rsplit("-", 1)[0] |
2209 | + # Since '-' is used as a separator in systemd unit names, we replace it with '_' to prevent side-effects. |
2210 | + return EXTENSION_SLICE_PREFIX + "-" + extension_name.replace('-', '_') + ".slice" |
2211 | |
2212 | - def start_extension_command(self, extension_name, command, timeout, shell, cwd, env, stdout, stderr, |
2213 | + def start_extension_command(self, extension_name, command, cmd_name, timeout, shell, cwd, env, stdout, stderr, |
2214 | error_code=ExtensionErrorCodes.PluginUnknownFailure): |
2215 | - scope_name = "{0}_{1}".format(self._get_extension_cgroup_name(extension_name), uuid.uuid4()) |
2216 | - |
2217 | - process = subprocess.Popen( |
2218 | - "systemd-run --unit={0} --scope {1}".format(scope_name, command), |
2219 | - shell=shell, |
2220 | - cwd=cwd, |
2221 | - stdout=stdout, |
2222 | - stderr=stderr, |
2223 | - env=env, |
2224 | - preexec_fn=os.setsid) |
2225 | + scope = "{0}_{1}".format(cmd_name, uuid.uuid4()) |
2226 | + extension_slice_name = self.get_extension_slice_name(extension_name) |
2227 | + with self._systemd_run_commands_lock: |
2228 | + process = subprocess.Popen( # pylint: disable=W1509 |
2229 | + # Some distros like ubuntu20 by default cpu and memory accounting enabled. Thus create nested cgroups under the extension slice |
2230 | + # So disabling CPU and Memory accounting prevents from creating nested cgroups, so that all the counters will be present in extension Cgroup |
2231 | + # since slice unit file configured with accounting enabled. |
2232 | + "systemd-run --property=CPUAccounting=no --property=MemoryAccounting=no --unit={0} --scope --slice={1} {2}".format(scope, extension_slice_name, command), |
2233 | + shell=shell, |
2234 | + cwd=cwd, |
2235 | + stdout=stdout, |
2236 | + stderr=stderr, |
2237 | + env=env, |
2238 | + preexec_fn=os.setsid) |
2239 | + |
2240 | + # We start systemd-run with shell == True so process.pid is the shell's pid, not the pid for systemd-run |
2241 | + self._systemd_run_commands.append(process.pid) |
2242 | + |
2243 | + scope_name = scope + '.scope' |
2244 | + |
2245 | + logger.info("Started extension in unit '{0}'", scope_name) |
2246 | + |
2247 | + cpu_cgroup = None |
2248 | + try: |
2249 | + cgroup_relative_path = os.path.join('azure.slice/azure-vmextensions.slice', extension_slice_name) |
2250 | |
2251 | - logger.info("Started extension using scope '{0}'", scope_name) |
2252 | - extension_cgroups = [] |
2253 | + cpu_cgroup_mountpoint, memory_cgroup_mountpoint = self.get_cgroup_mount_points() |
2254 | |
2255 | - def create_cgroup(controller): |
2256 | - cgroup_path = os.path.join(CGROUPS_FILE_SYSTEM_ROOT, controller, 'system.slice', scope_name + ".scope") |
2257 | - extension_cgroups.append(CGroup.create(cgroup_path, controller, extension_name)) |
2258 | + if cpu_cgroup_mountpoint is None: |
2259 | + logger.info("The CPU controller is not mounted; will not track resource usage") |
2260 | + else: |
2261 | + cpu_cgroup_path = os.path.join(cpu_cgroup_mountpoint, cgroup_relative_path) |
2262 | + cpu_cgroup = CpuCgroup(extension_name, cpu_cgroup_path) |
2263 | + CGroupsTelemetry.track_cgroup(cpu_cgroup) |
2264 | |
2265 | - self._foreach_controller(create_cgroup, 'Cannot create cgroup for extension {0}; ' |
2266 | - 'resource usage will not be tracked.'.format(extension_name)) |
2267 | - self.track_cgroups(extension_cgroups) |
2268 | + if memory_cgroup_mountpoint is None: |
2269 | + logger.info("The Memory controller is not mounted; will not track resource usage") |
2270 | + else: |
2271 | + memory_cgroup_path = os.path.join(memory_cgroup_mountpoint, cgroup_relative_path) |
2272 | + memory_cgroup = MemoryCgroup(extension_name, memory_cgroup_path) |
2273 | + CGroupsTelemetry.track_cgroup(memory_cgroup) |
2274 | + |
2275 | + except IOError as e: |
2276 | + if e.errno == 2: # 'No such file or directory' |
2277 | + logger.info("The extension command already completed; will not track resource usage") |
2278 | + logger.info("Failed to start tracking resource usage for the extension: {0}", ustr(e)) |
2279 | + except Exception as e: |
2280 | + logger.info("Failed to start tracking resource usage for the extension: {0}", ustr(e)) |
2281 | |
2282 | # Wait for process completion or timeout |
2283 | try: |
2284 | - process_output = handle_process_completion(process=process, |
2285 | - command=command, |
2286 | - timeout=timeout, |
2287 | - stdout=stdout, |
2288 | - stderr=stderr, |
2289 | - error_code=error_code) |
2290 | + return handle_process_completion(process=process, command=command, timeout=timeout, stdout=stdout, |
2291 | + stderr=stderr, error_code=error_code, cpu_cgroup=cpu_cgroup) |
2292 | except ExtensionError as e: |
2293 | # The extension didn't terminate successfully. Determine whether it was due to systemd errors or |
2294 | # extension errors. |
2295 | - process_output = read_output(stdout, stderr) |
2296 | - systemd_failure = self._is_systemd_failure(scope_name, process_output) |
2297 | - |
2298 | - if not systemd_failure: |
2299 | + if not self._is_systemd_failure(scope, stderr): |
2300 | # There was an extension error; it either timed out or returned a non-zero exit code. Re-raise the error |
2301 | raise |
2302 | + |
2303 | + # There was an issue with systemd-run. We need to log it and retry the extension without systemd. |
2304 | + process_output = read_output(stdout, stderr) |
2305 | + # Reset the stdout and stderr |
2306 | + stdout.truncate(0) |
2307 | + stderr.truncate(0) |
2308 | + |
2309 | + if isinstance(e, ExtensionOperationError): |
2310 | + # no-member: Instance of 'ExtensionError' has no 'exit_code' member (no-member) - Disabled: e is actually an ExtensionOperationError |
2311 | + err_msg = 'Systemd process exited with code %s and output %s' % ( |
2312 | + e.exit_code, process_output) # pylint: disable=no-member |
2313 | else: |
2314 | - # There was an issue with systemd-run. We need to log it and retry the extension without systemd. |
2315 | - err_msg = 'Systemd process exited with code %s and output %s' % (e.exit_code, process_output) \ |
2316 | - if isinstance(e, ExtensionOperationError) else "Systemd timed-out, output: %s" % process_output |
2317 | - event_msg = 'Failed to run systemd-run for unit {0}.scope. ' \ |
2318 | - 'Will retry invoking the extension without systemd. ' \ |
2319 | - 'Systemd-run error: {1}'.format(scope_name, err_msg) |
2320 | - add_event(AGENT_NAME, |
2321 | - version=CURRENT_VERSION, |
2322 | - op=WALAEventOperation.InvokeCommandUsingSystemd, |
2323 | - is_success=False, |
2324 | - log_event=False, |
2325 | - message=event_msg) |
2326 | - logger.warn(event_msg) |
2327 | - |
2328 | - # Reset the stdout and stderr |
2329 | - stdout.truncate(0) |
2330 | - stderr.truncate(0) |
2331 | - |
2332 | - # Try invoking the process again, this time without systemd-run |
2333 | - logger.info('Extension invocation using systemd failed, falling back to regular invocation ' |
2334 | - 'without cgroups tracking.') |
2335 | - process = subprocess.Popen(command, |
2336 | - shell=shell, |
2337 | - cwd=cwd, |
2338 | - env=env, |
2339 | - stdout=stdout, |
2340 | - stderr=stderr, |
2341 | - preexec_fn=os.setsid) |
2342 | - |
2343 | - process_output = handle_process_completion(process=process, |
2344 | - command=command, |
2345 | - timeout=timeout, |
2346 | - stdout=stdout, |
2347 | - stderr=stderr, |
2348 | - error_code=error_code) |
2349 | - |
2350 | - return [], process_output |
2351 | - |
2352 | - # The process terminated in time and successfully |
2353 | - return extension_cgroups, process_output |
2354 | + err_msg = "Systemd timed-out, output: %s" % process_output |
2355 | + raise SystemdRunError(err_msg) |
2356 | + finally: |
2357 | + with self._systemd_run_commands_lock: |
2358 | + self._systemd_run_commands.remove(process.pid) |
2359 | |
2360 | def cleanup_legacy_cgroups(self): |
2361 | """ |
2362 | Previous versions of the daemon (2.2.31-2.2.40) wrote their PID to /sys/fs/cgroup/{cpu,memory}/WALinuxAgent/WALinuxAgent; |
2363 | starting from version 2.2.41 we track the agent service in walinuxagent.service instead of WALinuxAgent/WALinuxAgent. If |
2364 | - we find that any of the legacy groups include the PID of the daemon then we disable data collection for this instance |
2365 | - (under systemd, moving PIDs across the cgroup file system can produce unpredictable results) |
2366 | + we find that any of the legacy groups include the PID of the daemon then we need to disable data collection for this |
2367 | + instance (under systemd, moving PIDs across the cgroup file system can produce unpredictable results) |
2368 | """ |
2369 | - def report_error(_, daemon_pid): |
2370 | - raise CGroupsException( |
2371 | - "The daemon's PID ({0}) was already added to the legacy cgroup; this invalidates resource usage data.".format(daemon_pid)) |
2372 | - |
2373 | - CGroupsApi._foreach_legacy_cgroup(report_error) |
2374 | + return CGroupsApi._foreach_legacy_cgroup(lambda *_: None) |
2375 | diff --git a/azurelinuxagent/common/cgroupconfigurator.py b/azurelinuxagent/common/cgroupconfigurator.py |
2376 | index ea6983f..767786f 100644 |
2377 | --- a/azurelinuxagent/common/cgroupconfigurator.py |
2378 | +++ b/azurelinuxagent/common/cgroupconfigurator.py |
2379 | @@ -1,3 +1,4 @@ |
2380 | +# -*- encoding: utf-8 -*- |
2381 | # Copyright 2018 Microsoft Corporation |
2382 | # |
2383 | # Licensed under the Apache License, Version 2.0 (the "License"); |
2384 | @@ -13,157 +14,870 @@ |
2385 | # limitations under the License. |
2386 | # |
2387 | # Requires Python 2.6+ and Openssl 1.0+ |
2388 | - |
2389 | +import glob |
2390 | +import json |
2391 | import os |
2392 | +import re |
2393 | import subprocess |
2394 | +import threading |
2395 | |
2396 | +from azurelinuxagent.common import conf |
2397 | from azurelinuxagent.common import logger |
2398 | -from azurelinuxagent.common.cgroupapi import CGroupsApi |
2399 | +from azurelinuxagent.common.cgroup import CpuCgroup, AGENT_NAME_TELEMETRY, MetricsCounter, MemoryCgroup |
2400 | +from azurelinuxagent.common.cgroupapi import CGroupsApi, SystemdCgroupsApi, SystemdRunError, EXTENSION_SLICE_PREFIX |
2401 | from azurelinuxagent.common.cgroupstelemetry import CGroupsTelemetry |
2402 | -from azurelinuxagent.common.exception import CGroupsException, ExtensionErrorCodes |
2403 | +from azurelinuxagent.common.exception import ExtensionErrorCodes, CGroupsException, AgentMemoryExceededException |
2404 | from azurelinuxagent.common.future import ustr |
2405 | -from azurelinuxagent.common.osutil import get_osutil |
2406 | +from azurelinuxagent.common.osutil import get_osutil, systemd |
2407 | +from azurelinuxagent.common.version import get_distro |
2408 | +from azurelinuxagent.common.utils import shellutil, fileutil |
2409 | from azurelinuxagent.common.utils.extensionprocessutil import handle_process_completion |
2410 | -from azurelinuxagent.common.version import AGENT_NAME, CURRENT_VERSION |
2411 | from azurelinuxagent.common.event import add_event, WALAEventOperation |
2412 | |
2413 | +AZURE_SLICE = "azure.slice" |
2414 | +_AZURE_SLICE_CONTENTS = """ |
2415 | +[Unit] |
2416 | +Description=Slice for Azure VM Agent and Extensions |
2417 | +DefaultDependencies=no |
2418 | +Before=slices.target |
2419 | +""" |
2420 | +_VMEXTENSIONS_SLICE = EXTENSION_SLICE_PREFIX + ".slice" |
2421 | +_AZURE_VMEXTENSIONS_SLICE = AZURE_SLICE + "/" + _VMEXTENSIONS_SLICE |
2422 | +_VMEXTENSIONS_SLICE_CONTENTS = """ |
2423 | +[Unit] |
2424 | +Description=Slice for Azure VM Extensions |
2425 | +DefaultDependencies=no |
2426 | +Before=slices.target |
2427 | +[Slice] |
2428 | +CPUAccounting=yes |
2429 | +MemoryAccounting=yes |
2430 | +""" |
2431 | +_EXTENSION_SLICE_CONTENTS = """ |
2432 | +[Unit] |
2433 | +Description=Slice for Azure VM extension {extension_name} |
2434 | +DefaultDependencies=no |
2435 | +Before=slices.target |
2436 | +[Slice] |
2437 | +CPUAccounting=yes |
2438 | +CPUQuota={cpu_quota} |
2439 | +MemoryAccounting=yes |
2440 | +""" |
2441 | +LOGCOLLECTOR_SLICE = "azure-walinuxagent-logcollector.slice" |
2442 | +# More info on resource limits properties in systemd here: |
2443 | +# https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/7/html/resource_management_guide/sec-modifying_control_groups |
2444 | +_LOGCOLLECTOR_SLICE_CONTENTS_FMT = """ |
2445 | +[Unit] |
2446 | +Description=Slice for Azure VM Agent Periodic Log Collector |
2447 | +DefaultDependencies=no |
2448 | +Before=slices.target |
2449 | +[Slice] |
2450 | +CPUAccounting=yes |
2451 | +CPUQuota={cpu_quota} |
2452 | +MemoryAccounting=yes |
2453 | +""" |
2454 | +_LOGCOLLECTOR_CPU_QUOTA = "5%" |
2455 | +LOGCOLLECTOR_MEMORY_LIMIT = 30 * 1024 ** 2 # 30Mb |
2456 | + |
2457 | +_AGENT_DROP_IN_FILE_SLICE = "10-Slice.conf" |
2458 | +_AGENT_DROP_IN_FILE_SLICE_CONTENTS = """ |
2459 | +# This drop-in unit file was created by the Azure VM Agent. |
2460 | +# Do not edit. |
2461 | +[Service] |
2462 | +Slice=azure.slice |
2463 | +""" |
2464 | +_DROP_IN_FILE_CPU_ACCOUNTING = "11-CPUAccounting.conf" |
2465 | +_DROP_IN_FILE_CPU_ACCOUNTING_CONTENTS = """ |
2466 | +# This drop-in unit file was created by the Azure VM Agent. |
2467 | +# Do not edit. |
2468 | +[Service] |
2469 | +CPUAccounting=yes |
2470 | +""" |
2471 | +_DROP_IN_FILE_CPU_QUOTA = "12-CPUQuota.conf" |
2472 | +_DROP_IN_FILE_CPU_QUOTA_CONTENTS_FORMAT = """ |
2473 | +# This drop-in unit file was created by the Azure VM Agent. |
2474 | +# Do not edit. |
2475 | +[Service] |
2476 | +CPUQuota={0} |
2477 | +""" |
2478 | +_DROP_IN_FILE_MEMORY_ACCOUNTING = "13-MemoryAccounting.conf" |
2479 | +_DROP_IN_FILE_MEMORY_ACCOUNTING_CONTENTS = """ |
2480 | +# This drop-in unit file was created by the Azure VM Agent. |
2481 | +# Do not edit. |
2482 | +[Service] |
2483 | +MemoryAccounting=yes |
2484 | +""" |
2485 | + |
2486 | + |
2487 | +class DisableCgroups(object): |
2488 | + ALL = "all" |
2489 | + AGENT = "agent" |
2490 | + EXTENSIONS = "extensions" |
2491 | + |
2492 | + |
2493 | +def _log_cgroup_info(format_string, *args): |
2494 | + message = format_string.format(*args) |
2495 | + logger.info("[CGI] " + message) |
2496 | + add_event(op=WALAEventOperation.CGroupsInfo, message=message) |
2497 | + |
2498 | + |
2499 | +def _log_cgroup_warning(format_string, *args): |
2500 | + message = format_string.format(*args) |
2501 | + logger.info("[CGW] " + message) # log as INFO for now, in the future it should be logged as WARNING |
2502 | + add_event(op=WALAEventOperation.CGroupsInfo, message=message, is_success=False, log_event=False) |
2503 | + |
2504 | |
2505 | class CGroupConfigurator(object): |
2506 | """ |
2507 | This class implements the high-level operations on CGroups (e.g. initialization, creation, etc) |
2508 | |
2509 | - NOTE: with the exception of start_extension_command, none of the methods in this class raise exceptions (cgroup operations should not block extensions) |
2510 | + NOTE: with the exception of start_extension_command, none of the methods in this class |
2511 | + raise exceptions (cgroup operations should not block extensions) |
2512 | """ |
2513 | - class __impl(object): |
2514 | + |
2515 | + class _Impl(object): |
2516 | def __init__(self): |
2517 | + self._initialized = False |
2518 | + self._cgroups_supported = False |
2519 | + self._agent_cgroups_enabled = False |
2520 | + self._extensions_cgroups_enabled = False |
2521 | + self._cgroups_api = None |
2522 | + self._agent_cpu_cgroup_path = None |
2523 | + self._agent_memory_cgroup_path = None |
2524 | + self._agent_memory_cgroup = None |
2525 | + self._check_cgroups_lock = threading.RLock() # Protect the check_cgroups which is called from Monitor thread and main loop. |
2526 | + |
2527 | + def initialize(self): |
2528 | + try: |
2529 | + if self._initialized: |
2530 | + return |
2531 | + # This check is to reset the quotas if agent goes from cgroup supported to unsupported distros later in time. |
2532 | + if not CGroupsApi.cgroups_supported(): |
2533 | + agent_drop_in_path = systemd.get_agent_drop_in_path() |
2534 | + try: |
2535 | + if os.path.exists(agent_drop_in_path) and os.path.isdir(agent_drop_in_path): |
2536 | + files_to_cleanup = [] |
2537 | + agent_drop_in_file_slice = os.path.join(agent_drop_in_path, _AGENT_DROP_IN_FILE_SLICE) |
2538 | + agent_drop_in_file_cpu_accounting = os.path.join(agent_drop_in_path, |
2539 | + _DROP_IN_FILE_CPU_ACCOUNTING) |
2540 | + agent_drop_in_file_memory_accounting = os.path.join(agent_drop_in_path, |
2541 | + _DROP_IN_FILE_MEMORY_ACCOUNTING) |
2542 | + agent_drop_in_file_cpu_quota = os.path.join(agent_drop_in_path, _DROP_IN_FILE_CPU_QUOTA) |
2543 | + files_to_cleanup.extend([agent_drop_in_file_slice, agent_drop_in_file_cpu_accounting, |
2544 | + agent_drop_in_file_memory_accounting, agent_drop_in_file_cpu_quota]) |
2545 | + self.__cleanup_all_files(files_to_cleanup) |
2546 | + self.__reload_systemd_config() |
2547 | + logger.info("Agent reset the quotas if distro: {0} goes from supported to unsupported list", get_distro()) |
2548 | + except Exception as err: |
2549 | + logger.warn("Unable to delete Agent drop-in files while resetting the quotas: {0}".format(err)) |
2550 | + |
2551 | + # check whether cgroup monitoring is supported on the current distro |
2552 | + self._cgroups_supported = CGroupsApi.cgroups_supported() |
2553 | + if not self._cgroups_supported: |
2554 | + logger.info("Cgroup monitoring is not supported on {0}", get_distro()) |
2555 | + return |
2556 | + |
2557 | + # check that systemd is detected correctly |
2558 | + self._cgroups_api = SystemdCgroupsApi() |
2559 | + if not systemd.is_systemd(): |
2560 | + _log_cgroup_warning("systemd was not detected on {0}", get_distro()) |
2561 | + return |
2562 | + |
2563 | + _log_cgroup_info("systemd version: {0}", systemd.get_version()) |
2564 | + |
2565 | + # This is temporarily disabled while we analyze telemetry. Likely it will be removed. |
2566 | + # self.__collect_azure_unit_telemetry() |
2567 | + # self.__collect_agent_unit_files_telemetry() |
2568 | + |
2569 | + if not self.__check_no_legacy_cgroups(): |
2570 | + return |
2571 | + |
2572 | + agent_unit_name = systemd.get_agent_unit_name() |
2573 | + agent_slice = systemd.get_unit_property(agent_unit_name, "Slice") |
2574 | + if agent_slice not in (AZURE_SLICE, "system.slice"): |
2575 | + _log_cgroup_warning("The agent is within an unexpected slice: {0}", agent_slice) |
2576 | + return |
2577 | + |
2578 | + self.__setup_azure_slice() |
2579 | + |
2580 | + cpu_controller_root, memory_controller_root = self.__get_cgroup_controllers() |
2581 | + self._agent_cpu_cgroup_path, self._agent_memory_cgroup_path = self.__get_agent_cgroups(agent_slice, |
2582 | + cpu_controller_root, |
2583 | + memory_controller_root) |
2584 | + |
2585 | + if self._agent_cpu_cgroup_path is not None or self._agent_memory_cgroup_path is not None: |
2586 | + self.enable() |
2587 | + |
2588 | + if self._agent_cpu_cgroup_path is not None: |
2589 | + _log_cgroup_info("Agent CPU cgroup: {0}", self._agent_cpu_cgroup_path) |
2590 | + self.__set_cpu_quota(conf.get_agent_cpu_quota()) |
2591 | + CGroupsTelemetry.track_cgroup(CpuCgroup(AGENT_NAME_TELEMETRY, self._agent_cpu_cgroup_path)) |
2592 | + |
2593 | + if self._agent_memory_cgroup_path is not None: |
2594 | + _log_cgroup_info("Agent Memory cgroup: {0}", self._agent_memory_cgroup_path) |
2595 | + self._agent_memory_cgroup = MemoryCgroup(AGENT_NAME_TELEMETRY, self._agent_memory_cgroup_path) |
2596 | + CGroupsTelemetry.track_cgroup(self._agent_memory_cgroup) |
2597 | + |
2598 | + _log_cgroup_info('Agent cgroups enabled: {0}', self._agent_cgroups_enabled) |
2599 | + |
2600 | + except Exception as exception: |
2601 | + _log_cgroup_warning("Error initializing cgroups: {0}", ustr(exception)) |
2602 | + finally: |
2603 | + self._initialized = True |
2604 | + |
2605 | + @staticmethod |
2606 | + def __collect_azure_unit_telemetry(): |
2607 | + azure_units = [] |
2608 | + |
2609 | + try: |
2610 | + units = shellutil.run_command(['systemctl', 'list-units', 'azure*', '-all']) |
2611 | + for line in units.split('\n'): |
2612 | + match = re.match(r'\s?(azure[^\s]*)\s?', line, re.IGNORECASE) |
2613 | + if match is not None: |
2614 | + azure_units.append((match.group(1), line)) |
2615 | + except shellutil.CommandError as command_error: |
2616 | + _log_cgroup_warning("Failed to list systemd units: {0}", ustr(command_error)) |
2617 | + |
2618 | + for unit_name, unit_description in azure_units: |
2619 | + unit_slice = "Unknown" |
2620 | + try: |
2621 | + unit_slice = systemd.get_unit_property(unit_name, "Slice") |
2622 | + except Exception as exception: |
2623 | + _log_cgroup_warning("Failed to query Slice for {0}: {1}", unit_name, ustr(exception)) |
2624 | + |
2625 | + _log_cgroup_info("Found an Azure unit under slice {0}: {1}", unit_slice, unit_description) |
2626 | + |
2627 | + if len(azure_units) == 0: |
2628 | + try: |
2629 | + cgroups = shellutil.run_command('systemd-cgls') |
2630 | + for line in cgroups.split('\n'): |
2631 | + if re.match(r'[^\x00-\xff]+azure\.slice\s*', line, re.UNICODE): |
2632 | + logger.info(ustr("Found a cgroup for azure.slice\n{0}").format(cgroups)) |
2633 | + # Don't add the output of systemd-cgls to the telemetry, since currently it does not support Unicode |
2634 | + add_event(op=WALAEventOperation.CGroupsInfo, message="Found a cgroup for azure.slice") |
2635 | + except shellutil.CommandError as command_error: |
2636 | + _log_cgroup_warning("Failed to list systemd units: {0}", ustr(command_error)) |
2637 | + |
2638 | + @staticmethod |
2639 | + def __collect_agent_unit_files_telemetry(): |
2640 | + agent_unit_files = [] |
2641 | + agent_service_name = get_osutil().get_service_name() |
2642 | + try: |
2643 | + fragment_path = systemd.get_unit_property(agent_service_name, "FragmentPath") |
2644 | + if fragment_path != systemd.get_agent_unit_file(): |
2645 | + agent_unit_files.append(fragment_path) |
2646 | + except Exception as exception: |
2647 | + _log_cgroup_warning("Failed to query the agent's FragmentPath: {0}", ustr(exception)) |
2648 | + |
2649 | + try: |
2650 | + drop_in_paths = systemd.get_unit_property(agent_service_name, "DropInPaths") |
2651 | + for path in drop_in_paths.split(): |
2652 | + agent_unit_files.append(path) |
2653 | + except Exception as exception: |
2654 | + _log_cgroup_warning("Failed to query the agent's DropInPaths: {0}", ustr(exception)) |
2655 | + |
2656 | + for unit_file in agent_unit_files: |
2657 | + try: |
2658 | + with open(unit_file, "r") as file_object: |
2659 | + _log_cgroup_info("Found a custom unit file for the agent: {0}\n{1}", unit_file, |
2660 | + file_object.read()) |
2661 | + except Exception as exception: |
2662 | + _log_cgroup_warning("Can't read {0}: {1}", unit_file, ustr(exception)) |
2663 | + |
2664 | + def __check_no_legacy_cgroups(self): |
2665 | + """ |
2666 | + Older versions of the daemon (2.2.31-2.2.40) wrote their PID to /sys/fs/cgroup/{cpu,memory}/WALinuxAgent/WALinuxAgent. When running |
2667 | + under systemd this could produce invalid resource usage data. Cgroups should not be enabled under this condition. |
2668 | + """ |
2669 | + legacy_cgroups = self._cgroups_api.cleanup_legacy_cgroups() |
2670 | + if legacy_cgroups > 0: |
2671 | + _log_cgroup_warning("The daemon's PID was added to a legacy cgroup; will not monitor resource usage.") |
2672 | + return False |
2673 | + return True |
2674 | + |
2675 | + def __get_cgroup_controllers(self): |
2676 | + # |
2677 | + # check v1 controllers |
2678 | + # |
2679 | + cpu_controller_root, memory_controller_root = self._cgroups_api.get_cgroup_mount_points() |
2680 | + |
2681 | + if cpu_controller_root is not None: |
2682 | + logger.info("The CPU cgroup controller is mounted at {0}", cpu_controller_root) |
2683 | + else: |
2684 | + _log_cgroup_warning("The CPU cgroup controller is not mounted") |
2685 | + |
2686 | + if memory_controller_root is not None: |
2687 | + logger.info("The memory cgroup controller is mounted at {0}", memory_controller_root) |
2688 | + else: |
2689 | + _log_cgroup_warning("The memory cgroup controller is not mounted") |
2690 | + |
2691 | + # |
2692 | + # check v2 controllers |
2693 | + # |
2694 | + cgroup2_mount_point, cgroup2_controllers = self._cgroups_api.get_cgroup2_controllers() |
2695 | + if cgroup2_mount_point is not None: |
2696 | + _log_cgroup_info("cgroups v2 mounted at {0}. Controllers: [{1}]", cgroup2_mount_point, |
2697 | + cgroup2_controllers) |
2698 | + |
2699 | + return cpu_controller_root, memory_controller_root |
2700 | + |
2701 | + @staticmethod |
2702 | + def __setup_azure_slice(): |
2703 | """ |
2704 | - Ensures the cgroups file system is mounted and selects the correct API to interact with it |
2705 | + The agent creates "azure.slice" for use by extensions and the agent. The agent runs under "azure.slice" directly and each |
2706 | + extension runs under its own slice ("Microsoft.CPlat.Extension.slice" in the example below). All the slices for |
2707 | + extensions are grouped under "vmextensions.slice". |
2708 | + |
2709 | + Example: -.slice |
2710 | + ├─user.slice |
2711 | + ├─system.slice |
2712 | + └─azure.slice |
2713 | + ├─walinuxagent.service |
2714 | + │ ├─5759 /usr/bin/python3 -u /usr/sbin/waagent -daemon |
2715 | + │ └─5764 python3 -u bin/WALinuxAgent-2.2.53-py2.7.egg -run-exthandlers |
2716 | + └─azure-vmextensions.slice |
2717 | + └─Microsoft.CPlat.Extension.slice |
2718 | + └─5894 /usr/bin/python3 /var/lib/waagent/Microsoft.CPlat.Extension-1.0.0.0/enable.py |
2719 | + |
2720 | + This method ensures that the "azure" and "vmextensions" slices are created. Setup should create those slices |
2721 | + under /lib/systemd/system; but if they do not exist, __ensure_azure_slices_exist will create them. |
2722 | + |
2723 | + It also creates drop-in files to set the agent's Slice and CPUAccounting if they have not been |
2724 | + set up in the agent's unit file. |
2725 | + |
2726 | + Lastly, the method also cleans up unit files left over from previous versions of the agent. |
2727 | """ |
2728 | - osutil = get_osutil() |
2729 | |
2730 | - self._cgroups_supported = osutil.is_cgroups_supported() |
2731 | + # Older agents used to create this slice, but it was never used. Cleanup the file. |
2732 | + CGroupConfigurator._Impl.__cleanup_unit_file("/etc/systemd/system/system-walinuxagent.extensions.slice") |
2733 | + |
2734 | + unit_file_install_path = systemd.get_unit_file_install_path() |
2735 | + azure_slice = os.path.join(unit_file_install_path, AZURE_SLICE) |
2736 | + vmextensions_slice = os.path.join(unit_file_install_path, _VMEXTENSIONS_SLICE) |
2737 | + logcollector_slice = os.path.join(unit_file_install_path, LOGCOLLECTOR_SLICE) |
2738 | + agent_unit_file = systemd.get_agent_unit_file() |
2739 | + agent_drop_in_path = systemd.get_agent_drop_in_path() |
2740 | + agent_drop_in_file_slice = os.path.join(agent_drop_in_path, _AGENT_DROP_IN_FILE_SLICE) |
2741 | + agent_drop_in_file_cpu_accounting = os.path.join(agent_drop_in_path, _DROP_IN_FILE_CPU_ACCOUNTING) |
2742 | + agent_drop_in_file_memory_accounting = os.path.join(agent_drop_in_path, _DROP_IN_FILE_MEMORY_ACCOUNTING) |
2743 | + |
2744 | + files_to_create = [] |
2745 | + |
2746 | + if not os.path.exists(azure_slice): |
2747 | + files_to_create.append((azure_slice, _AZURE_SLICE_CONTENTS)) |
2748 | + |
2749 | + if not os.path.exists(vmextensions_slice): |
2750 | + files_to_create.append((vmextensions_slice, _VMEXTENSIONS_SLICE_CONTENTS)) |
2751 | + |
2752 | + # Update log collector slice contents |
2753 | + slice_contents = _LOGCOLLECTOR_SLICE_CONTENTS_FMT.format(cpu_quota=_LOGCOLLECTOR_CPU_QUOTA) |
2754 | + files_to_create.append((logcollector_slice, slice_contents)) |
2755 | + |
2756 | + if fileutil.findre_in_file(agent_unit_file, r"Slice=") is not None: |
2757 | + CGroupConfigurator._Impl.__cleanup_unit_file(agent_drop_in_file_slice) |
2758 | + else: |
2759 | + if not os.path.exists(agent_drop_in_file_slice): |
2760 | + files_to_create.append((agent_drop_in_file_slice, _AGENT_DROP_IN_FILE_SLICE_CONTENTS)) |
2761 | + |
2762 | + if fileutil.findre_in_file(agent_unit_file, r"CPUAccounting=") is not None: |
2763 | + CGroupConfigurator._Impl.__cleanup_unit_file(agent_drop_in_file_cpu_accounting) |
2764 | + else: |
2765 | + if not os.path.exists(agent_drop_in_file_cpu_accounting): |
2766 | + files_to_create.append((agent_drop_in_file_cpu_accounting, _DROP_IN_FILE_CPU_ACCOUNTING_CONTENTS)) |
2767 | + |
2768 | + if fileutil.findre_in_file(agent_unit_file, r"MemoryAccounting=") is not None: |
2769 | + CGroupConfigurator._Impl.__cleanup_unit_file(agent_drop_in_file_memory_accounting) |
2770 | + else: |
2771 | + if not os.path.exists(agent_drop_in_file_memory_accounting): |
2772 | + files_to_create.append( |
2773 | + (agent_drop_in_file_memory_accounting, _DROP_IN_FILE_MEMORY_ACCOUNTING_CONTENTS)) |
2774 | + |
2775 | + if len(files_to_create) > 0: |
2776 | + # create the unit files, but if 1 fails remove all and return |
2777 | + try: |
2778 | + for path, contents in files_to_create: |
2779 | + CGroupConfigurator._Impl.__create_unit_file(path, contents) |
2780 | + except Exception as exception: |
2781 | + _log_cgroup_warning("Failed to create unit files for the azure slice: {0}", ustr(exception)) |
2782 | + for unit_file in files_to_create: |
2783 | + CGroupConfigurator._Impl.__cleanup_unit_file(unit_file) |
2784 | + return |
2785 | + |
2786 | + CGroupConfigurator._Impl.__reload_systemd_config() |
2787 | |
2788 | - if self._cgroups_supported: |
2789 | - self._enabled = True |
2790 | + @staticmethod |
2791 | + def __reload_systemd_config(): |
2792 | + # reload the systemd configuration; the new slices will be used once the agent's service restarts |
2793 | + try: |
2794 | + logger.info("Executing systemctl daemon-reload...") |
2795 | + shellutil.run_command(["systemctl", "daemon-reload"]) |
2796 | + except Exception as exception: |
2797 | + _log_cgroup_warning("daemon-reload failed (create azure slice): {0}", ustr(exception)) |
2798 | + |
2799 | + @staticmethod |
2800 | + def __create_unit_file(path, contents): |
2801 | + parent, _ = os.path.split(path) |
2802 | + if not os.path.exists(parent): |
2803 | + fileutil.mkdir(parent, mode=0o755) |
2804 | + exists = os.path.exists(path) |
2805 | + fileutil.write_file(path, contents) |
2806 | + _log_cgroup_info("{0} {1}", "Updated" if exists else "Created", path) |
2807 | + |
2808 | + @staticmethod |
2809 | + def __cleanup_unit_file(path): |
2810 | + if os.path.exists(path): |
2811 | try: |
2812 | - osutil.mount_cgroups() |
2813 | - self._cgroups_api = CGroupsApi.create() |
2814 | - status = "The cgroup filesystem is ready to use" |
2815 | - except Exception as e: |
2816 | - status = ustr(e) |
2817 | - self._enabled = False |
2818 | + os.remove(path) |
2819 | + _log_cgroup_info("Removed {0}", path) |
2820 | + except Exception as exception: |
2821 | + _log_cgroup_warning("Failed to remove {0}: {1}", path, ustr(exception)) |
2822 | + |
2823 | + @staticmethod |
2824 | + def __cleanup_all_files(files_to_cleanup): |
2825 | + for path in files_to_cleanup: |
2826 | + if os.path.exists(path): |
2827 | + try: |
2828 | + os.remove(path) |
2829 | + _log_cgroup_info("Removed {0}", path) |
2830 | + except Exception as exception: |
2831 | + _log_cgroup_warning("Failed to remove {0}: {1}", path, ustr(exception)) |
2832 | + |
2833 | + @staticmethod |
2834 | + def __create_all_files(files_to_create): |
2835 | + # create the unit files, but if 1 fails remove all and return |
2836 | + try: |
2837 | + for path, contents in files_to_create: |
2838 | + CGroupConfigurator._Impl.__create_unit_file(path, contents) |
2839 | + except Exception as exception: |
2840 | + _log_cgroup_warning("Failed to create unit files : {0}", ustr(exception)) |
2841 | + for unit_file in files_to_create: |
2842 | + CGroupConfigurator._Impl.__cleanup_unit_file(unit_file) |
2843 | + return |
2844 | + |
2845 | + def is_extension_resource_limits_setup_completed(self, extension_name, cpu_quota=None): |
2846 | + unit_file_install_path = systemd.get_unit_file_install_path() |
2847 | + old_extension_slice_path = os.path.join(unit_file_install_path, SystemdCgroupsApi.get_extension_slice_name(extension_name, old_slice=True)) |
2848 | + # clean up the old slice from the disk |
2849 | + if os.path.exists(old_extension_slice_path): |
2850 | + CGroupConfigurator._Impl.__cleanup_unit_file(old_extension_slice_path) |
2851 | + |
2852 | + extension_slice_path = os.path.join(unit_file_install_path, |
2853 | + SystemdCgroupsApi.get_extension_slice_name(extension_name)) |
2854 | + cpu_quota = str( |
2855 | + cpu_quota) + "%" if cpu_quota is not None else "" # setting an empty value resets to the default (infinity) |
2856 | + slice_contents = _EXTENSION_SLICE_CONTENTS.format(extension_name=extension_name, |
2857 | + cpu_quota=cpu_quota) |
2858 | + if os.path.exists(extension_slice_path): |
2859 | + with open(extension_slice_path, "r") as file_: |
2860 | + if file_.read() == slice_contents: |
2861 | + return True |
2862 | + return False |
2863 | + |
2864 | + def __get_agent_cgroups(self, agent_slice, cpu_controller_root, memory_controller_root): |
2865 | + agent_unit_name = systemd.get_agent_unit_name() |
2866 | + |
2867 | + expected_relative_path = os.path.join(agent_slice, agent_unit_name) |
2868 | + cpu_cgroup_relative_path, memory_cgroup_relative_path = self._cgroups_api.get_process_cgroup_relative_paths( |
2869 | + "self") |
2870 | + |
2871 | + if cpu_cgroup_relative_path is None: |
2872 | + _log_cgroup_warning("The agent's process is not within a CPU cgroup") |
2873 | + else: |
2874 | + if cpu_cgroup_relative_path == expected_relative_path: |
2875 | + _log_cgroup_info('CPUAccounting: {0}', systemd.get_unit_property(agent_unit_name, "CPUAccounting")) |
2876 | + _log_cgroup_info('CPUQuota: {0}', systemd.get_unit_property(agent_unit_name, "CPUQuotaPerSecUSec")) |
2877 | + else: |
2878 | + _log_cgroup_warning( |
2879 | + "The Agent is not in the expected CPU cgroup; will not enable monitoring. Cgroup:[{0}] Expected:[{1}]", |
2880 | + cpu_cgroup_relative_path, |
2881 | + expected_relative_path) |
2882 | + cpu_cgroup_relative_path = None # Set the path to None to prevent monitoring |
2883 | + |
2884 | + if memory_cgroup_relative_path is None: |
2885 | + _log_cgroup_warning("The agent's process is not within a memory cgroup") |
2886 | else: |
2887 | - self._enabled = False |
2888 | - self._cgroups_api = None |
2889 | - status = "Cgroups are not supported by the platform" |
2890 | + if memory_cgroup_relative_path == expected_relative_path: |
2891 | + memory_accounting = systemd.get_unit_property(agent_unit_name, "MemoryAccounting") |
2892 | + _log_cgroup_info('MemoryAccounting: {0}', memory_accounting) |
2893 | + else: |
2894 | + _log_cgroup_info( |
2895 | + "The Agent is not in the expected memory cgroup; will not enable monitoring. CGroup:[{0}] Expected:[{1}]", |
2896 | + memory_cgroup_relative_path, |
2897 | + expected_relative_path) |
2898 | + memory_cgroup_relative_path = None # Set the path to None to prevent monitoring |
2899 | |
2900 | - logger.info("CGroups Status: {0}".format(status)) |
2901 | + if cpu_controller_root is not None and cpu_cgroup_relative_path is not None: |
2902 | + agent_cpu_cgroup_path = os.path.join(cpu_controller_root, cpu_cgroup_relative_path) |
2903 | + else: |
2904 | + agent_cpu_cgroup_path = None |
2905 | + |
2906 | + if memory_controller_root is not None and memory_cgroup_relative_path is not None: |
2907 | + agent_memory_cgroup_path = os.path.join(memory_controller_root, memory_cgroup_relative_path) |
2908 | + else: |
2909 | + agent_memory_cgroup_path = None |
2910 | |
2911 | - add_event( |
2912 | - AGENT_NAME, |
2913 | - version=CURRENT_VERSION, |
2914 | - op=WALAEventOperation.InitializeCGroups, |
2915 | - is_success=self._enabled, |
2916 | - message=status, |
2917 | - log_event=False) |
2918 | + return agent_cpu_cgroup_path, agent_memory_cgroup_path |
2919 | + |
2920 | + def supported(self): |
2921 | + return self._cgroups_supported |
2922 | |
2923 | def enabled(self): |
2924 | - return self._enabled |
2925 | + return self._agent_cgroups_enabled or self._extensions_cgroups_enabled |
2926 | + |
2927 | + def agent_enabled(self): |
2928 | + return self._agent_cgroups_enabled |
2929 | + |
2930 | + def extensions_enabled(self): |
2931 | + return self._extensions_cgroups_enabled |
2932 | |
2933 | def enable(self): |
2934 | - if not self._cgroups_supported: |
2935 | - raise CGroupsException("cgroups are not supported on the current platform") |
2936 | + if not self.supported(): |
2937 | + raise CGroupsException( |
2938 | + "Attempted to enable cgroups, but they are not supported on the current platform") |
2939 | + self._agent_cgroups_enabled = True |
2940 | + self._extensions_cgroups_enabled = True |
2941 | |
2942 | - self._enabled = True |
2943 | + def disable(self, reason, disable_cgroups): |
2944 | + if disable_cgroups == DisableCgroups.ALL: # disable all |
2945 | + # Reset quotas |
2946 | + self.__reset_agent_cpu_quota() |
2947 | + extension_services = self.get_extension_services_list() |
2948 | + for extension in extension_services: |
2949 | + logger.info("Resetting extension : {0} and it's services: {1} CPUQuota".format(extension, extension_services[extension])) |
2950 | + self.__reset_extension_cpu_quota(extension_name=extension) |
2951 | + self.__reset_extension_services_cpu_quota(extension_services[extension]) |
2952 | + self.__reload_systemd_config() |
2953 | |
2954 | - def disable(self): |
2955 | - self._enabled = False |
2956 | - CGroupsTelemetry.reset() |
2957 | + CGroupsTelemetry.reset() |
2958 | + self._agent_cgroups_enabled = False |
2959 | + self._extensions_cgroups_enabled = False |
2960 | + elif disable_cgroups == DisableCgroups.AGENT: # disable agent |
2961 | + self._agent_cgroups_enabled = False |
2962 | + self.__reset_agent_cpu_quota() |
2963 | + CGroupsTelemetry.stop_tracking(CpuCgroup(AGENT_NAME_TELEMETRY, self._agent_cpu_cgroup_path)) |
2964 | |
2965 | - def _invoke_cgroup_operation(self, operation, error_message, on_error=None): |
2966 | + message = "[CGW] Disabling resource usage monitoring. Reason: {0}".format(reason) |
2967 | + logger.info(message) # log as INFO for now, in the future it should be logged as WARNING |
2968 | + add_event(op=WALAEventOperation.CGroupsDisabled, message=message, is_success=False, log_event=False) |
2969 | + |
2970 | + @staticmethod |
2971 | + def __set_cpu_quota(quota): |
2972 | """ |
2973 | - Ensures the given operation is invoked only if cgroups are enabled and traps any errors on the operation. |
2974 | + Sets the agent's CPU quota to the given percentage (100% == 1 CPU) |
2975 | + |
2976 | + NOTE: This is done using a dropin file in the default dropin directory; any local overrides on the VM will take precedence |
2977 | + over this setting. |
2978 | """ |
2979 | - if not self.enabled(): |
2980 | - return |
2981 | + quota_percentage = "{0}%".format(quota) |
2982 | + _log_cgroup_info("Ensuring the agent's CPUQuota is {0}", quota_percentage) |
2983 | + if CGroupConfigurator._Impl.__try_set_cpu_quota(quota_percentage): |
2984 | + CGroupsTelemetry.set_track_throttled_time(True) |
2985 | + |
2986 | + @staticmethod |
2987 | + def __reset_agent_cpu_quota(): |
2988 | + """ |
2989 | + Removes any CPUQuota on the agent |
2990 | |
2991 | + NOTE: This resets the quota on the agent's default dropin file; any local overrides on the VM will take precedence |
2992 | + over this setting. |
2993 | + """ |
2994 | + logger.info("Resetting agent's CPUQuota") |
2995 | + if CGroupConfigurator._Impl.__try_set_cpu_quota(''): # setting an empty value resets to the default (infinity) |
2996 | + _log_cgroup_info('CPUQuota: {0}', |
2997 | + systemd.get_unit_property(systemd.get_agent_unit_name(), "CPUQuotaPerSecUSec")) |
2998 | + |
2999 | + @staticmethod |
3000 | + def __try_set_cpu_quota(quota): |
3001 | try: |
3002 | - return operation() |
3003 | - except Exception as e: |
3004 | - logger.warn("{0} Error: {1}".format(error_message, ustr(e))) |
3005 | - if on_error is not None: |
3006 | - try: |
3007 | - on_error(e) |
3008 | - except Exception as ex: |
3009 | - logger.warn("CGroupConfigurator._invoke_cgroup_operation: {0}".format(ustr(e))) |
3010 | + drop_in_file = os.path.join(systemd.get_agent_drop_in_path(), _DROP_IN_FILE_CPU_QUOTA) |
3011 | + contents = _DROP_IN_FILE_CPU_QUOTA_CONTENTS_FORMAT.format(quota) |
3012 | + if os.path.exists(drop_in_file): |
3013 | + with open(drop_in_file, "r") as file_: |
3014 | + if file_.read() == contents: |
3015 | + return True # no need to update the file; return here to avoid doing a daemon-reload |
3016 | + CGroupConfigurator._Impl.__create_unit_file(drop_in_file, contents) |
3017 | + except Exception as exception: |
3018 | + _log_cgroup_warning('Failed to set CPUQuota: {0}', ustr(exception)) |
3019 | + return False |
3020 | + try: |
3021 | + logger.info("Executing systemctl daemon-reload...") |
3022 | + shellutil.run_command(["systemctl", "daemon-reload"]) |
3023 | + except Exception as exception: |
3024 | + _log_cgroup_warning("daemon-reload failed (set quota): {0}", ustr(exception)) |
3025 | + return False |
3026 | + return True |
3027 | + |
3028 | + def check_cgroups(self, cgroup_metrics): |
3029 | + self._check_cgroups_lock.acquire() |
3030 | + try: |
3031 | + if not self.enabled(): |
3032 | + return |
3033 | + |
3034 | + errors = [] |
3035 | + |
3036 | + process_check_success = False |
3037 | + try: |
3038 | + self._check_processes_in_agent_cgroup() |
3039 | + process_check_success = True |
3040 | + except CGroupsException as exception: |
3041 | + errors.append(exception) |
3042 | |
3043 | - def create_agent_cgroups(self, track_cgroups): |
3044 | + quota_check_success = False |
3045 | + try: |
3046 | + if cgroup_metrics: |
3047 | + self._check_agent_throttled_time(cgroup_metrics) |
3048 | + quota_check_success = True |
3049 | + except CGroupsException as exception: |
3050 | + errors.append(exception) |
3051 | + |
3052 | + reason = "Check on cgroups failed:\n{0}".format("\n".join([ustr(e) for e in errors])) |
3053 | + |
3054 | + if not process_check_success and conf.get_cgroup_disable_on_process_check_failure(): |
3055 | + self.disable(reason, DisableCgroups.ALL) |
3056 | + |
3057 | + if not quota_check_success and conf.get_cgroup_disable_on_quota_check_failure(): |
3058 | + self.disable(reason, DisableCgroups.AGENT) |
3059 | + finally: |
3060 | + self._check_cgroups_lock.release() |
3061 | + |
3062 | + def _check_processes_in_agent_cgroup(self): |
3063 | """ |
3064 | - Creates and returns the cgroups needed to track the VM Agent |
3065 | + Verifies that the agent's cgroup includes only the current process, its parent, commands started using shellutil and instances of systemd-run |
3066 | + (those processes correspond, respectively, to the extension handler, the daemon, commands started by the extension handler, and the systemd-run |
3067 | + commands used to start extensions on their own cgroup). |
3068 | + Other processes started by the agent (e.g. extensions) and processes not started by the agent (e.g. services installed by extensions) are reported |
3069 | + as unexpected, since they should belong to their own cgroup. |
3070 | + |
3071 | + Raises a CGroupsException if the check fails |
3072 | """ |
3073 | - def __impl(): |
3074 | - cgroups = self._cgroups_api.create_agent_cgroups() |
3075 | + unexpected = [] |
3076 | + agent_cgroup_proc_names = [] |
3077 | + try: |
3078 | + daemon = os.getppid() |
3079 | + extension_handler = os.getpid() |
3080 | + agent_commands = set() |
3081 | + agent_commands.update(shellutil.get_running_commands()) |
3082 | + systemd_run_commands = set() |
3083 | + systemd_run_commands.update(self._cgroups_api.get_systemd_run_commands()) |
3084 | + agent_cgroup = CGroupsApi.get_processes_in_cgroup(self._agent_cpu_cgroup_path) |
3085 | + # get the running commands again in case new commands started or completed while we were fetching the processes in the cgroup; |
3086 | + agent_commands.update(shellutil.get_running_commands()) |
3087 | + systemd_run_commands.update(self._cgroups_api.get_systemd_run_commands()) |
3088 | |
3089 | - if track_cgroups: |
3090 | - for cgroup in cgroups: |
3091 | - CGroupsTelemetry.track_cgroup(cgroup) |
3092 | + for process in agent_cgroup: |
3093 | + agent_cgroup_proc_names.append(self.__format_process(process)) |
3094 | + # Note that the agent uses systemd-run to start extensions; systemd-run belongs to the agent cgroup, though the extensions don't. |
3095 | + if process in (daemon, extension_handler) or process in systemd_run_commands: |
3096 | + continue |
3097 | + # check shell systemd_run process if above process check didn't catch it |
3098 | + if self._check_systemd_run_process(process): |
3099 | + continue |
3100 | + # systemd_run_commands contains the shell that started systemd-run, so we also need to check for the parent |
3101 | + if self._get_parent(process) in systemd_run_commands and self._get_command( |
3102 | + process) == 'systemd-run': |
3103 | + continue |
3104 | + # check if the process is a command started by the agent or a descendant of one of those commands |
3105 | + current = process |
3106 | + while current != 0 and current not in agent_commands: |
3107 | + current = self._get_parent(current) |
3108 | + # Verify if Process started by agent based on the marker found in process environment or process is in Zombie state. |
3109 | + # If so, consider it as valid process in agent cgroup. |
3110 | + if current == 0 and not (self.__is_process_descendant_of_the_agent(process) or self.__is_zombie_process(process)): |
3111 | + unexpected.append(self.__format_process(process)) |
3112 | + if len(unexpected) >= 5: # collect just a small sample |
3113 | + break |
3114 | + except Exception as exception: |
3115 | + _log_cgroup_warning("Error checking the processes in the agent's cgroup: {0}".format(ustr(exception))) |
3116 | |
3117 | - return cgroups |
3118 | + if len(unexpected) > 0: |
3119 | + self._report_agent_cgroups_procs(agent_cgroup_proc_names, unexpected) |
3120 | + raise CGroupsException("The agent's cgroup includes unexpected processes: {0}".format(unexpected)) |
3121 | |
3122 | - self._invoke_cgroup_operation(__impl, "Failed to create a cgroup for the VM Agent; resource usage for the Agent will not be tracked.") |
3123 | + @staticmethod |
3124 | + def _get_command(pid): |
3125 | + try: |
3126 | + with open('/proc/{0}/comm'.format(pid), "r") as file_: |
3127 | + comm = file_.read() |
3128 | + if comm and comm[-1] == '\x00': # if null-terminated, remove the null |
3129 | + comm = comm[:-1] |
3130 | + return comm.rstrip() |
3131 | + except Exception: |
3132 | + return "UNKNOWN" |
3133 | |
3134 | - def cleanup_legacy_cgroups(self): |
3135 | - def __impl(): |
3136 | - self._cgroups_api.cleanup_legacy_cgroups() |
3137 | + @staticmethod |
3138 | + def __format_process(pid): |
3139 | + """ |
3140 | + Formats the given PID as a string containing the PID and the corresponding command line truncated to 64 chars |
3141 | + """ |
3142 | + try: |
3143 | + cmdline = '/proc/{0}/cmdline'.format(pid) |
3144 | + if os.path.exists(cmdline): |
3145 | + with open(cmdline, "r") as cmdline_file: |
3146 | + return "[PID: {0}] {1:64.64}".format(pid, cmdline_file.read()) |
3147 | + except Exception: |
3148 | + pass |
3149 | + return "[PID: {0}] UNKNOWN".format(pid) |
3150 | |
3151 | - message = 'Failed to process legacy cgroups. Collection of resource usage data will be disabled.' |
3152 | + @staticmethod |
3153 | + def __is_process_descendant_of_the_agent(pid): |
3154 | + """ |
3155 | + Returns True if the process is descendant of the agent by looking at the env flag(AZURE_GUEST_AGENT_PARENT_PROCESS_NAME) |
3156 | + that we set when the process starts otherwise False. |
3157 | + """ |
3158 | + try: |
3159 | + env = '/proc/{0}/environ'.format(pid) |
3160 | + if os.path.exists(env): |
3161 | + with open(env, "r") as env_file: |
3162 | + environ = env_file.read() |
3163 | + if environ and environ[-1] == '\x00': |
3164 | + environ = environ[:-1] |
3165 | + return "{0}={1}".format(shellutil.PARENT_PROCESS_NAME, shellutil.AZURE_GUEST_AGENT) in environ |
3166 | + except Exception: |
3167 | + pass |
3168 | + return False |
3169 | |
3170 | - def disable_cgroups(exception): |
3171 | - self.disable() |
3172 | - add_event( |
3173 | - AGENT_NAME, |
3174 | - version=CURRENT_VERSION, |
3175 | - op=WALAEventOperation.CGroupsCleanUp, |
3176 | - is_success=False, |
3177 | - log_event=False, |
3178 | - message='{0} {1}'.format(message, ustr(exception))) |
3179 | + @staticmethod |
3180 | + def __is_zombie_process(pid): |
3181 | + """ |
3182 | + Returns True if process is in Zombie state otherwise False. |
3183 | |
3184 | - self._invoke_cgroup_operation(__impl, message, on_error=disable_cgroups) |
3185 | + Ex: cat /proc/18171/stat |
3186 | + 18171 (python3) S 18103 18103 18103 0 -1 4194624 57736 64902 0 3 |
3187 | + """ |
3188 | + try: |
3189 | + stat = '/proc/{0}/stat'.format(pid) |
3190 | + if os.path.exists(stat): |
3191 | + with open(stat, "r") as stat_file: |
3192 | + return stat_file.read().split()[2] == 'Z' |
3193 | + except Exception: |
3194 | + pass |
3195 | + return False |
3196 | |
3197 | - def create_extension_cgroups_root(self): |
3198 | + @staticmethod |
3199 | + def _check_systemd_run_process(process): |
3200 | """ |
3201 | - Creates the container (directory/cgroup) that includes the cgroups for all extensions (/sys/fs/cgroup/*/walinuxagent.extensions) |
3202 | + Returns True if process is shell systemd-run process started by agent otherwise False. |
3203 | + |
3204 | + Ex: sh,7345 -c systemd-run --unit=enable_7c5cab19-eb79-4661-95d9-9e5091bd5ae0 --scope --slice=azure-vmextensions-Microsoft.OSTCExtensions.VMAccessForLinux_1.5.11.slice /var/lib/waagent/Microsoft.OSTCExtensions.VMAccessForLinux-1.5.11/processes.sh |
3205 | """ |
3206 | - def __impl(): |
3207 | - self._cgroups_api.create_extension_cgroups_root() |
3208 | + try: |
3209 | + process_name = "UNKNOWN" |
3210 | + cmdline = '/proc/{0}/cmdline'.format(process) |
3211 | + if os.path.exists(cmdline): |
3212 | + with open(cmdline, "r") as cmdline_file: |
3213 | + process_name = "{0}".format(cmdline_file.read()) |
3214 | + match = re.search(r'systemd-run.*--unit=.*--scope.*--slice=azure-vmextensions.*', process_name) |
3215 | + if match is not None: |
3216 | + return True |
3217 | + except Exception: |
3218 | + pass |
3219 | + return False |
3220 | + |
3221 | + @staticmethod |
3222 | + def _report_agent_cgroups_procs(agent_cgroup_proc_names, unexpected): |
3223 | + for proc_name in unexpected: |
3224 | + if 'UNKNOWN' in proc_name: |
3225 | + msg = "Agent includes following processes when UNKNOWN process found: {0}".format("\n".join([ustr(proc) for proc in agent_cgroup_proc_names])) |
3226 | + add_event(op=WALAEventOperation.CGroupsInfo, message=msg) |
3227 | |
3228 | - self._invoke_cgroup_operation(__impl, "Failed to create a root cgroup for extensions; resource usage for extensions will not be tracked.") |
3229 | + @staticmethod |
3230 | + def _check_agent_throttled_time(cgroup_metrics): |
3231 | + for metric in cgroup_metrics: |
3232 | + if metric.instance == AGENT_NAME_TELEMETRY and metric.counter == MetricsCounter.THROTTLED_TIME: |
3233 | + if metric.value > conf.get_agent_cpu_throttled_time_threshold(): |
3234 | + raise CGroupsException("The agent has been throttled for {0} seconds".format(metric.value)) |
3235 | |
3236 | - def create_extension_cgroups(self, name): |
3237 | + def check_agent_memory_usage(self): |
3238 | + if self.enabled() and self._agent_memory_cgroup: |
3239 | + metrics = self._agent_memory_cgroup.get_tracked_metrics() |
3240 | + current_usage = 0 |
3241 | + for metric in metrics: |
3242 | + if metric.counter == MetricsCounter.TOTAL_MEM_USAGE: |
3243 | + current_usage += metric.value |
3244 | + elif metric.counter == MetricsCounter.SWAP_MEM_USAGE: |
3245 | + current_usage += metric.value |
3246 | + |
3247 | + if current_usage > conf.get_agent_memory_quota(): |
3248 | + raise AgentMemoryExceededException("The agent memory limit {0} bytes exceeded. The current reported usage is {1} bytes.".format(conf.get_agent_memory_quota(), current_usage)) |
3249 | + |
3250 | + @staticmethod |
3251 | + def _get_parent(pid): |
3252 | """ |
3253 | - Creates and returns the cgroups for the given extension |
3254 | + Returns the parent of the given process. If the parent cannot be determined returns 0 (which is the PID for the scheduler) |
3255 | """ |
3256 | - def __impl(): |
3257 | - return self._cgroups_api.create_extension_cgroups(name) |
3258 | + try: |
3259 | + stat = '/proc/{0}/stat'.format(pid) |
3260 | + if os.path.exists(stat): |
3261 | + with open(stat, "r") as stat_file: |
3262 | + return int(stat_file.read().split()[3]) |
3263 | + except Exception: |
3264 | + pass |
3265 | + return 0 |
3266 | |
3267 | - return self._invoke_cgroup_operation(__impl, "Failed to create a cgroup for extension '{0}'; resource usage will not be tracked.".format(name)) |
3268 | + def start_tracking_unit_cgroups(self, unit_name): |
3269 | + """ |
3270 | + TODO: Start tracking Memory Cgroups |
3271 | + """ |
3272 | + try: |
3273 | + cpu_cgroup_path, memory_cgroup_path = self._cgroups_api.get_unit_cgroup_paths(unit_name) |
3274 | + |
3275 | + if cpu_cgroup_path is None: |
3276 | + logger.info("The CPU controller is not mounted; will not track resource usage") |
3277 | + else: |
3278 | + CGroupsTelemetry.track_cgroup(CpuCgroup(unit_name, cpu_cgroup_path)) |
3279 | + |
3280 | + if memory_cgroup_path is None: |
3281 | + logger.info("The Memory controller is not mounted; will not track resource usage") |
3282 | + else: |
3283 | + CGroupsTelemetry.track_cgroup(MemoryCgroup(unit_name, memory_cgroup_path)) |
3284 | + |
3285 | + except Exception as exception: |
3286 | + logger.info("Failed to start tracking resource usage for the extension: {0}", ustr(exception)) |
3287 | |
3288 | - def remove_extension_cgroups(self, name): |
3289 | + def stop_tracking_unit_cgroups(self, unit_name): |
3290 | """ |
3291 | - Deletes the cgroup for the given extension |
3292 | + TODO: remove Memory cgroups from tracked list. |
3293 | """ |
3294 | - def __impl(): |
3295 | - cgroups = self._cgroups_api.remove_extension_cgroups(name) |
3296 | - return cgroups |
3297 | + try: |
3298 | + cpu_cgroup_path, memory_cgroup_path = self._cgroups_api.get_unit_cgroup_paths(unit_name) |
3299 | + |
3300 | + if cpu_cgroup_path is not None: |
3301 | + CGroupsTelemetry.stop_tracking(CpuCgroup(unit_name, cpu_cgroup_path)) |
3302 | + |
3303 | + if memory_cgroup_path is not None: |
3304 | + CGroupsTelemetry.stop_tracking(MemoryCgroup(unit_name, memory_cgroup_path)) |
3305 | |
3306 | - self._invoke_cgroup_operation(__impl, "Failed to delete cgroups for extension '{0}'.".format(name)) |
3307 | + except Exception as exception: |
3308 | + logger.info("Failed to stop tracking resource usage for the extension service: {0}", ustr(exception)) |
3309 | |
3310 | - def start_extension_command(self, extension_name, command, timeout, shell, cwd, env, stdout, stderr, |
3311 | + def stop_tracking_extension_cgroups(self, extension_name): |
3312 | + """ |
3313 | + TODO: remove extension Memory cgroups from tracked list |
3314 | + """ |
3315 | + try: |
3316 | + extension_slice_name = SystemdCgroupsApi.get_extension_slice_name(extension_name) |
3317 | + cgroup_relative_path = os.path.join(_AZURE_VMEXTENSIONS_SLICE, |
3318 | + extension_slice_name) |
3319 | + |
3320 | + cpu_cgroup_mountpoint, memory_cgroup_mountpoint = self._cgroups_api.get_cgroup_mount_points() |
3321 | + cpu_cgroup_path = os.path.join(cpu_cgroup_mountpoint, cgroup_relative_path) |
3322 | + memory_cgroup_path = os.path.join(memory_cgroup_mountpoint, cgroup_relative_path) |
3323 | + |
3324 | + if cpu_cgroup_path is not None: |
3325 | + CGroupsTelemetry.stop_tracking(CpuCgroup(extension_name, cpu_cgroup_path)) |
3326 | + |
3327 | + if memory_cgroup_path is not None: |
3328 | + CGroupsTelemetry.stop_tracking(MemoryCgroup(extension_name, memory_cgroup_path)) |
3329 | + |
3330 | + except Exception as exception: |
3331 | + logger.info("Failed to stop tracking resource usage for the extension service: {0}", ustr(exception)) |
3332 | + |
3333 | + def start_extension_command(self, extension_name, command, cmd_name, timeout, shell, cwd, env, stdout, stderr, |
3334 | error_code=ExtensionErrorCodes.PluginUnknownFailure): |
3335 | """ |
3336 | Starts a command (install/enable/etc) for an extension and adds the command's PID to the extension's cgroup |
3337 | :param extension_name: The extension executing the command |
3338 | :param command: The command to invoke |
3339 | + :param cmd_name: The type of the command(enable, install, etc.) |
3340 | :param timeout: Number of seconds to wait for command completion |
3341 | :param cwd: The working directory for the command |
3342 | :param env: The environment to pass to the command's process |
3343 | @@ -172,39 +886,207 @@ class CGroupConfigurator(object): |
3344 | :param stderr: File object to redirect stderr to |
3345 | :param error_code: Extension error code to raise in case of error |
3346 | """ |
3347 | - if not self.enabled(): |
3348 | - process = subprocess.Popen(command, |
3349 | - shell=shell, |
3350 | - cwd=cwd, |
3351 | - env=env, |
3352 | - stdout=stdout, |
3353 | - stderr=stderr, |
3354 | - preexec_fn=os.setsid) |
3355 | - |
3356 | - process_output = handle_process_completion(process=process, |
3357 | - command=command, |
3358 | - timeout=timeout, |
3359 | - stdout=stdout, |
3360 | - stderr=stderr, |
3361 | - error_code=error_code) |
3362 | - else: |
3363 | - extension_cgroups, process_output = self._cgroups_api.start_extension_command(extension_name, |
3364 | - command, |
3365 | - timeout, |
3366 | - shell=shell, |
3367 | - cwd=cwd, |
3368 | - env=env, |
3369 | - stdout=stdout, |
3370 | - stderr=stderr, |
3371 | - error_code=error_code) |
3372 | - |
3373 | - return process_output |
3374 | - |
3375 | - # unique instance for the singleton (TODO: find a better pattern for a singleton) |
3376 | + if self.enabled(): |
3377 | + try: |
3378 | + return self._cgroups_api.start_extension_command(extension_name, command, cmd_name, timeout, |
3379 | + shell=shell, cwd=cwd, env=env, stdout=stdout, |
3380 | + stderr=stderr, error_code=error_code) |
3381 | + except SystemdRunError as exception: |
3382 | + reason = 'Failed to start {0} using systemd-run, will try invoking the extension directly. Error: {1}'.format( |
3383 | + extension_name, ustr(exception)) |
3384 | + self.disable(reason, DisableCgroups.ALL) |
3385 | + # fall-through and re-invoke the extension |
3386 | + |
3387 | + # subprocess-popen-preexec-fn<W1509> Disabled: code is not multi-threaded |
3388 | + process = subprocess.Popen(command, shell=shell, cwd=cwd, env=env, stdout=stdout, stderr=stderr, preexec_fn=os.setsid) # pylint: disable=W1509 |
3389 | + return handle_process_completion(process=process, command=command, timeout=timeout, stdout=stdout, stderr=stderr, error_code=error_code) |
3390 | + |
3391 | + def __reset_extension_cpu_quota(self, extension_name): |
3392 | + """ |
3393 | + Removes any CPUQuota on the extension |
3394 | + |
3395 | + NOTE: This resets the quota on the extension's slice; any local overrides on the VM will take precedence |
3396 | + over this setting. |
3397 | + """ |
3398 | + if self.enabled(): |
3399 | + self.setup_extension_slice(extension_name, cpu_quota=None) |
3400 | + |
3401 | + def setup_extension_slice(self, extension_name, cpu_quota): |
3402 | + """ |
3403 | + Each extension runs under its own slice (Ex "Microsoft.CPlat.Extension.slice"). All the slices for |
3404 | + extensions are grouped under "azure-vmextensions.slice. |
3405 | + |
3406 | + This method ensures that the extension slice is created. Setup should create |
3407 | + under /lib/systemd/system if it is not exist. |
3408 | + TODO: set memory quotas |
3409 | + """ |
3410 | + if self.enabled(): |
3411 | + unit_file_install_path = systemd.get_unit_file_install_path() |
3412 | + extension_slice_path = os.path.join(unit_file_install_path, |
3413 | + SystemdCgroupsApi.get_extension_slice_name(extension_name)) |
3414 | + try: |
3415 | + cpu_quota = str(cpu_quota) + "%" if cpu_quota is not None else "" # setting an empty value resets to the default (infinity) |
3416 | + if cpu_quota == "": |
3417 | + _log_cgroup_info("CPUQuota not set for {0}", extension_name) |
3418 | + else: |
3419 | + _log_cgroup_info("Ensuring the {0}'s CPUQuota is {1}", extension_name, cpu_quota) |
3420 | + slice_contents = _EXTENSION_SLICE_CONTENTS.format(extension_name=extension_name, |
3421 | + cpu_quota=cpu_quota) |
3422 | + CGroupConfigurator._Impl.__create_unit_file(extension_slice_path, slice_contents) |
3423 | + except Exception as exception: |
3424 | + _log_cgroup_warning("Failed to set the extension {0} slice and quotas: {1}", extension_name, |
3425 | + ustr(exception)) |
3426 | + CGroupConfigurator._Impl.__cleanup_unit_file(extension_slice_path) |
3427 | + |
3428 | + def remove_extension_slice(self, extension_name): |
3429 | + """ |
3430 | + This method ensures that the extension slice gets removed from /lib/systemd/system if it exist |
3431 | + Lastly stop the unit. This would ensure the cleanup the /sys/fs/cgroup controller paths |
3432 | + """ |
3433 | + if self.enabled(): |
3434 | + unit_file_install_path = systemd.get_unit_file_install_path() |
3435 | + extension_slice_name = SystemdCgroupsApi.get_extension_slice_name(extension_name) |
3436 | + extension_slice_path = os.path.join(unit_file_install_path, extension_slice_name) |
3437 | + if os.path.exists(extension_slice_path): |
3438 | + self.stop_tracking_extension_cgroups(extension_name) |
3439 | + CGroupConfigurator._Impl.__cleanup_unit_file(extension_slice_path) |
3440 | + |
3441 | + def set_extension_services_cpu_memory_quota(self, services_list): |
3442 | + """ |
3443 | + Each extension service will have name, systemd path and it's quotas. |
3444 | + This method ensures that drop-in files are created under service.d folder if quotas given. |
3445 | + ex: /lib/systemd/system/extension.service.d/11-CPUAccounting.conf |
3446 | + TODO: set memory quotas |
3447 | + """ |
3448 | + if self.enabled() and services_list is not None: |
3449 | + for service in services_list: |
3450 | + service_name = service.get('name', None) |
3451 | + unit_file_path = systemd.get_unit_file_install_path() |
3452 | + if service_name is not None and unit_file_path is not None: |
3453 | + files_to_create = [] |
3454 | + drop_in_path = os.path.join(unit_file_path, "{0}.d".format(service_name)) |
3455 | + drop_in_file_cpu_accounting = os.path.join(drop_in_path, |
3456 | + _DROP_IN_FILE_CPU_ACCOUNTING) |
3457 | + files_to_create.append((drop_in_file_cpu_accounting, _DROP_IN_FILE_CPU_ACCOUNTING_CONTENTS)) |
3458 | + drop_in_file_memory_accounting = os.path.join(drop_in_path, |
3459 | + _DROP_IN_FILE_MEMORY_ACCOUNTING) |
3460 | + files_to_create.append( |
3461 | + (drop_in_file_memory_accounting, _DROP_IN_FILE_MEMORY_ACCOUNTING_CONTENTS)) |
3462 | + |
3463 | + cpu_quota = service.get('cpuQuotaPercentage', None) |
3464 | + if cpu_quota is not None: |
3465 | + cpu_quota = str(cpu_quota) + "%" |
3466 | + _log_cgroup_info("Ensuring the {0}'s CPUQuota is {1}", service_name, cpu_quota) |
3467 | + drop_in_file_cpu_quota = os.path.join(drop_in_path, _DROP_IN_FILE_CPU_QUOTA) |
3468 | + cpu_quota_contents = _DROP_IN_FILE_CPU_QUOTA_CONTENTS_FORMAT.format(cpu_quota) |
3469 | + files_to_create.append((drop_in_file_cpu_quota, cpu_quota_contents)) |
3470 | + |
3471 | + self.__create_all_files(files_to_create) |
3472 | + self.__reload_systemd_config() |
3473 | + |
3474 | + def __reset_extension_services_cpu_quota(self, services_list): |
3475 | + """ |
3476 | + Removes any CPUQuota on the extension service |
3477 | + |
3478 | + NOTE: This resets the quota on the extension service's default dropin file; any local overrides on the VM will take precedence |
3479 | + over this setting. |
3480 | + """ |
3481 | + if self.enabled() and services_list is not None: |
3482 | + service_name = None |
3483 | + try: |
3484 | + for service in services_list: |
3485 | + service_name = service.get('name', None) |
3486 | + unit_file_path = systemd.get_unit_file_install_path() |
3487 | + if service_name is not None and unit_file_path is not None: |
3488 | + files_to_create = [] |
3489 | + drop_in_path = os.path.join(unit_file_path, "{0}.d".format(service_name)) |
3490 | + cpu_quota = "" # setting an empty value resets to the default (infinity) |
3491 | + drop_in_file_cpu_quota = os.path.join(drop_in_path, _DROP_IN_FILE_CPU_QUOTA) |
3492 | + cpu_quota_contents = _DROP_IN_FILE_CPU_QUOTA_CONTENTS_FORMAT.format(cpu_quota) |
3493 | + if os.path.exists(drop_in_file_cpu_quota): |
3494 | + with open(drop_in_file_cpu_quota, "r") as file_: |
3495 | + if file_.read() == cpu_quota_contents: |
3496 | + return |
3497 | + files_to_create.append((drop_in_file_cpu_quota, cpu_quota_contents)) |
3498 | + self.__create_all_files(files_to_create) |
3499 | + except Exception as exception: |
3500 | + _log_cgroup_warning('Failed to reset CPUQuota for {0} : {1}', service_name, ustr(exception)) |
3501 | + |
3502 | + def remove_extension_services_drop_in_files(self, services_list): |
3503 | + """ |
3504 | + Remove the dropin files from service .d folder for the given service |
3505 | + """ |
3506 | + if services_list is not None: |
3507 | + for service in services_list: |
3508 | + service_name = service.get('name', None) |
3509 | + unit_file_path = systemd.get_unit_file_install_path() |
3510 | + if service_name is not None and unit_file_path is not None: |
3511 | + files_to_cleanup = [] |
3512 | + drop_in_path = os.path.join(unit_file_path, "{0}.d".format(service_name)) |
3513 | + drop_in_file_cpu_accounting = os.path.join(drop_in_path, |
3514 | + _DROP_IN_FILE_CPU_ACCOUNTING) |
3515 | + files_to_cleanup.append(drop_in_file_cpu_accounting) |
3516 | + drop_in_file_memory_accounting = os.path.join(drop_in_path, |
3517 | + _DROP_IN_FILE_MEMORY_ACCOUNTING) |
3518 | + files_to_cleanup.append(drop_in_file_memory_accounting) |
3519 | + cpu_quota = service.get('cpuQuotaPercentage', None) |
3520 | + if cpu_quota is not None: |
3521 | + drop_in_file_cpu_quota = os.path.join(drop_in_path, _DROP_IN_FILE_CPU_QUOTA) |
3522 | + files_to_cleanup.append(drop_in_file_cpu_quota) |
3523 | + |
3524 | + CGroupConfigurator._Impl.__cleanup_all_files(files_to_cleanup) |
3525 | + _log_cgroup_info("Drop in files removed for {0}".format(service_name)) |
3526 | + |
3527 | + def stop_tracking_extension_services_cgroups(self, services_list): |
3528 | + """ |
3529 | + Remove the cgroup entry from the tracked groups to stop tracking. |
3530 | + """ |
3531 | + if self.enabled() and services_list is not None: |
3532 | + for service in services_list: |
3533 | + service_name = service.get('name', None) |
3534 | + if service_name is not None: |
3535 | + self.stop_tracking_unit_cgroups(service_name) |
3536 | + |
3537 | + def start_tracking_extension_services_cgroups(self, services_list): |
3538 | + """ |
3539 | + Add the cgroup entry to start tracking the services cgroups. |
3540 | + """ |
3541 | + if self.enabled() and services_list is not None: |
3542 | + for service in services_list: |
3543 | + service_name = service.get('name', None) |
3544 | + if service_name is not None: |
3545 | + self.start_tracking_unit_cgroups(service_name) |
3546 | + |
3547 | + @staticmethod |
3548 | + def get_extension_services_list(): |
3549 | + """ |
3550 | + ResourceLimits for extensions are coming from <extName>/HandlerManifest.json file. |
3551 | + Use this pattern to determine all the installed extension HandlerManifest files and |
3552 | + read the extension services if ResourceLimits are present. |
3553 | + """ |
3554 | + extensions_services = {} |
3555 | + for manifest_path in glob.iglob(os.path.join(conf.get_lib_dir(), "*/HandlerManifest.json")): |
3556 | + match = re.search("(?P<extname>[\\w+\\.-]+).HandlerManifest\\.json", manifest_path) |
3557 | + if match is not None: |
3558 | + extensions_name = match.group('extname') |
3559 | + if not extensions_name.startswith('WALinuxAgent'): |
3560 | + try: |
3561 | + data = json.loads(fileutil.read_file(manifest_path)) |
3562 | + resource_limits = data[0].get('resourceLimits', None) |
3563 | + services = resource_limits.get('services') if resource_limits else None |
3564 | + extensions_services[extensions_name] = services |
3565 | + except (IOError, OSError) as e: |
3566 | + _log_cgroup_warning( |
3567 | + 'Failed to load manifest file ({0}): {1}'.format(manifest_path, e.strerror)) |
3568 | + except ValueError: |
3569 | + _log_cgroup_warning('Malformed manifest file ({0}).'.format(manifest_path)) |
3570 | + return extensions_services |
3571 | + |
3572 | + # unique instance for the singleton |
3573 | _instance = None |
3574 | |
3575 | @staticmethod |
3576 | def get_instance(): |
3577 | if CGroupConfigurator._instance is None: |
3578 | - CGroupConfigurator._instance = CGroupConfigurator.__impl() |
3579 | + CGroupConfigurator._instance = CGroupConfigurator._Impl() |
3580 | return CGroupConfigurator._instance |
3581 | diff --git a/azurelinuxagent/common/cgroupstelemetry.py b/azurelinuxagent/common/cgroupstelemetry.py |
3582 | index 4bbcba1..7b6bba0 100644 |
3583 | --- a/azurelinuxagent/common/cgroupstelemetry.py |
3584 | +++ b/azurelinuxagent/common/cgroupstelemetry.py |
3585 | @@ -15,101 +15,26 @@ |
3586 | # Requires Python 2.6+ and Openssl 1.0+ |
3587 | import errno |
3588 | import threading |
3589 | -from collections import namedtuple |
3590 | -from datetime import datetime as dt |
3591 | |
3592 | from azurelinuxagent.common import logger |
3593 | -from azurelinuxagent.common.cgroup import CpuCgroup, CGroupContollers |
3594 | -from azurelinuxagent.common.exception import CGroupsException |
3595 | +from azurelinuxagent.common.cgroup import CpuCgroup |
3596 | from azurelinuxagent.common.future import ustr |
3597 | -from azurelinuxagent.common.logger import EVERY_SIX_HOURS |
3598 | -from azurelinuxagent.common.resourceusage import MemoryResourceUsage, ProcessInfo |
3599 | - |
3600 | -MetricValue = namedtuple('Metric', ['category', 'counter', 'instance', 'value']) |
3601 | -StatmMetricValue = namedtuple('StatmMetricValue', ['pid_name_cmdline', 'resource_metric']) |
3602 | - |
3603 | -DELIM = " | " |
3604 | -DEFAULT_PROCESS_NAME = "NO_PROCESS_FOUND" |
3605 | -DEFAULT_PROCESS_COMMANDLINE = "NO_CMDLINE_FOUND" |
3606 | - |
3607 | - |
3608 | -class MetricsCategory(object): |
3609 | - MEMORY_CATEGORY = "Memory" |
3610 | - PROCESS_CATEGORY = "Process" |
3611 | - |
3612 | - |
3613 | -class MetricsCounter(object): |
3614 | - PROCESSOR_PERCENT_TIME = "% Processor Time" |
3615 | - TOTAL_MEM_USAGE = "Total Memory Usage" |
3616 | - MAX_MEM_USAGE = "Max Memory Usage" |
3617 | - MEM_USED_BY_PROCESS = "Memory Used by Process" |
3618 | |
3619 | |
3620 | class CGroupsTelemetry(object): |
3621 | """ |
3622 | """ |
3623 | - _tracked = [] |
3624 | - _cgroup_metrics = {} |
3625 | + _tracked = {} |
3626 | + _track_throttled_time = False |
3627 | _rlock = threading.RLock() |
3628 | |
3629 | @staticmethod |
3630 | - def get_process_info_summary(process_id): |
3631 | - process_cmdline = DEFAULT_PROCESS_COMMANDLINE |
3632 | - process_name = DEFAULT_PROCESS_NAME |
3633 | - |
3634 | - # The ProcessName and ProcessCommandLine can generate Exception if the file /proc/<pid>/{comm,cmdline} cease to |
3635 | - # exist; eg: the process can die, or finish. Which is why we need Default Names, in case we fail to fetch the |
3636 | - # details from those files. |
3637 | - try: |
3638 | - process_cmdline = ProcessInfo.get_proc_cmdline(process_id) if not None else DEFAULT_PROCESS_COMMANDLINE |
3639 | - except Exception as e: |
3640 | - logger.periodic_info(EVERY_SIX_HOURS, "[PERIODIC] {0}", ustr(e)) |
3641 | - |
3642 | - try: |
3643 | - process_name = ProcessInfo.get_proc_name(process_id) if not None else DEFAULT_PROCESS_NAME |
3644 | - except Exception as e: |
3645 | - logger.periodic_info(EVERY_SIX_HOURS, "[PERIODIC] {0}", ustr(e)) |
3646 | - |
3647 | - return process_id + DELIM + process_name + DELIM + process_cmdline |
3648 | + def set_track_throttled_time(value): |
3649 | + CGroupsTelemetry._track_throttled_time = value |
3650 | |
3651 | @staticmethod |
3652 | - def _get_metrics_list(metric): |
3653 | - return [metric.average(), metric.min(), metric.max(), metric.median(), metric.count(), |
3654 | - metric.first_poll_time(), metric.last_poll_time()] |
3655 | - |
3656 | - @staticmethod |
3657 | - def _process_cgroup_metric(cgroup_metrics): |
3658 | - memory_usage = cgroup_metrics.get_memory_metrics() |
3659 | - max_memory_usage = cgroup_metrics.get_max_memory_metrics() |
3660 | - cpu_usage = cgroup_metrics.get_cpu_metrics() |
3661 | - memory_usage_per_process = cgroup_metrics.get_proc_statm_memory_metrics() |
3662 | - |
3663 | - processed_extension = {} |
3664 | - |
3665 | - if cpu_usage.count() > 0: |
3666 | - processed_extension["cpu"] = {"cur_cpu": CGroupsTelemetry._get_metrics_list(cpu_usage)} |
3667 | - |
3668 | - if memory_usage.count() > 0: |
3669 | - if "memory" in processed_extension: |
3670 | - processed_extension["memory"]["cur_mem"] = CGroupsTelemetry._get_metrics_list(memory_usage) |
3671 | - else: |
3672 | - processed_extension["memory"] = {"cur_mem": CGroupsTelemetry._get_metrics_list(memory_usage)} |
3673 | - |
3674 | - if max_memory_usage.count() > 0: |
3675 | - if "memory" in processed_extension: |
3676 | - processed_extension["memory"]["max_mem"] = CGroupsTelemetry._get_metrics_list(max_memory_usage) |
3677 | - else: |
3678 | - processed_extension["memory"] = {"max_mem": CGroupsTelemetry._get_metrics_list(max_memory_usage)} |
3679 | - |
3680 | - for pid_process_memory in memory_usage_per_process: |
3681 | - if "proc_statm_memory" in processed_extension: |
3682 | - processed_extension["proc_statm_memory"][pid_process_memory.pid_name_cmdline] = \ |
3683 | - CGroupsTelemetry._get_metrics_list(pid_process_memory.resource_metric) |
3684 | - else: |
3685 | - processed_extension["proc_statm_memory"] = {pid_process_memory.pid_name_cmdline: |
3686 | - CGroupsTelemetry._get_metrics_list(pid_process_memory.resource_metric)} |
3687 | - |
3688 | - return processed_extension |
3689 | + def get_track_throttled_time(): |
3690 | + return CGroupsTelemetry._track_throttled_time |
3691 | |
3692 | @staticmethod |
3693 | def track_cgroup(cgroup): |
3694 | @@ -122,221 +47,56 @@ class CGroupsTelemetry(object): |
3695 | |
3696 | with CGroupsTelemetry._rlock: |
3697 | if not CGroupsTelemetry.is_tracked(cgroup.path): |
3698 | - CGroupsTelemetry._tracked.append(cgroup) |
3699 | - logger.info("Started tracking new cgroup: {0}, path: {1}".format(cgroup.name, cgroup.path)) |
3700 | + CGroupsTelemetry._tracked[cgroup.path] = cgroup |
3701 | + logger.info("Started tracking cgroup {0}", cgroup) |
3702 | |
3703 | @staticmethod |
3704 | def is_tracked(path): |
3705 | """ |
3706 | Returns true if the given item is in the list of tracked items |
3707 | - O(n) operation. But limited to few cgroup objects we have. |
3708 | + O(1) operation. |
3709 | """ |
3710 | with CGroupsTelemetry._rlock: |
3711 | - for cgroup in CGroupsTelemetry._tracked: |
3712 | - if path == cgroup.path: |
3713 | - return True |
3714 | + if path in CGroupsTelemetry._tracked: |
3715 | + return True |
3716 | |
3717 | return False |
3718 | |
3719 | @staticmethod |
3720 | def stop_tracking(cgroup): |
3721 | """ |
3722 | - Stop tracking the cgroups for the given name |
3723 | + Stop tracking the cgroups for the given path |
3724 | """ |
3725 | with CGroupsTelemetry._rlock: |
3726 | - CGroupsTelemetry._tracked.remove(cgroup) |
3727 | - logger.info("Stopped tracking cgroup: {0}, path: {1}".format(cgroup.name, cgroup.path)) |
3728 | - |
3729 | - @staticmethod |
3730 | - def report_all_tracked(): |
3731 | - """ |
3732 | - The report_all_tracked's purpose is to collect the data from the tracked cgroups and process the metric into a |
3733 | - data structure by _process_cgroup_metric. The perf metric is added into the data structure and returned to the |
3734 | - caller. |
3735 | - |
3736 | - The report_all_tracked would be removed soon - in favor of sending report_metric directly, when polling the data |
3737 | - from tracked groups. |
3738 | - |
3739 | - :return collected_metrics: dictionary of cgroups metrics. |
3740 | - """ |
3741 | - collected_metrics = {} |
3742 | - |
3743 | - for name, cgroup_metrics in CGroupsTelemetry._cgroup_metrics.items(): |
3744 | - perf_metric = CGroupsTelemetry._process_cgroup_metric(cgroup_metrics) |
3745 | - |
3746 | - if perf_metric: |
3747 | - collected_metrics[name] = perf_metric |
3748 | - |
3749 | - cgroup_metrics.clear() |
3750 | - |
3751 | - # Doing cleanup after the metrics have already been collected. |
3752 | - for key in [key for key in CGroupsTelemetry._cgroup_metrics if |
3753 | - CGroupsTelemetry._cgroup_metrics[key].marked_for_delete]: |
3754 | - del CGroupsTelemetry._cgroup_metrics[key] |
3755 | - |
3756 | - return collected_metrics |
3757 | + if cgroup.path in CGroupsTelemetry._tracked: |
3758 | + CGroupsTelemetry._tracked.pop(cgroup.path) |
3759 | + logger.info("Stopped tracking cgroup {0}", cgroup) |
3760 | |
3761 | @staticmethod |
3762 | def poll_all_tracked(): |
3763 | metrics = [] |
3764 | - |
3765 | + inactive_cgroups = [] |
3766 | with CGroupsTelemetry._rlock: |
3767 | - for cgroup in CGroupsTelemetry._tracked[:]: |
3768 | - if cgroup.name not in CGroupsTelemetry._cgroup_metrics: |
3769 | - CGroupsTelemetry._cgroup_metrics[cgroup.name] = CgroupMetrics() |
3770 | + for cgroup in CGroupsTelemetry._tracked.values(): |
3771 | try: |
3772 | - if cgroup.controller == CGroupContollers.CPU: |
3773 | - current_cpu_usage = cgroup.get_cpu_usage() |
3774 | - CGroupsTelemetry._cgroup_metrics[cgroup.name].add_cpu_usage(current_cpu_usage) |
3775 | - metrics.append(MetricValue(MetricsCategory.PROCESS_CATEGORY, MetricsCounter. |
3776 | - PROCESSOR_PERCENT_TIME, cgroup.name, current_cpu_usage)) |
3777 | - elif cgroup.controller == CGroupContollers.MEMORY: |
3778 | - current_memory_usage = cgroup.get_memory_usage() |
3779 | - CGroupsTelemetry._cgroup_metrics[cgroup.name].add_memory_usage(current_memory_usage) |
3780 | - metrics.append(MetricValue(MetricsCategory.MEMORY_CATEGORY, MetricsCounter. |
3781 | - TOTAL_MEM_USAGE, cgroup.name, current_memory_usage)) |
3782 | - |
3783 | - max_memory_usage = cgroup.get_max_memory_usage() |
3784 | - CGroupsTelemetry._cgroup_metrics[cgroup.name].add_max_memory_usage(max_memory_usage) |
3785 | - metrics.append(MetricValue(MetricsCategory.MEMORY_CATEGORY, MetricsCounter.MAX_MEM_USAGE, |
3786 | - cgroup.name, max_memory_usage)) |
3787 | - |
3788 | - pids = cgroup.get_tracked_processes() |
3789 | - for pid in pids: |
3790 | - try: |
3791 | - mem_usage_from_procstatm = MemoryResourceUsage.get_memory_usage_from_proc_statm(pid) |
3792 | - metrics.append(MetricValue(MetricsCategory.MEMORY_CATEGORY, MetricsCounter. |
3793 | - MEM_USED_BY_PROCESS, CGroupsTelemetry.get_process_info_summary(pid), |
3794 | - mem_usage_from_procstatm)) |
3795 | - CGroupsTelemetry._cgroup_metrics[cgroup.name].add_proc_statm_memory( |
3796 | - CGroupsTelemetry.get_process_info_summary(pid), mem_usage_from_procstatm) |
3797 | - except Exception as e: |
3798 | - if not isinstance(e, (IOError, OSError)) or e.errno != errno.ENOENT: |
3799 | - logger.periodic_warn(logger.EVERY_HOUR, "[PERIODIC] Could not collect proc_statm " |
3800 | - "for pid {0}. Error : {1}", pid, ustr(e)) |
3801 | - else: |
3802 | - raise CGroupsException('CGroup controller {0} is not supported for cgroup {1}'.format( |
3803 | - cgroup.controller, cgroup.name)) |
3804 | + metrics.extend(cgroup.get_tracked_metrics(track_throttled_time=CGroupsTelemetry._track_throttled_time)) |
3805 | except Exception as e: |
3806 | # There can be scenarios when the CGroup has been deleted by the time we are fetching the values |
3807 | # from it. This would raise IOError with file entry not found (ERRNO: 2). We do not want to log |
3808 | # every occurrences of such case as it would be very verbose. We do want to log all the other |
3809 | # exceptions which could occur, which is why we do a periodic log for all the other errors. |
3810 | - if not isinstance(e, (IOError, OSError)) or e.errno != errno.ENOENT: |
3811 | + if not isinstance(e, (IOError, OSError)) or e.errno != errno.ENOENT: # pylint: disable=E1101 |
3812 | logger.periodic_warn(logger.EVERY_HOUR, '[PERIODIC] Could not collect metrics for cgroup ' |
3813 | '{0}. Error : {1}'.format(cgroup.name, ustr(e))) |
3814 | if not cgroup.is_active(): |
3815 | - CGroupsTelemetry.stop_tracking(cgroup) |
3816 | - CGroupsTelemetry._cgroup_metrics[cgroup.name].marked_for_delete = True |
3817 | + inactive_cgroups.append(cgroup) |
3818 | + for inactive_cgroup in inactive_cgroups: |
3819 | + CGroupsTelemetry.stop_tracking(inactive_cgroup) |
3820 | |
3821 | return metrics |
3822 | |
3823 | @staticmethod |
3824 | - def prune_all_tracked(): |
3825 | - with CGroupsTelemetry._rlock: |
3826 | - for cgroup in CGroupsTelemetry._tracked[:]: |
3827 | - if not cgroup.is_active(): |
3828 | - CGroupsTelemetry.stop_tracking(cgroup) |
3829 | - |
3830 | - @staticmethod |
3831 | def reset(): |
3832 | with CGroupsTelemetry._rlock: |
3833 | - CGroupsTelemetry._tracked *= 0 # emptying the list |
3834 | - CGroupsTelemetry._cgroup_metrics = {} |
3835 | - |
3836 | - |
3837 | -class CgroupMetrics(object): |
3838 | - def __init__(self): |
3839 | - self._memory_usage = Metric() |
3840 | - self._max_memory_usage = Metric() |
3841 | - self._cpu_usage = Metric() |
3842 | - self._proc_statm_mem = {} |
3843 | - |
3844 | - self.marked_for_delete = False |
3845 | - |
3846 | - def add_memory_usage(self, usage): |
3847 | - if not self.marked_for_delete: |
3848 | - self._memory_usage.append(usage) |
3849 | - |
3850 | - def add_max_memory_usage(self, usage): |
3851 | - if not self.marked_for_delete: |
3852 | - self._max_memory_usage.append(usage) |
3853 | - |
3854 | - def add_cpu_usage(self, usage): |
3855 | - if not self.marked_for_delete: |
3856 | - self._cpu_usage.append(usage) |
3857 | - |
3858 | - def add_proc_statm_memory(self, pid, usage): |
3859 | - if not self.marked_for_delete: |
3860 | - if pid not in self._proc_statm_mem: |
3861 | - self._proc_statm_mem[pid] = Metric() |
3862 | - self._proc_statm_mem[pid].append(usage) |
3863 | - |
3864 | - def get_memory_metrics(self): |
3865 | - return self._memory_usage |
3866 | - |
3867 | - def get_max_memory_metrics(self): |
3868 | - return self._max_memory_usage |
3869 | - |
3870 | - def get_cpu_metrics(self): |
3871 | - return self._cpu_usage |
3872 | - |
3873 | - def get_proc_statm_memory_metrics(self): |
3874 | - """ |
3875 | - :return: StatmMetricValue tuples of pid and metric |
3876 | - """ |
3877 | - return [StatmMetricValue(pid_name_cmdline, metric) for pid_name_cmdline, metric in self._proc_statm_mem.items()] |
3878 | - |
3879 | - def clear(self): |
3880 | - self._memory_usage.clear() |
3881 | - self._max_memory_usage.clear() |
3882 | - self._cpu_usage.clear() |
3883 | - self._proc_statm_mem.clear() |
3884 | - |
3885 | - |
3886 | -class Metric(object): |
3887 | - def __init__(self): |
3888 | - self._data = [] |
3889 | - self._first_poll_time = None |
3890 | - self._last_poll_time = None |
3891 | - |
3892 | - def append(self, data): |
3893 | - if not self._first_poll_time: |
3894 | - # We only want to do it first time. |
3895 | - self._first_poll_time = dt.utcnow() |
3896 | - |
3897 | - self._data.append(data) |
3898 | - self._last_poll_time = dt.utcnow() |
3899 | - |
3900 | - def clear(self): |
3901 | - self._first_poll_time = None |
3902 | - self._last_poll_time = None |
3903 | - self._data *= 0 |
3904 | - |
3905 | - def average(self): |
3906 | - return float(sum(self._data)) / float(len(self._data)) if self._data else None |
3907 | - |
3908 | - def max(self): |
3909 | - return max(self._data) if self._data else None |
3910 | - |
3911 | - def min(self): |
3912 | - return min(self._data) if self._data else None |
3913 | - |
3914 | - def median(self): |
3915 | - data = sorted(self._data) |
3916 | - l_len = len(data) |
3917 | - if l_len < 1: |
3918 | - return None |
3919 | - if l_len % 2 == 0: |
3920 | - return (data[int((l_len - 1) / 2)] + data[int((l_len + 1) / 2)]) / 2.0 |
3921 | - else: |
3922 | - return data[int((l_len - 1) / 2)] |
3923 | - |
3924 | - def count(self): |
3925 | - return len(self._data) |
3926 | - |
3927 | - def first_poll_time(self): |
3928 | - return str(self._first_poll_time) |
3929 | - |
3930 | - def last_poll_time(self): |
3931 | - return str(self._last_poll_time) |
3932 | + CGroupsTelemetry._tracked.clear() # emptying the dictionary |
3933 | + CGroupsTelemetry._track_throttled_time = False |
3934 | diff --git a/azurelinuxagent/common/conf.py b/azurelinuxagent/common/conf.py |
3935 | index bfc61f0..46765ea 100644 |
3936 | --- a/azurelinuxagent/common/conf.py |
3937 | +++ b/azurelinuxagent/common/conf.py |
3938 | @@ -19,11 +19,11 @@ |
3939 | |
3940 | """ |
3941 | Module conf loads and parses configuration file |
3942 | -""" |
3943 | +""" # pylint: disable=W0105 |
3944 | import os |
3945 | import os.path |
3946 | |
3947 | -import azurelinuxagent.common.utils.fileutil as fileutil |
3948 | +from azurelinuxagent.common.utils.fileutil import read_file #pylint: disable=R0401 |
3949 | from azurelinuxagent.common.exception import AgentConfigError |
3950 | |
3951 | DISABLE_AGENT_FILE = 'disable_agent' |
3952 | @@ -49,25 +49,43 @@ class ConfigurationProvider(object): |
3953 | value = parts[1].split('#')[0].strip("\" ").strip() |
3954 | self.values[key] = value if value != "None" else None |
3955 | |
3956 | - def get(self, key, default_val): |
3957 | + @staticmethod |
3958 | + def _get_default(default): |
3959 | + if hasattr(default, '__call__'): |
3960 | + return default() |
3961 | + return default |
3962 | + |
3963 | + def get(self, key, default_value): |
3964 | + """ |
3965 | + Retrieves a string parameter by key and returns its value. If not found returns the default value, |
3966 | + or if the default value is a callable returns the result of invoking the callable. |
3967 | + """ |
3968 | val = self.values.get(key) |
3969 | - return val if val is not None else default_val |
3970 | + return val if val is not None else self._get_default(default_value) |
3971 | |
3972 | - def get_switch(self, key, default_val): |
3973 | + def get_switch(self, key, default_value): |
3974 | + """ |
3975 | + Retrieves a switch parameter by key and returns its value as a boolean. If not found returns the default value, |
3976 | + or if the default value is a callable returns the result of invoking the callable. |
3977 | + """ |
3978 | val = self.values.get(key) |
3979 | if val is not None and val.lower() == 'y': |
3980 | return True |
3981 | elif val is not None and val.lower() == 'n': |
3982 | return False |
3983 | - return default_val |
3984 | + return self._get_default(default_value) |
3985 | |
3986 | - def get_int(self, key, default_val): |
3987 | + def get_int(self, key, default_value): |
3988 | + """ |
3989 | + Retrieves an int parameter by key and returns its value. If not found returns the default value, |
3990 | + or if the default value is a callable returns the result of invoking the callable. |
3991 | + """ |
3992 | try: |
3993 | return int(self.values.get(key)) |
3994 | except TypeError: |
3995 | - return default_val |
3996 | + return self._get_default(default_value) |
3997 | except ValueError: |
3998 | - return default_val |
3999 | + return self._get_default(default_value) |
4000 | |
4001 | |
4002 | __conf__ = ConfigurationProvider() |
4003 | @@ -81,7 +99,7 @@ def load_conf_from_file(conf_file_path, conf=__conf__): |
4004 | raise AgentConfigError(("Missing configuration in {0}" |
4005 | "").format(conf_file_path)) |
4006 | try: |
4007 | - content = fileutil.read_file(conf_file_path) |
4008 | + content = read_file(conf_file_path) |
4009 | conf.load(content) |
4010 | except IOError as err: |
4011 | raise AgentConfigError(("Failed to load conf file:{0}, {1}" |
4012 | @@ -97,6 +115,7 @@ __SWITCH_OPTIONS__ = { |
4013 | "OS.CheckRdmaDriver": False, |
4014 | "Logs.Verbose": False, |
4015 | "Logs.Console": True, |
4016 | + "Logs.Collect": True, |
4017 | "Extensions.Enabled": True, |
4018 | "Provisioning.AllowResetSysUser": False, |
4019 | "Provisioning.RegenerateSshHostKeyPair": False, |
4020 | @@ -110,7 +129,16 @@ __SWITCH_OPTIONS__ = { |
4021 | "ResourceDisk.EnableSwapEncryption": False, |
4022 | "AutoUpdate.Enabled": True, |
4023 | "EnableOverProvisioning": True, |
4024 | - "CGroups.EnforceLimits": False, |
4025 | + # |
4026 | + # "Debug" options are experimental and may be removed in later |
4027 | + # versions of the Agent. |
4028 | + # |
4029 | + "Debug.CgroupLogMetrics": False, |
4030 | + "Debug.CgroupDisableOnProcessCheckFailure": True, |
4031 | + "Debug.CgroupDisableOnQuotaCheckFailure": True, |
4032 | + "Debug.EnableAgentMemoryUsageCheck": False, |
4033 | + "Debug.EnableFastTrack": True, |
4034 | + "Debug.EnableGAVersioning": False |
4035 | } |
4036 | |
4037 | |
4038 | @@ -133,16 +161,37 @@ __STRING_OPTIONS__ = { |
4039 | "ResourceDisk.MountOptions": None, |
4040 | "ResourceDisk.Filesystem": "ext3", |
4041 | "AutoUpdate.GAFamily": "Prod", |
4042 | - "CGroups.Excluded": "customscript,runcommand", |
4043 | + "Debug.CgroupMonitorExpiryTime": "2022-03-31", |
4044 | + "Debug.CgroupMonitorExtensionName": "Microsoft.Azure.Monitor.AzureMonitorLinuxAgent", |
4045 | } |
4046 | |
4047 | |
4048 | __INTEGER_OPTIONS__ = { |
4049 | + "Extensions.GoalStatePeriod": 6, |
4050 | + "Extensions.InitialGoalStatePeriod": 6, |
4051 | + "OS.EnableFirewallPeriod": 300, |
4052 | + "OS.RemovePersistentNetRulesPeriod": 30, |
4053 | + "OS.RootDeviceScsiTimeoutPeriod": 30, |
4054 | + "OS.MonitorDhcpClientRestartPeriod": 30, |
4055 | "OS.SshClientAliveInterval": 180, |
4056 | + "Provisioning.MonitorHostNamePeriod": 30, |
4057 | "Provisioning.PasswordCryptSaltLength": 10, |
4058 | "HttpProxy.Port": None, |
4059 | "ResourceDisk.SwapSizeMB": 0, |
4060 | - "Autoupdate.Frequency": 3600 |
4061 | + "Autoupdate.Frequency": 3600, |
4062 | + "Logs.CollectPeriod": 3600, |
4063 | + # |
4064 | + # "Debug" options are experimental and may be removed in later |
4065 | + # versions of the Agent. |
4066 | + # |
4067 | + "Debug.CgroupCheckPeriod": 300, |
4068 | + "Debug.AgentCpuQuota": 50, |
4069 | + "Debug.AgentCpuThrottledTimeThreshold": 120, |
4070 | + "Debug.AgentMemoryQuota": 30 * 1024 ** 2, |
4071 | + "Debug.EtpCollectionPeriod": 300, |
4072 | + "Debug.AutoUpdateHotfixFrequency": 14400, |
4073 | + "Debug.AutoUpdateNormalFrequency": 86400, |
4074 | + "Debug.FirewallRulesLogPeriod": 86400 |
4075 | } |
4076 | |
4077 | |
4078 | @@ -160,10 +209,40 @@ def get_configuration(conf=__conf__): |
4079 | return options |
4080 | |
4081 | |
4082 | +def get_default_value(option): |
4083 | + if option in __STRING_OPTIONS__: |
4084 | + return __STRING_OPTIONS__[option] |
4085 | + raise ValueError("{0} is not a valid configuration parameter.".format(option)) |
4086 | + |
4087 | + |
4088 | +def get_int_default_value(option): |
4089 | + if option in __INTEGER_OPTIONS__: |
4090 | + return int(__INTEGER_OPTIONS__[option]) |
4091 | + raise ValueError("{0} is not a valid configuration parameter.".format(option)) |
4092 | + |
4093 | + |
4094 | +def get_switch_default_value(option): |
4095 | + if option in __SWITCH_OPTIONS__: |
4096 | + return __SWITCH_OPTIONS__[option] |
4097 | + raise ValueError("{0} is not a valid configuration parameter.".format(option)) |
4098 | + |
4099 | + |
4100 | def enable_firewall(conf=__conf__): |
4101 | return conf.get_switch("OS.EnableFirewall", False) |
4102 | |
4103 | |
4104 | +def get_enable_firewall_period(conf=__conf__): |
4105 | + return conf.get_int("OS.EnableFirewallPeriod", 300) |
4106 | + |
4107 | + |
4108 | +def get_remove_persistent_net_rules_period(conf=__conf__): |
4109 | + return conf.get_int("OS.RemovePersistentNetRulesPeriod", 30) |
4110 | + |
4111 | + |
4112 | +def get_monitor_dhcp_client_restart_period(conf=__conf__): |
4113 | + return conf.get_int("OS.MonitorDhcpClientRestartPeriod", 30) |
4114 | + |
4115 | + |
4116 | def enable_rdma(conf=__conf__): |
4117 | return conf.get_switch("OS.EnableRDMA", False) or \ |
4118 | conf.get_switch("OS.UpdateRdmaDriver", False) or \ |
4119 | @@ -186,11 +265,20 @@ def get_logs_console(conf=__conf__): |
4120 | return conf.get_switch("Logs.Console", True) |
4121 | |
4122 | |
4123 | +def get_collect_logs(conf=__conf__): |
4124 | + return conf.get_switch("Logs.Collect", True) |
4125 | + |
4126 | + |
4127 | +def get_collect_logs_period(conf=__conf__): |
4128 | + return conf.get_int("Logs.CollectPeriod", 3600) |
4129 | + |
4130 | + |
4131 | def get_lib_dir(conf=__conf__): |
4132 | return conf.get("Lib.Dir", "/var/lib/waagent") |
4133 | |
4134 | |
4135 | def get_published_hostname(conf=__conf__): |
4136 | + # Some applications rely on this file; do not remove this setting |
4137 | return os.path.join(get_lib_dir(conf), 'published_hostname') |
4138 | |
4139 | |
4140 | @@ -206,6 +294,10 @@ def get_ext_log_dir(conf=__conf__): |
4141 | return conf.get("Extension.LogDir", "/var/log/azure") |
4142 | |
4143 | |
4144 | +def get_agent_log_file(): |
4145 | + return "/var/log/waagent.log" |
4146 | + |
4147 | + |
4148 | def get_fips_enabled(conf=__conf__): |
4149 | return conf.get_switch("OS.EnableFIPS", False) |
4150 | |
4151 | @@ -244,18 +336,22 @@ def get_ssh_key_glob(conf=__conf__): |
4152 | |
4153 | def get_ssh_key_private_path(conf=__conf__): |
4154 | return os.path.join(get_ssh_dir(conf), |
4155 | - 'ssh_host_{0}_key'.format(get_ssh_host_keypair_type(conf))) |
4156 | + 'ssh_host_{0}_key'.format(get_ssh_host_keypair_type(conf))) |
4157 | |
4158 | |
4159 | def get_ssh_key_public_path(conf=__conf__): |
4160 | return os.path.join(get_ssh_dir(conf), |
4161 | - 'ssh_host_{0}_key.pub'.format(get_ssh_host_keypair_type(conf))) |
4162 | + 'ssh_host_{0}_key.pub'.format(get_ssh_host_keypair_type(conf))) |
4163 | |
4164 | |
4165 | def get_root_device_scsi_timeout(conf=__conf__): |
4166 | return conf.get("OS.RootDeviceScsiTimeout", None) |
4167 | |
4168 | |
4169 | +def get_root_device_scsi_timeout_period(conf=__conf__): |
4170 | + return conf.get_int("OS.RootDeviceScsiTimeoutPeriod", 30) |
4171 | + |
4172 | + |
4173 | def get_ssh_host_keypair_type(conf=__conf__): |
4174 | keypair_type = conf.get("Provisioning.SshHostKeyPairType", "rsa") |
4175 | if keypair_type == "auto": |
4176 | @@ -275,6 +371,14 @@ def get_extensions_enabled(conf=__conf__): |
4177 | return conf.get_switch("Extensions.Enabled", True) |
4178 | |
4179 | |
4180 | +def get_goal_state_period(conf=__conf__): |
4181 | + return conf.get_int("Extensions.GoalStatePeriod", 6) |
4182 | + |
4183 | + |
4184 | +def get_initial_goal_state_period(conf=__conf__): |
4185 | + return conf.get_int("Extensions.InitialGoalStatePeriod", default_value=lambda: get_goal_state_period(conf=conf)) |
4186 | + |
4187 | + |
4188 | def get_allow_reset_sys_user(conf=__conf__): |
4189 | return conf.get_switch("Provisioning.AllowResetSysUser", False) |
4190 | |
4191 | @@ -322,6 +426,10 @@ def get_monitor_hostname(conf=__conf__): |
4192 | return conf.get_switch("Provisioning.MonitorHostName", False) |
4193 | |
4194 | |
4195 | +def get_monitor_hostname_period(conf=__conf__): |
4196 | + return conf.get_int("Provisioning.MonitorHostNamePeriod", 30) |
4197 | + |
4198 | + |
4199 | def get_httpproxy_host(conf=__conf__): |
4200 | return conf.get("HttpProxy.Host", None) |
4201 | |
4202 | @@ -340,10 +448,12 @@ def get_resourcedisk_format(conf=__conf__): |
4203 | |
4204 | def get_resourcedisk_enable_swap(conf=__conf__): |
4205 | return conf.get_switch("ResourceDisk.EnableSwap", False) |
4206 | - |
4207 | + |
4208 | + |
4209 | def get_resourcedisk_enable_swap_encryption(conf=__conf__): |
4210 | return conf.get_switch("ResourceDisk.EnableSwapEncryption", False) |
4211 | |
4212 | + |
4213 | def get_resourcedisk_mountpoint(conf=__conf__): |
4214 | return conf.get("ResourceDisk.MountPoint", "/mnt/resource") |
4215 | |
4216 | @@ -384,10 +494,151 @@ def get_disable_agent_file_path(conf=__conf__): |
4217 | return os.path.join(get_lib_dir(conf), DISABLE_AGENT_FILE) |
4218 | |
4219 | |
4220 | -def get_cgroups_enforce_limits(conf=__conf__): |
4221 | - return conf.get_switch("CGroups.EnforceLimits", False) |
4222 | +def get_cgroups_enabled(conf=__conf__): |
4223 | + return conf.get_switch("CGroups.Enabled", True) |
4224 | + |
4225 | + |
4226 | +def get_monitor_network_configuration_changes(conf=__conf__): |
4227 | + return conf.get_switch("Monitor.NetworkConfigurationChanges", False) |
4228 | + |
4229 | + |
4230 | +def get_cgroup_check_period(conf=__conf__): |
4231 | + """ |
4232 | + How often to perform checks on cgroups (are the processes in the cgroups as expected, |
4233 | + has the agent exceeded its quota, etc) |
4234 | + |
4235 | + NOTE: This option is experimental and may be removed in later versions of the Agent. |
4236 | + """ |
4237 | + return conf.get_int("Debug.CgroupCheckPeriod", 300) |
4238 | + |
4239 | |
4240 | +def get_cgroup_log_metrics(conf=__conf__): |
4241 | + """ |
4242 | + If True, resource usage metrics are written to the local log |
4243 | + |
4244 | + NOTE: This option is experimental and may be removed in later versions of the Agent. |
4245 | + """ |
4246 | + return conf.get_switch("Debug.CgroupLogMetrics", False) |
4247 | + |
4248 | + |
4249 | +def get_cgroup_disable_on_process_check_failure(conf=__conf__): |
4250 | + """ |
4251 | + If True, cgroups will be disabled if the process check fails |
4252 | + |
4253 | + NOTE: This option is experimental and may be removed in later versions of the Agent. |
4254 | + """ |
4255 | + return conf.get_switch("Debug.CgroupDisableOnProcessCheckFailure", True) |
4256 | + |
4257 | + |
4258 | +def get_cgroup_disable_on_quota_check_failure(conf=__conf__): |
4259 | + """ |
4260 | + If True, cgroups will be disabled if the CPU quota check fails |
4261 | + |
4262 | + NOTE: This option is experimental and may be removed in later versions of the Agent. |
4263 | + """ |
4264 | + return conf.get_switch("Debug.CgroupDisableOnQuotaCheckFailure", True) |
4265 | + |
4266 | + |
4267 | +def get_agent_cpu_quota(conf=__conf__): |
4268 | + """ |
4269 | + CPU quota for the agent as a percentage of 1 CPU (100% == 1 CPU) |
4270 | |
4271 | -def get_cgroups_excluded(conf=__conf__): |
4272 | - excluded_value = conf.get("CGroups.Excluded", "customscript, runcommand") |
4273 | - return [s for s in [i.strip().lower() for i in excluded_value.split(',')] if len(s) > 0] if excluded_value else [] |
4274 | + NOTE: This option is experimental and may be removed in later versions of the Agent. |
4275 | + """ |
4276 | + return conf.get_int("Debug.AgentCpuQuota", 50) |
4277 | + |
4278 | + |
4279 | +def get_agent_cpu_throttled_time_threshold(conf=__conf__): |
4280 | + """ |
4281 | + Throttled time threshold for agent cpu in seconds. |
4282 | + |
4283 | + NOTE: This option is experimental and may be removed in later versions of the Agent. |
4284 | + """ |
4285 | + return conf.get_int("Debug.AgentCpuThrottledTimeThreshold", 120) |
4286 | + |
4287 | + |
4288 | +def get_agent_memory_quota(conf=__conf__): |
4289 | + """ |
4290 | + Memory quota for the agent in bytes. |
4291 | + |
4292 | + NOTE: This option is experimental and may be removed in later versions of the Agent. |
4293 | + """ |
4294 | + return conf.get_int("Debug.AgentMemoryQuota", 30 * 1024 ** 2) |
4295 | + |
4296 | + |
4297 | +def get_enable_agent_memory_usage_check(conf=__conf__): |
4298 | + """ |
4299 | + If True, Agent checks it's Memory usage. |
4300 | + |
4301 | + NOTE: This option is experimental and may be removed in later versions of the Agent. |
4302 | + """ |
4303 | + return conf.get_switch("Debug.EnableAgentMemoryUsageCheck", False) |
4304 | + |
4305 | + |
4306 | +def get_cgroup_monitor_expiry_time(conf=__conf__): |
4307 | + """ |
4308 | + cgroups monitoring for pilot extensions disabled after expiry time |
4309 | + |
4310 | + NOTE: This option is experimental and may be removed in later versions of the Agent. |
4311 | + """ |
4312 | + return conf.get("Debug.CgroupMonitorExpiryTime", "2022-03-31") |
4313 | + |
4314 | + |
4315 | +def get_cgroup_monitor_extension_name (conf=__conf__): |
4316 | + """ |
4317 | + cgroups monitoring extension name |
4318 | + |
4319 | + NOTE: This option is experimental and may be removed in later versions of the Agent. |
4320 | + """ |
4321 | + return conf.get("Debug.CgroupMonitorExtensionName", "Microsoft.Azure.Monitor.AzureMonitorLinuxAgent") |
4322 | + |
4323 | + |
4324 | +def get_enable_fast_track(conf=__conf__): |
4325 | + """ |
4326 | + If True, the agent uses FastTrack when retrieving goal states |
4327 | + |
4328 | + NOTE: This option is experimental and may be removed in later versions of the Agent. |
4329 | + """ |
4330 | + return conf.get_switch("Debug.EnableFastTrack", True) |
4331 | + |
4332 | + |
4333 | +def get_etp_collection_period(conf=__conf__): |
4334 | + """ |
4335 | + Determines the frequency to perform ETP collection on extensions telemetry events. |
4336 | + NOTE: This option is experimental and may be removed in later versions of the Agent. |
4337 | + """ |
4338 | + return conf.get_int("Debug.EtpCollectionPeriod", 300) |
4339 | + |
4340 | + |
4341 | +def get_hotfix_upgrade_frequency(conf=__conf__): |
4342 | + """ |
4343 | + Determines the frequency to check for Hotfix upgrades (<Patch>.<Build> version changed in new upgrades). |
4344 | + NOTE: This option is experimental and may be removed in later versions of the Agent. |
4345 | + """ |
4346 | + return conf.get_int("Debug.AutoUpdateHotfixFrequency", 4 * 60 * 60) |
4347 | + |
4348 | + |
4349 | +def get_normal_upgrade_frequency(conf=__conf__): |
4350 | + """ |
4351 | + Determines the frequency to check for Normal upgrades (<Major>.<Minor> version changed in new upgrades). |
4352 | + NOTE: This option is experimental and may be removed in later versions of the Agent. |
4353 | + """ |
4354 | + return conf.get_int("Debug.AutoUpdateNormalFrequency", 24 * 60 * 60) |
4355 | + |
4356 | + |
4357 | +def get_enable_ga_versioning(conf=__conf__): |
4358 | + """ |
4359 | + If True, the agent uses GA Versioning for auto-updating the agent vs automatically auto-updating to the highest version. |
4360 | + |
4361 | + NOTE: This option is experimental and may be removed in later versions of the Agent. |
4362 | + """ |
4363 | + return conf.get_switch("Debug.EnableGAVersioning", False) |
4364 | + |
4365 | + |
4366 | +def get_firewall_rules_log_period(conf=__conf__): |
4367 | + """ |
4368 | + Determine the frequency to perform the periodic operation of logging firewall rules. |
4369 | + |
4370 | + NOTE: This option is experimental and may be removed in later versions of the Agent. |
4371 | + """ |
4372 | + return conf.get_int("Debug.FirewallRulesLogPeriod", 86400) |
4373 | diff --git a/azurelinuxagent/common/datacontract.py b/azurelinuxagent/common/datacontract.py |
4374 | index c69bebc..b6d1f3c 100644 |
4375 | --- a/azurelinuxagent/common/datacontract.py |
4376 | +++ b/azurelinuxagent/common/datacontract.py |
4377 | @@ -20,9 +20,11 @@ |
4378 | from azurelinuxagent.common.exception import ProtocolError |
4379 | import azurelinuxagent.common.logger as logger |
4380 | |
4381 | +# pylint: disable=W0105 |
4382 | """ |
4383 | Base class for data contracts between guest and host and utilities to manipulate the properties in those contracts |
4384 | -""" |
4385 | +""" |
4386 | +# pylint: enable=W0105 |
4387 | |
4388 | |
4389 | class DataContract(object): |
4390 | @@ -30,7 +32,7 @@ class DataContract(object): |
4391 | |
4392 | |
4393 | class DataContractList(list): |
4394 | - def __init__(self, item_cls): |
4395 | + def __init__(self, item_cls): # pylint: disable=W0231 |
4396 | self.item_cls = item_cls |
4397 | |
4398 | |
4399 | diff --git a/azurelinuxagent/common/dhcp.py b/azurelinuxagent/common/dhcp.py |
4400 | index 5974965..3db58d3 100644 |
4401 | --- a/azurelinuxagent/common/dhcp.py |
4402 | +++ b/azurelinuxagent/common/dhcp.py |
4403 | @@ -14,23 +14,23 @@ |
4404 | # |
4405 | # Requires Python 2.6+ and Openssl 1.0+ |
4406 | |
4407 | +import array |
4408 | import os |
4409 | import socket |
4410 | -import array |
4411 | import time |
4412 | + |
4413 | import azurelinuxagent.common.logger as logger |
4414 | +from azurelinuxagent.common.exception import DhcpError |
4415 | +from azurelinuxagent.common.osutil import get_osutil |
4416 | +from azurelinuxagent.common.utils.restutil import KNOWN_WIRESERVER_IP |
4417 | from azurelinuxagent.common.utils.textutil import hex_dump, hex_dump2, \ |
4418 | hex_dump3, \ |
4419 | compare_bytes, str_to_ord, \ |
4420 | unpack_big_endian, \ |
4421 | int_to_ip4_addr |
4422 | -from azurelinuxagent.common.exception import DhcpError |
4423 | -from azurelinuxagent.common.osutil import get_osutil |
4424 | - |
4425 | |
4426 | # the kernel routing table representation of 168.63.129.16 |
4427 | KNOWN_WIRESERVER_IP_ENTRY = '10813FA8' |
4428 | -from azurelinuxagent.common.utils.restutil import KNOWN_WIRESERVER_IP |
4429 | |
4430 | |
4431 | def get_dhcp_handler(): |
4432 | @@ -86,7 +86,7 @@ class DhcpHandler(object): |
4433 | logger.info("Test for route to {0}".format(KNOWN_WIRESERVER_IP)) |
4434 | try: |
4435 | route_table = self.osutil.read_route_table() |
4436 | - if any([(KNOWN_WIRESERVER_IP_ENTRY in route) for route in route_table]): |
4437 | + if any((KNOWN_WIRESERVER_IP_ENTRY in route) for route in route_table): |
4438 | # reset self.gateway and self.routes |
4439 | # we do not need to alter the routing table |
4440 | self.endpoint = KNOWN_WIRESERVER_IP |
4441 | @@ -100,7 +100,7 @@ class DhcpHandler(object): |
4442 | logger.error( |
4443 | "Could not determine whether route exists to {0}: {1}".format( |
4444 | KNOWN_WIRESERVER_IP, e)) |
4445 | - |
4446 | + |
4447 | return route_exists |
4448 | |
4449 | @property |
4450 | @@ -116,7 +116,7 @@ class DhcpHandler(object): |
4451 | exists = False |
4452 | |
4453 | logger.info("Checking for dhcp lease cache") |
4454 | - cached_endpoint = self.osutil.get_dhcp_lease_endpoint() |
4455 | + cached_endpoint = self.osutil.get_dhcp_lease_endpoint() # pylint: disable=E1128 |
4456 | if cached_endpoint is not None: |
4457 | self.endpoint = cached_endpoint |
4458 | exists = True |
4459 | @@ -157,11 +157,14 @@ class DhcpHandler(object): |
4460 | self.endpoint = KNOWN_WIRESERVER_IP |
4461 | return |
4462 | |
4463 | + # pylint: disable=W0105 |
4464 | """ |
4465 | Build dhcp request with mac addr |
4466 | Configure route to allow dhcp traffic |
4467 | Stop dhcp service if necessary |
4468 | """ |
4469 | + # pylint: enable=W0105 |
4470 | + |
4471 | logger.info("Send dhcp request") |
4472 | mac_addr = self.osutil.get_mac_addr() |
4473 | |
4474 | @@ -194,7 +197,7 @@ class DhcpHandler(object): |
4475 | self.endpoint, self.gateway, self.routes = parse_dhcp_resp(resp) |
4476 | |
4477 | |
4478 | -def validate_dhcp_resp(request, response): |
4479 | +def validate_dhcp_resp(request, response): # pylint: disable=R1710 |
4480 | bytes_recv = len(response) |
4481 | if bytes_recv < 0xF6: |
4482 | logger.error("HandleDhcpResponse: Too few bytes received:{0}", |
4483 | @@ -228,7 +231,7 @@ def validate_dhcp_resp(request, response): |
4484 | "doesn't match the request") |
4485 | |
4486 | |
4487 | -def parse_route(response, option, i, length, bytes_recv): |
4488 | +def parse_route(response, option, i, length, bytes_recv): # pylint: disable=W0613 |
4489 | # http://msdn.microsoft.com/en-us/library/cc227282%28PROT.10%29.aspx |
4490 | logger.verbose("Routes at offset: {0} with length:{1}", hex(i), |
4491 | hex(length)) |
4492 | @@ -386,7 +389,7 @@ def build_dhcp_request(mac_addr, request_broadcast): |
4493 | # set broadcast flag to true to request the dhcp server |
4494 | # to respond to a boradcast address, |
4495 | # this is useful when user dhclient fails. |
4496 | - request[0x0A] = 0x80; |
4497 | + request[0x0A] = 0x80 |
4498 | |
4499 | # fill in ClientHardwareAddress |
4500 | for a in range(0, 6): |
4501 | diff --git a/azurelinuxagent/common/event.py b/azurelinuxagent/common/event.py |
4502 | index 5274b80..1f903a9 100644 |
4503 | --- a/azurelinuxagent/common/event.py |
4504 | +++ b/azurelinuxagent/common/event.py |
4505 | @@ -18,30 +18,35 @@ |
4506 | import atexit |
4507 | import json |
4508 | import os |
4509 | +import platform |
4510 | import re |
4511 | import sys |
4512 | import threading |
4513 | import time |
4514 | import traceback |
4515 | -from collections import namedtuple |
4516 | from datetime import datetime |
4517 | |
4518 | import azurelinuxagent.common.conf as conf |
4519 | import azurelinuxagent.common.logger as logger |
4520 | -from azurelinuxagent.common.exception import EventError |
4521 | -from azurelinuxagent.common.future import ustr, OrderedDict |
4522 | -from azurelinuxagent.common.datacontract import get_properties, DataContractList |
4523 | -from azurelinuxagent.common.telemetryevent import TelemetryEventParam, TelemetryEvent |
4524 | +from azurelinuxagent.common.AgentGlobals import AgentGlobals |
4525 | +from azurelinuxagent.common.exception import EventError, OSUtilError |
4526 | +from azurelinuxagent.common.future import ustr |
4527 | +from azurelinuxagent.common.datacontract import get_properties, set_properties |
4528 | +from azurelinuxagent.common.osutil import get_osutil |
4529 | +from azurelinuxagent.common.telemetryevent import TelemetryEventParam, TelemetryEvent, CommonTelemetryEventSchema, \ |
4530 | + GuestAgentGenericLogsSchema, GuestAgentExtensionEventsSchema, GuestAgentPerfCounterEventsSchema |
4531 | from azurelinuxagent.common.utils import fileutil, textutil |
4532 | -from azurelinuxagent.common.version import CURRENT_VERSION, CURRENT_AGENT |
4533 | +from azurelinuxagent.common.utils.textutil import parse_doc, findall, find, getattrib, str_to_encoded_ustr |
4534 | +from azurelinuxagent.common.version import CURRENT_VERSION, CURRENT_AGENT, AGENT_NAME, DISTRO_NAME, DISTRO_VERSION, DISTRO_CODE_NAME, AGENT_EXECUTION_MODE |
4535 | +from azurelinuxagent.common.protocol.imds import get_imds_client |
4536 | + |
4537 | +EVENTS_DIRECTORY = "events" |
4538 | |
4539 | _EVENT_MSG = "Event: name={0}, op={1}, message={2}, duration={3}" |
4540 | TELEMETRY_EVENT_PROVIDER_ID = "69B669B9-4AF8-4C50-BDC4-6006FA76E975" |
4541 | +TELEMETRY_EVENT_EVENT_ID = 1 |
4542 | TELEMETRY_METRICS_EVENT_ID = 4 |
4543 | |
4544 | -# Store the last retrieved container id as an environment variable to be shared between threads for telemetry purposes |
4545 | -CONTAINER_ID_ENV_VARIABLE = "AZURE_GUEST_AGENT_CONTAINER_ID" |
4546 | - |
4547 | TELEMETRY_LOG_PROVIDER_ID = "FFF0196F-EE4C-4EAF-9AA5-776F622DEB4F" |
4548 | TELEMETRY_LOG_EVENT_ID = 7 |
4549 | |
4550 | @@ -53,33 +58,39 @@ SEND_LOGS_TO_TELEMETRY = False |
4551 | |
4552 | MAX_NUMBER_OF_EVENTS = 1000 |
4553 | |
4554 | +AGENT_EVENT_FILE_EXTENSION = '.waagent.tld' |
4555 | +EVENT_FILE_REGEX = re.compile(r'(?P<agent_event>\.waagent)?\.tld$') |
4556 | |
4557 | def send_logs_to_telemetry(): |
4558 | return SEND_LOGS_TO_TELEMETRY |
4559 | |
4560 | |
4561 | -def get_container_id_from_env(): |
4562 | - return os.environ.get(CONTAINER_ID_ENV_VARIABLE, "UNINITIALIZED") |
4563 | - |
4564 | - |
4565 | class WALAEventOperation: |
4566 | ActivateResourceDisk = "ActivateResourceDisk" |
4567 | AgentBlacklisted = "AgentBlacklisted" |
4568 | AgentEnabled = "AgentEnabled" |
4569 | + AgentMemory = "AgentMemory" |
4570 | + AgentUpgrade = "AgentUpgrade" |
4571 | ArtifactsProfileBlob = "ArtifactsProfileBlob" |
4572 | - AutoUpdate = "AutoUpdate" |
4573 | - CustomData = "CustomData" |
4574 | CGroupsCleanUp = "CGroupsCleanUp" |
4575 | - CGroupsLimitsCrossed = "CGroupsLimitsCrossed" |
4576 | - ExtensionMetricsData = "ExtensionMetricsData" |
4577 | + CGroupsDisabled = "CGroupsDisabled" |
4578 | + CGroupsInfo = "CGroupsInfo" |
4579 | + CollectEventErrors = "CollectEventErrors" |
4580 | + CollectEventUnicodeErrors = "CollectEventUnicodeErrors" |
4581 | + ConfigurationChange = "ConfigurationChange" |
4582 | + CustomData = "CustomData" |
4583 | + DefaultChannelChange = "DefaultChannelChange" |
4584 | Deploy = "Deploy" |
4585 | Disable = "Disable" |
4586 | Downgrade = "Downgrade" |
4587 | Download = "Download" |
4588 | Enable = "Enable" |
4589 | ExtensionProcessing = "ExtensionProcessing" |
4590 | + ExtensionTelemetryEventProcessing = "ExtensionTelemetryEventProcessing" |
4591 | + FetchGoalState = "FetchGoalState" |
4592 | Firewall = "Firewall" |
4593 | - GetArtifactExtended = "GetArtifactExtended" |
4594 | + GoalState = "GoalState" |
4595 | + GoalStateUnsupportedFeatures = "GoalStateUnsupportedFeatures" |
4596 | HealthCheck = "HealthCheck" |
4597 | HealthObservation = "HealthObservation" |
4598 | HeartBeat = "HeartBeat" |
4599 | @@ -87,29 +98,35 @@ class WALAEventOperation: |
4600 | HostPluginHeartbeat = "HostPluginHeartbeat" |
4601 | HostPluginHeartbeatExtended = "HostPluginHeartbeatExtended" |
4602 | HttpErrors = "HttpErrors" |
4603 | + HttpGet = "HttpGet" |
4604 | ImdsHeartbeat = "ImdsHeartbeat" |
4605 | Install = "Install" |
4606 | - InitializeCGroups = "InitializeCGroups" |
4607 | InitializeHostPlugin = "InitializeHostPlugin" |
4608 | - InvokeCommandUsingSystemd = "InvokeCommandUsingSystemd" |
4609 | Log = "Log" |
4610 | + LogCollection = "LogCollection" |
4611 | OSInfo = "OSInfo" |
4612 | Partition = "Partition" |
4613 | - ProcessGoalState = "ProcessGoalState" |
4614 | + PersistFirewallRules = "PersistFirewallRules" |
4615 | + PluginSettingsVersionMismatch = "PluginSettingsVersionMismatch" |
4616 | + InvalidExtensionConfig = "InvalidExtensionConfig" |
4617 | Provision = "Provision" |
4618 | ProvisionGuestAgent = "ProvisionGuestAgent" |
4619 | RemoteAccessHandling = "RemoteAccessHandling" |
4620 | + ReportEventErrors = "ReportEventErrors" |
4621 | + ReportEventUnicodeErrors = "ReportEventUnicodeErrors" |
4622 | ReportStatus = "ReportStatus" |
4623 | ReportStatusExtended = "ReportStatusExtended" |
4624 | Restart = "Restart" |
4625 | SequenceNumberMismatch = "SequenceNumberMismatch" |
4626 | SetCGroupsLimits = "SetCGroupsLimits" |
4627 | SkipUpdate = "SkipUpdate" |
4628 | + StatusProcessing = "StatusProcessing" |
4629 | UnhandledError = "UnhandledError" |
4630 | UnInstall = "UnInstall" |
4631 | Unknown = "Unknown" |
4632 | - Upgrade = "Upgrade" |
4633 | Update = "Update" |
4634 | + VmSettings = "VmSettings" |
4635 | + VmSettingsSummary = "VmSettingsSummary" |
4636 | |
4637 | |
4638 | SHOULD_ENCODE_MESSAGE_LEN = 80 |
4639 | @@ -173,11 +190,57 @@ class EventStatus(object): |
4640 | |
4641 | __event_status__ = EventStatus() |
4642 | __event_status_operations__ = [ |
4643 | - WALAEventOperation.AutoUpdate, |
4644 | WALAEventOperation.ReportStatus |
4645 | ] |
4646 | |
4647 | |
4648 | +def parse_json_event(data_str): |
4649 | + data = json.loads(data_str) |
4650 | + event = TelemetryEvent() |
4651 | + set_properties("TelemetryEvent", event, data) |
4652 | + event.file_type = "json" |
4653 | + return event |
4654 | + |
4655 | + |
4656 | +def parse_event(data_str): |
4657 | + try: |
4658 | + try: |
4659 | + return parse_json_event(data_str) |
4660 | + except ValueError: |
4661 | + return parse_xml_event(data_str) |
4662 | + except Exception as e: |
4663 | + raise EventError("Error parsing event: {0}".format(ustr(e))) |
4664 | + |
4665 | + |
4666 | +def parse_xml_param(param_node): |
4667 | + name = getattrib(param_node, "Name") |
4668 | + value_str = getattrib(param_node, "Value") |
4669 | + attr_type = getattrib(param_node, "T") |
4670 | + value = value_str |
4671 | + if attr_type == 'mt:uint64': |
4672 | + value = int(value_str) |
4673 | + elif attr_type == 'mt:bool': |
4674 | + value = bool(value_str) |
4675 | + elif attr_type == 'mt:float64': |
4676 | + value = float(value_str) |
4677 | + return TelemetryEventParam(name, value) |
4678 | + |
4679 | + |
4680 | +def parse_xml_event(data_str): |
4681 | + try: |
4682 | + xml_doc = parse_doc(data_str) |
4683 | + event_id = getattrib(find(xml_doc, "Event"), 'id') |
4684 | + provider_id = getattrib(find(xml_doc, "Provider"), 'id') |
4685 | + event = TelemetryEvent(event_id, provider_id) |
4686 | + param_nodes = findall(xml_doc, 'Param') |
4687 | + for param_node in param_nodes: |
4688 | + event.parameters.append(parse_xml_param(param_node)) |
4689 | + event.file_type = "xml" |
4690 | + return event |
4691 | + except Exception as e: |
4692 | + raise ValueError(ustr(e)) |
4693 | + |
4694 | + |
4695 | def _encode_message(op, message): |
4696 | """ |
4697 | Gzip and base64 encode a message based on the operation. |
4698 | @@ -214,20 +277,158 @@ def _encode_message(op, message): |
4699 | |
4700 | |
4701 | def _log_event(name, op, message, duration, is_success=True): |
4702 | - global _EVENT_MSG |
4703 | + global _EVENT_MSG # pylint: disable=W0603 |
4704 | |
4705 | - message = _encode_message(op, message) |
4706 | if not is_success: |
4707 | logger.error(_EVENT_MSG, name, op, message, duration) |
4708 | else: |
4709 | logger.info(_EVENT_MSG, name, op, message, duration) |
4710 | |
4711 | |
4712 | +class CollectOrReportEventDebugInfo(object): |
4713 | + """ |
4714 | + This class is used for capturing and reporting debug info that is captured during event collection and |
4715 | + reporting to wireserver. |
4716 | + It captures the count of unicode errors and any unexpected errors and also a subset of errors with stacks to help |
4717 | + with debugging any potential issues. |
4718 | + """ |
4719 | + __MAX_ERRORS_TO_REPORT = 5 |
4720 | + OP_REPORT = "Report" |
4721 | + OP_COLLECT = "Collect" |
4722 | + |
4723 | + def __init__(self, operation=OP_REPORT): |
4724 | + self.__unicode_error_count = 0 |
4725 | + self.__unicode_errors = set() |
4726 | + self.__op_error_count = 0 |
4727 | + self.__op_errors = set() |
4728 | + |
4729 | + if operation == self.OP_REPORT: |
4730 | + self.__unicode_error_event = WALAEventOperation.ReportEventUnicodeErrors |
4731 | + self.__op_errors_event = WALAEventOperation.ReportEventErrors |
4732 | + elif operation == self.OP_COLLECT: |
4733 | + self.__unicode_error_event = WALAEventOperation.CollectEventUnicodeErrors |
4734 | + self.__op_errors_event = WALAEventOperation.CollectEventErrors |
4735 | + |
4736 | + def report_debug_info(self): |
4737 | + |
4738 | + def report_dropped_events_error(count, errors, operation_name): |
4739 | + err_msg_format = "DroppedEventsCount: {0}\nReasons (first {1} errors): {2}" |
4740 | + if count > 0: |
4741 | + add_event(op=operation_name, |
4742 | + message=err_msg_format.format(count, CollectOrReportEventDebugInfo.__MAX_ERRORS_TO_REPORT, ', '.join(errors)), |
4743 | + is_success=False) |
4744 | + |
4745 | + report_dropped_events_error(self.__op_error_count, self.__op_errors, self.__op_errors_event) |
4746 | + report_dropped_events_error(self.__unicode_error_count, self.__unicode_errors, self.__unicode_error_event) |
4747 | + |
4748 | + @staticmethod |
4749 | + def _update_errors_and_get_count(error_count, errors, error): |
4750 | + error_count += 1 |
4751 | + if len(errors) < CollectOrReportEventDebugInfo.__MAX_ERRORS_TO_REPORT: |
4752 | + errors.add("{0}: {1}".format(ustr(error), traceback.format_exc())) |
4753 | + return error_count |
4754 | + |
4755 | + def update_unicode_error(self, unicode_err): |
4756 | + self.__unicode_error_count = self._update_errors_and_get_count(self.__unicode_error_count, self.__unicode_errors, |
4757 | + unicode_err) |
4758 | + |
4759 | + def update_op_error(self, op_err): |
4760 | + self.__op_error_count = self._update_errors_and_get_count(self.__op_error_count, self.__op_errors, op_err) |
4761 | + |
4762 | + |
4763 | class EventLogger(object): |
4764 | def __init__(self): |
4765 | self.event_dir = None |
4766 | self.periodic_events = {} |
4767 | |
4768 | + # |
4769 | + # All events should have these parameters. |
4770 | + # |
4771 | + # The first set comes from the current OS and is initialized here. These values don't change during |
4772 | + # the agent's lifetime. |
4773 | + # |
4774 | + # The next two sets come from the goal state and IMDS and must be explicitly initialized using |
4775 | + # initialize_vminfo_common_parameters() once a protocol for communication with the host has been |
4776 | + # created. Their values don't change during the agent's lifetime. Note that we initialize these |
4777 | + # parameters here using dummy values (*_UNINITIALIZED) since events sent to the host should always |
4778 | + # match the schema defined for them in the telemetry pipeline. |
4779 | + # |
4780 | + # There is another set of common parameters that must be computed at the time the event is created |
4781 | + # (e.g. the timestamp and the container ID); those are added to events (along with the parameters |
4782 | + # below) in _add_common_event_parameters() |
4783 | + # |
4784 | + # Note that different kinds of events may also include other parameters; those are added by the |
4785 | + # corresponding add_* method (e.g. add_metric for performance metrics). |
4786 | + # |
4787 | + self._common_parameters = [] |
4788 | + |
4789 | + # Parameters from OS |
4790 | + osutil = get_osutil() |
4791 | + self._common_parameters.append(TelemetryEventParam(CommonTelemetryEventSchema.OSVersion, EventLogger._get_os_version())) |
4792 | + self._common_parameters.append(TelemetryEventParam(CommonTelemetryEventSchema.ExecutionMode, AGENT_EXECUTION_MODE)) |
4793 | + self._common_parameters.append(TelemetryEventParam(CommonTelemetryEventSchema.RAM, int(EventLogger._get_ram(osutil)))) |
4794 | + self._common_parameters.append(TelemetryEventParam(CommonTelemetryEventSchema.Processors, int(EventLogger._get_processors(osutil)))) |
4795 | + |
4796 | + # Parameters from goal state |
4797 | + self._common_parameters.append(TelemetryEventParam(CommonTelemetryEventSchema.TenantName, "TenantName_UNINITIALIZED")) |
4798 | + self._common_parameters.append(TelemetryEventParam(CommonTelemetryEventSchema.RoleName, "RoleName_UNINITIALIZED")) |
4799 | + self._common_parameters.append(TelemetryEventParam(CommonTelemetryEventSchema.RoleInstanceName, "RoleInstanceName_UNINITIALIZED")) |
4800 | + # |
4801 | + # # Parameters from IMDS |
4802 | + self._common_parameters.append(TelemetryEventParam(CommonTelemetryEventSchema.Location, "Location_UNINITIALIZED")) |
4803 | + self._common_parameters.append(TelemetryEventParam(CommonTelemetryEventSchema.SubscriptionId, "SubscriptionId_UNINITIALIZED")) |
4804 | + self._common_parameters.append(TelemetryEventParam(CommonTelemetryEventSchema.ResourceGroupName, "ResourceGroupName_UNINITIALIZED")) |
4805 | + self._common_parameters.append(TelemetryEventParam(CommonTelemetryEventSchema.VMId, "VMId_UNINITIALIZED")) |
4806 | + self._common_parameters.append(TelemetryEventParam(CommonTelemetryEventSchema.ImageOrigin, 0)) |
4807 | + |
4808 | + @staticmethod |
4809 | + def _get_os_version(): |
4810 | + return "{0}:{1}-{2}-{3}:{4}".format(platform.system(), DISTRO_NAME, DISTRO_VERSION, DISTRO_CODE_NAME, platform.release()) |
4811 | + |
4812 | + @staticmethod |
4813 | + def _get_ram(osutil): |
4814 | + try: |
4815 | + return osutil.get_total_mem() |
4816 | + except OSUtilError as e: |
4817 | + logger.warn("Failed to get RAM info; will be missing from telemetry: {0}", ustr(e)) |
4818 | + return 0 |
4819 | + |
4820 | + @staticmethod |
4821 | + def _get_processors(osutil): |
4822 | + try: |
4823 | + return osutil.get_processor_cores() |
4824 | + except OSUtilError as e: |
4825 | + logger.warn("Failed to get Processors info; will be missing from telemetry: {0}", ustr(e)) |
4826 | + return 0 |
4827 | + |
4828 | + def initialize_vminfo_common_parameters(self, protocol): |
4829 | + """ |
4830 | + Initializes the common parameters that come from the goal state and IMDS |
4831 | + """ |
4832 | + # create an index of the event parameters for faster updates |
4833 | + parameters = {} |
4834 | + for p in self._common_parameters: |
4835 | + parameters[p.name] = p |
4836 | + |
4837 | + try: |
4838 | + vminfo = protocol.get_vminfo() |
4839 | + parameters[CommonTelemetryEventSchema.TenantName].value = vminfo.tenantName |
4840 | + parameters[CommonTelemetryEventSchema.RoleName].value = vminfo.roleName |
4841 | + parameters[CommonTelemetryEventSchema.RoleInstanceName].value = vminfo.roleInstanceName |
4842 | + except Exception as e: |
4843 | + logger.warn("Failed to get VM info from goal state; will be missing from telemetry: {0}", ustr(e)) |
4844 | + |
4845 | + try: |
4846 | + imds_client = get_imds_client(protocol.get_endpoint()) |
4847 | + imds_info = imds_client.get_compute() |
4848 | + parameters[CommonTelemetryEventSchema.Location].value = imds_info.location |
4849 | + parameters[CommonTelemetryEventSchema.SubscriptionId].value = imds_info.subscriptionId |
4850 | + parameters[CommonTelemetryEventSchema.ResourceGroupName].value = imds_info.resourceGroupName |
4851 | + parameters[CommonTelemetryEventSchema.VMId].value = imds_info.vmId |
4852 | + parameters[CommonTelemetryEventSchema.ImageOrigin].value = int(imds_info.image_origin) |
4853 | + except Exception as e: |
4854 | + logger.warn("Failed to get IMDS info; will be missing from telemetry: {0}", ustr(e)) |
4855 | + |
4856 | def save_event(self, data): |
4857 | if self.event_dir is None: |
4858 | logger.warn("Cannot save event -- Event reporter is not initialized.") |
4859 | @@ -258,7 +459,7 @@ class EventLogger(object): |
4860 | try: |
4861 | with open(filename + ".tmp", 'wb+') as hfile: |
4862 | hfile.write(data.encode("utf-8")) |
4863 | - os.rename(filename + ".tmp", filename + ".tld") |
4864 | + os.rename(filename + ".tmp", filename + AGENT_EVENT_FILE_EXTENSION) |
4865 | except (IOError, OSError) as e: |
4866 | msg = "Failed to write events to file: {0}".format(e) |
4867 | raise EventError(msg) |
4868 | @@ -271,38 +472,31 @@ class EventLogger(object): |
4869 | (self.periodic_events[h] + delta) <= datetime.now() |
4870 | |
4871 | def add_periodic(self, delta, name, op=WALAEventOperation.Unknown, is_success=True, duration=0, |
4872 | - version=str(CURRENT_VERSION), message="", evt_type="", is_internal=False, log_event=True, |
4873 | - force=False): |
4874 | + version=str(CURRENT_VERSION), message="", log_event=True, force=False): |
4875 | h = hash(name + op + ustr(is_success) + message) |
4876 | |
4877 | if force or self.is_period_elapsed(delta, h): |
4878 | self.add_event(name, op=op, is_success=is_success, duration=duration, |
4879 | - version=version, message=message, evt_type=evt_type, |
4880 | - is_internal=is_internal, log_event=log_event) |
4881 | + version=version, message=message, log_event=log_event) |
4882 | self.periodic_events[h] = datetime.now() |
4883 | |
4884 | def add_event(self, name, op=WALAEventOperation.Unknown, is_success=True, duration=0, version=str(CURRENT_VERSION), |
4885 | - message="", evt_type="", is_internal=False, log_event=True): |
4886 | + message="", log_event=True): |
4887 | |
4888 | if (not is_success) and log_event: |
4889 | _log_event(name, op, message, duration, is_success=is_success) |
4890 | |
4891 | - self._add_event(duration, evt_type, is_internal, is_success, message, name, op, version, event_id=1) |
4892 | - |
4893 | - def _add_event(self, duration, evt_type, is_internal, is_success, message, name, op, version, event_id): |
4894 | - event = TelemetryEvent(event_id, TELEMETRY_EVENT_PROVIDER_ID) |
4895 | - |
4896 | - event.parameters.append(TelemetryEventParam('Name', str(name))) |
4897 | - event.parameters.append(TelemetryEventParam('Version', str(version))) |
4898 | - event.parameters.append(TelemetryEventParam('IsInternal', bool(is_internal))) |
4899 | - event.parameters.append(TelemetryEventParam('Operation', str(op))) |
4900 | - event.parameters.append(TelemetryEventParam('OperationSuccess', bool(is_success))) |
4901 | - event.parameters.append(TelemetryEventParam('Message', str(message))) |
4902 | - event.parameters.append(TelemetryEventParam('Duration', int(duration))) |
4903 | - event.parameters.append(TelemetryEventParam('ExtensionType', str(evt_type))) |
4904 | + event = TelemetryEvent(TELEMETRY_EVENT_EVENT_ID, TELEMETRY_EVENT_PROVIDER_ID) |
4905 | + event.parameters.append(TelemetryEventParam(GuestAgentExtensionEventsSchema.Name, str_to_encoded_ustr(name))) |
4906 | + event.parameters.append(TelemetryEventParam(GuestAgentExtensionEventsSchema.Version, str_to_encoded_ustr(version))) |
4907 | + event.parameters.append(TelemetryEventParam(GuestAgentExtensionEventsSchema.Operation, str_to_encoded_ustr(op))) |
4908 | + event.parameters.append(TelemetryEventParam(GuestAgentExtensionEventsSchema.OperationSuccess, bool(is_success))) |
4909 | + event.parameters.append(TelemetryEventParam(GuestAgentExtensionEventsSchema.Message, str_to_encoded_ustr(message))) |
4910 | + event.parameters.append(TelemetryEventParam(GuestAgentExtensionEventsSchema.Duration, int(duration))) |
4911 | + self.add_common_event_parameters(event, datetime.utcnow()) |
4912 | |
4913 | - event.parameters = self.add_default_parameters_to_event(event.parameters) |
4914 | data = get_properties(event) |
4915 | + |
4916 | try: |
4917 | self.save_event(json.dumps(data)) |
4918 | except EventError as e: |
4919 | @@ -310,13 +504,13 @@ class EventLogger(object): |
4920 | |
4921 | def add_log_event(self, level, message): |
4922 | event = TelemetryEvent(TELEMETRY_LOG_EVENT_ID, TELEMETRY_LOG_PROVIDER_ID) |
4923 | - event.parameters.append(TelemetryEventParam('EventName', WALAEventOperation.Log)) |
4924 | - event.parameters.append(TelemetryEventParam('CapabilityUsed', logger.LogLevel.STRINGS[level])) |
4925 | - event.parameters.append(TelemetryEventParam('Context1', self._clean_up_message(message))) |
4926 | - event.parameters.append(TelemetryEventParam('Context2', '')) |
4927 | - event.parameters.append(TelemetryEventParam('Context3', '')) |
4928 | + event.parameters.append(TelemetryEventParam(GuestAgentGenericLogsSchema.EventName, WALAEventOperation.Log)) |
4929 | + event.parameters.append(TelemetryEventParam(GuestAgentGenericLogsSchema.CapabilityUsed, logger.LogLevel.STRINGS[level])) |
4930 | + event.parameters.append(TelemetryEventParam(GuestAgentGenericLogsSchema.Context1, str_to_encoded_ustr(self._clean_up_message(message)))) |
4931 | + event.parameters.append(TelemetryEventParam(GuestAgentGenericLogsSchema.Context2, datetime.utcnow().strftime(logger.Logger.LogTimeFormatInUTC))) |
4932 | + event.parameters.append(TelemetryEventParam(GuestAgentGenericLogsSchema.Context3, '')) |
4933 | + self.add_common_event_parameters(event, datetime.utcnow()) |
4934 | |
4935 | - event.parameters = self.add_default_parameters_to_event(event.parameters) |
4936 | data = get_properties(event) |
4937 | try: |
4938 | self.save_event(json.dumps(data)) |
4939 | @@ -334,17 +528,16 @@ class EventLogger(object): |
4940 | :param bool log_event: If true, log the collected metric in the agent log |
4941 | """ |
4942 | if log_event: |
4943 | - from azurelinuxagent.common.version import AGENT_NAME |
4944 | message = "Metric {0}/{1} [{2}] = {3}".format(category, counter, instance, value) |
4945 | _log_event(AGENT_NAME, "METRIC", message, 0) |
4946 | |
4947 | event = TelemetryEvent(TELEMETRY_METRICS_EVENT_ID, TELEMETRY_EVENT_PROVIDER_ID) |
4948 | - event.parameters.append(TelemetryEventParam('Category', str(category))) |
4949 | - event.parameters.append(TelemetryEventParam('Counter', str(counter))) |
4950 | - event.parameters.append(TelemetryEventParam('Instance', str(instance))) |
4951 | - event.parameters.append(TelemetryEventParam('Value', float(value))) |
4952 | + event.parameters.append(TelemetryEventParam(GuestAgentPerfCounterEventsSchema.Category, str_to_encoded_ustr(category))) |
4953 | + event.parameters.append(TelemetryEventParam(GuestAgentPerfCounterEventsSchema.Counter, str_to_encoded_ustr(counter))) |
4954 | + event.parameters.append(TelemetryEventParam(GuestAgentPerfCounterEventsSchema.Instance, str_to_encoded_ustr(instance))) |
4955 | + event.parameters.append(TelemetryEventParam(GuestAgentPerfCounterEventsSchema.Value, float(value))) |
4956 | + self.add_common_event_parameters(event, datetime.utcnow()) |
4957 | |
4958 | - event.parameters = self.add_default_parameters_to_event(event.parameters) |
4959 | data = get_properties(event) |
4960 | try: |
4961 | self.save_event(json.dumps(data)) |
4962 | @@ -392,56 +585,34 @@ class EventLogger(object): |
4963 | else: |
4964 | return message |
4965 | |
4966 | - @staticmethod |
4967 | - def add_default_parameters_to_event(event_parameters, set_values_for_agent=True): |
4968 | + def add_common_event_parameters(self, event, event_timestamp): |
4969 | """ |
4970 | - Default fields are only populated by Agent and not the extension. Agent will fill up any event if they don't |
4971 | - have the default params. Example: GAVersion and ContainerId are populated for agent events on the fly, |
4972 | - but not for extension events. Add it if it's missing. |
4973 | - |
4974 | - We write the GAVersion here rather than add it in azurelinuxagent.ga.monitor.MonitorHandler.add_sysinfo |
4975 | - as there could be a possibility of events being sent with newer version of the agent, rather than the agent |
4976 | - version generating the event. |
4977 | - # Old behavior example: V1 writes the event on the disk and finds an update immediately, and updates. Now the |
4978 | - new monitor thread would pick up the events from the disk and send it with the CURRENT_AGENT, which would have |
4979 | - newer version of the agent. This causes confusion. |
4980 | - |
4981 | - ContainerId can change due to live migration and we want to preserve the container Id of the container writing |
4982 | - the event, rather than sending the event. |
4983 | - OpcodeName - This is used as the actual time of event generation. |
4984 | - |
4985 | - :param event_parameters: List of parameters of the event. |
4986 | - :param set_values_for_agent: Need default values populated or not. Extensions need only GAVersion and |
4987 | - ContainerId to be populated and others should be |
4988 | - :return: Event with default parameters populated (either values for agent or extension) |
4989 | + This method is called for all events and ensures all telemetry fields are added before the event is sent out. |
4990 | + Note that the event timestamp is saved in the OpcodeName field. |
4991 | """ |
4992 | - DefaultParameter = namedtuple('DefaultParameter', ['name', 'value']) |
4993 | - default_parameters = [DefaultParameter("GAVersion", CURRENT_AGENT), |
4994 | - DefaultParameter('ContainerId', get_container_id_from_env()), |
4995 | - DefaultParameter('OpcodeName', datetime.utcnow().__str__() if set_values_for_agent else ""), |
4996 | - DefaultParameter('EventTid', threading.current_thread().ident if set_values_for_agent else 0), |
4997 | - DefaultParameter('EventPid', os.getpid() if set_values_for_agent else 0), |
4998 | - DefaultParameter("TaskName", threading.current_thread().getName() if set_values_for_agent else ""), |
4999 | - DefaultParameter("KeywordName", '')] |
5000 | - |
Hi Calvin, it's a bit late in the release for merges — did you have an MRE or SRU bug to accompany this?
Also, I'm not spotting any changes to debian/changelog — was that intentionally excluded?