Skip to content

Commit

Permalink
bugfix: Minor fixes (#4)
Browse files Browse the repository at this point in the history
* fix: Several minor fixes

* Correct condition when adding sudoer entry

* Add caps only when appropriate collector is used

* Use empty string as default gpu type

* Modify alternative scenario to test changes

* chore: Set golang version as internal variable

* This is to make version bumper work correctly

* chore: Update arg spec for dcgm_exporter

* test: Fix failing tests

---------

Signed-off-by: Mahendra Paipuri <[email protected]>
  • Loading branch information
mahendrapaipuri authored Feb 29, 2024
1 parent 429249e commit cdf068e
Show file tree
Hide file tree
Showing 12 changed files with 33 additions and 35 deletions.
1 change: 1 addition & 0 deletions changelogs/changelog.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
---
ancestor: null
releases:
0.1.0:
Expand Down
2 changes: 1 addition & 1 deletion roles/ceems_exporter/defaults/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ ceems_exporter_enabled_collectors: []
ceems_exporter_disabled_collectors: []
ceems_exporter_create_unique_jobids: false
ceems_exporter_slurm_job_props_dir: ""
ceems_exporter_gpu_type: nvidia
ceems_exporter_gpu_type: ""
ceems_exporter_gpu_job_map_dir: ""
ceems_exporter_emissions_country_code: FR

Expand Down
9 changes: 3 additions & 6 deletions roles/ceems_exporter/meta/argument_specs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -57,15 +57,12 @@ argument_specs:
ceems_exporter_gpu_type:
description:
- GPU type.
- Check the L(example scripts,https://github.com/mahendrapaipuri/ceems_monitoring/tree/main/etc/slurm/epilog.d).
default: nvidia
choices:
- nvidia
- amd
- Currently, nVIDIA and AMD GPUs are supported.
default: ""
ceems_exporter_gpu_job_map_dir:
description:
- Directory where files containing mapping of SLURM job ID to GPU ordinals are created by Epilog scripts.
- Currently, nVIDIA and AMD GPUs are supported.
- Check the L(example scripts,https://github.com/mahendrapaipuri/ceems_monitoring/tree/main/etc/slurm/epilog.d).
default: ""
ceems_exporter_emissions_country_code:
description: ISO 3166-1 alpha-3 Country code to export emission data.
Expand Down
8 changes: 5 additions & 3 deletions roles/ceems_exporter/molecule/alternative/molecule.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,10 @@ provisioner:
ceems_exporter_basic_auth_users:
randomuser: examplepassword
go_arch: amd64
ceems_exporter_slurm_job_props_dir: /run/slurmjobprops
ceems_exporter_gpu_job_map_dir: /run/gpujobmap
ceems_exporter_ipmi_dcmi_cmd: ipmi-dcmi
ceems_exporter_enabled_collectors:
- emissions
ceems_exporter_disabled_collectors:
- slurm
ceems_exporter_ipmi_dcmi_cmd: "sudo ipmi-dcmi"
ceems_exporter_env_vars:
EMAPS_API_TOKEN: foo
9 changes: 0 additions & 9 deletions roles/ceems_exporter/molecule/alternative/prepare.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,12 +45,3 @@
dest: "{{ ceems_exporter_tls_server_config.cert_file }}"
- src: /tmp/tls.key
dest: "{{ ceems_exporter_tls_server_config.key_file }}"

- name: Create slurm job props and GPU job map directories
ansible.builtin.file:
path: "{{ item }}"
state: directory
mode: "0755"
loop:
- "{{ ceems_exporter_slurm_job_props_dir }}"
- "{{ ceems_exporter_gpu_job_map_dir }}"
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,14 @@ def test_directories(host, dir):
assert d.exists


@pytest.mark.parametrize("file", [
"/etc/sudoers.d/allow-ipmi-dcmi",
])
def test_files(host, file):
f = host.file(file)
assert f.exists


def test_service(host):
s = host.service("ceems_exporter")
try:
Expand All @@ -35,10 +43,7 @@ def test_systemd_properties(host):
p = s.systemd_properties
assert p.get("ProtectHome") == "yes"
assert p.get("Environment") == "EMAPS_API_TOKEN=foo"
# Seems like this test will fail on centos 7
if host.system_info.distribution != "centos" and host.system_info.release != "7":
assert p.get("AmbientCapabilities") == "cap_setgid cap_setuid"
assert p.get("CapabilityBoundingSet") == "cap_setgid cap_setuid"
assert p.get("AmbientCapabilities") in [None, "", "0", "False", False, "No", "no"]


@pytest.mark.parametrize("socket", [
Expand Down
3 changes: 2 additions & 1 deletion roles/ceems_exporter/tasks/configure.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@
user: "{{ ceems_exporter_system_user }}"
commands: /usr/sbin/ipmi-dcmi
when:
- '"ipmi_dcmi" in ceems_exporter_disabled_collectors'
- 'not "ipmi_dcmi" in ceems_exporter_disabled_collectors'
- '"sudo" in ceems_exporter_ipmi_dcmi_cmd'
- ceems_exporter_system_user != "root"

- name: Allow ceems_exporter port in SELinux on RedHat OS family
Expand Down
10 changes: 6 additions & 4 deletions roles/ceems_exporter/templates/ceems_exporter.service.j2
Original file line number Diff line number Diff line change
Expand Up @@ -61,17 +61,19 @@ StartLimitInterval=0
{% endfor %}
ProtectHome={{ ns.protect_home }}

{% if "sudo" not in ceems_exporter_ipmi_dcmi_cmd and
"ipmi_dcmi" not in ceems_exporter_disabled_collectors and
{% if "ipmi_dcmi" not in ceems_exporter_disabled_collectors and
"sudo" not in ceems_exporter_ipmi_dcmi_cmd and
ceems_exporter_system_user != "root" %}
{% set _ = ns.caps.extend(['CAP_SETUID', 'CAP_SETGID']) %}
{% endif %}
{% if ceems_exporter_create_unique_jobids and
"slurm" not in ceems_exporter_disabled_collectors and

{% if "slurm" not in ceems_exporter_disabled_collectors and
ceems_exporter_create_unique_jobids and
not ceems_exporter_slurm_job_props_dir and
ceems_exporter_system_user != "root" %}
{% set _ = ns.caps.extend(['CAP_SYS_PTRACE', 'CAP_DAC_READ_SEARCH']) %}
{% endif %}

{% if ceems_exporter_gpu_type and
not ceems_exporter_gpu_job_map_dir and
ceems_exporter_system_user != "root" %}
Expand Down
2 changes: 1 addition & 1 deletion roles/nvidia_dcgm_exporter/defaults/main.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
---
dcgm_exporter_version: 3.3.3-3.3.1
golang_version: 1.21.5

dcgm_exporter_binary_local_dir: ""

dcgm_exporter_git_url: https://github.com/NVIDIA/dcgm-exporter.git
Expand Down
3 changes: 0 additions & 3 deletions roles/nvidia_dcgm_exporter/meta/argument_specs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,6 @@ argument_specs:
dcgm_exporter_version:
description: DCGM exporter package version. Also accepts latest as parameter.
default: 3.3.0-3.2.0
golang_version:
description: Go version used to compile dcgm_exporter.
default: 1.21.5
dcgm_exporter_skip_install:
description: DCGM exporter installation tasks gets skipped when set to true.
type: bool
Expand Down
4 changes: 2 additions & 2 deletions roles/nvidia_dcgm_exporter/tasks/install.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
run_once: true
ansible.builtin.get_url:
url: "{{ _golang_download_url }}"
dest: /tmp/golang-{{ golang_version }}.linux-{{ go_arch }}.tar.gz
dest: /tmp/golang-{{ _golang_version }}.linux-{{ go_arch }}.tar.gz
mode: "0644"
register: _download_golang
until: _download_golang is succeeded
Expand All @@ -42,7 +42,7 @@
become: false
run_once: true
ansible.builtin.unarchive:
src: /tmp/golang-{{ golang_version }}.linux-{{ go_arch }}.tar.gz
src: /tmp/golang-{{ _golang_version }}.linux-{{ go_arch }}.tar.gz
dest: /tmp
creates: /tmp/go
delegate_to: localhost
Expand Down
4 changes: 3 additions & 1 deletion roles/nvidia_dcgm_exporter/vars/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ go_arch_map:
armv6l: armv6

go_arch: "{{ go_arch_map[ansible_architecture] | default(ansible_architecture) }}"

_nvidia_dcgm_exporter_repo: NVIDIA/dcgm-exporter
_github_api_headers: "{{ {'GITHUB_TOKEN': lookup('ansible.builtin.env', 'GITHUB_TOKEN')} if (lookup('ansible.builtin.env', 'GITHUB_TOKEN')) else {} }}"
_golang_download_url: https://go.dev/dl/go{{ golang_version }}.linux-{{ go_arch }}.tar.gz
_golang_version: 1.21.5
_golang_download_url: https://go.dev/dl/go{{ _golang_version }}.linux-{{ go_arch }}.tar.gz

0 comments on commit cdf068e

Please sign in to comment.