From b540ea59a295583d6e9fae9b430efb447a1e0b7c Mon Sep 17 00:00:00 2001 From: saltydk Date: Sat, 10 Feb 2024 02:52:15 +0100 Subject: [PATCH] nvidia: rework role use new toolkit --- Review notes (not part of the commit message): (1) the old APT list is removed by its full path, /etc/apt/sources.list.d/<name>.list, because apt_repository appends '.list' to 'filename' and a bare name is only a relative path; (2) 'signed-by' is injected per repo line with map('regex_replace') because regex_replace applied to the whole list operates on its text form, where '^deb' never matches; (3) the /dev/char symlink step uses the command module since it needs no shell features. .../docker/set_docker_devices_variable.yml | 13 +++++- roles/nvidia/defaults/main.yml | 17 ++------ roles/nvidia/files/71-nvidia-dev-char.rules | 2 + roles/nvidia/tasks/subtasks/docker.yml | 41 ++++++++++++++----- 4 files changed, 48 insertions(+), 25 deletions(-) create mode 100644 roles/nvidia/files/71-nvidia-dev-char.rules diff --git a/resources/tasks/docker/set_docker_devices_variable.yml b/resources/tasks/docker/set_docker_devices_variable.yml index ae1999221d..f848487217 100644 --- a/resources/tasks/docker/set_docker_devices_variable.yml +++ b/resources/tasks/docker/set_docker_devices_variable.yml @@ -20,6 +20,17 @@ ansible.builtin.set_fact: docker_devices_var_name: "{{ role_name + '_docker_devices_default' }}" + - name: Resources | Tasks | Docker | Set Docker Devices Variable | Set 'docker_devices_nvidia' temp variable + ansible.builtin.set_fact: + docker_devices_nvidia: + - /dev/nvidia-uvm + - /dev/nvidia-uvm-tools + - /dev/nvidia-modeset + - /dev/nvidiactl + - /dev/nvidia0 + - name: Resources | Tasks | Docker | Set Docker Devices Variable | Set 'docker_devices' variable # noqa jinja[spacing] var-naming[no-jinja] ansible.builtin.set_fact: - "{{ docker_devices_var_name }}": "{{ ['/dev/dri:/dev/dri'] + lookup('vars', role_name + '_docker_devices_default') }}" + "{{ docker_devices_var_name }}": "{{ (['/dev/dri:/dev/dri'] if gpu.intel else []) + + (docker_devices_nvidia if gpu.nvidia else []) + + lookup('vars', role_name + '_docker_devices_default', default=[]) }}" diff --git a/roles/nvidia/defaults/main.yml b/roles/nvidia/defaults/main.yml index 8e678755c7..ded74073c0 100644 --- a/roles/nvidia/defaults/main.yml +++ b/roles/nvidia/defaults/main.yml @@ -36,23 +36,14 @@ nvidia_patch_backup_file_location: "/opt/nvidia/libnvidia-encode-backup" # Docker ################################ 
-nvidia_docker_runtime_apt_key_url: https://nvidia.github.io/nvidia-container-runtime/gpgkey +nvidia_docker_runtime_apt_key_url: https://nvidia.github.io/libnvidia-container/gpgkey -nvidia_docker_runtime_apt_repo_list_url: "https://nvidia.github.io/nvidia-container-runtime/{{ ansible_distribution | lower }}{{ ansible_distribution_version }}/nvidia-container-runtime.list" +nvidia_docker_runtime_apt_repo_list_url: "https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list" -nvidia_docker_runtime_apt_repo_url_list_old2: - - 'deb [signed-by=/etc/apt/trusted.gpg.d/nvidia.asc] https://nvidia.github.io/libnvidia-container/{{ ansible_distribution | lower }}{{ ansible_distribution_version }}/$(ARCH) /' - - 'deb [signed-by=/etc/apt/trusted.gpg.d/nvidia.asc] https://nvidia.github.io/nvidia-container-runtime/{{ ansible_distribution | lower }}{{ ansible_distribution_version }}/$(ARCH) /' -nvidia_docker_runtime_apt_repo_url_list_old: - - 'deb https://nvidia.github.io/libnvidia-container/{{ ansible_distribution | lower }}{{ ansible_distribution_version }}/$(ARCH) /' - - 'deb https://nvidia.github.io/nvidia-container-runtime/{{ ansible_distribution | lower }}{{ ansible_distribution_version }}/$(ARCH) /' - - 'deb [signed-by=/usr/share/keyrings/nvidia.gpg] https://nvidia.github.io/libnvidia-container/{{ ansible_distribution | lower }}{{ ansible_distribution_version }}/$(ARCH) /' - - 'deb [signed-by=/usr/share/keyrings/nvidia.gpg] https://nvidia.github.io/nvidia-container-runtime/{{ ansible_distribution | lower }}{{ ansible_distribution_version }}/$(ARCH) /' +nvidia_docker_runtime_apt_repo_file_old: nvidia-container-runtime -nvidia_docker_runtime_apt_repo_file: nvidia-container-runtime - -nvidia_docker_runtime_apt_package: nvidia-container-runtime +nvidia_docker_runtime_apt_repo_file: nvidia-container-toolkit nvidia_docker_runtime_docker_daemon_json_jq_command: | jq '."default-runtime" = "nvidia" diff --git a/roles/nvidia/files/71-nvidia-dev-char.rules 
b/roles/nvidia/files/71-nvidia-dev-char.rules new file mode 100644 index 0000000000..28696b05e2 --- /dev/null +++ b/roles/nvidia/files/71-nvidia-dev-char.rules @@ -0,0 +1,2 @@ +# This will create /dev/char symlinks to all device nodes +ACTION=="add", DEVPATH=="/bus/pci/drivers/nvidia", RUN+="/usr/bin/nvidia-ctk system create-dev-char-symlinks --create-all" diff --git a/roles/nvidia/tasks/subtasks/docker.yml b/roles/nvidia/tasks/subtasks/docker.yml index da82ccedfc..43bf465f92 100644 --- a/roles/nvidia/tasks/subtasks/docker.yml +++ b/roles/nvidia/tasks/subtasks/docker.yml @@ -7,40 +7,59 @@ # GNU General Public License v3.0 # ######################################################################### --- -- name: Docker | Remove old official repository entry - ansible.builtin.apt_repository: - repo: "{{ item }}" - filename: "{{ nvidia_docker_runtime_apt_repo_file }}" - update_cache: true +- name: Docker | Remove old repository list + ansible.builtin.file: + path: "/etc/apt/sources.list.d/{{ nvidia_docker_runtime_apt_repo_file_old }}.list" state: absent - loop: "{{ nvidia_docker_runtime_apt_repo_url_list_old + nvidia_docker_runtime_apt_repo_url_list_old2 }}" -- name: Docker | Add 'nvidia-container-runtime' APT Repo Key +- name: Docker | Add 'nvidia-container-toolkit' APT Repo Key ansible.builtin.get_url: url: "{{ nvidia_docker_runtime_apt_key_url }}" dest: /etc/apt/trusted.gpg.d/nvidia.asc mode: "0644" + force: true register: result retries: "{{ '0' if (not continuous_integration) else '5' }}" delay: 10 until: result is succeeded -- name: Docker | Add 'nvidia-container-runtime' APT list +- name: Docker | Add 'nvidia-container-toolkit' APT list ansible.builtin.apt_repository: repo: "{{ item }}" filename: "{{ nvidia_docker_runtime_apt_repo_file }}" state: present mode: "0644" update_cache: true - loop: "{{ lookup('ansible.builtin.url', nvidia_docker_runtime_apt_repo_list_url, wantlist=True) | replace('deb', 'deb [signed-by=/etc/apt/trusted.gpg.d/nvidia.asc]') }}" + loop: "{{ 
lookup('ansible.builtin.url', nvidia_docker_runtime_apt_repo_list_url, wantlist=True) | map('regex_replace', '^deb https', 'deb [signed-by=/etc/apt/trusted.gpg.d/nvidia.asc] https') | list }}" when: not item.startswith('#') -- name: Docker | Install 'nvidia-container-runtime' +- name: Docker | Remove 'nvidia-container-runtime' + ansible.builtin.apt: + name: "nvidia-container-runtime" + state: absent + +- name: Docker | Remove 'nvidia-container-toolkit' ansible.builtin.apt: - name: "{{ nvidia_docker_runtime_apt_package }}" + name: "nvidia-container-toolkit" + state: absent + +- name: Docker | Install 'nvidia-container-toolkit' + ansible.builtin.apt: + name: "nvidia-container-toolkit" update_cache: true state: present +- name: Docker | Create '/dev/char' symlinks + ansible.builtin.command: nvidia-ctk system create-dev-char-symlinks --create-all + +- name: Docker | Import '71-nvidia-dev-char.rules' + ansible.builtin.copy: + src: 71-nvidia-dev-char.rules + dest: "/lib/udev/rules.d/71-nvidia-dev-char.rules" + owner: "root" + group: "root" + mode: "0644" + - name: Docker | Populate Service Facts ansible.builtin.service_facts: