From 79865c2e13f38b92fc3c5c14a5270d2398950c9f Mon Sep 17 00:00:00 2001
From: YISH
Date: Tue, 30 Apr 2024 17:18:05 +0800
Subject: [PATCH] Add support for enabling GPU access

Signed-off-by: YISH
---
 .gitignore                              |   3 +
 examples/nvidia-smi/docker-compose.yaml |  11 +++
 podman_compose.py                       |  56 +++++++++++++
 pytests/test_container_to_args.py       | 103 ++++++++++++++++++++++++
 4 files changed, 173 insertions(+)
 create mode 100644 examples/nvidia-smi/docker-compose.yaml

diff --git a/.gitignore b/.gitignore
index 6d96ad9..a0d9bb8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -105,3 +105,6 @@ venv.bak/
 
 # mypy
 .mypy_cache/
+
+
+.vscode
diff --git a/examples/nvidia-smi/docker-compose.yaml b/examples/nvidia-smi/docker-compose.yaml
new file mode 100644
index 0000000..26c411f
--- /dev/null
+++ b/examples/nvidia-smi/docker-compose.yaml
@@ -0,0 +1,11 @@
+services:
+  test:
+    image: nvidia/cuda:12.3.1-base-ubuntu20.04
+    command: nvidia-smi
+    deploy:
+      resources:
+        reservations:
+          devices:
+          - driver: nvidia
+            count: 1
+            capabilities: [gpu]
diff --git a/podman_compose.py b/podman_compose.py
index 1f9b48e..4e893cf 100755
--- a/podman_compose.py
+++ b/podman_compose.py
@@ -635,6 +635,62 @@ def get_secret_args(compose, cnt, secret, podman_is_building=False):
 
 
 def container_to_res_args(cnt, podman_args):
+    container_to_cpu_res_args(cnt, podman_args)
+    container_to_gpu_res_args(cnt, podman_args)
+
+
+def container_to_gpu_res_args(cnt, podman_args):
+    # https://docs.docker.com/compose/gpu-support/
+    # https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/cdi-support.html
+
+    deploy = cnt.get("deploy", None) or {}
+    res = deploy.get("resources", None) or {}
+    reservations = res.get("reservations", None) or {}
+    devices = reservations.get("devices", [])
+    gpu_on = False
+    for device in devices:
+        driver = device.get("driver", None)
+        if driver is None:
+            continue
+
+        capabilities = device.get("capabilities", None)
+        if capabilities is None:
+            continue
+
+        if driver != "nvidia" or "gpu" not in capabilities:
+            continue
+
+        count = device.get("count", "all")
+        device_ids = device.get("device_ids", "all")
+        if device_ids != "all" and len(device_ids) > 0:
+            for device_id in device_ids:
+                podman_args.extend((
+                    "--device",
+                    f"nvidia.com/gpu={device_id}",
+                ))
+            gpu_on = True
+            continue
+
+        if count != "all":
+            for device_id in range(count):
+                podman_args.extend((
+                    "--device",
+                    f"nvidia.com/gpu={device_id}",
+                ))
+            gpu_on = True
+            continue
+
+        podman_args.extend((
+            "--device",
+            "nvidia.com/gpu=all",
+        ))
+        gpu_on = True
+
+    if gpu_on:
+        podman_args.append("--security-opt=label=disable")
+
+
+def container_to_cpu_res_args(cnt, podman_args):
     # v2: https://docs.docker.com/compose/compose-file/compose-file-v2/#cpu-and-other-resources
     # cpus, cpu_shares, mem_limit, mem_reservation
     cpus_limit_v2 = try_float(cnt.get("cpus", None), None)
diff --git a/pytests/test_container_to_args.py b/pytests/test_container_to_args.py
index 883b48a..f79062d 100644
--- a/pytests/test_container_to_args.py
+++ b/pytests/test_container_to_args.py
@@ -325,3 +325,106 @@ class TestContainerToArgs(unittest.IsolatedAsyncioTestCase):
                 "busybox",
             ],
         )
+
+    async def test_gpu(self):
+        c = create_compose_mock()
+
+        cnt = get_minimal_container()
+        cnt["command"] = ["nvidia-smi"]
+        cnt["deploy"] = {"resources": {"reservations": {"devices": [{}]}}}
+
+        # count: all
+        cnt["deploy"]["resources"]["reservations"]["devices"][0] = {
+            "driver": "nvidia",
+            "count": "all",
+            "capabilities": ["gpu"],
+        }
+
+        args = await container_to_args(c, cnt)
+        self.assertEqual(
+            args,
+            [
+                "--name=project_name_service_name1",
+                "-d",
+                "--network=bridge",
+                "--network-alias=service_name",
+                "--device",
+                "nvidia.com/gpu=all",
+                "--security-opt=label=disable",
+                "busybox",
+                "nvidia-smi",
+            ],
+        )
+
+        # count: 2
+        cnt["deploy"]["resources"]["reservations"]["devices"][0] = {
+            "driver": "nvidia",
+            "count": 2,
+            "capabilities": ["gpu"],
+        }
+
+        args = await container_to_args(c, cnt)
+        self.assertEqual(
+            args,
+            [
+                "--name=project_name_service_name1",
+                "-d",
+                "--network=bridge",
+                "--network-alias=service_name",
+                "--device",
+                "nvidia.com/gpu=0",
+                "--device",
+                "nvidia.com/gpu=1",
+                "--security-opt=label=disable",
+                "busybox",
+                "nvidia-smi",
+            ],
+        )
+
+        # device_ids: all
+        cnt["deploy"]["resources"]["reservations"]["devices"][0] = {
+            "driver": "nvidia",
+            "device_ids": "all",
+            "capabilities": ["gpu"],
+        }
+
+        args = await container_to_args(c, cnt)
+        self.assertEqual(
+            args,
+            [
+                "--name=project_name_service_name1",
+                "-d",
+                "--network=bridge",
+                "--network-alias=service_name",
+                "--device",
+                "nvidia.com/gpu=all",
+                "--security-opt=label=disable",
+                "busybox",
+                "nvidia-smi",
+            ],
+        )
+
+        # device_ids: 1,3
+        cnt["deploy"]["resources"]["reservations"]["devices"][0] = {
+            "driver": "nvidia",
+            "device_ids": [1, 3],
+            "capabilities": ["gpu"],
+        }
+
+        args = await container_to_args(c, cnt)
+        self.assertEqual(
+            args,
+            [
+                "--name=project_name_service_name1",
+                "-d",
+                "--network=bridge",
+                "--network-alias=service_name",
+                "--device",
+                "nvidia.com/gpu=1",
+                "--device",
+                "nvidia.com/gpu=3",
+                "--security-opt=label=disable",
+                "busybox",
+                "nvidia-smi",
+            ],
+        )