Add support for enabling GPU access

Signed-off-by: YISH <mokeyish@hotmail.com>
2025-08-16 16:41:17 +02:00 · 2024-04-30 17:18:05 +08:00
parent 33d7d35a4d
commit 79865c2e13
4 changed files with 173 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -105,3 +105,6 @@ venv.bak/

 # mypy
 .mypy_cache/
+
+
+.vscode
--- a/examples/nvidia-smi/docker-compose.yaml
+++ b/examples/nvidia-smi/docker-compose.yaml
@@ -0,0 +1,11 @@
+services:
+  test:
+    image: nvidia/cuda:12.3.1-base-ubuntu20.04
+    command: nvidia-smi
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: 1
+              capabilities: [gpu]
--- a/podman_compose.py
+++ b/podman_compose.py
@@ -635,6 +635,62 @@ def get_secret_args(compose, cnt, secret, podman_is_building=False):


 def container_to_res_args(cnt, podman_args):
+    container_to_cpu_res_args(cnt, podman_args)  # cpus, cpu_shares, mem_limit, ...
+    container_to_gpu_res_args(cnt, podman_args)  # nvidia GPU device reservations
+
+
+def container_to_gpu_res_args(cnt, podman_args):
+    """Translate NVIDIA GPU reservations of a service into podman arguments.
+
+    Reads `deploy.resources.reservations.devices` from `cnt`. Entries whose
+    driver is not "nvidia" or whose capabilities lack "gpu" are ignored.
+    A non-empty `device_ids` list takes precedence over `count` (an int or
+    numeric string); when neither narrows the selection, all GPUs are
+    requested.
+
+    Args:
+        cnt: service definition dict from the compose file.
+        podman_args: argument list, extended in place with `--device` pairs
+            and, if any GPU matched, one `--security-opt=label=disable`.
+    """
+    # https://docs.docker.com/compose/gpu-support/
+    # https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/cdi-support.html
+
+    deploy = cnt.get("deploy", None) or {}
+    res = deploy.get("resources", None) or {}
+    reservations = res.get("reservations", None) or {}
+    # An explicit null in YAML yields None, so fall back like the lookups above.
+    devices = reservations.get("devices", None) or []
+    gpu_on = False
+    for device in devices:
+        capabilities = device.get("capabilities", None) or ()
+        if device.get("driver", None) != "nvidia" or "gpu" not in capabilities:
+            continue
+
+        count = device.get("count", "all")
+        device_ids = device.get("device_ids", None) or "all"
+        if device_ids != "all":
+            gpus = device_ids
+        elif count not in (None, "all"):
+            # A quoted or interpolated count arrives as a string, e.g. "2".
+            gpus = range(int(count))
+        else:
+            gpus = ("all",)
+
+        # One --device pair per selected GPU, named as in the NVIDIA link above.
+        for gpu in gpus:
+            podman_args.extend((
+                "--device",
+                f"nvidia.com/gpu={gpu}",
+            ))
+        gpu_on = True
+
+    # Appended once, however many device entries matched.
+    if gpu_on:
+        podman_args.append("--security-opt=label=disable")
+
+
+def container_to_cpu_res_args(cnt, podman_args):
    # v2: https://docs.docker.com/compose/compose-file/compose-file-v2/#cpu-and-other-resources
    # cpus, cpu_shares, mem_limit, mem_reservation
    cpus_limit_v2 = try_float(cnt.get("cpus", None), None)
--- a/pytests/test_container_to_args.py
+++ b/pytests/test_container_to_args.py
@@ -325,3 +325,106 @@ class TestContainerToArgs(unittest.IsolatedAsyncioTestCase):
                "busybox",
            ],
        )
+
+    async def test_gpu(self):  # GPU reservations -> --device / --security-opt args
+        c = create_compose_mock()
+
+        cnt = get_minimal_container()
+        cnt["command"] = ["nvidia-smi"]
+        cnt["deploy"] = {"resources": {"reservations": {"devices": [{}]}}}
+        # devices[0] is a placeholder that each scenario below overwrites.
+        # count: "all" -> a single nvidia.com/gpu=all device
+        cnt["deploy"]["resources"]["reservations"]["devices"][0] = {
+            "driver": "nvidia",
+            "count": "all",
+            "capabilities": ["gpu"],
+        }
+
+        args = await container_to_args(c, cnt)
+        self.assertEqual(
+            args,
+            [
+                "--name=project_name_service_name1",
+                "-d",
+                "--network=bridge",
+                "--network-alias=service_name",
+                "--device",
+                "nvidia.com/gpu=all",
+                "--security-opt=label=disable",
+                "busybox",
+                "nvidia-smi",
+            ],
+        )
+
+        # count: 2 -> GPU indices 0 and 1, one --device pair each
+        cnt["deploy"]["resources"]["reservations"]["devices"][0] = {
+            "driver": "nvidia",
+            "count": 2,
+            "capabilities": ["gpu"],
+        }
+
+        args = await container_to_args(c, cnt)
+        self.assertEqual(
+            args,
+            [
+                "--name=project_name_service_name1",
+                "-d",
+                "--network=bridge",
+                "--network-alias=service_name",
+                "--device",
+                "nvidia.com/gpu=0",
+                "--device",
+                "nvidia.com/gpu=1",
+                "--security-opt=label=disable",
+                "busybox",
+                "nvidia-smi",
+            ],
+        )
+
+        # device_ids: "all" (no count) -> same result as count: "all"
+        cnt["deploy"]["resources"]["reservations"]["devices"][0] = {
+            "driver": "nvidia",
+            "device_ids": "all",
+            "capabilities": ["gpu"],
+        }
+
+        args = await container_to_args(c, cnt)
+        self.assertEqual(
+            args,
+            [
+                "--name=project_name_service_name1",
+                "-d",
+                "--network=bridge",
+                "--network-alias=service_name",
+                "--device",
+                "nvidia.com/gpu=all",
+                "--security-opt=label=disable",
+                "busybox",
+                "nvidia-smi",
+            ],
+        )
+
+        # device_ids: [1, 3] -> exactly those ids, in the listed order
+        cnt["deploy"]["resources"]["reservations"]["devices"][0] = {
+            "driver": "nvidia",
+            "device_ids": [1, 3],
+            "capabilities": ["gpu"],
+        }
+
+        args = await container_to_args(c, cnt)
+        self.assertEqual(
+            args,
+            [
+                "--name=project_name_service_name1",
+                "-d",
+                "--network=bridge",
+                "--network-alias=service_name",
+                "--device",
+                "nvidia.com/gpu=1",
+                "--device",
+                "nvidia.com/gpu=3",
+                "--security-opt=label=disable",
+                "busybox",
+                "nvidia-smi",
+            ],
+        )