Add support for enabling GPU access

Signed-off-by: YISH <mokeyish@hotmail.com>
Author: YISH
Date:   2024-04-30 17:18:05 +08:00
Parent: 33d7d35a4d
Commit: 79865c2e13
4 changed files with 173 additions and 0 deletions

.gitignore

@@ -105,3 +105,6 @@ venv.bak/
# mypy
.mypy_cache/
.vscode

@@ -0,0 +1,11 @@
services:
  test:
    image: nvidia/cuda:12.3.1-base-ubuntu20.04
    command: nvidia-smi
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
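
For reference, with the translation added to podman_compose.py below, this reservation (driver nvidia, count: 1) should contribute the following arguments to the generated podman command line; the surrounding name/image/network flags depend on the project. Running the example also assumes a CDI spec for the NVIDIA devices exists on the host, per the NVIDIA container-toolkit CDI documentation linked in the code:

["--device", "nvidia.com/gpu=0", "--security-opt=label=disable"]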

@@ -635,6 +635,62 @@ def get_secret_args(compose, cnt, secret, podman_is_building=False):
def container_to_res_args(cnt, podman_args):
    container_to_cpu_res_args(cnt, podman_args)
    container_to_gpu_res_args(cnt, podman_args)


def container_to_gpu_res_args(cnt, podman_args):
    # https://docs.docker.com/compose/gpu-support/
    # https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/cdi-support.html
    deploy = cnt.get("deploy", None) or {}
    res = deploy.get("resources", None) or {}
    reservations = res.get("reservations", None) or {}
    devices = reservations.get("devices", [])
    gpu_on = False
    for device in devices:
        driver = device.get("driver", None)
        if driver is None:
            continue
        capabilities = device.get("capabilities", None)
        if capabilities is None:
            continue
        if driver != "nvidia" or "gpu" not in capabilities:
            continue

        count = device.get("count", "all")
        device_ids = device.get("device_ids", "all")
        if device_ids != "all" and len(device_ids) > 0:
            for device_id in device_ids:
                podman_args.extend((
                    "--device",
                    f"nvidia.com/gpu={device_id}",
                ))
            gpu_on = True
            continue

        if count != "all":
            for device_id in range(count):
                podman_args.extend((
                    "--device",
                    f"nvidia.com/gpu={device_id}",
                ))
            gpu_on = True
            continue

        podman_args.extend((
            "--device",
            "nvidia.com/gpu=all",
        ))
        gpu_on = True

    if gpu_on:
        podman_args.append("--security-opt=label=disable")


def container_to_cpu_res_args(cnt, podman_args):
    # v2: https://docs.docker.com/compose/compose-file/compose-file-v2/#cpu-and-other-resources
    # cpus, cpu_shares, mem_limit, mem_reservation
    cpus_limit_v2 = try_float(cnt.get("cpus", None), None)
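
As a quick illustration of the new mapping (a minimal sketch, assuming container_to_gpu_res_args is importable from the podman_compose module as added in this commit):

from podman_compose import container_to_gpu_res_args

# A compose-style container dict requesting two NVIDIA GPUs.
cnt = {
    "deploy": {
        "resources": {
            "reservations": {
                "devices": [{"driver": "nvidia", "count": 2, "capabilities": ["gpu"]}],
            },
        },
    },
}
podman_args = []
container_to_gpu_res_args(cnt, podman_args)
# podman_args == ["--device", "nvidia.com/gpu=0",
#                 "--device", "nvidia.com/gpu=1",
#                 "--security-opt=label=disable"]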

@@ -325,3 +325,106 @@ class TestContainerToArgs(unittest.IsolatedAsyncioTestCase):
"busybox",
],
)
async def test_gpu(self):
c = create_compose_mock()
cnt = get_minimal_container()
cnt["command"] = ["nvidia-smi"]
cnt["deploy"] = {"resources": {"reservations": {"devices": [{}]}}}
# count: all
cnt["deploy"]["resources"]["reservations"]["devices"][0] = {
"driver": "nvidia",
"count": "all",
"capabilities": ["gpu"],
}
args = await container_to_args(c, cnt)
self.assertEqual(
args,
[
"--name=project_name_service_name1",
"-d",
"--network=bridge",
"--network-alias=service_name",
"--device",
"nvidia.com/gpu=all",
"--security-opt=label=disable",
"busybox",
"nvidia-smi",
],
)
# count: 2
cnt["deploy"]["resources"]["reservations"]["devices"][0] = {
"driver": "nvidia",
"count": 2,
"capabilities": ["gpu"],
}
args = await container_to_args(c, cnt)
self.assertEqual(
args,
[
"--name=project_name_service_name1",
"-d",
"--network=bridge",
"--network-alias=service_name",
"--device",
"nvidia.com/gpu=0",
"--device",
"nvidia.com/gpu=1",
"--security-opt=label=disable",
"busybox",
"nvidia-smi",
],
)
# device_ids: all
cnt["deploy"]["resources"]["reservations"]["devices"][0] = {
"driver": "nvidia",
"device_ids": "all",
"capabilities": ["gpu"],
}
args = await container_to_args(c, cnt)
self.assertEqual(
args,
[
"--name=project_name_service_name1",
"-d",
"--network=bridge",
"--network-alias=service_name",
"--device",
"nvidia.com/gpu=all",
"--security-opt=label=disable",
"busybox",
"nvidia-smi",
],
)
# device_ids: 1,3
cnt["deploy"]["resources"]["reservations"]["devices"][0] = {
"driver": "nvidia",
"device_ids": [1, 3],
"capabilities": ["gpu"],
}
args = await container_to_args(c, cnt)
self.assertEqual(
args,
[
"--name=project_name_service_name1",
"-d",
"--network=bridge",
"--network-alias=service_name",
"--device",
"nvidia.com/gpu=1",
"--device",
"nvidia.com/gpu=3",
"--security-opt=label=disable",
"busybox",
"nvidia-smi",
],
)