diff --git a/docs/GPU.md b/docs/GPU.md index bbf36d210..b89a7664f 100644 --- a/docs/GPU.md +++ b/docs/GPU.md @@ -37,9 +37,9 @@ Fri Apr 25 06:00:34 2025 Now, time to get the id of the gpu: ```bash -root@gpu-1:/repos/ocean/ocean-node# nvidia-smi --query-gpu=name,uuid --format=csv -name, uuid -NVIDIA GeForce GTX 1060 3GB, GPU-294c6802-bb2f-fedb-f9e0-a26b9142dd81 +root@gpu-1:/repos/ocean/ocean-node# nvidia-smi --query-gpu=name,uuid,driver_version,memory.total --format=csv +name, uuid, driver version, memory total +NVIDIA GeForce GTX 1060 3GB, GPU-294c6802-bb2f-fedb-f9e0-a26b9142dd81, 570.195.03, 3072 MiB ``` Now, we can define the gpu for node: @@ -56,7 +56,9 @@ Now, we can define the gpu for node: "DeviceIDs": ["GPU-294c6802-bb2f-fedb-f9e0-a26b9142dd81"], "Capabilities": [["gpu"]] } - } + }, + "driverVersion": "570.195.03", + "memoryTotal": "3072 MiB" } ``` @@ -80,7 +82,9 @@ Here is the full definition of DOCKER_COMPUTE_ENVIRONMENTS: "DeviceIDs": ["GPU-294c6802-bb2f-fedb-f9e0-a26b9142dd81"], "Capabilities": [["gpu"]] } - } + }, + "driverVersion": "570.195.03", + "memoryTotal": "3072 MiB" }, { "id": "disk", "total": 1 } ], @@ -93,7 +97,7 @@ Here is the full definition of DOCKER_COMPUTE_ENVIRONMENTS: "feeToken": "0x123", "prices": [ { "id": "cpu", "price": 1 }, - { "id": "nyGPU", "price": 3 } + { "id": "myGPU", "price": 3 } ] } ] @@ -161,6 +165,8 @@ root@gpu-1:/repos/ocean/ocean-node# curl http://localhost:8000/api/services/comp "Capabilities": [["gpu"]] } }, + "driverVersion": "570.195.03", + "memoryTotal": "3072 MiB", "max": 1, "min": 0, "inUse": 0 @@ -259,7 +265,9 @@ Then define DOCKER_COMPUTE_ENVIRONMENTS with "seccomp": "unconfined" } } - } + }, + "driverVersion": "26.2.2", + "memoryTotal": "16384 MiB" }, { "id": "disk", @@ -316,8 +324,7 @@ Then define DOCKER_COMPUTE_ENVIRONMENTS with aka ```bash -export DOCKER_COMPUTE_ENVIRONMENTS="[{\"socketPath\":\"/var/run/docker.sock\",\"resources\":[{\"id\":\"myGPU\",\"description\":\"AMD Radeon RX 9070 
XT\",\"type\":\"gpu\",\"total\":1,\"init\":{\"advanced\":{ -\"IpcMode\":\"host\",\"CapAdd\":[\"CAP_SYS_PTRACE\"],\"Devices\":[\"/dev/dxg\",\"/dev/dri/card0\"],\"Binds\":[\"/usr/lib/wsl/lib/libdxcore.so:/usr/lib/libdxcore.so\",\"/opt/rocm/lib/libhsa-runtime64.so.1:/opt/rocm/lib/libhsa-runtime64.so.1\"],\"SecurityOpt\":{\"seccomp\":\"unconfined\"}}}},{\"id\":\"disk\",\"total\":10}],\"storageExpiry\":604800,\"maxJobDuration\":3600,\"minJobDuration\":60,\"fees\":{\"1\":[{\"feeToken\":\"0x123\",\"prices\":[{\"id\":\"cpu\",\"price\":1},{\"id\":\"nyGPU\",\"price\":3}]}]},\"free\":{\"maxJobDuration\":60,\"minJobDuration\":10,\"maxJobs\":3,\"resources\":[{\"id\":\"cpu\",\"max\":1},{\"id\":\"ram\",\"max\":1},{\"id\":\"disk\",\"max\":1},{\"id\":\"myGPU\",\"max\":1}]}}]" +export DOCKER_COMPUTE_ENVIRONMENTS='[{"socketPath":"/var/run/docker.sock","resources":[{"id":"myGPU","description":"AMD Radeon RX 9070 XT","type":"gpu","total":1,"init":{"advanced":{"IpcMode":"host","ShmSize":8589934592,"CapAdd":["SYS_PTRACE"],"Devices":["/dev/dxg","/dev/dri/card0"],"Binds":["/usr/lib/wsl/lib/libdxcore.so:/usr/lib/libdxcore.so","/opt/rocm/lib/libhsa-runtime64.so.1:/opt/rocm/lib/libhsa-runtime64.so.1"],"SecurityOpt":{"seccomp":"unconfined"}}},"driverVersion":"26.2.2","memoryTotal":"16384 MiB"},{"id":"disk","total":1}],"storageExpiry":604800,"maxJobDuration":3600,"minJobDuration":60,"fees":{"1":[{"feeToken":"0x123","prices":[{"id":"cpu","price":1},{"id":"myGPU","price":3}]}]},"free":{"maxJobDuration":60,"minJobDuration":10,"maxJobs":3,"resources":[{"id":"cpu","max":1},{"id":"ram","max":1},{"id":"disk","max":1},{"id":"myGPU","max":1}]}}]' ``` you should have it in your compute envs: @@ -390,6 +397,8 @@ root@gpu-1:/repos/ocean/ocean-node# curl http://localhost:8000/api/services/comp } } }, + "driverVersion": "26.2.2", + "memoryTotal": "16384 MiB", "max": 1, "min": 0, "inUse": 0 @@ -541,7 +550,9 @@ Now, we can define the GPU for the node: "GroupAdd": ["video", "render"], "CapAdd": ["SYS_ADMIN"] } 
- } + }, + "driverVersion": "32.0.101.8531", + "memoryTotal": "16384 MiB" } ``` @@ -563,7 +574,9 @@ Here is the full definition of DOCKER_COMPUTE_ENVIRONMENTS with Intel GPU: "GroupAdd": ["video", "render"], "CapAdd": ["SYS_ADMIN"] } - } + }, + "driverVersion": "32.0.101.8531", + "memoryTotal": "16384 MiB" }, { "id": "disk", "total": 1 } ], @@ -644,6 +657,8 @@ root@gpu-1:/repos/ocean/ocean-node# curl http://localhost:8000/api/services/comp "CapAdd": ["SYS_ADMIN"] } }, + "driverVersion": "32.0.101.8531", + "memoryTotal": "16384 MiB", "max": 1, "min": 0, "inUse": 0 diff --git a/scripts/list_gpus.sh b/scripts/list_gpus.sh index dc3fdf1e1..2abe6781d 100755 --- a/scripts/list_gpus.sh +++ b/scripts/list_gpus.sh @@ -5,29 +5,151 @@ get_nvidia_gpus() { if command -v nvidia-smi &> /dev/null; then # Query nvidia-smi for GPU count, names, and UUIDs # We use csv format for easier parsing - nvidia-smi --query-gpu=name,uuid --format=csv,noheader | while IFS=, read -r name uuid; do + nvidia-smi --query-gpu=name,uuid,driver_version,memory.total --format=csv,noheader | while IFS=, read -r name uuid driver_version memory_total; do # Trim leading/trailing whitespace name=$(echo "$name" | xargs) uuid=$(echo "$uuid" | xargs) + driver_version=$(echo "$driver_version" | xargs) + memory_total=$(echo "$memory_total" | xargs) # Create a JSON object for this GPU # Note: We use the UUID as the ID locally, but it will be aggregated later jq -c -n \ --arg name "$name" \ --arg uuid "$uuid" \ + --arg driver_version "$driver_version" \ + --arg memory_total "$memory_total" \ '{ description: $name, + driverVersion: (if $driver_version != "" then $driver_version else null end), + memoryTotal: (if $memory_total != "" then $memory_total else null end), init: { deviceRequests: { Driver: "nvidia", Devices: [$uuid] } } - }' + } | del(.. | select(. 
== null))' done fi } +get_driver_version() { + local module="$1" + + # Try the version field first (present when built as a loadable module) + local ver + ver=$(modinfo "$module" 2>/dev/null | awk '/^version:/ {print $2; exit}') + if [ -n "$ver" ]; then + echo "$ver" + return + fi + + # If module exists but only has srcversion (built-in to kernel - integrated + # GPU), fall back to the kernel version as the effective driver version + if modinfo "$module" 2>/dev/null | grep -q "^srcversion:"; then + uname -r + return + fi +} + +get_intel_driver_version() { + # oneAPI / Level Zero runtime (Intel Arc) + local ver + + # Try clinfo first (works for both oneAPI and standard OpenCL) + ver=$(clinfo 2>/dev/null | awk '/Driver Version/ {print $NF; exit}') + [ -n "$ver" ] && { echo "$ver"; return; } + + # Try package manager + ver=$(dpkg -l 2>/dev/null | awk '/intel-level-zero-gpu/ {print $3; exit}') + [ -z "$ver" ] && ver=$(rpm -q --qf '%{VERSION}' intel-level-zero-gpu 2>/dev/null) + [ -n "$ver" ] && { echo "$ver"; return; } + + # Fall back to kernel module / kernel version + get_driver_version "xe" || get_driver_version "i915" +} + +get_amd_driver_version() { + local ver + + # ROCm version file (most reliable) + ver=$(cat /opt/rocm/.info/version 2>/dev/null | head -n1) + [ -n "$ver" ] && { echo "ROCm $ver"; return; } + + # Try rocm-smi + ver=$(rocm-smi --version 2>/dev/null | awk '/ROCm/ {print $NF; exit}') + [ -n "$ver" ] && { echo "$ver"; return; } + + # Try package manager + ver=$(dpkg -l 2>/dev/null | awk '/rocm-core/ {print $3; exit}') + [ -z "$ver" ] && ver=$(rpm -q --qf '%{VERSION}' rocm-core 2>/dev/null) + [ -n "$ver" ] && { echo "ROCm $ver"; return; } + + # Fall back to kernel module + get_driver_version "amdgpu" +} + +get_amd_vram() { + local card_path="$1" + local real_device_path + real_device_path=$(readlink -f "$card_path/device") + + # Try sysfs VRAM total (bytes) — most reliable on amdgpu + local vram_bytes="" + for f in \ + 
"$real_device_path/mem_info_vram_total" \ + "$card_path/device/mem_info_vram_total"; do + if [ -r "$f" ]; then + vram_bytes=$(cat "$f" 2>/dev/null) + break + fi + done + + if [ -n "$vram_bytes" ] && [ "$vram_bytes" -gt 0 ] 2>/dev/null; then + echo $(( vram_bytes / 1024 / 1024 )) MiB + fi +} + +get_intel_vram() { + local card_path="$1" + local real_device_path + real_device_path=$(readlink -f "$card_path/device") + + # Dedicated VRAM via sysfs (Intel Arc, etc.) + for f in \ + "$real_device_path/drm/$(basename $card_path)/gt/gt0/mem_info_vram_total" \ + "$real_device_path/mem_info_vram_total"; do + if [ -r "$f" ]; then + local bytes + bytes=$(cat "$f" 2>/dev/null) + if [ -n "$bytes" ] && [ "$bytes" -gt 0 ] 2>/dev/null; then + echo "$(( bytes / 1024 / 1024 )) MiB" + return + fi + fi + done + + # Integrated Intel: stolen/GTT memory from lspci, normalize to MiB + local slot + slot=$(basename "$real_device_path") + local raw + raw=$(lspci -s "$slot" -v 2>/dev/null | awk '/Memory at|prefetchable/ && /size=/ { + match($0, /size=([0-9]+[KMG])/, a); if (a[1]) { print a[1]; exit } + }') + + if [ -n "$raw" ]; then + local num unit + num=$(echo "$raw" | tr -d 'KMG') + unit=$(echo "$raw" | tr -d '0-9') + case "$unit" in + K) echo "$(( num / 1024 )) MiB" ;; + M) echo "${num} MiB" ;; + G) echo "$(( num * 1024 )) MiB" ;; + esac + fi +} + # Declare the associative array (hashmap) globally declare -A gpu_map @@ -55,8 +177,6 @@ map_pci_to_primary() { done } - - # Function to check for other GPUs (AMD, Intel, etc.) 
via lspci get_generic_gpus() { # Check if lspci is available @@ -131,6 +251,11 @@ process_pci_line() { case "$vendor_id_hex" in "1002") # AMD (0x1002) + local amd_driver_version + amd_driver_version=$(get_amd_driver_version) + local amd_memory_total + amd_memory_total=$(get_amd_vram "$card_path") + # Devices if [ -e "/dev/dxg" ]; then devices+=("/dev/dxg") @@ -153,6 +278,11 @@ process_pci_line() { ;; "8086") # Intel (0x8086) + local intel_driver_version + intel_driver_version=$(get_intel_driver_version) + local intel_memory_total + intel_memory_total=$(get_intel_vram "$card_path") + # Devices [ -n "$render_name" ] && devices+=("/dev/dri/$render_name") devices+=("$device_id") @@ -190,11 +320,18 @@ process_pci_line() { json_devices="[\"$device_id\"]" fi - + local driver_version="" + local memory_total="" + case "$vendor_id_hex" in + "1002") driver_version="$amd_driver_version"; memory_total="$amd_memory_total" ;; + "8086") driver_version="$intel_driver_version"; memory_total="$intel_memory_total" ;; + esac jq -c -n \ --arg desc "$description" \ --arg driver "$driver" \ --arg device_id "$device_id" \ + --arg driver_version "$driver_version" \ + --arg memory_total "$memory_total" \ --argjson dev "$json_devices" \ --argjson bind "$json_binds" \ --argjson cap "$json_cap" \ @@ -204,6 +341,8 @@ process_pci_line() { --argjson ipc "$ipc_mode" \ '{ description: $desc, + driverVersion: (if $driver_version != "" then $driver_version else null end), + memoryTotal: (if $memory_total != "" then $memory_total else null end), init: { deviceRequests: { Driver: (if $driver != "" then $driver else null end), @@ -226,25 +365,42 @@ get_all_gpus_json() { get_nvidia_gpus get_generic_gpus ) | jq -s ' - group_by(.description) | map({ - id: (.[0].description | ascii_downcase | gsub("[^a-z0-9]"; "-") | gsub("-+"; "-") | sub("^-"; "") | sub("-$"; "")), - description: .[0].description, - type: "gpu", - total: length, - init: { - deviceRequests: { - Driver: .[0].init.deviceRequests.Driver, - (if 
.[0].init.deviceRequests.Driver == "nvidia" then "DeviceIDs" else "Devices" end): (map(.init.deviceRequests.Devices[]?) | unique), - Capabilities: [["gpu"]] - }, - Binds: (map(.init.Binds[]?) | unique), - CapAdd: (map(.init.CapAdd[]?) | unique), - GroupAdd: (map(.init.GroupAdd[]?) | unique), - SecurityOpt: .[0].init.SecurityOpt, - ShmSize: .[0].init.ShmSize, - IpcMode: .[0].init.IpcMode + group_by(.description) | map( + { + id: (.[0].description | ascii_downcase | gsub("[^a-z0-9]"; "-") | gsub("-+"; "-") | sub("^-"; "") | sub("-$"; "")), + description: .[0].description, + type: "gpu", + total: length, + driverVersion: (.[0].driverVersion // null), + memoryTotal: (.[0].memoryTotal // null), + platform: (if .[0].init.deviceRequests.Driver == "amdgpu" then "amd" else .[0].init.deviceRequests.Driver end), + init: ( + if .[0].init.deviceRequests.Driver == "nvidia" then + { + deviceRequests: { + Driver: .[0].init.deviceRequests.Driver, + DeviceIDs: (map(.init.deviceRequests.Devices[]?) | unique), + Capabilities: [["gpu"]] + } + } + else + { + advanced: { + Driver: .[0].init.deviceRequests.Driver, + Devices: (map(.init.deviceRequests.Devices[]?) | unique), + Capabilities: [["gpu"]], + Binds: (map(.init.Binds[]?) | unique), + CapAdd: (map(.init.CapAdd[]?) | unique), + GroupAdd: (map(.init.GroupAdd[]?) | unique), + SecurityOpt: .[0].init.SecurityOpt, + ShmSize: .[0].init.ShmSize, + IpcMode: .[0].init.IpcMode + } | del(.. | select(. == null)) | del(.. | select(. == [])) + } + end + ) } | del(.. | select(. == null)) | del(.. | select(. == [])) - }) | map(if .init.deviceRequests.Driver == null then del(.init.deviceRequests.Driver) else . 
end) + ) ' } diff --git a/scripts/ocean-node-quickstart.sh b/scripts/ocean-node-quickstart.sh index 6e350a38e..5a6da078d 100755 --- a/scripts/ocean-node-quickstart.sh +++ b/scripts/ocean-node-quickstart.sh @@ -277,25 +277,127 @@ fi # Function to check for NVIDIA GPUs get_nvidia_gpus() { if command -v nvidia-smi &> /dev/null; then - nvidia-smi --query-gpu=name,uuid --format=csv,noheader | while IFS=, read -r name uuid; do + nvidia-smi --query-gpu=name,uuid,driver_version,memory.total --format=csv,noheader | while IFS=, read -r name uuid driver_version memory_total; do name=$(echo "$name" | xargs) uuid=$(echo "$uuid" | xargs) - jq -c -n \ + driver_version=$(echo "$driver_version" | xargs) + memory_total=$(echo "$memory_total" | xargs) + + jq -c -n \ --arg name "$name" \ --arg uuid "$uuid" \ + --arg driver_version "$driver_version" \ + --arg memory_total "$memory_total" \ '{ description: $name, + driverVersion: (if $driver_version != "" then $driver_version else null end), + memoryTotal: (if $memory_total != "" then $memory_total else null end), init: { deviceRequests: { Driver: "nvidia", Devices: [$uuid] } } - }' + } | del(.. | select(. 
== null))' done fi } +get_driver_version() { + local module="$1" + local ver + ver=$(modinfo "$module" 2>/dev/null | awk '/^version:/ {print $2; exit}') + if [ -n "$ver" ]; then + echo "$ver" + return + fi + + if modinfo "$module" 2>/dev/null | grep -q "^srcversion:"; then + uname -r + return + fi +} + +get_intel_driver_version() { + # oneAPI / Level Zero runtime (Intel Arc) + local ver + ver=$(clinfo 2>/dev/null | awk '/Driver Version/ {print $NF; exit}') + [ -n "$ver" ] && { echo "$ver"; return; } + ver=$(dpkg -l 2>/dev/null | awk '/intel-level-zero-gpu/ {print $3; exit}') + [ -z "$ver" ] && ver=$(rpm -q --qf '%{VERSION}' intel-level-zero-gpu 2>/dev/null) + [ -n "$ver" ] && { echo "$ver"; return; } + get_driver_version "xe" || get_driver_version "i915" +} + +get_amd_driver_version() { + local ver + ver=$(cat /opt/rocm/.info/version 2>/dev/null | head -n1) + [ -n "$ver" ] && { echo "ROCm $ver"; return; } + ver=$(rocm-smi --version 2>/dev/null | awk '/ROCm/ {print $NF; exit}') + [ -n "$ver" ] && { echo "$ver"; return; } + ver=$(dpkg -l 2>/dev/null | awk '/rocm-core/ {print $3; exit}') + [ -z "$ver" ] && ver=$(rpm -q --qf '%{VERSION}' rocm-core 2>/dev/null) + [ -n "$ver" ] && { echo "ROCm $ver"; return; } + get_driver_version "amdgpu" +} + +get_amd_vram() { + local card_path="$1" + local real_device_path + real_device_path=$(readlink -f "$card_path/device") + + local vram_bytes="" + for f in \ + "$real_device_path/mem_info_vram_total" \ + "$card_path/device/mem_info_vram_total"; do + if [ -r "$f" ]; then + vram_bytes=$(cat "$f" 2>/dev/null) + break + fi + done + + if [ -n "$vram_bytes" ] && [ "$vram_bytes" -gt 0 ] 2>/dev/null; then + echo $(( vram_bytes / 1024 / 1024 )) MiB + fi +} + +get_intel_vram() { + local card_path="$1" + local real_device_path + real_device_path=$(readlink -f "$card_path/device") + + for f in \ + "$real_device_path/drm/$(basename $card_path)/gt/gt0/mem_info_vram_total" \ + "$real_device_path/mem_info_vram_total"; do + if [ -r "$f" ]; then + 
local bytes + bytes=$(cat "$f" 2>/dev/null) + if [ -n "$bytes" ] && [ "$bytes" -gt 0 ] 2>/dev/null; then + echo "$(( bytes / 1024 / 1024 )) MiB" + return + fi + fi + done + + local slot + slot=$(basename "$real_device_path") + local raw + raw=$(lspci -s "$slot" -v 2>/dev/null | awk '/Memory at|prefetchable/ && /size=/ { + match($0, /size=([0-9]+[KMG])/, a); if (a[1]) { print a[1]; exit } + }') + + if [ -n "$raw" ]; then + local num unit + num=$(echo "$raw" | tr -d 'KMG') + unit=$(echo "$raw" | tr -d '0-9') + case "$unit" in + K) echo "$(( num / 1024 )) MiB" ;; + M) echo "${num} MiB" ;; + G) echo "$(( num * 1024 )) MiB" ;; + esac + fi +} + # Declare the associative array (hashmap) globally declare -A gpu_map @@ -309,40 +411,51 @@ map_pci_to_primary() { done } +# Function to check for other GPUs (AMD, Intel, etc.) via lspci +get_generic_gpus() { + if ! command -v lspci &> /dev/null; then + return + fi + + map_pci_to_primary + lspci -mm -n -d ::0300 | while read -r line; do process_pci_line "$line"; done + lspci -mm -n -d ::0302 | while read -r line; do process_pci_line "$line"; done +} + process_pci_line() { line="$1" - + slot=$(echo "$line" | awk '{print $1}') vendor_id_hex=$(echo "$line" | awk '{print $3}' | tr -d '"') - + if [[ "$vendor_id_hex" == "10de" ]] && command -v nvidia-smi &> /dev/null; then return fi - + full_info=$(lspci -s "$slot" -vmm) vendor_name=$(echo "$full_info" | grep "^Vendor:" | cut -f2-) device_name=$(echo "$full_info" | grep "^Device:" | cut -f2-) - + description="$vendor_name $device_name" - pci_id="0000:$slot" - + pci_id="0000:$slot" + driver="" - if [[ "$vendor_id_hex" == "1002" ]]; then + if [[ "$vendor_id_hex" == "1002" ]]; then # AMD driver="amdgpu" - elif [[ "$vendor_id_hex" = "8086" ]]; then + elif [[ "$vendor_id_hex" = "8086" ]]; then # Intel driver="intel" fi device_id="" card_path="" if [ -n "${gpu_map[$pci_id]}" ]; then - device_id="${gpu_map[$pci_id]}" + device_id="${gpu_map[$pci_id]}" # e.g. 
/dev/dri/card0 card_name=$(basename "$device_id") card_path="/sys/class/drm/$card_name" else device_id="${pci_id}" fi - + local devices=() local binds=() local cap_add=() @@ -355,38 +468,52 @@ process_pci_line() { local real_device_path=$(readlink -f "$card_path/device") local render_name="" if [ -d "$real_device_path/drm" ]; then - render_name=$(ls "$real_device_path/drm" | grep "^renderD" | head -n 1) + render_name=$(ls "$real_device_path/drm" | grep "^renderD" | head -n 1) fi case "$vendor_id_hex" in - "1002") # AMD + "1002") # AMD (0x1002) + local amd_driver_version + amd_driver_version=$(get_amd_driver_version) + local amd_memory_total + amd_memory_total=$(get_amd_vram "$card_path") + if [ -e "/dev/dxg" ]; then devices+=("/dev/dxg") else devices+=("/dev/kfd") fi [ -n "$render_name" ] && devices+=("/dev/dri/$render_name") - devices+=("$device_id") + devices+=("$device_id") # /dev/dri/cardX + [ -e "/opt/rocm/lib/libhsa-runtime64.so.1" ] && \ - binds+=("/opt/rocm/lib/libhsa-runtime64.so.1:/opt/rocm/lib/libhsa-runtime64.so.1") + binds+=("/opt/rocm/lib/libhsa-runtime64.so.1:/opt/rocm/lib/libhsa-runtime64.so.1") + cap_add+=("SYS_PTRACE") ipc_mode="\"host\"" shm_size="8589934592" security_opt='{"seccomp": "unconfined"}' ;; - "8086") # Intel - [ -n "$render_name" ] && devices+=("/dev/dri/$render_name") - devices+=("$device_id") - group_add+=("video" "render") - cap_add+=("SYS_ADMIN") + + "8086") # Intel (0x8086) + local intel_driver_version + intel_driver_version=$(get_intel_driver_version) + local intel_memory_total + intel_memory_total=$(get_intel_vram "$card_path") + + [ -n "$render_name" ] && devices+=("/dev/dri/$render_name") + devices+=("$device_id") + + group_add+=("video" "render") + cap_add+=("SYS_ADMIN") ;; esac else - if [[ "$vendor_id_hex" == "1002" ]] || [[ "$vendor_id_hex" == "8086" ]]; then - if [[ "$device_id" == /dev/* ]]; then - devices+=("$device_id") - fi - fi + if [[ "$vendor_id_hex" == "1002" ]] || [[ "$vendor_id_hex" == "8086" ]]; then + if [[ 
"$device_id" == /dev/* ]]; then + devices+=("$device_id") + fi + fi fi json_devices=$(printf '%s\n' "${devices[@]}" | jq -R . | jq -s . | jq 'map(select(length > 0))') @@ -395,13 +522,21 @@ process_pci_line() { json_group=$(printf '%s\n' "${group_add[@]}" | jq -R . | jq -s . | jq 'map(select(length > 0))') if [ "$(echo "$json_devices" | jq length)" -eq 0 ]; then - json_devices="[\"$device_id\"]" + json_devices="[\"$device_id\"]" fi + local driver_version="" + local memory_total="" + case "$vendor_id_hex" in + "1002") driver_version="$amd_driver_version"; memory_total="$amd_memory_total" ;; + "8086") driver_version="$intel_driver_version"; memory_total="$intel_memory_total" ;; + esac jq -c -n \ --arg desc "$description" \ --arg driver "$driver" \ --arg device_id "$device_id" \ + --arg driver_version "$driver_version" \ + --arg memory_total "$memory_total" \ --argjson dev "$json_devices" \ --argjson bind "$json_binds" \ --argjson cap "$json_cap" \ @@ -411,6 +546,8 @@ process_pci_line() { --argjson ipc "$ipc_mode" \ '{ description: $desc, + driverVersion: (if $driver_version != "" then $driver_version else null end), + memoryTotal: (if $memory_total != "" then $memory_total else null end), init: { deviceRequests: { Driver: (if $driver != "" then $driver else null end), @@ -427,41 +564,48 @@ process_pci_line() { } | del(.. | select(. == null)) | del(.. | select(. == []))' } -# Function to check for other GPUs (AMD, Intel, etc.) via lspci -get_generic_gpus() { - if ! 
command -v lspci &> /dev/null; then - return - fi - map_pci_to_primary - lspci -mm -n -d ::0300 | while read -r line; do process_pci_line "$line"; done - lspci -mm -n -d ::0302 | while read -r line; do process_pci_line "$line"; done -} - # Function to get all GPUs in JSON array format get_all_gpus_json() { ( get_nvidia_gpus get_generic_gpus ) | jq -s ' - group_by(.description) | map({ - id: (.[0].description | ascii_downcase | gsub("[^a-z0-9]"; "-") | gsub("-+"; "-") | sub("^-"; "") | sub("-$"; "")), - description: .[0].description, - type: "gpu", - total: length, - init: { - deviceRequests: { - Driver: .[0].init.deviceRequests.Driver, - (if .[0].init.deviceRequests.Driver == "nvidia" then "DeviceIDs" else "Devices" end): (map(.init.deviceRequests.Devices[]?) | unique), - Capabilities: [["gpu"]] - }, - Binds: (map(.init.Binds[]?) | unique), - CapAdd: (map(.init.CapAdd[]?) | unique), - GroupAdd: (map(.init.GroupAdd[]?) | unique), - SecurityOpt: .[0].init.SecurityOpt, - ShmSize: .[0].init.ShmSize, - IpcMode: .[0].init.IpcMode + group_by(.description) | map( + { + id: (.[0].description | ascii_downcase | gsub("[^a-z0-9]"; "-") | gsub("-+"; "-") | sub("^-"; "") | sub("-$"; "")), + description: .[0].description, + type: "gpu", + total: length, + driverVersion: (.[0].driverVersion // null), + memoryTotal: (.[0].memoryTotal // null), + platform: (if .[0].init.deviceRequests.Driver == "amdgpu" then "amd" else .[0].init.deviceRequests.Driver end), + init: ( + if .[0].init.deviceRequests.Driver == "nvidia" then + { + deviceRequests: { + Driver: .[0].init.deviceRequests.Driver, + DeviceIDs: (map(.init.deviceRequests.Devices[]?) | unique), + Capabilities: [["gpu"]] + } + } + else + { + advanced: { + Driver: .[0].init.deviceRequests.Driver, + Devices: (map(.init.deviceRequests.Devices[]?) | unique), + Capabilities: [["gpu"]], + Binds: (map(.init.Binds[]?) | unique), + CapAdd: (map(.init.CapAdd[]?) | unique), + GroupAdd: (map(.init.GroupAdd[]?) 
| unique), + SecurityOpt: .[0].init.SecurityOpt, + ShmSize: .[0].init.ShmSize, + IpcMode: .[0].init.IpcMode + } | del(.. | select(. == null)) | del(.. | select(. == [])) + } + end + ) } | del(.. | select(. == null)) | del(.. | select(. == [])) - }) | map(if .init.deviceRequests.Driver == null then del(.init.deviceRequests.Driver) else . end) + ) ' } diff --git a/src/@types/C2D/C2D.ts b/src/@types/C2D/C2D.ts index 5b52751fd..fbf9ef6cc 100644 --- a/src/@types/C2D/C2D.ts +++ b/src/@types/C2D/C2D.ts @@ -56,6 +56,12 @@ export interface ComputeResource { min: number // min number of resource needed for a job max: number // max number of resource for a job inUse?: number // for display purposes + driverVersion?: string + memoryTotal?: string + /** + * `nvidia` | `amd` | `intel` + */ + platform?: string init?: dockerHwInit } export interface ComputeResourceRequest { diff --git a/src/utils/config/schemas.ts b/src/utils/config/schemas.ts index 15671d9c7..7246e8de9 100644 --- a/src/utils/config/schemas.ts +++ b/src/utils/config/schemas.ts @@ -118,7 +118,10 @@ export const ComputeResourceSchema = z.object({ min: z.number().optional(), max: z.number().optional(), inUse: z.number().optional(), - init: z.any().optional() + init: z.any().optional(), + platform: z.string().optional(), + memoryTotal: z.string().optional(), + driverVersion: z.string().optional() }) export const ComputeResourcesPricingInfoSchema = z.object({