Changes from all commits · 33 commits
12 changes: 12 additions & 0 deletions CONTRIBUTING.md
@@ -363,6 +363,18 @@ cp /data/tests/test_package_integrity.py .
pytest test_package_integrity.py
```

### Contribute to a fork branch

When a user opens a PR from a fork, maintainers are allowed to push to the fork branch.

If you want to do so, run the following:

```bash
git remote add <user_name> https://github.com/<user_name>/codecarbon.git
git fetch <user_name> <git_branch>
git checkout -b <git_branch> <user_name>/<git_branch>
```
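
Once your changes are committed on that branch, you can push them back to the contributor's fork (a sketch, reusing the `<user_name>` remote and `<git_branch>` placeholders from the commands above):

```bash
# Push your commits to the contributor's fork branch; the open PR picks them up automatically.
git push <user_name> <git_branch>
```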

<!-- TOC --><a name="api-and-dashboard"></a>
## API and Dashboard

4 changes: 2 additions & 2 deletions codecarbon/core/cpu.py
@@ -17,7 +17,7 @@

from codecarbon.core.rapl import RAPLFile
from codecarbon.core.units import Time
from codecarbon.core.util import detect_cpu_model
from codecarbon.core.util import count_cpus, detect_cpu_model
from codecarbon.external.logger import logger
from codecarbon.input import DataSource

@@ -1001,7 +1001,7 @@ def _main(self) -> Tuple[str, int]:
)
if is_psutil_available():
# Count threads of the CPU
threads = psutil.cpu_count(logical=True)
threads = count_cpus()
estimated_tdp = threads * DEFAULT_POWER_PER_CORE
logger.warning(
f"We will use the default power consumption of {DEFAULT_POWER_PER_CORE} W per thread for your {threads} CPU, so {estimated_tdp}W."
293 changes: 75 additions & 218 deletions codecarbon/core/gpu.py
@@ -1,223 +1,85 @@
from dataclasses import dataclass, field
from typing import Any, Dict, List, Union
from typing import List

import pynvml

from codecarbon.core.units import Energy, Power, Time
from codecarbon.core import gpu_amd, gpu_nvidia
from codecarbon.core.gpu_device import GPUDevice
from codecarbon.core.units import Time
from codecarbon.external.logger import logger

AMDSMI_AVAILABLE = gpu_amd.AMDSMI_AVAILABLE
PYNVML_AVAILABLE = gpu_nvidia.PYNVML_AVAILABLE

@dataclass
class GPUDevice:
"""
Represents a GPU device with associated energy and power metrics.

Attributes:
handle (any): An identifier for the GPU device.
gpu_index (int): The index of the GPU device in the system.
energy_delta (Energy): The amount of energy consumed by the GPU device
since the last measurement, expressed in kilowatt-hours (kWh).
Defaults to an initial value of 0 kWh.
power (Power): The current power consumption of the GPU device,
measured in watts (W). Defaults to an initial value of 0 W.
last_energy (Energy): The last recorded energy reading for the GPU
device, expressed in kilowatt-hours (kWh). This is used to
calculate `energy_delta`. Defaults to an initial value of 0 kWh.
"""

handle: any
gpu_index: int
# Energy consumed in kWh
energy_delta: Energy = field(default_factory=lambda: Energy(0))
# Power based on reading
power: Power = field(default_factory=lambda: Power(0))
# Last energy reading in kWh
last_energy: Energy = field(default_factory=lambda: Energy(0))

def start(self) -> None:
self.last_energy = self._get_energy_kwh()

def __post_init__(self) -> None:
self.last_energy = self._get_energy_kwh()
self._init_static_details()

def _get_energy_kwh(self) -> Energy:
total_energy_consumption = self._get_total_energy_consumption()
if total_energy_consumption is None:
return self.last_energy
return Energy.from_millijoules(total_energy_consumption)

def delta(self, duration: Time) -> dict:
"""
Compute the energy/power used since last call.
"""
new_last_energy = energy = self._get_energy_kwh()
self.power = self.power.from_energies_and_delay(
energy, self.last_energy, duration
)
self.energy_delta = energy - self.last_energy
self.last_energy = new_last_energy
return {
"name": self._gpu_name,
"uuid": self._uuid,
"delta_energy_consumption": self.energy_delta,
"power_usage": self.power,
}

def get_static_details(self) -> Dict[str, Any]:
return {
"name": self._gpu_name,
"uuid": self._uuid,
"total_memory": self._total_memory,
"power_limit": self._power_limit,
"gpu_index": self.gpu_index,
}

def _init_static_details(self) -> None:
self._gpu_name = self._get_gpu_name()
self._uuid = self._get_uuid()
self._power_limit = self._get_power_limit()
# Get the memory
memory = self._get_memory_info()
self._total_memory = memory.total

def get_gpu_details(self) -> Dict[str, Any]:
# Memory
memory = self._get_memory_info()

device_details = {
"name": self._gpu_name,
"uuid": self._uuid,
"free_memory": memory.free,
"total_memory": memory.total,
"used_memory": memory.used,
"temperature": self._get_temperature(),
"power_usage": self._get_power_usage(),
"power_limit": self._power_limit,
"total_energy_consumption": self._get_total_energy_consumption(),
"gpu_utilization": self._get_gpu_utilization(),
"compute_mode": self._get_compute_mode(),
"compute_processes": self._get_compute_processes(),
"graphics_processes": self._get_graphics_processes(),
}
return device_details

def _to_utf8(self, str_or_bytes) -> Any:
if hasattr(str_or_bytes, "decode"):
return str_or_bytes.decode("utf-8", errors="replace")

return str_or_bytes

def _get_total_energy_consumption(self) -> int:
"""Returns total energy consumption for this GPU in millijoules (mJ) since the driver was last reloaded
https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g732ab899b5bd18ac4bfb93c02de4900a
"""
try:
return pynvml.nvmlDeviceGetTotalEnergyConsumption(self.handle)
except pynvml.NVMLError:
logger.warning(
"Failed to retrieve gpu total energy consumption", exc_info=True
)
return None

def _get_gpu_name(self) -> Any:
"""Returns the name of the GPU device
https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1ga5361803e044c6fdf3b08523fb6d1481
"""
try:
name = pynvml.nvmlDeviceGetName(self.handle)
return self._to_utf8(name)
except UnicodeDecodeError:
return "Unknown GPU"

def _get_uuid(self) -> Any:
"""Returns the globally unique GPU device UUID
https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g72710fb20f30f0c2725ce31579832654
"""
uuid = pynvml.nvmlDeviceGetUUID(self.handle)
return self._to_utf8(uuid)

def _get_memory_info(self):
"""Returns memory info in bytes
https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g2dfeb1db82aa1de91aa6edf941c85ca8
"""
try:
return pynvml.nvmlDeviceGetMemoryInfo(self.handle)
except pynvml.NVMLError_NotSupported:
# error thrown for the NVIDIA Blackwell GPU of DGX Spark, due to memory sharing -> return defaults instead
return pynvml.c_nvmlMemory_t(-1, -1, -1)

def _get_temperature(self) -> int:
"""Returns degrees in the Celsius scale
https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g92d1c5182a14dd4be7090e3c1480b121
"""
return pynvml.nvmlDeviceGetTemperature(self.handle, pynvml.NVML_TEMPERATURE_GPU)

def _get_power_usage(self) -> int:
"""Returns power usage in milliwatts
https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g7ef7dff0ff14238d08a19ad7fb23fc87
"""
return pynvml.nvmlDeviceGetPowerUsage(self.handle)

def _get_power_limit(self) -> Union[int, None]:
"""Returns max power usage in milliwatts
https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g263b5bf552d5ec7fcd29a088264d10ad
"""
try:
return pynvml.nvmlDeviceGetEnforcedPowerLimit(self.handle)
except Exception:
return None
AMDGPUDevice = gpu_amd.AMDGPUDevice
NvidiaGPUDevice = gpu_nvidia.NvidiaGPUDevice
is_rocm_system = gpu_amd.is_rocm_system
is_nvidia_system = gpu_nvidia.is_nvidia_system

def _get_gpu_utilization(self):
"""Returns the % of utilization of the kernels during the last sample
https://docs.nvidia.com/deploy/nvml-api/structnvmlUtilization__t.html#structnvmlUtilization__t
"""
return pynvml.nvmlDeviceGetUtilizationRates(self.handle).gpu

def _get_compute_mode(self) -> int:
"""Returns the compute mode of the GPU
https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceEnumvs.html#group__nvmlDeviceEnumvs_1gbed1b88f2e3ba39070d31d1db4340233
"""
return pynvml.nvmlDeviceGetComputeMode(self.handle)

def _get_compute_processes(self) -> List:
"""Returns the list of processes ids having a compute context on the
device with the memory used
https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g46ceaea624d5c96e098e03c453419d68
"""
try:
processes = pynvml.nvmlDeviceGetComputeRunningProcesses(self.handle)

return [{"pid": p.pid, "used_memory": p.usedGpuMemory} for p in processes]
except pynvml.NVMLError:
return []

def _get_graphics_processes(self) -> List:
"""Returns the list of processes ids having a graphics context on the
device with the memory used
https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g7eacf7fa7ba4f4485d166736bf31195e
"""
try:
processes = pynvml.nvmlDeviceGetGraphicsRunningProcesses(self.handle)

return [{"pid": p.pid, "used_memory": p.usedGpuMemory} for p in processes]
except pynvml.NVMLError:
return []
# Backward-compatible module attributes
amdsmi = gpu_amd.amdsmi
pynvml = gpu_nvidia.pynvml


class AllGPUDevices:
device_count: int
devices: List[GPUDevice]

def __init__(self) -> None:
if is_gpu_details_available():
gpu_details_available = is_gpu_details_available()
if gpu_details_available:
logger.debug("GPU available. Starting setup")
self.device_count = pynvml.nvmlDeviceGetCount()
else:
logger.error("There is no GPU available")
self.device_count = 0
self.devices = []
for i in range(self.device_count):
handle = pynvml.nvmlDeviceGetHandleByIndex(i)
gpu_device = GPUDevice(handle=handle, gpu_index=i)
self.devices.append(gpu_device)

if PYNVML_AVAILABLE:
logger.debug("PyNVML available. Starting setup")
gpu_nvidia.pynvml.nvmlInit()
nvidia_devices_count = gpu_nvidia.pynvml.nvmlDeviceGetCount()
for i in range(nvidia_devices_count):
handle = gpu_nvidia.pynvml.nvmlDeviceGetHandleByIndex(i)
nvidia_gpu_device = NvidiaGPUDevice(handle=handle, gpu_index=i)
self.devices.append(nvidia_gpu_device)

if AMDSMI_AVAILABLE:
logger.debug("AMDSMI available. Starting setup")
try:
gpu_amd.amdsmi.amdsmi_init()
amd_devices_handles = gpu_amd.amdsmi.amdsmi_get_processor_handles()
if len(amd_devices_handles) == 0:
print(
Review comment (Member): log instead of print ?
"No AMD GPUs foundon machine with amdsmi_get_processor_handles() !"
)
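
A minimal sketch of what the reviewer's suggestion could look like, assuming the module-level `logger` imported at the top of `gpu.py` and a warning log level (the level is an assumption, not part of the diff):

```python
# Reviewer suggestion (sketch): report the missing AMD GPUs through the logger instead of print.
from codecarbon.external.logger import logger  # already imported at the top of gpu.py

logger.warning(
    "No AMD GPUs found on machine with amdsmi_get_processor_handles()!"
)
```
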
else:
for i, handle in enumerate(amd_devices_handles):
# Try to get the actual device index from BDF (Bus/Device/Function)
# If this fails, fall back to enumeration index
try:
bdf_info = gpu_amd.amdsmi.amdsmi_get_gpu_device_bdf(handle)
# BDF typically contains domain, bus, device, function
# The device portion often corresponds to the GPU index
# For now, we'll use the enumeration index but log the BDF
logger.debug(
f"Found AMD GPU device with handle {handle}, enum_index {i}, BDF {bdf_info}: {gpu_amd.amdsmi.amdsmi_get_gpu_device_uuid(handle)}"
)
# Use enumerate index for now - this will be the index in the filtered list
gpu_index = i
except Exception:
logger.debug(
f"Found AMD GPU device with handle {handle} and index {i} : {gpu_amd.amdsmi.amdsmi_get_gpu_device_uuid(handle)}"
)
gpu_index = i

amd_gpu_device = AMDGPUDevice(
handle=handle, gpu_index=gpu_index
)
self.devices.append(amd_gpu_device)
except gpu_amd.amdsmi.AmdSmiException as e:
logger.warning(f"Failed to initialize AMDSMI: {e}", exc_info=True)
self.device_count = len(self.devices)

def start(self) -> None:
for device in self.devices:
if hasattr(device, "start"):
device.start()

def get_gpu_static_info(self) -> List:
"""Get all GPUs static information.
@@ -239,7 +101,7 @@ def get_gpu_static_info(self) -> List:
devices_static_info.append(gpu_device.get_static_details())
return devices_static_info

except pynvml.NVMLError:
except Exception:
logger.warning("Failed to retrieve gpu static info", exc_info=True)
return []

@@ -267,11 +129,11 @@ def get_gpu_details(self) -> List:
try:
devices_info = []
for i in range(self.device_count):
gpu_device: GPUDevice = self.devices[i]
gpu_device = self.devices[i]
devices_info.append(gpu_device.get_gpu_details())
return devices_info

except pynvml.NVMLError:
except Exception:
logger.warning("Failed to retrieve gpu information", exc_info=True)
return []

@@ -290,20 +152,15 @@ def get_delta(self, last_duration: Time) -> List:
try:
devices_info = []
for i in range(self.device_count):
gpu_device: GPUDevice = self.devices[i]
gpu_device = self.devices[i]
devices_info.append(gpu_device.delta(last_duration))
return devices_info

except pynvml.NVMLError:
except Exception:
logger.warning("Failed to retrieve gpu information", exc_info=True)
return []


def is_gpu_details_available() -> bool:
"""Returns True if the GPU details are available."""
try:
pynvml.nvmlInit()
return True

except pynvml.NVMLError:
return False
return PYNVML_AVAILABLE or AMDSMI_AVAILABLE