diff --git a/conf/glances.conf b/conf/glances.conf
index 6a0fb795..bd615416 100644
--- a/conf/glances.conf
+++ b/conf/glances.conf
@@ -182,6 +182,10 @@ mem_critical=90
 temperature_careful=60
 temperature_warning=70
 temperature_critical=80
+# Ignore specific GPU devices by ID (comma-separated)
+# Use 'xpu-smi discovery' to see device IDs
+# Example: ignore device 2 (typically the iGPU)
+#ignore_devices=2
 
 [mem]
 disable=False
diff --git a/glances/plugins/gpu/__init__.py b/glances/plugins/gpu/__init__.py
index eb7ab9c1..170fb281 100644
--- a/glances/plugins/gpu/__init__.py
+++ b/glances/plugins/gpu/__init__.py
@@ -3,6 +3,7 @@
 #
 # Copyright (C) 2020 Kirby Banman
 # Copyright (C) 2024 Nicolas Hennion
+# Intel GPU support added 2025
 #
 # SPDX-License-Identifier: LGPL-3.0-only
 #
@@ -12,11 +13,13 @@
 Currently supported:
 - NVIDIA GPU (need pynvml lib)
 - AMD GPU (no lib needed)
+- Intel GPU (need xpu-smi or xpumcli; utilization requires root/sudo rights)
 """
 
 from glances.globals import to_fahrenheit
 from glances.logger import logger
 from glances.plugins.gpu.cards.amd import AmdGPU
+from glances.plugins.gpu.cards.intel import IntelGPU
 from glances.plugins.gpu.cards.nvidia import NvidiaGPU
 from glances.plugins.plugin.model import GlancesPluginModel
 
@@ -90,13 +93,24 @@
             logger.debug(f'AMD GPU initialization error: {e}')
             self.amd = None
 
+        # Init the Intel GPU API
+        try:
+            self.intel = IntelGPU(config=config)
+        except Exception as e:
+            logger.debug(f'Intel GPU initialization error: {e}')
+            self.intel = None
+
         # We want to display the stat in the curse interface
         self.display_curse = True
 
     def exit(self):
         """Overwrite the exit method to close the GPU API."""
-        self.nvidia.exit()
-        self.amd.exit()
+        if self.nvidia:
+            self.nvidia.exit()
+        if self.amd:
+            self.amd.exit()
+        if self.intel:
+            self.intel.exit()
 
         # Call the father exit method
         super().exit()
@@ -117,6 +131,8 @@
             stats.extend(self.nvidia.get_device_stats())
         if self.amd:
             stats.extend(self.amd.get_device_stats())
+        if self.intel:
+            stats.extend(self.intel.get_device_stats())
 
         # !!!
         # Uncomment to test on computer without Nvidia GPU
diff --git a/glances/plugins/gpu/cards/intel.py b/glances/plugins/gpu/cards/intel.py
new file mode 100644
index 00000000..391284cc
--- /dev/null
+++ b/glances/plugins/gpu/cards/intel.py
@@ -0,0 +1,281 @@
+#
+# This file is part of Glances.
+#
+# Intel GPU support added 2025
+#
+# SPDX-License-Identifier: LGPL-3.0-only
+#
+
+"""Intel GPU card for Glances."""
+
+import glob
+import json
+import os
+import re
+import subprocess
+import time
+from collections import defaultdict
+
+from glances.logger import logger
+
+
+class IntelGPU:
+    """Intel GPU card (Arc, Xe) using xpu-smi/xpumcli + fdinfo."""
+
+    def __init__(self, config=None):
+        """Init Intel GPU detection."""
+        self.ready = False
+        self.device_count = 0
+        self.pci_to_id = {}
+        self.fdinfo_last = {}
+        self.config = config
+
+        # Parse ignore_devices from config
+        self.ignore_devices = set()
+        if config:
+            try:
+                ignore_str = config.get_value('gpu', 'ignore_devices', default='')
+                if ignore_str:
+                    self.ignore_devices = {int(x.strip()) for x in ignore_str.split(',') if x.strip()}
+                    logger.debug(f"Intel GPU ignoring devices: {self.ignore_devices}")
+            except Exception as e:
+                logger.debug(f"Error parsing ignore_devices: {e}")
+
+        # Detect which command is available: xpu-smi (newer) or xpumcli (older)
+        self.xpumcli_cmd = None
+        for cmd in ['xpu-smi', 'xpumcli']:
+            try:
+                result = subprocess.run([cmd, '--version'], capture_output=True, timeout=2)
+                if result.returncode == 0:
+                    self.xpumcli_cmd = cmd
+                    logger.debug(f"Found Intel GPU tool: {cmd}")
+                    break
+            except (subprocess.TimeoutExpired, FileNotFoundError):
+                continue
+
+        if not self.xpumcli_cmd:
+            logger.debug("Neither xpu-smi nor xpumcli found, Intel GPU support disabled")
+            return
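+
+        # Illustrative shape of the 'discovery -j' payload parsed below.
+        # The keys are the ones this code reads; the values are made-up
+        # examples, not guaranteed output of any given xpu-smi version:
+        #
+        #   {"device_list": [{"device_id": 0,
+        #                     "device_name": "Intel(R) Arc(TM) A770 Graphics",
+        #                     "pci_bdf_address": "0000:03:00.0",
+        #                     "pci_device_id": "0x56a0"}]}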
+
+        # Get Intel GPU device list
+        try:
+            result = subprocess.run([self.xpumcli_cmd, 'discovery', '-j'], capture_output=True, text=True, timeout=5)
+
+            if result.returncode == 0:
+                data = json.loads(result.stdout)
+                devices = data.get('device_list', [])
+                self.device_count = len(devices)
+
+                # Build PCI address mapping
+                for device in devices:
+                    device_id = device.get('device_id')
+                    pci_addr = device.get('pci_bdf_address', '').lower()
+                    if device_id is not None and pci_addr:
+                        self.pci_to_id[pci_addr] = device_id
+
+                if self.device_count > 0:
+                    self.ready = True
+                    logger.debug(f"Intel GPU support initialized: {self.device_count} device(s)")
+        except Exception as e:
+            logger.debug(f"Intel GPU initialization failed: {e}")
+
+    def get_device_stats(self):
+        """Get Intel GPU stats.
+
+        Returns list of dicts with GPU stats.
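+
+        Example of one returned entry (keys taken from the code below;
+        values are illustrative):
+
+            {'key': 'gpu_id', 'gpu_id': 'intel0', 'name': 'Arc A770',
+             'mem': 42.0, 'proc': 12.5, 'temperature': 55, 'fan_speed': None}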
+ """ + if not self.ready: + return [] + + stats = [] + + # Get GPU utilization from fdinfo + intel_util = self._get_fdinfo_utilization() + + # Query each Intel GPU + for xpu_device_id in range(self.device_count): + # Skip ignored devices + if xpu_device_id in self.ignore_devices: + logger.debug(f"Skipping ignored Intel GPU device {xpu_device_id}") + continue + + try: + result = subprocess.run( + [self.xpumcli_cmd, 'stats', '-j', '-d', str(xpu_device_id)], + capture_output=True, + text=True, + timeout=5, + ) + + if result.returncode != 0: + continue + + data = json.loads(result.stdout) + device_level = data.get('device_level', []) + + device_stats = { + 'key': 'gpu_id', + 'gpu_id': f'intel{xpu_device_id}', + 'name': self._get_device_name(xpu_device_id), + 'mem': self._extract_metric(device_level, 'XPUM_STATS_MEMORY_UTILIZATION'), + 'proc': intel_util.get(xpu_device_id, 0.0), + 'temperature': self._extract_metric(device_level, 'XPUM_STATS_MEMORY_TEMPERATURE'), + 'fan_speed': None, # Not available + } + + # Set None for invalid values + if device_stats['mem'] <= 0: + device_stats['mem'] = None + if device_stats['temperature'] <= 0: + device_stats['temperature'] = None + + stats.append(device_stats) + + except Exception as e: + logger.debug(f"Error getting Intel GPU {xpu_device_id} stats: {e}") + continue + + return stats + + def _get_device_name(self, device_id): + """Get Intel GPU device name.""" + try: + result = subprocess.run([self.xpumcli_cmd, 'discovery', '-j'], capture_output=True, text=True, timeout=5) + + if result.returncode == 0: + data = json.loads(result.stdout) + for device in data.get('device_list', []): + if device.get('device_id') == device_id: + name = device.get('device_name', 'Intel GPU') + # Clean up name + name = name.replace('Intel(R) ', '').replace('Graphics ', '') + if not name or name == 'Graphics': + # Fallback to PCI device ID + pci_id = device.get('pci_device_id', '') + if pci_id.startswith('0x'): + name = pci_id[2:] + else: + name = 'Intel GPU' + return name + except Exception: + pass + return 'Intel GPU' + + def _extract_metric(self, device_level, metric_type): + """Extract metric from xpumcli device_level array.""" + for metric in device_level: + if metric.get('metrics_type') == metric_type: + return metric.get('value', 0) + return 0 + + def _get_fdinfo_utilization(self): + """Get Intel GPU utilization from /proc/*/fdinfo/*. + + Returns dict of {device_id: utilization_percent} + + Requires root/CAP_SYS_PTRACE to see all processes. 
+ """ + current_time = time.time() + + # Find all processes with GPU access + pci_to_cycles = defaultdict(lambda: defaultdict(int)) + + for proc_dir in glob.glob('/proc/[0-9]*'): + try: + fdinfo_dir = os.path.join(proc_dir, 'fdinfo') + + if not os.path.exists(fdinfo_dir): + continue + + for fdinfo_file in os.listdir(fdinfo_dir): + fdinfo_path = os.path.join(fdinfo_dir, fdinfo_file) + + try: + with open(fdinfo_path) as f: + content = f.read() + + # Check for Intel GPU + pci_match = re.search(r'drm-pdev:\s*([0-9a-f:\.]+)', content) + if not pci_match or 'drm-cycles-' not in content: + continue + + pci_addr = pci_match.group(1).lower() + + # Only process Intel GPUs we know about + if pci_addr not in self.pci_to_id: + continue + + # Parse engine cycles + cycles_pattern = re.compile(r'drm-cycles-(\w+):\s+(\d+)') + total_cycles_pattern = re.compile(r'drm-total-cycles-(\w+):\s+(\d+)') + + for match in cycles_pattern.finditer(content): + engine = match.group(1) + value = int(match.group(2)) + pci_to_cycles[pci_addr][engine + '_cycles'] += value + + for match in total_cycles_pattern.finditer(content): + engine = match.group(1) + value = int(match.group(2)) + key = engine + '_total' + pci_to_cycles[pci_addr][key] = max(pci_to_cycles[pci_addr][key], value) + + except (OSError, PermissionError): + continue + except (ValueError, OSError, PermissionError): + continue + + # Calculate utilization + utilization = {} + + for pci_addr, cycles in pci_to_cycles.items(): + device_id = self.pci_to_id.get(pci_addr) + if device_id is None: + continue + + # Check if we have a previous measurement + if pci_addr not in self.fdinfo_last: + # First measurement - store baseline + self.fdinfo_last[pci_addr] = {'cycles': dict(cycles), 'time': current_time} + utilization[device_id] = 0.0 + continue + + last = self.fdinfo_last[pci_addr] + time_delta = current_time - last['time'] + + if time_delta < 0.1: + utilization[device_id] = 0.0 + continue + + # Calculate max utilization across all engines + max_util = 0.0 + engines = {k.replace('_cycles', '').replace('_total', '') for k in cycles.keys()} + + for engine in engines: + curr_cycles = cycles.get(engine + '_cycles', 0) + curr_total = cycles.get(engine + '_total', 0) + prev_cycles = last['cycles'].get(engine + '_cycles', 0) + prev_total = last['cycles'].get(engine + '_total', 0) + + delta_cycles = curr_cycles - prev_cycles + delta_total = curr_total - prev_total + + if delta_total > 0: + engine_util = (delta_cycles / delta_total) * 100.0 + max_util = max(max_util, engine_util) + + utilization[device_id] = min(100.0, max(0.0, max_util)) + + # Update last measurement + self.fdinfo_last[pci_addr] = {'cycles': dict(cycles), 'time': current_time} + + # Fill in 0% for devices with no activity + for device_id in range(self.device_count): + if device_id not in utilization: + utilization[device_id] = 0.0 + + return utilization + + def exit(self): + """Cleanup (Intel GPU is stateless).""" + pass