Merge pull request #3291 from nicolargo/issue3290-cleanup

Fix: Glances stalling on broken NFS connections
This commit is contained in:
Nicolas Hennion 2025-09-28 14:17:24 +02:00 committed by GitHub
commit 6539979498
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 148 additions and 29 deletions

View File

@ -27,6 +27,7 @@ import weakref
from collections import OrderedDict from collections import OrderedDict
from configparser import ConfigParser, NoOptionError, NoSectionError from configparser import ConfigParser, NoOptionError, NoSectionError
from datetime import datetime from datetime import datetime
from multiprocessing import Process, Queue
from operator import itemgetter, methodcaller from operator import itemgetter, methodcaller
from statistics import mean from statistics import mean
from typing import Any, Optional, Union from typing import Any, Optional, Union
@ -584,3 +585,32 @@ def atoi(text):
def natural_keys(text): def natural_keys(text):
"""Return a text in a natural/human readable format.""" """Return a text in a natural/human readable format."""
return [atoi(c) for c in re.split(r'(\d+)', text)] return [atoi(c) for c in re.split(r'(\d+)', text)]
def exit_after(seconds, default=None):
    """Run the decorated function in a child process, bounded by a timeout.

    The decorated function is executed in a separate multiprocessing.Process
    and its return value is sent back through a multiprocessing.Queue.

    The wrapper returns 'default' (default: None) when:
    - no result is received within 'seconds' seconds (the child is then
      terminated, and killed if it does not terminate),
    - the decorated function raises an exception in the child process.

    Otherwise the value returned by the decorated function is returned.

    NOTE(review): 'handler' is a nested function, so the child process can
    presumably only be created with a start method that does not need to
    pickle the target (e.g. 'fork') - confirm for other platforms.
    """
    # Function-scope imports: keeps the block self-contained.
    import functools
    from queue import Empty

    def handler(q, func, args, kwargs):
        # Executed in the child process.
        try:
            q.put((True, func(*args, **kwargs)))
        except Exception:
            # Tell the parent immediately that no result will come, then
            # re-raise so the traceback is still reported by multiprocessing.
            q.put((False, None))
            raise

    def decorator(func):
        @functools.wraps(func)
        def wraps(*args, **kwargs):
            q = Queue()
            p = Process(target=handler, args=(q, func, args, kwargs))
            p.start()
            try:
                # Read the queue (with a timeout) *before* joining the child:
                # - an unbounded get() would block forever if the child died
                #   without queuing anything,
                # - joining first can deadlock when the queued result is large.
                succeeded, result = q.get(timeout=seconds)
            except Empty:
                succeeded, result = False, None
            p.join(timeout=0.1)
            if p.is_alive():
                p.terminate()
                p.join(timeout=0.1)
                if p.is_alive():
                    # Kill in case the process does not terminate
                    # Happens with cases like broken NFS connections
                    p.kill()
                    # Bounded join: reap the child if it is already gone
                    p.join(timeout=0.1)
            return result if succeeded else default

        return wraps

    return decorator

View File

@ -12,7 +12,7 @@ import operator
import psutil import psutil
from glances.globals import PermissionError, nativestr, u from glances.globals import PermissionError, exit_after, nativestr, u
from glances.logger import logger from glances.logger import logger
from glances.plugins.plugin.model import GlancesPluginModel from glances.plugins.plugin.model import GlancesPluginModel
@ -88,6 +88,17 @@ snmp_oid['esxi'] = snmp_oid['windows']
items_history_list = [{'name': 'percent', 'description': 'File system usage in percent', 'y_unit': '%'}] items_history_list = [{'name': 'percent', 'description': 'File system usage in percent', 'y_unit': '%'}]
@exit_after(1, default=None)
def get_disk_usage(fs):
    """Return the psutil disk usage of the mount point of the partition 'fs'.

    'fs' is an object with a 'mountpoint' attribute.
    Return None if psutil raises an OSError (logged at debug level).
    The function is decorated with exit_after(1, default=None).
    """
    usage = None
    try:
        usage = psutil.disk_usage(fs.mountpoint)
    except OSError:
        # Disk is ejected during the command
        logger.debug("Plugin - fs: PsUtil fetch failed")
    return usage
class FsPlugin(GlancesPluginModel): class FsPlugin(GlancesPluginModel):
"""Glances file system plugin. """Glances file system plugin.
@ -126,53 +137,59 @@ class FsPlugin(GlancesPluginModel):
return self.stats return self.stats
@GlancesPluginModel._exit_after(3)
def get_all_stats_partitions(self):
    """Return the partitions given by psutil.disk_partitions(all=True).

    Return an empty list if psutil raises a UnicodeDecodeError or a
    PermissionError (logged at debug level).
    The method is decorated with GlancesPluginModel._exit_after(3).
    """
    partitions = []
    try:
        partitions = psutil.disk_partitions(all=True)
    except (UnicodeDecodeError, PermissionError):
        logger.debug("Plugin - fs: PsUtil fetch failed")
    return partitions
@GlancesPluginModel._exit_after(3)
def get_disk_partitions(self, *, fetch_all: bool = False):
    """Return the partitions given by psutil.disk_partitions(all=fetch_all).

    Return an empty list if psutil raises a UnicodeDecodeError or a
    PermissionError (logged at debug level).
    The method is decorated with GlancesPluginModel._exit_after(3).
    """
    partitions = []
    try:
        # psutil disk_partitions is the source of the stats
        # With fetch_all=False, only physical devices are returned (e.g. hard disks, cd-rom drives, USB keys)
        # and all others are ignored (e.g. memory partitions such as /dev/shm)
        partitions = psutil.disk_partitions(all=fetch_all)
    except (UnicodeDecodeError, PermissionError):
        logger.debug("Plugin - fs: PsUtil fetch failed")
    return partitions
def update_local(self): def update_local(self):
"""Update the FS stats using the input method.""" """Update the FS stats using the input method."""
# Init new stats # Init new stats
stats = self.get_init_value() stats = self.get_init_value()
# Update stats using the standard system lib # Update stats using the standard system lib
fs_stat = self.get_disk_partitions()
# Grab the stats using the psutil disk_partitions
# If 'all'=False return physical devices only (e.g. hard disks, cd-rom drives, USB keys)
# and ignore all others (e.g. memory partitions such as /dev/shm)
try:
fs_stat = psutil.disk_partitions(all=False)
except (UnicodeDecodeError, PermissionError):
logger.debug("Plugin - fs: PsUtil fetch failed")
return stats
# Optional hack to allow logical mounts points (issue #448) # Optional hack to allow logical mounts points (issue #448)
allowed_fs_types = self.get_conf_value('allow') allowed_fs_types = self.get_conf_value('allow')
if allowed_fs_types: if allowed_fs_types:
# Avoid Psutil call unless mounts need to be allowed # Avoid Psutil call unless mounts need to be allowed
try: all_mounted_fs = self.get_disk_partitions(fetch_all=True)
all_mounted_fs = psutil.disk_partitions(all=True) # Discard duplicates (#2299) and add entries matching allowed fs types
except (UnicodeDecodeError, PermissionError): tracked_mnt_points = {f.mountpoint for f in fs_stat}
logger.debug("Plugin - fs: PsUtil extended fetch failed") for f in all_mounted_fs:
else: if (
# Discard duplicates (#2299) and add entries matching allowed fs types any(f.fstype.find(fs_type) >= 0 for fs_type in allowed_fs_types)
tracked_mnt_points = {f.mountpoint for f in fs_stat} and f.mountpoint not in tracked_mnt_points
for f in all_mounted_fs: ):
if ( fs_stat.append(f)
any(f.fstype.find(fs_type) >= 0 for fs_type in allowed_fs_types)
and f.mountpoint not in tracked_mnt_points
):
fs_stat.append(f)
# Loop over fs # Loop over fs
for fs in fs_stat: for fs in fs_stat:
# Hide the stats if the mount point is in the exclude list # Hide the stats if the mount point is in the exclude list
# # It avoids unnecessary call to PsUtil disk_usage # It avoids unnecessary call to PsUtil disk_usage
if not self.is_display_any(fs.mountpoint, fs.device): if not self.is_display_any(fs.mountpoint, fs.device):
continue continue
# Grab the disk usage # Grab the disk usage
try: fs_usage = get_disk_usage(fs)
fs_usage = psutil.disk_usage(fs.mountpoint) if fs_usage is None:
except OSError:
# Correct issue #346
# Disk is ejected during the command
continue continue
fs_current = { fs_current = {
'device_name': fs.device, 'device_name': fs.device,

View File

@ -14,6 +14,12 @@ I am your father...
import copy import copy
import re import re
import threading
try:
import thread
except ImportError:
import _thread as thread
from glances.actions import GlancesActions from glances.actions import GlancesActions
from glances.events_list import glances_events from glances.events_list import glances_events
@ -1212,7 +1218,25 @@ class GlancesPluginModel:
return wrapper return wrapper
def _exit_after(second):
    """Interrupt the decorated function if it takes more than 'second' seconds.

    A threading.Timer is started before the call and cancelled as soon as the
    call returns or raises. If the timer expires first, it calls
    thread.interrupt_main(), which simulates a SIGINT in the main thread:
    a KeyboardInterrupt is then raised in the main thread and is NOT caught
    here, so it propagates to the caller of the decorated function.

    NOTE(review): the interruption is presumably only delivered when the main
    thread executes Python code again, so a call stuck in a blocking system
    call may not be interrupted before that call returns - confirm.
    """
    # Function-scope import: keeps the block self-contained.
    import functools

    def outer(fn):
        @functools.wraps(fn)
        def inner(*args, **kwargs):
            # interrupt_main() accepts at most an optional signal number.
            # Passing the function name made the timer callback fail with a
            # TypeError in the timer thread, so the timeout never fired.
            timer = threading.Timer(second, thread.interrupt_main)
            timer.start()
            try:
                return fn(*args, **kwargs)
            finally:
                # Always disarm the timer, whatever the outcome of the call
                timer.cancel()

        return inner

    return outer
# Mandatory to call the decorator in child classes # Mandatory to call the decorator in child classes
_check_decorator = staticmethod(_check_decorator) _check_decorator = staticmethod(_check_decorator)
_log_result_decorator = staticmethod(_log_result_decorator) _log_result_decorator = staticmethod(_log_result_decorator)
_manage_rate = staticmethod(_manage_rate) _manage_rate = staticmethod(_manage_rate)
_exit_after = staticmethod(_exit_after)

View File

@ -0,0 +1,48 @@
import time
from multiprocessing import Process, Queue
import psutil
def exit_after(seconds, default=None):
    """Run the decorated function in a child process, bounded by a timeout.

    The decorated function is executed in a separate multiprocessing.Process
    and its return value is sent back through a multiprocessing.Queue.

    The wrapper returns 'default' (default: None) when:
    - no result is received within 'seconds' seconds (the child is then
      terminated, and killed if it does not terminate),
    - the decorated function raises an exception in the child process.

    Otherwise the value returned by the decorated function is returned.

    NOTE(review): 'handler' is a nested function, so the child process can
    presumably only be created with a start method that does not need to
    pickle the target (e.g. 'fork') - confirm for other platforms.
    """
    # Function-scope imports: keeps the block self-contained.
    import functools
    from queue import Empty

    def handler(q, func, args, kwargs):
        # Executed in the child process.
        try:
            q.put((True, func(*args, **kwargs)))
        except Exception:
            # Tell the parent immediately that no result will come, then
            # re-raise so the traceback is still reported by multiprocessing.
            q.put((False, None))
            raise

    def decorator(func):
        @functools.wraps(func)
        def wraps(*args, **kwargs):
            q = Queue()
            p = Process(target=handler, args=(q, func, args, kwargs))
            p.start()
            try:
                # Read the queue (with a timeout) *before* joining the child:
                # - an unbounded get() would block forever if the child died
                #   without queuing anything,
                # - joining first can deadlock when the queued result is large.
                succeeded, result = q.get(timeout=seconds)
            except Empty:
                succeeded, result = False, None
            p.join(timeout=0.1)
            if p.is_alive():
                p.terminate()
                p.join(timeout=0.1)
                if p.is_alive():
                    # Kill in case the process does not terminate
                    # Happens with cases like broken NFS connections
                    p.kill()
                    # Bounded join: reap the child if it is already gone
                    p.join(timeout=0.1)
            return result if succeeded else default

        return wraps

    return decorator
class Issue3290:
    """Minimal holder of a disk usage call guarded by exit_after."""

    @exit_after(1, default=None)
    def blocking_io_call(self, fs):
        """Return psutil.disk_usage(fs), or None if psutil raises an OSError.

        The method is decorated with exit_after(1, default=None).
        """
        usage = None
        try:
            usage = psutil.disk_usage(fs)
        except OSError:
            pass
        return usage
# Endless loop: print a timestamp and the result of the guarded disk usage
# call on a fixed path, then wait one second before the next iteration.
issue = Issue3290()
while True:
    # Timestamp is taken before the call, as in a single f-string evaluation
    stamp = time.time()
    usage = issue.blocking_io_call('/home/nicolargo/tmp/hang')
    print(f"{stamp} {usage}")
    time.sleep(1)