mirror of https://github.com/nicolargo/glances.git
Event is now a Dataclass
This commit is contained in:
parent
272dc23443
commit
77167b8bce
|
|
@ -79,9 +79,12 @@ Requirements
|
|||
============
|
||||
|
||||
- ``python>=3.8`` (use Glances 3.4.x for lower Python version)
|
||||
- ``psutil>=5.3.0`` (better with latest version)
|
||||
- ``psutil`` (better with latest version)
|
||||
- ``defusedxml`` (in order to monkey patch xmlrpc)
|
||||
- ``packaging`` (for the version comparison)
|
||||
- ``ujson`` (an optimized alternative to the standard json module)
|
||||
- ``pytz`` (for the timezone support)
|
||||
- ``pydantic`` (for the data validation support)
|
||||
|
||||
*Note for Python 2 users*
|
||||
|
||||
|
|
|
|||
|
|
@ -1,410 +0,0 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
#
|
||||
# This file is part of Glances.
|
||||
#
|
||||
# SPDX-FileCopyrightText: 2022 Nicolas Hennion <nicolas@nicolargo.com>
|
||||
#
|
||||
# SPDX-License-Identifier: LGPL-3.0-only
|
||||
#
|
||||
|
||||
"""Manage Glances events (previously Glances logs in Glances < 3.1)."""
|
||||
|
||||
import time
|
||||
from datetime import datetime
|
||||
|
||||
from glances.logger import logger
|
||||
from glances.processes import glances_processes, sort_stats
|
||||
from glances.thresholds import glances_thresholds
|
||||
|
||||
# Static decision tree for the global alert message
|
||||
# - msg: Message to be displayed (result of the decision tree)
|
||||
# - thresholds: a list of stats to take into account
|
||||
# - thresholds_min: minimal value of the thresholds sum
|
||||
# - 0: OK
|
||||
# - 1: CAREFUL
|
||||
# - 2: WARNING
|
||||
# - 3: CRITICAL
|
||||
tree = [
|
||||
{'msg': 'No warning or critical alert detected', 'thresholds': [], 'thresholds_min': 0},
|
||||
{'msg': 'High CPU user mode', 'thresholds': ['cpu_user'], 'thresholds_min': 2},
|
||||
{'msg': 'High CPU kernel usage', 'thresholds': ['cpu_system'], 'thresholds_min': 2},
|
||||
{'msg': 'High CPU I/O waiting', 'thresholds': ['cpu_iowait'], 'thresholds_min': 2},
|
||||
{
|
||||
'msg': 'Large CPU stolen time. System running the hypervisor is too busy.',
|
||||
'thresholds': ['cpu_steal'],
|
||||
'thresholds_min': 2,
|
||||
},
|
||||
{'msg': 'High CPU niced value', 'thresholds': ['cpu_niced'], 'thresholds_min': 2},
|
||||
{'msg': 'System overloaded in the last 5 minutes', 'thresholds': ['load'], 'thresholds_min': 2},
|
||||
{'msg': 'High swap (paging) usage', 'thresholds': ['memswap'], 'thresholds_min': 2},
|
||||
{'msg': 'High memory consumption', 'thresholds': ['mem'], 'thresholds_min': 2},
|
||||
]
|
||||
|
||||
# TODO: change the algo to use the following decision tree
|
||||
# Source: Inspired by https://scoutapm.com/blog/slow_server_flow_chart
|
||||
# _yes means threshold >= 2
|
||||
# _no means threshold < 2
|
||||
# With threshold:
|
||||
# - 0: OK
|
||||
# - 1: CAREFUL
|
||||
# - 2: WARNING
|
||||
# - 3: CRITICAL
|
||||
tree_new = {
|
||||
'cpu_iowait': {
|
||||
'_yes': {
|
||||
'memswap': {
|
||||
'_yes': {
|
||||
'mem': {
|
||||
'_yes': {
|
||||
# Once you've identified the offenders, the resolution will again
|
||||
# depend on whether their memory usage seems business-as-usual or not.
|
||||
# For example, a memory leak can be satisfactorily addressed by a one-time
|
||||
# or periodic restart of the process.
|
||||
# - if memory usage seems anomalous: kill the offending processes.
|
||||
# - if memory usage seems business-as-usual: add RAM to the server,
|
||||
# or split high-memory using services to other servers.
|
||||
'_msg': "Memory issue"
|
||||
},
|
||||
'_no': {
|
||||
# ???
|
||||
'_msg': "Swap issue"
|
||||
},
|
||||
}
|
||||
},
|
||||
'_no': {
|
||||
# Low swap means you have a "real" IO wait problem. The next step is to see what's hogging your IO.
|
||||
# iotop is an awesome tool for identifying io offenders. Two things to note:
|
||||
# unless you've already installed iotop, it's probably not already on your system.
|
||||
# Recommendation: install it before you need it - - it's no fun trying to install a troubleshooting
|
||||
# tool on an overloaded machine (iotop requires a Linux of 2.62 or above)
|
||||
'_msg': "I/O issue"
|
||||
},
|
||||
}
|
||||
},
|
||||
'_no': {
|
||||
'cpu_total': {
|
||||
'_yes': {
|
||||
'cpu_user': {
|
||||
'_yes': {
|
||||
# We expect the user-time percentage to be high.
|
||||
# There's most likely a program or service you've configured on your server that's
|
||||
# hogging CPU.
|
||||
# Checking the % user time just confirms this. When you see that the % user-time is high,
|
||||
# it's time to see what executable is monopolizing the CPU
|
||||
# Once you've confirmed that the % usertime is high, check the process list(also provided
|
||||
# by top).
|
||||
# By default, top sorts the process list by % CPU, so you can just look at the top process
|
||||
# or processes.
|
||||
# If there's a single process hogging the CPU in a way that seems abnormal, it's an
|
||||
# anomalous situation
|
||||
# that a service restart can fix. If there are multiple processes taking up CPU
|
||||
# resources, or if
|
||||
# there's one process that takes lots of resources while otherwise functioning normally,
|
||||
# then your setup
|
||||
# may just be underpowered. You'll need to upgrade your server(add more cores),
|
||||
# or split services out onto
|
||||
# other boxes. In either case, you have a resolution:
|
||||
# - if situation seems anomalous: kill the offending processes.
|
||||
# - if situation seems typical given history: upgrade server or add more servers.
|
||||
'_msg': "CPU issue with user process(es)"
|
||||
},
|
||||
'_no': {
|
||||
'cpu_steal': {
|
||||
'_yes': {
|
||||
'_msg': "CPU issue with stolen time. System running the hypervisor may be too busy."
|
||||
},
|
||||
'_no': {'_msg': "CPU issue with system process(es)"},
|
||||
}
|
||||
},
|
||||
}
|
||||
},
|
||||
'_no': {
|
||||
'_yes': {
|
||||
# ???
|
||||
'_msg': "Memory issue"
|
||||
},
|
||||
'_no': {
|
||||
# Your slowness isn't due to CPU or IO problems, so it's likely an application-specific issue.
|
||||
# It's also possible that the slowness is being caused by another server in your cluster, or
|
||||
# by an external service you rely on.
|
||||
# start by checking important applications for uncharacteristic slowness(the DB is a good place
|
||||
# to start), think through which parts of your infrastructure could be slowed down externally.
|
||||
# For example, do you use an externally hosted email service that could slow down critical
|
||||
# parts of your application ?
|
||||
# If you suspect another server in your cluster, strace and lsof can provide information on
|
||||
# what the process is doing or waiting on. Strace will show you which file descriptors are
|
||||
# being read or written to (or being attempted to be read from) and lsof can give you a
|
||||
# mapping of those file descriptors to network connections.
|
||||
'_msg': "External issue"
|
||||
},
|
||||
},
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
def build_global_message():
    """Evaluate the static decision tree and return the global alert message.

    Each entry of the module-level ``tree`` gets a ``weight`` equal to the sum
    of the current threshold values of the stats listed in its ``thresholds``
    key (stats that are absent from the current thresholds are skipped).
    The entry with the highest weight wins, provided that its weight reaches
    its own ``thresholds_min``; otherwise the message of the first entry of
    the tree is returned.

    Note: the computed weight is stored in place in each ``tree`` entry.
    """
    thresholds_now = glances_thresholds.get()

    # Weigh every entry of the tree against the current thresholds
    for entry in tree:
        levels = [thresholds_now[stat].value() for stat in entry['thresholds'] if stat in thresholds_now]
        entry['weight'] = sum(levels)

    # max() keeps the first entry in case of a tie
    heaviest = max(tree, key=lambda entry: entry['weight'])

    # The winner is only reported when it reaches its minimal threshold sum
    if heaviest['weight'] >= heaviest['thresholds_min']:
        return heaviest['msg']
    return tree[0]['msg']
|
||||
|
||||
|
||||
class GlancesEvents:
    """Manage the events (alerts) raised inside the Glances software.

    Events are stored in ``self.events_list``; new events are inserted at the
    beginning of the list.

        event_state = "OK|CAREFUL|WARNING|CRITICAL"
        event_type = "CPU*|LOAD|MEM|MON"
        event_value = value

    An item (event) is a dict defined by:

        {
            "begin": begin time (epoch),
            "end": end time (epoch), -1 while the event is ongoing,
            "state": "WARNING|CRITICAL",
            "type": "CPU|LOAD|MEM",
            "max": MAX,
            "avg": AVG,
            "min": MIN,
            "sum": SUM,
            "count": COUNT,
            "top": [top 3 process name],
            "desc": "Processes description",
            "sort": "top sort key",
            "global": "global alert message"
        }
    """

    def __init__(self, max_events=10, min_duration=6, min_interval=6):
        """Init the events class.

        max_events: maximum size of the events list
        min_duration: events duration should be >= min_duration to be kept (in seconds)
        min_interval: minimal interval between same kind of alert (in seconds)
        """
        # Maximum size of the events list
        self.set_max_events(max_events)

        # Minimal event duration time (in seconds)
        self.set_min_duration(min_duration)

        # Minimal interval between same kind of alert (in seconds)
        self.set_min_interval(min_interval)

        # Init the events list
        self.events_list = []

    def set_max_events(self, max_events):
        """Set the maximum size of the events list."""
        self.max_events = max_events

    def set_min_duration(self, min_duration):
        """Set the minimal event duration time (in seconds)."""
        self.min_duration = min_duration

    def set_min_interval(self, min_interval):
        """Set the minimum interval between same kind of alert (in seconds)."""
        self.min_interval = min_interval

    def get(self):
        """Return the raw events list (the internal list itself, not a copy)."""
        return self.events_list

    def len(self):
        """Return the number of events in the events list."""
        return len(self.events_list)

    def __event_exist(self, event_time, event_type):
        """Return the position of the first matching event in the events list.

        An event matches if:
            its type is event_type
            and (it is ongoing (end < 0) or event_time - end < min_interval)

        Return -1 if no event matches.
        """
        for index, event in enumerate(self.events_list):
            still_relevant = event['end'] < 0 or event_time - event['end'] < self.min_interval
            if still_relevant and event['type'] == event_type:
                return index
        return -1

    def get_event_sort_key(self, event_type):
        """Return the process sort key matching the given event type."""
        if event_type.startswith("MEM"):
            # Sort TOP process by memory_percent
            return 'memory_percent'
        if event_type.startswith("CPU_IOWAIT"):
            # Sort TOP process by io_counters (only for Linux OS)
            return 'io_counters'
        # Default sort key
        return 'cpu_percent'

    def set_process_sort(self, event_type):
        """Define the process auto sort key from the alert type."""
        if glances_processes.auto_sort:
            glances_processes.set_sort_key(self.get_event_sort_key(event_type))

    def reset_process_sort(self):
        """Reset the process auto sort key."""
        if glances_processes.auto_sort:
            glances_processes.set_sort_key('auto')

    def add(self, event_state, event_type, event_value, proc_list=None, proc_desc="", min_duration=None):
        """Add a new event to the list or update the matching existing one.

        event_state = "OK|CAREFUL|WARNING|CRITICAL"
        event_type = "CPU|LOAD|MEM|..."
        event_value = value
        proc_list = list of processes (default: the current Glances processes list)
        proc_desc = processes description
        min_duration = not used by this method (the instance-level
                       min_duration is the one applied when an event is closed)

        If 'event' is a 'new one', add it at the beginning of the list.
        If 'event' is not a 'new one', update the existing item.
        When an event ends with a duration < min_duration, it is dropped.

        Return the number of events in the list.
        """
        # Time is truncated to the second (epoch format)
        event_time = time.mktime(datetime.now().timetuple())
        global_message = build_global_message()
        proc_list = proc_list or glances_processes.get_list()

        # Add or update the event
        event_index = self.__event_exist(event_time, event_type)
        if event_index < 0:
            # Event did not exist, add it
            self._create_event(event_time, event_state, event_type, event_value, proc_desc, global_message)
        else:
            # Event exists, update it
            self._update_event(
                event_time,
                event_index,
                event_state,
                event_type,
                event_value,
                proc_list,
                proc_desc,
                global_message,
            )

        return self.len()

    def _create_event(self, event_time, event_state, event_type, event_value, proc_desc, global_message):
        """Add a new item at the beginning of the events list.

        The item is added only if the criticality (event_state) is WARNING or CRITICAL.

        Return True if the item was added, False otherwise.
        """
        if event_state not in ("WARNING", "CRITICAL"):
            return False

        # Define the automatic process sort key
        self.set_process_sort(event_type)

        # Create the new item
        # Time is stored in Epoch format
        # Epoch -> DMYHMS = datetime.fromtimestamp(epoch)
        item = {
            "begin": event_time,
            "end": -1,
            "state": event_state,
            "type": event_type,
            "max": event_value,
            "avg": event_value,
            "min": event_value,
            "sum": event_value,
            "count": 1,
            "top": [],
            "desc": proc_desc,
            "sort": glances_processes.sort_key,
            "global": global_message,
        }

        # Add the item to the list (most recent first)
        self.events_list.insert(0, item)

        # Limit the list to 'max_events' items (drop the last one)
        if self.len() > self.max_events:
            self.events_list.pop()

        return True

    def _update_event(
        self, event_time, event_index, event_state, event_type, event_value, proc_list, proc_desc, global_message
    ):
        """Update the event stored at event_index in the events list.

        - OK/CAREFUL state on an ongoing event: the event is closed.
        - OK/CAREFUL state on an already closed event: nothing to do.
        - WARNING/CRITICAL state: the event is (re)opened and its statistics
          are updated.
        """
        event = self.events_list[event_index]

        if event_state in ('OK', 'CAREFUL') and event['end'] < 0:
            # Close the event
            self._close_event(event_time, event_index)
        elif event_state in ('OK', 'CAREFUL') and event['end'] >= 0:
            # Event is already closed, do nothing
            pass
        else:  # event_state == "WARNING" or event_state == "CRITICAL"
            # Set process sort key
            self.set_process_sort(event_type)

            # It's an ongoing event, set the end time to -1
            event['end'] = -1

            # Min/Max/Sum/Count/Average value
            event['min'] = min(event['min'], event_value)
            event['max'] = max(event['max'], event_value)
            event['sum'] += event_value
            event['count'] += 1
            event['avg'] = event['sum'] / event['count']

            if event_state == "CRITICAL":
                # Avoid to change from CRITICAL to WARNING
                # If an event has reached the CRITICAL state, it can't go back to WARNING
                event['state'] = event_state

                # TOP PROCESS LIST (only for CRITICAL ALERT)
                # NOTE(review): nesting of this section under the CRITICAL
                # test follows the comment above; confirm against the
                # original indentation.
                events_sort_key = self.get_event_sort_key(event_type)

                # Sort the current process list to retrieve the TOP 3 processes
                event['top'] = [p['name'] for p in sort_stats(proc_list, events_sort_key)[0:3]]
                event['sort'] = events_sort_key

            # MONITORED PROCESSES DESC
            event['desc'] = proc_desc

            # Global message:
            event['global'] = global_message

        # NOTE(review): returned at function level (all branches); confirm
        # against the original indentation.
        return True

    def _close_event(self, event_time, event_index):
        """Close the event stored at event_index in the events list.

        The event is kept (with its end time set) only if it lasted at least
        min_duration seconds; otherwise it is removed from the list.
        """
        # Reset the automatic process sort key
        self.reset_process_sort()

        # Set the end of the event
        if event_time - self.events_list[event_index]['begin'] >= self.min_duration:
            # If event is >= min_duration seconds
            self.events_list[event_index]['end'] = event_time
        else:
            # If event < min_duration seconds, ignore it.
            # Delete by position: no equality search through the list and no
            # risk of dropping an equal item stored at another index.
            del self.events_list[event_index]

    def clean(self, critical=False):
        """Clean the events list by deleting finished items.

        By default, only delete finished WARNING events.
        If critical = True, also delete finished CRITICAL events.

        The list is updated in place, so a reference previously obtained
        through get() sees the cleaned content.

        Return the number of remaining events.
        """
        # Keep ongoing events and, unless asked otherwise, the CRITICAL ones
        self.events_list[:] = [
            item
            for item in self.events_list
            if item['end'] < 0 or (not critical and item['state'].startswith("CRITICAL"))
        ]
        return self.len()
|
||||
|
||||
|
||||
glances_events = GlancesEvents()
|
||||
|
|
@ -14,7 +14,7 @@ import sys
|
|||
|
||||
from glances.globals import MACOS, WINDOWS, nativestr, u, itervalues, enable, disable
|
||||
from glances.logger import logger
|
||||
from glances.events import glances_events
|
||||
from glances.events_list import glances_events
|
||||
from glances.processes import glances_processes, sort_processes_key_list
|
||||
from glances.outputs.glances_unicode import unicode_message
|
||||
from glances.timer import Timer
|
||||
|
|
|
|||
|
|
@ -53,7 +53,6 @@ export default {
|
|||
alert.avg = alertalertStats.avg;
|
||||
alert.max = alertalertStats.max;
|
||||
alert.top = alertalertStats.top.join(', ');
|
||||
alert.global = alertalertStats.global;
|
||||
|
||||
if (!alert.ongoing) {
|
||||
const duration = alert.end - alert.begin;
|
||||
|
|
|
|||
File diff suppressed because one or more lines are too long
|
|
@ -13,7 +13,7 @@ from datetime import datetime
|
|||
from time import tzname
|
||||
import pytz
|
||||
|
||||
from glances.events import glances_events
|
||||
from glances.events_list import glances_events
|
||||
|
||||
# from glances.logger import logger
|
||||
from glances.plugins.plugin.model import GlancesPluginModel
|
||||
|
|
@ -88,7 +88,7 @@ fields_description = {
|
|||
'description': 'Sort key of the top processes',
|
||||
'unit': 'string',
|
||||
},
|
||||
'global': {
|
||||
'global_msg': {
|
||||
'description': 'Global alert message',
|
||||
'unit': 'string',
|
||||
}
|
||||
|
|
@ -137,10 +137,11 @@ class PluginModel(GlancesPluginModel):
|
|||
|
||||
# Build the string message
|
||||
# Header with the global message
|
||||
if len(self.stats) > 0 and self.stats[0]['end'] < 0 and 'global' in self.stats[0]:
|
||||
ret.append(self.curse_add_line(self.stats[0]['global'], "TITLE"))
|
||||
global_message = [e['global_msg'] for e in self.stats if (e['end'] == -1 and 'global_msg' in e)]
|
||||
if len(global_message) > 0:
|
||||
ret.append(self.curse_add_line(global_message[0], "TITLE"))
|
||||
else:
|
||||
ret.append(self.curse_add_line("ALERTS", "TITLE"))
|
||||
ret.append(self.curse_add_line("EVENTS history", "TITLE"))
|
||||
# Loop over alerts
|
||||
for alert in self.stats:
|
||||
# New line
|
||||
|
|
|
|||
|
|
@ -20,7 +20,7 @@ from glances.globals import iterkeys, itervalues, listkeys, mean, nativestr, jso
|
|||
from glances.actions import GlancesActions
|
||||
from glances.history import GlancesHistory
|
||||
from glances.logger import logger
|
||||
from glances.events import glances_events
|
||||
from glances.events_list import glances_events
|
||||
from glances.thresholds import glances_thresholds
|
||||
from glances.timer import Counter, Timer, getTimeSinceLastUpdate
|
||||
from glances.outputs.glances_unicode import unicode_message
|
||||
|
|
|
|||
|
|
@ -3,3 +3,4 @@ defusedxml
|
|||
packaging
|
||||
ujson>=5.4.0
|
||||
pytz
|
||||
pydantic
|
||||
|
|
|
|||
|
|
@ -32,7 +32,7 @@ from glances.thresholds import GlancesThresholds
|
|||
from glances.plugins.plugin.model import GlancesPluginModel
|
||||
from glances.programs import processes_to_programs
|
||||
from glances.secure import secure_popen
|
||||
from glances.events import GlancesEvents
|
||||
from glances.events_list import GlancesEventsList
|
||||
|
||||
# Global variables
|
||||
# =================
|
||||
|
|
@ -298,7 +298,7 @@ class TestGlances(unittest.TestCase):
|
|||
"""Test events class"""
|
||||
print('INFO: [TEST_019] Test events')
|
||||
# Init events
|
||||
events = GlancesEvents(max_events=5, min_duration=1, min_interval=3)
|
||||
events = GlancesEventsList(max_events=5, min_duration=1, min_interval=3)
|
||||
# Minimal event duration not reached
|
||||
events.add('WARNING', 'LOAD', 4)
|
||||
events.add('CRITICAL', 'LOAD', 5)
|
||||
|
|
|
|||
Loading…
Reference in New Issue