diff --git a/README.md b/README.md index eda8dea4..ebc95b8f 100644 --- a/README.md +++ b/README.md @@ -337,6 +337,16 @@ You can extend the built-in error detection with custom regex patterns. Create a "event_category": "SW_DRIVER", "event_priority": 4 } + ], + "priority_override_rules": [ + { + "message": "Application Crash", + "new_priority": "ERROR" + }, + { + "event_category": "SW_DRIVER", + "new_priority": "WARNING" + } ] } } diff --git a/nodescraper/plugins/inband/dmesg/analyzer_args.py b/nodescraper/plugins/inband/dmesg/analyzer_args.py index cd9ba765..b68aec27 100644 --- a/nodescraper/plugins/inband/dmesg/analyzer_args.py +++ b/nodescraper/plugins/inband/dmesg/analyzer_args.py @@ -52,3 +52,13 @@ class DmesgAnalyzerArgs(TimeRangeAnalysisArgs): default=None, description="Custom error regex patterns; each item can be ErrorRegex or dict with category/pattern.", ) + priority_override_rules: Optional[list[dict]] = Field( + default=None, + description=( + "Rules to override the priority of matched ErrorRegex objects. " + "Each rule is a dict where all keys except 'new_priority' and 'match_all' " + "are filter fields matched against ErrorRegex attributes. " + "'new_priority' must be an EventPriority name (e.g. 'WARNING', 'ERROR') " + "or 'NO_CHANGE' to leave the priority unchanged." 
def update_error_regex_priorities(
    self,
    error_regexes: list[ErrorRegex],
    priority_override_rules: list[dict],
) -> list[ErrorRegex]:
    """Apply priority override rules to a list of ErrorRegex objects.

    Each ErrorRegex is copied with ``event_priority`` set to the value that
    ``_resolve_priority`` returns for it; the input objects are not mutated.

    Args:
        error_regexes (list[ErrorRegex]): ErrorRegex objects to re-prioritize.
        priority_override_rules (list[dict]): Ordered override rules (see
            ``_resolve_priority`` for the rule format). May be None or empty,
            in which case ``error_regexes`` is returned as-is.

    Returns:
        list[ErrorRegex]: Copies of the input objects with their priorities
        updated, or the input list itself when there are no rules.
    """
    # NOTE(review): analyze_data passes args.priority_override_rules, which is
    # declared Optional; consider widening this annotation to Optional[list[dict]].
    if not priority_override_rules:
        return error_regexes

    return [
        regex_obj.model_copy(
            update={
                "event_priority": self._resolve_priority(regex_obj, priority_override_rules)
            }
        )
        for regex_obj in error_regexes
    ]


def _resolve_priority(
    self,
    regex_obj: ErrorRegex,
    priority_override_rules: list[dict],
) -> EventPriority:
    """Determine the new priority of an ErrorRegex based on the provided rules.

    Rules are walked in order and the first matching rule wins. Each rule is a
    dict with the following keys:
      1. Filter fields: any attribute of an ErrorRegex object. Currently this
         includes "regex", "message", "event_category" and "event_priority".
         The value is a single value or a list (matches if any value in the
         list matches). All filter fields of a rule must match (AND logic).
         Enum attributes are compared by member name and "regex" is compared
         by its pattern string. A filter naming an attribute the object does
         not have never matches. A rule with no filter fields matches every
         object.
      2. "new_priority": the name of any EventPriority member (or a member
         itself), or "NO_CHANGE" to keep the current priority. Defaults to
         "NO_CHANGE" when omitted.
      3. "match_all": bool. When truthy the rule matches any regex_obj and
         all filter fields in the rule are ignored.

    Example rule format:
        {
            "message": ["mode 1 reset failed", "mode 2 reset failed"],
            "new_priority": "NO_CHANGE"
        }
        {
            "event_category": "RAS",
            "new_priority": "WARNING"
        }

    Args:
        regex_obj (ErrorRegex): The ErrorRegex object whose priority is resolved.
        priority_override_rules (list[dict]): Ordered rules that decide the
            updated priority.

    Returns:
        EventPriority: The priority from the first matching rule. The original
        priority is returned if no rule matches or the matched rule specifies
        NO_CHANGE.

    Raises:
        KeyError: If the matched rule's "new_priority" is neither "NO_CHANGE"
            nor the name of an EventPriority member.
    """

    _NO_CHANGE = "NO_CHANGE"
    _EXCLUDED_KEYS = {"new_priority", "match_all"}
    # Sentinel so a missing attribute is not confused with an attribute set to None
    _MISSING = object()

    def _normalize(value):
        """Reduce a value to the plain form used for comparison."""
        # Compiled patterns never compare equal to a string; use the pattern source
        if isinstance(value, re.Pattern):
            return value.pattern
        # Normalize enum values to their name for string comparison
        if hasattr(value, "name"):
            return value.name
        return value

    def _rule_matches(rule: dict) -> bool:
        """Return True when every filter field of the rule matches regex_obj."""
        # Truthiness (not `is False`) so that null/0 do not silently enable match_all
        if rule.get("match_all", False):
            return True

        for field, filter_value in rule.items():
            if field in _EXCLUDED_KEYS:
                continue

            obj_value = getattr(regex_obj, field, _MISSING)
            if obj_value is _MISSING:
                return False
            obj_value = _normalize(obj_value)

            if isinstance(filter_value, (list, tuple, set, frozenset)):
                if obj_value not in [_normalize(item) for item in filter_value]:
                    return False
            elif obj_value != _normalize(filter_value):
                return False
        return True

    current_priority = regex_obj.event_priority

    for rule in priority_override_rules:
        if not _rule_matches(rule):
            continue

        # return on encountering the first fully matched rule
        new_priority = rule.get("new_priority", _NO_CHANGE)
        if new_priority == _NO_CHANGE:
            return current_priority
        if isinstance(new_priority, EventPriority):
            return new_priority
        try:
            return EventPriority[new_priority]
        except KeyError:
            # Same exception type as the plain enum lookup, but say which rule is wrong
            raise KeyError(
                f"Invalid new_priority {new_priority!r} in priority override rule {rule!r}; "
                f"expected an EventPriority name or {_NO_CHANGE!r}"
            ) from None

    return current_priority  # if no rules are matched, keep the current priority
def _gpu_reset_regex():
    """Build the RAS-category ErrorRegex shared by the priority-resolution tests."""
    return ErrorRegex(
        regex=re.compile(r"GPU reset failed"),
        message="GPU reset failed",
        event_category=EventCategory.RAS,
    )


def test_resolve_priority_no_match(system_info):
    """A rule targeting another category leaves the original priority in place."""
    override_rules = [{"event_category": "SW_DRIVER", "new_priority": "WARNING"}]
    dmesg_analyzer = DmesgAnalyzer(system_info=system_info)
    resolved = dmesg_analyzer._resolve_priority(_gpu_reset_regex(), override_rules)
    assert resolved == EventPriority.ERROR


def test_resolve_priority_match_by_category(system_info):
    """A rule filtering on the object's event_category applies its new priority."""
    override_rules = [{"event_category": "RAS", "new_priority": "WARNING"}]
    dmesg_analyzer = DmesgAnalyzer(system_info=system_info)
    resolved = dmesg_analyzer._resolve_priority(_gpu_reset_regex(), override_rules)
    assert resolved == EventPriority.WARNING


def test_resolve_priority_match_by_message_list(system_info):
    """A list-valued message filter matches when the object's message is one of the items."""
    dmesg_analyzer = DmesgAnalyzer(system_info=system_info)
    mode2_regex = ErrorRegex(
        regex=re.compile(r"Mode2 reset failed"),
        message="Mode 2 Reset Failed",
        event_category=EventCategory.RAS,
    )
    message_rule = {
        "message": ["Mode 2 Reset Failed", "GPU reset failed"],
        "new_priority": "WARNING",
    }
    resolved = dmesg_analyzer._resolve_priority(mode2_regex, [message_rule])
    assert resolved == EventPriority.WARNING


def test_resolve_priority_no_change(system_info):
    """A matching rule with new_priority=NO_CHANGE returns the original priority."""
    override_rules = [{"event_category": "RAS", "new_priority": "NO_CHANGE"}]
    dmesg_analyzer = DmesgAnalyzer(system_info=system_info)
    resolved = dmesg_analyzer._resolve_priority(_gpu_reset_regex(), override_rules)
    assert resolved == EventPriority.ERROR


def test_resolve_priority_first_match_wins(system_info):
    """Only the first matching rule is applied; later matching rules are ignored."""
    dmesg_analyzer = DmesgAnalyzer(system_info=system_info)
    first_rule = {"event_category": "RAS", "new_priority": "WARNING"}
    second_rule = {"event_category": "RAS", "new_priority": "ERROR"}
    resolved = dmesg_analyzer._resolve_priority(_gpu_reset_regex(), [first_rule, second_rule])
    assert resolved == EventPriority.WARNING


def test_resolve_priority_multiple_filter_fields(system_info):
    """Every filter field in a rule has to match (AND logic)."""
    dmesg_analyzer = DmesgAnalyzer(system_info=system_info)
    target = _gpu_reset_regex()

    # Category and message both agree with the object, so the rule applies
    both_match = {"event_category": "RAS", "message": "GPU reset failed", "new_priority": "WARNING"}
    assert dmesg_analyzer._resolve_priority(target, [both_match]) == EventPriority.WARNING

    # Message differs, so the rule is skipped and the original priority is kept
    message_differs = {"event_category": "RAS", "message": "ACA Error", "new_priority": "WARNING"}
    assert dmesg_analyzer._resolve_priority(target, [message_differs]) == EventPriority.ERROR


def test_resolve_priority_match_all_matches_any_regex(system_info):
    """A match_all=True rule without filters applies to every ErrorRegex."""
    analyzer = DmesgAnalyzer(system_info=system_info)
    candidates = [
        _gpu_reset_regex(),
        ErrorRegex(
            regex=re.compile(r"IO_PAGE_FAULT"),
            message="I/O Page Fault",
            event_category=EventCategory.SW_DRIVER,
        ),
    ]
    catch_all_rules = [{"match_all": True, "new_priority": "WARNING"}]
    for regex_obj in candidates:
        result = analyzer._resolve_priority(regex_obj, catch_all_rules)
        assert (
            result == EventPriority.WARNING
        ), f"Expected WARNING for {regex_obj.message}, got {result}"


def test_resolve_priority_match_all_ignores_non_matching_filters(system_info):
    """match_all=True applies the rule even when its filter fields would not match."""
    dmesg_analyzer = DmesgAnalyzer(system_info=system_info)
    # The object is RAS while the filter asks for SW_DRIVER; match_all bypasses the filter.
    bypass_rule = {"match_all": True, "event_category": "SW_DRIVER", "new_priority": "WARNING"}
    resolved = dmesg_analyzer._resolve_priority(_gpu_reset_regex(), [bypass_rule])
    assert resolved == EventPriority.WARNING


def test_resolve_priority_match_all_false_still_filters(system_info):
    """An explicit match_all=False leaves the normal filter logic in effect."""
    dmesg_analyzer = DmesgAnalyzer(system_info=system_info)
    target = _gpu_reset_regex()

    # Non-matching filter: original priority is returned
    non_matching = {"match_all": False, "event_category": "SW_DRIVER", "new_priority": "WARNING"}
    assert dmesg_analyzer._resolve_priority(target, [non_matching]) == EventPriority.ERROR

    # Matching filter: the rule's priority is returned
    matching = {"match_all": False, "event_category": "RAS", "new_priority": "WARNING"}
    assert dmesg_analyzer._resolve_priority(target, [matching]) == EventPriority.WARNING


def test_priority_override_rules_in_analyze_data(system_info):
    """Rules supplied through DmesgAnalyzerArgs change the priority of matching events."""
    # First line: RAS event, default ERROR, expected to become WARNING.
    # Second line: SW_DRIVER event, default ERROR, expected to stay ERROR (no rule for it).
    ras_line = (
        "kern :err : 2024-10-07T10:17:15,145363-04:00 "
        "amdgpu 0000:0c:00.0: amdgpu: socket: 4 1 correctable hardware errors detected in total in gfx block\n"
    )
    sw_driver_line = "kern :err : 2024-10-07T10:17:15,145363-04:00 IO_PAGE_FAULT\n"
    dmesg_data = DmesgData(dmesg_content=ras_line + sw_driver_line)

    analyzer_args = DmesgAnalyzerArgs(
        check_unknown_dmesg_errors=False,
        priority_override_rules=[
            {"event_category": "RAS", "new_priority": "WARNING"},
        ],
    )
    res = DmesgAnalyzer(system_info=system_info).analyze_data(dmesg_data, args=analyzer_args)

    assert res.status == ExecutionStatus.ERROR

    ras_events = [e for e in res.events if e.category == "RAS"]
    sw_events = [e for e in res.events if e.category == "SW_DRIVER"]

    # NOTE(review): all() is vacuously true for an empty list, so these checks
    # pass even if no event of the given category was produced.
    assert all(
        e.priority == EventPriority.WARNING for e in ras_events
    ), f"Expected all RAS events to be WARNING, got {[e.priority for e in ras_events]}"
    assert all(
        e.priority == EventPriority.ERROR for e in sw_events
    ), f"Expected SW_DRIVER events to remain ERROR, got {[e.priority for e in sw_events]}"


def test_priority_override_no_change_keeps_original(system_info):
    """A NO_CHANGE rule leaves the event's original priority untouched."""
    dmesg_data = DmesgData(
        dmesg_content=(
            "kern :err : 2024-10-07T10:17:15,145363-04:00 "
            "amdgpu 0000:0c:00.0: amdgpu: socket: 4 1 correctable hardware errors detected in total in gfx block\n"
        )
    )
    analyzer_args = DmesgAnalyzerArgs(
        check_unknown_dmesg_errors=False,
        priority_override_rules=[
            {"event_category": "RAS", "new_priority": "NO_CHANGE"},
        ],
    )
    res = DmesgAnalyzer(system_info=system_info).analyze_data(dmesg_data, args=analyzer_args)

    assert len(res.events) == 1
    assert res.events[0].priority == EventPriority.ERROR
def test_priority_override_updates_unkown_dmesg_error(system_info):
    """An override rule also applies to the 'Unknown dmesg error' regex.

    That regex is built separately from the base ErrorRegex list, inside the
    check_unknown_dmesg_errors branch of analyze_data.
    """
    unknown_error_line = "kern :err : 2024-10-07T10:17:15,145363-04:00 UNKOWN DMESG ERROR"
    analyzer_args = DmesgAnalyzerArgs(
        check_unknown_dmesg_errors=True,
        priority_override_rules=[
            {"message": "Unknown dmesg error", "new_priority": "ERROR"},
        ],
    )

    res = DmesgAnalyzer(system_info=system_info).analyze_data(
        DmesgData(dmesg_content=unknown_error_line),
        args=analyzer_args,
    )

    assert len(res.events) == 1
    assert res.events[0].priority == EventPriority.ERROR