Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 20 additions & 14 deletions nodescraper/cli/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -480,21 +480,27 @@ def main(arg_input: Optional[list[str]] = None):
dump_results_to_csv(results, sname, log_path, timestamp, logger)

if parsed_args.reference_config:
ref_config = generate_reference_config(results, plugin_reg, logger)
if log_path:
path = os.path.join(log_path, "reference_config.json")
if any(result.status > ExecutionStatus.WARNING for result in results):
logger.warning("Skipping reference config write because one or more plugins failed")
else:
path = os.path.join(os.getcwd(), "reference_config.json")
try:
with open(path, "w") as f:
json.dump(
ref_config.model_dump(mode="json", exclude_none=True),
f,
indent=2,
)
logger.info("Reference config written to: %s", path)
except Exception as exp:
logger.error(exp)
merged_plugin_config = PluginExecutor.merge_configs(plugin_config_inst_list)
ref_config = generate_reference_config(
results, plugin_reg, logger, run_plugin_config=merged_plugin_config
)
if log_path:
path = os.path.join(log_path, "reference_config.json")
else:
path = os.path.join(os.getcwd(), "reference_config.json")
try:
with open(path, "w") as f:
json.dump(
ref_config.model_dump(mode="json", exclude_none=True),
f,
indent=2,
)
logger.info("Reference config written to: %s", path)
except Exception as exp:
logger.error(exp)

if any(result.status > ExecutionStatus.WARNING for result in results):
sys.exit(1)
Expand Down
23 changes: 17 additions & 6 deletions nodescraper/cli/helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -316,20 +316,27 @@ def extract_analyzer_args_from_model(


def generate_reference_config(
results: list[PluginResult], plugin_reg: PluginRegistry, logger: logging.Logger
results: list[PluginResult],
plugin_reg: PluginRegistry,
logger: logging.Logger,
run_plugin_config: Optional[PluginConfig] = None,
) -> PluginConfig:
"""Generate reference config from plugin results
"""Generate reference config from plugin results.

Args:
results (list[PluginResult]): list of plugin results
plugin_reg (PluginRegistry): registry containing all registered plugins
logger (logging.Logger): logger
results: List of plugin results from the run.
plugin_reg: Registry containing all registered plugins.
logger: Logger instance.
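run_plugin_config: Optional merged plugin config used for the run; when given,
the collection_args it holds for a plugin are copied into that plugin's
entry in the reference config.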

Returns:
PluginConfig: holds model that defines final reference config
PluginConfig: Reference config with plugins dict containing
collection_args and analysis_args for each successful plugin.
"""
plugin_config = PluginConfig()
plugins = {}
run_plugins = (run_plugin_config.plugins if run_plugin_config else {}) or {}

for obj in results:
if obj.result_data.collection_result.status != ExecutionStatus.OK:
logger.warning(
Expand All @@ -349,6 +356,10 @@ def generate_reference_config(
if obj.source not in plugins:
plugins[obj.source] = {}

run_args = run_plugins.get(obj.source) or {}
if run_args.get("collection_args"):
plugins[obj.source]["collection_args"] = dict(run_args["collection_args"])

a_args = extract_analyzer_args_from_model(plugin, data_model, logger)
if a_args:
plugins[obj.source]["analysis_args"] = a_args.model_dump(exclude_none=True)
Expand Down
32 changes: 32 additions & 0 deletions nodescraper/plugins/inband/rocm/collector_args.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
###############################################################################
#
# MIT License
#
# Copyright (c) 2026 Advanced Micro Devices, Inc.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
###############################################################################
from nodescraper.models import CollectorArgs


class RocmCollectorArgs(CollectorArgs):
    """Collector arguments for RocmPlugin.

    Lets a plugin config override the ROCm installation root that the ROCm
    collector inspects instead of relying on a hard-coded location.
    """

    # Root directory of the ROCm installation; defaults to "/opt/rocm".
    # RocmCollector substitutes this value into its command templates
    # (e.g. "{rocm_path}/bin/rocminfo") and version-file paths.
    # NOTE(review): the value appears to be interpolated unquoted into shell
    # command strings - confirm paths with spaces/metacharacters are handled.
    rocm_path: str = "/opt/rocm"
51 changes: 30 additions & 21 deletions nodescraper/plugins/inband/rocm/rocm_collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,40 +31,48 @@
from nodescraper.models import TaskResult
from nodescraper.utils import strip_ansi_codes

from .collector_args import RocmCollectorArgs
from .rocmdata import RocmDataModel


class RocmCollector(InBandDataCollector[RocmDataModel, None]):
class RocmCollector(InBandDataCollector[RocmDataModel, RocmCollectorArgs]):
"""Collect ROCm version data"""

SUPPORTED_OS_FAMILY: set[OSFamily] = {OSFamily.LINUX}

DATA_MODEL = RocmDataModel
CMD_VERSION_PATHS = [
"/opt/rocm/.info/version-rocm",
"/opt/rocm/.info/version",
]
CMD_ROCM_SUB_VERSIONS = "grep . -r /opt/rocm/.info/*"
CMD_ROCMINFO = "{rocm_path}/bin/rocminfo"
CMD_ROCM_LATEST = "ls -v -d /opt/rocm-[3-7]* | tail -1"
CMD_ROCM_DIRS = "ls -v -d /opt/rocm*"
CMD_ROCM_SUB_VERSIONS_TMPL = "grep . -r {rocm_path}/.info/*"
CMD_ROCMINFO_TMPL = "{rocm_path}/bin/rocminfo"
CMD_ROCM_LATEST_TMPL = "ls -v -d {rocm_path}-[3-7]* | tail -1"
CMD_ROCM_DIRS_TMPL = "ls -v -d {rocm_path}*"
CMD_LD_CONF = "grep -i -E 'rocm' /etc/ld.so.conf.d/*"
CMD_ROCM_LIBS = "ldconfig -p | grep -i -E 'rocm'"
CMD_ENV_VARS = "env | grep -Ei 'rocm|hsa|hip|mpi|openmp|ucx|miopen'"
CMD_CLINFO = "{rocm_path}/opencl/bin/*/clinfo"
CMD_CLINFO_TMPL = "{rocm_path}/opencl/bin/*/clinfo"
CMD_KFD_PROC = "ls /sys/class/kfd/kfd/proc/"

def collect_data(self, args=None) -> tuple[TaskResult, Optional[RocmDataModel]]:
def collect_data(
self, args: Optional[RocmCollectorArgs] = None
) -> tuple[TaskResult, Optional[RocmDataModel]]:
"""Collect ROCm version data from the system.

Returns:
tuple[TaskResult, Optional[RocmDataModel]]: tuple containing the task result and ROCm data model if available.
"""
if args is None:
args = RocmCollectorArgs()
version_paths = [
f"{args.rocm_path}/.info/version-rocm",
f"{args.rocm_path}/.info/version",
]

rocm_data = None
rocm_sub_versions = {}

# First, try to collect all sub-versions
sub_versions_res = self._run_sut_cmd(self.CMD_ROCM_SUB_VERSIONS)
sub_versions_res = self._run_sut_cmd(
self.CMD_ROCM_SUB_VERSIONS_TMPL.format(rocm_path=args.rocm_path)
)
if sub_versions_res.exit_code == 0:
for line in sub_versions_res.stdout.splitlines():
if ":" in line:
Expand All @@ -75,7 +83,7 @@ def collect_data(self, args=None) -> tuple[TaskResult, Optional[RocmDataModel]]:
rocm_sub_versions[key.strip()] = value.strip()

# Determine the main ROCm version
for path in self.CMD_VERSION_PATHS:
for path in version_paths:
res = self._run_sut_cmd(f"grep . {path}")
if res.exit_code == 0:
try:
Expand Down Expand Up @@ -105,32 +113,33 @@ def collect_data(self, args=None) -> tuple[TaskResult, Optional[RocmDataModel]]:
else:
self._log_event(
category=EventCategory.OS,
description=f"Unable to read ROCm version from {self.CMD_VERSION_PATHS}",
description=f"Unable to read ROCm version from {version_paths}",
data={"raw_output": res.stdout},
priority=EventPriority.ERROR,
)

# Collect additional ROCm data if version was found
if rocm_data:
# Collect latest versioned ROCm path (rocm-[3-7]*)
versioned_path_res = self._run_sut_cmd(self.CMD_ROCM_LATEST)
versioned_path_res = self._run_sut_cmd(
self.CMD_ROCM_LATEST_TMPL.format(rocm_path=args.rocm_path)
)
if versioned_path_res.exit_code == 0:
rocm_data.rocm_latest_versioned_path = versioned_path_res.stdout.strip()

# Collect all ROCm paths as list
all_paths_res = self._run_sut_cmd(self.CMD_ROCM_DIRS)
all_paths_res = self._run_sut_cmd(
self.CMD_ROCM_DIRS_TMPL.format(rocm_path=args.rocm_path)
)
if all_paths_res.exit_code == 0:
rocm_data.rocm_all_paths = [
path.strip()
for path in all_paths_res.stdout.strip().split("\n")
if path.strip()
]

# Determine ROCm path for commands that need it
rocm_path = rocm_data.rocm_latest_versioned_path or "/opt/rocm"

# Collect rocminfo output as list of lines with ANSI codes stripped
rocminfo_cmd = self.CMD_ROCMINFO.format(rocm_path=rocm_path)
rocminfo_cmd = self.CMD_ROCMINFO_TMPL.format(rocm_path=args.rocm_path)
rocminfo_res = self._run_sut_cmd(rocminfo_cmd)
rocminfo_artifact_content = ""
if rocminfo_res.exit_code == 0:
Expand Down Expand Up @@ -167,7 +176,7 @@ def collect_data(self, args=None) -> tuple[TaskResult, Optional[RocmDataModel]]:
]

# Collect clinfo output
clinfo_cmd = self.CMD_CLINFO.format(rocm_path=rocm_path)
clinfo_cmd = self.CMD_CLINFO_TMPL.format(rocm_path=args.rocm_path)
clinfo_res = self._run_sut_cmd(clinfo_cmd)

# Always append clinfo section to artifact, even if empty or failed
Expand Down
5 changes: 4 additions & 1 deletion nodescraper/plugins/inband/rocm/rocm_plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,18 +26,21 @@
from nodescraper.base import InBandDataPlugin

from .analyzer_args import RocmAnalyzerArgs
from .collector_args import RocmCollectorArgs
from .rocm_analyzer import RocmAnalyzer
from .rocm_collector import RocmCollector
from .rocmdata import RocmDataModel


class RocmPlugin(InBandDataPlugin[RocmDataModel, None, RocmAnalyzerArgs]):
class RocmPlugin(InBandDataPlugin[RocmDataModel, RocmCollectorArgs, RocmAnalyzerArgs]):
"""Plugin for collection and analysis of rocm version data"""

DATA_MODEL = RocmDataModel

COLLECTOR = RocmCollector

COLLECTOR_ARGS = RocmCollectorArgs

ANALYZER = RocmAnalyzer

ANALYZER_ARGS = RocmAnalyzerArgs
20 changes: 17 additions & 3 deletions test/functional/test_reference_config_workflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,8 @@ def test_gen_reference_config_subset_plugins(run_cli_command, tmp_path):

assert result.returncode in [0, 1, 2]

if result.returncode != 0:
pytest.skip("One or more plugins failed; reference config is not written")
reference_config_path = find_reference_config(log_path)
assert reference_config_path is not None, "reference_config.json was not created"
assert reference_config_path.exists()
Expand All @@ -148,7 +150,8 @@ def test_use_generated_reference_config(run_cli_command, tmp_path):
assert gen_result.returncode in [0, 1, 2]

reference_config_path = find_reference_config(gen_log_path)
assert reference_config_path is not None, "reference_config.json was not created"
if reference_config_path is None:
pytest.skip("reference_config.json was not created - one or more plugins failed")
assert reference_config_path.exists()

use_result = run_cli_command(
Expand Down Expand Up @@ -268,15 +271,22 @@ def test_reference_config_with_analysis_args(run_cli_command, tmp_path):


def test_reference_config_structure(run_cli_command, tmp_path):
"""Test that generated reference config has correct structure."""
"""Test that reference config is created and has correct structure when no plugin fails.

Uses OsPlugin only (likely to succeed in any environment). Requires returncode 0
so we actually assert the success path: reference config is written.
"""
log_path = str(tmp_path / "logs_structure")

result = run_cli_command(
["--log-path", log_path, "--gen-reference-config", "run-plugins", "OsPlugin"],
check=False,
)

assert result.returncode in [0, 1, 2]
assert result.returncode == 0, (
f"OsPlugin must succeed for this test (reference config only written when no plugin fails). "
f"returncode={result.returncode}, stderr={result.stderr[:500]!r}"
)

reference_config_path = find_reference_config(log_path)
assert reference_config_path is not None, "reference_config.json was not created"
Expand Down Expand Up @@ -305,6 +315,8 @@ def test_gen_reference_config_without_run_plugins(run_cli_command, tmp_path):

assert result.returncode in [0, 1, 2]

if result.returncode != 0:
pytest.skip("One or more plugins failed; reference config is not written")
reference_config_path = find_reference_config(log_path)
assert reference_config_path is not None, "reference_config.json was not created"
assert reference_config_path.exists()
Expand Down Expand Up @@ -332,6 +344,8 @@ def test_reference_config_json_valid(run_cli_command, tmp_path):

assert result.returncode in [0, 1, 2]

if result.returncode != 0:
pytest.skip("One or more plugins failed; reference config is not written")
reference_config_path = find_reference_config(log_path)
assert reference_config_path is not None, "reference_config.json was not created"
assert reference_config_path.exists()
Expand Down
54 changes: 54 additions & 0 deletions test/functional/test_run_plugins.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
"""Functional tests for running individual plugins."""

import csv
import json
from pathlib import Path

import pytest
Expand Down Expand Up @@ -175,3 +176,56 @@ def test_run_plugin_with_data_file_no_collection(run_cli_command, tmp_path):
f"Bug regression: DmesgPlugin status is NOT_RAN with --data file. "
f"Analysis should have run on provided data. Status: {status}"
)


def test_rocm_plugin_with_custom_rocm_path_collection_args(run_cli_command, tmp_path):
    """Run RocmPlugin with collection_args.rocm_path pointing at a custom tree.

    Builds a minimal ROCm-like layout (``.info/version-rocm`` and
    ``.info/version``) under ``tmp_path``, writes a plugin config whose
    ``collection_args.rocm_path`` targets it, and runs the CLI with that config.

    Checks performed:
        * "RocmPlugin" appears in the combined stdout/stderr of the CLI.
        * If a ``nodescraper.csv`` exists under the log path, it has at least
          one RocmPlugin row whose status is not ``NOT_RAN`` and whose message
          is non-empty.

    NOTE(review): the fake version written to the tree is not compared with
    the collected version, and the CSV checks are skipped when no CSV file is
    found - consider tightening once the result schema is confirmed.
    """
    custom_version = "5.0.0-999"

    # Fake ROCm install: only the version files the collector greps for.
    rocm_root = tmp_path / "custom_rocm"
    info_dir = rocm_root / ".info"
    info_dir.mkdir(parents=True)
    for version_file in ("version-rocm", "version"):
        (info_dir / version_file).write_text(custom_version + "\n")

    rocm_plugin_entry = {
        "collection_args": {"rocm_path": str(rocm_root)},
        "analysis_args": {},
    }
    config = {
        "name": "RocmPlugin custom rocm_path",
        "desc": "RocmPlugin with collection_args.rocm_path override",
        "global_args": {},
        "plugins": {"RocmPlugin": rocm_plugin_entry},
        "result_collators": {},
    }
    config_file = tmp_path / "rocm_custom_path_config.json"
    config_file.write_text(json.dumps(config, indent=2))

    log_path = str(tmp_path / "rocm_custom_logs")
    cli_args = [
        "--log-path",
        log_path,
        f"--plugin-configs={config_file}",
        "run-plugins",
        "RocmPlugin",
    ]
    result = run_cli_command(cli_args, check=False)

    assert "RocmPlugin" in result.stdout + result.stderr

    # The CSV summary is optional here: without one there is nothing more to check.
    csv_candidates = list(Path(log_path).glob("**/nodescraper.csv"))
    if not csv_candidates:
        return

    with open(csv_candidates[0], "r", encoding="utf-8") as csv_file:
        rocm_rows = [
            row for row in csv.DictReader(csv_file) if row.get("plugin") == "RocmPlugin"
        ]

    assert len(rocm_rows) >= 1, f"RocmPlugin should appear in CSV under {log_path}"
    first_row = rocm_rows[0]
    assert first_row.get("status") != "NOT_RAN"
    assert first_row.get("message")