From 01b3769c322e1e6e964cd1bc15805363ff2a2676 Mon Sep 17 00:00:00 2001 From: ShobhaKumari07 Date: Thu, 20 Feb 2025 11:35:08 +0530 Subject: [PATCH 1/4] adding amd-smi --- .../profiles/MONITORS-GPU-AMD.json | 18 +- .../AmdSmiQueryGpuParserUnitTests.cs | 39 +- .../AmdSmiXGMIQueryGpuParserUnitTests.cs | 40 + .../Examples/amd-smi/metric-8xMI300X.csv | 9 + .../Examples/amd-smi/metric.csv | 2 + .../Examples/amd-smi/result.txt | Bin 296 -> 0 bytes .../Examples/amd-smi/xgmi-8xMI300X.json | 858 ++++++++++++++++++ .../AmdSmiMetricQueryGpuParser.cs | 95 ++ .../VirtualClient.Monitors/AmdSmiMonitor.cs | 168 +++- .../AmdSmiQueryGpuParser.cs | 60 -- .../AmdSmiXGMIQueryGpuParser.cs | 88 ++ .../docs/monitors/0200-monitor-profiles.md | 41 + 12 files changed, 1325 insertions(+), 93 deletions(-) create mode 100644 src/VirtualClient/VirtualClient.Monitors.UnitTests/AmdSmiXGMIQueryGpuParserUnitTests.cs create mode 100644 src/VirtualClient/VirtualClient.Monitors.UnitTests/Examples/amd-smi/metric-8xMI300X.csv create mode 100644 src/VirtualClient/VirtualClient.Monitors.UnitTests/Examples/amd-smi/metric.csv delete mode 100644 src/VirtualClient/VirtualClient.Monitors.UnitTests/Examples/amd-smi/result.txt create mode 100644 src/VirtualClient/VirtualClient.Monitors.UnitTests/Examples/amd-smi/xgmi-8xMI300X.json create mode 100644 src/VirtualClient/VirtualClient.Monitors/AmdSmiMetricQueryGpuParser.cs delete mode 100644 src/VirtualClient/VirtualClient.Monitors/AmdSmiQueryGpuParser.cs create mode 100644 src/VirtualClient/VirtualClient.Monitors/AmdSmiXGMIQueryGpuParser.cs diff --git a/src/VirtualClient/VirtualClient.Main/profiles/MONITORS-GPU-AMD.json b/src/VirtualClient/VirtualClient.Main/profiles/MONITORS-GPU-AMD.json index 628acf1569..90bda3bfff 100644 --- a/src/VirtualClient/VirtualClient.Main/profiles/MONITORS-GPU-AMD.json +++ b/src/VirtualClient/VirtualClient.Main/profiles/MONITORS-GPU-AMD.json @@ -1,7 +1,7 @@ { "Description": "Default Monitors for AMD GPU systems.", "Metadata": { - 
"SupportedPlatforms": "linux-x64,win-x64", + "SupportedPlatforms": "linux-x64, win-x64", "SupportedOperatingSystems": "CBL-Mariner,CentOS,Debian,RedHat,Suse,Ubuntu,Windows" }, "Parameters": { @@ -12,9 +12,19 @@ { "Type": "AmdSmiMonitor", "Parameters": { - "Scenario": "AmdGpuCounters", - "MonitorFrequency": "$.Parameters.MonitorFrequency", - "MonitorWarmupPeriod": "$.Parameters.MonitorWarmupPeriod" + "Scenario": "AmdGpuCounters", + "Subsystem": "metric", + "MonitorFrequency": "$.Parameters.MonitorFrequency", + "MonitorWarmupPeriod": "$.Parameters.MonitorWarmupPeriod" + } + }, + { + "Type": "AmdSmiMonitor", + "Parameters": { + "Scenario": "AmdGpuCounters", + "Subsystem": "xgmi", + "MonitorFrequency": "$.Parameters.MonitorFrequency", + "MonitorWarmupPeriod": "$.Parameters.MonitorWarmupPeriod" } }, { diff --git a/src/VirtualClient/VirtualClient.Monitors.UnitTests/AmdSmiQueryGpuParserUnitTests.cs b/src/VirtualClient/VirtualClient.Monitors.UnitTests/AmdSmiQueryGpuParserUnitTests.cs index bba3396138..404fba54c1 100644 --- a/src/VirtualClient/VirtualClient.Monitors.UnitTests/AmdSmiQueryGpuParserUnitTests.cs +++ b/src/VirtualClient/VirtualClient.Monitors.UnitTests/AmdSmiQueryGpuParserUnitTests.cs @@ -16,22 +16,41 @@ namespace VirtualClient.Monitors [TestFixture] [Category("Unit")] - public class AmdSmiQueryGpuParserUnitTests + public class AmdSmiMetricQueryGpuParserUnitTests { [Test] - public void AmdSmiQueryGpuParserParsesMetricsCorrectly() + public void AmdSmiMetricQueryGpuParserParsesMetricsCorrectly() { string workingDirectory = Path.GetDirectoryName(Assembly.GetExecutingAssembly().Location); - string outputPath = Path.Combine(workingDirectory, "Examples", "amd-smi", "result.txt"); + string outputPath = Path.Combine(workingDirectory, "Examples", "amd-smi", "metric.csv"); string rawText = File.ReadAllText(outputPath); - - AmdSmiQueryGpuParser testParser = new AmdSmiQueryGpuParser(rawText); + AmdSmiMetricQueryGpuParser testParser = new AmdSmiMetricQueryGpuParser(rawText); 
IList metrics = testParser.Parse(); + string gpuId = "0"; // Assume GPU ID for testing, can be dynamically extracted from parsed data + MetricAssert.Exists(metrics, $"utilization.gpu [%] (GPU {gpuId})", 98, "%"); + MetricAssert.Exists(metrics, $"framebuffer.total [MB] (GPU {gpuId})", 14928, "MB"); + MetricAssert.Exists(metrics, $"framebuffer.used [MB] (GPU {gpuId})", 363, "MB"); + } - Assert.AreEqual(3, metrics.Count); - MetricAssert.Exists(metrics, "utilization.gpu [%]", 98, "%"); - MetricAssert.Exists(metrics, "framebuffer.total [MB]", 14928, "MB"); - MetricAssert.Exists(metrics, "framebuffer.used [MB]", 363, "MB"); + [Test] + public void AmdSmiMetricQueryGpuParserParsesMetricsCorrectly_MI300X() + { + string workingDirectory = Path.GetDirectoryName(Assembly.GetExecutingAssembly().Location); + string outputPath = Path.Combine(workingDirectory, "Examples", "amd-smi", "metric-8xMI300X.csv"); + string rawText = File.ReadAllText(outputPath); + AmdSmiMetricQueryGpuParser testParser = new AmdSmiMetricQueryGpuParser(rawText); + IList metrics = testParser.Parse(); + string gpuId = "0"; // Assume GPU ID for testing + MetricAssert.Exists(metrics, $"utilization.gpu (GPU {gpuId})", 0, "%"); + MetricAssert.Exists(metrics, $"utilization.memory (GPU {gpuId})", 0, "%"); + MetricAssert.Exists(metrics, $"temperature.gpu (GPU {gpuId})", 36, "celsius"); + MetricAssert.Exists(metrics, $"temperature.memory (GPU {gpuId})", 30, "celsius"); + MetricAssert.Exists(metrics, $"power.draw.average (GPU {gpuId})", 133, "W"); + MetricAssert.Exists(metrics, $"gfx_clk_avg (GPU {gpuId})", 132.125, "MHz"); + MetricAssert.Exists(metrics, $"mem_clk (GPU {gpuId})", 900, "MHz"); + MetricAssert.Exists(metrics, $"video_vclk_avg (GPU {gpuId})", 29, "MHz"); + MetricAssert.Exists(metrics, $"video_dclk_avg (GPU {gpuId})", 22, "MHz"); + MetricAssert.Exists(metrics, $"pcie_bw (GPU {gpuId})", 24, "MB/s"); } } -} \ No newline at end of file +} diff --git 
a/src/VirtualClient/VirtualClient.Monitors.UnitTests/AmdSmiXGMIQueryGpuParserUnitTests.cs b/src/VirtualClient/VirtualClient.Monitors.UnitTests/AmdSmiXGMIQueryGpuParserUnitTests.cs new file mode 100644 index 0000000000..b5d94ca668 --- /dev/null +++ b/src/VirtualClient/VirtualClient.Monitors.UnitTests/AmdSmiXGMIQueryGpuParserUnitTests.cs @@ -0,0 +1,40 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +namespace VirtualClient.Monitors +{ + using System.Collections.Generic; + using System.Diagnostics; + using System.IO; + using System.Linq; + using System.Reflection; + using System.Text; + using System.Threading.Tasks; + using NUnit.Framework; + using VirtualClient.Common; + using VirtualClient.Contracts; + + [TestFixture] + [Category("Unit")] + public class AmdSmiXGMIQueryGpuParserUnitTests + { + [Test] + public void AmdSmiXGMIQueryGpuParserParsesMetricsCorrectly() + { + string workingDirectory = Path.GetDirectoryName(Assembly.GetExecutingAssembly().Location); + string outputPath = Path.Combine(workingDirectory, "Examples", "amd-smi", "xgmi-8xMI300X.json"); + string rawText = File.ReadAllText(outputPath); + AmdSmiXGMIQueryGpuParser testParser = new AmdSmiXGMIQueryGpuParser(rawText); + IList metrics = testParser.Parse(); + Assert.AreEqual(8, metrics.Count); + MetricAssert.Exists(metrics, "xgmi_0_data", 14, "KB"); + MetricAssert.Exists(metrics, "xgmi_1_data", 12, "KB"); + MetricAssert.Exists(metrics, "xgmi_2_data", 10, "KB"); + MetricAssert.Exists(metrics, "xgmi_3_data", 9, "KB"); + MetricAssert.Exists(metrics, "xgmi_4_data", 9, "KB"); + MetricAssert.Exists(metrics, "xgmi_5_data", 8, "KB"); + MetricAssert.Exists(metrics, "xgmi_6_data", 6, "KB"); + MetricAssert.Exists(metrics, "xgmi_7_data", 6, "KB"); + } + } +} \ No newline at end of file diff --git a/src/VirtualClient/VirtualClient.Monitors.UnitTests/Examples/amd-smi/metric-8xMI300X.csv b/src/VirtualClient/VirtualClient.Monitors.UnitTests/Examples/amd-smi/metric-8xMI300X.csv new file 
mode 100644 index 0000000000..fd48ca0183 --- /dev/null +++ b/src/VirtualClient/VirtualClient.Monitors.UnitTests/Examples/amd-smi/metric-8xMI300X.csv @@ -0,0 +1,9 @@ +gpu,gfx_activity,umc_activity,mm_activity,vcn_activity,jpeg_activity,socket_power,gfx_voltage,soc_voltage,mem_voltage,power_management,throttle_status,gfx_0_clk,gfx_0_min_clk,gfx_0_max_clk,gfx_0_clk_locked,gfx_0_deep_sleep,gfx_1_clk,gfx_1_min_clk,gfx_1_max_clk,gfx_1_clk_locked,gfx_1_deep_sleep,gfx_2_clk,gfx_2_min_clk,gfx_2_max_clk,gfx_2_clk_locked,gfx_2_deep_sleep,gfx_3_clk,gfx_3_min_clk,gfx_3_max_clk,gfx_3_clk_locked,gfx_3_deep_sleep,gfx_4_clk,gfx_4_min_clk,gfx_4_max_clk,gfx_4_clk_locked,gfx_4_deep_sleep,gfx_5_clk,gfx_5_min_clk,gfx_5_max_clk,gfx_5_clk_locked,gfx_5_deep_sleep,gfx_6_clk,gfx_6_min_clk,gfx_6_max_clk,gfx_6_clk_locked,gfx_6_deep_sleep,gfx_7_clk,gfx_7_min_clk,gfx_7_max_clk,gfx_7_clk_locked,gfx_7_deep_sleep,mem_0_clk,mem_0_min_clk,mem_0_max_clk,mem_0_clk_locked,mem_0_deep_sleep,vclk_0_clk,vclk_0_min_clk,vclk_0_max_clk,vclk_0_clk_locked,vclk_0_deep_sleep,vclk_1_clk,vclk_1_min_clk,vclk_1_max_clk,vclk_1_clk_locked,vclk_1_deep_sleep,vclk_2_clk,vclk_2_min_clk,vclk_2_max_clk,vclk_2_clk_locked,vclk_2_deep_sleep,vclk_3_clk,vclk_3_min_clk,vclk_3_max_clk,vclk_3_clk_locked,vclk_3_deep_sleep,dclk_0_clk,dclk_0_min_clk,dclk_0_max_clk,dclk_0_clk_locked,dclk_0_deep_sleep,dclk_1_clk,dclk_1_min_clk,dclk_1_max_clk,dclk_1_clk_locked,dclk_1_deep_sleep,dclk_2_clk,dclk_2_min_clk,dclk_2_max_clk,dclk_2_clk_locked,dclk_2_deep_sleep,dclk_3_clk,dclk_3_min_clk,dclk_3_max_clk,dclk_3_clk_locked,dclk_3_deep_sleep,edge,hotspot,mem,width,speed,bandwidth,replay_count,l0_to_recovery_count,replay_roll_over_count,nak_sent_count,nak_received_count,current_bandwidth_sent,current_bandwidth_received,max_packet_size,total_correctable_count,total_uncorrectable_count,total_deferred_count,cache_correctable_count,cache_uncorrectable_count,UMC_correctable_count,UMC_uncorrectable_count,UMC_deferred_count,SDMA_correctable_count,SDMA_uncorrect
able_count,SDMA_deferred_count,GFX_correctable_count,GFX_uncorrectable_count,GFX_deferred_count,MMHUB_correctable_count,MMHUB_uncorrectable_count,MMHUB_deferred_count,PCIE_BIF_correctable_count,PCIE_BIF_uncorrectable_count,PCIE_BIF_deferred_count,HDP_correctable_count,HDP_uncorrectable_count,HDP_deferred_count,XGMI_WAFL_correctable_count,XGMI_WAFL_uncorrectable_count,XGMI_WAFL_deferred_count,max,rpm,usage,point_0_frequency,point_0_voltage,point_1_frequency,point_1_voltage,point_2_frequency,point_2_voltage,overdrive,perf_level,xgmi_err,total_energy_consumption,total_vram,used_vram,free_vram,total_visible_vram,used_visible_vram,free_visible_vram,total_gtt,used_gtt,free_gtt +0,0,0,N/A,"[0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",133,N/A,N/A,N/A,ENABLED,UNTHROTTLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,133,500,2100,DISABLED,ENABLED,900,900,1300,N/A,DISABLED,29,914,1333,N/A,ENABLED,29,914,1333,N/A,ENABLED,29,914,1333,N/A,ENABLED,29,914,1333,N/A,ENABLED,22,711,1143,N/A,ENABLED,22,711,1143,N/A,ENABLED,22,711,1143,N/A,ENABLED,22,711,1143,N/A,ENABLED,N/A,36,30,16,N/A,192,0,1,0,0,0,N/A,N/A,N/A,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,AMDSMI_DEV_PERF_LEVEL_AUTO,N/A,12119300.381,196592,283,196309,196592,283,196309,1031932,20,1031912 +1,0,0,N/A,"[0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0]",139,N/A,N/A,N/A,ENABLED,UNTHROTTLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,133,500,2100,DISABLED,ENABLED,133,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,133,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,900,900,1300,N/A,DISABLED,29,914,1333,N/A,ENABLED,29,914,1333,N/A,ENABLED,29,914,1333,N/A,ENABLED,29,914,1333,N/A,ENABLED,22,711,1143,N/A,ENABLED,22,711,1143,N/A,ENABLED,22,711,1143,N/A,ENABLED,22,711,1143,N/A,ENABLED,N/A,35,29,16,N/A,157,0,1,0,0,0,N/A,N/A,N/A,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,AMDSMI_DEV_PERF_LEVEL_AUTO,N/A,12648636.191,196592,283,196309,196592,283,196309,1031932,20,1031912 +2,0,0,N/A,"[0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",133,N/A,N/A,N/A,ENABLED,UNTHROTTLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,900,900,1300,N/A,DISABLED,29,914,1333,N/A,ENABLED,29,914,1333,N/A,ENABLED,29,914,1333,N/A,ENABLED,29,914,1333,N/A,ENABLED,22,711,1143,N/A,ENABLED,22,711,1143,N/A,ENABLED,22,711,1143,N/A,ENABLED,22,711,1143,N/A,ENABLED,N/A,32,29,16,N/A,106,0,1,0,0,0,N/A,N/A,N/A,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,AMDSMI_DEV_PERF_LEVEL_AUTO,N/A,12142639.892,196592,283,196309,196592,283,196309,1031932,20,1031912 +3,0,0,N/A,"[0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0]",132,N/A,N/A,N/A,ENABLED,UNTHROTTLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,900,900,1300,N/A,DISABLED,29,914,1333,N/A,ENABLED,29,914,1333,N/A,ENABLED,29,914,1333,N/A,ENABLED,29,914,1333,N/A,ENABLED,22,711,1143,N/A,ENABLED,22,711,1143,N/A,ENABLED,22,711,1143,N/A,ENABLED,22,711,1143,N/A,ENABLED,N/A,36,28,16,N/A,192,0,1,0,0,0,N/A,N/A,N/A,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,AMDSMI_DEV_PERF_LEVEL_AUTO,N/A,12086086.983,196592,283,196309,196592,283,196309,1031932,20,1031912 +4,0,0,N/A,"[0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",136,N/A,N/A,N/A,ENABLED,UNTHROTTLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,133,500,2100,DISABLED,ENABLED,133,500,2100,DISABLED,ENABLED,134,500,2100,DISABLED,ENABLED,900,900,1300,N/A,DISABLED,29,914,1333,N/A,ENABLED,29,914,1333,N/A,ENABLED,29,914,1333,N/A,ENABLED,29,914,1333,N/A,ENABLED,22,711,1143,N/A,ENABLED,22,711,1143,N/A,ENABLED,22,711,1143,N/A,ENABLED,22,711,1143,N/A,ENABLED,N/A,36,29,16,N/A,145,0,1,0,0,0,N/A,N/A,N/A,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,AMDSMI_DEV_PERF_LEVEL_AUTO,N/A,12403534.5,196592,283,196309,196592,283,196309,1031932,20,1031912 +5,0,0,N/A,"[0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0]",132,N/A,N/A,N/A,ENABLED,UNTHROTTLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,900,900,1300,N/A,DISABLED,29,914,1333,N/A,ENABLED,29,914,1333,N/A,ENABLED,29,914,1333,N/A,ENABLED,29,914,1333,N/A,ENABLED,22,711,1143,N/A,ENABLED,22,711,1143,N/A,ENABLED,22,711,1143,N/A,ENABLED,22,711,1143,N/A,ENABLED,N/A,35,29,16,N/A,107,0,1,0,0,0,N/A,N/A,N/A,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,AMDSMI_DEV_PERF_LEVEL_AUTO,N/A,12011883.234,196592,283,196309,196592,283,196309,1031932,20,1031912 +6,0,0,N/A,"[0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",132,N/A,N/A,N/A,ENABLED,UNTHROTTLED,132,500,2100,DISABLED,ENABLED,131,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,900,900,1300,N/A,DISABLED,29,914,1333,N/A,ENABLED,29,914,1333,N/A,ENABLED,29,914,1333,N/A,ENABLED,29,914,1333,N/A,ENABLED,22,711,1143,N/A,ENABLED,22,711,1143,N/A,ENABLED,22,711,1143,N/A,ENABLED,22,711,1143,N/A,ENABLED,N/A,36,29,16,N/A,94,0,1,0,0,0,N/A,N/A,N/A,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,AMDSMI_DEV_PERF_LEVEL_AUTO,N/A,11987029.516,196592,283,196309,196592,283,196309,1031932,20,1031912 +7,0,0,N/A,"[0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0]",134,N/A,N/A,N/A,ENABLED,UNTHROTTLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,900,900,1300,N/A,DISABLED,29,914,1333,N/A,ENABLED,29,914,1333,N/A,ENABLED,29,914,1333,N/A,ENABLED,29,914,1333,N/A,ENABLED,22,711,1143,N/A,ENABLED,22,711,1143,N/A,ENABLED,22,711,1143,N/A,ENABLED,22,711,1143,N/A,ENABLED,N/A,37,31,16,N/A,90,0,1,0,0,0,N/A,N/A,N/A,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,AMDSMI_DEV_PERF_LEVEL_AUTO,N/A,12193331.537,196592,283,196309,196592,283,196309,1031932,20,1031912 \ No newline at end of file diff --git a/src/VirtualClient/VirtualClient.Monitors.UnitTests/Examples/amd-smi/metric.csv b/src/VirtualClient/VirtualClient.Monitors.UnitTests/Examples/amd-smi/metric.csv new file mode 100644 index 0000000000..bf9329164a --- /dev/null +++ b/src/VirtualClient/VirtualClient.Monitors.UnitTests/Examples/amd-smi/metric.csv @@ -0,0 +1,2 @@ +gpu,gfx_usage,mem_usage,mm_usage_list,fb_total,fb_used,gfx_cur_clk,mem_cur_clk,mm1_cur_clk,mm2_cur_clk +0,98,1,[0, 0],14928,363,N/A,N/A,N/A,N/A \ No newline at end of file diff --git a/src/VirtualClient/VirtualClient.Monitors.UnitTests/Examples/amd-smi/result.txt b/src/VirtualClient/VirtualClient.Monitors.UnitTests/Examples/amd-smi/result.txt deleted file mode 100644 index 73551c460e6d558b167c4e7dcf6dc36d251a14a7..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 296 zcmZ{d-3o$05QV?%pm*p2f>I;wvKR0LB*aWbYDxToo?bh<5LinWXLjeCGiT)Z^`F{O| l(Bo@X?3H#RRt==o^&8cTBL~ewH;p(Fi6!hf|9x$u)f=;2G86y+ diff --git a/src/VirtualClient/VirtualClient.Monitors.UnitTests/Examples/amd-smi/xgmi-8xMI300X.json b/src/VirtualClient/VirtualClient.Monitors.UnitTests/Examples/amd-smi/xgmi-8xMI300X.json new file mode 100644 index 0000000000..eaff221e17 --- /dev/null +++ 
b/src/VirtualClient/VirtualClient.Monitors.UnitTests/Examples/amd-smi/xgmi-8xMI300X.json @@ -0,0 +1,858 @@ +[ + { + "gpu": 0, + "bdf": "0000:0c:00.0", + "link_metrics": { + "bit_rate": { + "value": 32, + "unit": "Gb/s" + }, + "max_bandwidth": { + "value": 512, + "unit": "Gb/s" + }, + "link_type": "XGMI", + "links": [ + { + "gpu": 0, + "bdf": "0000:0c:00.0", + "read": "N/A", + "write": "N/A" + }, + { + "gpu": 1, + "bdf": "0000:22:00.0", + "read": { + "value": 1, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 2, + "bdf": "0000:38:00.0", + "read": { + "value": 1, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 3, + "bdf": "0000:5c:00.0", + "read": { + "value": 1, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 4, + "bdf": "0000:9f:00.0", + "read": { + "value": 1, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 5, + "bdf": "0000:af:00.0", + "read": { + "value": 1, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 6, + "bdf": "0000:bf:00.0", + "read": { + "value": 1, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 7, + "bdf": "0000:df:00.0", + "read": { + "value": 1, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + } + ] + } + }, + { + "gpu": 1, + "bdf": "0000:22:00.0", + "link_metrics": { + "bit_rate": { + "value": 32, + "unit": "Gb/s" + }, + "max_bandwidth": { + "value": 512, + "unit": "Gb/s" + }, + "link_type": "XGMI", + "links": [ + { + "gpu": 0, + "bdf": "0000:0c:00.0", + "read": { + "value": 0, + "unit": "KB" + }, + "write": { + "value": 0, + "unit": "KB" + } + }, + { + "gpu": 1, + "bdf": "0000:22:00.0", + "read": "N/A", + "write": "N/A" + }, + { + "gpu": 2, + "bdf": "0000:38:00.0", + "read": { + "value": 1, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 3, + "bdf": "0000:5c:00.0", + "read": { + 
"value": 1, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 4, + "bdf": "0000:9f:00.0", + "read": { + "value": 1, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 5, + "bdf": "0000:af:00.0", + "read": { + "value": 1, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 6, + "bdf": "0000:bf:00.0", + "read": { + "value": 1, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 7, + "bdf": "0000:df:00.0", + "read": { + "value": 1, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + } + ] + } + }, + { + "gpu": 2, + "bdf": "0000:38:00.0", + "link_metrics": { + "bit_rate": { + "value": 32, + "unit": "Gb/s" + }, + "max_bandwidth": { + "value": 512, + "unit": "Gb/s" + }, + "link_type": "XGMI", + "links": [ + { + "gpu": 0, + "bdf": "0000:0c:00.0", + "read": { + "value": 0, + "unit": "KB" + }, + "write": { + "value": 0, + "unit": "KB" + } + }, + { + "gpu": 1, + "bdf": "0000:22:00.0", + "read": { + "value": 1, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 2, + "bdf": "0000:38:00.0", + "read": "N/A", + "write": "N/A" + }, + { + "gpu": 3, + "bdf": "0000:5c:00.0", + "read": { + "value": 1, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 4, + "bdf": "0000:9f:00.0", + "read": { + "value": 1, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 5, + "bdf": "0000:af:00.0", + "read": { + "value": 1, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 6, + "bdf": "0000:bf:00.0", + "read": { + "value": 0, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 7, + "bdf": "0000:df:00.0", + "read": { + "value": 0, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + } + ] + } + }, + { + "gpu": 3, + "bdf": "0000:5c:00.0", + "link_metrics": { + "bit_rate": { + 
"value": 32, + "unit": "Gb/s" + }, + "max_bandwidth": { + "value": 512, + "unit": "Gb/s" + }, + "link_type": "XGMI", + "links": [ + { + "gpu": 0, + "bdf": "0000:0c:00.0", + "read": { + "value": 0, + "unit": "KB" + }, + "write": { + "value": 0, + "unit": "KB" + } + }, + { + "gpu": 1, + "bdf": "0000:22:00.0", + "read": { + "value": 0, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 2, + "bdf": "0000:38:00.0", + "read": { + "value": 1, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 3, + "bdf": "0000:5c:00.0", + "read": "N/A", + "write": "N/A" + }, + { + "gpu": 4, + "bdf": "0000:9f:00.0", + "read": { + "value": 1, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 5, + "bdf": "0000:af:00.0", + "read": { + "value": 0, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 6, + "bdf": "0000:bf:00.0", + "read": { + "value": 0, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 7, + "bdf": "0000:df:00.0", + "read": { + "value": 1, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + } + ] + } + }, + { + "gpu": 4, + "bdf": "0000:9f:00.0", + "link_metrics": { + "bit_rate": { + "value": 32, + "unit": "Gb/s" + }, + "max_bandwidth": { + "value": 512, + "unit": "Gb/s" + }, + "link_type": "XGMI", + "links": [ + { + "gpu": 0, + "bdf": "0000:0c:00.0", + "read": { + "value": 0, + "unit": "KB" + }, + "write": { + "value": 0, + "unit": "KB" + } + }, + { + "gpu": 1, + "bdf": "0000:22:00.0", + "read": { + "value": 1, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 2, + "bdf": "0000:38:00.0", + "read": { + "value": 0, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 3, + "bdf": "0000:5c:00.0", + "read": { + "value": 0, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 4, + "bdf": "0000:9f:00.0", + "read": 
"N/A", + "write": "N/A" + }, + { + "gpu": 5, + "bdf": "0000:af:00.0", + "read": { + "value": 1, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 6, + "bdf": "0000:bf:00.0", + "read": { + "value": 0, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 7, + "bdf": "0000:df:00.0", + "read": { + "value": 1, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + } + ] + } + }, + { + "gpu": 5, + "bdf": "0000:af:00.0", + "link_metrics": { + "bit_rate": { + "value": 32, + "unit": "Gb/s" + }, + "max_bandwidth": { + "value": 512, + "unit": "Gb/s" + }, + "link_type": "XGMI", + "links": [ + { + "gpu": 0, + "bdf": "0000:0c:00.0", + "read": { + "value": 0, + "unit": "KB" + }, + "write": { + "value": 0, + "unit": "KB" + } + }, + { + "gpu": 1, + "bdf": "0000:22:00.0", + "read": { + "value": 1, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 2, + "bdf": "0000:38:00.0", + "read": { + "value": 0, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 3, + "bdf": "0000:5c:00.0", + "read": { + "value": 0, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 4, + "bdf": "0000:9f:00.0", + "read": { + "value": 0, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 5, + "bdf": "0000:af:00.0", + "read": "N/A", + "write": "N/A" + }, + { + "gpu": 6, + "bdf": "0000:bf:00.0", + "read": { + "value": 1, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 7, + "bdf": "0000:df:00.0", + "read": { + "value": 0, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + } + ] + } + }, + { + "gpu": 6, + "bdf": "0000:bf:00.0", + "link_metrics": { + "bit_rate": { + "value": 32, + "unit": "Gb/s" + }, + "max_bandwidth": { + "value": 512, + "unit": "Gb/s" + }, + "link_type": "XGMI", + "links": [ + { + "gpu": 0, + "bdf": "0000:0c:00.0", + "read": { + "value": 0, + 
"unit": "KB" + }, + "write": { + "value": 0, + "unit": "KB" + } + }, + { + "gpu": 1, + "bdf": "0000:22:00.0", + "read": { + "value": 0, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 2, + "bdf": "0000:38:00.0", + "read": { + "value": 0, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 3, + "bdf": "0000:5c:00.0", + "read": { + "value": 0, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 4, + "bdf": "0000:9f:00.0", + "read": { + "value": 0, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 5, + "bdf": "0000:af:00.0", + "read": { + "value": 0, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 6, + "bdf": "0000:bf:00.0", + "read": "N/A", + "write": "N/A" + }, + { + "gpu": 7, + "bdf": "0000:df:00.0", + "read": { + "value": 0, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + } + ] + } + }, + { + "gpu": 7, + "bdf": "0000:df:00.0", + "link_metrics": { + "bit_rate": { + "value": 32, + "unit": "Gb/s" + }, + "max_bandwidth": { + "value": 512, + "unit": "Gb/s" + }, + "link_type": "XGMI", + "links": [ + { + "gpu": 0, + "bdf": "0000:0c:00.0", + "read": { + "value": 0, + "unit": "KB" + }, + "write": { + "value": 0, + "unit": "KB" + } + }, + { + "gpu": 1, + "bdf": "0000:22:00.0", + "read": { + "value": 0, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 2, + "bdf": "0000:38:00.0", + "read": { + "value": 0, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 3, + "bdf": "0000:5c:00.0", + "read": { + "value": 0, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 4, + "bdf": "0000:9f:00.0", + "read": { + "value": 0, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 5, + "bdf": "0000:af:00.0", + "read": { + "value": 0, + "unit": "KB" + }, + "write": { + "value": 
1, + "unit": "KB" + } + }, + { + "gpu": 6, + "bdf": "0000:bf:00.0", + "read": { + "value": 0, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 7, + "bdf": "0000:df:00.0", + "read": "N/A", + "write": "N/A" + } + ] + } + } +] \ No newline at end of file diff --git a/src/VirtualClient/VirtualClient.Monitors/AmdSmiMetricQueryGpuParser.cs b/src/VirtualClient/VirtualClient.Monitors/AmdSmiMetricQueryGpuParser.cs new file mode 100644 index 0000000000..91cec01f34 --- /dev/null +++ b/src/VirtualClient/VirtualClient.Monitors/AmdSmiMetricQueryGpuParser.cs @@ -0,0 +1,95 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +namespace VirtualClient.Monitors +{ + using System; + using System.Collections.Generic; + using System.Data; + using System.Linq; + using System.Text.RegularExpressions; + using VirtualClient.Contracts; + using DataTableExtensions = VirtualClient.Contracts.DataTableExtensions; + + /// + /// Parser for AmdSmi output document. + /// + public class AmdSmiMetricQueryGpuParser : MetricsParser + { + /// + /// Constructor for + /// + /// Raw text to parse. 
+ public AmdSmiMetricQueryGpuParser(string rawText) + : base(rawText) + { + } + + /// + public override IList Parse() + { + this.Preprocess(); + + List metrics = new List(); + DataTable dataTable = DataTableExtensions.DataTableFromCsv(this.PreprocessedText); + + foreach (DataRow row in dataTable.Rows) + { + string gpuId = Convert.ToString(SafeGet(row, "gpu")); + Dictionary metadata = new Dictionary() + { + { "gpu.id", gpuId } + }; + + metrics.Add(new Metric($"utilization.gpu [%] (GPU {gpuId})", Convert.ToDouble(SafeGet(row, "gfx_usage")), unit: "%", metadata: metadata)); + metrics.Add(new Metric($"framebuffer.total [MB] (GPU {gpuId})", Convert.ToDouble(SafeGet(row, "fb_total")), unit: "MB", metadata: metadata)); + metrics.Add(new Metric($"framebuffer.used [MB] (GPU {gpuId})", Convert.ToDouble(SafeGet(row, "fb_used")), unit: "MB", metadata: metadata)); + + // AMD MI300X + metrics.Add(new Metric($"utilization.gpu (GPU {gpuId})", Convert.ToDouble(SafeGet(row, "gfx_activity")), unit: "%", metadata: metadata)); + double value = 100 * Convert.ToDouble(SafeGet(row, "used_vram")) / Convert.ToDouble(SafeGet(row, "total_vram")); + int roundedValue = Convert.ToInt32(Math.Round(value)); + metrics.Add(new Metric($"utilization.memory (GPU {gpuId})", roundedValue, unit: "%", metadata: metadata)); + metrics.Add(new Metric($"temperature.gpu (GPU {gpuId})", Convert.ToDouble(SafeGet(row, "hotspot")), unit: "celsius", metadata: metadata)); + metrics.Add(new Metric($"temperature.memory (GPU {gpuId})", Convert.ToDouble(SafeGet(row, "mem")), unit: "celsius", metadata: metadata)); + metrics.Add(new Metric($"power.draw.average (GPU {gpuId})", Convert.ToDouble(SafeGet(row, "socket_power")), unit: "W", metadata: metadata)); + + double gfx_clk_avg = (Convert.ToDouble(SafeGet(row, "gfx_0_clk")) + Convert.ToDouble(SafeGet(row, "gfx_1_clk")) + + Convert.ToDouble(SafeGet(row, "gfx_2_clk")) + Convert.ToDouble(SafeGet(row, "gfx_3_clk")) + + Convert.ToDouble(SafeGet(row, "gfx_4_clk")) + 
Convert.ToDouble(SafeGet(row, "gfx_5_clk")) + + Convert.ToDouble(SafeGet(row, "gfx_6_clk")) + Convert.ToDouble(SafeGet(row, "gfx_7_clk"))) / 8; + + metrics.Add(new Metric($"gfx_clk_avg (GPU {gpuId})", gfx_clk_avg, unit: "MHz", metadata: metadata)); + metrics.Add(new Metric($"mem_clk (GPU {gpuId})", Convert.ToDouble(SafeGet(row, "mem_0_clk")), unit: "MHz", metadata: metadata)); + + double video_vclk_avg = (Convert.ToDouble(SafeGet(row, "vclk_0_clk")) + Convert.ToDouble(SafeGet(row, "vclk_1_clk")) + + Convert.ToDouble(SafeGet(row, "vclk_2_clk")) + Convert.ToDouble(SafeGet(row, "vclk_3_clk"))) / 4; + + metrics.Add(new Metric($"video_vclk_avg (GPU {gpuId})", video_vclk_avg, unit: "MHz", metadata: metadata)); + + double video_dclk_avg = (Convert.ToDouble(SafeGet(row, "dclk_0_clk")) + Convert.ToDouble(SafeGet(row, "dclk_1_clk")) + + Convert.ToDouble(SafeGet(row, "dclk_2_clk")) + Convert.ToDouble(SafeGet(row, "dclk_3_clk"))) / 4; + + metrics.Add(new Metric($"video_dclk_avg (GPU {gpuId})", video_dclk_avg, unit: "MHz", metadata: metadata)); + metrics.Add(new Metric($"pcie_bw (GPU {gpuId})", Convert.ToDouble(SafeGet(row, "bandwidth")) / 8, unit: "MB/s", metadata: metadata)); + } + + return metrics; + } + + /// + protected override void Preprocess() + { + this.PreprocessedText = this.RawText.Replace("\r\n", Environment.NewLine); + Regex quotedPattern = new Regex("\"([^\"]*)\""); + this.PreprocessedText = quotedPattern.Replace(this.PreprocessedText, "N/A"); + Regex quotedPattern2 = new Regex("\\[.*?\\]"); + this.PreprocessedText = quotedPattern2.Replace(this.PreprocessedText, "N/A"); + } + + private static IConvertible SafeGet(DataRow row, string columnName) + { + return row.Table.Columns.Contains(columnName) ? 
Convert.ToString(row[columnName]) : "-1"; + } + } +} diff --git a/src/VirtualClient/VirtualClient.Monitors/AmdSmiMonitor.cs b/src/VirtualClient/VirtualClient.Monitors/AmdSmiMonitor.cs index da9ac6666c..21012e373b 100644 --- a/src/VirtualClient/VirtualClient.Monitors/AmdSmiMonitor.cs +++ b/src/VirtualClient/VirtualClient.Monitors/AmdSmiMonitor.cs @@ -5,14 +5,17 @@ namespace VirtualClient.Monitors { using System; using System.Collections.Generic; + using System.Diagnostics; using System.IO.Abstractions; using System.Linq; using System.Threading; using System.Threading.Tasks; using global::VirtualClient; using global::VirtualClient.Contracts; + using Microsoft.CodeAnalysis; using Microsoft.Extensions.DependencyInjection; using Microsoft.Extensions.Logging; + using Utilities; using VirtualClient.Common; using VirtualClient.Common.Extensions; using VirtualClient.Common.Telemetry; @@ -22,27 +25,52 @@ namespace VirtualClient.Monitors /// public class AmdSmiMonitor : VirtualClientIntervalBasedMonitor { + /// + /// Name of Metric subsystem. + /// + protected const string Metric = "metric"; + + /// + /// Name of XGMI subsystem. + /// + protected const string XGMI = "xgmi"; + + private ISystemManagement systemManagement; + private IFileSystem fileSystem; + /// /// Initializes a new instance of the class. /// public AmdSmiMonitor(IServiceCollection dependencies, IDictionary parameters) : base(dependencies, parameters) { + this.systemManagement = this.Dependencies.GetService(); + this.fileSystem = this.systemManagement.FileSystem; + } + + /// + /// AMDSMI Subsystem Name. 
+ /// + public string Subsystem + { + get + { + this.Parameters.TryGetValue(nameof(AmdSmiMonitor.Subsystem), out IConvertible subsystem); + return subsystem?.ToString(); + } } /// protected override async Task ExecuteAsync(EventContext telemetryContext, CancellationToken cancellationToken) { - switch (this.Platform) + if (this.Subsystem == AmdSmiMonitor.Metric) { - case PlatformID.Win32NT: - await this.QueryGpuAsync(telemetryContext, cancellationToken) - .ConfigureAwait(false); - break; + await this.QueryGpuMetricAsync(telemetryContext, cancellationToken).ConfigureAwait(false); + } - case PlatformID.Unix: - // not supported at the moment - break; + if (this.Subsystem == AmdSmiMonitor.XGMI) + { + await this.QueryGpuXGMIAsync(telemetryContext, cancellationToken).ConfigureAwait(false); } } @@ -58,38 +86,69 @@ protected void ValidateParameters() } } + private string GetAmdSmiCommand() + { + string command = string.Empty; + switch (this.Platform) + { + case PlatformID.Win32NT: + command = "amdsmi"; + break; + + case PlatformID.Unix: + command = "amd-smi"; + break; + } + + return command; + } + + private IList AmdSmiXGMIBandwidthAggregator(IList metrics1, IList metrics2, long time) + { + List aggregatedMetrics = new List(); + + if (metrics1.Any() && metrics2.Any()) + { + foreach (Metric counter1 in metrics1) + { + foreach (Metric counter2 in metrics2) + { + if (counter1.Metadata["gpu.id"] == counter2.Metadata["gpu.id"]) + { + double bandwidth = (counter2.Value - counter1.Value) / (((double)time) / 1000.0); + aggregatedMetrics.Add(new Metric($"xgmi.bw", (bandwidth / 1024), unit: "MB/s", metadata: counter1.Metadata)); + } + } + } + } + + return aggregatedMetrics; + } + /// /// Query the gpu for utilization information /// /// Provides context information that will be captured with telemetry events. /// A token that can be used to cancel the operation. 
/// - private async Task QueryGpuAsync(EventContext telemetryContext, CancellationToken cancellationToken) + private async Task QueryGpuMetricAsync(EventContext telemetryContext, CancellationToken cancellationToken) { - ISystemManagement systemManagement = this.Dependencies.GetService(); - IFileSystem fileSystem = systemManagement.FileSystem; - int totalSamples = (int)this.MonitorFrequency.TotalSeconds; - string command = "amdsmi"; string commandArguments = "metric --csv"; await Task.Delay(this.MonitorWarmupPeriod, cancellationToken) .ConfigureAwait(false); - while (!cancellationToken.IsCancellationRequested) { try { - using (IProcessProxy process = systemManagement.ProcessManager.CreateElevatedProcess(this.Platform, command, $"{commandArguments}", Environment.CurrentDirectory)) + using (IProcessProxy process = this.systemManagement.ProcessManager.CreateElevatedProcess(this.Platform, this.GetAmdSmiCommand(), $"{commandArguments}", Environment.CurrentDirectory)) { this.CleanupTasks.Add(() => process.SafeKill()); - DateTime startTime = DateTime.UtcNow; await process.StartAndWaitAsync(cancellationToken) .ConfigureAwait(false); - DateTime endTime = DateTime.UtcNow; - if (!cancellationToken.IsCancellationRequested) { try @@ -99,7 +158,7 @@ await process.StartAndWaitAsync(cancellationToken) if (process.StandardOutput.Length > 0) { - AmdSmiQueryGpuParser parser = new AmdSmiQueryGpuParser(process.StandardOutput.ToString()); + AmdSmiMetricQueryGpuParser parser = new AmdSmiMetricQueryGpuParser(process.StandardOutput.ToString()); IList metrics = parser.Parse(); if (metrics?.Any() == true) @@ -128,5 +187,76 @@ await process.StartAndWaitAsync(cancellationToken) } } } + + private async Task QueryGpuXGMIAsync(EventContext telemetryContext, CancellationToken cancellationToken) + { + int totalSamples = (int)this.MonitorFrequency.TotalSeconds; + string commandArguments = "xgmi -m --json"; + DateTime startTime1, endTime1, startTime2, endTime2; + IList metrics1, metrics2, 
aggregatedMetrics; + + Stopwatch stopwatch; + long elapsedMilliseconds; + + await Task.Delay(this.MonitorWarmupPeriod, cancellationToken) + .ConfigureAwait(false); + + while (!cancellationToken.IsCancellationRequested) + { + try + { + using (IProcessProxy process = this.systemManagement.ProcessManager.CreateElevatedProcess(this.Platform, this.GetAmdSmiCommand(), $"{commandArguments}", Environment.CurrentDirectory)) + { + this.CleanupTasks.Add(() => process.SafeKill()); + + stopwatch = Stopwatch.StartNew(); + + startTime1 = DateTime.UtcNow; + await process.StartAndWaitAsync(cancellationToken) + .ConfigureAwait(false); + + endTime1 = DateTime.UtcNow; + + AmdSmiXGMIQueryGpuParser parser = new AmdSmiXGMIQueryGpuParser(process.StandardOutput.ToString()); + metrics1 = parser.Parse(); + } + + await Task.Delay(500).ConfigureAwait(false); + + using (IProcessProxy process = this.systemManagement.ProcessManager.CreateElevatedProcess(this.Platform, this.GetAmdSmiCommand(), $"{commandArguments}", Environment.CurrentDirectory)) + { + this.CleanupTasks.Add(() => process.SafeKill()); + + startTime2 = DateTime.UtcNow; + await process.StartAndWaitAsync(cancellationToken) + .ConfigureAwait(false); + + endTime2 = DateTime.UtcNow; + stopwatch.Stop(); + elapsedMilliseconds = stopwatch.ElapsedMilliseconds; + + AmdSmiXGMIQueryGpuParser parser = new AmdSmiXGMIQueryGpuParser(process.StandardOutput.ToString()); + metrics2 = parser.Parse(); + } + + aggregatedMetrics = this.AmdSmiXGMIBandwidthAggregator(metrics1, metrics2, time: elapsedMilliseconds); + + if (aggregatedMetrics?.Any() == true) + { + this.Logger.LogPerformanceCounters("amd", aggregatedMetrics, startTime1, endTime2, telemetryContext); + } + + await Task.Delay(this.MonitorFrequency).ConfigureAwait(false); + } + catch (OperationCanceledException) + { + // Expected whenever ctrl-C is used. 
+ } + catch (Exception exc) + { + this.Logger.LogErrorMessage(exc, telemetryContext, LogLevel.Warning); + } + } + } } } \ No newline at end of file diff --git a/src/VirtualClient/VirtualClient.Monitors/AmdSmiQueryGpuParser.cs b/src/VirtualClient/VirtualClient.Monitors/AmdSmiQueryGpuParser.cs deleted file mode 100644 index 24220e17b7..0000000000 --- a/src/VirtualClient/VirtualClient.Monitors/AmdSmiQueryGpuParser.cs +++ /dev/null @@ -1,60 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT License. - -namespace VirtualClient.Monitors -{ - using System; - using System.Collections.Generic; - using System.Data; - using System.Linq; - using VirtualClient.Contracts; - using DataTableExtensions = VirtualClient.Contracts.DataTableExtensions; - - /// - /// Parser for AmdSmi output document. - /// - public class AmdSmiQueryGpuParser : MetricsParser - { - /// - /// Constructor for - /// - /// Raw text to parse. - public AmdSmiQueryGpuParser(string rawText) - : base(rawText) - { - } - - /// - public override IList Parse() - { - this.Preprocess(); - - // Sanatize non-standard csv tokens in output - string replacedText = this.PreprocessedText.Replace("[0, 0]", "0"); - - List metrics = new List(); - DataTable dataTable = DataTableExtensions.DataTableFromCsv(replacedText); - - foreach (DataRow row in dataTable.Rows) - { - Dictionary metadata = new Dictionary() - { - { "gpu.id", Convert.ToString(row[0]) }, - }; - - // Ingest only the metrics which are exposed at the guest level - metrics.Add(new Metric("utilization.gpu [%]", Convert.ToDouble(row[1]), unit: "%", metadata: metadata)); - metrics.Add(new Metric("framebuffer.total [MB]", Convert.ToDouble(row[4]), unit: "MB", metadata: metadata)); - metrics.Add(new Metric("framebuffer.used [MB]", Convert.ToDouble(row[5]), unit: "MB", metadata: metadata)); - } - - return metrics; - } - - /// - protected override void Preprocess() - { - this.PreprocessedText = this.RawText.Replace("\r\n", Environment.NewLine); - } - 
} -} diff --git a/src/VirtualClient/VirtualClient.Monitors/AmdSmiXGMIQueryGpuParser.cs b/src/VirtualClient/VirtualClient.Monitors/AmdSmiXGMIQueryGpuParser.cs new file mode 100644 index 0000000000..c2ba6faef8 --- /dev/null +++ b/src/VirtualClient/VirtualClient.Monitors/AmdSmiXGMIQueryGpuParser.cs @@ -0,0 +1,88 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +namespace VirtualClient.Monitors +{ + using System; + using System.Collections.Generic; + using System.Data; + using System.IO; + using System.Linq; + using System.Text; + using System.Text.RegularExpressions; + using System.Threading; + using Newtonsoft.Json; + using VirtualClient.Contracts; + using DataTableExtensions = VirtualClient.Contracts.DataTableExtensions; + + /// + /// Parser for AmdSmi output document. + /// + public class AmdSmiXGMIQueryGpuParser : MetricsParser + { + /// + /// Constructor for + /// + /// Raw text to parse. + public AmdSmiXGMIQueryGpuParser(string rawText) + : base(rawText) + { + } + + /// + public override IList Parse() + { + this.Preprocess(); + List metrics = new List(); + List gpuDataList = JsonConvert.DeserializeObject>(this.PreprocessedText); + DataTable dt = new DataTable(); + dt.Columns.Add("gpu", typeof(int)); + int numGPUs = gpuDataList.Count; + for (int i = 0; i < numGPUs; i++) + { + dt.Columns.Add($"xgmi_{i}_data", typeof(double)); + } + + int id = 0; + foreach (dynamic gpuData in gpuDataList) + { + double data = 0; + DataRow row = dt.NewRow(); + row["gpu"] = gpuData.gpu; + foreach (var link in gpuData.link_metrics.links) + { + data += (link.read.value.Value + link.write.value.Value); + } + + row[$"xgmi_{id}_data"] = data; + dt.Rows.Add(row); + id++; + } + + int gpuId = 0; + foreach (DataRow row in dt.Rows) + { + Dictionary metadata = new Dictionary() + { + { "gpu.id", Convert.ToString(SafeGet(row, "gpu")) }, + }; + metrics.Add(new Metric($"xgmi_{gpuId}_data", Convert.ToDouble(SafeGet(row, $"xgmi_{gpuId}_data")), unit: "KB", metadata: 
metadata)); + gpuId++; + } + + return metrics; + } + + /// + protected override void Preprocess() + { + Regex quotedPattern = new Regex("\"N/A\""); + this.PreprocessedText = quotedPattern.Replace(this.RawText, "{\r\n\"value\": 0,\r\n\"unit\": \"KB\"\r\n}"); + } + + private static IConvertible SafeGet(DataRow row, string columnName) + { + return row.Table.Columns.Contains(columnName) ? Convert.ToString(row[columnName]) : "-1"; + } + } +} \ No newline at end of file diff --git a/website/docs/monitors/0200-monitor-profiles.md b/website/docs/monitors/0200-monitor-profiles.md index d3214e1334..6c8edbde0f 100644 --- a/website/docs/monitors/0200-monitor-profiles.md +++ b/website/docs/monitors/0200-monitor-profiles.md @@ -114,3 +114,44 @@ The monitor profile designed for Nvidia GPU systems. The profile captures counte ./VirtualClient --profile=PERF-GPU-MLPERF.json --profile=MONITORS-GPU-NVIDIA.json --system=Demo --timeout=1440 --packageStore="{BlobConnectionString|SAS Uri}" ``` + ``` +## MONITORS-GPU-AMD.json +The monitor profile designed for AMD GPU systems. The profile captures metrics on systems with AMD GPUs using amd-smi. + +* **Supported Platform/Architectures** + * linux-x64 + * win-x64 + +* **Supported Operating Systems** + * Ubuntu 18 + * Ubuntu 20 + * Ubuntu 22 + +* **Dependencies** + * The system needs to have an AMD GPU with ROCm installed. + +* **Scenarios** + * Captures metrics on systems using [amd-smi](./0500-amd-smi.md) + +* **Profile Parameters** + The following parameters can be optionally supplied on the command line to change this default behavior. + + | Parameter | Purpose | Default value | + |---------------------------|---------------------------------------------------------------------------------|---------------| + | Scenario | Optional. A description of the purpose of the monitor within the overall profile workflow. | | + | MonitorFrequency | Optional. Defines the frequency (timespan) at which performance counters will be captured/emitted (e.g. 
00:01:00). | 00:05:00 | + | MonitorWarmupPeriod | Optional. Defines a period of time (timespan) to wait before starting to track/capture performance counters (e.g. 00:03:00). This allows the system to get to a more typical operational state and generally results in a better representation for the counters captured. | 00:05:00 | + | MetricFilter | Optional. A comma-delimited list of performance counter names to capture. The default behavior is to capture/emit all performance counters (e.g. \Processor Information(_Total)\% System Time,\Processor Information(_Total)\% User Time). This allows the profile author to focus on a smaller/specific subset of the counters. This is typically used when a lower monitor frequency is required for higher sample precision to keep the size of the data sets emitted by the Virtual Client to a minimum. | | + +* **Usage Examples** + The following section provides a few basic examples of how to use the monitor profile. Additional usage examples can be found in the + 'Usage Scenarios/Examples' link at the top. + + ``` bash + # Run the monitoring facilities only. + ./VirtualClient --profile=MONITORS-GPU-AMD.json + + # Monitor profile explicitly defined. 
+ ./VirtualClient --profile=PERF-GPU-3DMARK-AMD.json --profile=MONITORS-GPU-AMD.json --system=Demo --timeout=1440 --packageStore="{BlobConnectionString|SAS Uri}" + + ``` From 425772b909849630ce5e9f1bdd7568a80ab516bf Mon Sep 17 00:00:00 2001 From: ShobhaKumari07 Date: Thu, 27 Feb 2025 10:27:18 +0530 Subject: [PATCH 2/4] edited AmdSmiMonitor.cs --- .../Examples/amd-smi/metric-8xMI300X.csv | 2 +- .../VirtualClient.Monitors/AmdSmiMonitor.cs | 199 ++++++------------ 2 files changed, 70 insertions(+), 131 deletions(-) diff --git a/src/VirtualClient/VirtualClient.Monitors.UnitTests/Examples/amd-smi/metric-8xMI300X.csv b/src/VirtualClient/VirtualClient.Monitors.UnitTests/Examples/amd-smi/metric-8xMI300X.csv index fd48ca0183..9b4dc629c0 100644 --- a/src/VirtualClient/VirtualClient.Monitors.UnitTests/Examples/amd-smi/metric-8xMI300X.csv +++ b/src/VirtualClient/VirtualClient.Monitors.UnitTests/Examples/amd-smi/metric-8xMI300X.csv @@ -6,4 +6,4 @@ gpu,gfx_activity,umc_activity,mm_activity,vcn_activity,jpeg_activity,socket_powe 4,0,0,N/A,"[0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",136,N/A,N/A,N/A,ENABLED,UNTHROTTLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,133,500,2100,DISABLED,ENABLED,133,500,2100,DISABLED,ENABLED,134,500,2100,DISABLED,ENABLED,900,900,1300,N/A,DISABLED,29,914,1333,N/A,ENABLED,29,914,1333,N/A,ENABLED,29,914,1333,N/A,ENABLED,29,914,1333,N/A,ENABLED,22,711,1143,N/A,ENABLED,22,711,1143,N/A,ENABLED,22,711,1143,N/A,ENABLED,22,711,1143,N/A,ENABLED,N/A,36,29,16,N/A,145,0,1,0,0,0,N/A,N/A,N/A,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,AMDSMI_DEV_PERF_LEVEL_AUTO,N/A,12403534.5,196592,283,196309,196592,283,196309,1031932,20,1031912 5,0,0,N/A,"[0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0]",132,N/A,N/A,N/A,ENABLED,UNTHROTTLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,900,900,1300,N/A,DISABLED,29,914,1333,N/A,ENABLED,29,914,1333,N/A,ENABLED,29,914,1333,N/A,ENABLED,29,914,1333,N/A,ENABLED,22,711,1143,N/A,ENABLED,22,711,1143,N/A,ENABLED,22,711,1143,N/A,ENABLED,22,711,1143,N/A,ENABLED,N/A,35,29,16,N/A,107,0,1,0,0,0,N/A,N/A,N/A,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,AMDSMI_DEV_PERF_LEVEL_AUTO,N/A,12011883.234,196592,283,196309,196592,283,196309,1031932,20,1031912 6,0,0,N/A,"[0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",132,N/A,N/A,N/A,ENABLED,UNTHROTTLED,132,500,2100,DISABLED,ENABLED,131,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,900,900,1300,N/A,DISABLED,29,914,1333,N/A,ENABLED,29,914,1333,N/A,ENABLED,29,914,1333,N/A,ENABLED,29,914,1333,N/A,ENABLED,22,711,1143,N/A,ENABLED,22,711,1143,N/A,ENABLED,22,711,1143,N/A,ENABLED,22,711,1143,N/A,ENABLED,N/A,36,29,16,N/A,94,0,1,0,0,0,N/A,N/A,N/A,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,AMDSMI_DEV_PERF_LEVEL_AUTO,N/A,11987029.516,196592,283,196309,196592,283,196309,1031932,20,1031912 -7,0,0,N/A,"[0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0]",134,N/A,N/A,N/A,ENABLED,UNTHROTTLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,900,900,1300,N/A,DISABLED,29,914,1333,N/A,ENABLED,29,914,1333,N/A,ENABLED,29,914,1333,N/A,ENABLED,29,914,1333,N/A,ENABLED,22,711,1143,N/A,ENABLED,22,711,1143,N/A,ENABLED,22,711,1143,N/A,ENABLED,22,711,1143,N/A,ENABLED,N/A,37,31,16,N/A,90,0,1,0,0,0,N/A,N/A,N/A,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,AMDSMI_DEV_PERF_LEVEL_AUTO,N/A,12193331.537,196592,283,196309,196592,283,196309,1031932,20,1031912 \ No newline at end of file +7,0,0,N/A,"[0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",134,N/A,N/A,N/A,ENABLED,UNTHROTTLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,900,900,1300,N/A,DISABLED,29,914,1333,N/A,ENABLED,29,914,1333,N/A,ENABLED,29,914,1333,N/A,ENABLED,29,914,1333,N/A,ENABLED,22,711,1143,N/A,ENABLED,22,711,1143,N/A,ENABLED,22,711,1143,N/A,ENABLED,22,711,1143,N/A,ENABLED,N/A,37,31,16,N/A,90,0,1,0,0,0,N/A,N/A,N/A,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,AMDSMI_DEV_PERF_LEVEL_AUTO,N/A,12193331.537,196592,283,196309,196592,283,196309,1031932,20,1031912 diff --git a/src/VirtualClient/VirtualClient.Monitors/AmdSmiMonitor.cs b/src/VirtualClient/VirtualClient.Monitors/AmdSmiMonitor.cs index 21012e373b..eb4575fecd 100644 --- a/src/VirtualClient/VirtualClient.Monitors/AmdSmiMonitor.cs +++ b/src/VirtualClient/VirtualClient.Monitors/AmdSmiMonitor.cs @@ -12,10 +12,8 @@ namespace VirtualClient.Monitors using System.Threading.Tasks; using 
global::VirtualClient; using global::VirtualClient.Contracts; - using Microsoft.CodeAnalysis; using Microsoft.Extensions.DependencyInjection; using Microsoft.Extensions.Logging; - using Utilities; using VirtualClient.Common; using VirtualClient.Common.Extensions; using VirtualClient.Common.Telemetry; @@ -25,16 +23,6 @@ namespace VirtualClient.Monitors /// public class AmdSmiMonitor : VirtualClientIntervalBasedMonitor { - /// - /// Name of Metric subsystem. - /// - protected const string Metric = "metric"; - - /// - /// Name of XGMI subsystem. - /// - protected const string XGMI = "xgmi"; - private ISystemManagement systemManagement; private IFileSystem fileSystem; @@ -49,26 +37,24 @@ public AmdSmiMonitor(IServiceCollection dependencies, IDictionary - /// AMDSMI Subsystem Name. + /// AMDSMI Subsystem Name= metric. /// - public string Subsystem - { - get - { - this.Parameters.TryGetValue(nameof(AmdSmiMonitor.Subsystem), out IConvertible subsystem); - return subsystem?.ToString(); - } - } + public bool SubsystemMetric => this.Parameters.GetValue(nameof(this.SubsystemMetric), false); + + /// + /// AMDSMI Subsystem Name = xgmi. 
+ /// + public bool SubsystemXgmi => this.Parameters.GetValue(nameof(this.SubsystemXgmi), false); /// protected override async Task ExecuteAsync(EventContext telemetryContext, CancellationToken cancellationToken) { - if (this.Subsystem == AmdSmiMonitor.Metric) + if (this.SubsystemMetric) { await this.QueryGpuMetricAsync(telemetryContext, cancellationToken).ConfigureAwait(false); } - if (this.Subsystem == AmdSmiMonitor.XGMI) + if (this.SubsystemXgmi) { await this.QueryGpuXGMIAsync(telemetryContext, cancellationToken).ConfigureAwait(false); } @@ -88,94 +74,44 @@ protected void ValidateParameters() private string GetAmdSmiCommand() { - string command = string.Empty; - switch (this.Platform) - { - case PlatformID.Win32NT: - command = "amdsmi"; - break; - - case PlatformID.Unix: - command = "amd-smi"; - break; - } - - return command; + return this.Platform == PlatformID.Win32NT ? "amdsmi" : "amd-smi"; } - private IList AmdSmiXGMIBandwidthAggregator(IList metrics1, IList metrics2, long time) - { - List aggregatedMetrics = new List(); - - if (metrics1.Any() && metrics2.Any()) - { - foreach (Metric counter1 in metrics1) - { - foreach (Metric counter2 in metrics2) - { - if (counter1.Metadata["gpu.id"] == counter2.Metadata["gpu.id"]) - { - double bandwidth = (counter2.Value - counter1.Value) / (((double)time) / 1000.0); - aggregatedMetrics.Add(new Metric($"xgmi.bw", (bandwidth / 1024), unit: "MB/s", metadata: counter1.Metadata)); - } - } - } - } - - return aggregatedMetrics; - } - - /// - /// Query the gpu for utilization information - /// - /// Provides context information that will be captured with telemetry events. - /// A token that can be used to cancel the operation. 
- /// private async Task QueryGpuMetricAsync(EventContext telemetryContext, CancellationToken cancellationToken) { - int totalSamples = (int)this.MonitorFrequency.TotalSeconds; string commandArguments = "metric --csv"; - await Task.Delay(this.MonitorWarmupPeriod, cancellationToken) - .ConfigureAwait(false); + await Task.Delay(this.MonitorWarmupPeriod, cancellationToken).ConfigureAwait(false); + while (!cancellationToken.IsCancellationRequested) { try { - using (IProcessProxy process = this.systemManagement.ProcessManager.CreateElevatedProcess(this.Platform, this.GetAmdSmiCommand(), $"{commandArguments}", Environment.CurrentDirectory)) + using (IProcessProxy process = this.systemManagement.ProcessManager.CreateElevatedProcess(this.Platform, this.GetAmdSmiCommand(), commandArguments, Environment.CurrentDirectory)) { this.CleanupTasks.Add(() => process.SafeKill()); DateTime startTime = DateTime.UtcNow; - await process.StartAndWaitAsync(cancellationToken) - .ConfigureAwait(false); + await process.StartAndWaitAsync(cancellationToken).ConfigureAwait(false); DateTime endTime = DateTime.UtcNow; + if (!cancellationToken.IsCancellationRequested) { - try + process.ThrowIfErrored(ProcessProxy.DefaultSuccessCodes, errorReason: ErrorReason.MonitorFailed); + + if (process.StandardOutput.Length > 0) { - // We cannot log the process details here. The output is too large. 
- process.ThrowIfErrored(ProcessProxy.DefaultSuccessCodes, errorReason: ErrorReason.MonitorFailed); + AmdSmiMetricQueryGpuParser parser = new AmdSmiMetricQueryGpuParser(process.StandardOutput.ToString()); + IList metrics = parser.Parse(); - if (process.StandardOutput.Length > 0) + if (metrics?.Any() == true) { - AmdSmiMetricQueryGpuParser parser = new AmdSmiMetricQueryGpuParser(process.StandardOutput.ToString()); - IList metrics = parser.Parse(); - - if (metrics?.Any() == true) - { - this.Logger.LogPerformanceCounters("amd", metrics, startTime, endTime, telemetryContext); - } + this.Logger.LogPerformanceCounters("amd", metrics, startTime, endTime, telemetryContext); } } - catch - { - await this.LogProcessDetailsAsync(process, EventContext.Persisted()); - throw; - } } - - await Task.Delay(this.MonitorFrequency).ConfigureAwait(false); } + + await Task.Delay(this.MonitorFrequency).ConfigureAwait(false); } catch (OperationCanceledException) { @@ -190,56 +126,24 @@ await process.StartAndWaitAsync(cancellationToken) private async Task QueryGpuXGMIAsync(EventContext telemetryContext, CancellationToken cancellationToken) { - int totalSamples = (int)this.MonitorFrequency.TotalSeconds; string commandArguments = "xgmi -m --json"; - DateTime startTime1, endTime1, startTime2, endTime2; - IList metrics1, metrics2, aggregatedMetrics; - Stopwatch stopwatch; - long elapsedMilliseconds; - - await Task.Delay(this.MonitorWarmupPeriod, cancellationToken) - .ConfigureAwait(false); + await Task.Delay(this.MonitorWarmupPeriod, cancellationToken).ConfigureAwait(false); while (!cancellationToken.IsCancellationRequested) { try { - using (IProcessProxy process = this.systemManagement.ProcessManager.CreateElevatedProcess(this.Platform, this.GetAmdSmiCommand(), $"{commandArguments}", Environment.CurrentDirectory)) - { - this.CleanupTasks.Add(() => process.SafeKill()); - - stopwatch = Stopwatch.StartNew(); - - startTime1 = DateTime.UtcNow; - await process.StartAndWaitAsync(cancellationToken) - 
.ConfigureAwait(false); - - endTime1 = DateTime.UtcNow; - - AmdSmiXGMIQueryGpuParser parser = new AmdSmiXGMIQueryGpuParser(process.StandardOutput.ToString()); - metrics1 = parser.Parse(); - } + Stopwatch stopwatch = Stopwatch.StartNew(); + var (metrics1, startTime1, endTime1) = await this.ExecuteXGMICommand(commandArguments, cancellationToken); await Task.Delay(500).ConfigureAwait(false); + var (metrics2, startTime2, endTime2) = await this.ExecuteXGMICommand(commandArguments, cancellationToken); - using (IProcessProxy process = this.systemManagement.ProcessManager.CreateElevatedProcess(this.Platform, this.GetAmdSmiCommand(), $"{commandArguments}", Environment.CurrentDirectory)) - { - this.CleanupTasks.Add(() => process.SafeKill()); - - startTime2 = DateTime.UtcNow; - await process.StartAndWaitAsync(cancellationToken) - .ConfigureAwait(false); - - endTime2 = DateTime.UtcNow; - stopwatch.Stop(); - elapsedMilliseconds = stopwatch.ElapsedMilliseconds; + stopwatch.Stop(); + long elapsedMilliseconds = stopwatch.ElapsedMilliseconds; - AmdSmiXGMIQueryGpuParser parser = new AmdSmiXGMIQueryGpuParser(process.StandardOutput.ToString()); - metrics2 = parser.Parse(); - } - - aggregatedMetrics = this.AmdSmiXGMIBandwidthAggregator(metrics1, metrics2, time: elapsedMilliseconds); + IList aggregatedMetrics = this.AmdSmiXGMIBandwidthAggregator(metrics1, metrics2, elapsedMilliseconds); if (aggregatedMetrics?.Any() == true) { @@ -249,8 +153,7 @@ await process.StartAndWaitAsync(cancellationToken) await Task.Delay(this.MonitorFrequency).ConfigureAwait(false); } catch (OperationCanceledException) - { - // Expected whenever ctrl-C is used. 
+ { } catch (Exception exc) { @@ -258,5 +161,41 @@ await process.StartAndWaitAsync(cancellationToken) } } } + + private async Task<(IList, DateTime, DateTime)> ExecuteXGMICommand(string commandArguments, CancellationToken cancellationToken) + { + using (IProcessProxy process = this.systemManagement.ProcessManager.CreateElevatedProcess(this.Platform, this.GetAmdSmiCommand(), commandArguments, Environment.CurrentDirectory)) + { + this.CleanupTasks.Add(() => process.SafeKill()); + DateTime startTime = DateTime.UtcNow; + await process.StartAndWaitAsync(cancellationToken).ConfigureAwait(false); + DateTime endTime = DateTime.UtcNow; + + AmdSmiXGMIQueryGpuParser parser = new AmdSmiXGMIQueryGpuParser(process.StandardOutput.ToString()); + return (parser.Parse(), startTime, endTime); + } + } + + private IList AmdSmiXGMIBandwidthAggregator(IList metrics1, IList metrics2, long time) + { + List aggregatedMetrics = new List(); + + if (metrics1.Any() && metrics2.Any()) + { + foreach (Metric counter1 in metrics1) + { + foreach (Metric counter2 in metrics2) + { + if (counter1.Metadata["gpu.id"] == counter2.Metadata["gpu.id"]) + { + double bandwidth = (counter2.Value - counter1.Value) / (((double)time) / 1000.0); + aggregatedMetrics.Add(new Metric($"xgmi.bw", (bandwidth / 1024), unit: "MB/s", metadata: counter1.Metadata)); + } + } + } + } + + return aggregatedMetrics; + } } -} \ No newline at end of file +} From b755abd1a3b76e27269b8f097a028aeadbdd9e1b Mon Sep 17 00:00:00 2001 From: ShobhaKumari07 Date: Thu, 27 Feb 2025 11:41:02 +0530 Subject: [PATCH 3/4] modified MONITORS-GPU-AMD.json --- .../profiles/MONITORS-GPU-AMD.json | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/src/VirtualClient/VirtualClient.Main/profiles/MONITORS-GPU-AMD.json b/src/VirtualClient/VirtualClient.Main/profiles/MONITORS-GPU-AMD.json index 90bda3bfff..ae40144fcf 100644 --- a/src/VirtualClient/VirtualClient.Main/profiles/MONITORS-GPU-AMD.json +++ 
b/src/VirtualClient/VirtualClient.Main/profiles/MONITORS-GPU-AMD.json @@ -13,16 +13,8 @@ "Type": "AmdSmiMonitor", "Parameters": { "Scenario": "AmdGpuCounters", - "Subsystem": "metric", - "MonitorFrequency": "$.Parameters.MonitorFrequency", - "MonitorWarmupPeriod": "$.Parameters.MonitorWarmupPeriod" - } - }, - { - "Type": "AmdSmiMonitor", - "Parameters": { - "Scenario": "AmdGpuCounters", - "Subsystem": "xgmi", + "SubsystemMetric": true, + "SubsystemXgmi": true, "MonitorFrequency": "$.Parameters.MonitorFrequency", "MonitorWarmupPeriod": "$.Parameters.MonitorWarmupPeriod" } From e3123d5728028f5b807edbab428a4536db68258e Mon Sep 17 00:00:00 2001 From: ShobhaKumari07 Date: Sun, 9 Mar 2025 11:14:02 +0530 Subject: [PATCH 4/4] added amd-smi parser --- .../Amd-Smi/AmdSmiMetricsParserTests.cs | 41 + .../AmdSmiXGMIQueryGpuParserUnitTests.cs | 0 .../AmdSmiQueryGpuParserUnitTests.cs | 56 - .../Examples/amd-smi/UsageMetrics.csv | 0 .../Examples/amd-smi/metric.csv | 2 - .../Examples/amd-smi/metrics.txt | 1167 +++++++++++++++++ .../Examples/amd-smi/powerMetrics.csv | 9 + .../Examples/amd-smi/temperatureMetrics.csv | 9 + .../Amd-Smi/AmdSmiMetricsParser.cs | 114 ++ .../{ => Amd-Smi}/AmdSmiMonitor.cs | 409 +++--- .../{ => Amd-Smi}/AmdSmiXGMIQueryGpuParser.cs | 0 .../AmdSmiMetricQueryGpuParser.cs | 95 -- 12 files changed, 1548 insertions(+), 354 deletions(-) create mode 100644 src/VirtualClient/VirtualClient.Monitors.UnitTests/Amd-Smi/AmdSmiMetricsParserTests.cs rename src/VirtualClient/VirtualClient.Monitors.UnitTests/{ => Amd-Smi}/AmdSmiXGMIQueryGpuParserUnitTests.cs (100%) delete mode 100644 src/VirtualClient/VirtualClient.Monitors.UnitTests/AmdSmiQueryGpuParserUnitTests.cs create mode 100644 src/VirtualClient/VirtualClient.Monitors.UnitTests/Examples/amd-smi/UsageMetrics.csv delete mode 100644 src/VirtualClient/VirtualClient.Monitors.UnitTests/Examples/amd-smi/metric.csv create mode 100644 src/VirtualClient/VirtualClient.Monitors.UnitTests/Examples/amd-smi/metrics.txt create mode 
100644 src/VirtualClient/VirtualClient.Monitors.UnitTests/Examples/amd-smi/powerMetrics.csv create mode 100644 src/VirtualClient/VirtualClient.Monitors.UnitTests/Examples/amd-smi/temperatureMetrics.csv create mode 100644 src/VirtualClient/VirtualClient.Monitors/Amd-Smi/AmdSmiMetricsParser.cs rename src/VirtualClient/VirtualClient.Monitors/{ => Amd-Smi}/AmdSmiMonitor.cs (80%) rename src/VirtualClient/VirtualClient.Monitors/{ => Amd-Smi}/AmdSmiXGMIQueryGpuParser.cs (100%) delete mode 100644 src/VirtualClient/VirtualClient.Monitors/AmdSmiMetricQueryGpuParser.cs diff --git a/src/VirtualClient/VirtualClient.Monitors.UnitTests/Amd-Smi/AmdSmiMetricsParserTests.cs b/src/VirtualClient/VirtualClient.Monitors.UnitTests/Amd-Smi/AmdSmiMetricsParserTests.cs new file mode 100644 index 0000000000..4955359eeb --- /dev/null +++ b/src/VirtualClient/VirtualClient.Monitors.UnitTests/Amd-Smi/AmdSmiMetricsParserTests.cs @@ -0,0 +1,41 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
namespace VirtualClient.Monitors.UnitTests.Amd_Smi
{
    using System.Collections.Generic;
    using System.IO;
    using System.Reflection;
    using NUnit.Framework;
    using VirtualClient.Contracts;
    using VirtualClient.Monitors.Amd_Smi;

    /// <summary>
    /// Unit tests for the <see cref="AmdSmiMetricsParser"/> using the 8-GPU example
    /// output in 'Examples/amd-smi/metrics.txt'.
    /// </summary>
    [TestFixture]
    [Category("Unit")]
    public class AmdSmiMetricsParserTests
    {
        [Test]
        public void AmdSmiMetricsParserTest()
        {
            IList<Metric> metrics = AmdSmiMetricsParserTests.ParseExampleMetrics();
            string gpuId = "0";

            // Values that are not numeric in the output (e.g. N/A, ENABLED) are reported as -1.
            MetricAssert.Exists(metrics, $"GFX_ACTIVITY_GPU{gpuId}", 0, "%");
            MetricAssert.Exists(metrics, $"UMC_ACTIVITY_GPU{gpuId}", 0, "%");
            MetricAssert.Exists(metrics, $"MM_ACTIVITY_GPU{gpuId}", -1, ""); // N/A → -1
            MetricAssert.Exists(metrics, $"SOCKET_POWER_GPU{gpuId}", 137, "W");
            MetricAssert.Exists(metrics, $"GFX_VOLTAGE_GPU{gpuId}", -1, "V"); // N/A → -1
            MetricAssert.Exists(metrics, $"SOC_VOLTAGE_GPU{gpuId}", -1, "V"); // N/A → -1
            MetricAssert.Exists(metrics, $"MEM_VOLTAGE_GPU{gpuId}", -1, "V"); // N/A → -1
            MetricAssert.Exists(metrics, $"POWER_MANAGEMENT_GPU{gpuId}", -1, ""); // ENABLED (non-numeric) → -1
            MetricAssert.Exists(metrics, $"TEMPERATURE_EDGE_GPU{gpuId}", -1, "C"); // N/A → -1
            MetricAssert.Exists(metrics, $"TEMPERATURE_HOTSPOT_GPU{gpuId}", 38, "C");
            MetricAssert.Exists(metrics, $"TEMPERATURE_MEM_GPU{gpuId}", 31, "C");
        }

        [Test]
        public void AmdSmiMetricsParserParsesMetricsForEveryGpuInTheOutput()
        {
            IList<Metric> metrics = AmdSmiMetricsParserTests.ParseExampleMetrics();

            // The example output describes 8 GPUs (GPU: 0 through GPU: 7).
            MetricAssert.Exists(metrics, "TOTAL_GPUS", 8, "count");

            MetricAssert.Exists(metrics, "SOCKET_POWER_GPU5", 138, "W");
            MetricAssert.Exists(metrics, "TEMPERATURE_HOTSPOT_GPU5", 38, "C");
            MetricAssert.Exists(metrics, "TEMPERATURE_MEM_GPU5", 34, "C");

            MetricAssert.Exists(metrics, "SOCKET_POWER_GPU7", 139, "W");
            MetricAssert.Exists(metrics, "TEMPERATURE_HOTSPOT_GPU7", 37, "C");
            MetricAssert.Exists(metrics, "TEMPERATURE_MEM_GPU7", 31, "C");
        }

        private static IList<Metric> ParseExampleMetrics()
        {
            string workingDirectory = Path.GetDirectoryName(Assembly.GetExecutingAssembly().Location);
            string outputPath = Path.Combine(workingDirectory, "Examples", "amd-smi", "metrics.txt");
            string rawText = File.ReadAllText(outputPath);

            AmdSmiMetricsParser testParser = new AmdSmiMetricsParser(rawText);
            return testParser.Parse();
        }
    }
}
src/VirtualClient/VirtualClient.Monitors.UnitTests/Amd-Smi/AmdSmiXGMIQueryGpuParserUnitTests.cs diff --git a/src/VirtualClient/VirtualClient.Monitors.UnitTests/AmdSmiQueryGpuParserUnitTests.cs b/src/VirtualClient/VirtualClient.Monitors.UnitTests/AmdSmiQueryGpuParserUnitTests.cs deleted file mode 100644 index 404fba54c1..0000000000 --- a/src/VirtualClient/VirtualClient.Monitors.UnitTests/AmdSmiQueryGpuParserUnitTests.cs +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT License. - -namespace VirtualClient.Monitors -{ - using System.Collections.Generic; - using System.Diagnostics; - using System.IO; - using System.Linq; - using System.Reflection; - using System.Text; - using System.Threading.Tasks; - using NUnit.Framework; - using VirtualClient.Common; - using VirtualClient.Contracts; - - [TestFixture] - [Category("Unit")] - public class AmdSmiMetricQueryGpuParserUnitTests - { - [Test] - public void AmdSmiMetricQueryGpuParserParsesMetricsCorrectly() - { - string workingDirectory = Path.GetDirectoryName(Assembly.GetExecutingAssembly().Location); - string outputPath = Path.Combine(workingDirectory, "Examples", "amd-smi", "metric.csv"); - string rawText = File.ReadAllText(outputPath); - AmdSmiMetricQueryGpuParser testParser = new AmdSmiMetricQueryGpuParser(rawText); - IList metrics = testParser.Parse(); - string gpuId = "0"; // Assume GPU ID for testing, can be dynamically extracted from parsed data - MetricAssert.Exists(metrics, $"utilization.gpu [%] (GPU {gpuId})", 98, "%"); - MetricAssert.Exists(metrics, $"framebuffer.total [MB] (GPU {gpuId})", 14928, "MB"); - MetricAssert.Exists(metrics, $"framebuffer.used [MB] (GPU {gpuId})", 363, "MB"); - } - - [Test] - public void AmdSmiMetricQueryGpuParserParsesMetricsCorrectly_MI300X() - { - string workingDirectory = Path.GetDirectoryName(Assembly.GetExecutingAssembly().Location); - string outputPath = Path.Combine(workingDirectory, "Examples", "amd-smi", "metric-8xMI300X.csv"); - 
string rawText = File.ReadAllText(outputPath); - AmdSmiMetricQueryGpuParser testParser = new AmdSmiMetricQueryGpuParser(rawText); - IList metrics = testParser.Parse(); - string gpuId = "0"; // Assume GPU ID for testing - MetricAssert.Exists(metrics, $"utilization.gpu (GPU {gpuId})", 0, "%"); - MetricAssert.Exists(metrics, $"utilization.memory (GPU {gpuId})", 0, "%"); - MetricAssert.Exists(metrics, $"temperature.gpu (GPU {gpuId})", 36, "celsius"); - MetricAssert.Exists(metrics, $"temperature.memory (GPU {gpuId})", 30, "celsius"); - MetricAssert.Exists(metrics, $"power.draw.average (GPU {gpuId})", 133, "W"); - MetricAssert.Exists(metrics, $"gfx_clk_avg (GPU {gpuId})", 132.125, "MHz"); - MetricAssert.Exists(metrics, $"mem_clk (GPU {gpuId})", 900, "MHz"); - MetricAssert.Exists(metrics, $"video_vclk_avg (GPU {gpuId})", 29, "MHz"); - MetricAssert.Exists(metrics, $"video_dclk_avg (GPU {gpuId})", 22, "MHz"); - MetricAssert.Exists(metrics, $"pcie_bw (GPU {gpuId})", 24, "MB/s"); - } - } -} diff --git a/src/VirtualClient/VirtualClient.Monitors.UnitTests/Examples/amd-smi/UsageMetrics.csv b/src/VirtualClient/VirtualClient.Monitors.UnitTests/Examples/amd-smi/UsageMetrics.csv new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/VirtualClient/VirtualClient.Monitors.UnitTests/Examples/amd-smi/metric.csv b/src/VirtualClient/VirtualClient.Monitors.UnitTests/Examples/amd-smi/metric.csv deleted file mode 100644 index bf9329164a..0000000000 --- a/src/VirtualClient/VirtualClient.Monitors.UnitTests/Examples/amd-smi/metric.csv +++ /dev/null @@ -1,2 +0,0 @@ -gpu,gfx_usage,mem_usage,mm_usage_list,fb_total,fb_used,gfx_cur_clk,mem_cur_clk,mm1_cur_clk,mm2_cur_clk -0,98,1,[0, 0],14928,363,N/A,N/A,N/A,N/A \ No newline at end of file diff --git a/src/VirtualClient/VirtualClient.Monitors.UnitTests/Examples/amd-smi/metrics.txt b/src/VirtualClient/VirtualClient.Monitors.UnitTests/Examples/amd-smi/metrics.txt new file mode 100644 index 0000000000..820d9b5231 --- /dev/null +++ 
b/src/VirtualClient/VirtualClient.Monitors.UnitTests/Examples/amd-smi/metrics.txt @@ -0,0 +1,1167 @@ +GPU: 0 + USAGE: + GFX_ACTIVITY: 0 % + UMC_ACTIVITY: 0 % + MM_ACTIVITY: N/A + VCN_ACTIVITY: [0 %, 0 %, 0 %, 0 %] + JPEG_ACTIVITY: [N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, + N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, + N/A, N/A, N/A] + POWER: + SOCKET_POWER: 137 W + GFX_VOLTAGE: N/A mV + SOC_VOLTAGE: N/A mV + MEM_VOLTAGE: N/A mV + THROTTLE_STATUS: UNTHROTTLED + POWER_MANAGEMENT: ENABLED + CLOCK: + GFX_0: + CLK: 245 MHz + MIN_CLK: 263 MHz + MAX_CLK: 2100 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: DISABLED + GFX_1: + CLK: 132 MHz + MIN_CLK: 263 MHz + MAX_CLK: 2100 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + GFX_2: + CLK: 131 MHz + MIN_CLK: 263 MHz + MAX_CLK: 2100 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + GFX_3: + CLK: 131 MHz + MIN_CLK: 263 MHz + MAX_CLK: 2100 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + GFX_4: + CLK: 132 MHz + MIN_CLK: 263 MHz + MAX_CLK: 2100 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + GFX_5: + CLK: 132 MHz + MIN_CLK: 263 MHz + MAX_CLK: 2100 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + GFX_6: + CLK: 132 MHz + MIN_CLK: 263 MHz + MAX_CLK: 2100 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + GFX_7: + CLK: 132 MHz + MIN_CLK: 263 MHz + MAX_CLK: 2100 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + MEM_0: + CLK: 900 MHz + MIN_CLK: 900 MHz + MAX_CLK: 1300 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: DISABLED + VCLK_0: + CLK: 29 MHz + MIN_CLK: 914 MHz + MAX_CLK: 1333 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + VCLK_1: + CLK: 29 MHz + MIN_CLK: 914 MHz + MAX_CLK: 1333 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + VCLK_2: + CLK: 29 MHz + MIN_CLK: 914 MHz + MAX_CLK: 1333 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + VCLK_3: + CLK: 29 MHz + MIN_CLK: 914 MHz + MAX_CLK: 1333 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + DCLK_0: + CLK: 22 MHz + MIN_CLK: 711 MHz + MAX_CLK: 
1143 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + DCLK_1: + CLK: 22 MHz + MIN_CLK: 711 MHz + MAX_CLK: 1143 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + DCLK_2: + CLK: 22 MHz + MIN_CLK: 711 MHz + MAX_CLK: 1143 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + DCLK_3: + CLK: 22 MHz + MIN_CLK: 711 MHz + MAX_CLK: 1143 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + TEMPERATURE: + EDGE: N/A + HOTSPOT: 38 °C + MEM: 31 °C + PCIE: + WIDTH: 16 + SPEED: 32 GT/s + BANDWIDTH: 4915 Mb/s + REPLAY_COUNT: 0 + L0_TO_RECOVERY_COUNT: 1 + REPLAY_ROLL_OVER_COUNT: 0 + NAK_SENT_COUNT: 0 + NAK_RECEIVED_COUNT: 0 + CURRENT_BANDWIDTH_SENT: N/A + CURRENT_BANDWIDTH_RECEIVED: N/A + MAX_PACKET_SIZE: N/A + MEM_USAGE: + TOTAL_VRAM: 196048 MB + USED_VRAM: 282 MB + FREE_VRAM: 195766 MB + TOTAL_VISIBLE_VRAM: 196048 MB + USED_VISIBLE_VRAM: 282 MB + FREE_VISIBLE_VRAM: 195766 MB + TOTAL_GTT: 885866 MB + USED_GTT: 17 MB + FREE_GTT: 885849 MB + +GPU: 1 + USAGE: + GFX_ACTIVITY: 0 % + UMC_ACTIVITY: 0 % + MM_ACTIVITY: N/A + VCN_ACTIVITY: [0 %, 0 %, 0 %, 0 %] + JPEG_ACTIVITY: [N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, + N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, + N/A, N/A, N/A] + POWER: + SOCKET_POWER: 137 W + GFX_VOLTAGE: N/A mV + SOC_VOLTAGE: N/A mV + MEM_VOLTAGE: N/A mV + THROTTLE_STATUS: UNTHROTTLED + POWER_MANAGEMENT: ENABLED + CLOCK: + GFX_0: + CLK: 232 MHz + MIN_CLK: 251 MHz + MAX_CLK: 2100 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: DISABLED + GFX_1: + CLK: 132 MHz + MIN_CLK: 251 MHz + MAX_CLK: 2100 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + GFX_2: + CLK: 132 MHz + MIN_CLK: 251 MHz + MAX_CLK: 2100 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + GFX_3: + CLK: 132 MHz + MIN_CLK: 251 MHz + MAX_CLK: 2100 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + GFX_4: + CLK: 132 MHz + MIN_CLK: 251 MHz + MAX_CLK: 2100 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + GFX_5: + CLK: 132 MHz + MIN_CLK: 251 MHz + MAX_CLK: 2100 MHz + CLK_LOCKED: 
DISABLED + DEEP_SLEEP: ENABLED + GFX_6: + CLK: 132 MHz + MIN_CLK: 251 MHz + MAX_CLK: 2100 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + GFX_7: + CLK: 132 MHz + MIN_CLK: 251 MHz + MAX_CLK: 2100 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + MEM_0: + CLK: 900 MHz + MIN_CLK: 900 MHz + MAX_CLK: 1300 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: DISABLED + VCLK_0: + CLK: 29 MHz + MIN_CLK: 914 MHz + MAX_CLK: 1333 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + VCLK_1: + CLK: 29 MHz + MIN_CLK: 914 MHz + MAX_CLK: 1333 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + VCLK_2: + CLK: 29 MHz + MIN_CLK: 914 MHz + MAX_CLK: 1333 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + VCLK_3: + CLK: 29 MHz + MIN_CLK: 914 MHz + MAX_CLK: 1333 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + DCLK_0: + CLK: 22 MHz + MIN_CLK: 711 MHz + MAX_CLK: 1143 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + DCLK_1: + CLK: 22 MHz + MIN_CLK: 711 MHz + MAX_CLK: 1143 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + DCLK_2: + CLK: 22 MHz + MIN_CLK: 711 MHz + MAX_CLK: 1143 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + DCLK_3: + CLK: 22 MHz + MIN_CLK: 711 MHz + MAX_CLK: 1143 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + TEMPERATURE: + EDGE: N/A + HOTSPOT: 39 °C + MEM: 33 °C + PCIE: + WIDTH: 16 + SPEED: 32 GT/s + BANDWIDTH: 431 Mb/s + REPLAY_COUNT: 0 + L0_TO_RECOVERY_COUNT: 1 + REPLAY_ROLL_OVER_COUNT: 0 + NAK_SENT_COUNT: 0 + NAK_RECEIVED_COUNT: 0 + CURRENT_BANDWIDTH_SENT: N/A + CURRENT_BANDWIDTH_RECEIVED: N/A + MAX_PACKET_SIZE: N/A + MEM_USAGE: + TOTAL_VRAM: 196048 MB + USED_VRAM: 282 MB + FREE_VRAM: 195766 MB + TOTAL_VISIBLE_VRAM: 196048 MB + USED_VISIBLE_VRAM: 282 MB + FREE_VISIBLE_VRAM: 195766 MB + TOTAL_GTT: 885866 MB + USED_GTT: 17 MB + FREE_GTT: 885849 MB + +GPU: 2 + USAGE: + GFX_ACTIVITY: 0 % + UMC_ACTIVITY: 0 % + MM_ACTIVITY: N/A + VCN_ACTIVITY: [0 %, 0 %, 0 %, 0 %] + JPEG_ACTIVITY: [N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, + N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, 
N/A, N/A, + N/A, N/A, N/A] + POWER: + SOCKET_POWER: 138 W + GFX_VOLTAGE: N/A mV + SOC_VOLTAGE: N/A mV + MEM_VOLTAGE: N/A mV + THROTTLE_STATUS: UNTHROTTLED + POWER_MANAGEMENT: ENABLED + CLOCK: + GFX_0: + CLK: 247 MHz + MIN_CLK: 265 MHz + MAX_CLK: 2100 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: DISABLED + GFX_1: + CLK: 132 MHz + MIN_CLK: 265 MHz + MAX_CLK: 2100 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + GFX_2: + CLK: 132 MHz + MIN_CLK: 265 MHz + MAX_CLK: 2100 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + GFX_3: + CLK: 132 MHz + MIN_CLK: 265 MHz + MAX_CLK: 2100 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + GFX_4: + CLK: 132 MHz + MIN_CLK: 265 MHz + MAX_CLK: 2100 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + GFX_5: + CLK: 132 MHz + MIN_CLK: 265 MHz + MAX_CLK: 2100 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + GFX_6: + CLK: 132 MHz + MIN_CLK: 265 MHz + MAX_CLK: 2100 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + GFX_7: + CLK: 133 MHz + MIN_CLK: 265 MHz + MAX_CLK: 2100 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + MEM_0: + CLK: 900 MHz + MIN_CLK: 900 MHz + MAX_CLK: 1300 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: DISABLED + VCLK_0: + CLK: 29 MHz + MIN_CLK: 914 MHz + MAX_CLK: 1333 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + VCLK_1: + CLK: 29 MHz + MIN_CLK: 914 MHz + MAX_CLK: 1333 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + VCLK_2: + CLK: 29 MHz + MIN_CLK: 914 MHz + MAX_CLK: 1333 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + VCLK_3: + CLK: 29 MHz + MIN_CLK: 914 MHz + MAX_CLK: 1333 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + DCLK_0: + CLK: 22 MHz + MIN_CLK: 711 MHz + MAX_CLK: 1143 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + DCLK_1: + CLK: 22 MHz + MIN_CLK: 711 MHz + MAX_CLK: 1143 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + DCLK_2: + CLK: 22 MHz + MIN_CLK: 711 MHz + MAX_CLK: 1143 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + DCLK_3: + CLK: 22 MHz + MIN_CLK: 711 MHz + MAX_CLK: 1143 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + TEMPERATURE: 
+ EDGE: N/A + HOTSPOT: 38 °C + MEM: 31 °C + PCIE: + WIDTH: 16 + SPEED: 32 GT/s + BANDWIDTH: 2196 Mb/s + REPLAY_COUNT: 0 + L0_TO_RECOVERY_COUNT: 1 + REPLAY_ROLL_OVER_COUNT: 0 + NAK_SENT_COUNT: 0 + NAK_RECEIVED_COUNT: 0 + CURRENT_BANDWIDTH_SENT: N/A + CURRENT_BANDWIDTH_RECEIVED: N/A + MAX_PACKET_SIZE: N/A + MEM_USAGE: + TOTAL_VRAM: 196048 MB + USED_VRAM: 282 MB + FREE_VRAM: 195766 MB + TOTAL_VISIBLE_VRAM: 196048 MB + USED_VISIBLE_VRAM: 282 MB + FREE_VISIBLE_VRAM: 195766 MB + TOTAL_GTT: 885866 MB + USED_GTT: 17 MB + FREE_GTT: 885849 MB + +GPU: 3 + USAGE: + GFX_ACTIVITY: 0 % + UMC_ACTIVITY: 0 % + MM_ACTIVITY: N/A + VCN_ACTIVITY: [0 %, 0 %, 0 %, 0 %] + JPEG_ACTIVITY: [N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, + N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, + N/A, N/A, N/A] + POWER: + SOCKET_POWER: 135 W + GFX_VOLTAGE: N/A mV + SOC_VOLTAGE: N/A mV + MEM_VOLTAGE: N/A mV + THROTTLE_STATUS: UNTHROTTLED + POWER_MANAGEMENT: ENABLED + CLOCK: + GFX_0: + CLK: 253 MHz + MIN_CLK: 271 MHz + MAX_CLK: 2100 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: DISABLED + GFX_1: + CLK: 132 MHz + MIN_CLK: 271 MHz + MAX_CLK: 2100 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + GFX_2: + CLK: 131 MHz + MIN_CLK: 271 MHz + MAX_CLK: 2100 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + GFX_3: + CLK: 132 MHz + MIN_CLK: 271 MHz + MAX_CLK: 2100 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + GFX_4: + CLK: 132 MHz + MIN_CLK: 271 MHz + MAX_CLK: 2100 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + GFX_5: + CLK: 132 MHz + MIN_CLK: 271 MHz + MAX_CLK: 2100 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + GFX_6: + CLK: 132 MHz + MIN_CLK: 271 MHz + MAX_CLK: 2100 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + GFX_7: + CLK: 132 MHz + MIN_CLK: 271 MHz + MAX_CLK: 2100 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + MEM_0: + CLK: 900 MHz + MIN_CLK: 900 MHz + MAX_CLK: 1300 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: DISABLED + VCLK_0: + CLK: 29 MHz + 
MIN_CLK: 914 MHz + MAX_CLK: 1333 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + VCLK_1: + CLK: 29 MHz + MIN_CLK: 914 MHz + MAX_CLK: 1333 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + VCLK_2: + CLK: 29 MHz + MIN_CLK: 914 MHz + MAX_CLK: 1333 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + VCLK_3: + CLK: 29 MHz + MIN_CLK: 914 MHz + MAX_CLK: 1333 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + DCLK_0: + CLK: 22 MHz + MIN_CLK: 711 MHz + MAX_CLK: 1143 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + DCLK_1: + CLK: 22 MHz + MIN_CLK: 711 MHz + MAX_CLK: 1143 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + DCLK_2: + CLK: 22 MHz + MIN_CLK: 711 MHz + MAX_CLK: 1143 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + DCLK_3: + CLK: 22 MHz + MIN_CLK: 711 MHz + MAX_CLK: 1143 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + TEMPERATURE: + EDGE: N/A + HOTSPOT: 36 °C + MEM: 31 °C + PCIE: + WIDTH: 16 + SPEED: 32 GT/s + BANDWIDTH: 1027 Mb/s + REPLAY_COUNT: 0 + L0_TO_RECOVERY_COUNT: 1 + REPLAY_ROLL_OVER_COUNT: 0 + NAK_SENT_COUNT: 0 + NAK_RECEIVED_COUNT: 0 + CURRENT_BANDWIDTH_SENT: N/A + CURRENT_BANDWIDTH_RECEIVED: N/A + MAX_PACKET_SIZE: N/A + MEM_USAGE: + TOTAL_VRAM: 196048 MB + USED_VRAM: 282 MB + FREE_VRAM: 195766 MB + TOTAL_VISIBLE_VRAM: 196048 MB + USED_VISIBLE_VRAM: 282 MB + FREE_VISIBLE_VRAM: 195766 MB + TOTAL_GTT: 885866 MB + USED_GTT: 17 MB + FREE_GTT: 885849 MB + +GPU: 4 + USAGE: + GFX_ACTIVITY: 0 % + UMC_ACTIVITY: 0 % + MM_ACTIVITY: N/A + VCN_ACTIVITY: [0 %, 0 %, 0 %, 0 %] + JPEG_ACTIVITY: [N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, + N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, + N/A, N/A, N/A] + POWER: + SOCKET_POWER: 140 W + GFX_VOLTAGE: N/A mV + SOC_VOLTAGE: N/A mV + MEM_VOLTAGE: N/A mV + THROTTLE_STATUS: UNTHROTTLED + POWER_MANAGEMENT: ENABLED + CLOCK: + GFX_0: + CLK: 232 MHz + MIN_CLK: 250 MHz + MAX_CLK: 2100 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: DISABLED + GFX_1: + CLK: 132 MHz + MIN_CLK: 250 MHz + MAX_CLK: 2100 MHz + CLK_LOCKED: 
DISABLED + DEEP_SLEEP: ENABLED + GFX_2: + CLK: 132 MHz + MIN_CLK: 250 MHz + MAX_CLK: 2100 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + GFX_3: + CLK: 132 MHz + MIN_CLK: 250 MHz + MAX_CLK: 2100 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + GFX_4: + CLK: 132 MHz + MIN_CLK: 250 MHz + MAX_CLK: 2100 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + GFX_5: + CLK: 132 MHz + MIN_CLK: 250 MHz + MAX_CLK: 2100 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + GFX_6: + CLK: 132 MHz + MIN_CLK: 250 MHz + MAX_CLK: 2100 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + GFX_7: + CLK: 132 MHz + MIN_CLK: 250 MHz + MAX_CLK: 2100 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + MEM_0: + CLK: 900 MHz + MIN_CLK: 900 MHz + MAX_CLK: 1300 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: DISABLED + VCLK_0: + CLK: 29 MHz + MIN_CLK: 914 MHz + MAX_CLK: 1333 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + VCLK_1: + CLK: 29 MHz + MIN_CLK: 914 MHz + MAX_CLK: 1333 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + VCLK_2: + CLK: 29 MHz + MIN_CLK: 914 MHz + MAX_CLK: 1333 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + VCLK_3: + CLK: 29 MHz + MIN_CLK: 914 MHz + MAX_CLK: 1333 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + DCLK_0: + CLK: 22 MHz + MIN_CLK: 711 MHz + MAX_CLK: 1143 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + DCLK_1: + CLK: 22 MHz + MIN_CLK: 711 MHz + MAX_CLK: 1143 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + DCLK_2: + CLK: 22 MHz + MIN_CLK: 711 MHz + MAX_CLK: 1143 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + DCLK_3: + CLK: 22 MHz + MIN_CLK: 711 MHz + MAX_CLK: 1143 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + TEMPERATURE: + EDGE: N/A + HOTSPOT: 37 °C + MEM: 29 °C + PCIE: + WIDTH: 16 + SPEED: 32 GT/s + BANDWIDTH: 2657 Mb/s + REPLAY_COUNT: 0 + L0_TO_RECOVERY_COUNT: 1 + REPLAY_ROLL_OVER_COUNT: 0 + NAK_SENT_COUNT: 0 + NAK_RECEIVED_COUNT: 0 + CURRENT_BANDWIDTH_SENT: N/A + CURRENT_BANDWIDTH_RECEIVED: N/A + MAX_PACKET_SIZE: N/A + MEM_USAGE: + TOTAL_VRAM: 196048 MB + USED_VRAM: 282 MB + FREE_VRAM: 
195766 MB + TOTAL_VISIBLE_VRAM: 196048 MB + USED_VISIBLE_VRAM: 282 MB + FREE_VISIBLE_VRAM: 195766 MB + TOTAL_GTT: 885866 MB + USED_GTT: 17 MB + FREE_GTT: 885849 MB + +GPU: 5 + USAGE: + GFX_ACTIVITY: 0 % + UMC_ACTIVITY: 0 % + MM_ACTIVITY: N/A + VCN_ACTIVITY: [0 %, 0 %, 0 %, 0 %] + JPEG_ACTIVITY: [N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, + N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, + N/A, N/A, N/A] + POWER: + SOCKET_POWER: 138 W + GFX_VOLTAGE: N/A mV + SOC_VOLTAGE: N/A mV + MEM_VOLTAGE: N/A mV + THROTTLE_STATUS: UNTHROTTLED + POWER_MANAGEMENT: ENABLED + CLOCK: + GFX_0: + CLK: 237 MHz + MIN_CLK: 255 MHz + MAX_CLK: 2100 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: DISABLED + GFX_1: + CLK: 132 MHz + MIN_CLK: 255 MHz + MAX_CLK: 2100 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + GFX_2: + CLK: 132 MHz + MIN_CLK: 255 MHz + MAX_CLK: 2100 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + GFX_3: + CLK: 132 MHz + MIN_CLK: 255 MHz + MAX_CLK: 2100 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + GFX_4: + CLK: 132 MHz + MIN_CLK: 255 MHz + MAX_CLK: 2100 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + GFX_5: + CLK: 132 MHz + MIN_CLK: 255 MHz + MAX_CLK: 2100 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + GFX_6: + CLK: 131 MHz + MIN_CLK: 255 MHz + MAX_CLK: 2100 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + GFX_7: + CLK: 132 MHz + MIN_CLK: 255 MHz + MAX_CLK: 2100 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + MEM_0: + CLK: 900 MHz + MIN_CLK: 900 MHz + MAX_CLK: 1300 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: DISABLED + VCLK_0: + CLK: 29 MHz + MIN_CLK: 914 MHz + MAX_CLK: 1333 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + VCLK_1: + CLK: 29 MHz + MIN_CLK: 914 MHz + MAX_CLK: 1333 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + VCLK_2: + CLK: 29 MHz + MIN_CLK: 914 MHz + MAX_CLK: 1333 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + VCLK_3: + CLK: 29 MHz + MIN_CLK: 914 MHz + MAX_CLK: 1333 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: 
ENABLED + DCLK_0: + CLK: 22 MHz + MIN_CLK: 711 MHz + MAX_CLK: 1143 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + DCLK_1: + CLK: 22 MHz + MIN_CLK: 711 MHz + MAX_CLK: 1143 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + DCLK_2: + CLK: 22 MHz + MIN_CLK: 711 MHz + MAX_CLK: 1143 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + DCLK_3: + CLK: 22 MHz + MIN_CLK: 711 MHz + MAX_CLK: 1143 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + TEMPERATURE: + EDGE: N/A + HOTSPOT: 38 °C + MEM: 34 °C + PCIE: + WIDTH: 16 + SPEED: 32 GT/s + BANDWIDTH: 18 Mb/s + REPLAY_COUNT: 0 + L0_TO_RECOVERY_COUNT: 1 + REPLAY_ROLL_OVER_COUNT: 0 + NAK_SENT_COUNT: 0 + NAK_RECEIVED_COUNT: 0 + CURRENT_BANDWIDTH_SENT: N/A + CURRENT_BANDWIDTH_RECEIVED: N/A + MAX_PACKET_SIZE: N/A + MEM_USAGE: + TOTAL_VRAM: 196048 MB + USED_VRAM: 282 MB + FREE_VRAM: 195766 MB + TOTAL_VISIBLE_VRAM: 196048 MB + USED_VISIBLE_VRAM: 282 MB + FREE_VISIBLE_VRAM: 195766 MB + TOTAL_GTT: 885866 MB + USED_GTT: 17 MB + FREE_GTT: 885849 MB + +GPU: 6 + USAGE: + GFX_ACTIVITY: 0 % + UMC_ACTIVITY: 0 % + MM_ACTIVITY: N/A + VCN_ACTIVITY: [0 %, 0 %, 0 %, 0 %] + JPEG_ACTIVITY: [N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, + N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, + N/A, N/A, N/A] + POWER: + SOCKET_POWER: 135 W + GFX_VOLTAGE: N/A mV + SOC_VOLTAGE: N/A mV + MEM_VOLTAGE: N/A mV + THROTTLE_STATUS: UNTHROTTLED + POWER_MANAGEMENT: ENABLED + CLOCK: + GFX_0: + CLK: 262 MHz + MIN_CLK: 262 MHz + MAX_CLK: 2100 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: DISABLED + GFX_1: + CLK: 132 MHz + MIN_CLK: 262 MHz + MAX_CLK: 2100 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + GFX_2: + CLK: 132 MHz + MIN_CLK: 262 MHz + MAX_CLK: 2100 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + GFX_3: + CLK: 132 MHz + MIN_CLK: 262 MHz + MAX_CLK: 2100 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + GFX_4: + CLK: 132 MHz + MIN_CLK: 262 MHz + MAX_CLK: 2100 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + GFX_5: + CLK: 132 MHz 
+ MIN_CLK: 262 MHz + MAX_CLK: 2100 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + GFX_6: + CLK: 132 MHz + MIN_CLK: 262 MHz + MAX_CLK: 2100 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + GFX_7: + CLK: 132 MHz + MIN_CLK: 262 MHz + MAX_CLK: 2100 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + MEM_0: + CLK: 902 MHz + MIN_CLK: 900 MHz + MAX_CLK: 1300 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: DISABLED + VCLK_0: + CLK: 29 MHz + MIN_CLK: 914 MHz + MAX_CLK: 1333 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + VCLK_1: + CLK: 29 MHz + MIN_CLK: 914 MHz + MAX_CLK: 1333 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + VCLK_2: + CLK: 29 MHz + MIN_CLK: 914 MHz + MAX_CLK: 1333 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + VCLK_3: + CLK: 29 MHz + MIN_CLK: 914 MHz + MAX_CLK: 1333 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + DCLK_0: + CLK: 22 MHz + MIN_CLK: 711 MHz + MAX_CLK: 1143 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + DCLK_1: + CLK: 22 MHz + MIN_CLK: 711 MHz + MAX_CLK: 1143 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + DCLK_2: + CLK: 22 MHz + MIN_CLK: 711 MHz + MAX_CLK: 1143 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + DCLK_3: + CLK: 22 MHz + MIN_CLK: 711 MHz + MAX_CLK: 1143 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + TEMPERATURE: + EDGE: N/A + HOTSPOT: 36 °C + MEM: 30 °C + PCIE: + WIDTH: 16 + SPEED: 32 GT/s + BANDWIDTH: 993 Mb/s + REPLAY_COUNT: 0 + L0_TO_RECOVERY_COUNT: 1 + REPLAY_ROLL_OVER_COUNT: 0 + NAK_SENT_COUNT: 0 + NAK_RECEIVED_COUNT: 0 + CURRENT_BANDWIDTH_SENT: N/A + CURRENT_BANDWIDTH_RECEIVED: N/A + MAX_PACKET_SIZE: N/A + MEM_USAGE: + TOTAL_VRAM: 196048 MB + USED_VRAM: 282 MB + FREE_VRAM: 195766 MB + TOTAL_VISIBLE_VRAM: 196048 MB + USED_VISIBLE_VRAM: 282 MB + FREE_VISIBLE_VRAM: 195766 MB + TOTAL_GTT: 885866 MB + USED_GTT: 17 MB + FREE_GTT: 885849 MB + +GPU: 7 + USAGE: + GFX_ACTIVITY: 0 % + UMC_ACTIVITY: 0 % + MM_ACTIVITY: N/A + VCN_ACTIVITY: [0 %, 0 %, 0 %, 0 %] + JPEG_ACTIVITY: [N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, + N/A, N/A, N/A, 
N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, + N/A, N/A, N/A] + POWER: + SOCKET_POWER: 139 W + GFX_VOLTAGE: N/A mV + SOC_VOLTAGE: N/A mV + MEM_VOLTAGE: N/A mV + THROTTLE_STATUS: UNTHROTTLED + POWER_MANAGEMENT: ENABLED + CLOCK: + GFX_0: + CLK: 219 MHz + MIN_CLK: 238 MHz + MAX_CLK: 2100 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: DISABLED + GFX_1: + CLK: 132 MHz + MIN_CLK: 238 MHz + MAX_CLK: 2100 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + GFX_2: + CLK: 131 MHz + MIN_CLK: 238 MHz + MAX_CLK: 2100 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + GFX_3: + CLK: 132 MHz + MIN_CLK: 238 MHz + MAX_CLK: 2100 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + GFX_4: + CLK: 132 MHz + MIN_CLK: 238 MHz + MAX_CLK: 2100 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + GFX_5: + CLK: 132 MHz + MIN_CLK: 238 MHz + MAX_CLK: 2100 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + GFX_6: + CLK: 132 MHz + MIN_CLK: 238 MHz + MAX_CLK: 2100 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + GFX_7: + CLK: 132 MHz + MIN_CLK: 238 MHz + MAX_CLK: 2100 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + MEM_0: + CLK: 900 MHz + MIN_CLK: 900 MHz + MAX_CLK: 1300 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: DISABLED + VCLK_0: + CLK: 29 MHz + MIN_CLK: 914 MHz + MAX_CLK: 1333 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + VCLK_1: + CLK: 29 MHz + MIN_CLK: 914 MHz + MAX_CLK: 1333 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + VCLK_2: + CLK: 29 MHz + MIN_CLK: 914 MHz + MAX_CLK: 1333 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + VCLK_3: + CLK: 29 MHz + MIN_CLK: 914 MHz + MAX_CLK: 1333 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + DCLK_0: + CLK: 22 MHz + MIN_CLK: 711 MHz + MAX_CLK: 1143 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + DCLK_1: + CLK: 22 MHz + MIN_CLK: 711 MHz + MAX_CLK: 1143 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + DCLK_2: + CLK: 22 MHz + MIN_CLK: 711 MHz + MAX_CLK: 1143 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + DCLK_3: + CLK: 22 MHz + MIN_CLK: 711 MHz + MAX_CLK: 1143 MHz 
+ CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + TEMPERATURE: + EDGE: N/A + HOTSPOT: 37 °C + MEM: 31 °C + PCIE: + WIDTH: 16 + SPEED: 32 GT/s + BANDWIDTH: 3091 Mb/s + REPLAY_COUNT: 0 + L0_TO_RECOVERY_COUNT: 1 + REPLAY_ROLL_OVER_COUNT: 0 + NAK_SENT_COUNT: 0 + NAK_RECEIVED_COUNT: 0 + CURRENT_BANDWIDTH_SENT: N/A + CURRENT_BANDWIDTH_RECEIVED: N/A + MAX_PACKET_SIZE: N/A + MEM_USAGE: + TOTAL_VRAM: 196048 MB + USED_VRAM: 282 MB + FREE_VRAM: 195766 MB + TOTAL_VISIBLE_VRAM: 196048 MB + USED_VISIBLE_VRAM: 282 MB + FREE_VISIBLE_VRAM: 195766 MB + TOTAL_GTT: 885866 MB + USED_GTT: 17 MB + FREE_GTT: 885849 MB \ No newline at end of file diff --git a/src/VirtualClient/VirtualClient.Monitors.UnitTests/Examples/amd-smi/powerMetrics.csv b/src/VirtualClient/VirtualClient.Monitors.UnitTests/Examples/amd-smi/powerMetrics.csv new file mode 100644 index 0000000000..aa6efc72b3 --- /dev/null +++ b/src/VirtualClient/VirtualClient.Monitors.UnitTests/Examples/amd-smi/powerMetrics.csv @@ -0,0 +1,9 @@ +gpu,socket_power,gfx_voltage,soc_voltage,mem_voltage,throttle_status,power_management +0,136,N/A,N/A,N/A,UNTHROTTLED,ENABLED +1,135,N/A,N/A,N/A,UNTHROTTLED,ENABLED +2,137,N/A,N/A,N/A,UNTHROTTLED,ENABLED +3,133,N/A,N/A,N/A,UNTHROTTLED,ENABLED +4,138,N/A,N/A,N/A,UNTHROTTLED,ENABLED +5,137,N/A,N/A,N/A,UNTHROTTLED,ENABLED +6,135,N/A,N/A,N/A,UNTHROTTLED,ENABLED +7,137,N/A,N/A,N/A,UNTHROTTLED,ENABLED \ No newline at end of file diff --git a/src/VirtualClient/VirtualClient.Monitors.UnitTests/Examples/amd-smi/temperatureMetrics.csv b/src/VirtualClient/VirtualClient.Monitors.UnitTests/Examples/amd-smi/temperatureMetrics.csv new file mode 100644 index 0000000000..a838c61828 --- /dev/null +++ b/src/VirtualClient/VirtualClient.Monitors.UnitTests/Examples/amd-smi/temperatureMetrics.csv @@ -0,0 +1,9 @@ +gpu,edge,hotspot,mem +0,N/A,36,29 +1,N/A,36,31 +2,N/A,35,29 +3,N/A,34,29 +4,N/A,35,28 +5,N/A,36,32 +6,N/A,34,29 +7,N/A,36,30 \ No newline at end of file diff --git 
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

namespace VirtualClient.Monitors.Amd_Smi
{
    using System;
    using System.Collections.Generic;
    using System.Globalization;
    using System.Linq;
    using System.Text.RegularExpressions;
    using VirtualClient.Contracts;

    /// <summary>
    /// Parser for AMD SMI power, usage and temperature metrics, supporting multiple GPUs.
    /// </summary>
    public class AmdSmiMetricsParser : MetricsParser
    {
        /// <summary>
        /// The metrics extracted from each GPU section. Each pattern captures the reading in
        /// a group named 'value'. Readings that are not numeric (e.g. N/A, ENABLED) are
        /// reported as -1.
        /// </summary>
        /// <remarks>
        /// NOTE(review): the example amd-smi output reports voltages with an 'mV' suffix while
        /// the metrics below are published with unit 'V' and no scaling -- confirm the intent.
        /// </remarks>
        private static readonly IReadOnlyList<(string Name, string Pattern, string Unit, double DivideBy)> MetricDefinitions =
            new List<(string Name, string Pattern, string Unit, double DivideBy)>
            {
                ("GFX_ACTIVITY", "GFX_ACTIVITY:\\s+(?<value>\\d+) %", "%", 1),
                ("UMC_ACTIVITY", "UMC_ACTIVITY:\\s+(?<value>\\d+) %", "%", 1),
                ("MM_ACTIVITY", "MM_ACTIVITY:\\s+(?<value>\\w+)", string.Empty, 1),
                ("SOCKET_POWER", "SOCKET_POWER:\\s+(?<value>\\d+) W", "W", 1),
                ("GFX_VOLTAGE", "GFX_VOLTAGE:\\s+(?<value>N/A|\\d+(\\.\\d+)?)", "V", 1),
                ("SOC_VOLTAGE", "SOC_VOLTAGE:\\s+(?<value>N/A|\\d+(\\.\\d+)?)", "V", 1),
                ("MEM_VOLTAGE", "MEM_VOLTAGE:\\s+(?<value>N/A|\\d+(\\.\\d+)?)", "V", 1),
                ("POWER_MANAGEMENT", "POWER_MANAGEMENT:\\s+(?<value>\\w+)", string.Empty, 1),
                ("TEMPERATURE_EDGE", "EDGE:\\s+(?<value>N/A|\\d+(\\.\\d+)?)(?:\\s+°C)?", "C", 1),
                ("TEMPERATURE_HOTSPOT", "HOTSPOT:\\s+(?<value>N/A|\\d+(\\.\\d+)?)(?:\\s+°C)?", "C", 1),
                ("TEMPERATURE_MEM", "MEM:\\s+(?<value>N/A|\\d+(\\.\\d+)?)(?:\\s+°C)?", "C", 1)
            };

        /// <summary>
        /// Initializes a new instance of the <see cref="AmdSmiMetricsParser"/> class.
        /// </summary>
        /// <param name="rawText">Raw text to parse.</param>
        public AmdSmiMetricsParser(string rawText)
            : base(rawText)
        {
        }

        /// <summary>
        /// Parses the amd-smi output into a 'TOTAL_GPUS' metric followed by the per-GPU
        /// metrics. Per-GPU metric names are suffixed with '_GPU{id}'.
        /// </summary>
        /// <returns>The set of metrics found in the output.</returns>
        public override IList<Metric> Parse()
        {
            this.Preprocess();

            List<Metric> metrics = new List<Metric>();
            List<(string GpuId, string Section)> gpuSections = this.ExtractGpuSections(this.PreprocessedText);

            metrics.Add(new Metric("TOTAL_GPUS", gpuSections.Count, "count"));

            foreach ((string gpuId, string section) in gpuSections)
            {
                this.ExtractMetrics(metrics, section, gpuId);
            }

            return metrics;
        }

        /// <summary>
        /// Removes leading and trailing whitespace from the raw text.
        /// </summary>
        protected override void Preprocess()
        {
            this.PreprocessedText = this.RawText.Trim();
        }

        /// <summary>
        /// Converts the text of a reading to a double using the invariant culture (the tool
        /// output always uses '.' as the decimal separator regardless of the system locale).
        /// Non-numeric readings are replaced with -1. The divisor is applied to numeric
        /// readings only so that the -1 marker is never scaled.
        /// </summary>
        private static double ParseDoubleSafely(string value, double divideBy = 1)
        {
            return double.TryParse(value, NumberStyles.Float | NumberStyles.AllowThousands, CultureInfo.InvariantCulture, out double result)
                ? result / divideBy
                : -1;
        }

        /// <summary>
        /// Splits the text into sections separated by blank lines (one section per GPU).
        /// The ID of each GPU is taken from the 'GPU: N' header of the section. Sections
        /// that do not begin with such a header fall back to their zero-based position.
        /// </summary>
        private List<(string GpuId, string Section)> ExtractGpuSections(string rawText)
        {
            List<(string GpuId, string Section)> gpuSections = new List<(string GpuId, string Section)>();
            List<string> sections = Regex.Split(rawText, "\n\\s*\n")
                .Where(s => !string.IsNullOrWhiteSpace(s))
                .Select(s => s.Trim())
                .ToList();

            for (int i = 0; i < sections.Count; i++)
            {
                Match header = Regex.Match(sections[i], "\\AGPU:\\s*(?<id>\\d+)");
                string gpuId = header.Success
                    ? header.Groups["id"].Value
                    : i.ToString(CultureInfo.InvariantCulture);

                gpuSections.Add((gpuId, sections[i]));
            }

            return gpuSections;
        }

        /// <summary>
        /// Extracts and adds the metrics found in the section for a specific GPU.
        /// </summary>
        private void ExtractMetrics(List<Metric> metrics, string section, string gpuId)
        {
            foreach ((string Name, string Pattern, string Unit, double DivideBy) metric in AmdSmiMetricsParser.MetricDefinitions)
            {
                this.AddMetric(metrics, section, $"{metric.Name}_GPU{gpuId}", metric.Pattern, metric.Unit, metric.DivideBy);
            }
        }

        /// <summary>
        /// Adds a metric to the list if the pattern matches the section. Nothing is added
        /// when the pattern does not match.
        /// </summary>
        private void AddMetric(List<Metric> metrics, string section, string name, string pattern, string unit, double divideBy = 1)
        {
            Match match = Regex.Match(section, pattern);
            if (match.Success)
            {
                double value = AmdSmiMetricsParser.ParseDoubleSafely(match.Groups["value"].Value, divideBy);
                metrics.Add(new Metric(name, value, unit));
            }
        }
    }
}
- /// - public bool SubsystemXgmi => this.Parameters.GetValue(nameof(this.SubsystemXgmi), false); - - /// - protected override async Task ExecuteAsync(EventContext telemetryContext, CancellationToken cancellationToken) - { - if (this.SubsystemMetric) - { - await this.QueryGpuMetricAsync(telemetryContext, cancellationToken).ConfigureAwait(false); - } - - if (this.SubsystemXgmi) - { - await this.QueryGpuXGMIAsync(telemetryContext, cancellationToken).ConfigureAwait(false); - } - } - - /// - protected void ValidateParameters() - { - if (this.MonitorFrequency <= TimeSpan.Zero) - { - throw new MonitorException( - $"The monitor frequency defined/provided for the '{this.TypeName}' component '{this.MonitorFrequency}' is not valid. " + - $"The frequency must be greater than zero.", - ErrorReason.InvalidProfileDefinition); - } - } - - private string GetAmdSmiCommand() - { - return this.Platform == PlatformID.Win32NT ? "amdsmi" : "amd-smi"; - } - - private async Task QueryGpuMetricAsync(EventContext telemetryContext, CancellationToken cancellationToken) - { - string commandArguments = "metric --csv"; - - await Task.Delay(this.MonitorWarmupPeriod, cancellationToken).ConfigureAwait(false); - - while (!cancellationToken.IsCancellationRequested) - { - try - { - using (IProcessProxy process = this.systemManagement.ProcessManager.CreateElevatedProcess(this.Platform, this.GetAmdSmiCommand(), commandArguments, Environment.CurrentDirectory)) - { - this.CleanupTasks.Add(() => process.SafeKill()); - DateTime startTime = DateTime.UtcNow; - await process.StartAndWaitAsync(cancellationToken).ConfigureAwait(false); - DateTime endTime = DateTime.UtcNow; - - if (!cancellationToken.IsCancellationRequested) - { - process.ThrowIfErrored(ProcessProxy.DefaultSuccessCodes, errorReason: ErrorReason.MonitorFailed); - - if (process.StandardOutput.Length > 0) - { - AmdSmiMetricQueryGpuParser parser = new AmdSmiMetricQueryGpuParser(process.StandardOutput.ToString()); - IList metrics = parser.Parse(); - - 
if (metrics?.Any() == true) - { - this.Logger.LogPerformanceCounters("amd", metrics, startTime, endTime, telemetryContext); - } - } - } - } - - await Task.Delay(this.MonitorFrequency).ConfigureAwait(false); - } - catch (OperationCanceledException) - { - // Expected whenever ctrl-C is used. - } - catch (Exception exc) - { - this.Logger.LogErrorMessage(exc, telemetryContext, LogLevel.Warning); - } - } - } - - private async Task QueryGpuXGMIAsync(EventContext telemetryContext, CancellationToken cancellationToken) - { - string commandArguments = "xgmi -m --json"; - - await Task.Delay(this.MonitorWarmupPeriod, cancellationToken).ConfigureAwait(false); - - while (!cancellationToken.IsCancellationRequested) - { - try - { - Stopwatch stopwatch = Stopwatch.StartNew(); - - var (metrics1, startTime1, endTime1) = await this.ExecuteXGMICommand(commandArguments, cancellationToken); - await Task.Delay(500).ConfigureAwait(false); - var (metrics2, startTime2, endTime2) = await this.ExecuteXGMICommand(commandArguments, cancellationToken); - - stopwatch.Stop(); - long elapsedMilliseconds = stopwatch.ElapsedMilliseconds; - - IList aggregatedMetrics = this.AmdSmiXGMIBandwidthAggregator(metrics1, metrics2, elapsedMilliseconds); - - if (aggregatedMetrics?.Any() == true) - { - this.Logger.LogPerformanceCounters("amd", aggregatedMetrics, startTime1, endTime2, telemetryContext); - } - - await Task.Delay(this.MonitorFrequency).ConfigureAwait(false); - } - catch (OperationCanceledException) - { - } - catch (Exception exc) - { - this.Logger.LogErrorMessage(exc, telemetryContext, LogLevel.Warning); - } - } - } - - private async Task<(IList, DateTime, DateTime)> ExecuteXGMICommand(string commandArguments, CancellationToken cancellationToken) - { - using (IProcessProxy process = this.systemManagement.ProcessManager.CreateElevatedProcess(this.Platform, this.GetAmdSmiCommand(), commandArguments, Environment.CurrentDirectory)) - { - this.CleanupTasks.Add(() => process.SafeKill()); - DateTime 
startTime = DateTime.UtcNow; - await process.StartAndWaitAsync(cancellationToken).ConfigureAwait(false); - DateTime endTime = DateTime.UtcNow; - - AmdSmiXGMIQueryGpuParser parser = new AmdSmiXGMIQueryGpuParser(process.StandardOutput.ToString()); - return (parser.Parse(), startTime, endTime); - } - } - - private IList AmdSmiXGMIBandwidthAggregator(IList metrics1, IList metrics2, long time) - { - List aggregatedMetrics = new List(); - - if (metrics1.Any() && metrics2.Any()) - { - foreach (Metric counter1 in metrics1) - { - foreach (Metric counter2 in metrics2) - { - if (counter1.Metadata["gpu.id"] == counter2.Metadata["gpu.id"]) - { - double bandwidth = (counter2.Value - counter1.Value) / (((double)time) / 1000.0); - aggregatedMetrics.Add(new Metric($"xgmi.bw", (bandwidth / 1024), unit: "MB/s", metadata: counter1.Metadata)); - } - } - } - } - - return aggregatedMetrics; - } - } -} +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +namespace VirtualClient.Monitors +{ + using System; + using System.Collections.Generic; + using System.Diagnostics; + using System.IO.Abstractions; + using System.Linq; + using System.Threading; + using System.Threading.Tasks; + using global::VirtualClient; + using global::VirtualClient.Contracts; + using Microsoft.Extensions.DependencyInjection; + using Microsoft.Extensions.Logging; + using VirtualClient.Common; + using VirtualClient.Common.Extensions; + using VirtualClient.Common.Telemetry; + using VirtualClient.Monitors.Amd_Smi; + + /// + /// The Performance Counter Monitor for Virtual Client + /// + public class AmdSmiMonitor : VirtualClientIntervalBasedMonitor + { + private ISystemManagement systemManagement; + private IFileSystem fileSystem; + + /// + /// Initializes a new instance of the class. 
+ /// + public AmdSmiMonitor(IServiceCollection dependencies, IDictionary parameters) + : base(dependencies, parameters) + { + this.systemManagement = this.Dependencies.GetService(); + this.fileSystem = this.systemManagement.FileSystem; + } + + /// + protected override async Task ExecuteAsync(EventContext telemetryContext, CancellationToken cancellationToken) + { + try + { + switch (this.Platform) + { + case PlatformID.Win32NT: + await this.QueryGpuMetricAsync(telemetryContext, cancellationToken).ConfigureAwait(false); + + break; + + case PlatformID.Unix: + await this.QueryGpuMetricAsync(telemetryContext, cancellationToken).ConfigureAwait(false); + + if (!cancellationToken.IsCancellationRequested) + { + Console.WriteLine("executing xgmi"); + await this.QueryGpuXGMIAsync(telemetryContext, cancellationToken).ConfigureAwait(false); + } + + break; + } + } + catch (Exception ex) + { + Console.WriteLine($"[Error] ExecuteAsync failed: {ex.Message}"); + } + } + + /// + protected void ValidateParameters() + { + if (this.MonitorFrequency <= TimeSpan.Zero) + { + throw new MonitorException( + $"The monitor frequency defined/provided for the '{this.TypeName}' component '{this.MonitorFrequency}' is not valid. 
" + + $"The frequency must be greater than zero.", + ErrorReason.InvalidProfileDefinition); + } + } + + private async Task QueryGpuMetricAsync(EventContext telemetryContext, CancellationToken cancellationToken) + { + string amdSmiMonitorCommand = "amd-smi"; + string commandArgumentsForPower = "metric"; + + await Task.Delay(this.MonitorWarmupPeriod, cancellationToken).ConfigureAwait(false); + int i = 0; + + while (!cancellationToken.IsCancellationRequested && i < 1) + { + try + { + i++; + using (IProcessProxy process = this.systemManagement.ProcessManager.CreateElevatedProcess(this.Platform, amdSmiMonitorCommand, commandArgumentsForPower, Environment.CurrentDirectory)) + { + this.CleanupTasks.Add(() => process.SafeKill()); + DateTime startTime = DateTime.UtcNow; + await process.StartAndWaitAsync(cancellationToken).ConfigureAwait(false); + DateTime endTime = DateTime.UtcNow; + + if (!cancellationToken.IsCancellationRequested) + { + process.ThrowIfErrored(ProcessProxy.DefaultSuccessCodes, errorReason: ErrorReason.MonitorFailed); + + if (process.StandardOutput.Length > 0) + { + AmdSmiMetricsParser parser = new AmdSmiMetricsParser(process.StandardOutput.ToString()); + IList metrics = parser.Parse(); + + if (metrics?.Any() == true) + { + this.Logger.LogPerformanceCounters("amd", metrics, startTime, endTime, telemetryContext); + } + } + } + } + + await Task.Delay(this.MonitorFrequency).ConfigureAwait(false); + } + catch (OperationCanceledException) + { + // Expected whenever ctrl-C is used. 
+ } + catch (Exception exc) + { + this.Logger.LogErrorMessage(exc, telemetryContext, LogLevel.Warning); + } + } + } + + private async Task QueryGpuXGMIAsync(EventContext telemetryContext, CancellationToken cancellationToken) + { + string commandArguments = "xgmi -m --json"; + + await Task.Delay(this.MonitorWarmupPeriod, cancellationToken).ConfigureAwait(false); + int i = 0; + + while (!cancellationToken.IsCancellationRequested && i < 1) + { + i++; + try + { + Stopwatch stopwatch = Stopwatch.StartNew(); + + var (metrics1, startTime1, endTime1) = await this.ExecuteXGMICommand(commandArguments, cancellationToken); + await Task.Delay(500).ConfigureAwait(false); + var (metrics2, startTime2, endTime2) = await this.ExecuteXGMICommand(commandArguments, cancellationToken); + + stopwatch.Stop(); + long elapsedMilliseconds = stopwatch.ElapsedMilliseconds; + + IList aggregatedMetrics = this.AmdSmiXGMIBandwidthAggregator(metrics1, metrics2, elapsedMilliseconds); + + if (aggregatedMetrics?.Any() == true) + { + this.Logger.LogPerformanceCounters("amd", aggregatedMetrics, startTime1, endTime2, telemetryContext); + } + + await Task.Delay(this.MonitorFrequency).ConfigureAwait(false); + } + catch (OperationCanceledException) + { + } + catch (Exception exc) + { + this.Logger.LogErrorMessage(exc, telemetryContext, LogLevel.Warning); + } + } + } + + private async Task<(IList, DateTime, DateTime)> ExecuteXGMICommand(string commandArguments, CancellationToken cancellationToken) + { + using (IProcessProxy process = this.systemManagement.ProcessManager.CreateElevatedProcess(this.Platform, "amd-smi", commandArguments, Environment.CurrentDirectory)) + { + this.CleanupTasks.Add(() => process.SafeKill()); + DateTime startTime = DateTime.UtcNow; + await process.StartAndWaitAsync(cancellationToken).ConfigureAwait(false); + DateTime endTime = DateTime.UtcNow; + + AmdSmiXGMIQueryGpuParser parser = new AmdSmiXGMIQueryGpuParser(process.StandardOutput.ToString()); + return (parser.Parse(), startTime, 
endTime); + } + } + + private IList AmdSmiXGMIBandwidthAggregator(IList metrics1, IList metrics2, long time) + { + List aggregatedMetrics = new List(); + + if (metrics1.Any() && metrics2.Any()) + { + foreach (Metric counter1 in metrics1) + { + foreach (Metric counter2 in metrics2) + { + if (counter1.Metadata["gpu.id"] == counter2.Metadata["gpu.id"]) + { + double bandwidth = (counter2.Value - counter1.Value) / (((double)time) / 1000.0); + aggregatedMetrics.Add(new Metric($"xgmi.bw", (bandwidth / 1024), unit: "MB/s", metadata: counter1.Metadata)); + } + } + } + } + + return aggregatedMetrics; + } + } +} diff --git a/src/VirtualClient/VirtualClient.Monitors/AmdSmiXGMIQueryGpuParser.cs b/src/VirtualClient/VirtualClient.Monitors/Amd-Smi/AmdSmiXGMIQueryGpuParser.cs similarity index 100% rename from src/VirtualClient/VirtualClient.Monitors/AmdSmiXGMIQueryGpuParser.cs rename to src/VirtualClient/VirtualClient.Monitors/Amd-Smi/AmdSmiXGMIQueryGpuParser.cs diff --git a/src/VirtualClient/VirtualClient.Monitors/AmdSmiMetricQueryGpuParser.cs b/src/VirtualClient/VirtualClient.Monitors/AmdSmiMetricQueryGpuParser.cs deleted file mode 100644 index 91cec01f34..0000000000 --- a/src/VirtualClient/VirtualClient.Monitors/AmdSmiMetricQueryGpuParser.cs +++ /dev/null @@ -1,95 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT License. - -namespace VirtualClient.Monitors -{ - using System; - using System.Collections.Generic; - using System.Data; - using System.Linq; - using System.Text.RegularExpressions; - using VirtualClient.Contracts; - using DataTableExtensions = VirtualClient.Contracts.DataTableExtensions; - - /// - /// Parser for AmdSmi output document. - /// - public class AmdSmiMetricQueryGpuParser : MetricsParser - { - /// - /// Constructor for - /// - /// Raw text to parse. 
- public AmdSmiMetricQueryGpuParser(string rawText) - : base(rawText) - { - } - - /// - public override IList Parse() - { - this.Preprocess(); - - List metrics = new List(); - DataTable dataTable = DataTableExtensions.DataTableFromCsv(this.PreprocessedText); - - foreach (DataRow row in dataTable.Rows) - { - string gpuId = Convert.ToString(SafeGet(row, "gpu")); - Dictionary metadata = new Dictionary() - { - { "gpu.id", gpuId } - }; - - metrics.Add(new Metric($"utilization.gpu [%] (GPU {gpuId})", Convert.ToDouble(SafeGet(row, "gfx_usage")), unit: "%", metadata: metadata)); - metrics.Add(new Metric($"framebuffer.total [MB] (GPU {gpuId})", Convert.ToDouble(SafeGet(row, "fb_total")), unit: "MB", metadata: metadata)); - metrics.Add(new Metric($"framebuffer.used [MB] (GPU {gpuId})", Convert.ToDouble(SafeGet(row, "fb_used")), unit: "MB", metadata: metadata)); - - // AMD MI300X - metrics.Add(new Metric($"utilization.gpu (GPU {gpuId})", Convert.ToDouble(SafeGet(row, "gfx_activity")), unit: "%", metadata: metadata)); - double value = 100 * Convert.ToDouble(SafeGet(row, "used_vram")) / Convert.ToDouble(SafeGet(row, "total_vram")); - int roundedValue = Convert.ToInt32(Math.Round(value)); - metrics.Add(new Metric($"utilization.memory (GPU {gpuId})", roundedValue, unit: "%", metadata: metadata)); - metrics.Add(new Metric($"temperature.gpu (GPU {gpuId})", Convert.ToDouble(SafeGet(row, "hotspot")), unit: "celsius", metadata: metadata)); - metrics.Add(new Metric($"temperature.memory (GPU {gpuId})", Convert.ToDouble(SafeGet(row, "mem")), unit: "celsius", metadata: metadata)); - metrics.Add(new Metric($"power.draw.average (GPU {gpuId})", Convert.ToDouble(SafeGet(row, "socket_power")), unit: "W", metadata: metadata)); - - double gfx_clk_avg = (Convert.ToDouble(SafeGet(row, "gfx_0_clk")) + Convert.ToDouble(SafeGet(row, "gfx_1_clk")) + - Convert.ToDouble(SafeGet(row, "gfx_2_clk")) + Convert.ToDouble(SafeGet(row, "gfx_3_clk")) + - Convert.ToDouble(SafeGet(row, "gfx_4_clk")) + 
Convert.ToDouble(SafeGet(row, "gfx_5_clk")) + - Convert.ToDouble(SafeGet(row, "gfx_6_clk")) + Convert.ToDouble(SafeGet(row, "gfx_7_clk"))) / 8; - - metrics.Add(new Metric($"gfx_clk_avg (GPU {gpuId})", gfx_clk_avg, unit: "MHz", metadata: metadata)); - metrics.Add(new Metric($"mem_clk (GPU {gpuId})", Convert.ToDouble(SafeGet(row, "mem_0_clk")), unit: "MHz", metadata: metadata)); - - double video_vclk_avg = (Convert.ToDouble(SafeGet(row, "vclk_0_clk")) + Convert.ToDouble(SafeGet(row, "vclk_1_clk")) + - Convert.ToDouble(SafeGet(row, "vclk_2_clk")) + Convert.ToDouble(SafeGet(row, "vclk_3_clk"))) / 4; - - metrics.Add(new Metric($"video_vclk_avg (GPU {gpuId})", video_vclk_avg, unit: "MHz", metadata: metadata)); - - double video_dclk_avg = (Convert.ToDouble(SafeGet(row, "dclk_0_clk")) + Convert.ToDouble(SafeGet(row, "dclk_1_clk")) + - Convert.ToDouble(SafeGet(row, "dclk_2_clk")) + Convert.ToDouble(SafeGet(row, "dclk_3_clk"))) / 4; - - metrics.Add(new Metric($"video_dclk_avg (GPU {gpuId})", video_dclk_avg, unit: "MHz", metadata: metadata)); - metrics.Add(new Metric($"pcie_bw (GPU {gpuId})", Convert.ToDouble(SafeGet(row, "bandwidth")) / 8, unit: "MB/s", metadata: metadata)); - } - - return metrics; - } - - /// - protected override void Preprocess() - { - this.PreprocessedText = this.RawText.Replace("\r\n", Environment.NewLine); - Regex quotedPattern = new Regex("\"([^\"]*)\""); - this.PreprocessedText = quotedPattern.Replace(this.PreprocessedText, "N/A"); - Regex quotedPattern2 = new Regex("\\[.*?\\]"); - this.PreprocessedText = quotedPattern2.Replace(this.PreprocessedText, "N/A"); - } - - private static IConvertible SafeGet(DataRow row, string columnName) - { - return row.Table.Columns.Contains(columnName) ? Convert.ToString(row[columnName]) : "-1"; - } - } -}