From 7164a64aaaff0358f86cc2e5ed9478b532a71d46 Mon Sep 17 00:00:00 2001 From: Erica Vellanoweth Date: Mon, 21 Oct 2024 17:26:27 -0700 Subject: [PATCH 1/3] nvlink extension --- .../Examples/nvidia-smi/query-nvlink.txt | 200 ++++++++++++++++++ .../NvidiaSmiQueryC2CParserUnitTest.cs | 6 +- .../NvidiaSmiQueryGpuParserUnitTests.cs | 0 .../NvidiaSmiQueryNvLinkParserUnitTests.cs | 41 ++++ .../Nvidia-Smi/NvidiaSmiMonitor.cs | 138 +++++++----- .../Nvidia-Smi/NvidiaSmiQueryNvLinkParser.cs | 83 ++++++++ 6 files changed, 415 insertions(+), 53 deletions(-) create mode 100644 src/VirtualClient/VirtualClient.Monitors.UnitTests/Examples/nvidia-smi/query-nvlink.txt rename src/VirtualClient/VirtualClient.Monitors.UnitTests/{ => NvidiaSmi}/NvidiaSmiQueryC2CParserUnitTest.cs (93%) rename src/VirtualClient/VirtualClient.Monitors.UnitTests/{ => NvidiaSmi}/NvidiaSmiQueryGpuParserUnitTests.cs (100%) create mode 100644 src/VirtualClient/VirtualClient.Monitors.UnitTests/NvidiaSmi/NvidiaSmiQueryNvLinkParserUnitTests.cs create mode 100644 src/VirtualClient/VirtualClient.Monitors/Nvidia-Smi/NvidiaSmiQueryNvLinkParser.cs diff --git a/src/VirtualClient/VirtualClient.Monitors.UnitTests/Examples/nvidia-smi/query-nvlink.txt b/src/VirtualClient/VirtualClient.Monitors.UnitTests/Examples/nvidia-smi/query-nvlink.txt new file mode 100644 index 0000000000..e4f24736a9 --- /dev/null +++ b/src/VirtualClient/VirtualClient.Monitors.UnitTests/Examples/nvidia-smi/query-nvlink.txt @@ -0,0 +1,200 @@ +GPU 0: NVIDIA A100-SXM4-40GB (UUID: GPU-547e49e9-c77f-15a6-b5da-15b9eb0207d3) + Link 0: Data Tx: 1040 KiB + Link 0: Data Rx: 200 KiB + Link 1: Data Tx: 1500 KiB + Link 1: Data Rx: 1500 KiB + Link 2: Data Tx: 1500 KiB + Link 2: Data Rx: 1500 KiB + Link 3: Data Tx: 1500 KiB + Link 3: Data Rx: 1500 KiB + Link 4: Data Tx: 1500 KiB + Link 4: Data Rx: 1500 KiB + Link 5: Data Tx: 1500 KiB + Link 5: Data Rx: 1500 KiB + Link 6: Data Tx: 1500 KiB + Link 6: Data Rx: 1500 KiB + Link 7: Data Tx: 1500 KiB + Link 7: Data Rx: 1500 KiB + Link 8: Data Tx: 1500 KiB + Link 8: Data Rx: 1500 KiB + Link 9: Data Tx: 1500 KiB + Link 9: Data Rx: 1500 KiB + Link 10: Data Tx: 1500 KiB + Link 10: Data Rx: 1500 KiB + Link 11: Data Tx: 1500 KiB + Link 11: Data Rx: 1500 KiB +GPU 1: NVIDIA A100-SXM4-40GB (UUID: GPU-dc450a09-2c60-40a5-34fc-77ac7faf167e) + Link 0: Data Tx: 800 KiB + Link 0: Data Rx: 800 KiB + Link 1: Data Tx: 800 KiB + Link 1: Data Rx: 800 KiB + Link 2: Data Tx: 800 KiB + Link 2: Data Rx: 800 KiB + Link 3: Data Tx: 800 KiB + Link 3: Data Rx: 800 KiB + Link 4: Data Tx: 800 KiB + Link 4: Data Rx: 800 KiB + Link 5: Data Tx: 800 KiB + Link 5: Data Rx: 800 KiB + Link 6: Data Tx: 800 KiB + Link 6: Data Rx: 800 KiB + Link 7: Data Tx: 800 KiB + Link 7: Data Rx: 800 KiB + Link 8: Data Tx: 800 KiB + Link 8: Data Rx: 800 KiB + Link 9: Data Tx: 800 KiB + Link 9: Data Rx: 800 KiB + Link 10: Data Tx: 800 KiB + Link 10: Data Rx: 800 KiB + Link 11: Data Tx: 800 KiB + Link 11: Data Rx: 800 KiB +GPU 2: NVIDIA A100-SXM4-40GB (UUID: GPU-38eef22b-8b56-ae96-1f1f-25575b9fc7e7) + Link 0: Data Tx: 500 KiB + Link 0: Data Rx: 500 KiB + Link 1: Data Tx: 500 KiB + Link 1: Data Rx: 500 KiB + Link 2: Data Tx: 500 KiB + Link 2: Data Rx: 500 KiB + Link 3: Data Tx: 500 KiB + Link 3: Data Rx: 500 KiB + Link 4: Data Tx: 500 KiB + Link 4: Data Rx: 500 KiB + Link 5: Data Tx: 500 KiB + Link 5: Data Rx: 500 KiB + Link 6: Data Tx: 500 KiB + Link 6: Data Rx: 500 KiB + Link 7: Data Tx: 500 KiB + Link 7: Data Rx: 500 KiB + Link 8: Data Tx: 500 KiB + Link 8: Data Rx: 500 KiB + Link 9: Data Tx: 500 KiB + Link 9: Data Rx: 500 KiB + Link 10: Data Tx: 500 KiB + Link 10: Data Rx: 500 KiB + Link 11: Data Tx: 500 KiB + Link 11: Data Rx: 500 KiB +GPU 3: NVIDIA A100-SXM4-40GB (UUID: GPU-bb58bf68-496a-a909-f7a6-eb6e8bff5892) + Link 0: Data Tx: 1200 KiB + Link 0: Data Rx: 1200 KiB + Link 1: Data Tx: 1200 KiB + Link 1: Data Rx: 1200 KiB + Link 2: Data Tx: 1200 KiB + Link 2: Data Rx: 1200 KiB + Link 3: Data Tx: 1200 KiB + Link 3: Data Rx: 1200 KiB + Link 4: Data Tx: 1200 KiB + Link 4: Data Rx: 1200 KiB + Link 5: Data Tx: 1200 KiB + Link 5: Data Rx: 1200 KiB + Link 6: Data Tx: 1200 KiB + Link 6: Data Rx: 1200 KiB + Link 7: Data Tx: 1200 KiB + Link 7: Data Rx: 1200 KiB + Link 8: Data Tx: 1200 KiB + Link 8: Data Rx: 1200 KiB + Link 9: Data Tx: 1200 KiB + Link 9: Data Rx: 1200 KiB + Link 10: Data Tx: 1200 KiB + Link 10: Data Rx: 1200 KiB + Link 11: Data Tx: 1200 KiB + Link 11: Data Rx: 1200 KiB +GPU 4: NVIDIA A100-SXM4-40GB (UUID: GPU-e7900065-8d18-a01c-7d45-9ef032d7d1ed) + Link 0: Data Tx: 2000 KiB + Link 0: Data Rx: 2000 KiB + Link 1: Data Tx: 2000 KiB + Link 1: Data Rx: 2000 KiB + Link 2: Data Tx: 2000 KiB + Link 2: Data Rx: 2000 KiB + Link 3: Data Tx: 2000 KiB + Link 3: Data Rx: 2000 KiB + Link 4: Data Tx: 2000 KiB + Link 4: Data Rx: 2000 KiB + Link 5: Data Tx: 2000 KiB + Link 5: Data Rx: 2000 KiB + Link 6: Data Tx: 2000 KiB + Link 6: Data Rx: 2000 KiB + Link 7: Data Tx: 2000 KiB + Link 7: Data Rx: 2000 KiB + Link 8: Data Tx: 2000 KiB + Link 8: Data Rx: 2000 KiB + Link 9: Data Tx: 2000 KiB + Link 9: Data Rx: 2000 KiB + Link 10: Data Tx: 2000 KiB + Link 10: Data Rx: 2000 KiB + Link 11: Data Tx: 2000 KiB + Link 11: Data Rx: 2000 KiB +GPU 5: NVIDIA A100-SXM4-40GB (UUID: GPU-8e000139-4a61-ec47-798b-374ae1cbf96a) + Link 0: Data Tx: 400 KiB + Link 0: Data Rx: 400 KiB + Link 1: Data Tx: 400 KiB + Link 1: Data Rx: 400 KiB + Link 2: Data Tx: 400 KiB + Link 2: Data Rx: 400 KiB + Link 3: Data Tx: 400 KiB + Link 3: Data Rx: 400 KiB + Link 4: Data Tx: 400 KiB + Link 4: Data Rx: 400 KiB + Link 5: Data Tx: 400 KiB + Link 5: Data Rx: 400 KiB + Link 6: Data Tx: 400 KiB + Link 6: Data Rx: 400 KiB + Link 7: Data Tx: 400 KiB + Link 7: Data Rx: 400 KiB + Link 8: Data Tx: 400 KiB + Link 8: Data Rx: 400 KiB + Link 9: Data Tx: 400 KiB + Link 9: Data Rx: 400 KiB + Link 10: Data Tx: 400 KiB + Link 10: Data Rx: 400 KiB + Link 11: Data Tx: 400 KiB + Link 11: Data Rx: 400 KiB +GPU 6: NVIDIA A100-SXM4-40GB (UUID: GPU-53bbb70c-10a4-f0b3-9e6a-0bfc103ed298) + Link 0: Data Tx: 750 KiB + Link 0: Data Rx: 750 KiB + Link 1: Data Tx: 750 KiB + Link 1: Data Rx: 750 KiB + Link 2: Data Tx: 750 KiB + Link 2: Data Rx: 750 KiB + Link 3: Data Tx: 750 KiB + Link 3: Data Rx: 750 KiB + Link 4: Data Tx: 750 KiB + Link 4: Data Rx: 750 KiB + Link 5: Data Tx: 750 KiB + Link 5: Data Rx: 750 KiB + Link 6: Data Tx: 750 KiB + Link 6: Data Rx: 750 KiB + Link 7: Data Tx: 750 KiB + Link 7: Data Rx: 750 KiB + Link 8: Data Tx: 750 KiB + Link 8: Data Rx: 750 KiB + Link 9: Data Tx: 750 KiB + Link 9: Data Rx: 750 KiB + Link 10: Data Tx: 750 KiB + Link 10: Data Rx: 750 KiB + Link 11: Data Tx: 750 KiB + Link 11: Data Rx: 750 KiB +GPU 7: NVIDIA A100-SXM4-40GB (UUID: GPU-f6babbbb-c44f-416a-79ec-8d28350c2ad2) + Link 0: Data Tx: 600 KiB + Link 0: Data Rx: 600 KiB + Link 1: Data Tx: 600 KiB + Link 1: Data Rx: 600 KiB + Link 2: Data Tx: 600 KiB + Link 2: Data Rx: 600 KiB + Link 3: Data Tx: 600 KiB + Link 3: Data Rx: 600 KiB + Link 4: Data Tx: 600 KiB + Link 4: Data Rx: 600 KiB + Link 5: Data Tx: 600 KiB + Link 5: Data Rx: 600 KiB + Link 6: Data Tx: 600 KiB + Link 6: Data Rx: 600 KiB + Link 7: Data Tx: 600 KiB + Link 7: Data Rx: 600 KiB + Link 8: Data Tx: 600 KiB + Link 8: Data Rx: 600 KiB + Link 9: Data Tx: 600 KiB + Link 9: Data Rx: 600 KiB + Link 10: Data Tx: 600 KiB + Link 10: Data Rx: 600 KiB + Link 11: Data Tx: 600 KiB + Link 11: Data Rx: 600 KiB \ No newline at end of file diff --git a/src/VirtualClient/VirtualClient.Monitors.UnitTests/NvidiaSmiQueryC2CParserUnitTest.cs b/src/VirtualClient/VirtualClient.Monitors.UnitTests/NvidiaSmi/NvidiaSmiQueryC2CParserUnitTest.cs similarity index 93% rename from src/VirtualClient/VirtualClient.Monitors.UnitTests/NvidiaSmiQueryC2CParserUnitTest.cs rename to src/VirtualClient/VirtualClient.Monitors.UnitTests/NvidiaSmi/NvidiaSmiQueryC2CParserUnitTest.cs index 60525fff5d..6e2d78cc45 100644 --- a/src/VirtualClient/VirtualClient.Monitors.UnitTests/NvidiaSmiQueryC2CParserUnitTest.cs +++ b/src/VirtualClient/VirtualClient.Monitors.UnitTests/NvidiaSmi/NvidiaSmiQueryC2CParserUnitTest.cs @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT License. -namespace VirtualClient.Monitors.UnitTests +namespace VirtualClient.Monitors { using NUnit.Framework; using System; @@ -27,8 +27,8 @@ public void NvidiaSmiC2CParserParsesMetricsCorrectly() NvidiaSmiC2CParser testParser = new NvidiaSmiC2CParser(rawText); IList metrics = testParser.Parse(); - Assert.AreEqual(10, metrics.Count); - MetricAssert.Exists(metrics, "GPU 0: C2C Link 0 Speed", 44.712, "GB/s"); + Assert.AreEqual(10, metrics.Count); + MetricAssert.Exists(metrics, "GPU 0: C2C Link 0 Speed", 44.712, "GB/s"); MetricAssert.Exists(metrics, "GPU 0: C2C Link 1 Speed", 44.712, "GB/s"); MetricAssert.Exists(metrics, "GPU 0: C2C Link 2 Speed", 44.712, "GB/s"); MetricAssert.Exists(metrics, "GPU 0: C2C Link 3 Speed", 44.712, "GB/s"); diff --git a/src/VirtualClient/VirtualClient.Monitors.UnitTests/NvidiaSmiQueryGpuParserUnitTests.cs b/src/VirtualClient/VirtualClient.Monitors.UnitTests/NvidiaSmi/NvidiaSmiQueryGpuParserUnitTests.cs similarity index 100% rename from src/VirtualClient/VirtualClient.Monitors.UnitTests/NvidiaSmiQueryGpuParserUnitTests.cs rename to src/VirtualClient/VirtualClient.Monitors.UnitTests/NvidiaSmi/NvidiaSmiQueryGpuParserUnitTests.cs diff --git a/src/VirtualClient/VirtualClient.Monitors.UnitTests/NvidiaSmi/NvidiaSmiQueryNvLinkParserUnitTests.cs b/src/VirtualClient/VirtualClient.Monitors.UnitTests/NvidiaSmi/NvidiaSmiQueryNvLinkParserUnitTests.cs new file mode 100644 index 0000000000..1a2717a130 --- /dev/null +++ b/src/VirtualClient/VirtualClient.Monitors.UnitTests/NvidiaSmi/NvidiaSmiQueryNvLinkParserUnitTests.cs @@ -0,0 +1,41 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +namespace VirtualClient.Monitors +{ + using NUnit.Framework; + using System; + using System.Collections.Generic; + using System.IO; + using System.Linq; + using System.Reflection; + using System.Text; + using System.Threading.Tasks; + using VirtualClient.Contracts; + + [TestFixture] + [Category("Unit")] + public class NvidiaSmiQueryNvLinkParserUnitTest + { + [Test] + public void NvidiaSmiNvLinkParserParsesMetricsCorrectly() + { + string workingDirectory = Path.GetDirectoryName(Assembly.GetExecutingAssembly().Location); + string outputPath = Path.Combine(workingDirectory, "Examples", "nvidia-smi", "query-nvlink.txt"); + string rawText = File.ReadAllText(outputPath); + + NvidiaSmiQueryNvLinkParser testParser = new NvidiaSmiQueryNvLinkParser(rawText); + IList metrics = testParser.Parse(); + + Assert.AreEqual(192, metrics.Count); + MetricAssert.Exists(metrics, "GPU 0: NvLink Rx 0 Throughput", 200, "KiB"); + MetricAssert.Exists(metrics, "GPU 1: NvLink Tx 11 Throughput", 800, "KiB"); + MetricAssert.Exists(metrics, "GPU 2: NvLink Rx 9 Throughput", 500, "KiB"); + MetricAssert.Exists(metrics, "GPU 3: NvLink Tx 5 Throughput", 1200, "KiB"); + MetricAssert.Exists(metrics, "GPU 4: NvLink Rx 1 Throughput", 2000, "KiB"); + MetricAssert.Exists(metrics, "GPU 5: NvLink Tx 3 Throughput", 400, "KiB"); + MetricAssert.Exists(metrics, "GPU 6: NvLink Rx 2 Throughput", 750, "KiB"); + MetricAssert.Exists(metrics, "GPU 7: NvLink Tx 10 Throughput", 600, "KiB"); + } + } +} diff --git a/src/VirtualClient/VirtualClient.Monitors/Nvidia-Smi/NvidiaSmiMonitor.cs b/src/VirtualClient/VirtualClient.Monitors/Nvidia-Smi/NvidiaSmiMonitor.cs index 04741a6caa..c900ba8884 100644 --- a/src/VirtualClient/VirtualClient.Monitors/Nvidia-Smi/NvidiaSmiMonitor.cs +++ b/src/VirtualClient/VirtualClient.Monitors/Nvidia-Smi/NvidiaSmiMonitor.cs @@ -22,6 +22,8 @@ namespace VirtualClient.Monitors /// public class NvidiaSmiMonitor : VirtualClientIntervalBasedMonitor { + private const string NvidiaSmiCommand = "nvidia-smi"; + /// /// Initializes a new instance of the class. /// @@ -43,10 +45,26 @@ protected override async Task ExecuteAsync(EventContext telemetryContext, Cancel break; case PlatformID.Unix: - await this.QueryC2CAsync(telemetryContext, cancellationToken) - .ConfigureAwait(false); - await this.QueryGpuAsync(telemetryContext, cancellationToken) + + await Task.Delay(this.MonitorWarmupPeriod, cancellationToken) .ConfigureAwait(false); + + while (!cancellationToken.IsCancellationRequested) + { + DateTime nextIteration = DateTime.UtcNow; + await this.WaitAsync(nextIteration, cancellationToken); + nextIteration = DateTime.UtcNow.Add(this.MonitorFrequency); + + await this.QueryC2CAsync(telemetryContext, cancellationToken) + .ConfigureAwait(false); + + await this.QueryNvLinkBandwidthAsync(telemetryContext, cancellationToken) + .ConfigureAwait(false); + + await this.QueryGpuAsync(telemetryContext, cancellationToken) + .ConfigureAwait(false); + } + break; } } @@ -64,21 +82,53 @@ protected override void Validate() } } - private async Task QueryC2CAsync(EventContext telemetryContext, CancellationToken cancellationToken) + private async Task QueryNvLinkBandwidthAsync(EventContext telemetryContext, CancellationToken cancellationToken) { - ISystemManagement systemManagement = this.Dependencies.GetService(); + string arguments = "nvlink -gt d"; + + try + { + DateTime startTime = DateTime.UtcNow; + IProcessProxy process = await this.ExecuteCommandAsync(NvidiaSmiCommand, arguments, Environment.CurrentDirectory, telemetryContext, cancellationToken, runElevated: true); + DateTime endTime = DateTime.UtcNow; + + if (!cancellationToken.IsCancellationRequested) + { + await this.LogProcessDetailsAsync(process, telemetryContext, "Nvidia-Smi-NvLink", logToFile: true); + process.ThrowIfErrored(errorReason: ErrorReason.MonitorFailed); + + if (process.StandardOutput.Length > 0) + { + NvidiaSmiQueryNvLinkParser parser = new NvidiaSmiQueryNvLinkParser(process.StandardOutput.ToString()); + IList metrics = parser.Parse(); + + if (metrics?.Any() == true) + { + this.Logger.LogPerformanceCounters("nvidia", metrics, startTime, endTime, telemetryContext); + } + } + } + } + catch (OperationCanceledException) + { + // Expected whenever ctrl-C is used. + } + catch (Exception exc) + { + // This would be expected on new VM while nvidia-smi is still being installed. + this.Logger.LogErrorMessage(exc, telemetryContext, LogLevel.Warning); + } + } + private async Task QueryC2CAsync(EventContext telemetryContext, CancellationToken cancellationToken) + { // This is the Nvidia smi c2c command - string command = "nvidia-smi"; string c2cCommandArguments = "c2c -s"; - await Task.Delay(this.MonitorWarmupPeriod, cancellationToken) - .ConfigureAwait(false); - try { DateTime startTime = DateTime.UtcNow; - IProcessProxy process = await this.ExecuteCommandAsync(command, c2cCommandArguments, Environment.CurrentDirectory, telemetryContext, cancellationToken, runElevated: true); + IProcessProxy process = await this.ExecuteCommandAsync(NvidiaSmiCommand, c2cCommandArguments, Environment.CurrentDirectory, telemetryContext, cancellationToken, runElevated: true); DateTime endTime = DateTime.UtcNow; if (!cancellationToken.IsCancellationRequested) @@ -109,7 +159,7 @@ await Task.Delay(this.MonitorWarmupPeriod, cancellationToken) this.Logger.LogErrorMessage(exc, telemetryContext, LogLevel.Warning); } } - + private async Task QueryGpuAsync(EventContext telemetryContext, CancellationToken cancellationToken) { ISystemManagement systemManagement = this.Dependencies.GetService(); @@ -125,8 +175,7 @@ private async Task QueryGpuAsync(EventContext telemetryContext, CancellationToke // ecc.errors.uncorrected.volatile.total,ecc.errors.uncorrected.aggregate.device_memory,ecc.errors.uncorrected.aggregate.dram,ecc.errors.uncorrected.aggregate.sram, // ecc.errors.uncorrected.aggregate.total // --format=csv,nounits - int totalSamples = (int)this.MonitorFrequency.TotalSeconds; - string command = "nvidia-smi"; + string commandArguments = "--query-gpu=timestamp,name,pci.bus_id,driver_version,pstate,pcie.link.gen.max,pcie.link.gen.current,utilization.gpu,utilization.memory,temperature.gpu,temperature.memory," + "power.draw.average,clocks.gr,clocks.sm,clocks.video,clocks.mem,memory.total,memory.free,memory.used,power.draw.instant,pcie.link.gen.gpucurrent," + "pcie.link.width.current,ecc.errors.corrected.volatile.device_memory,ecc.errors.corrected.volatile.dram,ecc.errors.corrected.volatile.sram," + @@ -136,56 +185,45 @@ private async Task QueryGpuAsync(EventContext telemetryContext, CancellationToke "ecc.errors.uncorrected.aggregate.total " + "--format=csv,nounits"; - await Task.Delay(this.MonitorWarmupPeriod, cancellationToken) - .ConfigureAwait(false); - - DateTime nextIteration = DateTime.UtcNow; - - while (!cancellationToken.IsCancellationRequested) + try { - try + using (IProcessProxy process = systemManagement.ProcessManager.CreateElevatedProcess(this.Platform, NvidiaSmiCommand, $"{commandArguments}", Environment.CurrentDirectory)) { - await this.WaitAsync(nextIteration, cancellationToken); - nextIteration = DateTime.UtcNow.Add(this.MonitorFrequency); + this.CleanupTasks.Add(() => process.SafeKill()); - using (IProcessProxy process = systemManagement.ProcessManager.CreateElevatedProcess(this.Platform, command, $"{commandArguments}", Environment.CurrentDirectory)) - { - this.CleanupTasks.Add(() => process.SafeKill()); + DateTime startTime = DateTime.UtcNow; + await process.StartAndWaitAsync(cancellationToken) + .ConfigureAwait(false); - DateTime startTime = DateTime.UtcNow; - await process.StartAndWaitAsync(cancellationToken) - .ConfigureAwait(false); + DateTime endTime = DateTime.UtcNow; - DateTime endTime = DateTime.UtcNow; + if (!cancellationToken.IsCancellationRequested) + { + // We cannot log the process details here. The output is too large. + await this.LogProcessDetailsAsync(process, telemetryContext, "Nvidia-Smi-gpu", logToFile: true); + process.ThrowIfErrored(errorReason: ErrorReason.MonitorFailed); - if (!cancellationToken.IsCancellationRequested) + if (process.StandardOutput.Length > 0) { - // We cannot log the process details here. The output is too large. - await this.LogProcessDetailsAsync(process, telemetryContext, "Nvidia-Smi-gpu", logToFile: true); - process.ThrowIfErrored(errorReason: ErrorReason.MonitorFailed); + NvidiaSmiQueryGpuParser parser = new NvidiaSmiQueryGpuParser(process.StandardOutput.ToString()); + IList metrics = parser.Parse(); - if (process.StandardOutput.Length > 0) + if (metrics?.Any() == true) { - NvidiaSmiQueryGpuParser parser = new NvidiaSmiQueryGpuParser(process.StandardOutput.ToString()); - IList metrics = parser.Parse(); - - if (metrics?.Any() == true) - { - this.Logger.LogPerformanceCounters("nvidia", metrics, startTime, endTime, telemetryContext); - } + this.Logger.LogPerformanceCounters("nvidia", metrics, startTime, endTime, telemetryContext); } } } } - catch (OperationCanceledException) - { - // Expected whenever ctrl-C is used. - } - catch (Exception exc) - { - // This would be expected on new VM while nvidia-smi is still being installed. - this.Logger.LogErrorMessage(exc, telemetryContext, LogLevel.Warning); - } + } + catch (OperationCanceledException) + { + // Expected whenever ctrl-C is used. + } + catch (Exception exc) + { + // This would be expected on new VM while nvidia-smi is still being installed. + this.Logger.LogErrorMessage(exc, telemetryContext, LogLevel.Warning); } } } diff --git a/src/VirtualClient/VirtualClient.Monitors/Nvidia-Smi/NvidiaSmiQueryNvLinkParser.cs b/src/VirtualClient/VirtualClient.Monitors/Nvidia-Smi/NvidiaSmiQueryNvLinkParser.cs new file mode 100644 index 0000000000..1e94da08d1 --- /dev/null +++ b/src/VirtualClient/VirtualClient.Monitors/Nvidia-Smi/NvidiaSmiQueryNvLinkParser.cs @@ -0,0 +1,83 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +namespace VirtualClient.Monitors +{ + using System; + using System.Collections.Generic; + using System.ComponentModel.DataAnnotations; + using System.Data; + using System.Text.RegularExpressions; + using VirtualClient.Contracts; + using DataTableExtensions = VirtualClient.Contracts.DataTableExtensions; + + /// + /// Parser for NvidiaSmi output document. + /// + public class NvidiaSmiQueryNvLinkParser : MetricsParser + { + private static readonly Regex GpuInfoExpression = new Regex(@"GPU (?\d+): (?.+) \(UUID: (?.+)\)", RegexOptions.Compiled); + private static readonly Regex NvLinkTxExpression = new Regex(@"Link (?\d+): Data Tx: (?[\d.]+) KiB", RegexOptions.Compiled); + private static readonly Regex NvLinkRxExpression = new Regex(@"Link (?\d+): Data Rx: (?[\d.]+) KiB", RegexOptions.Compiled); + + /// + /// Constructor for + /// + /// Raw text to parse. + public NvidiaSmiQueryNvLinkParser(string rawText) + : base(rawText) + { + } + + /// + public override IList Parse() + { + List metrics = new List(); + string[] lines = this.RawText.Split(new[] { '\r', '\n' }, StringSplitOptions.RemoveEmptyEntries); + + string gpuName = string.Empty; + string gpuUuid = string.Empty; + int gpuNumber = -1; + + foreach (string line in lines) + { + Match gpuMatch = GpuInfoExpression.Match(line); + if (gpuMatch.Success) + { + gpuNumber = int.Parse(gpuMatch.Groups["GPU"].Value.Trim()); + gpuName = gpuMatch.Groups["Name"].Value.Trim(); + gpuUuid = gpuMatch.Groups["UUID"].Value.Trim(); + continue; + } + + Match nvLinkTxMatch = NvLinkTxExpression.Match(line); + if (nvLinkTxMatch.Success) + { + int linkNumber = int.Parse(nvLinkTxMatch.Groups["LinkNumber"].Value.Trim()); + double linkSpeed = double.Parse(nvLinkTxMatch.Groups["Throughput"].Value.Trim()); + IDictionary metadata = new Dictionary + { + { "GPU Name", gpuName }, + { "GPU UUID", gpuUuid } + }; + metrics.Add(new Metric($"GPU {gpuNumber}: NvLink Tx {linkNumber} Throughput", linkSpeed, unit: "KiB", description: "Nvidia-smi nvlink", metadata: metadata)); + } + + Match nvLinkRxMatch = NvLinkRxExpression.Match(line); + if (nvLinkRxMatch.Success) + { + int linkNumber = int.Parse(nvLinkRxMatch.Groups["LinkNumber"].Value.Trim()); + double linkSpeed = double.Parse(nvLinkRxMatch.Groups["Throughput"].Value.Trim()); + IDictionary metadata = new Dictionary + { + { "GPU Name", gpuName }, + { "GPU UUID", gpuUuid } + }; + metrics.Add(new Metric($"GPU {gpuNumber}: NvLink Rx {linkNumber} Throughput", linkSpeed, unit: "KiB", description: "Nvidia-smi nvlink", metadata: metadata)); + } + } + + return metrics; + } + } +} From dab8cbfad6b8b4aba2ebd88e2c42bb002c31ccfa Mon Sep 17 00:00:00 2001 From: Erica Vellanoweth Date: Tue, 22 Oct 2024 10:38:34 -0700 Subject: [PATCH 2/3] making sure dec values work --- .../Examples/nvidia-smi/query-nvlink.txt | 2 +- .../NvidiaSmi/NvidiaSmiQueryNvLinkParserUnitTests.cs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/VirtualClient/VirtualClient.Monitors.UnitTests/Examples/nvidia-smi/query-nvlink.txt b/src/VirtualClient/VirtualClient.Monitors.UnitTests/Examples/nvidia-smi/query-nvlink.txt index e4f24736a9..8152c7c72e 100644 --- a/src/VirtualClient/VirtualClient.Monitors.UnitTests/Examples/nvidia-smi/query-nvlink.txt +++ b/src/VirtualClient/VirtualClient.Monitors.UnitTests/Examples/nvidia-smi/query-nvlink.txt @@ -1,6 +1,6 @@ GPU 0: NVIDIA A100-SXM4-40GB (UUID: GPU-547e49e9-c77f-15a6-b5da-15b9eb0207d3) Link 0: Data Tx: 1040 KiB - Link 0: Data Rx: 200 KiB + Link 0: Data Rx: 200.3 KiB Link 1: Data Tx: 1500 KiB Link 1: Data Rx: 1500 KiB Link 2: Data Tx: 1500 KiB diff --git a/src/VirtualClient/VirtualClient.Monitors.UnitTests/NvidiaSmi/NvidiaSmiQueryNvLinkParserUnitTests.cs b/src/VirtualClient/VirtualClient.Monitors.UnitTests/NvidiaSmi/NvidiaSmiQueryNvLinkParserUnitTests.cs index 1a2717a130..018a066fea 100644 --- a/src/VirtualClient/VirtualClient.Monitors.UnitTests/NvidiaSmi/NvidiaSmiQueryNvLinkParserUnitTests.cs +++ b/src/VirtualClient/VirtualClient.Monitors.UnitTests/NvidiaSmi/NvidiaSmiQueryNvLinkParserUnitTests.cs @@ -28,7 +28,7 @@ public void NvidiaSmiNvLinkParserParsesMetricsCorrectly() IList metrics = testParser.Parse(); Assert.AreEqual(192, metrics.Count); - MetricAssert.Exists(metrics, "GPU 0: NvLink Rx 0 Throughput", 200, "KiB"); + MetricAssert.Exists(metrics, "GPU 0: NvLink Rx 0 Throughput", 200.3, "KiB"); MetricAssert.Exists(metrics, "GPU 1: NvLink Tx 11 Throughput", 800, "KiB"); MetricAssert.Exists(metrics, "GPU 2: NvLink Rx 9 Throughput", 500, "KiB"); MetricAssert.Exists(metrics, "GPU 3: NvLink Tx 5 Throughput", 1200, "KiB"); From 22d6e34c7977e16dc732cef4f290879c73124a34 Mon Sep 17 00:00:00 2001 From: Erica Vellanoweth Date: Tue, 29 Oct 2024 12:56:52 -0700 Subject: [PATCH 3/3] changing file encoding --- .../Nvidia-Smi/NvidiaSmiQueryNvLinkParser.cs | Bin 3701 -> 7564 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/src/VirtualClient/VirtualClient.Monitors/Nvidia-Smi/NvidiaSmiQueryNvLinkParser.cs b/src/VirtualClient/VirtualClient.Monitors/Nvidia-Smi/NvidiaSmiQueryNvLinkParser.cs index 1e94da08d1be0347ab11777fd0d4b68bf3935ad1..6af058029fc9a2ad1cac1887fc86f2b4ecee5bf1 100644 GIT binary patch literal 7564 zcmeI1?Qa`J5XSd2692==Lc$UBBECW}Z;h%55=xu!CFO&YIKiszM1G;QBLC~a^V{*n zcjxulR;-AMs*3M!Z+GUI*_mf%umAkLANE5QehRnYE=dkE!>F+weWByPp_-$*XtGhwS!>8dB$wT5VaaBj7qtFRg8ky++O7}zEkENp%Wgmsp zrP(G2Sp!M$YvxG0+Cssa-z9k{2@{RsTFJkOM!TAQ zqx;vgc%-=<>G)=&S$XtSPdYn7<93gv`9;Klb$DwVY5nNO!Q2Kcz)L;Jd>(z<4nGL* zK;z^1Y`z|Ctiz>jis@9^-%D2>Ug_>e9_vZ(FdS<1PFHpVAF|_hDC+i!_+iJ09{gA* z$MK(P?nU%GNZ2*>JB}=Ts~z1==fcDFvdweEqel@jPLv-A!9>y1j~<$aRw;^n*30;o zJRtV3WjoiC@R!hE>94Iw?CYB8%|w>n#otWtZsi^1dwi!c?AM||8h#@#GySof>|hGP zZ<-6rX+%d355n6}#{ace9%~l6hs;sweG%b($s6uFVyG+I;SKVc3^Wb_AMw?WmJft< zsu7DqVlq>fVa<_H;l)%$S{#r~?@DW%;rr0g`j_EM90FF=IXUoVn`xtvcISoqQf@x8ZeTI%hnjb@UGwBG0eI*^LAbu4kc! z*tg8JRV)XZQOC{8^XV|CM}6V*z5{%Wy;DrXKw~|(#7K2kw|X`S*qDo5_UF$? zOk*uG^Kv~{?}uh&Px#b2nljeCB>%eQMA?Nb0dKQnWqM_r&u39feH4L@jt`{cd0ktz zgJrMWipf~VO8TNq3=9_$We24?)Y7{}rD}>-Q<%~=xKCC~rW!0NUb)JwS_OtQx4S;e z72CaYabXtgG}VjxOh-efjJ0%Q#W>8>Gu*}5W9^>RJCks)RiBGphx(-waX-*+7H0AJ z_1q33W+9_mrKq}nMK#_(jP+`)YJ4b3<^>SCbtylmEX<^l-Sz5f78ZSw7j$jzIhT7T z*+j`>ejF*?msLBH`Ye1k-_ts)jq&JYF13Hu(4v)#UCxAep!dCaHJ{eWt&d*! z`LXnl=bFuq;s{Z-zK`N8X^%?L;k=fs-6PXgWu$Qa7{OSFPGP z+u6Lt5zF{qC~xRh?u71KY;QecL%Esjbis*}4mnpnfRXyLaql4y9KpJ)BeWjV;%Yq- z&zgEH^;uIzW?IX6vmS|ey}LwmDX)`x>=GW{%#PKyr)udUnyK#P?zs4l?AWAx-c5e2 zM)DMy;#_C!^Hz3O*)Ekcb(pK!LuKRyS(IEKHJdVw?=NR$JKxe9xFyszI!&(y#aU$r z*v6VvH@;hII9+n4_VXeg=`?<2u&PVVrludo?@(Z;H)6JZ8KDQVPK7fx{N2_FG3_Vf z$I7p(PQE98-Hx{xtFi~)B^Qk)?NnW5PCA-x5T$i2_zd4xIng-ASFXFxHe@cBQ|>Es zee1i@Kkw+uUs{^svw~;t6Ftf-o*hpL{AQ=|EZjfjMlly>vNs*ocw#x5809-IT|DQ9 zMkt=!>CZ=sUMl0_n}^K_UdQb=2?C>Wy0x9XBsol^Nj#`DBg~8v8-|~(hd$E2 z!jd!ku`H)?7hCjFA8eT-KXOQZ*9uj`BvX~=6fYZG!RE93tJ?#%2mxuwTd1Uf5 zWqLiXB%hcznD9rX;rdy`g3VI_Dz{l~Ay!eIQj#m}(Vzj4N1NJZQbEn|tfYSH&sh z5~q$uYV91X2tqw`twqdXawUu%dCrW%ktV1e*WiTOP)$9}3X1H$3U<1i%>1ct?;#PA z(?GFHVM0up8(Q-$5jH@Yp~41E?(|6?zvT&R`!IL;a2h3pW<15%FVf7eFK~hhZqQpJ zQAHA;vYfby?z|v$HZ||+$N*JuauNTns*slSJjGD+{f~QYajbC`EukzT9TtT2hpsb9 zZkt&p$&^_%MSTBo+8#xrdlL}qUV6!1JVH=fhx0s&aQ-)al2+R@ZEer8F|!-W=U&^r)EiRTmF!k~iefiw$3_Vhu^q9QE}8m>oSM7ZIE%z4|x z4pF?jvH;R;lEtd5LVNnYU4{n6Kdl9I8J4)1z2Eogk z9d2iOy8*YWUAy&NgHuCp`v+lm3|wzH1J`}qgZkTcLlHzV55tdfOw$bfNDY2)k!-My z{pi>Z@%BPh!`%#?U!VaP97k11|q=mappv