Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add task-dump endpoints to MGS #7272

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 20 additions & 4 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -390,8 +390,8 @@ gateway-client = { path = "clients/gateway-client" }
# is "fine", because SP/MGS communication maintains forwards and backwards
# compatibility, but will mean that faux-mgs might be missing new
# functionality.)
gateway-messages = { git = "https://github.com/oxidecomputer/management-gateway-service", rev = "9bbac475dcaac88286c07a20b6bd3e94fc81d7f0", default-features = false, features = ["std"] }
gateway-sp-comms = { git = "https://github.com/oxidecomputer/management-gateway-service", rev = "9bbac475dcaac88286c07a20b6bd3e94fc81d7f0" }
gateway-messages = { git = "https://github.com/oxidecomputer/management-gateway-service", rev = "97301243f0707708ae9e629e2b4cdea5ae3fd078", default-features = false, features = ["std"] }
gateway-sp-comms = { git = "https://github.com/oxidecomputer/management-gateway-service", rev = "97301243f0707708ae9e629e2b4cdea5ae3fd078" }
gateway-test-utils = { path = "gateway-test-utils" }
gateway-types = { path = "gateway-types" }
gethostname = "0.5.0"
Expand Down
3 changes: 2 additions & 1 deletion dev-tools/omdb/src/bin/omdb/mgs/sensors.rs
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ pub(crate) struct Sensor {
impl Sensor {
fn units(&self) -> &str {
match self.kind {
MeasurementKind::Temperature => "°C",
MeasurementKind::Temperature | MeasurementKind::CpuTctl => "°C",
MeasurementKind::Current | MeasurementKind::InputCurrent => "A",
MeasurementKind::Voltage | MeasurementKind::InputVoltage => "V",
MeasurementKind::Speed => "RPM",
Expand Down Expand Up @@ -150,6 +150,7 @@ impl Sensor {
fn to_kind_string(&self) -> &str {
match self.kind {
MeasurementKind::Temperature => "temp",
MeasurementKind::CpuTctl => "tctl",
MeasurementKind::Power => "power",
MeasurementKind::Current => "current",
MeasurementKind::Voltage => "voltage",
Expand Down
31 changes: 31 additions & 0 deletions gateway-api/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ use gateway_types::{
ignition::{IgnitionCommand, SpIgnitionInfo},
rot::{RotCfpa, RotCfpaSlot, RotCmpa, RotState},
sensor::SpSensorReading,
task_dump::TaskDump,
update::{
HostPhase2Progress, HostPhase2RecoveryImageId, InstallinatorImageId,
SpUpdateStatus,
Expand Down Expand Up @@ -306,6 +307,26 @@ pub trait GatewayApi {
params: TypedBody<GetRotBootInfoParams>,
) -> Result<HttpResponseOk<RotState>, HttpError>;

// Endpoint declaration: `GET /sp/{type}/{slot}/task-dump`.
//
// The path is deserialized into `PathSp`, and the response body is a bare
// `u32` count. Presumably the valid `task_dump_index` values accepted by
// `sp_host_task_dump_get` are `0..count` — TODO confirm against the SP
// protocol.
//
// NOTE(review): the function name says "host", but the path and the doc
// comment describe task dumps held by the SP itself — confirm the name is
// intended before it is baked into the generated client as an operation ID.
/// Get the number of task dumps present on an SP
#[endpoint {
method = GET,
path = "/sp/{type}/{slot}/task-dump",
}]
async fn sp_host_task_dump_count(
rqctx: RequestContext<Self::Context>,
path: Path<PathSp>,
) -> Result<HttpResponseOk<u32>, HttpError>;

// Endpoint declaration: `GET /sp/{type}/{slot}/task-dump/{task_dump_index}`.
//
// The path is deserialized into `PathSpTaskDumpIndex` (SP identifier plus the
// `task_dump_index` segment), and the response body is a `TaskDump`.
//
// NOTE(review): what a caller receives for an out-of-range
// `task_dump_index` is not visible from this declaration — confirm which
// error the implementation surfaces.
/// Read a single task dump from an SP
#[endpoint {
method = GET,
path = "/sp/{type}/{slot}/task-dump/{task_dump_index}",
}]
async fn sp_host_task_dump_get(
rqctx: RequestContext<Self::Context>,
path: Path<PathSpTaskDumpIndex>,
) -> Result<HttpResponseOk<TaskDump>, HttpError>;

/// List SPs via Ignition
///
/// Retrieve information for all SPs via the Ignition controller. This is
Expand Down Expand Up @@ -498,6 +519,16 @@ pub struct PathSpComponent {
pub component: String,
}

// Path parameters for `/sp/{type}/{slot}/task-dump/{task_dump_index}`, used
// by the `sp_host_task_dump_get` endpoint.
#[derive(Deserialize, JsonSchema)]
pub struct PathSpTaskDumpIndex {
/// ID for the SP that the gateway service translates into the appropriate
/// port for communicating with the given SP.
// `flatten` makes serde read the identifier's own fields from the same level
// as `task_dump_index` instead of from a nested `sp` object; presumably those
// fields are the `type` and `slot` path segments — verify against
// `SpIdentifier`.
#[serde(flatten)]
pub sp: SpIdentifier,
/// The index of the task dump to be read.
// This selects which stored dump to read; it is a different value from
// `TaskDump::task_index`, which identifies the crashed task.
pub task_dump_index: u32,
}

#[derive(Deserialize, JsonSchema)]
pub struct ComponentCabooseSlot {
/// The firmware slot for which we want to request caboose information.
Expand Down
2 changes: 2 additions & 0 deletions gateway-types/src/component_details.rs
Original file line number Diff line number Diff line change
Expand Up @@ -374,6 +374,7 @@ pub enum MeasurementKind {
InputCurrent,
InputVoltage,
Speed,
CpuTctl,
}

impl From<gateway_messages::measurement::MeasurementKind> for MeasurementKind {
Expand All @@ -387,6 +388,7 @@ impl From<gateway_messages::measurement::MeasurementKind> for MeasurementKind {
MeasurementKind::InputCurrent => Self::InputCurrent,
MeasurementKind::InputVoltage => Self::InputVoltage,
MeasurementKind::Speed => Self::Speed,
MeasurementKind::CpuTctl => Self::CpuTctl,
}
}
}
Expand Down
1 change: 1 addition & 0 deletions gateway-types/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,5 @@ pub mod host;
pub mod ignition;
pub mod rot;
pub mod sensor;
pub mod task_dump;
pub mod update;
35 changes: 35 additions & 0 deletions gateway-types/src/task_dump.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
use std::collections::BTreeMap;

// API representation of a single task dump read from an SP; this is the
// response body of the `sp_host_task_dump_get` endpoint.
//
// Binary fields are carried as text (`archive_id` as hex, memory contents as
// base64) so that the value serializes cleanly to JSON.
//
// The derived `PartialOrd`/`Ord` compare fields in declaration order, so
// reordering the fields below changes how two dumps compare.
#[derive(
Debug,
Clone,
PartialEq,
Eq,
PartialOrd,
Ord,
Deserialize,
Serialize,
JsonSchema,
)]
pub struct TaskDump {
/// Index of the crashed task.
// Not the same as the `task_dump_index` used in the request path, which
// selects the dump rather than the task.
pub task_index: u16,
/// Timestamp at which the task crash occurred.
// NOTE(review): the unit and epoch of this value are not stated here —
// confirm against the SP protocol before interpreting it.
pub timestamp: u64,
/// Hex-encoded Hubris archive ID.
pub archive_id: String,
/// `BORD` field from the caboose.
pub bord: String,
/// `GITC` field from the caboose.
pub gitc: String,
/// `VERS` field from the caboose, if present.
pub vers: Option<String>,
/// Base64-encoded raw memory read from the SP.
// One entry per memory chunk; each value is that chunk's bytes in base64.
// Presumably each key is the chunk's starting address on the SP — TODO
// confirm against the raw dump type this is built from.
pub base64_memory: BTreeMap<u32, String>,
}
61 changes: 61 additions & 0 deletions gateway/src/http_entrypoints.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ use gateway_types::rot::RotCfpaSlot;
use gateway_types::rot::RotCmpa;
use gateway_types::rot::RotState;
use gateway_types::sensor::SpSensorReading;
use gateway_types::task_dump::TaskDump;
use gateway_types::update::HostPhase2Progress;
use gateway_types::update::HostPhase2RecoveryImageId;
use gateway_types::update::InstallinatorImageId;
Expand Down Expand Up @@ -655,6 +656,66 @@ impl GatewayApi for GatewayImpl {
apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await
}

/// Report how many task dumps the addressed SP says it holds.
///
/// The SP is looked up from the request path; a failure of the count request
/// itself is reported as `SpCommsError::SpCommunicationFailed`. The work runs
/// under the context's latency instrumentation.
async fn sp_host_task_dump_count(
    rqctx: RequestContext<Self::Context>,
    path: Path<PathSp>,
) -> Result<HttpResponseOk<u32>, HttpError> {
    let apictx = rqctx.context();
    let sp_id = path.into_inner().sp.into();

    let handler = async {
        // Look up the SP handle and query it in one chain; either step may
        // bail out early with an error.
        let dump_count = apictx
            .mgmt_switch
            .sp(sp_id)?
            .task_dump_count()
            .await
            .map_err(|err| SpCommsError::SpCommunicationFailed {
                sp: sp_id,
                err,
            })?;

        Ok(HttpResponseOk(dump_count))
    };
    apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await
}

async fn sp_host_task_dump_get(
rqctx: RequestContext<Self::Context>,
path: Path<PathSpTaskDumpIndex>,
) -> Result<HttpResponseOk<TaskDump>, HttpError> {
let apictx = rqctx.context();
let path = path.into_inner();
let task_index = path.task_dump_index;
let sp_id = path.sp.into();

let handler = async {
let sp = apictx.mgmt_switch.sp(sp_id)?;
let raw_dump =
sp.task_dump_read(task_index).await.map_err(|err| {
SpCommsError::SpCommunicationFailed { sp: sp_id, err }
})?;

let archive_id = hex::encode(raw_dump.archive_id);
let base64_memory = raw_dump
.memory
.into_iter()
.map(|(key, mem)| {
let base64_mem =
base64::engine::general_purpose::STANDARD.encode(mem);
(key, base64_mem)
})
.collect();

let dump = TaskDump {
task_index: raw_dump.task_index,
timestamp: raw_dump.timestamp,
archive_id,
bord: raw_dump.bord,
gitc: raw_dump.gitc,
vers: raw_dump.vers,
base64_memory,
};

Ok(HttpResponseOk(dump))
};
apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await
}

async fn ignition_list(
rqctx: RequestContext<Self::Context>,
) -> Result<HttpResponseOk<Vec<SpIgnitionInfo>>, HttpError> {
Expand Down
14 changes: 14 additions & 0 deletions gateway/src/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -804,6 +804,7 @@ impl SpPoller {
MeasurementKind::InputCurrent => "input_current",
MeasurementKind::InputVoltage => "input_voltage",
MeasurementKind::Speed => "fan_speed",
MeasurementKind::CpuTctl => "cpu_tctl",
};
let error = match error {
MeasurementError::InvalidSensor => "invalid_sensor",
Expand Down Expand Up @@ -858,6 +859,10 @@ impl SpPoller {
&metric::AmdCpuTctl { sensor, datum },
)
}
(Ok(datum), MeasurementKind::CpuTctl) => Sample::new(
target,
&metric::AmdCpuTctl { sensor, datum },
),
// Other measurements with the "temperature" measurement
// kind are physical temperatures that actually exist in
// reality (and are always in Celsius).
Expand All @@ -873,6 +878,12 @@ impl SpPoller {
&metric::AmdCpuTctl { sensor, datum: 0.0 },
)
}
(Err(_), MeasurementKind::CpuTctl) => {
Sample::new_missing(
target,
&metric::AmdCpuTctl { sensor, datum: 0.0 },
)
}
(Err(_), MeasurementKind::Temperature) => {
Sample::new_missing(
target,
Expand Down Expand Up @@ -1205,5 +1216,8 @@ fn comms_error_str(error: CommunicationError) -> &'static str {
CommunicationError::BadTrailingDataSize { .. } => {
"bad_trailing_data_size"
}
CommunicationError::BadDecompressionSize { .. } => {
"bad_decompression_size"
}
}
}
1 change: 1 addition & 0 deletions nexus/tests/integration_tests/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -837,6 +837,7 @@ async fn test_mgs_metrics(
temp += 1;
}
}
Kind::CpuTctl => cpu_tctl += 1,
Kind::Current => current += 1,
Kind::Voltage => voltage += 1,
Kind::InputVoltage => input_voltage += 1,
Expand Down
Loading
Loading