From 62349a65598da0b372ff8bf202e82fb1a1a8f3cb Mon Sep 17 00:00:00 2001
From: Alex Auvolat
Date: Fri, 1 May 2026 19:30:27 +0200
Subject: [PATCH] admin api: return full layout computation statistics as json
 (fix #1428)

---
 Cargo.lock                      |   1 +
 src/api/admin/api.rs            |   8 +
 src/api/admin/layout.rs         |  18 +-
 src/garage/cli/remote/layout.rs |   1 +
 src/garage/server.rs            |   4 +-
 src/rpc/Cargo.toml              |   1 +
 src/rpc/layout/history.rs       |   7 +-
 src/rpc/layout/mod.rs           |   4 -
 src/rpc/layout/test.rs          |  20 +-
 src/rpc/layout/version.rs       | 349 ++++++++++++++++++++++----------
 10 files changed, 284 insertions(+), 129 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index bf4e80a5..a8621e8d 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1782,6 +1782,7 @@ dependencies = [
  "thiserror 2.0.18",
  "tokio",
  "tracing",
+ "utoipa",
 ]
 
 [[package]]
diff --git a/src/api/admin/api.rs b/src/api/admin/api.rs
index bcfc3634..124702ce 100644
--- a/src/api/admin/api.rs
+++ b/src/api/admin/api.rs
@@ -618,6 +618,10 @@ pub enum PreviewClusterLayoutChangesResponse {
 		/// Plain-text information about the layout computation
 		/// (do not try to parse this)
 		message: Vec<String>,
+		/// Structured statistics about the layout computation
+		// FIXME for v3: remove default and skip_serializing_if
+		#[serde(default, skip_serializing_if = "Option::is_none")]
+		statistics: Option<Box<ComputationStat>>,
 		/// Details about the new cluster layout
 		new_layout: GetClusterLayoutResponse,
 	},
@@ -639,6 +643,10 @@ pub struct ApplyClusterLayoutResponse {
 	/// Plain-text information about the layout computation
 	/// (do not try to parse this)
 	pub message: Vec<String>,
+	/// Structured statistics about the layout computation
+	// FIXME for v3: remove default and skip_serializing_if
+	#[serde(default, skip_serializing_if = "Option::is_none")]
+	pub statistics: Option<ComputationStat>,
 	/// Details about the new cluster layout
 	pub layout: GetClusterLayoutResponse,
 }
diff --git a/src/api/admin/layout.rs b/src/api/admin/layout.rs
index 998b61a2..62ec1899 100644
--- a/src/api/admin/layout.rs
+++ b/src/api/admin/layout.rs
@@ -255,10 +255,14 @@ impl RequestHandler for PreviewClusterLayoutChangesRequest {
 				Ok(PreviewClusterLayoutChangesResponse::Error { error })
 			}
 			Err(e) => Err(e.into()),
-			Ok((new_layout, msg)) => Ok(PreviewClusterLayoutChangesResponse::Success {
-				message: msg,
-				new_layout: format_cluster_layout(&new_layout),
-			}),
+			Ok((new_layout, stat)) => {
+				let message = stat.to_message();
+				Ok(PreviewClusterLayoutChangesResponse::Success {
+					message,
+					statistics: Some(Box::new(stat)),
+					new_layout: format_cluster_layout(&new_layout),
+				})
+			}
 		}
 	}
 }
@@ -272,7 +276,8 @@ impl RequestHandler for ApplyClusterLayoutRequest {
 		_admin: &Admin,
 	) -> Result<ApplyClusterLayoutResponse, Error> {
 		let layout = garage.system.cluster_layout().inner().clone();
-		let (layout, msg) = layout.apply_staged_changes(self.version)?;
+		let (layout, stat) = layout.apply_staged_changes(self.version)?;
+		let message = stat.to_message();
 
 		garage
 			.system
@@ -281,7 +286,8 @@ impl RequestHandler for ApplyClusterLayoutRequest {
 			.await?;
 
 		Ok(ApplyClusterLayoutResponse {
-			message: msg,
+			message,
+			statistics: Some(stat),
 			layout: format_cluster_layout(&layout),
 		})
 	}
diff --git a/src/garage/cli/remote/layout.rs b/src/garage/cli/remote/layout.rs
index 10be1029..326e2239 100644
--- a/src/garage/cli/remote/layout.rs
+++ b/src/garage/cli/remote/layout.rs
@@ -40,6 +40,7 @@ impl Cli {
 			PreviewClusterLayoutChangesResponse::Success {
 				message,
 				new_layout,
+				..
 			} => {
 				println!();
 				println!("==== NEW CLUSTER LAYOUT AFTER APPLYING CHANGES ====");
diff --git a/src/garage/server.rs b/src/garage/server.rs
index d7aae075..53b65fbf 100644
--- a/src/garage/server.rs
+++ b/src/garage/server.rs
@@ -228,10 +228,10 @@ async fn initial_config(garage: &Arc<Garage>, opt: ServerOpt) -> Result<(), Erro
 		})),
 	);
 
-	let (layout, msg) = layout.apply_staged_changes(1)?;
+	let (layout, stat) = layout.apply_staged_changes(1)?;
 	info!(
 		"Created initial layout for single-node configuration:\n{}",
-		msg.join("\n")
+		stat.to_message().join("\n")
 	);
 
 	garage
diff --git a/src/rpc/Cargo.toml b/src/rpc/Cargo.toml
index 7f4882ce..20c71fdc 100644
--- a/src/rpc/Cargo.toml
+++ b/src/rpc/Cargo.toml
@@ -33,6 +33,7 @@ async-trait.workspace = true
 serde.workspace = true
 serde_bytes.workspace = true
 serde_json.workspace = true
+utoipa.workspace = true
 
 thiserror = { workspace = true, optional = true } # newer version requires rust edition 2021
diff --git a/src/rpc/layout/history.rs b/src/rpc/layout/history.rs
index b4659543..8969ddc7 100644
--- a/src/rpc/layout/history.rs
+++ b/src/rpc/layout/history.rs
@@ -6,6 +6,7 @@ use garage_util::encode::nonversioned_encode;
 use garage_util::error::*;
 
 use super::*;
+use crate::layout::ComputationStat;
 use crate::replication_mode::*;
 
 impl LayoutHistory {
@@ -269,13 +270,13 @@ impl LayoutHistory {
 		changed
 	}
 
-	pub fn apply_staged_changes(mut self, version: u64) -> Result<(Self, Message), Error> {
+	pub fn apply_staged_changes(mut self, version: u64) -> Result<(Self, ComputationStat), Error> {
 		if version != self.current().version + 1 {
 			return Err(Error::Message("Invalid new layout version".into()));
 		}
 
 		// Compute new version and add it to history
-		let (new_version, msg) = self
+		let (new_version, stat) = self
 			.current()
 			.clone()
 			.calculate_next_version(self.staging.get())?;
@@ -289,7 +290,7 @@ impl LayoutHistory {
 			roles: LwwMap::new(),
 		});
 
-		Ok((self, msg))
+		Ok((self, stat))
 	}
 
 	pub fn revert_staged_changes(mut self) -> Result<Self, Error> {
diff --git a/src/rpc/layout/mod.rs b/src/rpc/layout/mod.rs
index e565d13e..5b1aa55d 100644
--- a/src/rpc/layout/mod.rs
+++ b/src/rpc/layout/mod.rs
@@ -389,10 +389,6 @@ impl NodeRole {
 			None => "gateway".to_string(),
 		}
 	}
-
-	pub fn tags_string(&self) -> String {
-		self.tags.join(",")
-	}
 }
 
 impl UpdateTracker {
diff --git a/src/rpc/layout/test.rs b/src/rpc/layout/test.rs
index aa146dfd..3941780d 100644
--- a/src/rpc/layout/test.rs
+++ b/src/rpc/layout/test.rs
@@ -79,8 +79,8 @@ fn check_against_naive(cl: &LayoutVersion) -> bool {
 	false
 }
 
-fn show_msg(msg: &Message) {
-	for s in msg.iter() {
+fn show_stat(stat: &ComputationStat) {
+	for s in stat.to_message().iter() {
 		println!("{}", s);
 	}
 }
@@ -123,8 +123,8 @@ fn test_assignment() {
 	let mut cl = LayoutHistory::new(ReplicationFactor::new(3).unwrap());
 	update_layout(&mut cl, &node_capacity_vec, &node_zone_vec, 3);
 	let v = cl.current().version;
-	let (mut cl, msg) = cl.apply_staged_changes(v + 1).unwrap();
-	show_msg(&msg);
+	let (mut cl, stat) = cl.apply_staged_changes(v + 1).unwrap();
+	show_stat(&stat);
 	assert_eq!(cl.check(), Ok(()));
 	assert!(check_against_naive(cl.current()));
 
 	node_zone_vec = vec!["A", "B", "C", "C", "C", "B", "G", "H", "I"];
 	update_layout(&mut cl, &node_capacity_vec, &node_zone_vec, 2);
 	let v = cl.current().version;
-	let (mut cl, msg) = cl.apply_staged_changes(v + 1).unwrap();
-	show_msg(&msg);
+	let (mut cl, stat) = cl.apply_staged_changes(v + 1).unwrap();
+	show_stat(&stat);
 	assert_eq!(cl.check(), Ok(()));
 	assert!(check_against_naive(cl.current()));
 
 	node_capacity_vec = vec![4000, 1000, 2000, 7000, 1000, 1000, 2000, 10000, 2000];
 	update_layout(&mut cl, &node_capacity_vec, &node_zone_vec, 3);
 	let v = cl.current().version;
-	let (mut cl, msg) = cl.apply_staged_changes(v + 1).unwrap();
-	show_msg(&msg);
+	let (mut cl, stat) = cl.apply_staged_changes(v + 1).unwrap();
+	show_stat(&stat);
 	assert_eq!(cl.check(), Ok(()));
 	assert!(check_against_naive(cl.current()));
 
 	node_capacity_vec = vec![
 	];
 	update_layout(&mut cl, &node_capacity_vec, &node_zone_vec, 1);
 	let v = cl.current().version;
-	let (cl, msg) = cl.apply_staged_changes(v + 1).unwrap();
-	show_msg(&msg);
+	let (cl, stat) = cl.apply_staged_changes(v + 1).unwrap();
+	show_stat(&stat);
 	assert_eq!(cl.check(), Ok(()));
 	assert!(check_against_naive(cl.current()));
 }
diff --git a/src/rpc/layout/version.rs b/src/rpc/layout/version.rs
index 18728e63..2f3a7436 100644
--- a/src/rpc/layout/version.rs
+++ b/src/rpc/layout/version.rs
@@ -4,6 +4,8 @@ use std::convert::TryInto;
 
 use bytesize::ByteSize;
 use itertools::Itertools;
+use serde::{Deserialize, Serialize};
+use utoipa::ToSchema;
 
 use garage_util::crdt::{Crdt, LwwMap};
 use garage_util::data::*;
@@ -13,9 +15,6 @@ use super::graph_algo::*;
 use super::*;
 use crate::replication_mode::*;
 
-// The Message type will be used to collect information on the algorithm.
-pub type Message = Vec<String>;
-
 impl LayoutVersion {
 	pub fn new(replication_factor: ReplicationFactor) -> Self {
 		// We set the default zone redundancy to be Maximum, meaning that the maximum
@@ -291,16 +290,16 @@ impl LayoutVersion {
 	pub(crate) fn calculate_next_version(
 		mut self,
 		staging: &LayoutStaging,
-	) -> Result<(Self, Message), Error> {
+	) -> Result<(Self, ComputationStat), Error> {
 		self.version += 1;
 
 		self.roles.merge(&staging.roles);
 		self.roles.retain(|(_, _, v)| v.0.is_some());
 		self.parameters = *staging.parameters.get();
 
-		let msg = self.calculate_partition_assignment()?;
+		let stat = self.calculate_partition_assignment()?;
 
-		Ok((self, msg))
+		Ok((self, stat))
 	}
 
 	/// This function calculates a new partition-to-node assignment.
@@ -312,21 +311,15 @@ impl LayoutVersion {
 	/// data to be moved.
 	/// Staged role changes must be merged with nodes roles before calling this function,
 	/// hence it must only be called from `apply_staged_changes()` and hence is not public.
-	fn calculate_partition_assignment(&mut self) -> Result<Message, Error> {
+	fn calculate_partition_assignment(&mut self) -> Result<ComputationStat, Error> {
 		// We update the node ids, since the node role list might have changed with the
 		// changes in the layout. We retrieve the old_assignment reframed with new ids
 		let old_assignment_opt = self.update_node_id_vec()?;
 
 		let zone_redundancy = self.effective_zone_redundancy();
 
-		let mut msg = Message::new();
-		msg.push("==== COMPUTATION OF A NEW PARTITION ASSIGNATION ====".into());
-		msg.push("".into());
-		msg.push(format!(
-			"Partitions are \
-		replicated {} times on at least {} distinct zones.",
-			self.replication_factor, zone_redundancy
-		));
+		let stat_replication_factor = self.replication_factor;
+		let stat_effective_zone_redundancy = zone_redundancy;
 
 		// We generate for once numerical ids for the zones of non gateway nodes,
 		// to use them as indices in the flow graphs.
@@ -355,28 +348,17 @@ impl LayoutVersion {
 		// optimality.
 		let partition_size = self.compute_optimal_partition_size(&zone_to_id, zone_redundancy)?;
 
-		msg.push("".into());
-		if old_assignment_opt.is_some() {
-			msg.push(format!(
-				"Optimal partition size: {} ({} in previous layout)",
-				ByteSize::b(partition_size).display().iec(),
-				ByteSize::b(self.partition_size).display().iec()
-			));
+		let stat_partition_size = partition_size;
+		let stat_previous_partition_size = if old_assignment_opt.is_some() {
+			Some(self.partition_size)
 		} else {
-			msg.push(format!(
-				"Optimal partition size: {}",
-				ByteSize::b(partition_size).display().iec()
-			));
-		}
+			None
+		};
+
 		// We write the partition size.
 		self.partition_size = partition_size;
 
-		if partition_size < 100 {
-			msg.push(
-				"WARNING: The partition size is low (< 100), make sure the capacities of your nodes are correct and are of at least a few MB"
-					.into(),
-			);
-		}
+		let stat_low_partition_size = partition_size < 100;
 
 		// We compute a first flow/assignment that is heuristically close to the previous
 		// assignment
@@ -388,18 +370,28 @@ impl LayoutVersion {
 		}
 
 		// We display statistics of the computation
-		msg.extend(self.output_stat(&gflow, &old_assignment_opt, &zone_to_id, &id_to_zone)?);
+		let stat = self.output_stat(
+			&gflow,
+			&old_assignment_opt,
+			&zone_to_id,
+			&id_to_zone,
+			stat_replication_factor,
+			stat_effective_zone_redundancy,
+			stat_partition_size,
+			stat_previous_partition_size,
+			stat_low_partition_size,
+		)?;
 
 		// We update the layout structure
 		self.update_ring_from_flow(id_to_zone.len(), &gflow)?;
 
 		if let Err(e) = self.check() {
 			return Err(Error::Message(
-				format!("Layout check returned an error: {}\nOriginal result of computation: <<<<\n{}\n>>>>", e, msg.join("\n"))
+				format!("Layout check returned an error: {}\nOriginal result of computation: <<<<\n{}\n>>>>", e, stat.to_message().join("\n"))
 			));
 		}
 
-		Ok(msg)
+		Ok(stat)
 	}
 
 	/// The `LwwMap` of node roles might have changed. This function updates the `node_id_vec`
@@ -706,47 +698,26 @@ impl LayoutVersion {
 	/// This function returns a message summing up the partition repartition of the new
 	/// layout, and other statistics of the partition assignment computation.
+	#[allow(clippy::too_many_arguments)]
 	fn output_stat(
 		&self,
 		gflow: &Graph<FlowEdge>,
 		prev_assign_opt: &Option<Vec<Vec<usize>>>,
 		zone_to_id: &HashMap<String, usize>,
 		id_to_zone: &[String],
-	) -> Result<Message, Error> {
-		let mut msg = Message::new();
+		replication_factor: usize,
+		effective_zone_redundancy: usize,
+		partition_size: u64,
+		previous_partition_size: Option<u64>,
+		low_partition_size: bool,
+	) -> Result<ComputationStat, Error> {
+		let usable_capacity =
+			self.partition_size * NB_PARTITIONS as u64 * self.replication_factor as u64;
+		let total_capacity = self.get_total_capacity();
+		let effective_capacity = usable_capacity / replication_factor as u64;
 
-		let used_cap = self.partition_size * NB_PARTITIONS as u64 * self.replication_factor as u64;
-		let total_cap = self.get_total_capacity();
-		let percent_cap = 100.0 * (used_cap as f32) / (total_cap as f32);
-		msg.push(format!(
-			"Usable capacity / total cluster capacity: {} / {} ({:.1} %)",
-			ByteSize::b(used_cap).display().iec(),
-			ByteSize::b(total_cap).display().iec(),
-			percent_cap
-		));
-		msg.push(format!(
-			"Effective capacity (replication factor {}): {}",
-			self.replication_factor,
-			ByteSize::b(used_cap / self.replication_factor as u64)
-				.display()
-				.iec()
-		));
-		if percent_cap < 80. {
-			msg.push("".into());
-			msg.push(
-				"If the percentage is too low, it might be that the \
-		cluster topology and redundancy constraints are forcing the use of nodes/zones with small \
-		storage capacities."
-					.into(),
-			);
-			msg.push(
-				"You might want to move storage capacity between zones or relax the redundancy constraint."
-					.into(),
-			);
-			msg.push(
-				"See the detailed statistics below and look for saturated nodes/zones.".into(),
-			);
-		}
+		let percent_cap = 100.0 * (usable_capacity as f32) / (total_capacity as f32);
+		let low_usable_capacity = percent_cap < 80.;
 
 		// We define and fill in the following tables
 		let storing_nodes = self.nongateway_nodes();
@@ -795,18 +766,13 @@ impl LayoutVersion {
 
 		// We display the statistics
 
-		msg.push("".into());
-		if prev_assign_opt.is_some() {
-			let total_new_partitions: usize = new_partitions.iter().sum();
-			msg.push(format!(
-				"A total of {} new copies of partitions need to be \
-		transferred.",
-				total_new_partitions
-			));
-			msg.push("".into());
-		}
+		let total_moved_partitions = if prev_assign_opt.is_some() {
+			Some(new_partitions.iter().sum())
+		} else {
+			None
+		};
 
-		let mut table = vec![];
+		let mut zones = vec![];
 		for z in 0..id_to_zone.len() {
 			let mut nodes_of_z = Vec::<usize>::new();
 			for n in 0..storing_nodes.len() {
 				if self.expect_get_node_zone(&self.node_id_vec[n]) == id_to_zone[z] {
 					nodes_of_z.push(n);
 				}
 			}
-			let replicated_partitions: usize =
-				nodes_of_z.iter().map(|n| stored_partitions[*n]).sum();
-			table.push(format!(
-				"{}\tTags\tPartitions\tCapacity\tUsable capacity",
-				id_to_zone[z]
-			));
+			let replicated_partitions_z: usize =
+				nodes_of_z.iter().map(|n| stored_partitions[*n]).sum();
 
-			let available_cap_z: u64 = self.partition_size * replicated_partitions as u64;
+			let available_cap_z: u64 = self.partition_size * replicated_partitions_z as u64;
 			let mut total_cap_z = 0;
 			for n in nodes_of_z.iter() {
 				total_cap_z += self.expect_get_node_capacity(&self.node_id_vec[*n]);
 			}
-			let percent_cap_z = 100.0 * (available_cap_z as f32) / (total_cap_z as f32);
 
+			let mut nodes = vec![];
 			for n in nodes_of_z.iter() {
 				let available_cap_n = stored_partitions[*n] as u64 * self.partition_size;
 				let total_cap_n = self.expect_get_node_capacity(&self.node_id_vec[*n]);
-				let tags_n = (self.node_role(&self.node_id_vec[*n]).ok_or(""))?.tags_string();
+				let tags_n = (self.node_role(&self.node_id_vec[*n]).ok_or(""))?
+					.tags
+					.clone();
+				nodes.push(ComputationStatNode {
+					id: hex::encode(self.node_id_vec[*n]),
+					tags: tags_n,
+					stored_partitions: stored_partitions[*n],
+					new_partitions: new_partitions[*n],
+					total_capacity: total_cap_n,
+					usable_capacity: available_cap_n,
+				});
+			}
+
+			zones.push(ComputationStatZone {
+				name: id_to_zone[z].to_string(),
+				nodes,
+				total_replicated_partitions: replicated_partitions_z,
+				unique_partitions: stored_partitions_zone[z],
+				total_capacity: total_cap_z,
+				usable_capacity: available_cap_z,
+			});
+		}
+
+		Ok(ComputationStat {
+			replication_factor,
+			effective_zone_redundancy,
+			partition_size,
+			previous_partition_size,
+			low_partition_size,
+			usable_capacity,
+			total_capacity,
+			effective_capacity,
+			low_usable_capacity,
+			total_moved_partitions,
+			zones,
+		})
+	}
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize, ToSchema)]
+#[serde(rename_all = "camelCase")]
+pub struct ComputationStat {
+	/// The cluster's replication factor
+	pub replication_factor: usize,
+	/// The zone redundancy factor achieved by this layout
+	pub effective_zone_redundancy: usize,
+	/// The size of a partition, in bytes
+	pub partition_size: u64,
+	/// The size of a partition, in bytes, in the previous layout
+	pub previous_partition_size: Option<u64>,
+	/// Warning flag indicating when partitions are very small
+	pub low_partition_size: bool,
+	/// The portion of total raw node capacity that is used by partitions
+	pub usable_capacity: u64,
+	/// The total raw capacity of nodes
+	pub total_capacity: u64,
+	/// The final effective capacity of the cluster, accounting for replication
+	pub effective_capacity: u64,
+	/// Warning flag indicating that the raw node capacity could not be used
+	/// effectively
+	pub low_usable_capacity: bool,
+	/// The total number of partitions that will be moved to a new storage node
+	pub total_moved_partitions: Option<usize>,
+	/// Per-zone storage statistics
+	pub zones: Vec<ComputationStatZone>,
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize, ToSchema)]
+#[serde(rename_all = "camelCase")]
+pub struct ComputationStatZone {
+	/// The name of the zone
+	pub name: String,
+	/// Per-node storage statistics for nodes in this zone
+	pub nodes: Vec<ComputationStatNode>,
+	/// The total number of partition replicas in this zone
+	pub total_replicated_partitions: usize,
+	/// The number of unique partitions that have at least one replica in this zone
+	pub unique_partitions: usize,
+	/// The total raw capacity of nodes in this zone
+	pub total_capacity: u64,
+	/// The used portion of the raw capacity of nodes in this zone
+	pub usable_capacity: u64,
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize, ToSchema)]
+#[serde(rename_all = "camelCase")]
+pub struct ComputationStatNode {
+	/// The node's ID
+	pub id: String,
+	/// The node's tags as defined in the layout
+	pub tags: Vec<String>,
+	/// The number of partitions that are replicated on this node
+	pub stored_partitions: usize,
+	/// The number of partitions that are newly replicated on this node
+	pub new_partitions: usize,
+	/// The node's raw capacity
+	pub total_capacity: u64,
+	/// The portion of the node's raw capacity that is used by partitions it stores
+	pub usable_capacity: u64,
+}
+
+impl ComputationStat {
+	pub fn to_message(&self) -> Vec<String> {
+		let mut msg = Vec::new();
+		msg.push("==== COMPUTATION OF A NEW PARTITION ASSIGNATION ====".into());
+		msg.push("".into());
+		msg.push(format!(
+			"Partitions are \
+		replicated {} times on at least {} distinct zones.",
+			self.replication_factor, self.effective_zone_redundancy
+		));
+
+		msg.push("".into());
+		if let Some(prev) = self.previous_partition_size {
+			msg.push(format!(
+				"Optimal partition size: {} ({} in previous layout)",
+				ByteSize::b(self.partition_size).display().iec(),
+				ByteSize::b(prev).display().iec()
+			));
+		} else {
+			msg.push(format!(
+				"Optimal partition size: {}",
+				ByteSize::b(self.partition_size).display().iec()
+			));
+		}
+
+		if self.low_partition_size {
+			msg.push(
+				"WARNING: The partition size is low (< 100), make sure the capacities of your nodes are correct and are of at least a few MB"
+					.into(),
+			);
+		}
+
+		let percent_cap = 100.0 * (self.usable_capacity as f32) / (self.total_capacity as f32);
+		msg.push(format!(
+			"Usable capacity / total cluster capacity: {} / {} ({:.1} %)",
+			ByteSize::b(self.usable_capacity).display().iec(),
+			ByteSize::b(self.total_capacity).display().iec(),
+			percent_cap
+		));
+		msg.push(format!(
+			"Effective capacity (replication factor {}): {}",
+			self.replication_factor,
+			ByteSize::b(self.effective_capacity).display().iec()
+		));
+
+		if self.low_usable_capacity {
+			msg.push("".into());
+			msg.push(
+				"If the percentage is too low, it might be that the \
+		cluster topology and redundancy constraints are forcing the use of nodes/zones with small \
+		storage capacities."
+					.into(),
+			);
+			msg.push(
+				"You might want to move storage capacity between zones or relax the redundancy constraint."
+					.into(),
+			);
+			msg.push(
+				"See the detailed statistics below and look for saturated nodes/zones.".into(),
+			);
+		}
+
+		msg.push("".into());
+		if let Some(tmp) = self.total_moved_partitions {
+			msg.push(format!(
+				"A total of {} new copies of partitions need to be \
+		transferred.",
+				tmp
+			));
+			msg.push("".into());
+		}
+
+		let mut table = vec![];
+		for z in self.zones.iter() {
+			table.push(format!(
+				"{}\tTags\tPartitions\tCapacity\tUsable capacity",
+				z.name
+			));
+
+			for n in z.nodes.iter() {
 				table.push(format!(
-					"  {:?}\t[{}]\t{} ({} new)\t{}\t{} ({:.1}%)",
-					self.node_id_vec[*n],
-					tags_n,
-					stored_partitions[*n],
-					new_partitions[*n],
-					ByteSize::b(total_cap_n).display().iec(),
-					ByteSize::b(available_cap_n).display().iec(),
-					(available_cap_n as f32) / (total_cap_n as f32) * 100.0,
+					"  {:.16}\t[{}]\t{} ({} new)\t{}\t{} ({:.1}%)",
+					n.id,
+					n.tags.join(","),
+					n.stored_partitions,
+					n.new_partitions,
+					ByteSize::b(n.total_capacity).display().iec(),
+					ByteSize::b(n.usable_capacity).display().iec(),
+					(n.usable_capacity as f32) / (n.total_capacity as f32) * 100.0,
 				));
 			}
 			table.push(format!(
 				"  TOTAL\t\t{} ({} unique)\t{}\t{} ({:.1}%)",
-				replicated_partitions,
-				stored_partitions_zone[z],
-				//new_partitions_zone[z],
-				ByteSize::b(total_cap_z).display().iec(),
-				ByteSize::b(available_cap_z).display().iec(),
-				percent_cap_z
+				z.total_replicated_partitions,
+				z.unique_partitions,
+				ByteSize::b(z.total_capacity).display().iec(),
+				ByteSize::b(z.usable_capacity).display().iec(),
+				(z.usable_capacity as f32) / (z.total_capacity as f32) * 100.0,
 			));
 			table.push("".into());
 		}
 		msg.push(format_table::format_table_to_string(table));
 
-		Ok(msg)
+		msg
 	}
 }
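
Note (editorial addition, not part of the patch): the sketch below illustrates how an Admin API client might consume the new `statistics` field, assuming the camelCase serialization defined by `ComputationStat` above. The `ApplyResponse` and `LayoutStats` struct names and all sample values are hypothetical; only the field names and the optionality of `statistics` follow from the patch itself.

// Editorial sketch, not part of the patch: a client-side mirror of a subset
// of ApplyClusterLayoutResponse / ComputationStat. Struct names and sample
// values are hypothetical; field names follow the camelCase renaming above.
use serde::Deserialize;

#[derive(Debug, Deserialize)]
#[serde(rename_all = "camelCase")]
#[allow(dead_code)]
struct ApplyResponse {
	// Plain-text lines, kept for display only ("do not try to parse this").
	message: Vec<String>,
	// Optional until the FIXME-for-v3 default/skip_serializing_if is removed.
	#[serde(default)]
	statistics: Option<LayoutStats>,
}

#[derive(Debug, Deserialize)]
#[serde(rename_all = "camelCase")]
#[allow(dead_code)]
struct LayoutStats {
	replication_factor: usize,
	effective_zone_redundancy: usize,
	partition_size: u64,
	usable_capacity: u64,
	total_capacity: u64,
	low_usable_capacity: bool,
}

fn main() -> Result<(), serde_json::Error> {
	// Illustrative payload: usable/total = 75%, below the 80% threshold that
	// sets lowUsableCapacity in the patch, and usableCapacity equals
	// partitionSize * 256 partitions * replication factor 3.
	let body = r#"{
		"message": ["==== COMPUTATION OF A NEW PARTITION ASSIGNATION ===="],
		"statistics": {
			"replicationFactor": 3,
			"effectiveZoneRedundancy": 3,
			"partitionSize": 1048576,
			"usableCapacity": 805306368,
			"totalCapacity": 1073741824,
			"lowUsableCapacity": true
		}
	}"#;
	let resp: ApplyResponse = serde_json::from_str(body)?;
	if let Some(stats) = resp.statistics {
		// Structured access replaces parsing the human-readable message lines.
		println!("partition size: {} bytes", stats.partition_size);
	}
	Ok(())
}

Because older servers omit the field entirely (`skip_serializing_if`), clients should keep treating `statistics` as optional until the v3 API makes it mandatory, as the `Option` above models.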