admin api: return full layout computation statistics as json (fix #1428)

This commit is contained in:
Alex Auvolat 2026-05-01 19:30:27 +02:00 committed by Alex
parent ada0c8ab70
commit 62349a6559
10 changed files with 284 additions and 129 deletions

1
Cargo.lock generated
View File

@ -1782,6 +1782,7 @@ dependencies = [
"thiserror 2.0.18",
"tokio",
"tracing",
"utoipa",
]
[[package]]

View File

@ -618,6 +618,10 @@ pub enum PreviewClusterLayoutChangesResponse {
/// Plain-text information about the layout computation
/// (do not try to parse this)
message: Vec<String>,
/// Structured statistics about the layout computation
// FIXME for v3: remove default and skip_serializing_if
#[serde(default, skip_serializing_if = "Option::is_none")]
statistics: Option<Box<garage_rpc::layout::ComputationStat>>,
/// Details about the new cluster layout
new_layout: GetClusterLayoutResponse,
},
@ -639,6 +643,10 @@ pub struct ApplyClusterLayoutResponse {
/// Plain-text information about the layout computation
/// (do not try to parse this)
pub message: Vec<String>,
/// Structured statistics about the layout computation
// FIXME for v3: remove default and skip_serializing_if
#[serde(default, skip_serializing_if = "Option::is_none")]
pub statistics: Option<garage_rpc::layout::ComputationStat>,
/// Details about the new cluster layout
pub layout: GetClusterLayoutResponse,
}

View File

@ -255,10 +255,14 @@ impl RequestHandler for PreviewClusterLayoutChangesRequest {
Ok(PreviewClusterLayoutChangesResponse::Error { error })
}
Err(e) => Err(e.into()),
Ok((new_layout, msg)) => Ok(PreviewClusterLayoutChangesResponse::Success {
message: msg,
new_layout: format_cluster_layout(&new_layout),
}),
Ok((new_layout, stat)) => {
let message = stat.to_message();
Ok(PreviewClusterLayoutChangesResponse::Success {
message,
statistics: Some(Box::new(stat)),
new_layout: format_cluster_layout(&new_layout),
})
}
}
}
}
@ -272,7 +276,8 @@ impl RequestHandler for ApplyClusterLayoutRequest {
_admin: &Admin,
) -> Result<ApplyClusterLayoutResponse, Error> {
let layout = garage.system.cluster_layout().inner().clone();
let (layout, msg) = layout.apply_staged_changes(self.version)?;
let (layout, stat) = layout.apply_staged_changes(self.version)?;
let message = stat.to_message();
garage
.system
@ -281,7 +286,8 @@ impl RequestHandler for ApplyClusterLayoutRequest {
.await?;
Ok(ApplyClusterLayoutResponse {
message: msg,
message,
statistics: Some(stat),
layout: format_cluster_layout(&layout),
})
}

View File

@ -40,6 +40,7 @@ impl Cli {
PreviewClusterLayoutChangesResponse::Success {
message,
new_layout,
..
} => {
println!();
println!("==== NEW CLUSTER LAYOUT AFTER APPLYING CHANGES ====");

View File

@ -228,10 +228,10 @@ async fn initial_config(garage: &Arc<Garage>, opt: ServerOpt) -> Result<(), Erro
})),
);
let (layout, msg) = layout.apply_staged_changes(1)?;
let (layout, stat) = layout.apply_staged_changes(1)?;
info!(
"Created initial layout for single-node configuration:\n{}",
msg.join("\n")
stat.to_message().join("\n")
);
garage

View File

@ -33,6 +33,7 @@ async-trait.workspace = true
serde.workspace = true
serde_bytes.workspace = true
serde_json.workspace = true
utoipa.workspace = true
thiserror = { workspace = true, optional = true }
# newer version requires rust edition 2021

View File

@ -6,6 +6,7 @@ use garage_util::encode::nonversioned_encode;
use garage_util::error::*;
use super::*;
use crate::layout::ComputationStat;
use crate::replication_mode::*;
impl LayoutHistory {
@ -269,13 +270,13 @@ impl LayoutHistory {
changed
}
pub fn apply_staged_changes(mut self, version: u64) -> Result<(Self, Message), Error> {
pub fn apply_staged_changes(mut self, version: u64) -> Result<(Self, ComputationStat), Error> {
if version != self.current().version + 1 {
return Err(Error::Message("Invalid new layout version".into()));
}
// Compute new version and add it to history
let (new_version, msg) = self
let (new_version, stat) = self
.current()
.clone()
.calculate_next_version(self.staging.get())?;
@ -289,7 +290,7 @@ impl LayoutHistory {
roles: LwwMap::new(),
});
Ok((self, msg))
Ok((self, stat))
}
pub fn revert_staged_changes(mut self) -> Result<Self, Error> {

View File

@ -389,10 +389,6 @@ impl NodeRole {
None => "gateway".to_string(),
}
}
pub fn tags_string(&self) -> String {
self.tags.join(",")
}
}
impl UpdateTracker {

View File

@ -79,8 +79,8 @@ fn check_against_naive(cl: &LayoutVersion) -> bool {
false
}
fn show_msg(msg: &Message) {
for s in msg.iter() {
fn show_stat(stat: &ComputationStat) {
for s in stat.to_message().iter() {
println!("{}", s);
}
}
@ -123,8 +123,8 @@ fn test_assignment() {
let mut cl = LayoutHistory::new(ReplicationFactor::new(3).unwrap());
update_layout(&mut cl, &node_capacity_vec, &node_zone_vec, 3);
let v = cl.current().version;
let (mut cl, msg) = cl.apply_staged_changes(v + 1).unwrap();
show_msg(&msg);
let (mut cl, stat) = cl.apply_staged_changes(v + 1).unwrap();
show_stat(&stat);
assert_eq!(cl.check(), Ok(()));
assert!(check_against_naive(cl.current()));
@ -132,16 +132,16 @@ fn test_assignment() {
node_zone_vec = vec!["A", "B", "C", "C", "C", "B", "G", "H", "I"];
update_layout(&mut cl, &node_capacity_vec, &node_zone_vec, 2);
let v = cl.current().version;
let (mut cl, msg) = cl.apply_staged_changes(v + 1).unwrap();
show_msg(&msg);
let (mut cl, stat) = cl.apply_staged_changes(v + 1).unwrap();
show_stat(&stat);
assert_eq!(cl.check(), Ok(()));
assert!(check_against_naive(cl.current()));
node_capacity_vec = vec![4000, 1000, 2000, 7000, 1000, 1000, 2000, 10000, 2000];
update_layout(&mut cl, &node_capacity_vec, &node_zone_vec, 3);
let v = cl.current().version;
let (mut cl, msg) = cl.apply_staged_changes(v + 1).unwrap();
show_msg(&msg);
let (mut cl, stat) = cl.apply_staged_changes(v + 1).unwrap();
show_stat(&stat);
assert_eq!(cl.check(), Ok(()));
assert!(check_against_naive(cl.current()));
@ -150,8 +150,8 @@ fn test_assignment() {
];
update_layout(&mut cl, &node_capacity_vec, &node_zone_vec, 1);
let v = cl.current().version;
let (cl, msg) = cl.apply_staged_changes(v + 1).unwrap();
show_msg(&msg);
let (cl, stat) = cl.apply_staged_changes(v + 1).unwrap();
show_stat(&stat);
assert_eq!(cl.check(), Ok(()));
assert!(check_against_naive(cl.current()));
}

View File

@ -4,6 +4,8 @@ use std::convert::TryInto;
use bytesize::ByteSize;
use itertools::Itertools;
use serde::{Deserialize, Serialize};
use utoipa::ToSchema;
use garage_util::crdt::{Crdt, LwwMap};
use garage_util::data::*;
@ -13,9 +15,6 @@ use super::graph_algo::*;
use super::*;
use crate::replication_mode::*;
// The Message type will be used to collect information on the algorithm.
pub type Message = Vec<String>;
impl LayoutVersion {
pub fn new(replication_factor: ReplicationFactor) -> Self {
// We set the default zone redundancy to be Maximum, meaning that the maximum
@ -291,16 +290,16 @@ impl LayoutVersion {
pub(crate) fn calculate_next_version(
mut self,
staging: &LayoutStaging,
) -> Result<(Self, Message), Error> {
) -> Result<(Self, ComputationStat), Error> {
self.version += 1;
self.roles.merge(&staging.roles);
self.roles.retain(|(_, _, v)| v.0.is_some());
self.parameters = *staging.parameters.get();
let msg = self.calculate_partition_assignment()?;
let stat = self.calculate_partition_assignment()?;
Ok((self, msg))
Ok((self, stat))
}
/// This function calculates a new partition-to-node assignment.
@ -312,21 +311,15 @@ impl LayoutVersion {
/// data to be moved.
/// Staged role changes must be merged with nodes roles before calling this function,
/// hence it must only be called from `apply_staged_changes()` and hence is not public.
fn calculate_partition_assignment(&mut self) -> Result<Message, Error> {
fn calculate_partition_assignment(&mut self) -> Result<ComputationStat, Error> {
// We update the node ids, since the node role list might have changed with the
// changes in the layout. We retrieve the old_assignment reframed with new ids
let old_assignment_opt = self.update_node_id_vec()?;
let zone_redundancy = self.effective_zone_redundancy();
let mut msg = Message::new();
msg.push("==== COMPUTATION OF A NEW PARTITION ASSIGNATION ====".into());
msg.push("".into());
msg.push(format!(
"Partitions are \
replicated {} times on at least {} distinct zones.",
self.replication_factor, zone_redundancy
));
let stat_replication_factor = self.replication_factor;
let stat_effective_zone_redundancy = zone_redundancy;
// We generate for once numerical ids for the zones of non gateway nodes,
// to use them as indices in the flow graphs.
@ -355,28 +348,17 @@ impl LayoutVersion {
// optimality.
let partition_size = self.compute_optimal_partition_size(&zone_to_id, zone_redundancy)?;
msg.push("".into());
if old_assignment_opt.is_some() {
msg.push(format!(
"Optimal partition size: {} ({} in previous layout)",
ByteSize::b(partition_size).display().iec(),
ByteSize::b(self.partition_size).display().iec()
));
let stat_partition_size = partition_size;
let stat_previous_partition_size = if old_assignment_opt.is_some() {
Some(self.partition_size)
} else {
msg.push(format!(
"Optimal partition size: {}",
ByteSize::b(partition_size).display().iec()
));
}
None
};
// We write the partition size.
self.partition_size = partition_size;
if partition_size < 100 {
msg.push(
"WARNING: The partition size is low (< 100), make sure the capacities of your nodes are correct and are of at least a few MB"
.into(),
);
}
let stat_low_partition_size = partition_size < 100;
// We compute a first flow/assignment that is heuristically close to the previous
// assignment
@ -388,18 +370,28 @@ impl LayoutVersion {
}
// We display statistics of the computation
msg.extend(self.output_stat(&gflow, &old_assignment_opt, &zone_to_id, &id_to_zone)?);
let stat = self.output_stat(
&gflow,
&old_assignment_opt,
&zone_to_id,
&id_to_zone,
stat_replication_factor,
stat_effective_zone_redundancy,
stat_partition_size,
stat_previous_partition_size,
stat_low_partition_size,
)?;
// We update the layout structure
self.update_ring_from_flow(id_to_zone.len(), &gflow)?;
if let Err(e) = self.check() {
return Err(Error::Message(
format!("Layout check returned an error: {}\nOriginal result of computation: <<<<\n{}\n>>>>", e, msg.join("\n"))
format!("Layout check returned an error: {}\nOriginal result of computation: <<<<\n{}\n>>>>", e, stat.to_message().join("\n"))
));
}
Ok(msg)
Ok(stat)
}
/// The `LwwMap` of node roles might have changed. This function updates the `node_id_vec`
@ -706,47 +698,26 @@ impl LayoutVersion {
/// This function returns a message summing up the partition repartition of the new
/// layout, and other statistics of the partition assignment computation.
#[allow(clippy::too_many_arguments)]
fn output_stat(
&self,
gflow: &Graph<FlowEdge>,
prev_assign_opt: &Option<Vec<Vec<usize>>>,
zone_to_id: &HashMap<String, usize>,
id_to_zone: &[String],
) -> Result<Message, Error> {
let mut msg = Message::new();
replication_factor: usize,
effective_zone_redundancy: usize,
partition_size: u64,
previous_partition_size: Option<u64>,
low_partition_size: bool,
) -> Result<ComputationStat, Error> {
let usable_capacity =
self.partition_size * NB_PARTITIONS as u64 * self.replication_factor as u64;
let total_capacity = self.get_total_capacity();
let effective_capacity = usable_capacity / replication_factor as u64;
let used_cap = self.partition_size * NB_PARTITIONS as u64 * self.replication_factor as u64;
let total_cap = self.get_total_capacity();
let percent_cap = 100.0 * (used_cap as f32) / (total_cap as f32);
msg.push(format!(
"Usable capacity / total cluster capacity: {} / {} ({:.1} %)",
ByteSize::b(used_cap).display().iec(),
ByteSize::b(total_cap).display().iec(),
percent_cap
));
msg.push(format!(
"Effective capacity (replication factor {}): {}",
self.replication_factor,
ByteSize::b(used_cap / self.replication_factor as u64)
.display()
.iec()
));
if percent_cap < 80. {
msg.push("".into());
msg.push(
"If the percentage is too low, it might be that the \
cluster topology and redundancy constraints are forcing the use of nodes/zones with small \
storage capacities."
.into(),
);
msg.push(
"You might want to move storage capacity between zones or relax the redundancy constraint."
.into(),
);
msg.push(
"See the detailed statistics below and look for saturated nodes/zones.".into(),
);
}
let percent_cap = 100.0 * (usable_capacity as f32) / (total_capacity as f32);
let low_usable_capacity = percent_cap < 80.;
// We define and fill in the following tables
let storing_nodes = self.nongateway_nodes();
@ -795,18 +766,13 @@ impl LayoutVersion {
// We display the statistics
msg.push("".into());
if prev_assign_opt.is_some() {
let total_new_partitions: usize = new_partitions.iter().sum();
msg.push(format!(
"A total of {} new copies of partitions need to be \
transferred.",
total_new_partitions
));
msg.push("".into());
}
let total_moved_partitions = if prev_assign_opt.is_some() {
Some(new_partitions.iter().sum())
} else {
None
};
let mut table = vec![];
let mut zones = vec![];
for z in 0..id_to_zone.len() {
let mut nodes_of_z = Vec::<usize>::new();
for n in 0..storing_nodes.len() {
@ -814,49 +780,224 @@ impl LayoutVersion {
nodes_of_z.push(n);
}
}
let replicated_partitions: usize =
let replicated_partitions_z: usize =
nodes_of_z.iter().map(|n| stored_partitions[*n]).sum();
table.push(format!(
"{}\tTags\tPartitions\tCapacity\tUsable capacity",
id_to_zone[z]
));
let available_cap_z: u64 = self.partition_size * replicated_partitions as u64;
let available_cap_z: u64 = self.partition_size * replicated_partitions_z as u64;
let mut total_cap_z = 0;
for n in nodes_of_z.iter() {
total_cap_z += self.expect_get_node_capacity(&self.node_id_vec[*n]);
}
let percent_cap_z = 100.0 * (available_cap_z as f32) / (total_cap_z as f32);
let mut nodes = vec![];
for n in nodes_of_z.iter() {
let available_cap_n = stored_partitions[*n] as u64 * self.partition_size;
let total_cap_n = self.expect_get_node_capacity(&self.node_id_vec[*n]);
let tags_n = (self.node_role(&self.node_id_vec[*n]).ok_or("<??>"))?.tags_string();
let tags_n = (self.node_role(&self.node_id_vec[*n]).ok_or("<??>"))?
.tags
.clone();
nodes.push(ComputationStatNode {
id: hex::encode(self.node_id_vec[*n]),
tags: tags_n,
stored_partitions: stored_partitions[*n],
new_partitions: new_partitions[*n],
total_capacity: total_cap_n,
usable_capacity: available_cap_n,
});
}
zones.push(ComputationStatZone {
name: id_to_zone[z].to_string(),
nodes,
total_replicated_partitions: replicated_partitions_z,
unique_partitions: stored_partitions_zone[z],
total_capacity: total_cap_z,
usable_capacity: available_cap_z,
});
}
Ok(ComputationStat {
replication_factor,
effective_zone_redundancy,
partition_size,
previous_partition_size,
low_partition_size,
usable_capacity,
total_capacity,
effective_capacity,
low_usable_capacity,
total_moved_partitions,
zones,
})
}
}
/// Structured statistics describing the outcome of a cluster layout
/// computation.
///
/// Serialized with camelCase field names for the admin API JSON responses;
/// the same information can be rendered as the legacy plain-text report
/// with `to_message`.
#[derive(Debug, Clone, Serialize, Deserialize, ToSchema)]
#[serde(rename_all = "camelCase")]
pub struct ComputationStat {
    /// The cluster's replication factor
    pub replication_factor: usize,
    /// The zone redundancy factor achieved by this layout
    pub effective_zone_redundancy: usize,
    /// The size of a partition, in bytes
    pub partition_size: u64,
    /// The size of a partition, in bytes, in the previous layout
    /// (`None` when there was no previous assignment)
    pub previous_partition_size: Option<u64>,
    /// Warning flag indicating when partitions are very small
    /// (partition size below 100 bytes)
    pub low_partition_size: bool,
    /// The portion of total raw node capacity that is used by partitions,
    /// in bytes
    pub usable_capacity: u64,
    /// The total raw capacity of nodes, in bytes
    pub total_capacity: u64,
    /// The final effective capacity of the cluster, accounting for
    /// replication, in bytes
    pub effective_capacity: u64,
    /// Warning flag indicating that the raw node capacity could not be used
    /// effectively (usable capacity is below 80% of total capacity)
    pub low_usable_capacity: bool,
    /// The total number of partition copies that will be moved to a new
    /// storage node (`None` when there was no previous assignment)
    pub total_moved_partitions: Option<usize>,
    /// Per-zone storage statistics
    pub zones: Vec<ComputationStatZone>,
}
/// Per-zone storage statistics of a layout computation.
#[derive(Debug, Clone, Serialize, Deserialize, ToSchema)]
#[serde(rename_all = "camelCase")]
pub struct ComputationStatZone {
    /// The name of the zone
    pub name: String,
    /// Per-node storage statistics for nodes in this zone
    pub nodes: Vec<ComputationStatNode>,
    /// The total number of partition replicas in this zone
    pub total_replicated_partitions: usize,
    /// The number of unique partitions that have at least one replica in this zone
    pub unique_partitions: usize,
    /// The total raw capacity of nodes in this zone, in bytes
    pub total_capacity: u64,
    /// The used portion of the raw capacity of nodes in this zone, in bytes
    pub usable_capacity: u64,
}
/// Per-node storage statistics of a layout computation.
#[derive(Debug, Clone, Serialize, Deserialize, ToSchema)]
#[serde(rename_all = "camelCase")]
pub struct ComputationStatNode {
    /// The node's ID (hex-encoded)
    pub id: String,
    /// The node's tags as defined in the layout
    pub tags: Vec<String>,
    /// The number of partitions that are replicated on this node
    pub stored_partitions: usize,
    /// The number of partitions that are newly replicated on this node
    pub new_partitions: usize,
    /// The node's raw capacity, in bytes
    pub total_capacity: u64,
    /// The portion of the node's raw capacity that is used by the partitions
    /// it stores, in bytes
    pub usable_capacity: u64,
}
impl ComputationStat {
    /// Renders these statistics as the legacy human-readable plain-text
    /// report, one line per `String`, matching the format previously
    /// produced directly by the layout computation.
    ///
    /// NOTE(review): this block was reconstructed from a diff rendering that
    /// interleaved old and new lines (two `format!` argument lists and both
    /// `Ok(msg)`/`msg` returns were present); only the post-change lines
    /// were kept.
    pub fn to_message(&self) -> Vec<String> {
        let mut msg = Vec::new();
        msg.push("==== COMPUTATION OF A NEW PARTITION ASSIGNATION ====".into());
        msg.push("".into());
        msg.push(format!(
            "Partitions are \
        replicated {} times on at least {} distinct zones.",
            self.replication_factor, self.effective_zone_redundancy
        ));

        msg.push("".into());
        if let Some(prev) = self.previous_partition_size {
            msg.push(format!(
                "Optimal partition size: {} ({} in previous layout)",
                ByteSize::b(self.partition_size).display().iec(),
                ByteSize::b(prev).display().iec()
            ));
        } else {
            msg.push(format!(
                "Optimal partition size: {}",
                ByteSize::b(self.partition_size).display().iec()
            ));
        }
        if self.low_partition_size {
            msg.push(
                "WARNING: The partition size is low (< 100), make sure the capacities of your nodes are correct and are of at least a few MB"
                    .into(),
            );
        }

        let percent_cap = 100.0 * (self.usable_capacity as f32) / (self.total_capacity as f32);
        msg.push(format!(
            "Usable capacity / total cluster capacity: {} / {} ({:.1} %)",
            ByteSize::b(self.usable_capacity).display().iec(),
            ByteSize::b(self.total_capacity).display().iec(),
            percent_cap
        ));
        msg.push(format!(
            "Effective capacity (replication factor {}): {}",
            self.replication_factor,
            ByteSize::b(self.effective_capacity).display().iec()
        ));
        if self.low_usable_capacity {
            msg.push("".into());
            msg.push(
                "If the percentage is too low, it might be that the \
        cluster topology and redundancy constraints are forcing the use of nodes/zones with small \
        storage capacities."
                    .into(),
            );
            msg.push(
                "You might want to move storage capacity between zones or relax the redundancy constraint."
                    .into(),
            );
            msg.push(
                "See the detailed statistics below and look for saturated nodes/zones.".into(),
            );
        }

        msg.push("".into());
        if let Some(tmp) = self.total_moved_partitions {
            msg.push(format!(
                "A total of {} new copies of partitions need to be \
        transferred.",
                tmp
            ));
            msg.push("".into());
        }

        // Build a tab-separated table, one section per zone, then align
        // columns with the project's format_table helper.
        let mut table = vec![];
        for z in self.zones.iter() {
            table.push(format!(
                "{}\tTags\tPartitions\tCapacity\tUsable capacity",
                z.name
            ));
            for n in z.nodes.iter() {
                // `{:.16}` truncates the hex-encoded node id to 16 chars.
                table.push(format!(
                    " {:.16}\t[{}]\t{} ({} new)\t{}\t{} ({:.1}%)",
                    n.id,
                    n.tags.join(","),
                    n.stored_partitions,
                    n.new_partitions,
                    ByteSize::b(n.total_capacity).display().iec(),
                    ByteSize::b(n.usable_capacity).display().iec(),
                    (n.usable_capacity as f32) / (n.total_capacity as f32) * 100.0,
                ));
            }
            table.push(format!(
                " TOTAL\t\t{} ({} unique)\t{}\t{} ({:.1}%)",
                z.total_replicated_partitions,
                z.unique_partitions,
                ByteSize::b(z.total_capacity).display().iec(),
                ByteSize::b(z.usable_capacity).display().iec(),
                (z.usable_capacity as f32) / (z.total_capacity as f32) * 100.0,
            ));
            table.push("".into());
        }
        msg.push(format_table::format_table_to_string(table));

        msg
    }
}