Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding cancelcontext to closing vmbus channels #734

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions openhcl/underhill_core/src/dispatch/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ pub trait LoadedVmNetworkSettings: Inspect {
async fn remove_network(&mut self, instance_id: Guid) -> anyhow::Result<()>;

/// Callback after stopping the VM and all workers, in preparation for a VTL2 reboot.
async fn unload_for_servicing(&mut self);
async fn unload_for_servicing(&mut self) -> anyhow::Result<()>;

/// Handles packet capture related operations.
async fn packet_capture(
Expand Down Expand Up @@ -519,7 +519,9 @@ impl LoadedVm {
network_settings
.unload_for_servicing()
.instrument(tracing::info_span!("shutdown_mana"))
.await;
.await
} else {
Ok(())
}
};

Expand All @@ -541,8 +543,9 @@ impl LoadedVm {
.await
};

let (r, (), ()) = (shutdown_pci, shutdown_mana, shutdown_nvme).join().await;
let (r, r_mana, ()) = (shutdown_pci, shutdown_mana, shutdown_nvme).join().await;
r?;
r_mana?;

Ok(state)
}
Expand Down
67 changes: 47 additions & 20 deletions openhcl/underhill_core/src/worker.rs
Original file line number Diff line number Diff line change
Expand Up @@ -660,7 +660,7 @@ impl UhVmNetworkSettings {
vf_managers: &mut Vec<(Guid, Arc<HclNetworkVFManager>)>,
remove_vtl0_vf: bool,
keep_vf_alive: bool,
) {
) -> anyhow::Result<()> {
// Notify VF managers of shutdown so that the subsequent teardown of
// the NICs does not modify VF state.
let mut vf_managers = vf_managers
Expand Down Expand Up @@ -702,17 +702,30 @@ impl UhVmNetworkSettings {
}

// Close vmbus channels and drop all of the NICs.
let mut endpoints: Vec<_> = join_all(nic_channels.drain(..).map(
|(instance_id, channel)| async move {
async {
let nic = channel.remove().await.revoke().await;
nic.shutdown()
}
.instrument(tracing::info_span!("nic_shutdown", %instance_id))
.await
},
))
.await;
// If vmbus channel shutdown fails, the error is preserved and VF Manager shutdown proceeds.
let mut endpoints: Vec<_> = Vec::new();
let mut error: Option<anyhow::Error> = None;
match CancelContext::new()
.with_timeout(Duration::from_secs(2))
.until_cancelled(join_all(nic_channels.drain(..).map(
|(instance_id, channel)| async move {
async {
let nic = channel.remove().await.revoke().await;
nic.shutdown()
}
.instrument(tracing::info_span!("nic_shutdown", %instance_id))
.await
},
)))
.await
.context("cancelled waiting for vmbus channel close")
{
Ok(result) => endpoints = result,
Err(e) => {
tracing::error!("Error closing vmbus channels: {:?}", e);
error = Some(e);
}
};

let shutdown_vfs = join_all(vf_managers.drain(..).map(
|(instance_id, mut manager)| async move {
Expand All @@ -722,7 +735,7 @@ impl UhVmNetworkSettings {
.await
},
));
let run_endpoints = async {
let run_endpoints = async move {
loop {
let _ = endpoints
.iter_mut()
Expand All @@ -732,9 +745,24 @@ impl UhVmNetworkSettings {
.await;
}
};
// Complete shutdown on the VFs. Process events on the endpoints to
// allow for proper shutdown.
let _ = (shutdown_vfs, run_endpoints).race().await;

if error.is_some() {
// If there was an error closing vmbus channels, complete VF shutdown.
shutdown_vfs.await;
} else {
// Complete shutdown on the VFs. Process events on the endpoints to
// allow for proper shutdown.
// run_endpoints is a loop, so the race completes when shutdown_vfs completes.
// Also, run_endpoints races wait_for_endpoint_action() so it doesn't guarentee
// all endpoints will close their channels before shutdown_vfs completes.
let _ = (shutdown_vfs, run_endpoints).race().await;
}

if let Some(e) = error {
return Err(e);
}

Ok(())
}

async fn new_underhill_nic(
Expand Down Expand Up @@ -930,15 +958,14 @@ impl LoadedVmNetworkSettings for UhVmNetworkSettings {
.ok_or(NetworkSettingsError::VFManagerMissing(instance_id));

self.shutdown_vf_devices(&mut vec![vf_manager.unwrap()], true, false)
.await;
Ok(())
.await
}

async fn unload_for_servicing(&mut self) {
async fn unload_for_servicing(&mut self) -> anyhow::Result<()> {
let mut vf_managers: Vec<(Guid, Arc<HclNetworkVFManager>)> =
self.vf_managers.drain().collect();
self.shutdown_vf_devices(&mut vf_managers, false, true)
.await;
.await
}

async fn prepare_for_hibernate(&self, rollback: bool) {
Expand Down