scsi: Add virtio daemon
This adds the virtio-specific parts that use the previously introduced
interfaces and SCSI emulation in order to build a daemon that offers files
from the host system as drives to the guest.
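
A rough usage sketch (the daemon binary name, socket path, and QEMU flags
below are illustrative assumptions, not verified by this commit):

  host$ vhost-user-scsi --read-only /tmp/vhost-user-scsi.sock disk.img
  host$ qemu-system-x86_64 ... \
          -object memory-backend-memfd,id=mem,size=1G,share=on \
          -numa node,memdev=mem \
          -chardev socket,id=vus0,path=/tmp/vhost-user-scsi.sock \
          -device vhost-user-scsi-pci,chardev=vus0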

The vast majority of this work was done by Gaelan Steele as part of a
GSoC project [1][2].

[1] #4
[2] https://gist.github.com/Gaelan/febec4e4606e1320026a0924c3bf74d0

Co-developed-by: Erik Schilling <[email protected]>
Signed-off-by: Erik Schilling <[email protected]>
Signed-off-by: Gaelan Steele <[email protected]>
Gaelan authored and Ablu committed Mar 13, 2023
1 parent 47fb818 commit e9a0965
Showing 3 changed files with 696 additions and 1 deletion.
1 change: 1 addition & 0 deletions crates/scsi/src/lib.rs
@@ -1 +1,2 @@
pub mod scsi;
pub mod virtio;
389 changes: 388 additions & 1 deletion crates/scsi/src/main.rs
@@ -1,3 +1,390 @@
use std::{
convert::TryFrom,
fs::File,
io::{self, ErrorKind},
path::PathBuf,
process::exit,
sync::{Arc, RwLock},
};

use clap::{arg, Parser};
use log::{debug, error, info, warn};
use vhost::{
vhost_user,
vhost_user::{
message::{VhostUserProtocolFeatures, VhostUserVirtioFeatures},
Listener,
},
};
use vhost_user_backend::{VhostUserBackendMut, VhostUserDaemon, VringRwLock, VringT};
use vhost_user_scsi::{
scsi::{
self,
emulation::{
block_device::{BlockDevice, FileBackend, MediumRotationRate},
target::EmulatedTarget,
},
CmdError, TaskAttr,
},
virtio::{self, Request, RequestParseError, Response, ResponseCode, VirtioScsiLun, SENSE_SIZE},
};
use virtio_bindings::{
virtio_config::VIRTIO_F_VERSION_1,
virtio_ring::{VIRTIO_RING_F_EVENT_IDX, VIRTIO_RING_F_INDIRECT_DESC},
virtio_scsi::VIRTIO_SCSI_F_HOTPLUG,
};
use virtio_queue::QueueOwnedT;
use vm_memory::{GuestAddressSpace, GuestMemoryAtomic, GuestMemoryLoadGuard, GuestMemoryMmap};
use vmm_sys_util::{
epoll::EventSet,
eventfd::{EventFd, EFD_NONBLOCK},
};

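// virtio-scsi queue layout: queue 0 is the control queue, queue 1 is the event queue,
// and request queues start at index 2.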
const REQUEST_QUEUE: u16 = 2;

type DescriptorChainWriter = virtio::DescriptorChainWriter<GuestMemoryLoadGuard<GuestMemoryMmap>>;
type DescriptorChainReader = virtio::DescriptorChainReader<GuestMemoryLoadGuard<GuestMemoryMmap>>;
type Target = dyn scsi::Target<DescriptorChainWriter, DescriptorChainReader>;

struct VhostUserScsiBackend {
event_idx: bool,
mem: Option<GuestMemoryAtomic<GuestMemoryMmap>>,
targets: Vec<Box<Target>>,
exit_event: EventFd,
}

impl VhostUserScsiBackend {
fn new() -> Self {
Self {
event_idx: false,
mem: None,
targets: Vec::new(),
exit_event: EventFd::new(EFD_NONBLOCK).expect("Creating exit eventfd"),
}
}

fn parse_target(&mut self, lun: VirtioScsiLun) -> Option<(&mut Target, u16)> {
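// The virtio layer has already decoded the request's 8-byte LUN field into a
// VirtioScsiLun; TargetLun addresses one of our emulated targets by index.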
match lun {
VirtioScsiLun::TargetLun(target, lun) => self
.targets
.get_mut(usize::from(target))
.map(|tgt| (tgt.as_mut(), lun)),
VirtioScsiLun::ReportLuns => {
// TODO: do we need to handle the REPORT LUNS well-known LUN?
// In practice, everyone seems to just use LUN 0
warn!("Guest is trying to use the REPORT LUNS well-known LUN, which we don't support.");
None
}
}
}

fn process_request_queue(&mut self, vring: &VringRwLock) -> Result<(), io::Error> {
let chains: Vec<_> = vring
.get_mut()
.get_queue_mut()
.iter(self.mem.as_ref().unwrap().memory())
.map_err(|e| io::Error::new(ErrorKind::Other, e))?
.collect();
for dc in chains {
let mut writer = DescriptorChainWriter::new(dc.clone());
let mut reader = DescriptorChainReader::new(dc.clone());

self.handle_request_queue(&mut reader, &mut writer);

vring
.add_used(dc.head_index(), writer.max_written())
.map_err(|e| io::Error::new(ErrorKind::Other, e))?;
}
vring
.signal_used_queue()
.map_err(|e| io::Error::new(ErrorKind::Other, e))?;
Ok(())
}

fn handle_request_queue(
&mut self,
reader: &mut DescriptorChainReader,
writer: &mut DescriptorChainWriter,
) {
// TODO: make error handling responsibility of caller -> deduplicate writing of the error

let mut body_writer = writer.clone();
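// The virtio-scsi response starts with a 12-byte fixed header (sense_len, residual,
// status qualifier, status, response) followed by the sense buffer; any data-in
// payload is written after both, so skip past them here.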
const RESPONSE_HEADER_SIZE: u32 = 12;
body_writer.skip(
RESPONSE_HEADER_SIZE + u32::try_from(SENSE_SIZE).expect("SENSE_SIZE should fit 32bit"),
);

let response = match Request::parse(reader) {
Ok(r) => {
if let Some((target, lun)) = self.parse_target(r.lun) {
let output = target.execute_command(
lun,
scsi::Request {
id: r.id,
cdb: &r.cdb,
task_attr: match r.task_attr {
0 => TaskAttr::Simple,
1 => TaskAttr::Ordered,
2 => TaskAttr::HeadOfQueue,
3 => TaskAttr::Aca,
_ => {
// virtio-scsi spec allows us to map any task attr to simple, presumably
// including future ones
warn!("Unknown task attr: {}", r.task_attr);
TaskAttr::Simple
}
},
data_in: &mut body_writer,
data_out: reader,
crn: r.crn,
prio: r.prio,
},
);

match output {
Ok(output) => {
assert!(output.sense.len() < SENSE_SIZE);

Response {
response: ResponseCode::Ok,
status: output.status,
status_qualifier: output.status_qualifier,
sense: output.sense,
// TODO: handle residual for data in
residual: body_writer.residual(),
}
}
Err(CmdError::CdbTooShort) => {
// the CDB buffer is, by default, sized larger than any CDB we support; we don't
// handle writes to config space (because QEMU doesn't let us), so there's no
// way the guest can set it too small
unreachable!();
}
Err(CmdError::DataIn(e)) => {
if e.kind() == ErrorKind::WriteZero {
Response::error(ResponseCode::Overrun, 0)
} else {
error!("Error writing response to guest memory: {}", e);

// There's some chance the header and the data-in buffer are on different descriptors,
// and only the data-in descriptor is bad, so let's at least try to write an
// error to the header
Response::error(ResponseCode::Failure, body_writer.residual())
}
}
}
} else {
debug!("Rejecting command to LUN with bad target {:?}", r.lun);
Response::error(ResponseCode::BadTarget, body_writer.residual())
}
}
Err(RequestParseError::CouldNotReadGuestMemory(e)) => {
// See comment later about errors while writing to guest mem; maybe we at least
// got functional write descriptors, so we can report an error
error!("Error reading request from guest memory: {:?}", e);
Response::error(ResponseCode::Failure, body_writer.residual())
}
Err(RequestParseError::FailedParsingLun(lun)) => {
error!("Unable to parse LUN: {:?}", lun);
Response::error(ResponseCode::Failure, body_writer.residual())
}
};

if let Err(e) = response.write(writer) {
// Alright, so something went wrong writing our response header to guest memory.
// The only reason this should ever happen, I think, is if the guest gave us a
// virtio descriptor with an invalid address.

// There's not a great way to recover from this - we just discovered that
// our only way of communicating with the guest doesn't work - so we either
// silently fail or crash. There isn't too much sense in crashing, IMO, as
// the guest could still recover by, say, installing a fixed kernel and
// rebooting. So let's just log an error and do nothing.
error!("Error writing response to guest memory: {:?}", e);
}
}

fn add_target(&mut self, target: Box<Target>) {
self.targets.push(target);
}
}

impl VhostUserBackendMut<VringRwLock> for VhostUserScsiBackend {
fn num_queues(&self) -> usize {
// control + event + request queues
let num_request_queues = 1;
2 + num_request_queues
}

fn max_queue_size(&self) -> usize {
128 // QEMU assumes this by default
}

fn features(&self) -> u64 {
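// Advertise virtio 1.0, SCSI hotplug, indirect descriptors and event index,
// plus the vhost-user protocol-features bit.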
1 << VIRTIO_F_VERSION_1
| 1 << VIRTIO_SCSI_F_HOTPLUG
| 1 << VIRTIO_RING_F_INDIRECT_DESC
| 1 << VIRTIO_RING_F_EVENT_IDX
| VhostUserVirtioFeatures::PROTOCOL_FEATURES.bits()
}

fn protocol_features(&self) -> VhostUserProtocolFeatures {
VhostUserProtocolFeatures::MQ
}

fn set_event_idx(&mut self, enabled: bool) {
self.event_idx = enabled;
}

fn update_memory(
&mut self,
atomic_mem: GuestMemoryAtomic<GuestMemoryMmap>,
) -> std::result::Result<(), std::io::Error> {
info!("Memory updated - guest probably booting");
self.mem = Some(atomic_mem);
Ok(())
}

fn handle_event(
&mut self,
device_event: u16,
evset: EventSet,
vrings: &[VringRwLock],
thread_id: usize,
) -> io::Result<bool> {
assert!(evset == EventSet::IN);
assert!(vrings.len() == 3);
assert!((device_event as usize) < vrings.len());
assert!(thread_id == 0);

let vring = &vrings[device_event as usize];
match device_event {
REQUEST_QUEUE => {
if self.event_idx {
// vm-virtio's Queue implementation only checks avail_index
// once, so to properly support EVENT_IDX we need to keep
// calling process_request_queue() until it stops finding
// new requests on the queue.
loop {
vring.disable_notification().unwrap();
self.process_request_queue(vring)?;
if !vring.enable_notification().unwrap() {
break;
}
}
} else {
// Without EVENT_IDX, a single call is enough.
self.process_request_queue(vring)?;
}
}
_ => {
error!("Ignoring descriptor on queue {}", device_event);
}
}

Ok(false)
}

fn get_config(&self, _offset: u32, _size: u32) -> Vec<u8> {
// QEMU handles config space itself
panic!("Access to configuration space is not supported.");
}

fn set_config(&mut self, _offset: u32, _buf: &[u8]) -> std::result::Result<(), std::io::Error> {
// QEMU handles config space itself
panic!("Access to configuration space is not supported.");
}

fn exit_event(&self, _thread_index: usize) -> Option<EventFd> {
Some(self.exit_event.try_clone().expect("Cloning exit eventfd"))
}
}

#[derive(Parser)]
struct Opt {
/// Make the images read-only.
///
/// Currently, we don't actually support writes, but sometimes we want to
/// pretend the disk is writable to work around issues with some tools that
/// use the Linux SCSI generic API.
#[arg(long = "read-only", short = 'r')]
read_only: bool,
/// Tell the guest this disk is non-rotational.
///
/// Affects some heuristics in Linux around, for example, scheduling.
#[arg(long = "solid-state", short = 's')]
solid_state: bool,
/// Location of vhost-user socket.
sock: PathBuf,
/// Images against which the SCSI actions are emulated.
images: Vec<PathBuf>,
}

fn main() {
println!("Hello world");
env_logger::init();

let opt = Opt::parse();

let mut backend = VhostUserScsiBackend::new();
let mut target = EmulatedTarget::new();

if opt.images.len() > 256 {
error!("More than 256 LUNs aren't currently supported.");
// This is fairly simple to add; it's just a matter of supporting the right LUN
// encoding formats.
exit(1);
}

if !opt.read_only {
warn!("Currently, only read-only images are supported. Unless you know what you're doing, you want to pass -r");
}

for image in opt.images {
let mut dev = BlockDevice::new(FileBackend::new(File::open(image).expect("Opening image")));
dev.set_write_protected(opt.read_only);
dev.set_solid_state(if opt.solid_state {
MediumRotationRate::NonRotating
} else {
MediumRotationRate::Unreported
});
target.add_lun(Box::new(dev));
}

backend.add_target(Box::new(target));

let backend = Arc::new(RwLock::new(backend));

let mut daemon = VhostUserDaemon::new(
"vhost-user-scsi".into(),
Arc::clone(&backend),
GuestMemoryAtomic::new(GuestMemoryMmap::new()),
)
.expect("Creating daemon");

daemon
.start(Listener::new(opt.sock, true).expect("Creating listener"))
.expect("Starting daemon");

let run_result = daemon.wait();

match run_result {
Ok(()) => {
info!("Stopping cleanly.");
}
Err(vhost_user_backend::Error::HandleRequest(vhost_user::Error::PartialMessage)) => {
info!("vhost-user connection closed with partial message. If the VM is shutting down, this is expected behavior; otherwise, it might be a bug.");
}
Err(e) => {
error!("Error running daemon: {:?}", e);
}
}

// No matter the result, we need to shut down the worker thread.
// unwrap will only panic if we already panicked somewhere else
backend
.read()
.unwrap()
.exit_event
.write(1)
.expect("Shutting down worker thread");
}