From 25f5d2277a29d9bf8aa00e3dde63461975c4776d Mon Sep 17 00:00:00 2001 From: Tomasz Andrzejak Date: Thu, 18 Dec 2025 10:08:53 +0100 Subject: [PATCH 1/3] Add HIP 0001 ring buffer proposal for Hyperlight I/O Signed-off-by: Tomasz Andrzejak --- proposals/0001-rng-buf/README.md | 767 +++++++++++++++++++++++++++++++ typos.toml | 2 + 2 files changed, 769 insertions(+) create mode 100644 proposals/0001-rng-buf/README.md diff --git a/proposals/0001-rng-buf/README.md b/proposals/0001-rng-buf/README.md new file mode 100644 index 000000000..216fdfca6 --- /dev/null +++ b/proposals/0001-rng-buf/README.md @@ -0,0 +1,767 @@ +# HIP 0001 - Virtio-Inspired Ring Buffer for Hyperlight I/O + + + +- [Summary](#summary) +- [Motivation](#motivation) + - [Goals](#goals) + - [Non-Goals](#non-goals) +- [Proposal](#proposal) + - [User Stories](#user-stories) + - [Story 1: Stream-based Communication](#story-1-stream-based-communication) + - [Story 2: High-throughput RPC](#story-2-high-throughput-rpc) + - [Risks and Mitigations](#risks-and-mitigations) +- [Design Details](#design-details) + - [Packed Virtqueue Overview](#packed-virtqueue-overview) + - [Bidirectional Communication](#bidirectional-communication) + - [Memory Layout](#memory-layout) + - [Request-Response Flow](#request-response-flow) + - [Publishing and Consumption Protocol](#publishing-and-consumption-protocol) + - [Performance Optimizations](#performance-optimizations) + - [Dynamic Response Sizing](#dynamic-response-sizing) + - [Type System Design](#type-system-design) + - [Low Level API](#low-level-api) + - [Higher-Level API](#higher-level-api) + - [Test Plan](#test-plan) +- [Implementation History](#implementation-history) +- [Drawbacks](#drawbacks) +- [Alternatives](#alternatives) + + +## Summary + +This HIP proposes implementing a ring buffer mechanism for Hyperlight I/O, loosely based on the +virtio packed virtqueue +[specification](https://docs.oasis-open.org/virtio/virtio/v1.3/csd01/virtio-v1.3-csd01.html#x1-720008). +The ring buffer will serve as the foundation for host-guest communication, supporting use cases such +as streams, RPC, and other I/O patterns in both single-threaded and multi-threaded environments. + +The design leverages virtio's well-defined publishing/consumption semantics and memory safety +guarantees while adapting them to Hyperlight's specific needs. Since we control both ends of the +queue, we can deviate from strict virtio compliance where it makes sense (see +[Difference from virtio spec](#difference-from-virtio-spec) section). + +## Motivation + +Currently, Hyperlight lacks a good I/O story. Each interaction between host and guest relies on +individual VM exits, which, while providing strong shared memory access safety guarantees, also +introduces significant overhead. Supporting streaming communication patterns through this mechanism +is hard as exiting the VM context for each memory chunk transferred in either direction would result +in performance degradation. A ring buffer can mitigate this overhead by: + +- **Reducing VM exits**: Batch multiple requests, responses, and function calls, only exiting when + necessary +- **Foundation for streams**: Enable bidirectional, streaming I/O patterns +- **Foundation for futures**: Support futures as a special case of a stream with a single element +- **Better cache locality**: The packed queue format improves cache characteristics. 
While this + claim is speculative, it's worth noting that improving the cache behavior of the split queue, + which managed state across three separate tables, was a primary motivation for developing the + packed virtqueue specification. + +### Goals + +- Implement a low-level ring buffer based on virtio packed virtqueue semantics +- Support both single-threaded (host and guest on the same thread) and multi-threaded scenarios +- Maintains backward compatibility, which means current function call model can be ported to queue + without changes in public API - as such the ring buffer is an implementation detail +- Provide memory-safe abstractions over shared memory regions +- Enable batching and notification suppression to minimize VM exits +- Establish the foundation for higher-level APIs (streams, async I/O) + +### Non-Goals + +- 100% virtio specification compliance +- Indirect descriptor table support (deferred to future work if needed) +- Immediate async/await integration (deferred to future work) + +## Proposal + +We propose implementing a ring buffer mechanism based on the virtio packed virtqueue specification. +The packed format offers superior cache characteristics compared to the split queue format by +keeping descriptors, driver area, and device area in contiguous memory. Both sides of the ring only +need to poll a single memory address to detect the next available or used slot. + +### User Stories + +#### Story 1: Stream-based Communication + +As a Hyperlight user, I want to stream data between host and guest (e.g., file I/O, network sockets) +without incurring a VM exit for each small read/write operation. The ring buffer should allow me to +batch multiple operations and only exit when buffers are full or when I explicitly flush. + +#### Story 2: High-throughput RPC + +As a Hyperlight developer, I want to make multiple function calls from guest to host with minimal +overhead. The ring buffer should let me queue multiple requests, suppress notifications, and process +responses in batches. + +### Risks and Mitigations + +**Risk**: Malicious guest corrupting the queue **Mitigation**: Do not expose low level queue API to +the guest. Assert queue state invariants after each producer and consumer operation, poison sandbox +if any invariant is not upheld + +**Risk**: Complexity of implementing a lock-free ring buffer with proper memory ordering +**Mitigation**: Follow established virtio semantics; use atomic operations with appropriate memory +orderings; implement comprehensive testing + +**Risk**: Potential deadlocks in backpressure scenarios **Mitigation**: Provide clear documentation +of blocking behavior; consider timeout mechanisms; shuttle testing + +**Risk**: Memory safety issues with shared buffer management **Mitigation**: Employ a strong type +system; implement ownership tracking; perform thorough validation and fuzzing + +## Design Details + +### Packed Virtqueue Overview + +Before diving into the specifics, it will be useful to agree on some terminology. We borrow terms +specific for virtio spec. Throughout this document, when we refer to the `driver`, we mean the +`producer` side of the queue (the side that submits elements). When we refer to the `device`, we +mean the `consumer` side (the side that processes queue elements). In the virtio specification, +these terms come from the traditional device driver model, but in our case, either the host or guest +can act as driver or device depending on the direction of communication. 
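
To make the role terminology concrete, the sketch below writes the role assignment down as data.
This is purely illustrative: the `Role` and `QueueDirection` names are hypothetical and not part of
the proposed API, and the two concrete queues are enumerated in the next section.

```rust
/// Illustrative only: driver/device are roles, not fixed to host or guest.
#[derive(Clone, Copy, Debug)]
enum Role {
    /// Producer side: submits (makes available) buffer chains.
    Driver,
    /// Consumer side: processes chains and marks them used.
    Device,
}

/// One direction of communication is a queue plus a fixed role assignment.
#[derive(Clone, Copy, Debug)]
struct QueueDirection {
    host: Role,
    guest: Role,
}

fn main() {
    // Host-to-guest calls: the host produces requests, the guest consumes them.
    let host_to_guest = QueueDirection { host: Role::Driver, guest: Role::Device };
    // Guest-to-host calls: the roles are swapped; the ring logic is identical.
    let guest_to_host = QueueDirection { host: Role::Device, guest: Role::Driver };
    println!("{host_to_guest:?} {guest_to_host:?}");
}
```
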
#### Bidirectional Communication

For full bidirectional communication between host and guest, we need two queues:

1. Host-to-Guest Queue: Host acts as driver (producer), guest acts as device (consumer), e.g. queue
   elements represent host-to-guest function calls,
2. Guest-to-Host Queue: Guest acts as driver (producer), host acts as device (consumer), e.g. queue
   elements represent guest-to-host function calls,

Each queue is independent and implements the same packed virtqueue semantics described below.

#### Memory Layout

The queue structures reside in shared memory and are accessible to both host and guest. The picture
below shows a possible layout for a queue whose buffers are allocated from a buffer pool that also
resides in shared memory. The `addr` field contains an offset (or physical address, depending on
implementation) pointing to where the actual buffer data lives in the shared memory region. The
descriptor itself only contains metadata about the buffer and is agnostic to where the address
points. The only requirement for the address is that both host and guest can translate it into a
referenceable pointer to the buffer memory. The `addr` field does not preserve Rust's notion of
pointer provenance.

Each descriptor is 16 bytes and has the following layout:

```rust
struct Descriptor {
    addr: u64,  // Offset into shared memory where buffer resides
    len: u32,   // Buffer length in bytes
    id: u16,    // Buffer ID for tracking
    flags: u16, // AVAIL, USED, WRITE, NEXT, INDIRECT, etc.
}
```

*(Figure: layout)*

#### Request-Response Flow

The typical flow for a request-response interaction works as follows:

1. Driver allocates buffers: The driver allocates buffers from the shared memory pool
2. Driver submits descriptors: The driver writes one or more descriptors into the ring:
   - Read descriptors: Point to buffers containing request data (device reads from these)
   - Write descriptors: Point to empty buffers where the device should write responses
3. Device processes request: The device reads data from read-buffers and writes results into
   write-buffers
4. Device marks completion: The device updates the descriptor flags to indicate completion

**Step 1:** Driver submits request with read buffer (request) and write buffer (response)

*(Figure: submit)*

**Step 2:** Device processes and writes response

*(Figure: process)*

Note how the driver pre-allocates the response buffer and provides it to the device via a write
descriptor. The device then writes its response directly into this buffer. The `len` field in the
used descriptor tells the driver how many bytes were actually written (128 in this example, even
though 256 bytes were available). The driver is allowed to use the same descriptor for read and
write, in which case the request data may be overwritten.

#### Publishing and Consumption Protocol

The packed virtqueue uses a circular descriptor ring where both driver and device maintain their own
wrap counters. Each descriptor has two key flags:

- `AVAIL`: Indicates availability from the driver's perspective. After setting this flag, descriptor
  ownership is transferred to the device and the descriptor cannot be mutated by the driver.
- `USED`: Indicates usage from the device's perspective. Similarly, after setting this flag,
  descriptor ownership is transferred back to the driver and the slot can be reused.

In this scheme, both sides only need to poll a single memory location (the next descriptor in order)
to detect new work or completions.
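
To make the flag protocol concrete, here is a minimal, illustrative sketch of the availability
checks. It mirrors the semantics of the `is_avail`/`is_used` helpers sketched later in this
proposal; the wrap counters it takes as input are explained in the paragraphs that follow, and real
code would additionally load and store `flags` with Acquire/Release ordering.

```rust
// Minimal sketch of the packed-ring flag checks. Only the flag logic is shown.
const AVAIL: u16 = 1 << 7;
const USED: u16 = 1 << 15;

/// Device side: is this descriptor newly available in the current lap (`wrap`)?
fn is_avail(flags: u16, wrap: bool) -> bool {
    let avail = (flags & AVAIL) != 0;
    let used = (flags & USED) != 0;
    // The driver publishes with AVAIL = its wrap bit and USED = the inverse, so a fresh
    // descriptor has avail != used and avail equal to the device's expected wrap bit.
    avail != used && avail == wrap
}

/// Driver side: has the device marked this descriptor used in the current lap (`wrap`)?
fn is_used(flags: u16, wrap: bool) -> bool {
    let avail = (flags & AVAIL) != 0;
    let used = (flags & USED) != 0;
    // The device publishes with AVAIL == USED == its wrap bit.
    avail == used && used == wrap
}

fn main() {
    // First lap: driver wrap = true. Publishing sets AVAIL and clears USED.
    let published = AVAIL;
    assert!(is_avail(published, true)); // device (wrap = true) sees new work
    assert!(!is_used(published, true)); // driver does not see a completion yet

    // Device completes it: AVAIL and USED both set to its wrap bit (true).
    let completed = AVAIL | USED;
    assert!(is_used(completed, true)); // driver can reclaim the slot
    assert!(!is_avail(completed, true)); // no longer looks like fresh work
}
```
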
The driver will publish buffers until there is no space left in the descriptor ring, at which point
it must wait for the device to process some descriptors before it can continue. Both publishing and
processing wrap around when reaching the end of the descriptor table, with the wrap counter flipping
to indicate the beginning of a new round through the ring.

This mechanism requires no locks for synchronization; memory barriers combined with atomic
publishing of the flags ensure that the other side never observes a partial update:

- Driver: Write descriptor fields → memory barrier → atomic Release-store flags
- Device: Atomic Acquire-load flags → memory barrier → read descriptor fields

Because the packed ring reuses the same descriptor slot for both `available` and `used` states and
both sides only poll a single next slot, each side needs to differentiate between "this change
belongs to the current lap in the ring" and "this is an old value from the previous lap." This is
done using "wrap" counters:

- Each side keeps a boolean "wrap" flag that toggles when it passes the last descriptor in the ring,
- When the driver publishes an available descriptor, it sets `AVAIL` to its wrap bit and `USED` to
  the inverse. When the device publishes a used descriptor, it sets both `AVAIL` and `USED` to its
  wrap bit.
- The reader of a descriptor then compares the flags it reads against its own current wrap bit to
  decide whether the descriptor is newly available/used or is left over from the previous lap.

### Comparison with current implementation

Hyperlight uses two separate shared-memory stacks to pass function calls and returns between host
and guest:

- an input stack the guest pops from (host -> guest calls) and
- an output stack the guest pushes to (guest -> host returns).

Each of these memory regions begins with an 8-byte header that stores a relative offset pointing to
the next free byte in the stack.

When pushing, the payload, a flatbuffer-serialized message, is written at the current stack pointer,
followed by an 8-byte footer that contains the just-written payload's starting offset. Finally, the
header is advanced to point past the footer. This makes each item a pair of `payload + back-pointer`,
so the top of the stack can always be found in O(1) without extra metadata.

Popping from the stack mirrors this process. The guest reads the stack pointer from the input
stack's header. It then reads the 8-byte back-pointer located just before the stack pointer to get
the last element's offset in the buffer. It treats the slice starting at that offset as the
flatbuffer-serialized payload. The last step is to deserialize the slice and rewind the stack
pointer to the just-consumed payload's offset.

This model is a natural fit for synchronous, in-order communication, but the LIFO stack semantics
make asynchronous constructs with out-of-order completion impossible to implement. This proposal
suggests replacing the current implementation with the ring buffer approach because the virtio-queue
can support both sync and async work completion.

*(Figure: hl-model)*

### Performance Optimizations

The primary performance benefits of the ring buffer come from reducing the number of expensive
operations, specifically VM exits, but also from improving memory access patterns. This section
discusses the potential performance improvements that stem from using the ring buffer.

In the current Hyperlight model, every host-guest interaction triggers a VM exit.
While this provides strong isolation guarantees, it comes at a significant cost. Each VM exit involves:

- Saving the guest CPU state
- Switching to the hypervisor/host context
- Processing the request
- Restoring guest CPU state and resuming execution

For I/O-intensive workloads, this overhead dominates execution time. Consider a scenario where a
host needs to transfer data as a stream to the guest and each stream chunk triggers a VM exit.

**1. Notification Suppression**

The virtio queue defines an event suppression mechanism that allows both sides to control when they
want to be notified about submissions or completions in the queue. Notification suppression allows
for different batching strategies. For example:

- A driver can queue multiple requests, suppress notifications, and only notify the device once when
  ready
- A device can process descriptors in batches and only notify the driver when a certain threshold is
  reached or when the ring is about to fill up

**2. Event based notifications**

In a single-threaded application a notification involves a VM exit, but in a multi-threaded
environment where host and guest run on separate threads we can leverage event-based notifications
(for example `ioeventfd` for KVM). This is especially useful for streaming scenarios where the guest
can continue processing while the host consumes data asynchronously.

**3. Inline Descriptors**

An interesting optimization that is not part of the virtio-queue spec but is worth considering is
embedding "tiny" payloads directly into the descriptor. The virtio model, no matter the size of the
payload, requires:

1. Allocating a buffer in shared memory
2. Writing the data to that buffer
3. Pointing a descriptor at the buffer
4. The receiver reading from the buffer

We can eliminate all of these steps for small messages by embedding the data directly into the
descriptor:

```rust
const INLINE: u16 = 1 << 8; // New flag

struct Descriptor {
    addr: u64,
    len: u16,
    data: [u8; 16], // addr is unused, data is written inline in the descriptor
    id: u16,
    flags: u16,
}
```

or

```rust
struct Descriptor {
    // When INLINE is set, reinterpret addr/len as data:
    data: [u8; 12], // addr(8) + len(4) repurposed as inline data
    id: u16,
    flags: u16,
}
```

When the `INLINE` flag is set, the `addr` is unused. This optimization, inspired by io_uring,
eliminates memory indirection for common small messages, improving both latency and cache behavior.
The tradeoff is the increased size of the descriptor table. Alternatively, we could repurpose the
`addr` and `len` fields as raw bytes, providing 12 bytes of inline storage. We should assess whether
any of the flatbuffer-schema serialized data can actually fit into such small inline storage.

**4. Descriptor Chaining - scatter-gather list**

Descriptors can be chained using the `NEXT` flag. This enables zero-copy scatter-gather I/O
patterns. Imagine again the stream running on the host. We want to gather a few chunks before
sending them to the guest. For each incoming chunk we can grab a buffer from the buffer pool and
write the data to it. After reaching some threshold we want to present all the buffers to the guest.
A scatter-gather list allows us to represent the chunks as a descriptor chain without copying them
into contiguous memory.

### Dynamic Response Sizing

A slightly annoying consequence of using the virtio model is that we have to account for the fact
that the driver pre-allocates response buffers, but the device may produce variable-length responses.
+This means that the pre allocated size might not be enough to write a complete response. The +proposed solution to that is to use truncation protocol. The protocol can be implemented in the +descriptor layer or in the flatbuffer schema: + +1. Driver allocates buffer of estimated size +2. Device writes up to buffer length +3. Device sets actual written length in descriptor +4. If `actual_length > buffer_length`, device sets a `TRUNCATED` flag, +5. Driver can re-submit with larger buffer if needed + +### Snapshotting + +Snapshotting requires that the descriptor table has no in-flight guest-to-host requests and any +attempt to snapshot a sandbox with such pending requests will result in a snapshot failure. + +### Difference from virtio spec + +- Do not support indirect descriptor table (can be deferred to future work if needed), +- Do not support feature negotiation, set of features is fixed for driver and device, +- Only support packed queue, +- Introduce inline data optimization in descriptor (only if benchmarks support the claim) + +### Type System Design + +The goal of this section is not to pin exactly the API for queue semantics but rather give an +overview of type system that represents the concepts outlined above. The presented API is intended +for internal Hyperlight usage and won't be exposed to Hyperlight user. + +#### Low Level API + +```rust +bitflags! { + #[repr(transparent)] + #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] + pub struct DescFlags: u16 { + const NEXT = 1 << 0; + const WRITE = 1 << 1; + const INDIRECT = 1 << 2; + const AVAIL = 1 << 7; + const USED = 1 << 15; + } +} + +#[repr(C)] +#[derive(Clone, Copy, Debug, Pod, Zeroable)] +pub struct Descriptor { + pub addr: u64, + pub len: u32, + pub id: u16, + pub flags: u16, +} + +impl Descriptor { + /// Interpret flags as DescFlags + pub fn flags(&self) -> DescFlags { } + /// Did the driver mark this descriptor in the current driver round? + pub fn is_avail(&self, wrap: bool) -> bool { } + /// Did the device mark this descriptor used in the current device round? + pub fn is_used(&self, wrap: bool) -> bool { } + /// Mark descriptor as available according to the driver's wrap bit. + pub fn mark_avail(&mut self, wrap: bool) { } + /// Mark descriptor as used according to the device's wrap bit. + pub fn mark_used(&mut self, wrap: bool) { } +} + +/// A view into a Descriptor stored in shared memory. +/// +/// Allows reading/writing the descriptor with proper memory ordering. +pub struct DescriptorView<'t> { + base: NonNull, + owner: PhantomData<&'t DescTable<'t>>, +} + +impl<'t> DescriptorView<'t> { + /// # Safety: base must be valid for reads/writes for 't + pub unsafe fn new(base: NonNull) -> Self { } + /// Read descriptor from memory: Acquire-load flags then volatile-read other fields. + pub fn read(&self) -> Descriptor { } + /// Write descriptor fields except flags (volatile), then publish flags atomically (Release). + pub fn write(&self, desc: &Descriptor) { } +} + +bitflags! 
{ + #[repr(transparent)] + #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] + pub struct EventFlags: u16 { + /// Always send notifications + const ENABLE = 1 << 0; + /// Never send notifications (polling mode) + const DISABLE = 1 << 1; + /// Only notify when a specific descriptor is processed + const DESC_SPECIFIC = 1 << 2; + } +} + +/// Event suppression structure controls notification behavior between driver and device +#[repr(C)] +#[derive(Clone, Copy, Debug, Pod, Zeroable)] +pub struct EventSuppression { + /// Packed descriptor event offset + desc: u16, + /// Event flags + flags: u16, +} + +/// A table of descriptors stored in shared memory. +struct DescTable<'t> { + base: NonNull, + size: u16, + owner: PhantomData<&'t [Descriptor]>, +} + +impl<'t> DescTable<'t> { + /// # Safety: base must be valid for reads/writes for size descriptors + pub unsafe fn init_mem(base: NonNull, size: u16) -> Self { } + /// # Safety: base must be valid for reads/writes for size descriptors + pub unsafe fn from_mem(base: NonNull, size: u16) -> Self { } + /// Get descriptor at index or None if idx is out of bounds + pub fn get(&self, idx: u16) -> Option> { } + /// Set descriptor at index + pub fn set(&self, idx: u16, desc: &Descriptor) { } + /// Get number of descriptors in table + pub fn len(&self) -> u16 { } +} + +/// A buffer element (part of a scatter-gather list). +#[derive(Debug, Clone)] +pub struct BufferElement { + /// Physical address of buffer + pub addr: u64, + /// Length in bytes + pub len: u32, +} + +/// A buffer returned from the ring after being used by the device. +struct UsedBuffer { + /// Descriptor ID associated with this used buffer + pub id: u16, + /// Length in bytes of data written by device + pub len: u32, +} + + +/// Type-state: Can add readable buffers +pub struct Readable; + +/// Type-state: Can add writable buffers (no more readables allowed) +pub struct Writable; + +/// A builder for buffer chains using type-state to enforce readable/writable order. +/// Upholds invariants: +/// - at least one buffer must be present in the chain, +/// - readable buffers must be added before writable buffers. +#[derive(Debug, Default)] +struct BufferChainBuilder { + readables: Vec, + writables: Vec, + marker: PhantomData, +} + +impl BufferChainBuilder { + /// Create a new builder starting in Readable state. + pub fn new() -> Self { } + /// Add a readable buffer (device reads from this). + pub fn readable(mut self, addr: u64, len: u32) -> Self { } + /// Add a writable buffer (device writes to this). This transitions to Writable + /// state so no more readable buffers can be added. + pub fn writable(mut self, addr: u64, len: u32) -> BufferChainBuilder { } + /// Chain must have at least one buffer otherwise an error is returned. + pub fn build(self) -> Result { } +} + + +impl BufferChainBuilder { + /// Add writable buffer + pub fn writable(mut self, addr: u64, len: u32) -> Self { } + /// Build the buffer chain. 
+ pub fn build(self) -> Result { } +} + +#[derive(Debug, Default)] +struct BufferChain { + readables: Vec, + writables: Vec, +} + +impl BufferChain { + /// Get slice of readable buffers + pub fn readables(&self) -> &[BufferElement] { } + /// Get slice of writable buffers + pub fn writables(&self) -> &[BufferElement] { } +} + +#[derive(Debug)] +struct RingProducer<'t> { + /// Next available descriptor position + avail_cursor: RingCursor, + /// Next used descriptor position + used_cursor: RingCursor, + /// Free slots in the ring + num_free: usize, + /// Descriptor table in shared memory + desc_table: DescTable<'t>, + /// stack of free IDs, allows out-of-order completion + id_free: SmallVec<[u16; DescTable::DEFAULT_LEN]>, + // chain length per ID, index = ID, + id_num: SmallVec<[u16; DescTable::DEFAULT_LEN]>, +} + + +/// The producer side of a packed ring. +impl<'t> RingProducer<'t> { + /// Submit a buffer chain to the ring. + pub fn submit(&self, chain: &BufferChain) -> Result { } + /// Poll the ring for a used buffer. + pub fn poll(&self) -> Result { } +} + + +/// The consumer side of a packed ring. +#[derive(Debug)] +pub struct RingConsumer<'t> { + /// Cursor for reading available (driver-published) descriptors + avail_cursor: RingCursor, + /// Cursor for writing used descriptors + used_cursor: RingCursor, + /// Shared descriptor table + desc_table: DescTable<'t>, + /// Per-ID chain length learned when polling (index = ID) + id_num: SmallVec<[u16; DescTable::DEFAULT_LEN]>, +} + + +impl<'t> RingConsumer<'t> { + /// Poll the ring for an available buffer chain. + pub fn poll(&self) -> Result { } + /// Submit a used buffer back to the ring. + pub fn submit(&self, used: &UsedBuffer) -> Result<(), RingError> { } +} + +``` + +#### Higher-Level API + +The low-level ring buffer implementation provides the foundation for safe and efficient +communication, but working directly with descriptors, buffer allocation, and notification +suppression requires in-depth knowledge about the virtqueue semantics. The higher-level API aims to +provide an ergonomic, type-safe interface for common communication patterns. Specifically: + +- abstracts buffer allocation, +- abstracts notification strategy, +- enforces type safety by requiring ring payloads to be `FlatbufferSerializable` + +```rust +use allocator_api2::alloc::{AllocError, Allocator}; + +/// Trait for types that can be serialized/deserialized via flatbuffers +pub trait FlatbufferSerializable: Sized + Sealed { + type Error: Into; + + /// Estimate the serialized size (hint for buffer allocation) + fn size_hint(&self) -> usize; + /// Serialize into the provided buffer + fn serialize(&self, buf: &mut [u8]) -> Result; + /// Deserialize from the provided buffer + fn deserialize(buf: &[u8]) -> Result; +} + +/// Notification strategy trait - determines when to notify the other side about new descriptors. +pub trait NotificationStrategy { + /// Returns true if notification should be sent. + fn should_notify(&self, stats: &RingStats) -> bool; +} + +/// Notification strategy that will notify the device after each send +struct AlwaysNotify; + +impl NotificationStrategy for AlwaysNotify { + fn should_notify(&self, _stats: &RingStats) -> bool { true }; +} + +struct BufferPool { } + +impl Allocator for BufferPool { + /// Allocate a buffer with the given layout from the pool. + fn allocate(&self, layout: Layout) -> Result, AllocError> { } + /// Return buffer to the pool. 
+ unsafe fn deallocate(&self, ptr: NonNull, layout: Layout) { } +} + +/// Split ring into separate sender and receiver +/// +/// Sender owns the allocator and notification strategy since only +/// it needs to allocate buffers and decide when to notify. +pub struct RingSender +where + A: Allocator, + N: NotificationStrategy, +{ + /// The sync version needs to handle concurrent access properly + inner: Arc>, +} + +/// Receiver only needs to know the receive type +pub struct RingReceiver +{ + /// The sync version needs to handle concurrent access properly + inner: Arc>, +} + +impl Ring +where + A: Allocator, + N: NotificationStrategy, +{ + /// Split into separate sender and receiver + pub fn split(self) -> (RingSender, RingReceiver); +} + +impl RingSender +where + A: Allocator, + N: NotificationStrategy, +{ + /// Send a message, use token for out-of-order completion + pub fn send(&mut self, message: T) -> Result + where + T: FlatbufferSerializable; + + /// Try to send without blocking + pub fn try_send(&mut self, message: T) -> Result; + where + T: FlatbufferSerializable; +} + +impl RingReceiver +{ + /// Receive a message of the specified type + pub fn recv(&mut self) -> Result + where + T: FlatbufferSerializable; + + /// Try to receive a message without blocking + pub fn try_recv(&mut self) -> Result + where + T: FlatbufferSerializable; +} + +``` + +### Test Plan + +**Unit tests**: + +- Descriptor read/write with proper memory ordering +- Wrap counter transitions +- Buffer chain building and validation +- Event suppression logic +- Miri testing + +**Integration tests**: + +- Single-threaded producer-consumer patterns +- Multi-threaded scenarios with concurrent access +- Shuttle tests (https://github.com/awslabs/shuttle) +- Backpressure behavior (queue full, memory exhausted) +- Truncation protocol for oversized responses +- Notification suppression and batching + +**Property-based tests**: + +- Invariants hold across all valid sequences of operations +- No lost or duplicated messages +- Wrap counter consistency + +**e2e tests**: + +- Actual host-guest communication via ring buffer +- Performance benchmarks vs. current VM exit approach +- Stress testing under high load + +### Implementation Plan + +As proposed, the queue will eventually replace the current stack‑based mechanism for function calls. +The initial implementation will be gated behind a feature flag. The work can be roughly broken down +into: + +- Introduce a low‑level queue implementation that operates on shared memory. +- Introduce a serialization trait for data that can be safely sent through the queue. +- Introduce a high‑level API over the queue. +- Integrate the queue infrastructure with the existing memory model and function‑call API. + +Future work includes: + +- Adding `Future` and `Stream` as function argument and return types. +- Supporting host and guest running on separate threads. +- Providing an asynchronous API for function calls, streams, and related workflows. + +## Implementation History + +- **2025-11-12**: HIP proposed + +## Drawbacks + +- **Complexity**: Ring buffer logic with wrap counters and memory ordering is subtle +- **Fixed size**: Queue size must be known upfront; resizing requires reallocation +- **Learning curve**: Developers need to understand packed virtqueue semantics +- **Debugging**: Race conditions and memory ordering issues can be hard to diagnose + +## Alternatives + +**1. 
Split Virtqueue** + +- A ready to use to crate that would require adopting their memory model (probably an overkill) +- Simpler descriptor management +- Worse cache characteristics due to separated rings +- Still used in production, proven design + +**2. Lock-based Queue** + +- Simpler implementation +- Much higher overhead due to lock contention +- Doesn't leverage hypervisor-specific optimizations +- no locks other than spin available on guest diff --git a/typos.toml b/typos.toml index 07ffa9dee..11de95945 100644 --- a/typos.toml +++ b/typos.toml @@ -9,3 +9,5 @@ extend-exclude = ["**/*.patch", "src/hyperlight_guest_bin/third_party/**/*", "NO typ="typ" mmaped="mmapped" fpr="fpr" +readables="readables" +writables="writables" From 7e0a191e06bebc45bb295ff368de41d410f1f684 Mon Sep 17 00:00:00 2001 From: Tomasz Andrzejak Date: Mon, 19 Jan 2026 14:17:03 +0100 Subject: [PATCH 2/3] Apply suggestions from code review Fix typos Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Signed-off-by: Tomasz Andrzejak --- proposals/0001-rng-buf/README.md | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/proposals/0001-rng-buf/README.md b/proposals/0001-rng-buf/README.md index 216fdfca6..a5aee918e 100644 --- a/proposals/0001-rng-buf/README.md +++ b/proposals/0001-rng-buf/README.md @@ -217,7 +217,7 @@ done using "wrap" counters: the inverse. When the device publishes a used descriptor, it sets both `AVAIL` and `USED` to its wrap bit. - The reader of a descriptor then compares the flags it reads to its own current wrap to decide if - the descriptr is newly available/used now, or is it lagging behind. + the descriptor is newly available/used now, or is it lagging behind. ### Comparison with current implementation @@ -231,7 +231,7 @@ Each of these memory regions begins with an 8-byte header that stores a relative the next free byte in the stack. When pushing, the payload which is flatbuffer-serialized message is written at the current stack -pointer, followed by the 8-byte footer that containins just written payload's starting offset. +pointer, followed by the 8-byte footer that containing just written payload's starting offset. Finally, the header is advanced to point past the footer. This makes each item a pair of `payload + back-pointer`, so the top of the stack can always be found in O(1) without extra metadata. @@ -280,7 +280,7 @@ different batching strategies. For example: **2. Event based notifications** In the single threaded application the notification involve VM exit but in multi-thread environment -where host and guest are running in separate threads we can laverage event-based notifications (for +where host and guest are running in separate threads we can leverage event-based notifications (for example `ioeventfd` for kvm). This is especially useful for streaming scenarios where the guest can continue processing while the host consumes data asynchronously. @@ -324,7 +324,7 @@ struct Descriptor { When the `INLINE` flag is set, the `addr` is unused. This optimization, inspired by io_uring, eliminates memory indirection for common small messages, improving both latency and cache behavior. -The tradeof is the increased size of descriptor table. Alternatively we could repurpose the `addr` +The tradeoff is the increased size of descriptor table. Alternatively we could repurpose the `addr` and `len` as raw bytes providing 12 bytes of inline storage. We should asses if any of flatbuffer schema serialized data can actually fit into small inline data. 
@@ -657,7 +657,9 @@ where N: NotificationStrategy, { /// Split into separate sender and receiver - pub fn split(self) -> (RingSender, RingReceiver); + pub fn split(self) -> (RingSender, RingReceiver) { + unimplemented!("split is not implemented in this example"); + } } impl RingSender @@ -671,7 +673,7 @@ where T: FlatbufferSerializable; /// Try to send without blocking - pub fn try_send(&mut self, message: T) -> Result; + pub fn try_send(&mut self, message: T) -> Result where T: FlatbufferSerializable; } From 934dee7b5328b2758c6f6ef199a2ed0e1d5edb67 Mon Sep 17 00:00:00 2001 From: Tomasz Andrzejak Date: Thu, 22 Jan 2026 14:42:15 +0100 Subject: [PATCH 3/3] Update type system design for virtqueue implementation Signed-off-by: Tomasz Andrzejak --- proposals/0001-rng-buf/README.md | 490 ++++++++++++++++--------------- 1 file changed, 247 insertions(+), 243 deletions(-) diff --git a/proposals/0001-rng-buf/README.md b/proposals/0001-rng-buf/README.md index a5aee918e..e1d13b8c4 100644 --- a/proposals/0001-rng-buf/README.md +++ b/proposals/0001-rng-buf/README.md @@ -365,332 +365,336 @@ attempt to snapshot a sandbox with such pending requests will result in a snapsh ### Type System Design -The goal of this section is not to pin exactly the API for queue semantics but rather give an -overview of type system that represents the concepts outlined above. The presented API is intended -for internal Hyperlight usage and won't be exposed to Hyperlight user. +This section provides an overview of the implemented type system. The API is internal to Hyperlight. -#### Low Level API +#### Memory Access Abstraction + +The `MemOps` trait abstracts memory access, allowing the virtqueue to work with different backends +(host vs guest memory). This decouples the ring logic from the underlying memory representation. + +```rust +/// Backend-provided memory access for virtqueue. +/// +/// Implementations must ensure that: +/// - Pointers passed to methods are valid for the duration of the call +/// - Memory ordering guarantees are upheld as documented +/// - Reads and writes don't cause undefined behavior (alignment, validity) +pub trait MemOps { + type Error; + + /// Read bytes from physical memory. + /// Used for reading buffer contents pointed to by descriptors. + fn read(&self, addr: u64, dst: &mut [u8]) -> Result; + + /// Write bytes to physical memory. + fn write(&self, addr: u64, src: &[u8]) -> Result; + + /// Load a u16 with acquire semantics. + /// `addr` must translate to a valid, aligned AtomicU16 in shared memory. + fn load_acquire(&self, addr: u64) -> Result; + + /// Store a u16 with release semantics. + /// `addr` must translate to a valid AtomicU16 in shared memory. + fn store_release(&self, addr: u64, val: u16) -> Result<(), Self::Error>; +} +``` + +#### Descriptor and Event Suppression + +The descriptor format follows the VIRTIO packed virtqueue specification. Each descriptor +represents a memory buffer in a scatter-gather list. ```rust bitflags! { - #[repr(transparent)] - #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] + /// Descriptor flags as defined by VIRTIO specification. 
pub struct DescFlags: u16 { - const NEXT = 1 << 0; - const WRITE = 1 << 1; - const INDIRECT = 1 << 2; - const AVAIL = 1 << 7; - const USED = 1 << 15; + const NEXT = 1 << 0; // Buffer continues via next descriptor + const WRITE = 1 << 1; // Device write-only (otherwise read-only) + const INDIRECT = 1 << 2; // Buffer contains list of descriptors + const AVAIL = 1 << 7; // Available flag for wrap counter + const USED = 1 << 15; // Used flag for wrap counter } } #[repr(C)] -#[derive(Clone, Copy, Debug, Pod, Zeroable)] pub struct Descriptor { + /// Physical address of the buffer pub addr: u64, + /// Length in bytes (for used: bytes written by device) pub len: u32, + /// Buffer ID for correlating completions with submissions pub id: u16, + /// Flags (NEXT, WRITE, INDIRECT, AVAIL, USED) pub flags: u16, } impl Descriptor { - /// Interpret flags as DescFlags - pub fn flags(&self) -> DescFlags { } - /// Did the driver mark this descriptor in the current driver round? - pub fn is_avail(&self, wrap: bool) -> bool { } - /// Did the device mark this descriptor used in the current device round? - pub fn is_used(&self, wrap: bool) -> bool { } - /// Mark descriptor as available according to the driver's wrap bit. - pub fn mark_avail(&mut self, wrap: bool) { } - /// Mark descriptor as used according to the device's wrap bit. - pub fn mark_used(&mut self, wrap: bool) { } -} + /// Read descriptor with acquire semantics for flags. + /// This is the primary synchronization point for consuming descriptors. + pub fn read_acquire(mem: &M, addr: u64) -> Result; -/// A view into a Descriptor stored in shared memory. -/// -/// Allows reading/writing the descriptor with proper memory ordering. -pub struct DescriptorView<'t> { - base: NonNull, - owner: PhantomData<&'t DescTable<'t>>, -} + /// Write descriptor with release semantics for flags. + /// This is the primary synchronization point for publishing descriptors. + pub fn write_release(&self, mem: &M, addr: u64) -> Result<(), M::Error>; -impl<'t> DescriptorView<'t> { - /// # Safety: base must be valid for reads/writes for 't - pub unsafe fn new(base: NonNull) -> Self { } - /// Read descriptor from memory: Acquire-load flags then volatile-read other fields. - pub fn read(&self) -> Descriptor { } - /// Write descriptor fields except flags (volatile), then publish flags atomically (Release). - pub fn write(&self, desc: &Descriptor) { } + /// Did the driver mark this descriptor available in current round? + pub fn is_avail(&self, wrap: bool) -> bool; + + /// Did the device mark this descriptor used in current round? + pub fn is_used(&self, wrap: bool) -> bool; } bitflags! { - #[repr(transparent)] - #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] pub struct EventFlags: u16 { - /// Always send notifications - const ENABLE = 1 << 0; - /// Never send notifications (polling mode) - const DISABLE = 1 << 1; - /// Only notify when a specific descriptor is processed - const DESC_SPECIFIC = 1 << 2; + const ENABLE = 0x0; // Always notify + const DISABLE = 0x1; // Never notify (polling mode) + const DESC = 0x2; // Notify at specific descriptor index } } -/// Event suppression structure controls notification behavior between driver and device +/// Event suppression structure for controlling notifications. +/// Both sides can control when they want to be notified. 
#[repr(C)] -#[derive(Clone, Copy, Debug, Pod, Zeroable)] pub struct EventSuppression { - /// Packed descriptor event offset - desc: u16, - /// Event flags - flags: u16, + /// bits 0-14: descriptor offset, bit 15: wrap counter + pub off_wrap: u16, + /// bits 0-1: flags (ENABLE/DISABLE/DESC), bits 2-15: reserved + pub flags: u16, } +``` -/// A table of descriptors stored in shared memory. -struct DescTable<'t> { - base: NonNull, - size: u16, - owner: PhantomData<&'t [Descriptor]>, -} +#### Buffer Pool -impl<'t> DescTable<'t> { - /// # Safety: base must be valid for reads/writes for size descriptors - pub unsafe fn init_mem(base: NonNull, size: u16) -> Self { } - /// # Safety: base must be valid for reads/writes for size descriptors - pub unsafe fn from_mem(base: NonNull, size: u16) -> Self { } - /// Get descriptor at index or None if idx is out of bounds - pub fn get(&self, idx: u16) -> Option> { } - /// Set descriptor at index - pub fn set(&self, idx: u16, desc: &Descriptor) { } - /// Get number of descriptors in table - pub fn len(&self) -> u16 { } +An allocator for virtio buffer management. This will operate on guest physical +addresses from page allocator. + +```rust +/// Trait for buffer providers. +pub trait BufferProvider { + /// Allocate at least `len` bytes. + fn alloc(&self, len: usize) -> Result; + + /// Free a previously allocated block. + fn dealloc(&self, alloc: Allocation) -> Result<(), AllocError>; + + /// Resize by trying in-place grow; otherwise reserve new block and free old. + fn resize(&self, old: Allocation, new_len: usize) -> Result; } -/// A buffer element (part of a scatter-gather list). -#[derive(Debug, Clone)] -pub struct BufferElement { - /// Physical address of buffer +/// Allocation result +pub struct Allocation { + /// Starting address of the allocation pub addr: u64, - /// Length in bytes - pub len: u32, + /// Length in bytes rounded up to slab size + pub len: usize, } -/// A buffer returned from the ring after being used by the device. -struct UsedBuffer { - /// Descriptor ID associated with this used buffer - pub id: u16, - /// Length in bytes of data written by device - pub len: u32, -} +/// Two-tier buffer pool with small and large slabs. +/// - Lower tier (L=256): for control messages, small descriptors +/// - Upper tier (U=4096): for larger data buffers +/// Small allocations try lower tier first, falling back to upper on OOM. +pub struct BufferPool; +``` +#### Ring Producer/Consumer -/// Type-state: Can add readable buffers -pub struct Readable; +The core ring types implement the packed virtqueue protocol. The producer submits buffer chains +and polls for completions; the consumer polls for available buffers and marks them used. -/// Type-state: Can add writable buffers (no more readables allowed) -pub struct Writable; +```rust +/// Producer (driver) side of a packed virtqueue. +/// Submits buffer chains for the device to process and polls for completions. 
+pub struct RingProducer { + mem: M, + avail_cursor: RingCursor, // Next available descriptor position + used_cursor: RingCursor, // Next used descriptor position + num_free: usize, // Free slots in the ring + desc_table: DescTable, // Descriptor table in shared memory + id_free: SmallVec<[u16; 256]>, // Stack of free IDs (allows out-of-order completion) + id_num: SmallVec<[u16; 256]>, // Chain length per ID + drv_evt_addr: u64, // Controls when device notifies about used buffers + dev_evt_addr: u64, // Reads device event to check notification preference +} + +impl RingProducer { + /// Submit a buffer chain to the ring. + /// Returns the descriptor ID assigned to this chain. + pub fn submit_available(&mut self, chain: &BufferChain) -> Result; -/// A builder for buffer chains using type-state to enforce readable/writable order. -/// Upholds invariants: -/// - at least one buffer must be present in the chain, -/// - readable buffers must be added before writable buffers. -#[derive(Debug, Default)] -struct BufferChainBuilder { - readables: Vec, - writables: Vec, - marker: PhantomData, + /// Submit with notification check based on device's event suppression settings. + pub fn submit_available_with_notify(&mut self, chain: &BufferChain) -> Result; + + /// Poll for a used buffer. Returns WouldBlock if no completions available. + pub fn poll_used(&mut self) -> Result; + + /// Enable/disable used-buffer notifications from device. + pub fn enable_used_notifications(&mut self) -> Result<(), RingError>; + pub fn disable_used_notifications(&mut self) -> Result<(), RingError>; } -impl BufferChainBuilder { - /// Create a new builder starting in Readable state. - pub fn new() -> Self { } - /// Add a readable buffer (device reads from this). - pub fn readable(mut self, addr: u64, len: u32) -> Self { } - /// Add a writable buffer (device writes to this). This transitions to Writable - /// state so no more readable buffers can be added. - pub fn writable(mut self, addr: u64, len: u32) -> BufferChainBuilder { } - /// Chain must have at least one buffer otherwise an error is returned. - pub fn build(self) -> Result { } +/// Consumer (device) side of a packed virtqueue. +/// Polls for available buffer chains and marks them as used after processing. +pub struct RingConsumer { + mem: M, + avail_cursor: RingCursor, + used_cursor: RingCursor, + desc_table: DescTable, + id_num: SmallVec<[u16; 256]>, // Per-ID chain length learned when polling + num_inflight: usize, + drv_evt_addr: u64, + dev_evt_addr: u64, } +impl RingConsumer { + /// Poll for an available buffer chain. + /// Returns the chain ID and BufferChain containing all buffers. + pub fn poll_available(&mut self) -> Result<(u16, BufferChain), RingError>; -impl BufferChainBuilder { - /// Add writable buffer - pub fn writable(mut self, addr: u64, len: u32) -> Self { } - /// Build the buffer chain. - pub fn build(self) -> Result { } -} + /// Mark a chain as used. `written_len` is total bytes written to writable buffers. + pub fn submit_used(&mut self, id: u16, written_len: u32) -> Result<(), RingError>; -#[derive(Debug, Default)] -struct BufferChain { - readables: Vec, - writables: Vec, + /// Submit used with notification check based on driver's event suppression settings. 
+ pub fn submit_used_with_notify(&mut self, id: u16, len: u32) -> Result; } -impl BufferChain { - /// Get slice of readable buffers - pub fn readables(&self) -> &[BufferElement] { } - /// Get slice of writable buffers - pub fn writables(&self) -> &[BufferElement] { } +/// Result of submitting a buffer to the ring. +pub struct SubmitResult { + pub id: u16, // Descriptor ID assigned + pub notify: bool, // Whether to notify the other side } -#[derive(Debug)] -struct RingProducer<'t> { - /// Next available descriptor position - avail_cursor: RingCursor, - /// Next used descriptor position - used_cursor: RingCursor, - /// Free slots in the ring - num_free: usize, - /// Descriptor table in shared memory - desc_table: DescTable<'t>, - /// stack of free IDs, allows out-of-order completion - id_free: SmallVec<[u16; DescTable::DEFAULT_LEN]>, - // chain length per ID, index = ID, - id_num: SmallVec<[u16; DescTable::DEFAULT_LEN]>, +/// A buffer returned after being used by the device. +pub struct UsedBuffer { + pub id: u16, // Descriptor ID + pub len: u32, // Bytes written by device } +``` +#### Buffer Chain Builder -/// The producer side of a packed ring. -impl<'t> RingProducer<'t> { - /// Submit a buffer chain to the ring. - pub fn submit(&self, chain: &BufferChain) -> Result { } - /// Poll the ring for a used buffer. - pub fn poll(&self) -> Result { } +Type-state pattern enforces that readable buffers are added before writable buffers, preventing +invalid chain construction at compile time. + +```rust +/// Type-state: Can add readable buffers +pub struct Readable; + +/// Type-state: Can add writable buffers (no more readables allowed) +pub struct Writable; + +/// Builder for buffer chains using type-state to enforce readable/writable order. +/// Invariants enforced by the type system: +/// - At least one buffer must be present in the chain +/// - Readable buffers must be added before writable buffers +pub struct BufferChainBuilder { + elems: SmallVec<[BufferElement; 16]>, + split: usize, // Index separating readables from writables + marker: PhantomData, } +impl BufferChainBuilder { + /// Create a new builder in Readable state. + pub fn new() -> Self; -/// The consumer side of a packed ring. -#[derive(Debug)] -pub struct RingConsumer<'t> { - /// Cursor for reading available (driver-published) descriptors - avail_cursor: RingCursor, - /// Cursor for writing used descriptors - used_cursor: RingCursor, - /// Shared descriptor table - desc_table: DescTable<'t>, - /// Per-ID chain length learned when polling (index = ID) - id_num: SmallVec<[u16; DescTable::DEFAULT_LEN]>, + /// Add a readable buffer (device reads from this). + pub fn readable(self, addr: u64, len: u32) -> Self; + + /// Add a writable buffer. Transitions to Writable state, + /// preventing further readable additions. + pub fn writable(self, addr: u64, len: u32) -> BufferChainBuilder; + + /// Build the chain. Fails if empty. + pub fn build(self) -> Result; } +impl BufferChainBuilder { + /// Add another writable buffer. + pub fn writable(self, addr: u64, len: u32) -> Self; -impl<'t> RingConsumer<'t> { - /// Poll the ring for an available buffer chain. - pub fn poll(&self) -> Result { } - /// Submit a used buffer back to the ring. - pub fn submit(&self, used: &UsedBuffer) -> Result<(), RingError> { } + /// Build the chain. + pub fn build(self) -> Result; } -``` +/// A chain of buffers ready for submission. 
+pub struct BufferChain { + elems: SmallVec<[BufferElement; 16]>, + split: usize, +} -#### Higher-Level API +impl BufferChain { + pub fn readables(&self) -> &[BufferElement]; + pub fn writables(&self) -> &[BufferElement]; +} +``` -The low-level ring buffer implementation provides the foundation for safe and efficient -communication, but working directly with descriptors, buffer allocation, and notification -suppression requires in-depth knowledge about the virtqueue semantics. The higher-level API aims to -provide an ergonomic, type-safe interface for common communication patterns. Specifically: +#### High-Level Virtqueue API -- abstracts buffer allocation, -- abstracts notification strategy, -- enforces type safety by requiring ring payloads to be `FlatbufferSerializable` +Wraps ring primitives with buffer management and notification handling. This layer abstracts +buffer allocation and provides a request/response model with token-based correlation. For +simplicity this draft omits a batching API. ```rust -use allocator_api2::alloc::{AllocError, Allocator}; - -/// Trait for types that can be serialized/deserialized via flatbuffers -pub trait FlatbufferSerializable: Sized + Sealed { - type Error: Into; - - /// Estimate the serialized size (hint for buffer allocation) - fn size_hint(&self) -> usize; - /// Serialize into the provided buffer - fn serialize(&self, buf: &mut [u8]) -> Result; - /// Deserialize from the provided buffer - fn deserialize(buf: &[u8]) -> Result; +/// Trait for notifying about new requests in the virtqueue. +pub trait Notifier { + fn notify(&self, stats: QueueStats); } -/// Notification strategy trait - determines when to notify the other side about new descriptors. -pub trait NotificationStrategy { - /// Returns true if notification should be sent. - fn should_notify(&self, stats: &RingStats) -> bool; +pub struct QueueStats { + pub num_free: usize, + pub num_inflight: usize, } -/// Notification strategy that will notify the device after each send -struct AlwaysNotify; +/// A token representing a sent request, used for correlating responses. +pub struct Token(pub u16); -impl NotificationStrategy for AlwaysNotify { - fn should_notify(&self, _stats: &RingStats) -> bool { true }; +/// A request received from the driver side. +pub struct Request { + pub token: Token, + pub data: Bytes, } -struct BufferPool { } - -impl Allocator for BufferPool { - /// Allocate a buffer with the given layout from the pool. - fn allocate(&self, layout: Layout) -> Result, AllocError> { } - /// Return buffer to the pool. - unsafe fn deallocate(&self, ptr: NonNull, layout: Layout) { } +/// A response received after the device processes a request. +pub struct Response { + pub token: Token, + pub data: Bytes, + pub written: usize, } -/// Split ring into separate sender and receiver -/// -/// Sender owns the allocator and notification strategy since only -/// it needs to allocate buffers and decide when to notify. -pub struct RingSender -where - A: Allocator, - N: NotificationStrategy, -{ - /// The sync version needs to handle concurrent access properly - inner: Arc>, +/// High-level virtqueue producer for sending requests and receiving responses. +/// Manages buffer allocation and tracks in-flight requests. 
+pub struct VirtqProducer { + inner: RingProducer, + notifier: N, + pool: P, + inflight: Vec>, } -/// Receiver only needs to know the receive type -pub struct RingReceiver -{ - /// The sync version needs to handle concurrent access properly - inner: Arc>, -} +impl VirtqProducer { + /// Send a request with pre-allocated response capacity. + /// Returns a token for correlating with the response. + pub fn send(&mut self, req: &[u8], resp_cap: usize) -> Result; -impl Ring -where - A: Allocator, - N: NotificationStrategy, -{ - /// Split into separate sender and receiver - pub fn split(self) -> (RingSender, RingReceiver) { - unimplemented!("split is not implemented in this example"); - } -} + /// Poll for a single response. Returns None if no completions available. + pub fn poll_once(&mut self) -> Result, VirtqError>; -impl RingSender -where - A: Allocator, - N: NotificationStrategy, -{ - /// Send a message, use token for out-of-order completion - pub fn send(&mut self, message: T) -> Result - where - T: FlatbufferSerializable; - - /// Try to send without blocking - pub fn try_send(&mut self, message: T) -> Result - where - T: FlatbufferSerializable; + /// Drain all available responses, calling `f` for each. + pub fn drain(&mut self, f: impl FnMut(Token, Bytes)) -> Result<(), VirtqError>; } -impl RingReceiver -{ - /// Receive a message of the specified type - pub fn recv(&mut self) -> Result - where - T: FlatbufferSerializable; - - /// Try to receive a message without blocking - pub fn try_recv(&mut self) -> Result - where - T: FlatbufferSerializable; +/// High-level virtqueue consumer for receiving requests and sending responses. +pub struct VirtqConsumer { + inner: RingConsumer, + notifier: N, + inflight: Vec>, } +impl VirtqConsumer { + /// Poll for a single request. Returns None if no requests available. + pub fn poll_once(&mut self, max_req: usize) -> Result, VirtqError>; + + /// Complete a request by sending the response. + pub fn complete(&mut self, token: Token, resp: &[u8]) -> Result<(), VirtqError>; +} ``` ### Test Plan