Commit cdc6f02

feat: add support for VIRTIO_NET_F_MRG_RXBUF to virtio-net

The virtio-net device can now split incoming packets across multiple descriptor chains if VIRTIO_NET_F_MRG_RXBUF is enabled by the guest.

Signed-off-by: Egor Lazarchuk <yegorlz@amazon.co.uk>
1 parent 11a41bd commit cdc6f02
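
Background: with VIRTIO_NET_F_MRG_RXBUF negotiated, the device may spread one received packet across several descriptor chains, and it must report how many chains it consumed in the num_buffers field of the virtio_net_hdr_v1 header at the start of the first buffer. A standalone sketch of the accounting involved (hypothetical chain capacities, not the device code itself):

// Sketch: how many descriptor chains a packet occupies under
// mergeable RX buffers. `chain_capacities` is hypothetical input.
fn buffers_needed(packet_len: usize, chain_capacities: &[usize]) -> Option<u16> {
    let mut remaining = packet_len;
    let mut used: u16 = 0;
    for cap in chain_capacities {
        if remaining == 0 {
            break;
        }
        remaining = remaining.saturating_sub(*cap);
        used += 1;
    }
    // `None` models a partial write: the available chains cannot hold
    // the whole packet, so delivery must resume later.
    (remaining == 0).then_some(used)
}

fn main() {
    // A 5000-byte packet over 2048-byte chains occupies 3 chains,
    // so the device would set num_buffers = 3 in the first header.
    assert_eq!(buffers_needed(5000, &[2048, 2048, 2048]), Some(3));
    // With too little capacity the packet is only partially written.
    assert_eq!(buffers_needed(5000, &[2048]), None);
}

Without the feature a packet must fit in a single chain; with it, the device keeps consuming heads until the packet fits, which is what the rewritten write_frame_to_guest below does.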

File tree

src/vmm/src/devices/virtio/net/device.rs
src/vmm/src/devices/virtio/queue.rs

2 files changed: +157 -99 lines changed


src/vmm/src/devices/virtio/net/device.rs
Lines changed: 156 additions & 98 deletions

@@ -9,20 +9,22 @@
 use std::io::Read;
 use std::mem;
 use std::net::Ipv4Addr;
+use std::num::Wrapping;
 use std::sync::{Arc, Mutex};
 
 use libc::EAGAIN;
-use log::{error, warn};
+use log::error;
 use utils::eventfd::EventFd;
 use utils::net::mac::MacAddr;
-use utils::u64_to_usize;
-use vm_memory::GuestMemoryError;
+use utils::{u64_to_usize, usize_to_u64};
+use vm_memory::{GuestAddress, GuestMemory, GuestMemoryError};
 
 use crate::devices::virtio::device::{DeviceState, IrqTrigger, IrqType, VirtioDevice};
 use crate::devices::virtio::gen::virtio_blk::VIRTIO_F_VERSION_1;
 use crate::devices::virtio::gen::virtio_net::{
     virtio_net_hdr_v1, VIRTIO_NET_F_CSUM, VIRTIO_NET_F_GUEST_CSUM, VIRTIO_NET_F_GUEST_TSO4,
     VIRTIO_NET_F_GUEST_UFO, VIRTIO_NET_F_HOST_TSO4, VIRTIO_NET_F_HOST_UFO, VIRTIO_NET_F_MAC,
+    VIRTIO_NET_F_MRG_RXBUF,
 };
 use crate::devices::virtio::gen::virtio_ring::VIRTIO_RING_F_EVENT_IDX;
 use crate::devices::virtio::iovec::IoVecBuffer;
@@ -31,7 +33,7 @@ use crate::devices::virtio::net::tap::Tap;
 use crate::devices::virtio::net::{
     gen, NetError, NetQueue, MAX_BUFFER_SIZE, NET_QUEUE_SIZES, RX_INDEX, TX_INDEX,
 };
-use crate::devices::virtio::queue::{DescriptorChain, Queue};
+use crate::devices::virtio::queue::Queue;
 use crate::devices::virtio::{ActivateError, TYPE_NET};
 use crate::devices::{report_net_event_fail, DeviceError};
 use crate::dumbo::pdu::arp::ETH_IPV4_FRAME_LEN;
@@ -46,16 +48,14 @@ const FRAME_HEADER_MAX_LEN: usize = PAYLOAD_OFFSET + ETH_IPV4_FRAME_LEN;
 
 #[derive(Debug, thiserror::Error, displaydoc::Display)]
 enum FrontendError {
-    /// Add user.
-    AddUsed,
-    /// Descriptor chain too mall.
-    DescriptorChainTooSmall,
     /// Empty queue.
     EmptyQueue,
     /// Guest memory error: {0}
     GuestMemory(GuestMemoryError),
-    /// Read only descriptor.
-    ReadOnlyDescriptor,
+    /// Attempt to write an empty packet.
+    AttemptToWriteEmptyPacket,
+    /// Attempt to use more than 1 head to process a packet without VIRTIO_NET_F_MRG_RXBUF support.
+    NoMrgBufSupport,
 }
 
 pub(crate) const fn vnet_hdr_len() -> usize {
@@ -102,6 +102,20 @@ pub struct ConfigSpace {
 // SAFETY: `ConfigSpace` contains only PODs in `repr(C)` or `repr(transparent)`, without padding.
 unsafe impl ByteValued for ConfigSpace {}
 
+// This struct contains information about a partially
+// written packet.
+#[derive(Debug)]
+struct PartialWrite {
+    // Number of bytes written so far.
+    bytes_written: usize,
+    // Number of descriptor heads used for the packet.
+    used_heads: u16,
+    // Guest address of the first buffer used for the packet.
+    // This will be used to set the number of descriptor heads used
+    // to store the whole packet.
+    packet_start_addr: GuestAddress,
+}
+
 /// VirtIO network device.
 ///
 /// It emulates a network device able to exchange L2 frames between the guest
@@ -126,6 +140,7 @@ pub struct Net {
 
     rx_bytes_read: usize,
     rx_frame_buf: [u8; MAX_BUFFER_SIZE],
+    rx_partial_write: Option<PartialWrite>,
 
     tx_frame_headers: [u8; frame_hdr_len()],
 
@@ -159,6 +174,7 @@ impl Net {
             | 1 << VIRTIO_NET_F_HOST_TSO4
             | 1 << VIRTIO_NET_F_HOST_UFO
             | 1 << VIRTIO_F_VERSION_1
+            | 1 << VIRTIO_NET_F_MRG_RXBUF
             | 1 << VIRTIO_RING_F_EVENT_IDX;
 
         let mut config_space = ConfigSpace::default();
@@ -188,6 +204,7 @@ impl Net {
             rx_deferred_frame: false,
             rx_bytes_read: 0,
             rx_frame_buf: [0u8; MAX_BUFFER_SIZE],
+            rx_partial_write: None,
            tx_frame_headers: [0u8; frame_hdr_len()],
             irq_trigger: IrqTrigger::new().map_err(NetError::EventFd)?,
             config_space,
@@ -316,7 +333,7 @@
         }
 
         // Attempt frame delivery.
-        let success = self.write_frame_to_guest();
+        let success = self.write_frame_to_guest().is_ok();
 
         // Undo the tokens consumption if guest delivery failed.
         if !success {
@@ -327,108 +344,149 @@
         success
     }
 
-    /// Write a slice in a descriptor chain
-    ///
-    /// # Errors
-    ///
-    /// Returns an error if the descriptor chain is too short or
-    /// an inappropriate (read only) descriptor is found in the chain
-    fn write_to_descriptor_chain(
-        mem: &GuestMemoryMmap,
-        data: &[u8],
-        head: DescriptorChain,
-        net_metrics: &NetDeviceMetrics,
-    ) -> Result<(), FrontendError> {
-        let mut chunk = data;
-        let mut next_descriptor = Some(head);
+    fn write_frame_to_guest(&mut self) -> Result<(), FrontendError> {
+        // This is safe since we checked in the event handler that the device is activated.
+        let mem = self.device_state.mem().unwrap();
+
+        // Check if the queue is empty and error out early
+        if self.queues[RX_INDEX].is_empty(mem) {
+            return Err(FrontendError::EmptyQueue);
+        }
 
-        while let Some(descriptor) = &next_descriptor {
-            if !descriptor.is_write_only() {
-                return Err(FrontendError::ReadOnlyDescriptor);
+        let actual_size = self.queues[RX_INDEX].actual_size();
+
+        let (mut slice, mut packet_start_addr, mut used_heads, mut next_used) =
+            if let Some(pw) = &self.rx_partial_write {
+                (
+                    &self.rx_frame_buf[pw.bytes_written..self.rx_bytes_read],
+                    Some(pw.packet_start_addr),
+                    pw.used_heads,
+                    self.queues[RX_INDEX].next_used + Wrapping(pw.used_heads),
+                )
+            } else {
+                (
+                    &self.rx_frame_buf[..self.rx_bytes_read],
+                    None,
+                    0,
+                    self.queues[RX_INDEX].next_used,
+                )
+            };
+
+        while !self.queues[RX_INDEX].is_empty(mem) && !slice.is_empty() {
+            used_heads += 1;
+
+            if !self.has_feature(u64::from(VIRTIO_NET_F_MRG_RXBUF)) && 1 < used_heads {
+                return Err(FrontendError::NoMrgBufSupport);
             }
 
-            let len = std::cmp::min(chunk.len(), descriptor.len as usize);
-            match mem.write_slice(&chunk[..len], descriptor.addr) {
-                Ok(()) => {
-                    net_metrics.rx_count.inc();
-                    chunk = &chunk[len..];
-                }
-                Err(err) => {
-                    error!("Failed to write slice: {:?}", err);
-                    if let GuestMemoryError::PartialBuffer { .. } = err {
-                        net_metrics.rx_partial_writes.inc();
-                    }
-                    return Err(FrontendError::GuestMemory(err));
-                }
+            // SAFETY:
+            // This should never panic as we just checked that the queue is not empty.
+            let mut desc = self.queues[RX_INDEX].do_pop_unchecked(mem).unwrap();
+
+            let head_desc_index = desc.index;
+            let mut desc_len = 0;
+
+            // If this is the first head of the packet, save it for later
+            if packet_start_addr.is_none() {
+                packet_start_addr = Some(desc.addr)
            }
 
-            // If chunk is empty we are done here.
-            if chunk.is_empty() {
-                let len = data.len() as u64;
-                net_metrics.rx_bytes_count.add(len);
-                net_metrics.rx_packets_count.inc();
-                return Ok(());
+            // Handle the descriptor chain head
+            let len = slice.len().min(desc.len as usize);
+            mem.write_slice(&slice[..len], desc.addr)
+                .map_err(FrontendError::GuestMemory)?;
+            desc_len += len;
+            slice = &slice[len..];
+
+            // Handle the following descriptors
+            while desc.has_next() && !slice.is_empty() {
+                // SAFETY:
+                // This should never panic as we just checked there is a next descriptor.
+                desc = desc.next_descriptor().unwrap();
+
+                let len = slice.len().min(desc.len as usize);
+                mem.write_slice(&slice[..len], desc.addr)
+                    .map_err(FrontendError::GuestMemory)?;
+                desc_len += len;
+                slice = &slice[len..];
             }
 
-            next_descriptor = descriptor.next_descriptor();
+            // Add the used descriptor head to the used ring
+            let next_used_index = next_used.0 % actual_size;
+            // SAFETY:
+            // This should never panic as we provide an index within
+            // correct bounds.
+            self.queues[RX_INDEX]
+                .write_used_ring(
+                    mem,
+                    next_used_index,
+                    head_desc_index,
+                    u32::try_from(desc_len).unwrap(),
+                )
+                .unwrap();
+
+            // We don't update the queue's internal next_used value just yet.
+            // This is done to prevent giving information to the guest
+            // about descriptor heads we used for partial packets.
+            next_used += Wrapping(1);
         }
 
-        warn!("Receiving buffer is too small to hold frame of current size");
-        Err(FrontendError::DescriptorChainTooSmall)
-    }
+        let packet_start_addr =
+            packet_start_addr.ok_or(FrontendError::AttemptToWriteEmptyPacket)?;
 
-    // Copies a single frame from `self.rx_frame_buf` into the guest.
-    fn do_write_frame_to_guest(&mut self) -> Result<(), FrontendError> {
-        // This is safe since we checked in the event handler that the device is activated.
-        let mem = self.device_state.mem().unwrap();
-
-        let queue = &mut self.queues[RX_INDEX];
-        let head_descriptor = queue.pop_or_enable_notification(mem).ok_or_else(|| {
-            self.metrics.no_rx_avail_buffer.inc();
-            FrontendError::EmptyQueue
-        })?;
-        let head_index = head_descriptor.index;
+        if slice.is_empty() {
+            // Packet was fully written.
+            self.metrics
+                .rx_bytes_count
+                .add(usize_to_u64(self.rx_bytes_read));
+            self.metrics.rx_packets_count.inc();
+
+            // TODO: can be simplified by usage of `std::mem::offset_of`
+            // when the Rust version is updated.
+            //
+            // Update the number of descriptor heads used to store the packet.
+            // SAFETY:
+            // The packet_start_addr is a valid guest address and we check
+            // memory boundaries.
+            #[allow(clippy::transmute_ptr_to_ref)]
+            let header: &mut virtio_net_hdr_v1 = unsafe {
+                let header_slice = mem
+                    .get_slice(packet_start_addr, std::mem::size_of::<virtio_net_hdr_v1>())
+                    .map_err(FrontendError::GuestMemory)?;
+                std::mem::transmute(header_slice.ptr_guard_mut().as_ptr())
+            };
+            header.num_buffers = used_heads;
 
-        let result = Self::write_to_descriptor_chain(
-            mem,
-            &self.rx_frame_buf[..self.rx_bytes_read],
-            head_descriptor,
-            &self.metrics,
-        );
-        // Mark the descriptor chain as used. If an error occurred, skip the descriptor chain.
-        let used_len = if result.is_err() {
-            self.metrics.rx_fails.inc();
-            0
-        } else {
-            // Safe to unwrap because a frame must be smaller than 2^16 bytes.
-            u32::try_from(self.rx_bytes_read).unwrap()
-        };
-        queue.add_used(mem, head_index, used_len).map_err(|err| {
-            error!("Failed to add available descriptor {}: {}", head_index, err);
-            FrontendError::AddUsed
-        })?;
+            // Update the queue's internals
+            self.queues[RX_INDEX].next_used = next_used;
+            self.queues[RX_INDEX].num_added += Wrapping(used_heads);
 
-        result
-    }
+            // Update the used ring with what we used to process the packet
+            self.queues[RX_INDEX].set_used_ring_idx(next_used.0, mem);
+            let next_avail = self.queues[RX_INDEX].next_avail.0;
+            self.queues[RX_INDEX].set_used_ring_avail_event(next_avail, mem);
 
-    // Copies a single frame from `self.rx_frame_buf` into the guest. In case of an error retries
-    // the operation if possible. Returns true if the operation was successfull.
-    fn write_frame_to_guest(&mut self) -> bool {
-        let max_iterations = self.queues[RX_INDEX].actual_size();
-        for _ in 0..max_iterations {
-            match self.do_write_frame_to_guest() {
-                Ok(()) => return true,
-                Err(FrontendError::EmptyQueue) | Err(FrontendError::AddUsed) => {
-                    return false;
-                }
-                Err(_) => {
-                    // retry
-                    continue;
-                }
+            // Clear the partial write info if there was one
+            self.rx_partial_write = None;
+            Ok(())
+        } else {
+            // The packet could not be fully written to the guest.
+            // Save the necessary info to use during the next invocation.
+            self.metrics.rx_partial_writes.inc();
+
+            if let Some(pw) = &mut self.rx_partial_write {
+                pw.bytes_written = self.rx_bytes_read - slice.len();
+                pw.used_heads = used_heads;
+            } else {
+                let pw = PartialWrite {
+                    bytes_written: self.rx_bytes_read - slice.len(),
                    used_heads,
                    packet_start_addr,
                };
                self.rx_partial_write = Some(pw);
            }
+            Err(FrontendError::EmptyQueue)
         }
-
-        false
     }
 
     // Tries to detour the frame to MMDS and if MMDS doesn't accept it, sends it on the host TAP.
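
A note on ordering in the rewritten write_frame_to_guest: each filled descriptor head is written into the used ring immediately (write_used_ring), but the ring's guest-visible idx is only advanced (set_used_ring_idx) once the whole packet has been copied, so the guest can never observe a half-delivered packet. A toy model of that staging pattern (the UsedRing type and its methods here are illustrative, not Firecracker's Queue API):

// Toy model of "stage used elements, publish the index last".
struct UsedRing {
    entries: Vec<(u16, u32)>, // (descriptor head index, bytes written)
    idx: u16,                 // the only field the guest polls for progress
}

impl UsedRing {
    // Record a used element without making it visible to the guest yet.
    fn stage(&mut self, slot: u16, head: u16, len: u32) {
        let size = self.entries.len();
        self.entries[slot as usize % size] = (head, len);
    }

    // Make all staged elements visible at once. In a real device this
    // store needs release ordering so the guest observes the entries
    // before the updated index.
    fn publish(&mut self, new_idx: u16) {
        self.idx = new_idx;
    }
}

fn main() {
    let mut ring = UsedRing { entries: vec![(0, 0); 4], idx: 0 };
    // A packet spanning two descriptor heads: stage both entries...
    ring.stage(0, 7, 1500);
    ring.stage(1, 3, 548);
    // ...then publish once, so the packet appears atomically.
    ring.publish(2);
    assert_eq!(ring.idx, 2);
}

This is also why a partial write returns Err(FrontendError::EmptyQueue) after saving PartialWrite state: the staged entries stay private until a later invocation completes the packet and bumps the index.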

src/vmm/src/devices/virtio/queue.rs
Lines changed: 1 addition & 1 deletion

@@ -381,7 +381,7 @@ impl Queue {
     /// # Important
     /// This is an internal method that ASSUMES THAT THERE ARE AVAILABLE DESCRIPTORS. Otherwise it
     /// will retrieve a descriptor that contains garbage data (obsolete/empty).
-    fn do_pop_unchecked<'b, M: GuestMemory>(
+    pub fn do_pop_unchecked<'b, M: GuestMemory>(
         &mut self,
         mem: &'b M,
     ) -> Option<DescriptorChain<'b, M>> {
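
This visibility change exists because device.rs now pops RX descriptors itself, pairing every do_pop_unchecked call with a preceding is_empty check as the method's documented assumption requires. A self-contained sketch of that caller-side contract (ToyQueue is illustrative, not the actual Queue type):

// Sketch of the "check emptiness, then pop unchecked" contract.
struct ToyQueue {
    avail: Vec<u16>, // available descriptor head indices
}

impl ToyQueue {
    fn is_empty(&self) -> bool {
        self.avail.is_empty()
    }

    // Assumes the caller has verified availability, mirroring
    // do_pop_unchecked's documented precondition.
    fn do_pop_unchecked(&mut self) -> u16 {
        self.avail.remove(0)
    }
}

fn main() {
    let mut q = ToyQueue { avail: vec![4, 9] };
    while !q.is_empty() {
        let head = q.do_pop_unchecked(); // safe: emptiness checked above
        println!("popped descriptor head {head}");
    }
}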
