Skip to content
This repository has been archived by the owner on Jan 22, 2025. It is now read-only.

Commit

Permalink
Page-pin packet memory for cuda (#4250)
Browse files Browse the repository at this point in the history
* Page-pin packet memory for cuda

Bring back recyclers and pin offset buffers

* Add packet recycler to streamer

* Add set_pinnable to sigverify vecs to pin them

* Add packets reset test

* Add test for recycler and reduce the gc lock critical section
* Add comments/tests to cuda_runtime

* Add recycler to recv_blobs path.

* Add trace/names for debug and PacketsRecycler to bench-streamer

* Predict realloc and unpin beforehand.

* Add helper to reserve and pin

* Cap buffered packets length

* Call cuda wrapper functions
  • Loading branch information
sakridge authored Jun 27, 2019
1 parent 44a5724 commit fbea9d8
Show file tree
Hide file tree
Showing 13 changed files with 613 additions and 56 deletions.
12 changes: 10 additions & 2 deletions bench-streamer/src/main.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
use clap::{crate_description, crate_name, crate_version, App, Arg};
use solana::packet::PacketsRecycler;
use solana::packet::{Packet, Packets, BLOB_SIZE, PACKET_DATA_SIZE};
use solana::result::Result;
use solana::streamer::{receiver, PacketReceiver};
Expand All @@ -16,7 +17,7 @@ fn producer(addr: &SocketAddr, exit: Arc<AtomicBool>) -> JoinHandle<()> {
let send = UdpSocket::bind("0.0.0.0:0").unwrap();
let mut msgs = Packets::default();
msgs.packets.resize(10, Packet::default());
for w in &mut msgs.packets {
for w in msgs.packets.iter_mut() {
w.meta.size = PACKET_DATA_SIZE;
w.meta.set_addr(&addr);
}
Expand Down Expand Up @@ -74,6 +75,7 @@ fn main() -> Result<()> {

let mut read_channels = Vec::new();
let mut read_threads = Vec::new();
let recycler = PacketsRecycler::default();
for _ in 0..num_sockets {
let read = solana_netutil::bind_to(port, false).unwrap();
read.set_read_timeout(Some(Duration::new(1, 0))).unwrap();
Expand All @@ -83,7 +85,13 @@ fn main() -> Result<()> {

let (s_reader, r_reader) = channel();
read_channels.push(r_reader);
read_threads.push(receiver(Arc::new(read), &exit, s_reader));
read_threads.push(receiver(
Arc::new(read),
&exit,
s_reader,
recycler.clone(),
"bench-streamer-test",
));
}

let t_producer1 = producer(&addr, exit.clone());
Expand Down
5 changes: 4 additions & 1 deletion core/benches/sigverify.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
extern crate test;

use solana::packet::to_packets;
use solana::recycler::Recycler;
use solana::sigverify;
use solana::test_tx::test_tx;
use test::Bencher;
Expand All @@ -14,8 +15,10 @@ fn bench_sigverify(bencher: &mut Bencher) {
// generate packet vector
let batches = to_packets(&vec![tx; 128]);

let recycler = Recycler::default();
let recycler_out = Recycler::default();
// verify packets
bencher.iter(|| {
let _ans = sigverify::ed25519_verify(&batches);
let _ans = sigverify::ed25519_verify(&batches, &recycler, &recycler_out);
})
}
3 changes: 3 additions & 0 deletions core/src/banking_stage.rs
Original file line number Diff line number Diff line change
Expand Up @@ -818,6 +818,9 @@ impl BankingStage {
packet_indexes: Vec<usize>,
) {
if !packet_indexes.is_empty() {
if unprocessed_packets.len() > 400 {
unprocessed_packets.remove(0);
}
unprocessed_packets.push((packets, packet_indexes));
}
}
Expand Down
297 changes: 297 additions & 0 deletions core/src/cuda_runtime.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,297 @@
// Module for cuda-related helper functions and wrappers.
//
// cudaHostRegister/cudaHostUnregister -
// APIs for page-pinning memory. The cuda driver/hardware cannot overlap
// copies from host memory to GPU memory unless the host memory is page-pinned,
// i.e. it cannot be paged out to disk. The cuda driver provides these
// interfaces to pin and unpin memory.

use crate::recycler::Reset;
#[cfg(feature = "cuda")]
use crate::sigverify::{cuda_host_register, cuda_host_unregister};
use std::ops::{Deref, DerefMut};

#[cfg(feature = "cuda")]
use std::mem::size_of;

#[cfg(feature = "cuda")]
use core::ffi::c_void;

#[cfg(feature = "cuda")]
use std::os::raw::c_int;

// Return code that `pin`/`unpin` treat as success; any other value returned by
// the cuda host register/unregister wrappers is logged as an error.
#[cfg(feature = "cuda")]
const CUDA_SUCCESS: c_int = 0;

// Registers the buffer backing `_mem` with the cuda driver so it is page-pinned.
//
// The registered region starts at the vector's buffer pointer and spans its
// full capacity (`capacity * size_of::<T>()` bytes), not just the initialized
// length. A failing registration is only logged; nothing is returned to the
// caller. Without the "cuda" feature this function does nothing.
pub fn pin<T>(_mem: &mut Vec<T>) {
    #[cfg(feature = "cuda")]
    {
        let bytes = _mem.capacity() * size_of::<T>();
        // Only the driver call itself needs `unsafe`.
        let err = unsafe { cuda_host_register(_mem.as_mut_ptr() as *mut c_void, bytes, 0) };
        if err != CUDA_SUCCESS {
            error!(
                "cudaHostRegister error: {} ptr: {:?} bytes: {}",
                err,
                _mem.as_ptr(),
                bytes
            );
        }
    }
}

// Unregisters a previously pinned buffer from the cuda driver.
//
// `_mem` is handed to the driver as-is, so it must be the buffer pointer that
// was registered by `pin` (i.e. the vector's data pointer). A failing
// unregistration is only logged. Without the "cuda" feature this function does
// nothing.
pub fn unpin<T>(_mem: *mut T) {
    #[cfg(feature = "cuda")]
    {
        // Only the driver call itself needs `unsafe`.
        let err = unsafe { cuda_host_unregister(_mem as *mut c_void) };
        if err != CUDA_SUCCESS {
            error!("cudaHostUnregister returned: {} ptr: {:?}", err, _mem);
        }
    }
}

// A vector wrapper where the underlying memory can be
// page-pinned. Controlled by flags in case user only wants
// to pin in certain circumstances.
//
// Fields:
//   x        - the wrapped vector; its buffer is what gets pinned/unpinned.
//   pinned   - true while `x`'s current buffer is considered registered via
//              `pin`; checked on drop to decide whether to call `unpin`.
//   pinnable - opt-in flag set by `set_pinnable`; when set, `push`/`resize`
//              re-pin the buffer after its pointer or capacity changes
//              (only in builds with the "cuda" feature).
#[derive(Debug)]
pub struct PinnedVec<T> {
x: Vec<T>,
pinned: bool,
pinnable: bool,
}

// Resetting a byte vector empties it by resizing to zero elements. This goes
// through `PinnedVec::resize`, so the pin bookkeeping there still applies.
impl Reset for PinnedVec<u8> {
    fn reset(&mut self) {
        self.resize(0, u8::default());
    }
}

// Same as above for vectors of `u32`.
impl Reset for PinnedVec<u32> {
    fn reset(&mut self) {
        self.resize(0, u32::default());
    }
}

// The default `PinnedVec` wraps an empty vector and is neither pinned nor
// pinnable.
impl<T: Clone> Default for PinnedVec<T> {
    fn default() -> Self {
        // `from_vec` starts with both flags cleared.
        Self::from_vec(Vec::new())
    }
}

// Read-only access to the wrapped vector, so `Vec`/slice methods that
// `PinnedVec` does not define itself can be called directly on it.
impl<T> Deref for PinnedVec<T> {
    type Target = Vec<T>;

    fn deref(&self) -> &Vec<T> {
        &self.x
    }
}

// Mutable access to the wrapped vector. The inner `Vec` is returned directly;
// no pin/unpin bookkeeping is performed on this path.
impl<T> DerefMut for PinnedVec<T> {
    fn deref_mut(&mut self) -> &mut Self::Target {
        &mut self.x
    }
}

// Shared-reference iterator over the elements of a `PinnedVec`; a thin wrapper
// around the slice iterator of the wrapped vector.
pub struct PinnedIter<'a, T>(std::slice::Iter<'a, T>);

// Mutable-reference iterator over the elements of a `PinnedVec`; a thin
// wrapper around the mutable slice iterator of the wrapped vector.
pub struct PinnedIterMut<'a, T>(std::slice::IterMut<'a, T>);

impl<'a, T> Iterator for PinnedIter<'a, T> {
    type Item = &'a T;

    fn next(&mut self) -> Option<Self::Item> {
        self.0.next()
    }

    // Forward the exact bounds of the slice iterator. The trait default is
    // `(0, None)`, which prevents `collect`/`extend` from preallocating.
    fn size_hint(&self) -> (usize, Option<usize>) {
        self.0.size_hint()
    }
}

// The wrapped slice iterator knows its exact remaining length, and `size_hint`
// above forwards it, so the default `len()` is correct.
impl<'a, T> ExactSizeIterator for PinnedIter<'a, T> {}

impl<'a, T> Iterator for PinnedIterMut<'a, T> {
    type Item = &'a mut T;

    fn next(&mut self) -> Option<Self::Item> {
        self.0.next()
    }

    // See `PinnedIter::size_hint`.
    fn size_hint(&self) -> (usize, Option<usize>) {
        self.0.size_hint()
    }
}

impl<'a, T> ExactSizeIterator for PinnedIterMut<'a, T> {}

// Allows `for item in &mut pinned_vec`.
// NOTE(review): this impl yields shared references (`&T`) through
// `PinnedIter`, even though it is implemented for `&mut PinnedVec<T>`; callers
// that need to mutate elements have to use `iter_mut()` instead. Confirm
// whether this is intended before changing `Item`, since that would alter the
// public interface.
impl<'a, T> IntoIterator for &'a mut PinnedVec<T> {
    type Item = &'a T;
    type IntoIter = PinnedIter<'a, T>;

    fn into_iter(self) -> PinnedIter<'a, T> {
        PinnedIter(self.x.iter())
    }
}

// Allows `for item in &pinned_vec`, yielding shared references to the elements.
impl<'a, T> IntoIterator for &'a PinnedVec<T> {
    type Item = &'a T;
    type IntoIter = PinnedIter<'a, T>;

    fn into_iter(self) -> PinnedIter<'a, T> {
        PinnedIter(self.x.iter())
    }
}

impl<T: Clone> PinnedVec<T> {
    // Marks the vector pinnable, grows it if its capacity is below `size`, and
    // makes sure the resulting buffer is pinned.
    //
    // When growth is needed, `size` is passed straight to `Vec::reserve`. The
    // currently registered buffer is unpinned first because the reserve may
    // reallocate it.
    pub fn reserve_and_pin(&mut self, size: usize) {
        if self.x.capacity() < size {
            // Unregister the *buffer* pointer (the same address `pin`
            // registered), not the address of the `Vec` header itself.
            self.unpin_current();
            self.x.reserve(size);
        }
        self.set_pinnable();
        if !self.pinned {
            pin(&mut self.x);
            self.pinned = true;
        }
    }

    // Opts this vector in to being (re-)pinned by `push`/`resize` whenever its
    // buffer pointer or capacity changes.
    pub fn set_pinnable(&mut self) {
        self.pinnable = true;
    }

    // Wraps an existing vector. The result is neither pinned nor pinnable.
    pub fn from_vec(source: Vec<T>) -> Self {
        Self {
            x: source,
            pinned: false,
            pinnable: false,
        }
    }

    // Creates an empty vector with the given capacity. The result is neither
    // pinned nor pinnable.
    pub fn with_capacity(capacity: usize) -> Self {
        let x = Vec::with_capacity(capacity);
        Self {
            x,
            pinned: false,
            pinnable: false,
        }
    }

    // Iterates over shared references to the elements.
    pub fn iter(&self) -> PinnedIter<T> {
        PinnedIter(self.x.iter())
    }

    // Iterates over mutable references to the elements.
    pub fn iter_mut(&mut self) -> PinnedIterMut<T> {
        PinnedIterMut(self.x.iter_mut())
    }

    // Returns true when the vector holds no elements.
    pub fn is_empty(&self) -> bool {
        self.x.is_empty()
    }

    // Returns the number of elements currently stored.
    pub fn len(&self) -> usize {
        self.x.len()
    }

    // Raw const pointer to the start of the buffer (cuda builds only).
    #[cfg(feature = "cuda")]
    pub fn as_ptr(&self) -> *const T {
        self.x.as_ptr()
    }

    // Raw mutable pointer to the start of the buffer (cuda builds only).
    #[cfg(feature = "cuda")]
    pub fn as_mut_ptr(&mut self) -> *mut T {
        self.x.as_mut_ptr()
    }

    // Appends `x`, keeping the pin state in sync with the underlying buffer.
    pub fn push(&mut self, x: T) {
        let old_ptr = self.x.as_mut_ptr();
        let old_capacity = self.x.capacity();
        // Predict realloc and unpin: a full vector has to grow to take one
        // more element, so release the old registration before it happens.
        if self.x.capacity() == self.x.len() {
            self.unpin_current();
        }
        self.x.push(x);
        self.check_ptr(old_ptr, old_capacity, "push");
    }

    // Resizes to `size` elements (filling with clones of `elem`), keeping the
    // pin state in sync with the underlying buffer.
    pub fn resize(&mut self, size: usize, elem: T) {
        let old_ptr = self.x.as_mut_ptr();
        let old_capacity = self.x.capacity();
        // Predict realloc and unpin: growing past the current capacity will
        // reallocate, so release the old registration before it happens.
        if self.x.capacity() < size {
            self.unpin_current();
        }
        self.x.resize(size, elem);
        self.check_ptr(old_ptr, old_capacity, "resize");
    }

    // Unregisters the current buffer if it is marked pinned and clears the
    // flag. Must be called before an operation that may reallocate, while the
    // registered pointer is still the vector's buffer pointer.
    fn unpin_current(&mut self) {
        if self.pinned {
            unpin(self.x.as_mut_ptr());
            self.pinned = false;
        }
    }

    // After a mutating operation, re-pins the buffer if the vector is pinnable
    // and its buffer pointer or capacity differs from the values captured
    // before the operation. Does nothing without the "cuda" feature.
    fn check_ptr(&mut self, _old_ptr: *mut T, _old_capacity: usize, _from: &'static str) {
        #[cfg(feature = "cuda")]
        {
            if self.pinnable && (self.x.as_ptr() != _old_ptr || self.x.capacity() != _old_capacity)
            {
                // Still marked pinned means the change was not predicted by
                // the caller; drop the registration of the old buffer first.
                if self.pinned {
                    unpin(_old_ptr);
                }

                trace!(
                    "pinning from check_ptr old: {} size: {} from: {}",
                    _old_capacity,
                    self.x.capacity(),
                    _from
                );
                pin(&mut self.x);
                self.pinned = true;
            }
        }
    }
}

// Cloning copies the elements into a fresh vector. The copy inherits the
// `pinnable` flag, and its new buffer is pinned if the source is currently
// pinned.
impl<T: Clone> Clone for PinnedVec<T> {
    fn clone(&self) -> Self {
        let mut copy = Self {
            x: self.x.clone(),
            pinned: false,
            pinnable: self.pinnable,
        };
        if self.pinned {
            // The clone owns a different buffer, which needs its own
            // registration.
            pin(&mut copy.x);
            copy.pinned = true;
        }
        debug!(
            "clone PinnedVec: size: {} pinned?: {} pinnable?: {}",
            self.x.capacity(),
            self.pinned,
            self.pinnable
        );
        copy
    }
}

// Releases the driver registration (if any) when the vector goes away; the
// wrapped `Vec` frees its own memory afterwards as usual.
impl<T> Drop for PinnedVec<T> {
    fn drop(&mut self) {
        if !self.pinned {
            return;
        }
        unpin(self.x.as_mut_ptr());
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    // Exercises push/resize/indexing/len/is_empty/iter on a pinnable vector.
    #[test]
    fn test_pinned_vec() {
        let mut mem = PinnedVec::with_capacity(10);
        mem.set_pinnable();
        mem.push(50);
        mem.resize(2, 10);

        // Indexing goes through `Deref` to the wrapped `Vec`.
        assert_eq!(mem[0], 50);
        assert_eq!(mem[1], 10);
        assert_eq!(mem.len(), 2);
        assert!(!mem.is_empty());

        // The iterator yields the same elements in order and then stops.
        let mut iter = mem.iter();
        for expected in &[50, 10] {
            assert_eq!(iter.next().unwrap(), expected);
        }
        assert_eq!(iter.next(), None);
    }
}
14 changes: 11 additions & 3 deletions core/src/fetch_stage.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
use crate::banking_stage::FORWARD_TRANSACTIONS_TO_LEADER_AT_SLOT_OFFSET;
use crate::poh_recorder::PohRecorder;
use crate::recycler::Recycler;
use crate::result::{Error, Result};
use crate::service::Service;
use crate::streamer::{self, PacketReceiver, PacketSender};
Expand Down Expand Up @@ -87,9 +88,16 @@ impl FetchStage {
sender: &PacketSender,
poh_recorder: &Arc<Mutex<PohRecorder>>,
) -> Self {
let tpu_threads = sockets
.into_iter()
.map(|socket| streamer::receiver(socket, &exit, sender.clone()));
let recycler = Recycler::default();
let tpu_threads = sockets.into_iter().map(|socket| {
streamer::receiver(
socket,
&exit,
sender.clone(),
recycler.clone(),
"fetch_stage",
)
});

let (forward_sender, forward_receiver) = channel();
let tpu_via_blobs_threads = tpu_via_blobs_sockets
Expand Down
Loading

0 comments on commit fbea9d8

Please sign in to comment.