1 //! This module contains specializations that can offload `io::copy()` operations on file descriptor 2 //! containing types (`File`, `TcpStream`, etc.) to more efficient syscalls than `read(2)` and `write(2)`. 3 //! 4 //! Specialization is only applied to wholly std-owned types so that user code can't observe 5 //! that the `Read` and `Write` traits are not used. 6 //! 7 //! Since a copy operation involves a reader and writer side where each can consist of different types 8 //! and also involve generic wrappers (e.g. `Take`, `BufReader`) it is not practical to specialize 9 //! a single method on all possible combinations. 10 //! 11 //! Instead readers and writers are handled separately by the `CopyRead` and `CopyWrite` specialization 12 //! traits and then specialized on by the `Copier::copy` method. 13 //! 14 //! `Copier` uses the specialization traits to unpack the underlying file descriptors and 15 //! additional prerequisites and constraints imposed by the wrapper types. 16 //! 17 //! Once it has obtained all necessary pieces and brought any wrapper types into a state where they 18 //! can be safely bypassed it will attempt to use the `copy_file_range(2)`, 19 //! `sendfile(2)` or `splice(2)` syscalls to move data directly between file descriptors. 20 //! Since those syscalls have requirements that cannot be fully checked in advance it attempts 21 //! to use them one after another (guided by hints) to figure out which one works and 22 //! falls back to the generic read-write copy loop if none of them does. 23 //! Once a working syscall is found for a pair of file descriptors it will be called in a loop 24 //! until the copy operation is completed. 25 //! 26 //! Advantages of using these syscalls: 27 //! 28 //! * fewer context switches since reads and writes are coalesced into a single syscall 29 //! and more bytes are transferred per syscall. This translates to higher throughput 30 //! and fewer CPU cycles, at least for sufficiently large transfers to amortize the initial probing. 31 //! * `copy_file_range` creates reflink copies on CoW filesystems, thus moving less data and 32 //! consuming less disk space 33 //! * `sendfile` and `splice` can perform zero-copy IO under some circumstances while 34 //! a naive copy loop would move every byte through the CPU. 35 //! 36 //! Drawbacks: 37 //! 38 //! * copy operations smaller than the default buffer size can under some circumstances, especially 39 //! on older kernels, incur more syscalls than the naive approach would. As mentioned above 40 //! the syscall selection is guided by hints to minimize this possibility but they are not perfect. 41 //! * optimizations only apply to std types. If a user adds a custom wrapper type, e.g. to report 42 //! progress, they can hit a performance cliff. 43 //! * complexity 44 45 use crate::std::cmp::min; 46 use crate::std::fs::{File, Metadata}; 47 use crate::std::io::copy::generic_copy; 48 use crate::std::io::{ 49 BufRead, BufReader, BufWriter, Error, Read, Result, StderrLock, StdinLock, StdoutLock, Take, 50 Write, 51 }; 52 use crate::std::mem::ManuallyDrop; 53 use crate::std::net::TcpStream; 54 use crate::std::os::unix::fs::FileTypeExt; 55 use crate::std::os::unix::io::{AsRawFd, FromRawFd, RawFd}; 56 use crate::std::os::unix::net::UnixStream; 57 use crate::std::process::{ChildStderr, ChildStdin, ChildStdout}; 58 use crate::std::ptr; 59 use crate::std::sync::atomic::{AtomicBool, AtomicU8, Ordering}; 60 use crate::std::sys::cvt; 61 use crate::std::sys::weak::syscall; 62 use dlibc; 63 #[cfg(not(all(target_os = "linux", target_env = "gnu")))] 64 use dlibc::sendfile as sendfile64; 65 #[cfg(all(target_os = "linux", target_env = "gnu"))] 66 use dlibc::sendfile64; 67 use dlibc::{EBADF, EINVAL, ENOSYS, EOPNOTSUPP, EOVERFLOW, EPERM, EXDEV}; 68 #[cfg(test)] 69 mod tests; 70 71 pub(crate) fn copy_spec<R: Read + ?Sized, W: Write + ?Sized>( 72 read: &mut R, 73 write: &mut W, 74 ) -> Result<u64> { 75 let copier = Copier { read, write }; 76 SpecCopy::copy(copier) 77 } 78 79 /// This type represents either the inferred `FileType` of a `RawFd` based on the source 80 /// type from which it was extracted or the actual metadata 81 /// 82 /// The methods on this type only provide hints, due to `AsRawFd` and `FromRawFd` the inferred 83 /// type may be wrong. 84 enum FdMeta { 85 Metadata(Metadata), 86 Socket, 87 Pipe, 88 /// We don't have any metadata because the stat syscall failed 89 NoneObtained, 90 } 91 92 #[derive(PartialEq)] 93 enum FdHandle { 94 Input, 95 Output, 96 } 97 98 impl FdMeta { 99 fn maybe_fifo(&self) -> bool { 100 match self { 101 FdMeta::Metadata(meta) => meta.file_type().is_fifo(), 102 FdMeta::Socket => false, 103 FdMeta::Pipe => true, 104 FdMeta::NoneObtained => true, 105 } 106 } 107 108 fn potential_sendfile_source(&self) -> bool { 109 match self { 110 // procfs erroneously shows 0 length on non-empty readable files. 111 // and if a file is truly empty then a `read` syscall will determine that and skip the write syscall 112 // thus there would be benefit from attempting sendfile 113 FdMeta::Metadata(meta) 114 if meta.file_type().is_file() && meta.len() > 0 115 || meta.file_type().is_block_device() => 116 { 117 true 118 } 119 _ => false, 120 } 121 } 122 123 fn copy_file_range_candidate(&self, f: FdHandle) -> bool { 124 match self { 125 // copy_file_range will fail on empty procfs files. `read` can determine whether EOF has been reached 126 // without extra cost and skip the write, thus there is no benefit in attempting copy_file_range 127 FdMeta::Metadata(meta) if f == FdHandle::Input && meta.is_file() && meta.len() > 0 => { 128 true 129 } 130 FdMeta::Metadata(meta) if f == FdHandle::Output && meta.is_file() => true, 131 _ => false, 132 } 133 } 134 } 135 136 /// Returns true either if changes made to the source after a sendfile/splice call won't become 137 /// visible in the sink or the source has explicitly opted into such behavior (e.g. by splicing 138 /// a file into a pipe, the pipe being the source in this case). 139 /// 140 /// This will prevent File -> Pipe and File -> Socket splicing/sendfile optimizations to uphold 141 /// the Read/Write API semantics of io::copy. 142 /// 143 /// Note: This is not 100% airtight, the caller can use the RawFd conversion methods to turn a 144 /// regular file into a TcpSocket which will be treated as a socket here without checking. 145 fn safe_kernel_copy(source: &FdMeta, sink: &FdMeta) -> bool { 146 match (source, sink) { 147 // Data arriving from a socket is safe because the sender can't modify the socket buffer. 148 // Data arriving from a pipe is safe(-ish) because either the sender *copied* 149 // the bytes into the pipe OR explicitly performed an operation that enables zero-copy, 150 // thus promising not to modify the data later. 151 (FdMeta::Socket, _) => true, 152 (FdMeta::Pipe, _) => true, 153 (FdMeta::Metadata(meta), _) 154 if meta.file_type().is_fifo() || meta.file_type().is_socket() => 155 { 156 true 157 } 158 // Data going into non-pipes/non-sockets is safe because the "later changes may become visible" issue 159 // only happens for pages sitting in send buffers or pipes. 160 (_, FdMeta::Metadata(meta)) 161 if !meta.file_type().is_fifo() && !meta.file_type().is_socket() => 162 { 163 true 164 } 165 _ => false, 166 } 167 } 168 169 struct CopyParams(FdMeta, Option<RawFd>); 170 171 struct Copier<'a, 'b, R: Read + ?Sized, W: Write + ?Sized> { 172 read: &'a mut R, 173 write: &'b mut W, 174 } 175 176 trait SpecCopy { 177 fn copy(self) -> Result<u64>; 178 } 179 180 impl<R: Read + ?Sized, W: Write + ?Sized> SpecCopy for Copier<'_, '_, R, W> { 181 default fn copy(self) -> Result<u64> { 182 generic_copy(self.read, self.write) 183 } 184 } 185 186 impl<R: CopyRead, W: CopyWrite> SpecCopy for Copier<'_, '_, R, W> { 187 fn copy(self) -> Result<u64> { 188 let (reader, writer) = (self.read, self.write); 189 let r_cfg = reader.properties(); 190 let w_cfg = writer.properties(); 191 192 // before direct operations on file descriptors ensure that all source and sink buffers are empty 193 let mut flush = || -> crate::std::io::Result<u64> { 194 let bytes = reader.drain_to(writer, u64::MAX)?; 195 // BufWriter buffered bytes have already been accounted for in earlier write() calls 196 writer.flush()?; 197 Ok(bytes) 198 }; 199 200 let mut written = 0u64; 201 202 if let (CopyParams(input_meta, Some(readfd)), CopyParams(output_meta, Some(writefd))) = 203 (r_cfg, w_cfg) 204 { 205 written += flush()?; 206 let max_write = reader.min_limit(); 207 208 if input_meta.copy_file_range_candidate(FdHandle::Input) 209 && output_meta.copy_file_range_candidate(FdHandle::Output) 210 { 211 let result = copy_regular_files(readfd, writefd, max_write); 212 result.update_take(reader); 213 214 match result { 215 CopyResult::Ended(bytes_copied) => return Ok(bytes_copied + written), 216 CopyResult::Error(e, _) => return Err(e), 217 CopyResult::Fallback(bytes) => written += bytes, 218 } 219 } 220 221 // on modern kernels sendfile can copy from any mmapable type (some but not all regular files and block devices) 222 // to any writable file descriptor. On older kernels the writer side can only be a socket. 223 // So we just try and fallback if needed. 224 // If current file offsets + write sizes overflow it may also fail, we do not try to fix that and instead 225 // fall back to the generic copy loop. 226 if input_meta.potential_sendfile_source() && safe_kernel_copy(&input_meta, &output_meta) 227 { 228 let result = sendfile_splice(SpliceMode::Sendfile, readfd, writefd, max_write); 229 result.update_take(reader); 230 231 match result { 232 CopyResult::Ended(bytes_copied) => return Ok(bytes_copied + written), 233 CopyResult::Error(e, _) => return Err(e), 234 CopyResult::Fallback(bytes) => written += bytes, 235 } 236 } 237 238 if (input_meta.maybe_fifo() || output_meta.maybe_fifo()) 239 && safe_kernel_copy(&input_meta, &output_meta) 240 { 241 let result = sendfile_splice(SpliceMode::Splice, readfd, writefd, max_write); 242 result.update_take(reader); 243 244 match result { 245 CopyResult::Ended(bytes_copied) => return Ok(bytes_copied + written), 246 CopyResult::Error(e, _) => return Err(e), 247 CopyResult::Fallback(0) => { /* use the fallback below */ } 248 CopyResult::Fallback(_) => { 249 unreachable!("splice should not return > 0 bytes on the fallback path") 250 } 251 } 252 } 253 } 254 255 // fallback if none of the more specialized syscalls wants to work with these file descriptors 256 match generic_copy(reader, writer) { 257 Ok(bytes) => Ok(bytes + written), 258 err => err, 259 } 260 } 261 } 262 263 #[rustc_specialization_trait] 264 trait CopyRead: Read { 265 /// Implementations that contain buffers (i.e. `BufReader`) must transfer data from their internal 266 /// buffers into `writer` until either the buffers are emptied or `limit` bytes have been 267 /// transferred, whichever occurs sooner. 268 /// If nested buffers are present the outer buffers must be drained first. 269 /// 270 /// This is necessary to directly bypass the wrapper types while preserving the data order 271 /// when operating directly on the underlying file descriptors. 272 fn drain_to<W: Write>(&mut self, _writer: &mut W, _limit: u64) -> Result<u64> { 273 Ok(0) 274 } 275 276 /// Updates `Take` wrappers to remove the number of bytes copied. 277 fn taken(&mut self, _bytes: u64) {} 278 279 /// The minimum of the limit of all `Take<_>` wrappers, `u64::MAX` otherwise. 280 /// This method does not account for data `BufReader` buffers and would underreport 281 /// the limit of a `Take<BufReader<Take<_>>>` type. Thus its result is only valid 282 /// after draining the buffers via `drain_to`. 283 fn min_limit(&self) -> u64 { 284 u64::MAX 285 } 286 287 /// Extracts the file descriptor and hints/metadata, delegating through wrappers if necessary. 288 fn properties(&self) -> CopyParams; 289 } 290 291 #[rustc_specialization_trait] 292 trait CopyWrite: Write { 293 /// Extracts the file descriptor and hints/metadata, delegating through wrappers if necessary. 294 fn properties(&self) -> CopyParams; 295 } 296 297 impl<T> CopyRead for &mut T 298 where 299 T: CopyRead, 300 { 301 fn drain_to<W: Write>(&mut self, writer: &mut W, limit: u64) -> Result<u64> { 302 (**self).drain_to(writer, limit) 303 } 304 305 fn taken(&mut self, bytes: u64) { 306 (**self).taken(bytes); 307 } 308 309 fn min_limit(&self) -> u64 { 310 (**self).min_limit() 311 } 312 313 fn properties(&self) -> CopyParams { 314 (**self).properties() 315 } 316 } 317 318 impl<T> CopyWrite for &mut T 319 where 320 T: CopyWrite, 321 { 322 fn properties(&self) -> CopyParams { 323 (**self).properties() 324 } 325 } 326 327 impl CopyRead for File { 328 fn properties(&self) -> CopyParams { 329 CopyParams(fd_to_meta(self), Some(self.as_raw_fd())) 330 } 331 } 332 333 impl CopyRead for &File { 334 fn properties(&self) -> CopyParams { 335 CopyParams(fd_to_meta(*self), Some(self.as_raw_fd())) 336 } 337 } 338 339 impl CopyWrite for File { 340 fn properties(&self) -> CopyParams { 341 CopyParams(fd_to_meta(self), Some(self.as_raw_fd())) 342 } 343 } 344 345 impl CopyWrite for &File { 346 fn properties(&self) -> CopyParams { 347 CopyParams(fd_to_meta(*self), Some(self.as_raw_fd())) 348 } 349 } 350 351 impl CopyRead for TcpStream { 352 fn properties(&self) -> CopyParams { 353 // avoid the stat syscall since we can be fairly sure it's a socket 354 CopyParams(FdMeta::Socket, Some(self.as_raw_fd())) 355 } 356 } 357 358 impl CopyRead for &TcpStream { 359 fn properties(&self) -> CopyParams { 360 // avoid the stat syscall since we can be fairly sure it's a socket 361 CopyParams(FdMeta::Socket, Some(self.as_raw_fd())) 362 } 363 } 364 365 impl CopyWrite for TcpStream { 366 fn properties(&self) -> CopyParams { 367 // avoid the stat syscall since we can be fairly sure it's a socket 368 CopyParams(FdMeta::Socket, Some(self.as_raw_fd())) 369 } 370 } 371 372 impl CopyWrite for &TcpStream { 373 fn properties(&self) -> CopyParams { 374 // avoid the stat syscall since we can be fairly sure it's a socket 375 CopyParams(FdMeta::Socket, Some(self.as_raw_fd())) 376 } 377 } 378 379 impl CopyRead for UnixStream { 380 fn properties(&self) -> CopyParams { 381 // avoid the stat syscall since we can be fairly sure it's a socket 382 CopyParams(FdMeta::Socket, Some(self.as_raw_fd())) 383 } 384 } 385 386 impl CopyRead for &UnixStream { 387 fn properties(&self) -> CopyParams { 388 // avoid the stat syscall since we can be fairly sure it's a socket 389 CopyParams(FdMeta::Socket, Some(self.as_raw_fd())) 390 } 391 } 392 393 impl CopyWrite for UnixStream { 394 fn properties(&self) -> CopyParams { 395 // avoid the stat syscall since we can be fairly sure it's a socket 396 CopyParams(FdMeta::Socket, Some(self.as_raw_fd())) 397 } 398 } 399 400 impl CopyWrite for &UnixStream { 401 fn properties(&self) -> CopyParams { 402 // avoid the stat syscall since we can be fairly sure it's a socket 403 CopyParams(FdMeta::Socket, Some(self.as_raw_fd())) 404 } 405 } 406 407 impl CopyWrite for ChildStdin { 408 fn properties(&self) -> CopyParams { 409 CopyParams(FdMeta::Pipe, Some(self.as_raw_fd())) 410 } 411 } 412 413 impl CopyRead for ChildStdout { 414 fn properties(&self) -> CopyParams { 415 CopyParams(FdMeta::Pipe, Some(self.as_raw_fd())) 416 } 417 } 418 419 impl CopyRead for ChildStderr { 420 fn properties(&self) -> CopyParams { 421 CopyParams(FdMeta::Pipe, Some(self.as_raw_fd())) 422 } 423 } 424 425 impl CopyRead for StdinLock<'_> { 426 fn drain_to<W: Write>(&mut self, writer: &mut W, outer_limit: u64) -> Result<u64> { 427 let buf_reader = self.as_mut_buf(); 428 let buf = buf_reader.buffer(); 429 let buf = &buf[0..min(buf.len(), outer_limit.try_into().unwrap_or(usize::MAX))]; 430 let bytes_drained = buf.len(); 431 writer.write_all(buf)?; 432 buf_reader.consume(bytes_drained); 433 434 Ok(bytes_drained as u64) 435 } 436 437 fn properties(&self) -> CopyParams { 438 CopyParams(fd_to_meta(self), Some(self.as_raw_fd())) 439 } 440 } 441 442 impl CopyWrite for StdoutLock<'_> { 443 fn properties(&self) -> CopyParams { 444 CopyParams(fd_to_meta(self), Some(self.as_raw_fd())) 445 } 446 } 447 448 impl CopyWrite for StderrLock<'_> { 449 fn properties(&self) -> CopyParams { 450 CopyParams(fd_to_meta(self), Some(self.as_raw_fd())) 451 } 452 } 453 454 impl<T: CopyRead> CopyRead for Take<T> { 455 fn drain_to<W: Write>(&mut self, writer: &mut W, outer_limit: u64) -> Result<u64> { 456 let local_limit = self.limit(); 457 let combined_limit = min(outer_limit, local_limit); 458 let bytes_drained = self.get_mut().drain_to(writer, combined_limit)?; 459 // update limit since read() was bypassed 460 self.set_limit(local_limit - bytes_drained); 461 462 Ok(bytes_drained) 463 } 464 465 fn taken(&mut self, bytes: u64) { 466 self.set_limit(self.limit() - bytes); 467 self.get_mut().taken(bytes); 468 } 469 470 fn min_limit(&self) -> u64 { 471 min(Take::limit(self), self.get_ref().min_limit()) 472 } 473 474 fn properties(&self) -> CopyParams { 475 self.get_ref().properties() 476 } 477 } 478 479 impl<T: ?Sized + CopyRead> CopyRead for BufReader<T> { 480 fn drain_to<W: Write>(&mut self, writer: &mut W, outer_limit: u64) -> Result<u64> { 481 let buf = self.buffer(); 482 let buf = &buf[0..min(buf.len(), outer_limit.try_into().unwrap_or(usize::MAX))]; 483 let bytes = buf.len(); 484 writer.write_all(buf)?; 485 self.consume(bytes); 486 487 let remaining = outer_limit - bytes as u64; 488 489 // in case of nested bufreaders we also need to drain the ones closer to the source 490 let inner_bytes = self.get_mut().drain_to(writer, remaining)?; 491 492 Ok(bytes as u64 + inner_bytes) 493 } 494 495 fn taken(&mut self, bytes: u64) { 496 self.get_mut().taken(bytes); 497 } 498 499 fn min_limit(&self) -> u64 { 500 self.get_ref().min_limit() 501 } 502 503 fn properties(&self) -> CopyParams { 504 self.get_ref().properties() 505 } 506 } 507 508 impl<T: ?Sized + CopyWrite> CopyWrite for BufWriter<T> { 509 fn properties(&self) -> CopyParams { 510 self.get_ref().properties() 511 } 512 } 513 514 fn fd_to_meta<T: AsRawFd>(fd: &T) -> FdMeta { 515 let fd = fd.as_raw_fd(); 516 let file: ManuallyDrop<File> = ManuallyDrop::new(unsafe { File::from_raw_fd(fd) }); 517 match file.metadata() { 518 Ok(meta) => FdMeta::Metadata(meta), 519 Err(_) => FdMeta::NoneObtained, 520 } 521 } 522 523 pub(super) enum CopyResult { 524 Ended(u64), 525 Error(Error, u64), 526 Fallback(u64), 527 } 528 529 impl CopyResult { 530 fn update_take(&self, reader: &mut impl CopyRead) { 531 match *self { 532 CopyResult::Fallback(bytes) 533 | CopyResult::Ended(bytes) 534 | CopyResult::Error(_, bytes) => reader.taken(bytes), 535 } 536 } 537 } 538 539 /// Invalid file descriptor. 540 /// 541 /// Valid file descriptors are guaranteed to be positive numbers (see `open()` manpage) 542 /// while negative values are used to indicate errors. 543 /// Thus -1 will never be overlap with a valid open file. 544 const INVALID_FD: RawFd = -1; 545 546 /// Linux-specific implementation that will attempt to use copy_file_range for copy offloading. 547 /// As the name says, it only works on regular files. 548 /// 549 /// Callers must handle fallback to a generic copy loop. 550 /// `Fallback` may indicate non-zero number of bytes already written 551 /// if one of the files' cursor +`max_len` would exceed u64::MAX (`EOVERFLOW`). 552 pub(super) fn copy_regular_files(reader: RawFd, writer: RawFd, max_len: u64) -> CopyResult { 553 use crate::std::cmp; 554 555 const NOT_PROBED: u8 = 0; 556 const UNAVAILABLE: u8 = 1; 557 const AVAILABLE: u8 = 2; 558 559 // Kernel prior to 4.5 don't have copy_file_range 560 // We store the availability in a global to avoid unnecessary syscalls 561 static HAS_COPY_FILE_RANGE: AtomicU8 = AtomicU8::new(NOT_PROBED); 562 563 syscall! { 564 fn copy_file_range( 565 fd_in: dlibc::c_int, 566 off_in: *mut dlibc::loff_t, 567 fd_out: dlibc::c_int, 568 off_out: *mut dlibc::loff_t, 569 len: dlibc::size_t, 570 flags: dlibc::c_uint 571 ) -> dlibc::ssize_t 572 } 573 574 match HAS_COPY_FILE_RANGE.load(Ordering::Relaxed) { 575 NOT_PROBED => { 576 // EPERM can indicate seccomp filters or an immutable file. 577 // To distinguish these cases we probe with invalid file descriptors which should result in EBADF if the syscall is supported 578 // and some other error (ENOSYS or EPERM) if it's not available 579 let result = unsafe { 580 cvt(copy_file_range( 581 INVALID_FD, 582 ptr::null_mut(), 583 INVALID_FD, 584 ptr::null_mut(), 585 1, 586 0, 587 )) 588 }; 589 590 if matches!(result.map_err(|e| e.raw_os_error()), Err(Some(EBADF))) { 591 HAS_COPY_FILE_RANGE.store(AVAILABLE, Ordering::Relaxed); 592 } else { 593 HAS_COPY_FILE_RANGE.store(UNAVAILABLE, Ordering::Relaxed); 594 return CopyResult::Fallback(0); 595 } 596 } 597 UNAVAILABLE => return CopyResult::Fallback(0), 598 _ => {} 599 }; 600 601 let mut written = 0u64; 602 while written < max_len { 603 let bytes_to_copy = cmp::min(max_len - written, usize::MAX as u64); 604 // cap to 1GB chunks in case u64::MAX is passed as max_len and the file has a non-zero seek position 605 // this allows us to copy large chunks without hitting EOVERFLOW, 606 // unless someone sets a file offset close to u64::MAX - 1GB, in which case a fallback would be required 607 let bytes_to_copy = cmp::min(bytes_to_copy as usize, 0x4000_0000usize); 608 let copy_result = unsafe { 609 // We actually don't have to adjust the offsets, 610 // because copy_file_range adjusts the file offset automatically 611 cvt(copy_file_range( 612 reader, 613 ptr::null_mut(), 614 writer, 615 ptr::null_mut(), 616 bytes_to_copy, 617 0, 618 )) 619 }; 620 621 match copy_result { 622 Ok(0) if written == 0 => { 623 // fallback to work around several kernel bugs where copy_file_range will fail to 624 // copy any bytes and return 0 instead of an error if 625 // - reading virtual files from the proc filesystem which appear to have 0 size 626 // but are not empty. noted in coreutils to affect kernels at least up to 5.6.19. 627 // - copying from an overlay filesystem in docker. reported to occur on fedora 32. 628 return CopyResult::Fallback(0); 629 } 630 Ok(0) => return CopyResult::Ended(written), // reached EOF 631 Ok(ret) => written += ret as u64, 632 Err(err) => { 633 return match err.raw_os_error() { 634 // when file offset + max_length > u64::MAX 635 Some(EOVERFLOW) => CopyResult::Fallback(written), 636 Some(ENOSYS | EXDEV | EINVAL | EPERM | EOPNOTSUPP | EBADF) if written == 0 => { 637 // Try fallback io::copy if either: 638 // - Kernel version is < 4.5 (ENOSYS¹) 639 // - Files are mounted on different fs (EXDEV) 640 // - copy_file_range is broken in various ways on RHEL/CentOS 7 (EOPNOTSUPP) 641 // - copy_file_range file is immutable or syscall is blocked by seccomp¹ (EPERM) 642 // - copy_file_range cannot be used with pipes or device nodes (EINVAL) 643 // - the writer fd was opened with O_APPEND (EBADF²) 644 // and no bytes were written successfully yet. (All these errnos should 645 // not be returned if something was already written, but they happen in 646 // the wild, see #91152.) 647 // 648 // ¹ these cases should be detected by the initial probe but we handle them here 649 // anyway in case syscall interception changes during runtime 650 // ² actually invalid file descriptors would cause this too, but in that case 651 // the fallback code path is expected to encounter the same error again 652 CopyResult::Fallback(0) 653 } 654 _ => CopyResult::Error(err, written), 655 }; 656 } 657 } 658 } 659 CopyResult::Ended(written) 660 } 661 662 #[derive(PartialEq)] 663 enum SpliceMode { 664 Sendfile, 665 Splice, 666 } 667 668 /// performs splice or sendfile between file descriptors 669 /// Does _not_ fall back to a generic copy loop. 670 fn sendfile_splice(mode: SpliceMode, reader: RawFd, writer: RawFd, len: u64) -> CopyResult { 671 static HAS_SENDFILE: AtomicBool = AtomicBool::new(true); 672 static HAS_SPLICE: AtomicBool = AtomicBool::new(true); 673 674 // Android builds use feature level 14, but the libc wrapper for splice is 675 // gated on feature level 21+, so we have to invoke the syscall directly. 676 #[cfg(target_os = "android")] 677 syscall! { 678 fn splice( 679 srcfd: dlibc::c_int, 680 src_offset: *const i64, 681 dstfd: dlibc::c_int, 682 dst_offset: *const i64, 683 len: dlibc::size_t, 684 flags: dlibc::c_int 685 ) -> dlibc::ssize_t 686 } 687 688 #[cfg(target_os = "linux")] 689 use dlibc::splice; 690 691 #[cfg(target_os = "dragonos")] 692 use dlibc::splice; 693 694 match mode { 695 SpliceMode::Sendfile if !HAS_SENDFILE.load(Ordering::Relaxed) => { 696 return CopyResult::Fallback(0); 697 } 698 SpliceMode::Splice if !HAS_SPLICE.load(Ordering::Relaxed) => { 699 return CopyResult::Fallback(0); 700 } 701 _ => (), 702 } 703 704 let mut written = 0u64; 705 while written < len { 706 // according to its manpage that's the maximum size sendfile() will copy per invocation 707 let chunk_size = crate::std::cmp::min(len - written, 0x7ffff000_u64) as usize; 708 709 let result = match mode { 710 SpliceMode::Sendfile => { 711 cvt(unsafe { sendfile64(writer, reader, ptr::null_mut(), chunk_size) }) 712 } 713 SpliceMode::Splice => cvt(unsafe { 714 splice( 715 reader, 716 ptr::null_mut(), 717 writer, 718 ptr::null_mut(), 719 chunk_size, 720 0, 721 ) 722 }), 723 }; 724 725 match result { 726 Ok(0) => break, // EOF 727 Ok(ret) => written += ret as u64, 728 Err(err) => { 729 return match err.raw_os_error() { 730 Some(ENOSYS | EPERM) => { 731 // syscall not supported (ENOSYS) 732 // syscall is disallowed, e.g. by seccomp (EPERM) 733 match mode { 734 SpliceMode::Sendfile => HAS_SENDFILE.store(false, Ordering::Relaxed), 735 SpliceMode::Splice => HAS_SPLICE.store(false, Ordering::Relaxed), 736 } 737 assert_eq!(written, 0); 738 CopyResult::Fallback(0) 739 } 740 Some(EINVAL) => { 741 // splice/sendfile do not support this particular file descriptor (EINVAL) 742 assert_eq!(written, 0); 743 CopyResult::Fallback(0) 744 } 745 Some(os_err) if mode == SpliceMode::Sendfile && os_err == EOVERFLOW => { 746 CopyResult::Fallback(written) 747 } 748 _ => CopyResult::Error(err, written), 749 }; 750 } 751 } 752 } 753 CopyResult::Ended(written) 754 } 755