1 //! This module contains specializations that can offload `io::copy()` operations on file descriptor 2 //! containing types (`File`, `TcpStream`, etc.) to more efficient syscalls than `read(2)` and `write(2)`. 3 //! 4 //! Specialization is only applied to wholly std-owned types so that user code can't observe 5 //! that the `Read` and `Write` traits are not used. 6 //! 7 //! Since a copy operation involves a reader and writer side where each can consist of different types 8 //! and also involve generic wrappers (e.g. `Take`, `BufReader`) it is not practical to specialize 9 //! a single method on all possible combinations. 10 //! 11 //! Instead readers and writers are handled separately by the `CopyRead` and `CopyWrite` specialization 12 //! traits and then specialized on by the `Copier::copy` method. 13 //! 14 //! `Copier` uses the specialization traits to unpack the underlying file descriptors and 15 //! additional prerequisites and constraints imposed by the wrapper types. 16 //! 17 //! Once it has obtained all necessary pieces and brought any wrapper types into a state where they 18 //! can be safely bypassed it will attempt to use the `copy_file_range(2)`, 19 //! `sendfile(2)` or `splice(2)` syscalls to move data directly between file descriptors. 20 //! Since those syscalls have requirements that cannot be fully checked in advance it attempts 21 //! to use them one after another (guided by hints) to figure out which one works and 22 //! falls back to the generic read-write copy loop if none of them does. 23 //! Once a working syscall is found for a pair of file descriptors it will be called in a loop 24 //! until the copy operation is completed. 25 //! 26 //! Advantages of using these syscalls: 27 //! 28 //! * fewer context switches since reads and writes are coalesced into a single syscall 29 //! and more bytes are transferred per syscall. This translates to higher throughput 30 //! and fewer CPU cycles, at least for sufficiently large transfers to amortize the initial probing. 31 //! * `copy_file_range` creates reflink copies on CoW filesystems, thus moving less data and 32 //! consuming less disk space 33 //! * `sendfile` and `splice` can perform zero-copy IO under some circumstances while 34 //! a naive copy loop would move every byte through the CPU. 35 //! 36 //! Drawbacks: 37 //! 38 //! * copy operations smaller than the default buffer size can under some circumstances, especially 39 //! on older kernels, incur more syscalls than the naive approach would. As mentioned above 40 //! the syscall selection is guided by hints to minimize this possibility but they are not perfect. 41 //! * optimizations only apply to std types. If a user adds a custom wrapper type, e.g. to report 42 //! progress, they can hit a performance cliff. 43 //! * complexity 44 45 use crate::std::cmp::min; 46 use crate::std::fs::{File, Metadata}; 47 use crate::std::io::copy::generic_copy; 48 use crate::std::io::{ 49 BufRead, BufReader, BufWriter, Error, Read, Result, StderrLock, StdinLock, StdoutLock, Take, 50 Write, 51 }; 52 use crate::std::mem::ManuallyDrop; 53 use crate::std::net::TcpStream; 54 use crate::std::os::unix::fs::FileTypeExt; 55 use crate::std::os::unix::io::{AsRawFd, FromRawFd, RawFd}; 56 use crate::std::os::unix::net::UnixStream; 57 use crate::std::process::{ChildStderr, ChildStdin, ChildStdout}; 58 use crate::std::ptr; 59 use crate::std::sync::atomic::{AtomicBool, AtomicU8, Ordering}; 60 use crate::std::sys::cvt; 61 use crate::std::sys::weak::syscall; 62 use dlibc; 63 #[cfg(not(all(target_os = "linux", target_env = "gnu")))] 64 use dlibc::sendfile as sendfile64; 65 #[cfg(all(target_os = "linux", target_env = "gnu"))] 66 use dlibc::sendfile64; 67 use dlibc::{EBADF, EINVAL, ENOSYS, EOPNOTSUPP, EOVERFLOW, EPERM, EXDEV}; 68 69 #[allow(dead_code)] 70 pub(crate) fn copy_spec<R: Read + ?Sized, W: Write + ?Sized>( 71 read: &mut R, 72 write: &mut W, 73 ) -> Result<u64> { 74 let copier = Copier { read, write }; 75 SpecCopy::copy(copier) 76 } 77 78 /// This type represents either the inferred `FileType` of a `RawFd` based on the source 79 /// type from which it was extracted or the actual metadata 80 /// 81 /// The methods on this type only provide hints, due to `AsRawFd` and `FromRawFd` the inferred 82 /// type may be wrong. 83 enum FdMeta { 84 Metadata(Metadata), 85 Socket, 86 Pipe, 87 /// We don't have any metadata because the stat syscall failed 88 NoneObtained, 89 } 90 91 #[derive(PartialEq)] 92 enum FdHandle { 93 Input, 94 Output, 95 } 96 97 impl FdMeta { 98 fn maybe_fifo(&self) -> bool { 99 match self { 100 FdMeta::Metadata(meta) => meta.file_type().is_fifo(), 101 FdMeta::Socket => false, 102 FdMeta::Pipe => true, 103 FdMeta::NoneObtained => true, 104 } 105 } 106 107 fn potential_sendfile_source(&self) -> bool { 108 match self { 109 // procfs erroneously shows 0 length on non-empty readable files. 110 // and if a file is truly empty then a `read` syscall will determine that and skip the write syscall 111 // thus there would be benefit from attempting sendfile 112 FdMeta::Metadata(meta) 113 if meta.file_type().is_file() && meta.len() > 0 114 || meta.file_type().is_block_device() => 115 { 116 true 117 } 118 _ => false, 119 } 120 } 121 122 fn copy_file_range_candidate(&self, f: FdHandle) -> bool { 123 match self { 124 // copy_file_range will fail on empty procfs files. `read` can determine whether EOF has been reached 125 // without extra cost and skip the write, thus there is no benefit in attempting copy_file_range 126 FdMeta::Metadata(meta) if f == FdHandle::Input && meta.is_file() && meta.len() > 0 => { 127 true 128 } 129 FdMeta::Metadata(meta) if f == FdHandle::Output && meta.is_file() => true, 130 _ => false, 131 } 132 } 133 } 134 135 /// Returns true either if changes made to the source after a sendfile/splice call won't become 136 /// visible in the sink or the source has explicitly opted into such behavior (e.g. by splicing 137 /// a file into a pipe, the pipe being the source in this case). 138 /// 139 /// This will prevent File -> Pipe and File -> Socket splicing/sendfile optimizations to uphold 140 /// the Read/Write API semantics of io::copy. 141 /// 142 /// Note: This is not 100% airtight, the caller can use the RawFd conversion methods to turn a 143 /// regular file into a TcpSocket which will be treated as a socket here without checking. 144 fn safe_kernel_copy(source: &FdMeta, sink: &FdMeta) -> bool { 145 match (source, sink) { 146 // Data arriving from a socket is safe because the sender can't modify the socket buffer. 147 // Data arriving from a pipe is safe(-ish) because either the sender *copied* 148 // the bytes into the pipe OR explicitly performed an operation that enables zero-copy, 149 // thus promising not to modify the data later. 150 (FdMeta::Socket, _) => true, 151 (FdMeta::Pipe, _) => true, 152 (FdMeta::Metadata(meta), _) 153 if meta.file_type().is_fifo() || meta.file_type().is_socket() => 154 { 155 true 156 } 157 // Data going into non-pipes/non-sockets is safe because the "later changes may become visible" issue 158 // only happens for pages sitting in send buffers or pipes. 159 (_, FdMeta::Metadata(meta)) 160 if !meta.file_type().is_fifo() && !meta.file_type().is_socket() => 161 { 162 true 163 } 164 _ => false, 165 } 166 } 167 168 struct CopyParams(FdMeta, Option<RawFd>); 169 170 struct Copier<'a, 'b, R: Read + ?Sized, W: Write + ?Sized> { 171 read: &'a mut R, 172 write: &'b mut W, 173 } 174 175 trait SpecCopy { 176 fn copy(self) -> Result<u64>; 177 } 178 179 impl<R: Read + ?Sized, W: Write + ?Sized> SpecCopy for Copier<'_, '_, R, W> { 180 default fn copy(self) -> Result<u64> { 181 generic_copy(self.read, self.write) 182 } 183 } 184 185 impl<R: CopyRead, W: CopyWrite> SpecCopy for Copier<'_, '_, R, W> { 186 fn copy(self) -> Result<u64> { 187 let (reader, writer) = (self.read, self.write); 188 let r_cfg = reader.properties(); 189 let w_cfg = writer.properties(); 190 191 // before direct operations on file descriptors ensure that all source and sink buffers are empty 192 let mut flush = || -> crate::std::io::Result<u64> { 193 let bytes = reader.drain_to(writer, u64::MAX)?; 194 // BufWriter buffered bytes have already been accounted for in earlier write() calls 195 writer.flush()?; 196 Ok(bytes) 197 }; 198 199 let mut written = 0u64; 200 201 if let (CopyParams(input_meta, Some(readfd)), CopyParams(output_meta, Some(writefd))) = 202 (r_cfg, w_cfg) 203 { 204 written += flush()?; 205 let max_write = reader.min_limit(); 206 207 if input_meta.copy_file_range_candidate(FdHandle::Input) 208 && output_meta.copy_file_range_candidate(FdHandle::Output) 209 { 210 let result = copy_regular_files(readfd, writefd, max_write); 211 result.update_take(reader); 212 213 match result { 214 CopyResult::Ended(bytes_copied) => return Ok(bytes_copied + written), 215 CopyResult::Error(e, _) => return Err(e), 216 CopyResult::Fallback(bytes) => written += bytes, 217 } 218 } 219 220 // on modern kernels sendfile can copy from any mmapable type (some but not all regular files and block devices) 221 // to any writable file descriptor. On older kernels the writer side can only be a socket. 222 // So we just try and fallback if needed. 223 // If current file offsets + write sizes overflow it may also fail, we do not try to fix that and instead 224 // fall back to the generic copy loop. 225 if input_meta.potential_sendfile_source() && safe_kernel_copy(&input_meta, &output_meta) 226 { 227 let result = sendfile_splice(SpliceMode::Sendfile, readfd, writefd, max_write); 228 result.update_take(reader); 229 230 match result { 231 CopyResult::Ended(bytes_copied) => return Ok(bytes_copied + written), 232 CopyResult::Error(e, _) => return Err(e), 233 CopyResult::Fallback(bytes) => written += bytes, 234 } 235 } 236 237 if (input_meta.maybe_fifo() || output_meta.maybe_fifo()) 238 && safe_kernel_copy(&input_meta, &output_meta) 239 { 240 let result = sendfile_splice(SpliceMode::Splice, readfd, writefd, max_write); 241 result.update_take(reader); 242 243 match result { 244 CopyResult::Ended(bytes_copied) => return Ok(bytes_copied + written), 245 CopyResult::Error(e, _) => return Err(e), 246 CopyResult::Fallback(0) => { /* use the fallback below */ } 247 CopyResult::Fallback(_) => { 248 unreachable!("splice should not return > 0 bytes on the fallback path") 249 } 250 } 251 } 252 } 253 254 // fallback if none of the more specialized syscalls wants to work with these file descriptors 255 match generic_copy(reader, writer) { 256 Ok(bytes) => Ok(bytes + written), 257 err => err, 258 } 259 } 260 } 261 262 #[rustc_specialization_trait] 263 trait CopyRead: Read { 264 /// Implementations that contain buffers (i.e. `BufReader`) must transfer data from their internal 265 /// buffers into `writer` until either the buffers are emptied or `limit` bytes have been 266 /// transferred, whichever occurs sooner. 267 /// If nested buffers are present the outer buffers must be drained first. 268 /// 269 /// This is necessary to directly bypass the wrapper types while preserving the data order 270 /// when operating directly on the underlying file descriptors. 271 fn drain_to<W: Write>(&mut self, _writer: &mut W, _limit: u64) -> Result<u64> { 272 Ok(0) 273 } 274 275 /// Updates `Take` wrappers to remove the number of bytes copied. 276 fn taken(&mut self, _bytes: u64) {} 277 278 /// The minimum of the limit of all `Take<_>` wrappers, `u64::MAX` otherwise. 279 /// This method does not account for data `BufReader` buffers and would underreport 280 /// the limit of a `Take<BufReader<Take<_>>>` type. Thus its result is only valid 281 /// after draining the buffers via `drain_to`. 282 fn min_limit(&self) -> u64 { 283 u64::MAX 284 } 285 286 /// Extracts the file descriptor and hints/metadata, delegating through wrappers if necessary. 287 fn properties(&self) -> CopyParams; 288 } 289 290 #[rustc_specialization_trait] 291 trait CopyWrite: Write { 292 /// Extracts the file descriptor and hints/metadata, delegating through wrappers if necessary. 293 fn properties(&self) -> CopyParams; 294 } 295 296 impl<T> CopyRead for &mut T 297 where 298 T: CopyRead, 299 { 300 fn drain_to<W: Write>(&mut self, writer: &mut W, limit: u64) -> Result<u64> { 301 (**self).drain_to(writer, limit) 302 } 303 304 fn taken(&mut self, bytes: u64) { 305 (**self).taken(bytes); 306 } 307 308 fn min_limit(&self) -> u64 { 309 (**self).min_limit() 310 } 311 312 fn properties(&self) -> CopyParams { 313 (**self).properties() 314 } 315 } 316 317 impl<T> CopyWrite for &mut T 318 where 319 T: CopyWrite, 320 { 321 fn properties(&self) -> CopyParams { 322 (**self).properties() 323 } 324 } 325 326 impl CopyRead for File { 327 fn properties(&self) -> CopyParams { 328 CopyParams(fd_to_meta(self), Some(self.as_raw_fd())) 329 } 330 } 331 332 impl CopyRead for &File { 333 fn properties(&self) -> CopyParams { 334 CopyParams(fd_to_meta(*self), Some(self.as_raw_fd())) 335 } 336 } 337 338 impl CopyWrite for File { 339 fn properties(&self) -> CopyParams { 340 CopyParams(fd_to_meta(self), Some(self.as_raw_fd())) 341 } 342 } 343 344 impl CopyWrite for &File { 345 fn properties(&self) -> CopyParams { 346 CopyParams(fd_to_meta(*self), Some(self.as_raw_fd())) 347 } 348 } 349 350 impl CopyRead for TcpStream { 351 fn properties(&self) -> CopyParams { 352 // avoid the stat syscall since we can be fairly sure it's a socket 353 CopyParams(FdMeta::Socket, Some(self.as_raw_fd())) 354 } 355 } 356 357 impl CopyRead for &TcpStream { 358 fn properties(&self) -> CopyParams { 359 // avoid the stat syscall since we can be fairly sure it's a socket 360 CopyParams(FdMeta::Socket, Some(self.as_raw_fd())) 361 } 362 } 363 364 impl CopyWrite for TcpStream { 365 fn properties(&self) -> CopyParams { 366 // avoid the stat syscall since we can be fairly sure it's a socket 367 CopyParams(FdMeta::Socket, Some(self.as_raw_fd())) 368 } 369 } 370 371 impl CopyWrite for &TcpStream { 372 fn properties(&self) -> CopyParams { 373 // avoid the stat syscall since we can be fairly sure it's a socket 374 CopyParams(FdMeta::Socket, Some(self.as_raw_fd())) 375 } 376 } 377 378 impl CopyRead for UnixStream { 379 fn properties(&self) -> CopyParams { 380 // avoid the stat syscall since we can be fairly sure it's a socket 381 CopyParams(FdMeta::Socket, Some(self.as_raw_fd())) 382 } 383 } 384 385 impl CopyRead for &UnixStream { 386 fn properties(&self) -> CopyParams { 387 // avoid the stat syscall since we can be fairly sure it's a socket 388 CopyParams(FdMeta::Socket, Some(self.as_raw_fd())) 389 } 390 } 391 392 impl CopyWrite for UnixStream { 393 fn properties(&self) -> CopyParams { 394 // avoid the stat syscall since we can be fairly sure it's a socket 395 CopyParams(FdMeta::Socket, Some(self.as_raw_fd())) 396 } 397 } 398 399 impl CopyWrite for &UnixStream { 400 fn properties(&self) -> CopyParams { 401 // avoid the stat syscall since we can be fairly sure it's a socket 402 CopyParams(FdMeta::Socket, Some(self.as_raw_fd())) 403 } 404 } 405 406 impl CopyWrite for ChildStdin { 407 fn properties(&self) -> CopyParams { 408 CopyParams(FdMeta::Pipe, Some(self.as_raw_fd())) 409 } 410 } 411 412 impl CopyRead for ChildStdout { 413 fn properties(&self) -> CopyParams { 414 CopyParams(FdMeta::Pipe, Some(self.as_raw_fd())) 415 } 416 } 417 418 impl CopyRead for ChildStderr { 419 fn properties(&self) -> CopyParams { 420 CopyParams(FdMeta::Pipe, Some(self.as_raw_fd())) 421 } 422 } 423 424 impl CopyRead for StdinLock<'_> { 425 fn drain_to<W: Write>(&mut self, writer: &mut W, outer_limit: u64) -> Result<u64> { 426 let buf_reader = self.as_mut_buf(); 427 let buf = buf_reader.buffer(); 428 let buf = &buf[0..min(buf.len(), outer_limit.try_into().unwrap_or(usize::MAX))]; 429 let bytes_drained = buf.len(); 430 writer.write_all(buf)?; 431 buf_reader.consume(bytes_drained); 432 433 Ok(bytes_drained as u64) 434 } 435 436 fn properties(&self) -> CopyParams { 437 CopyParams(fd_to_meta(self), Some(self.as_raw_fd())) 438 } 439 } 440 441 impl CopyWrite for StdoutLock<'_> { 442 fn properties(&self) -> CopyParams { 443 CopyParams(fd_to_meta(self), Some(self.as_raw_fd())) 444 } 445 } 446 447 impl CopyWrite for StderrLock<'_> { 448 fn properties(&self) -> CopyParams { 449 CopyParams(fd_to_meta(self), Some(self.as_raw_fd())) 450 } 451 } 452 453 impl<T: CopyRead> CopyRead for Take<T> { 454 fn drain_to<W: Write>(&mut self, writer: &mut W, outer_limit: u64) -> Result<u64> { 455 let local_limit = self.limit(); 456 let combined_limit = min(outer_limit, local_limit); 457 let bytes_drained = self.get_mut().drain_to(writer, combined_limit)?; 458 // update limit since read() was bypassed 459 self.set_limit(local_limit - bytes_drained); 460 461 Ok(bytes_drained) 462 } 463 464 fn taken(&mut self, bytes: u64) { 465 self.set_limit(self.limit() - bytes); 466 self.get_mut().taken(bytes); 467 } 468 469 fn min_limit(&self) -> u64 { 470 min(Take::limit(self), self.get_ref().min_limit()) 471 } 472 473 fn properties(&self) -> CopyParams { 474 self.get_ref().properties() 475 } 476 } 477 478 impl<T: ?Sized + CopyRead> CopyRead for BufReader<T> { 479 fn drain_to<W: Write>(&mut self, writer: &mut W, outer_limit: u64) -> Result<u64> { 480 let buf = self.buffer(); 481 let buf = &buf[0..min(buf.len(), outer_limit.try_into().unwrap_or(usize::MAX))]; 482 let bytes = buf.len(); 483 writer.write_all(buf)?; 484 self.consume(bytes); 485 486 let remaining = outer_limit - bytes as u64; 487 488 // in case of nested bufreaders we also need to drain the ones closer to the source 489 let inner_bytes = self.get_mut().drain_to(writer, remaining)?; 490 491 Ok(bytes as u64 + inner_bytes) 492 } 493 494 fn taken(&mut self, bytes: u64) { 495 self.get_mut().taken(bytes); 496 } 497 498 fn min_limit(&self) -> u64 { 499 self.get_ref().min_limit() 500 } 501 502 fn properties(&self) -> CopyParams { 503 self.get_ref().properties() 504 } 505 } 506 507 impl<T: ?Sized + CopyWrite> CopyWrite for BufWriter<T> { 508 fn properties(&self) -> CopyParams { 509 self.get_ref().properties() 510 } 511 } 512 513 fn fd_to_meta<T: AsRawFd>(fd: &T) -> FdMeta { 514 let fd = fd.as_raw_fd(); 515 let file: ManuallyDrop<File> = ManuallyDrop::new(unsafe { File::from_raw_fd(fd) }); 516 match file.metadata() { 517 Ok(meta) => FdMeta::Metadata(meta), 518 Err(_) => FdMeta::NoneObtained, 519 } 520 } 521 522 pub(super) enum CopyResult { 523 Ended(u64), 524 Error(Error, u64), 525 Fallback(u64), 526 } 527 528 impl CopyResult { 529 fn update_take(&self, reader: &mut impl CopyRead) { 530 match *self { 531 CopyResult::Fallback(bytes) 532 | CopyResult::Ended(bytes) 533 | CopyResult::Error(_, bytes) => reader.taken(bytes), 534 } 535 } 536 } 537 538 /// Invalid file descriptor. 539 /// 540 /// Valid file descriptors are guaranteed to be positive numbers (see `open()` manpage) 541 /// while negative values are used to indicate errors. 542 /// Thus -1 will never be overlap with a valid open file. 543 const INVALID_FD: RawFd = -1; 544 545 /// Linux-specific implementation that will attempt to use copy_file_range for copy offloading. 546 /// As the name says, it only works on regular files. 547 /// 548 /// Callers must handle fallback to a generic copy loop. 549 /// `Fallback` may indicate non-zero number of bytes already written 550 /// if one of the files' cursor +`max_len` would exceed u64::MAX (`EOVERFLOW`). 551 pub(super) fn copy_regular_files(reader: RawFd, writer: RawFd, max_len: u64) -> CopyResult { 552 use crate::std::cmp; 553 554 const NOT_PROBED: u8 = 0; 555 const UNAVAILABLE: u8 = 1; 556 const AVAILABLE: u8 = 2; 557 558 // Kernel prior to 4.5 don't have copy_file_range 559 // We store the availability in a global to avoid unnecessary syscalls 560 static HAS_COPY_FILE_RANGE: AtomicU8 = AtomicU8::new(NOT_PROBED); 561 562 syscall! { 563 fn copy_file_range( 564 fd_in: dlibc::c_int, 565 off_in: *mut dlibc::loff_t, 566 fd_out: dlibc::c_int, 567 off_out: *mut dlibc::loff_t, 568 len: dlibc::size_t, 569 flags: dlibc::c_uint 570 ) -> dlibc::ssize_t 571 } 572 573 match HAS_COPY_FILE_RANGE.load(Ordering::Relaxed) { 574 NOT_PROBED => { 575 // EPERM can indicate seccomp filters or an immutable file. 576 // To distinguish these cases we probe with invalid file descriptors which should result in EBADF if the syscall is supported 577 // and some other error (ENOSYS or EPERM) if it's not available 578 let result = unsafe { 579 cvt(copy_file_range( 580 INVALID_FD, 581 ptr::null_mut(), 582 INVALID_FD, 583 ptr::null_mut(), 584 1, 585 0, 586 )) 587 }; 588 589 if matches!(result.map_err(|e| e.raw_os_error()), Err(Some(EBADF))) { 590 HAS_COPY_FILE_RANGE.store(AVAILABLE, Ordering::Relaxed); 591 } else { 592 HAS_COPY_FILE_RANGE.store(UNAVAILABLE, Ordering::Relaxed); 593 return CopyResult::Fallback(0); 594 } 595 } 596 UNAVAILABLE => return CopyResult::Fallback(0), 597 _ => {} 598 }; 599 600 let mut written = 0u64; 601 while written < max_len { 602 let bytes_to_copy = cmp::min(max_len - written, usize::MAX as u64); 603 // cap to 1GB chunks in case u64::MAX is passed as max_len and the file has a non-zero seek position 604 // this allows us to copy large chunks without hitting EOVERFLOW, 605 // unless someone sets a file offset close to u64::MAX - 1GB, in which case a fallback would be required 606 let bytes_to_copy = cmp::min(bytes_to_copy as usize, 0x4000_0000usize); 607 let copy_result = unsafe { 608 // We actually don't have to adjust the offsets, 609 // because copy_file_range adjusts the file offset automatically 610 cvt(copy_file_range( 611 reader, 612 ptr::null_mut(), 613 writer, 614 ptr::null_mut(), 615 bytes_to_copy, 616 0, 617 )) 618 }; 619 620 match copy_result { 621 Ok(0) if written == 0 => { 622 // fallback to work around several kernel bugs where copy_file_range will fail to 623 // copy any bytes and return 0 instead of an error if 624 // - reading virtual files from the proc filesystem which appear to have 0 size 625 // but are not empty. noted in coreutils to affect kernels at least up to 5.6.19. 626 // - copying from an overlay filesystem in docker. reported to occur on fedora 32. 627 return CopyResult::Fallback(0); 628 } 629 Ok(0) => return CopyResult::Ended(written), // reached EOF 630 Ok(ret) => written += ret as u64, 631 Err(err) => { 632 return match err.raw_os_error() { 633 // when file offset + max_length > u64::MAX 634 Some(EOVERFLOW) => CopyResult::Fallback(written), 635 Some(ENOSYS | EXDEV | EINVAL | EPERM | EOPNOTSUPP | EBADF) if written == 0 => { 636 // Try fallback io::copy if either: 637 // - Kernel version is < 4.5 (ENOSYS¹) 638 // - Files are mounted on different fs (EXDEV) 639 // - copy_file_range is broken in various ways on RHEL/CentOS 7 (EOPNOTSUPP) 640 // - copy_file_range file is immutable or syscall is blocked by seccomp¹ (EPERM) 641 // - copy_file_range cannot be used with pipes or device nodes (EINVAL) 642 // - the writer fd was opened with O_APPEND (EBADF²) 643 // and no bytes were written successfully yet. (All these errnos should 644 // not be returned if something was already written, but they happen in 645 // the wild, see #91152.) 646 // 647 // ¹ these cases should be detected by the initial probe but we handle them here 648 // anyway in case syscall interception changes during runtime 649 // ² actually invalid file descriptors would cause this too, but in that case 650 // the fallback code path is expected to encounter the same error again 651 CopyResult::Fallback(0) 652 } 653 _ => CopyResult::Error(err, written), 654 }; 655 } 656 } 657 } 658 CopyResult::Ended(written) 659 } 660 661 #[derive(PartialEq)] 662 enum SpliceMode { 663 Sendfile, 664 Splice, 665 } 666 667 /// performs splice or sendfile between file descriptors 668 /// Does _not_ fall back to a generic copy loop. 669 fn sendfile_splice(mode: SpliceMode, reader: RawFd, writer: RawFd, len: u64) -> CopyResult { 670 static HAS_SENDFILE: AtomicBool = AtomicBool::new(true); 671 static HAS_SPLICE: AtomicBool = AtomicBool::new(true); 672 673 // Android builds use feature level 14, but the libc wrapper for splice is 674 // gated on feature level 21+, so we have to invoke the syscall directly. 675 #[cfg(target_os = "android")] 676 syscall! { 677 fn splice( 678 srcfd: dlibc::c_int, 679 src_offset: *const i64, 680 dstfd: dlibc::c_int, 681 dst_offset: *const i64, 682 len: dlibc::size_t, 683 flags: dlibc::c_int 684 ) -> dlibc::ssize_t 685 } 686 687 #[cfg(target_os = "linux")] 688 use dlibc::splice; 689 690 #[cfg(target_os = "dragonos")] 691 use dlibc::splice; 692 693 match mode { 694 SpliceMode::Sendfile if !HAS_SENDFILE.load(Ordering::Relaxed) => { 695 return CopyResult::Fallback(0); 696 } 697 SpliceMode::Splice if !HAS_SPLICE.load(Ordering::Relaxed) => { 698 return CopyResult::Fallback(0); 699 } 700 _ => (), 701 } 702 703 let mut written = 0u64; 704 while written < len { 705 // according to its manpage that's the maximum size sendfile() will copy per invocation 706 let chunk_size = crate::std::cmp::min(len - written, 0x7ffff000_u64) as usize; 707 708 let result = match mode { 709 SpliceMode::Sendfile => { 710 cvt(unsafe { sendfile64(writer, reader, ptr::null_mut(), chunk_size) }) 711 } 712 SpliceMode::Splice => cvt(unsafe { 713 splice( 714 reader, 715 ptr::null_mut(), 716 writer, 717 ptr::null_mut(), 718 chunk_size, 719 0, 720 ) 721 }), 722 }; 723 724 match result { 725 Ok(0) => break, // EOF 726 Ok(ret) => written += ret as u64, 727 Err(err) => { 728 return match err.raw_os_error() { 729 Some(ENOSYS | EPERM) => { 730 // syscall not supported (ENOSYS) 731 // syscall is disallowed, e.g. by seccomp (EPERM) 732 match mode { 733 SpliceMode::Sendfile => HAS_SENDFILE.store(false, Ordering::Relaxed), 734 SpliceMode::Splice => HAS_SPLICE.store(false, Ordering::Relaxed), 735 } 736 assert_eq!(written, 0); 737 CopyResult::Fallback(0) 738 } 739 Some(EINVAL) => { 740 // splice/sendfile do not support this particular file descriptor (EINVAL) 741 assert_eq!(written, 0); 742 CopyResult::Fallback(0) 743 } 744 Some(os_err) if mode == SpliceMode::Sendfile && os_err == EOVERFLOW => { 745 CopyResult::Fallback(written) 746 } 747 _ => CopyResult::Error(err, written), 748 }; 749 } 750 } 751 } 752 CopyResult::Ended(written) 753 } 754