1 //! This module contains specializations that can offload `io::copy()` operations on file descriptor
2 //! containing types (`File`, `TcpStream`, etc.) to more efficient syscalls than `read(2)` and `write(2)`.
3 //!
4 //! Specialization is only applied to wholly std-owned types so that user code can't observe
5 //! that the `Read` and `Write` traits are not used.
6 //!
7 //! Since a copy operation involves a reader and writer side where each can consist of different types
8 //! and also involve generic wrappers (e.g. `Take`, `BufReader`) it is not practical to specialize
9 //! a single method on all possible combinations.
10 //!
11 //! Instead readers and writers are handled separately by the `CopyRead` and `CopyWrite` specialization
12 //! traits and then specialized on by the `Copier::copy` method.
13 //!
14 //! `Copier` uses the specialization traits to unpack the underlying file descriptors and
15 //! additional prerequisites and constraints imposed by the wrapper types.
16 //!
17 //! Once it has obtained all necessary pieces and brought any wrapper types into a state where they
18 //! can be safely bypassed it will attempt to use the `copy_file_range(2)`,
19 //! `sendfile(2)` or `splice(2)` syscalls to move data directly between file descriptors.
20 //! Since those syscalls have requirements that cannot be fully checked in advance it attempts
21 //! to use them one after another (guided by hints) to figure out which one works and
22 //! falls back to the generic read-write copy loop if none of them does.
23 //! Once a working syscall is found for a pair of file descriptors it will be called in a loop
24 //! until the copy operation is completed.
25 //!
26 //! Advantages of using these syscalls:
27 //!
28 //! * fewer context switches since reads and writes are coalesced into a single syscall
29 //! and more bytes are transferred per syscall. This translates to higher throughput
30 //! and fewer CPU cycles, at least for sufficiently large transfers to amortize the initial probing.
31 //! * `copy_file_range` creates reflink copies on CoW filesystems, thus moving less data and
32 //! consuming less disk space
33 //! * `sendfile` and `splice` can perform zero-copy IO under some circumstances while
34 //! a naive copy loop would move every byte through the CPU.
35 //!
36 //! Drawbacks:
37 //!
38 //! * copy operations smaller than the default buffer size can under some circumstances, especially
39 //! on older kernels, incur more syscalls than the naive approach would. As mentioned above
40 //! the syscall selection is guided by hints to minimize this possibility but they are not perfect.
41 //! * optimizations only apply to std types. If a user adds a custom wrapper type, e.g. to report
42 //! progress, they can hit a performance cliff.
43 //! * complexity
44
45 use crate::std::cmp::min;
46 use crate::std::fs::{File, Metadata};
47 use crate::std::io::copy::generic_copy;
48 use crate::std::io::{
49 BufRead, BufReader, BufWriter, Error, Read, Result, StderrLock, StdinLock, StdoutLock, Take,
50 Write,
51 };
52 use crate::std::mem::ManuallyDrop;
53 use crate::std::net::TcpStream;
54 use crate::std::os::unix::fs::FileTypeExt;
55 use crate::std::os::unix::io::{AsRawFd, FromRawFd, RawFd};
56 use crate::std::os::unix::net::UnixStream;
57 use crate::std::process::{ChildStderr, ChildStdin, ChildStdout};
58 use crate::std::ptr;
59 use crate::std::sync::atomic::{AtomicBool, AtomicU8, Ordering};
60 use crate::std::sys::cvt;
61 use crate::std::sys::weak::syscall;
62 use dlibc;
63 #[cfg(not(all(target_os = "linux", target_env = "gnu")))]
64 use dlibc::sendfile as sendfile64;
65 #[cfg(all(target_os = "linux", target_env = "gnu"))]
66 use dlibc::sendfile64;
67 use dlibc::{EBADF, EINVAL, ENOSYS, EOPNOTSUPP, EOVERFLOW, EPERM, EXDEV};
68
69 #[allow(dead_code)]
copy_spec<R: Read + ?Sized, W: Write + ?Sized>( read: &mut R, write: &mut W, ) -> Result<u64>70 pub(crate) fn copy_spec<R: Read + ?Sized, W: Write + ?Sized>(
71 read: &mut R,
72 write: &mut W,
73 ) -> Result<u64> {
74 let copier = Copier { read, write };
75 SpecCopy::copy(copier)
76 }
77
/// This type represents either the inferred `FileType` of a `RawFd` based on the source
/// type from which it was extracted or the actual metadata.
///
/// The methods on this type only provide hints: due to `AsRawFd` and `FromRawFd` the inferred
/// type may be wrong.
enum FdMeta {
    /// Metadata that was actually obtained for the descriptor.
    Metadata(Metadata),
    /// Inferred from the wrapping type to be a socket; not verified.
    Socket,
    /// Inferred from the wrapping type to be a pipe; not verified.
    Pipe,
    /// We don't have any metadata because the stat syscall failed
    NoneObtained,
}
90
/// Identifies which side of a copy a file descriptor is used on.
///
/// `FdMeta::copy_file_range_candidate` applies different checks to each side.
#[derive(PartialEq)]
enum FdHandle {
    /// The descriptor data is read from.
    Input,
    /// The descriptor data is written to.
    Output,
}
96
97 impl FdMeta {
maybe_fifo(&self) -> bool98 fn maybe_fifo(&self) -> bool {
99 match self {
100 FdMeta::Metadata(meta) => meta.file_type().is_fifo(),
101 FdMeta::Socket => false,
102 FdMeta::Pipe => true,
103 FdMeta::NoneObtained => true,
104 }
105 }
106
potential_sendfile_source(&self) -> bool107 fn potential_sendfile_source(&self) -> bool {
108 match self {
109 // procfs erroneously shows 0 length on non-empty readable files.
110 // and if a file is truly empty then a `read` syscall will determine that and skip the write syscall
111 // thus there would be benefit from attempting sendfile
112 FdMeta::Metadata(meta)
113 if meta.file_type().is_file() && meta.len() > 0
114 || meta.file_type().is_block_device() =>
115 {
116 true
117 }
118 _ => false,
119 }
120 }
121
copy_file_range_candidate(&self, f: FdHandle) -> bool122 fn copy_file_range_candidate(&self, f: FdHandle) -> bool {
123 match self {
124 // copy_file_range will fail on empty procfs files. `read` can determine whether EOF has been reached
125 // without extra cost and skip the write, thus there is no benefit in attempting copy_file_range
126 FdMeta::Metadata(meta) if f == FdHandle::Input && meta.is_file() && meta.len() > 0 => {
127 true
128 }
129 FdMeta::Metadata(meta) if f == FdHandle::Output && meta.is_file() => true,
130 _ => false,
131 }
132 }
133 }
134
135 /// Returns true either if changes made to the source after a sendfile/splice call won't become
136 /// visible in the sink or the source has explicitly opted into such behavior (e.g. by splicing
137 /// a file into a pipe, the pipe being the source in this case).
138 ///
139 /// This will prevent File -> Pipe and File -> Socket splicing/sendfile optimizations to uphold
140 /// the Read/Write API semantics of io::copy.
141 ///
142 /// Note: This is not 100% airtight, the caller can use the RawFd conversion methods to turn a
143 /// regular file into a TcpSocket which will be treated as a socket here without checking.
safe_kernel_copy(source: &FdMeta, sink: &FdMeta) -> bool144 fn safe_kernel_copy(source: &FdMeta, sink: &FdMeta) -> bool {
145 match (source, sink) {
146 // Data arriving from a socket is safe because the sender can't modify the socket buffer.
147 // Data arriving from a pipe is safe(-ish) because either the sender *copied*
148 // the bytes into the pipe OR explicitly performed an operation that enables zero-copy,
149 // thus promising not to modify the data later.
150 (FdMeta::Socket, _) => true,
151 (FdMeta::Pipe, _) => true,
152 (FdMeta::Metadata(meta), _)
153 if meta.file_type().is_fifo() || meta.file_type().is_socket() =>
154 {
155 true
156 }
157 // Data going into non-pipes/non-sockets is safe because the "later changes may become visible" issue
158 // only happens for pages sitting in send buffers or pipes.
159 (_, FdMeta::Metadata(meta))
160 if !meta.file_type().is_fifo() && !meta.file_type().is_socket() =>
161 {
162 true
163 }
164 _ => false,
165 }
166 }
167
/// Result of `properties()`: the type hints/metadata for a reader or writer together with its
/// raw file descriptor. The offloading path is only taken when the descriptor is `Some` on
/// both sides.
struct CopyParams(FdMeta, Option<RawFd>);
169
/// Pairs the reader and writer of one copy operation so that `SpecCopy` can be
/// implemented (and specialized) on the combination of both types.
struct Copier<'a, 'b, R: Read + ?Sized, W: Write + ?Sized> {
    /// Source of the copy.
    read: &'a mut R,
    /// Destination of the copy.
    write: &'b mut W,
}
174
/// Specialization hook for the copy operation itself.
trait SpecCopy {
    /// Performs the copy, consuming `self`, and returns the number of bytes copied.
    fn copy(self) -> Result<u64>;
}
178
179 impl<R: Read + ?Sized, W: Write + ?Sized> SpecCopy for Copier<'_, '_, R, W> {
copy(self) -> Result<u64>180 default fn copy(self) -> Result<u64> {
181 generic_copy(self.read, self.write)
182 }
183 }
184
185 impl<R: CopyRead, W: CopyWrite> SpecCopy for Copier<'_, '_, R, W> {
copy(self) -> Result<u64>186 fn copy(self) -> Result<u64> {
187 let (reader, writer) = (self.read, self.write);
188 let r_cfg = reader.properties();
189 let w_cfg = writer.properties();
190
191 // before direct operations on file descriptors ensure that all source and sink buffers are empty
192 let mut flush = || -> crate::std::io::Result<u64> {
193 let bytes = reader.drain_to(writer, u64::MAX)?;
194 // BufWriter buffered bytes have already been accounted for in earlier write() calls
195 writer.flush()?;
196 Ok(bytes)
197 };
198
199 let mut written = 0u64;
200
201 if let (CopyParams(input_meta, Some(readfd)), CopyParams(output_meta, Some(writefd))) =
202 (r_cfg, w_cfg)
203 {
204 written += flush()?;
205 let max_write = reader.min_limit();
206
207 if input_meta.copy_file_range_candidate(FdHandle::Input)
208 && output_meta.copy_file_range_candidate(FdHandle::Output)
209 {
210 let result = copy_regular_files(readfd, writefd, max_write);
211 result.update_take(reader);
212
213 match result {
214 CopyResult::Ended(bytes_copied) => return Ok(bytes_copied + written),
215 CopyResult::Error(e, _) => return Err(e),
216 CopyResult::Fallback(bytes) => written += bytes,
217 }
218 }
219
220 // on modern kernels sendfile can copy from any mmapable type (some but not all regular files and block devices)
221 // to any writable file descriptor. On older kernels the writer side can only be a socket.
222 // So we just try and fallback if needed.
223 // If current file offsets + write sizes overflow it may also fail, we do not try to fix that and instead
224 // fall back to the generic copy loop.
225 if input_meta.potential_sendfile_source() && safe_kernel_copy(&input_meta, &output_meta)
226 {
227 let result = sendfile_splice(SpliceMode::Sendfile, readfd, writefd, max_write);
228 result.update_take(reader);
229
230 match result {
231 CopyResult::Ended(bytes_copied) => return Ok(bytes_copied + written),
232 CopyResult::Error(e, _) => return Err(e),
233 CopyResult::Fallback(bytes) => written += bytes,
234 }
235 }
236
237 if (input_meta.maybe_fifo() || output_meta.maybe_fifo())
238 && safe_kernel_copy(&input_meta, &output_meta)
239 {
240 let result = sendfile_splice(SpliceMode::Splice, readfd, writefd, max_write);
241 result.update_take(reader);
242
243 match result {
244 CopyResult::Ended(bytes_copied) => return Ok(bytes_copied + written),
245 CopyResult::Error(e, _) => return Err(e),
246 CopyResult::Fallback(0) => { /* use the fallback below */ }
247 CopyResult::Fallback(_) => {
248 unreachable!("splice should not return > 0 bytes on the fallback path")
249 }
250 }
251 }
252 }
253
254 // fallback if none of the more specialized syscalls wants to work with these file descriptors
255 match generic_copy(reader, writer) {
256 Ok(bytes) => Ok(bytes + written),
257 err => err,
258 }
259 }
260 }
261
/// Reader-side specialization trait: lets the copy routine look through std wrapper types
/// to the underlying file descriptor and keep the wrappers' state consistent when their
/// `Read` implementation is bypassed.
#[rustc_specialization_trait]
trait CopyRead: Read {
    /// Implementations that contain buffers (i.e. `BufReader`) must transfer data from their internal
    /// buffers into `writer` until either the buffers are emptied or `limit` bytes have been
    /// transferred, whichever occurs sooner.
    /// If nested buffers are present the outer buffers must be drained first.
    ///
    /// This is necessary to directly bypass the wrapper types while preserving the data order
    /// when operating directly on the underlying file descriptors.
    ///
    /// Returns the number of bytes transferred; the default implementation has no buffer
    /// and transfers nothing.
    fn drain_to<W: Write>(&mut self, _writer: &mut W, _limit: u64) -> Result<u64> {
        Ok(0)
    }

    /// Updates `Take` wrappers to remove the number of bytes copied.
    /// The default implementation does nothing.
    fn taken(&mut self, _bytes: u64) {}

    /// The minimum of the limit of all `Take<_>` wrappers, `u64::MAX` otherwise.
    /// This method does not account for data in `BufReader` buffers and would underreport
    /// the limit of a `Take<BufReader<Take<_>>>` type. Thus its result is only valid
    /// after draining the buffers via `drain_to`.
    fn min_limit(&self) -> u64 {
        u64::MAX
    }

    /// Extracts the file descriptor and hints/metadata, delegating through wrappers if necessary.
    fn properties(&self) -> CopyParams;
}
289
/// Writer-side specialization trait: lets the copy routine look through std wrapper types
/// to the underlying file descriptor.
#[rustc_specialization_trait]
trait CopyWrite: Write {
    /// Extracts the file descriptor and hints/metadata, delegating through wrappers if necessary.
    fn properties(&self) -> CopyParams;
}
295
296 impl<T> CopyRead for &mut T
297 where
298 T: CopyRead,
299 {
drain_to<W: Write>(&mut self, writer: &mut W, limit: u64) -> Result<u64>300 fn drain_to<W: Write>(&mut self, writer: &mut W, limit: u64) -> Result<u64> {
301 (**self).drain_to(writer, limit)
302 }
303
taken(&mut self, bytes: u64)304 fn taken(&mut self, bytes: u64) {
305 (**self).taken(bytes);
306 }
307
min_limit(&self) -> u64308 fn min_limit(&self) -> u64 {
309 (**self).min_limit()
310 }
311
properties(&self) -> CopyParams312 fn properties(&self) -> CopyParams {
313 (**self).properties()
314 }
315 }
316
317 impl<T> CopyWrite for &mut T
318 where
319 T: CopyWrite,
320 {
properties(&self) -> CopyParams321 fn properties(&self) -> CopyParams {
322 (**self).properties()
323 }
324 }
325
326 impl CopyRead for File {
properties(&self) -> CopyParams327 fn properties(&self) -> CopyParams {
328 CopyParams(fd_to_meta(self), Some(self.as_raw_fd()))
329 }
330 }
331
332 impl CopyRead for &File {
properties(&self) -> CopyParams333 fn properties(&self) -> CopyParams {
334 CopyParams(fd_to_meta(*self), Some(self.as_raw_fd()))
335 }
336 }
337
338 impl CopyWrite for File {
properties(&self) -> CopyParams339 fn properties(&self) -> CopyParams {
340 CopyParams(fd_to_meta(self), Some(self.as_raw_fd()))
341 }
342 }
343
344 impl CopyWrite for &File {
properties(&self) -> CopyParams345 fn properties(&self) -> CopyParams {
346 CopyParams(fd_to_meta(*self), Some(self.as_raw_fd()))
347 }
348 }
349
350 impl CopyRead for TcpStream {
properties(&self) -> CopyParams351 fn properties(&self) -> CopyParams {
352 // avoid the stat syscall since we can be fairly sure it's a socket
353 CopyParams(FdMeta::Socket, Some(self.as_raw_fd()))
354 }
355 }
356
357 impl CopyRead for &TcpStream {
properties(&self) -> CopyParams358 fn properties(&self) -> CopyParams {
359 // avoid the stat syscall since we can be fairly sure it's a socket
360 CopyParams(FdMeta::Socket, Some(self.as_raw_fd()))
361 }
362 }
363
364 impl CopyWrite for TcpStream {
properties(&self) -> CopyParams365 fn properties(&self) -> CopyParams {
366 // avoid the stat syscall since we can be fairly sure it's a socket
367 CopyParams(FdMeta::Socket, Some(self.as_raw_fd()))
368 }
369 }
370
371 impl CopyWrite for &TcpStream {
properties(&self) -> CopyParams372 fn properties(&self) -> CopyParams {
373 // avoid the stat syscall since we can be fairly sure it's a socket
374 CopyParams(FdMeta::Socket, Some(self.as_raw_fd()))
375 }
376 }
377
378 impl CopyRead for UnixStream {
properties(&self) -> CopyParams379 fn properties(&self) -> CopyParams {
380 // avoid the stat syscall since we can be fairly sure it's a socket
381 CopyParams(FdMeta::Socket, Some(self.as_raw_fd()))
382 }
383 }
384
385 impl CopyRead for &UnixStream {
properties(&self) -> CopyParams386 fn properties(&self) -> CopyParams {
387 // avoid the stat syscall since we can be fairly sure it's a socket
388 CopyParams(FdMeta::Socket, Some(self.as_raw_fd()))
389 }
390 }
391
392 impl CopyWrite for UnixStream {
properties(&self) -> CopyParams393 fn properties(&self) -> CopyParams {
394 // avoid the stat syscall since we can be fairly sure it's a socket
395 CopyParams(FdMeta::Socket, Some(self.as_raw_fd()))
396 }
397 }
398
399 impl CopyWrite for &UnixStream {
properties(&self) -> CopyParams400 fn properties(&self) -> CopyParams {
401 // avoid the stat syscall since we can be fairly sure it's a socket
402 CopyParams(FdMeta::Socket, Some(self.as_raw_fd()))
403 }
404 }
405
406 impl CopyWrite for ChildStdin {
properties(&self) -> CopyParams407 fn properties(&self) -> CopyParams {
408 CopyParams(FdMeta::Pipe, Some(self.as_raw_fd()))
409 }
410 }
411
412 impl CopyRead for ChildStdout {
properties(&self) -> CopyParams413 fn properties(&self) -> CopyParams {
414 CopyParams(FdMeta::Pipe, Some(self.as_raw_fd()))
415 }
416 }
417
418 impl CopyRead for ChildStderr {
properties(&self) -> CopyParams419 fn properties(&self) -> CopyParams {
420 CopyParams(FdMeta::Pipe, Some(self.as_raw_fd()))
421 }
422 }
423
424 impl CopyRead for StdinLock<'_> {
drain_to<W: Write>(&mut self, writer: &mut W, outer_limit: u64) -> Result<u64>425 fn drain_to<W: Write>(&mut self, writer: &mut W, outer_limit: u64) -> Result<u64> {
426 let buf_reader = self.as_mut_buf();
427 let buf = buf_reader.buffer();
428 let buf = &buf[0..min(buf.len(), outer_limit.try_into().unwrap_or(usize::MAX))];
429 let bytes_drained = buf.len();
430 writer.write_all(buf)?;
431 buf_reader.consume(bytes_drained);
432
433 Ok(bytes_drained as u64)
434 }
435
properties(&self) -> CopyParams436 fn properties(&self) -> CopyParams {
437 CopyParams(fd_to_meta(self), Some(self.as_raw_fd()))
438 }
439 }
440
441 impl CopyWrite for StdoutLock<'_> {
properties(&self) -> CopyParams442 fn properties(&self) -> CopyParams {
443 CopyParams(fd_to_meta(self), Some(self.as_raw_fd()))
444 }
445 }
446
447 impl CopyWrite for StderrLock<'_> {
properties(&self) -> CopyParams448 fn properties(&self) -> CopyParams {
449 CopyParams(fd_to_meta(self), Some(self.as_raw_fd()))
450 }
451 }
452
453 impl<T: CopyRead> CopyRead for Take<T> {
drain_to<W: Write>(&mut self, writer: &mut W, outer_limit: u64) -> Result<u64>454 fn drain_to<W: Write>(&mut self, writer: &mut W, outer_limit: u64) -> Result<u64> {
455 let local_limit = self.limit();
456 let combined_limit = min(outer_limit, local_limit);
457 let bytes_drained = self.get_mut().drain_to(writer, combined_limit)?;
458 // update limit since read() was bypassed
459 self.set_limit(local_limit - bytes_drained);
460
461 Ok(bytes_drained)
462 }
463
taken(&mut self, bytes: u64)464 fn taken(&mut self, bytes: u64) {
465 self.set_limit(self.limit() - bytes);
466 self.get_mut().taken(bytes);
467 }
468
min_limit(&self) -> u64469 fn min_limit(&self) -> u64 {
470 min(Take::limit(self), self.get_ref().min_limit())
471 }
472
properties(&self) -> CopyParams473 fn properties(&self) -> CopyParams {
474 self.get_ref().properties()
475 }
476 }
477
478 impl<T: ?Sized + CopyRead> CopyRead for BufReader<T> {
drain_to<W: Write>(&mut self, writer: &mut W, outer_limit: u64) -> Result<u64>479 fn drain_to<W: Write>(&mut self, writer: &mut W, outer_limit: u64) -> Result<u64> {
480 let buf = self.buffer();
481 let buf = &buf[0..min(buf.len(), outer_limit.try_into().unwrap_or(usize::MAX))];
482 let bytes = buf.len();
483 writer.write_all(buf)?;
484 self.consume(bytes);
485
486 let remaining = outer_limit - bytes as u64;
487
488 // in case of nested bufreaders we also need to drain the ones closer to the source
489 let inner_bytes = self.get_mut().drain_to(writer, remaining)?;
490
491 Ok(bytes as u64 + inner_bytes)
492 }
493
taken(&mut self, bytes: u64)494 fn taken(&mut self, bytes: u64) {
495 self.get_mut().taken(bytes);
496 }
497
min_limit(&self) -> u64498 fn min_limit(&self) -> u64 {
499 self.get_ref().min_limit()
500 }
501
properties(&self) -> CopyParams502 fn properties(&self) -> CopyParams {
503 self.get_ref().properties()
504 }
505 }
506
507 impl<T: ?Sized + CopyWrite> CopyWrite for BufWriter<T> {
properties(&self) -> CopyParams508 fn properties(&self) -> CopyParams {
509 self.get_ref().properties()
510 }
511 }
512
fd_to_meta<T: AsRawFd>(fd: &T) -> FdMeta513 fn fd_to_meta<T: AsRawFd>(fd: &T) -> FdMeta {
514 let fd = fd.as_raw_fd();
515 let file: ManuallyDrop<File> = ManuallyDrop::new(unsafe { File::from_raw_fd(fd) });
516 match file.metadata() {
517 Ok(meta) => FdMeta::Metadata(meta),
518 Err(_) => FdMeta::NoneObtained,
519 }
520 }
521
/// Outcome of one syscall-based copy attempt. Every variant carries the number of bytes
/// that the attempt transferred.
pub(super) enum CopyResult {
    /// The attempt ran to completion (EOF or the requested length was reached).
    Ended(u64),
    /// The attempt failed with an error that should be reported to the caller.
    Error(Error, u64),
    /// This syscall could not be used (further); the caller should try another strategy.
    Fallback(u64),
}
527
528 impl CopyResult {
update_take(&self, reader: &mut impl CopyRead)529 fn update_take(&self, reader: &mut impl CopyRead) {
530 match *self {
531 CopyResult::Fallback(bytes)
532 | CopyResult::Ended(bytes)
533 | CopyResult::Error(_, bytes) => reader.taken(bytes),
534 }
535 }
536 }
537
/// Invalid file descriptor.
///
/// Valid file descriptors are guaranteed to be non-negative numbers (see `open()` manpage)
/// while negative values are used to indicate errors.
/// Thus -1 will never overlap with a valid open file.
const INVALID_FD: RawFd = -1;
544
/// Linux-specific implementation that will attempt to use copy_file_range for copy offloading.
/// As the name says, it only works on regular files.
///
/// Copies at most `max_len` bytes from `reader` to `writer`, using the descriptors' current
/// file offsets.
///
/// Callers must handle fallback to a generic copy loop.
/// `Fallback` may indicate a non-zero number of bytes already written
/// if one of the files' cursor + `max_len` would exceed u64::MAX (`EOVERFLOW`).
///
/// Returns:
/// * `Ended(n)` once `max_len` bytes were copied or EOF was reached after copying `n` bytes,
/// * `Fallback(0)` if the syscall is unavailable, copied nothing on the first call, or
///   failed with one of the "not applicable here" errors before anything was written,
/// * `Fallback(n)` on `EOVERFLOW`,
/// * `Error(err, n)` for every other error.
pub(super) fn copy_regular_files(reader: RawFd, writer: RawFd, max_len: u64) -> CopyResult {
    use crate::std::cmp;

    // States of the process-wide availability probe stored in `HAS_COPY_FILE_RANGE`.
    const NOT_PROBED: u8 = 0;
    const UNAVAILABLE: u8 = 1;
    const AVAILABLE: u8 = 2;

    // Kernels prior to 4.5 don't have copy_file_range
    // We store the availability in a global to avoid unnecessary syscalls
    static HAS_COPY_FILE_RANGE: AtomicU8 = AtomicU8::new(NOT_PROBED);

    syscall! {
        fn copy_file_range(
            fd_in: dlibc::c_int,
            off_in: *mut dlibc::loff_t,
            fd_out: dlibc::c_int,
            off_out: *mut dlibc::loff_t,
            len: dlibc::size_t,
            flags: dlibc::c_uint
        ) -> dlibc::ssize_t
    }

    match HAS_COPY_FILE_RANGE.load(Ordering::Relaxed) {
        NOT_PROBED => {
            // EPERM can indicate seccomp filters or an immutable file.
            // To distinguish these cases we probe with invalid file descriptors which should result in EBADF if the syscall is supported
            // and some other error (ENOSYS or EPERM) if it's not available
            let result = unsafe {
                cvt(copy_file_range(
                    INVALID_FD,
                    ptr::null_mut(),
                    INVALID_FD,
                    ptr::null_mut(),
                    1,
                    0,
                ))
            };

            // Anything other than EBADF is treated as "not available" and remembered.
            if matches!(result.map_err(|e| e.raw_os_error()), Err(Some(EBADF))) {
                HAS_COPY_FILE_RANGE.store(AVAILABLE, Ordering::Relaxed);
            } else {
                HAS_COPY_FILE_RANGE.store(UNAVAILABLE, Ordering::Relaxed);
                return CopyResult::Fallback(0);
            }
        }
        UNAVAILABLE => return CopyResult::Fallback(0),
        _ => {}
    };

    let mut written = 0u64;
    while written < max_len {
        let bytes_to_copy = cmp::min(max_len - written, usize::MAX as u64);
        // cap to 1GB chunks in case u64::MAX is passed as max_len and the file has a non-zero seek position
        // this allows us to copy large chunks without hitting EOVERFLOW,
        // unless someone sets a file offset close to u64::MAX - 1GB, in which case a fallback would be required
        let bytes_to_copy = cmp::min(bytes_to_copy as usize, 0x4000_0000usize);
        let copy_result = unsafe {
            // We actually don't have to adjust the offsets,
            // because copy_file_range adjusts the file offset automatically
            cvt(copy_file_range(
                reader,
                ptr::null_mut(),
                writer,
                ptr::null_mut(),
                bytes_to_copy,
                0,
            ))
        };

        match copy_result {
            Ok(0) if written == 0 => {
                // fallback to work around several kernel bugs where copy_file_range will fail to
                // copy any bytes and return 0 instead of an error if
                // - reading virtual files from the proc filesystem which appear to have 0 size
                //   but are not empty. noted in coreutils to affect kernels at least up to 5.6.19.
                // - copying from an overlay filesystem in docker. reported to occur on fedora 32.
                return CopyResult::Fallback(0);
            }
            Ok(0) => return CopyResult::Ended(written), // reached EOF
            Ok(ret) => written += ret as u64,
            Err(err) => {
                return match err.raw_os_error() {
                    // when file offset + max_length > u64::MAX
                    Some(EOVERFLOW) => CopyResult::Fallback(written),
                    Some(ENOSYS | EXDEV | EINVAL | EPERM | EOPNOTSUPP | EBADF) if written == 0 => {
                        // Try fallback io::copy if either:
                        // - Kernel version is < 4.5 (ENOSYS¹)
                        // - Files are mounted on different fs (EXDEV)
                        // - copy_file_range is broken in various ways on RHEL/CentOS 7 (EOPNOTSUPP)
                        // - copy_file_range file is immutable or syscall is blocked by seccomp¹ (EPERM)
                        // - copy_file_range cannot be used with pipes or device nodes (EINVAL)
                        // - the writer fd was opened with O_APPEND (EBADF²)
                        // and no bytes were written successfully yet. (All these errnos should
                        // not be returned if something was already written, but they happen in
                        // the wild, see #91152.)
                        //
                        // ¹ these cases should be detected by the initial probe but we handle them here
                        //   anyway in case syscall interception changes during runtime
                        // ² actually invalid file descriptors would cause this too, but in that case
                        //   the fallback code path is expected to encounter the same error again
                        CopyResult::Fallback(0)
                    }
                    _ => CopyResult::Error(err, written),
                };
            }
        }
    }
    // `max_len` bytes have been copied
    CopyResult::Ended(written)
}
660
/// Selects which syscall `sendfile_splice` issues.
#[derive(PartialEq)]
enum SpliceMode {
    /// Use `sendfile`.
    Sendfile,
    /// Use `splice`.
    Splice,
}
666
/// Performs splice or sendfile between file descriptors, copying at most `len` bytes from
/// `reader` to `writer` with the syscall selected by `mode`.
/// Does _not_ fall back to a generic copy loop.
///
/// Returns:
/// * `Ended(n)` once `len` bytes were copied or the syscall reported EOF after `n` bytes,
/// * `Fallback(0)` if the syscall was previously found to be unusable, or fails with
///   `ENOSYS`, `EPERM` or `EINVAL`,
/// * `Fallback(n)` if `sendfile` fails with `EOVERFLOW`,
/// * `Error(err, n)` for every other error.
///
/// # Panics
///
/// Panics if `ENOSYS`, `EPERM` or `EINVAL` is reported after some bytes were already copied.
fn sendfile_splice(mode: SpliceMode, reader: RawFd, writer: RawFd, len: u64) -> CopyResult {
    // Process-wide flags, cleared once the respective syscall fails with ENOSYS or EPERM.
    static HAS_SENDFILE: AtomicBool = AtomicBool::new(true);
    static HAS_SPLICE: AtomicBool = AtomicBool::new(true);

    // Android builds use feature level 14, but the libc wrapper for splice is
    // gated on feature level 21+, so we have to invoke the syscall directly.
    #[cfg(target_os = "android")]
    syscall! {
        fn splice(
            srcfd: dlibc::c_int,
            src_offset: *const i64,
            dstfd: dlibc::c_int,
            dst_offset: *const i64,
            len: dlibc::size_t,
            flags: dlibc::c_int
        ) -> dlibc::ssize_t
    }

    #[cfg(target_os = "linux")]
    use dlibc::splice;

    #[cfg(target_os = "dragonos")]
    use dlibc::splice;

    // Skip the syscall entirely if an earlier call found it to be unavailable.
    match mode {
        SpliceMode::Sendfile if !HAS_SENDFILE.load(Ordering::Relaxed) => {
            return CopyResult::Fallback(0);
        }
        SpliceMode::Splice if !HAS_SPLICE.load(Ordering::Relaxed) => {
            return CopyResult::Fallback(0);
        }
        _ => (),
    }

    let mut written = 0u64;
    while written < len {
        // according to its manpage that's the maximum size sendfile() will copy per invocation
        let chunk_size = crate::std::cmp::min(len - written, 0x7ffff000_u64) as usize;

        // Null offset pointers: the descriptors' own file offsets are used.
        let result = match mode {
            SpliceMode::Sendfile => {
                cvt(unsafe { sendfile64(writer, reader, ptr::null_mut(), chunk_size) })
            }
            SpliceMode::Splice => cvt(unsafe {
                splice(
                    reader,
                    ptr::null_mut(),
                    writer,
                    ptr::null_mut(),
                    chunk_size,
                    0,
                )
            }),
        };

        match result {
            Ok(0) => break, // EOF
            Ok(ret) => written += ret as u64,
            Err(err) => {
                return match err.raw_os_error() {
                    Some(ENOSYS | EPERM) => {
                        // syscall not supported (ENOSYS)
                        // syscall is disallowed, e.g. by seccomp (EPERM)
                        match mode {
                            SpliceMode::Sendfile => HAS_SENDFILE.store(false, Ordering::Relaxed),
                            SpliceMode::Splice => HAS_SPLICE.store(false, Ordering::Relaxed),
                        }
                        assert_eq!(written, 0);
                        CopyResult::Fallback(0)
                    }
                    Some(EINVAL) => {
                        // splice/sendfile do not support this particular file descriptor (EINVAL)
                        assert_eq!(written, 0);
                        CopyResult::Fallback(0)
                    }
                    // only sendfile reports partial progress on the fallback path
                    Some(os_err) if mode == SpliceMode::Sendfile && os_err == EOVERFLOW => {
                        CopyResult::Fallback(written)
                    }
                    _ => CopyResult::Error(err, written),
                };
            }
        }
    }
    CopyResult::Ended(written)
}
754