xref: /drstd/src/std/sys/unix/kernel_copy.rs (revision 9670759b785600bf6315e4173e46a602f16add7a)
1 //! This module contains specializations that can offload `io::copy()` operations on file descriptor
2 //! containing types (`File`, `TcpStream`, etc.) to more efficient syscalls than `read(2)` and `write(2)`.
3 //!
4 //! Specialization is only applied to wholly std-owned types so that user code can't observe
5 //! that the `Read` and `Write` traits are not used.
6 //!
7 //! Since a copy operation involves a reader and writer side where each can consist of different types
8 //! and also involve generic wrappers (e.g. `Take`, `BufReader`) it is not practical to specialize
9 //! a single method on all possible combinations.
10 //!
11 //! Instead readers and writers are handled separately by the `CopyRead` and `CopyWrite` specialization
12 //! traits and then specialized on by the `Copier::copy` method.
13 //!
14 //! `Copier` uses the specialization traits to unpack the underlying file descriptors and
15 //! additional prerequisites and constraints imposed by the wrapper types.
16 //!
17 //! Once it has obtained all necessary pieces and brought any wrapper types into a state where they
18 //! can be safely bypassed it will attempt to use the `copy_file_range(2)`,
19 //! `sendfile(2)` or `splice(2)` syscalls to move data directly between file descriptors.
20 //! Since those syscalls have requirements that cannot be fully checked in advance it attempts
21 //! to use them one after another (guided by hints) to figure out which one works and
22 //! falls back to the generic read-write copy loop if none of them does.
23 //! Once a working syscall is found for a pair of file descriptors it will be called in a loop
24 //! until the copy operation is completed.
25 //!
26 //! Advantages of using these syscalls:
27 //!
28 //! * fewer context switches since reads and writes are coalesced into a single syscall
29 //!   and more bytes are transferred per syscall. This translates to higher throughput
30 //!   and fewer CPU cycles, at least for sufficiently large transfers to amortize the initial probing.
31 //! * `copy_file_range` creates reflink copies on CoW filesystems, thus moving less data and
32 //!   consuming less disk space
33 //! * `sendfile` and `splice` can perform zero-copy IO under some circumstances while
34 //!   a naive copy loop would move every byte through the CPU.
35 //!
36 //! Drawbacks:
37 //!
38 //! * copy operations smaller than the default buffer size can under some circumstances, especially
39 //!   on older kernels, incur more syscalls than the naive approach would. As mentioned above
40 //!   the syscall selection is guided by hints to minimize this possibility but they are not perfect.
41 //! * optimizations only apply to std types. If a user adds a custom wrapper type, e.g. to report
42 //!   progress, they can hit a performance cliff.
43 //! * complexity
44 
45 use crate::std::cmp::min;
46 use crate::std::fs::{File, Metadata};
47 use crate::std::io::copy::generic_copy;
48 use crate::std::io::{
49     BufRead, BufReader, BufWriter, Error, Read, Result, StderrLock, StdinLock, StdoutLock, Take,
50     Write,
51 };
52 use crate::std::mem::ManuallyDrop;
53 use crate::std::net::TcpStream;
54 use crate::std::os::unix::fs::FileTypeExt;
55 use crate::std::os::unix::io::{AsRawFd, FromRawFd, RawFd};
56 use crate::std::os::unix::net::UnixStream;
57 use crate::std::process::{ChildStderr, ChildStdin, ChildStdout};
58 use crate::std::ptr;
59 use crate::std::sync::atomic::{AtomicBool, AtomicU8, Ordering};
60 use crate::std::sys::cvt;
61 use crate::std::sys::weak::syscall;
62 use dlibc;
63 #[cfg(not(all(target_os = "linux", target_env = "gnu")))]
64 use dlibc::sendfile as sendfile64;
65 #[cfg(all(target_os = "linux", target_env = "gnu"))]
66 use dlibc::sendfile64;
67 use dlibc::{EBADF, EINVAL, ENOSYS, EOPNOTSUPP, EOVERFLOW, EPERM, EXDEV};
68 #[cfg(test)]
69 mod tests;
70 
71 pub(crate) fn copy_spec<R: Read + ?Sized, W: Write + ?Sized>(
72     read: &mut R,
73     write: &mut W,
74 ) -> Result<u64> {
75     let copier = Copier { read, write };
76     SpecCopy::copy(copier)
77 }
78 
/// This type represents either the inferred `FileType` of a `RawFd` based on the source
/// type from which it was extracted or the actual metadata.
///
/// The methods on this type only provide hints, due to `AsRawFd` and `FromRawFd` the inferred
/// type may be wrong.
enum FdMeta {
    /// Actual metadata that was obtained for the file descriptor.
    Metadata(Metadata),
    /// Inferred from the wrapper type: the descriptor is assumed to be a socket.
    Socket,
    /// Inferred from the wrapper type: the descriptor is assumed to be a pipe.
    Pipe,
    /// We don't have any metadata because the stat syscall failed
    NoneObtained,
}
91 
/// Tells the hint methods on `FdMeta` which side of the copy a descriptor is used on.
#[derive(PartialEq)]
enum FdHandle {
    /// The descriptor is the source of the copy.
    Input,
    /// The descriptor is the destination of the copy.
    Output,
}
97 
98 impl FdMeta {
99     fn maybe_fifo(&self) -> bool {
100         match self {
101             FdMeta::Metadata(meta) => meta.file_type().is_fifo(),
102             FdMeta::Socket => false,
103             FdMeta::Pipe => true,
104             FdMeta::NoneObtained => true,
105         }
106     }
107 
108     fn potential_sendfile_source(&self) -> bool {
109         match self {
110             // procfs erroneously shows 0 length on non-empty readable files.
111             // and if a file is truly empty then a `read` syscall will determine that and skip the write syscall
112             // thus there would be benefit from attempting sendfile
113             FdMeta::Metadata(meta)
114                 if meta.file_type().is_file() && meta.len() > 0
115                     || meta.file_type().is_block_device() =>
116             {
117                 true
118             }
119             _ => false,
120         }
121     }
122 
123     fn copy_file_range_candidate(&self, f: FdHandle) -> bool {
124         match self {
125             // copy_file_range will fail on empty procfs files. `read` can determine whether EOF has been reached
126             // without extra cost and skip the write, thus there is no benefit in attempting copy_file_range
127             FdMeta::Metadata(meta) if f == FdHandle::Input && meta.is_file() && meta.len() > 0 => {
128                 true
129             }
130             FdMeta::Metadata(meta) if f == FdHandle::Output && meta.is_file() => true,
131             _ => false,
132         }
133     }
134 }
135 
136 /// Returns true either if changes made to the source after a sendfile/splice call won't become
137 /// visible in the sink or the source has explicitly opted into such behavior (e.g. by splicing
138 /// a file into a pipe, the pipe being the source in this case).
139 ///
140 /// This will prevent File -> Pipe and File -> Socket splicing/sendfile optimizations to uphold
141 /// the Read/Write API semantics of io::copy.
142 ///
143 /// Note: This is not 100% airtight, the caller can use the RawFd conversion methods to turn a
144 /// regular file into a TcpSocket which will be treated as a socket here without checking.
145 fn safe_kernel_copy(source: &FdMeta, sink: &FdMeta) -> bool {
146     match (source, sink) {
147         // Data arriving from a socket is safe because the sender can't modify the socket buffer.
148         // Data arriving from a pipe is safe(-ish) because either the sender *copied*
149         // the bytes into the pipe OR explicitly performed an operation that enables zero-copy,
150         // thus promising not to modify the data later.
151         (FdMeta::Socket, _) => true,
152         (FdMeta::Pipe, _) => true,
153         (FdMeta::Metadata(meta), _)
154             if meta.file_type().is_fifo() || meta.file_type().is_socket() =>
155         {
156             true
157         }
158         // Data going into non-pipes/non-sockets is safe because the "later changes may become visible" issue
159         // only happens for pages sitting in send buffers or pipes.
160         (_, FdMeta::Metadata(meta))
161             if !meta.file_type().is_fifo() && !meta.file_type().is_socket() =>
162         {
163             true
164         }
165         _ => false,
166     }
167 }
168 
/// What a reader or writer reports about itself: the metadata hint for its file descriptor
/// and the raw descriptor itself, if one is available.
struct CopyParams(FdMeta, Option<RawFd>);
170 
/// Pairs the reader and the writer of one copy operation so that `SpecCopy` can be
/// implemented (and specialized) on the combination of both types.
struct Copier<'a, 'b, R: Read + ?Sized, W: Write + ?Sized> {
    /// Source of the copy.
    read: &'a mut R,
    /// Destination of the copy.
    write: &'b mut W,
}
175 
/// Specialization hook for the actual copy strategy.
trait SpecCopy {
    /// Performs the copy and returns the total number of bytes transferred.
    fn copy(self) -> Result<u64>;
}
179 
180 impl<R: Read + ?Sized, W: Write + ?Sized> SpecCopy for Copier<'_, '_, R, W> {
181     default fn copy(self) -> Result<u64> {
182         generic_copy(self.read, self.write)
183     }
184 }
185 
186 impl<R: CopyRead, W: CopyWrite> SpecCopy for Copier<'_, '_, R, W> {
187     fn copy(self) -> Result<u64> {
188         let (reader, writer) = (self.read, self.write);
189         let r_cfg = reader.properties();
190         let w_cfg = writer.properties();
191 
192         // before direct operations on file descriptors ensure that all source and sink buffers are empty
193         let mut flush = || -> crate::std::io::Result<u64> {
194             let bytes = reader.drain_to(writer, u64::MAX)?;
195             // BufWriter buffered bytes have already been accounted for in earlier write() calls
196             writer.flush()?;
197             Ok(bytes)
198         };
199 
200         let mut written = 0u64;
201 
202         if let (CopyParams(input_meta, Some(readfd)), CopyParams(output_meta, Some(writefd))) =
203             (r_cfg, w_cfg)
204         {
205             written += flush()?;
206             let max_write = reader.min_limit();
207 
208             if input_meta.copy_file_range_candidate(FdHandle::Input)
209                 && output_meta.copy_file_range_candidate(FdHandle::Output)
210             {
211                 let result = copy_regular_files(readfd, writefd, max_write);
212                 result.update_take(reader);
213 
214                 match result {
215                     CopyResult::Ended(bytes_copied) => return Ok(bytes_copied + written),
216                     CopyResult::Error(e, _) => return Err(e),
217                     CopyResult::Fallback(bytes) => written += bytes,
218                 }
219             }
220 
221             // on modern kernels sendfile can copy from any mmapable type (some but not all regular files and block devices)
222             // to any writable file descriptor. On older kernels the writer side can only be a socket.
223             // So we just try and fallback if needed.
224             // If current file offsets + write sizes overflow it may also fail, we do not try to fix that and instead
225             // fall back to the generic copy loop.
226             if input_meta.potential_sendfile_source() && safe_kernel_copy(&input_meta, &output_meta)
227             {
228                 let result = sendfile_splice(SpliceMode::Sendfile, readfd, writefd, max_write);
229                 result.update_take(reader);
230 
231                 match result {
232                     CopyResult::Ended(bytes_copied) => return Ok(bytes_copied + written),
233                     CopyResult::Error(e, _) => return Err(e),
234                     CopyResult::Fallback(bytes) => written += bytes,
235                 }
236             }
237 
238             if (input_meta.maybe_fifo() || output_meta.maybe_fifo())
239                 && safe_kernel_copy(&input_meta, &output_meta)
240             {
241                 let result = sendfile_splice(SpliceMode::Splice, readfd, writefd, max_write);
242                 result.update_take(reader);
243 
244                 match result {
245                     CopyResult::Ended(bytes_copied) => return Ok(bytes_copied + written),
246                     CopyResult::Error(e, _) => return Err(e),
247                     CopyResult::Fallback(0) => { /* use the fallback below */ }
248                     CopyResult::Fallback(_) => {
249                         unreachable!("splice should not return > 0 bytes on the fallback path")
250                     }
251                 }
252             }
253         }
254 
255         // fallback if none of the more specialized syscalls wants to work with these file descriptors
256         match generic_copy(reader, writer) {
257             Ok(bytes) => Ok(bytes + written),
258             err => err,
259         }
260     }
261 }
262 
/// Reader side of the copy specialization: implemented by types (and wrappers around them)
/// from which file descriptor information can be extracted via `properties`.
#[rustc_specialization_trait]
trait CopyRead: Read {
    /// Implementations that contain buffers (i.e. `BufReader`) must transfer data from their internal
    /// buffers into `writer` until either the buffers are emptied or `limit` bytes have been
    /// transferred, whichever occurs sooner.
    /// If nested buffers are present the outer buffers must be drained first.
    ///
    /// This is necessary to directly bypass the wrapper types while preserving the data order
    /// when operating directly on the underlying file descriptors.
    ///
    /// Returns the number of bytes moved into `writer`; the default implementation has no
    /// buffer and therefore reports 0.
    fn drain_to<W: Write>(&mut self, _writer: &mut W, _limit: u64) -> Result<u64> {
        Ok(0)
    }

    /// Updates `Take` wrappers to remove the number of bytes copied.
    /// The default implementation does nothing.
    fn taken(&mut self, _bytes: u64) {}

    /// The minimum of the limit of all `Take<_>` wrappers, `u64::MAX` otherwise.
    /// This method does not account for data buffered in `BufReader`s and would underreport
    /// the limit of a `Take<BufReader<Take<_>>>` type. Thus its result is only valid
    /// after draining the buffers via `drain_to`.
    fn min_limit(&self) -> u64 {
        u64::MAX
    }

    /// Extracts the file descriptor and hints/metadata, delegating through wrappers if necessary.
    fn properties(&self) -> CopyParams;
}
290 
/// Writer side of the copy specialization: implemented by types (and wrappers around them)
/// from which file descriptor information can be extracted via `properties`.
#[rustc_specialization_trait]
trait CopyWrite: Write {
    /// Extracts the file descriptor and hints/metadata, delegating through wrappers if necessary.
    fn properties(&self) -> CopyParams;
}
296 
297 impl<T> CopyRead for &mut T
298 where
299     T: CopyRead,
300 {
301     fn drain_to<W: Write>(&mut self, writer: &mut W, limit: u64) -> Result<u64> {
302         (**self).drain_to(writer, limit)
303     }
304 
305     fn taken(&mut self, bytes: u64) {
306         (**self).taken(bytes);
307     }
308 
309     fn min_limit(&self) -> u64 {
310         (**self).min_limit()
311     }
312 
313     fn properties(&self) -> CopyParams {
314         (**self).properties()
315     }
316 }
317 
318 impl<T> CopyWrite for &mut T
319 where
320     T: CopyWrite,
321 {
322     fn properties(&self) -> CopyParams {
323         (**self).properties()
324     }
325 }
326 
327 impl CopyRead for File {
328     fn properties(&self) -> CopyParams {
329         CopyParams(fd_to_meta(self), Some(self.as_raw_fd()))
330     }
331 }
332 
333 impl CopyRead for &File {
334     fn properties(&self) -> CopyParams {
335         CopyParams(fd_to_meta(*self), Some(self.as_raw_fd()))
336     }
337 }
338 
339 impl CopyWrite for File {
340     fn properties(&self) -> CopyParams {
341         CopyParams(fd_to_meta(self), Some(self.as_raw_fd()))
342     }
343 }
344 
345 impl CopyWrite for &File {
346     fn properties(&self) -> CopyParams {
347         CopyParams(fd_to_meta(*self), Some(self.as_raw_fd()))
348     }
349 }
350 
351 impl CopyRead for TcpStream {
352     fn properties(&self) -> CopyParams {
353         // avoid the stat syscall since we can be fairly sure it's a socket
354         CopyParams(FdMeta::Socket, Some(self.as_raw_fd()))
355     }
356 }
357 
358 impl CopyRead for &TcpStream {
359     fn properties(&self) -> CopyParams {
360         // avoid the stat syscall since we can be fairly sure it's a socket
361         CopyParams(FdMeta::Socket, Some(self.as_raw_fd()))
362     }
363 }
364 
365 impl CopyWrite for TcpStream {
366     fn properties(&self) -> CopyParams {
367         // avoid the stat syscall since we can be fairly sure it's a socket
368         CopyParams(FdMeta::Socket, Some(self.as_raw_fd()))
369     }
370 }
371 
372 impl CopyWrite for &TcpStream {
373     fn properties(&self) -> CopyParams {
374         // avoid the stat syscall since we can be fairly sure it's a socket
375         CopyParams(FdMeta::Socket, Some(self.as_raw_fd()))
376     }
377 }
378 
379 impl CopyRead for UnixStream {
380     fn properties(&self) -> CopyParams {
381         // avoid the stat syscall since we can be fairly sure it's a socket
382         CopyParams(FdMeta::Socket, Some(self.as_raw_fd()))
383     }
384 }
385 
386 impl CopyRead for &UnixStream {
387     fn properties(&self) -> CopyParams {
388         // avoid the stat syscall since we can be fairly sure it's a socket
389         CopyParams(FdMeta::Socket, Some(self.as_raw_fd()))
390     }
391 }
392 
393 impl CopyWrite for UnixStream {
394     fn properties(&self) -> CopyParams {
395         // avoid the stat syscall since we can be fairly sure it's a socket
396         CopyParams(FdMeta::Socket, Some(self.as_raw_fd()))
397     }
398 }
399 
400 impl CopyWrite for &UnixStream {
401     fn properties(&self) -> CopyParams {
402         // avoid the stat syscall since we can be fairly sure it's a socket
403         CopyParams(FdMeta::Socket, Some(self.as_raw_fd()))
404     }
405 }
406 
407 impl CopyWrite for ChildStdin {
408     fn properties(&self) -> CopyParams {
409         CopyParams(FdMeta::Pipe, Some(self.as_raw_fd()))
410     }
411 }
412 
413 impl CopyRead for ChildStdout {
414     fn properties(&self) -> CopyParams {
415         CopyParams(FdMeta::Pipe, Some(self.as_raw_fd()))
416     }
417 }
418 
419 impl CopyRead for ChildStderr {
420     fn properties(&self) -> CopyParams {
421         CopyParams(FdMeta::Pipe, Some(self.as_raw_fd()))
422     }
423 }
424 
425 impl CopyRead for StdinLock<'_> {
426     fn drain_to<W: Write>(&mut self, writer: &mut W, outer_limit: u64) -> Result<u64> {
427         let buf_reader = self.as_mut_buf();
428         let buf = buf_reader.buffer();
429         let buf = &buf[0..min(buf.len(), outer_limit.try_into().unwrap_or(usize::MAX))];
430         let bytes_drained = buf.len();
431         writer.write_all(buf)?;
432         buf_reader.consume(bytes_drained);
433 
434         Ok(bytes_drained as u64)
435     }
436 
437     fn properties(&self) -> CopyParams {
438         CopyParams(fd_to_meta(self), Some(self.as_raw_fd()))
439     }
440 }
441 
442 impl CopyWrite for StdoutLock<'_> {
443     fn properties(&self) -> CopyParams {
444         CopyParams(fd_to_meta(self), Some(self.as_raw_fd()))
445     }
446 }
447 
448 impl CopyWrite for StderrLock<'_> {
449     fn properties(&self) -> CopyParams {
450         CopyParams(fd_to_meta(self), Some(self.as_raw_fd()))
451     }
452 }
453 
454 impl<T: CopyRead> CopyRead for Take<T> {
455     fn drain_to<W: Write>(&mut self, writer: &mut W, outer_limit: u64) -> Result<u64> {
456         let local_limit = self.limit();
457         let combined_limit = min(outer_limit, local_limit);
458         let bytes_drained = self.get_mut().drain_to(writer, combined_limit)?;
459         // update limit since read() was bypassed
460         self.set_limit(local_limit - bytes_drained);
461 
462         Ok(bytes_drained)
463     }
464 
465     fn taken(&mut self, bytes: u64) {
466         self.set_limit(self.limit() - bytes);
467         self.get_mut().taken(bytes);
468     }
469 
470     fn min_limit(&self) -> u64 {
471         min(Take::limit(self), self.get_ref().min_limit())
472     }
473 
474     fn properties(&self) -> CopyParams {
475         self.get_ref().properties()
476     }
477 }
478 
479 impl<T: ?Sized + CopyRead> CopyRead for BufReader<T> {
480     fn drain_to<W: Write>(&mut self, writer: &mut W, outer_limit: u64) -> Result<u64> {
481         let buf = self.buffer();
482         let buf = &buf[0..min(buf.len(), outer_limit.try_into().unwrap_or(usize::MAX))];
483         let bytes = buf.len();
484         writer.write_all(buf)?;
485         self.consume(bytes);
486 
487         let remaining = outer_limit - bytes as u64;
488 
489         // in case of nested bufreaders we also need to drain the ones closer to the source
490         let inner_bytes = self.get_mut().drain_to(writer, remaining)?;
491 
492         Ok(bytes as u64 + inner_bytes)
493     }
494 
495     fn taken(&mut self, bytes: u64) {
496         self.get_mut().taken(bytes);
497     }
498 
499     fn min_limit(&self) -> u64 {
500         self.get_ref().min_limit()
501     }
502 
503     fn properties(&self) -> CopyParams {
504         self.get_ref().properties()
505     }
506 }
507 
508 impl<T: ?Sized + CopyWrite> CopyWrite for BufWriter<T> {
509     fn properties(&self) -> CopyParams {
510         self.get_ref().properties()
511     }
512 }
513 
514 fn fd_to_meta<T: AsRawFd>(fd: &T) -> FdMeta {
515     let fd = fd.as_raw_fd();
516     let file: ManuallyDrop<File> = ManuallyDrop::new(unsafe { File::from_raw_fd(fd) });
517     match file.metadata() {
518         Ok(meta) => FdMeta::Metadata(meta),
519         Err(_) => FdMeta::NoneObtained,
520     }
521 }
522 
/// Outcome of one kernel-assisted copy attempt. Every variant carries the number of bytes
/// that the attempt transferred.
pub(super) enum CopyResult {
    /// The copy ran to completion (EOF or the requested length was reached).
    Ended(u64),
    /// The copy failed with an error that should be reported to the caller.
    Error(Error, u64),
    /// The syscall cannot be used (any further) for these descriptors; the caller should
    /// continue with another strategy.
    Fallback(u64),
}
528 
529 impl CopyResult {
530     fn update_take(&self, reader: &mut impl CopyRead) {
531         match *self {
532             CopyResult::Fallback(bytes)
533             | CopyResult::Ended(bytes)
534             | CopyResult::Error(_, bytes) => reader.taken(bytes),
535         }
536     }
537 }
538 
/// Invalid file descriptor.
///
/// Valid file descriptors are guaranteed to be non-negative numbers (see `open()` manpage)
/// while negative values are used to indicate errors.
/// Thus -1 will never overlap with a valid open file.
const INVALID_FD: RawFd = -1;
545 
546 /// Linux-specific implementation that will attempt to use copy_file_range for copy offloading.
547 /// As the name says, it only works on regular files.
548 ///
549 /// Callers must handle fallback to a generic copy loop.
550 /// `Fallback` may indicate non-zero number of bytes already written
551 /// if one of the files' cursor +`max_len` would exceed u64::MAX (`EOVERFLOW`).
552 pub(super) fn copy_regular_files(reader: RawFd, writer: RawFd, max_len: u64) -> CopyResult {
553     use crate::std::cmp;
554 
555     const NOT_PROBED: u8 = 0;
556     const UNAVAILABLE: u8 = 1;
557     const AVAILABLE: u8 = 2;
558 
559     // Kernel prior to 4.5 don't have copy_file_range
560     // We store the availability in a global to avoid unnecessary syscalls
561     static HAS_COPY_FILE_RANGE: AtomicU8 = AtomicU8::new(NOT_PROBED);
562 
563     syscall! {
564         fn copy_file_range(
565             fd_in: dlibc::c_int,
566             off_in: *mut dlibc::loff_t,
567             fd_out: dlibc::c_int,
568             off_out: *mut dlibc::loff_t,
569             len: dlibc::size_t,
570             flags: dlibc::c_uint
571         ) -> dlibc::ssize_t
572     }
573 
574     match HAS_COPY_FILE_RANGE.load(Ordering::Relaxed) {
575         NOT_PROBED => {
576             // EPERM can indicate seccomp filters or an immutable file.
577             // To distinguish these cases we probe with invalid file descriptors which should result in EBADF if the syscall is supported
578             // and some other error (ENOSYS or EPERM) if it's not available
579             let result = unsafe {
580                 cvt(copy_file_range(
581                     INVALID_FD,
582                     ptr::null_mut(),
583                     INVALID_FD,
584                     ptr::null_mut(),
585                     1,
586                     0,
587                 ))
588             };
589 
590             if matches!(result.map_err(|e| e.raw_os_error()), Err(Some(EBADF))) {
591                 HAS_COPY_FILE_RANGE.store(AVAILABLE, Ordering::Relaxed);
592             } else {
593                 HAS_COPY_FILE_RANGE.store(UNAVAILABLE, Ordering::Relaxed);
594                 return CopyResult::Fallback(0);
595             }
596         }
597         UNAVAILABLE => return CopyResult::Fallback(0),
598         _ => {}
599     };
600 
601     let mut written = 0u64;
602     while written < max_len {
603         let bytes_to_copy = cmp::min(max_len - written, usize::MAX as u64);
604         // cap to 1GB chunks in case u64::MAX is passed as max_len and the file has a non-zero seek position
605         // this allows us to copy large chunks without hitting EOVERFLOW,
606         // unless someone sets a file offset close to u64::MAX - 1GB, in which case a fallback would be required
607         let bytes_to_copy = cmp::min(bytes_to_copy as usize, 0x4000_0000usize);
608         let copy_result = unsafe {
609             // We actually don't have to adjust the offsets,
610             // because copy_file_range adjusts the file offset automatically
611             cvt(copy_file_range(
612                 reader,
613                 ptr::null_mut(),
614                 writer,
615                 ptr::null_mut(),
616                 bytes_to_copy,
617                 0,
618             ))
619         };
620 
621         match copy_result {
622             Ok(0) if written == 0 => {
623                 // fallback to work around several kernel bugs where copy_file_range will fail to
624                 // copy any bytes and return 0 instead of an error if
625                 // - reading virtual files from the proc filesystem which appear to have 0 size
626                 //   but are not empty. noted in coreutils to affect kernels at least up to 5.6.19.
627                 // - copying from an overlay filesystem in docker. reported to occur on fedora 32.
628                 return CopyResult::Fallback(0);
629             }
630             Ok(0) => return CopyResult::Ended(written), // reached EOF
631             Ok(ret) => written += ret as u64,
632             Err(err) => {
633                 return match err.raw_os_error() {
634                     // when file offset + max_length > u64::MAX
635                     Some(EOVERFLOW) => CopyResult::Fallback(written),
636                     Some(ENOSYS | EXDEV | EINVAL | EPERM | EOPNOTSUPP | EBADF) if written == 0 => {
637                         // Try fallback io::copy if either:
638                         // - Kernel version is < 4.5 (ENOSYS¹)
639                         // - Files are mounted on different fs (EXDEV)
640                         // - copy_file_range is broken in various ways on RHEL/CentOS 7 (EOPNOTSUPP)
641                         // - copy_file_range file is immutable or syscall is blocked by seccomp¹ (EPERM)
642                         // - copy_file_range cannot be used with pipes or device nodes (EINVAL)
643                         // - the writer fd was opened with O_APPEND (EBADF²)
644                         // and no bytes were written successfully yet. (All these errnos should
645                         // not be returned if something was already written, but they happen in
646                         // the wild, see #91152.)
647                         //
648                         // ¹ these cases should be detected by the initial probe but we handle them here
649                         //   anyway in case syscall interception changes during runtime
650                         // ² actually invalid file descriptors would cause this too, but in that case
651                         //   the fallback code path is expected to encounter the same error again
652                         CopyResult::Fallback(0)
653                     }
654                     _ => CopyResult::Error(err, written),
655                 };
656             }
657         }
658     }
659     CopyResult::Ended(written)
660 }
661 
/// Selects which syscall `sendfile_splice` issues.
#[derive(PartialEq)]
enum SpliceMode {
    /// Use `sendfile`.
    Sendfile,
    /// Use `splice`.
    Splice,
}
667 
668 /// performs splice or sendfile between file descriptors
669 /// Does _not_ fall back to a generic copy loop.
670 fn sendfile_splice(mode: SpliceMode, reader: RawFd, writer: RawFd, len: u64) -> CopyResult {
671     static HAS_SENDFILE: AtomicBool = AtomicBool::new(true);
672     static HAS_SPLICE: AtomicBool = AtomicBool::new(true);
673 
674     // Android builds use feature level 14, but the libc wrapper for splice is
675     // gated on feature level 21+, so we have to invoke the syscall directly.
676     #[cfg(target_os = "android")]
677     syscall! {
678         fn splice(
679             srcfd: dlibc::c_int,
680             src_offset: *const i64,
681             dstfd: dlibc::c_int,
682             dst_offset: *const i64,
683             len: dlibc::size_t,
684             flags: dlibc::c_int
685         ) -> dlibc::ssize_t
686     }
687 
688     #[cfg(target_os = "linux")]
689     use dlibc::splice;
690 
691     #[cfg(target_os = "dragonos")]
692     use dlibc::splice;
693 
694     match mode {
695         SpliceMode::Sendfile if !HAS_SENDFILE.load(Ordering::Relaxed) => {
696             return CopyResult::Fallback(0);
697         }
698         SpliceMode::Splice if !HAS_SPLICE.load(Ordering::Relaxed) => {
699             return CopyResult::Fallback(0);
700         }
701         _ => (),
702     }
703 
704     let mut written = 0u64;
705     while written < len {
706         // according to its manpage that's the maximum size sendfile() will copy per invocation
707         let chunk_size = crate::std::cmp::min(len - written, 0x7ffff000_u64) as usize;
708 
709         let result = match mode {
710             SpliceMode::Sendfile => {
711                 cvt(unsafe { sendfile64(writer, reader, ptr::null_mut(), chunk_size) })
712             }
713             SpliceMode::Splice => cvt(unsafe {
714                 splice(
715                     reader,
716                     ptr::null_mut(),
717                     writer,
718                     ptr::null_mut(),
719                     chunk_size,
720                     0,
721                 )
722             }),
723         };
724 
725         match result {
726             Ok(0) => break, // EOF
727             Ok(ret) => written += ret as u64,
728             Err(err) => {
729                 return match err.raw_os_error() {
730                     Some(ENOSYS | EPERM) => {
731                         // syscall not supported (ENOSYS)
732                         // syscall is disallowed, e.g. by seccomp (EPERM)
733                         match mode {
734                             SpliceMode::Sendfile => HAS_SENDFILE.store(false, Ordering::Relaxed),
735                             SpliceMode::Splice => HAS_SPLICE.store(false, Ordering::Relaxed),
736                         }
737                         assert_eq!(written, 0);
738                         CopyResult::Fallback(0)
739                     }
740                     Some(EINVAL) => {
741                         // splice/sendfile do not support this particular file descriptor (EINVAL)
742                         assert_eq!(written, 0);
743                         CopyResult::Fallback(0)
744                     }
745                     Some(os_err) if mode == SpliceMode::Sendfile && os_err == EOVERFLOW => {
746                         CopyResult::Fallback(written)
747                     }
748                     _ => CopyResult::Error(err, written),
749                 };
750             }
751         }
752     }
753     CopyResult::Ended(written)
754 }
755