xref: /drstd/src/std/sys/windows/thread_local_key.rs (revision 86982c5e9b2eaa583327251616ee822c36288824)
1 use crate::std::cell::UnsafeCell;
2 use crate::std::ptr;
3 use crate::std::sync::atomic::{
4     AtomicBool, AtomicPtr, AtomicU32,
5     Ordering::{AcqRel, Acquire, Relaxed, Release},
6 };
7 use crate::std::sys::c;
8 
9 #[cfg(test)]
10 mod tests;
11 
/// An optimization hint. The compiler is often smart enough to know if an atomic
/// is never set and can remove dead code based on that fact.
///
/// The flag is set to `true` by `register_keyless_dtor` and `register_dtor`,
/// and is read by `on_tls_callback`, which returns early while it is still
/// `false`.
static HAS_DTORS: AtomicBool = AtomicBool::new(false);
15 
// Using a per-thread list avoids the problems in synchronizing global state.
//
// Each entry pairs a data pointer with the destructor that will be called on
// it. `register_keyless_dtor` pushes entries; `run_keyless_dtors` pops and
// runs them, so the most recently registered entry runs first.
#[thread_local]
#[cfg(target_thread_local)]
static mut DESTRUCTORS: Vec<(*mut u8, unsafe extern "C" fn(*mut u8))> = Vec::new();
20 
/// Registers `dtor` to be invoked with `t` when the current thread's keyless
/// destructors are run.
///
/// The `(t, dtor)` pair is appended to the thread-local `DESTRUCTORS` list and
/// `HAS_DTORS` is set so that `on_tls_callback` stops returning early.
///
/// # Safety
///
/// NOTE(review): presumably `t` must stay valid to hand to `dtor` until the
/// destructors run at thread exit — confirm against the callers.
// Ensure this can never be inlined because otherwise this may break in dylibs.
// See #44391.
#[inline(never)]
#[cfg(target_thread_local)]
pub unsafe fn register_keyless_dtor(t: *mut u8, dtor: unsafe extern "C" fn(*mut u8)) {
    // Record the pair first, then raise the flag that makes the TLS callback
    // look at the list.
    DESTRUCTORS.push((t, dtor));
    HAS_DTORS.store(true, Relaxed);
}
29 
30 #[inline(never)] // See comment above
31 #[cfg(target_thread_local)]
32 /// Runs destructors. This should not be called until thread exit.
33 unsafe fn run_keyless_dtors() {
34     // Drop all the destructors.
35     //
36     // Note: While this is potentially an infinite loop, it *should* be
37     // the case that this loop always terminates because we provide the
38     // guarantee that a TLS key cannot be set after it is flagged for
39     // destruction.
40     while let Some((ptr, dtor)) = DESTRUCTORS.pop() {
41         (dtor)(ptr);
42     }
43     // We're done so free the memory.
44     DESTRUCTORS = Vec::new();
45 }
46 
/// A raw TLS index: the value `c::TlsAlloc` returns and that
/// `c::TlsGetValue`/`c::TlsSetValue` take.
type Key = c::DWORD;
/// Signature of a TLS destructor; `run_dtors` calls it with the non-null value
/// it found in the key's slot.
type Dtor = unsafe extern "C" fn(*mut u8);
49 
// Turns out, like pretty much everything, Windows is pretty close to the
// functionality that Unix provides, but slightly different! In the case of
52 // TLS, Windows does not provide an API to provide a destructor for a TLS
53 // variable. This ends up being pretty crucial to this implementation, so we
54 // need a way around this.
55 //
56 // The solution here ended up being a little obscure, but fear not, the
57 // internet has informed me [1][2] that this solution is not unique (no way
58 // I could have thought of it as well!). The key idea is to insert some hook
59 // somewhere to run arbitrary code on thread termination. With this in place
60 // we'll be able to run anything we like, including all TLS destructors!
61 //
// To accomplish this feat, we perform a number of tricks, all contained
63 // within this module:
64 //
65 // * All TLS destructors are tracked by *us*, not the Windows runtime. This
66 //   means that we have a global list of destructors for each TLS key that
67 //   we know about.
68 // * When a thread exits, we run over the entire list and run dtors for all
69 //   non-null keys. This attempts to match Unix semantics in this regard.
70 //
71 // For more details and nitty-gritty, see the code sections below!
72 //
73 // [1]: https://www.codeproject.com/Articles/8113/Thread-Local-Storage-The-C-Way
74 // [2]: https://github.com/ChromiumWebApps/chromium/blob/master/base/threading/thread_local_storage_win.cc#L42
75 
/// A lazily allocated Windows TLS slot with an optional destructor.
///
/// The TLS index is allocated on first use (see `key` and `init`). Keys that
/// carry a destructor are additionally linked into the global `DTORS` list so
/// that `run_dtors` can find them.
pub struct StaticKey {
    /// The key value shifted up by one. Since TLS_OUT_OF_INDEXES == DWORD::MAX
    /// is not a valid key value, this allows us to use zero as sentinel value
    /// without risking overflow.
    key: AtomicU32,
    /// Destructor that `run_dtors` calls on a non-null slot value, if any.
    dtor: Option<Dtor>,
    /// Next key in the `DTORS` linked list. Starts out null and is written by
    /// `register_dtor` when this key is pushed onto the list.
    next: AtomicPtr<StaticKey>,
    /// Currently, destructors cannot be unregistered, so we cannot use racy
    /// initialization for keys. Instead, we need to synchronize initialization.
    /// Use the Windows-provided `Once` since it does not require TLS.
    once: UnsafeCell<c::INIT_ONCE>,
}
88 
89 impl StaticKey {
90     #[inline]
91     pub const fn new(dtor: Option<Dtor>) -> StaticKey {
92         StaticKey {
93             key: AtomicU32::new(0),
94             dtor,
95             next: AtomicPtr::new(ptr::null_mut()),
96             once: UnsafeCell::new(c::INIT_ONCE_STATIC_INIT),
97         }
98     }
99 
100     #[inline]
101     pub unsafe fn set(&'static self, val: *mut u8) {
102         let r = c::TlsSetValue(self.key(), val.cast());
103         debug_assert_eq!(r, c::TRUE);
104     }
105 
106     #[inline]
107     pub unsafe fn get(&'static self) -> *mut u8 {
108         c::TlsGetValue(self.key()).cast()
109     }
110 
111     #[inline]
112     unsafe fn key(&'static self) -> Key {
113         match self.key.load(Acquire) {
114             0 => self.init(),
115             key => key - 1,
116         }
117     }
118 
119     #[cold]
120     unsafe fn init(&'static self) -> Key {
121         if self.dtor.is_some() {
122             let mut pending = c::FALSE;
123             let r = c::InitOnceBeginInitialize(self.once.get(), 0, &mut pending, ptr::null_mut());
124             assert_eq!(r, c::TRUE);
125 
126             if pending == c::FALSE {
127                 // Some other thread initialized the key, load it.
128                 self.key.load(Relaxed) - 1
129             } else {
130                 let key = c::TlsAlloc();
131                 if key == c::TLS_OUT_OF_INDEXES {
132                     // Wakeup the waiting threads before panicking to avoid deadlock.
133                     c::InitOnceComplete(self.once.get(), c::INIT_ONCE_INIT_FAILED, ptr::null_mut());
134                     panic!("out of TLS indexes");
135                 }
136 
137                 self.key.store(key + 1, Release);
138                 register_dtor(self);
139 
140                 let r = c::InitOnceComplete(self.once.get(), 0, ptr::null_mut());
141                 debug_assert_eq!(r, c::TRUE);
142 
143                 key
144             }
145         } else {
146             // If there is no destructor to clean up, we can use racy initialization.
147 
148             let key = c::TlsAlloc();
149             assert_ne!(key, c::TLS_OUT_OF_INDEXES, "out of TLS indexes");
150 
151             match self.key.compare_exchange(0, key + 1, AcqRel, Acquire) {
152                 Ok(_) => key,
153                 Err(new) => {
154                     // Some other thread completed initialization first, so destroy
155                     // our key and use theirs.
156                     let r = c::TlsFree(key);
157                     debug_assert_eq!(r, c::TRUE);
158                     new - 1
159                 }
160             }
161         }
162     }
163 }
164 
// NOTE(review): these impls appear to rely on the fact that `key` and `next`
// are atomics, `dtor` is only ever read after construction, and the `once`
// cell is only handed to the `InitOnce*` APIs — confirm before changing the
// struct's fields.
unsafe impl Send for StaticKey {}
unsafe impl Sync for StaticKey {}
167 
168 // -------------------------------------------------------------------------
169 // Dtor registration
170 //
171 // Windows has no native support for running destructors so we manage our own
172 // list of destructors to keep track of how to destroy keys. We then install a
173 // callback later to get invoked whenever a thread exits, running all
174 // appropriate destructors.
175 //
176 // Currently unregistration from this list is not supported. A destructor can be
177 // registered but cannot be unregistered. There's various simplifying reasons
178 // for doing this, the big ones being:
179 //
180 // 1. Currently we don't even support deallocating TLS keys, so normal operation
181 //    doesn't need to deallocate a destructor.
182 // 2. There is no point in time where we know we can unregister a destructor
183 //    because it could always be getting run by some remote thread.
184 //
185 // Typically processes have a statically known set of TLS keys which is pretty
186 // small, and we'd want to keep this memory alive for the whole process anyway
187 // really.
188 
/// Head of the global singly linked list of keys that have a destructor.
/// `register_dtor` pushes new keys onto the front and `run_dtors` walks the
/// list through each key's `next` pointer. Null while the list is empty.
static DTORS: AtomicPtr<StaticKey> = AtomicPtr::new(ptr::null_mut());
190 
191 /// Should only be called once per key, otherwise loops or breaks may occur in
192 /// the linked list.
193 unsafe fn register_dtor(key: &'static StaticKey) {
194     // Ensure this is never run when native thread locals are available.
195     assert_eq!(false, cfg!(target_thread_local));
196     let this = <*const StaticKey>::cast_mut(key);
197     // Use acquire ordering to pass along the changes done by the previously
198     // registered keys when we store the new head with release ordering.
199     let mut head = DTORS.load(Acquire);
200     loop {
201         key.next.store(head, Relaxed);
202         match DTORS.compare_exchange_weak(head, this, Release, Acquire) {
203             Ok(_) => break,
204             Err(new) => head = new,
205         }
206     }
207     HAS_DTORS.store(true, Release);
208 }
209 
210 // -------------------------------------------------------------------------
211 // Where the Magic (TM) Happens
212 //
213 // If you're looking at this code, and wondering "what is this doing?",
214 // you're not alone! I'll try to break this down step by step:
215 //
216 // # What's up with CRT$XLB?
217 //
218 // For anything about TLS destructors to work on Windows, we have to be able
219 // to run *something* when a thread exits. To do so, we place a very special
220 // static in a very special location. If this is encoded in just the right
221 // way, the kernel's loader is apparently nice enough to run some function
222 // of ours whenever a thread exits! How nice of the kernel!
223 //
224 // Lots of detailed information can be found in source [1] above, but the
225 // gist of it is that this is leveraging a feature of Microsoft's PE format
226 // (executable format) which is not actually used by any compilers today.
227 // This apparently translates to any callbacks in the ".CRT$XLB" section
228 // being run on certain events.
229 //
230 // So after all that, we use the compiler's #[link_section] feature to place
231 // a callback pointer into the magic section so it ends up being called.
232 //
233 // # What's up with this callback?
234 //
235 // The callback specified receives a number of parameters from... someone!
236 // (the kernel? the runtime? I'm not quite sure!) There are a few events that
// this gets invoked for, but we're currently only interested in when a
238 // thread or a process "detaches" (exits). The process part happens for the
239 // last thread and the thread part happens for any normal thread.
240 //
241 // # Ok, what's up with running all these destructors?
242 //
243 // This will likely need to be improved over time, but this function
244 // attempts a "poor man's" destructor callback system. Once we've got a list
245 // of what to run, we iterate over all keys, check their values, and then run
246 // destructors if the values turn out to be non null (setting them to null just
247 // beforehand). We do this a few times in a loop to basically match Unix
// semantics. If we don't reach a fixed point after a few passes then we give
// up, which most likely means that something gets leaked.
250 //
251 // # The article mentions weird stuff about "/INCLUDE"?
252 //
253 // It sure does! Specifically we're talking about this quote:
254 //
255 //      The Microsoft run-time library facilitates this process by defining a
256 //      memory image of the TLS Directory and giving it the special name
257 //      “__tls_used” (Intel x86 platforms) or “_tls_used” (other platforms). The
258 //      linker looks for this memory image and uses the data there to create the
259 //      TLS Directory. Other compilers that support TLS and work with the
260 //      Microsoft linker must use this same technique.
261 //
262 // Basically what this means is that if we want support for our TLS
263 // destructors/our hook being called then we need to make sure the linker does
264 // not omit this symbol. Otherwise it will omit it and our callback won't be
265 // wired up.
266 //
267 // We don't actually use the `/INCLUDE` linker flag here like the article
268 // mentions because the Rust compiler doesn't propagate linker flags, but
269 // instead we use a shim function which performs a volatile 1-byte load from
270 // the address of the symbol to ensure it sticks around.
271 
/// Pointer to `on_tls_callback`, placed in the `.CRT$XLB` section so that the
/// callback gets invoked on thread/process events (see the discussion in the
/// comment block preceding this item).
#[link_section = ".CRT$XLB"]
#[allow(dead_code, unused_variables)]
#[used] // we don't want LLVM eliminating this symbol for any reason, and
        // when the symbol makes it to the linker the linker will take over
pub static p_thread_callback: unsafe extern "system" fn(c::LPVOID, c::DWORD, c::LPVOID) =
    on_tls_callback;
278 
279 #[allow(dead_code, unused_variables)]
280 unsafe extern "system" fn on_tls_callback(h: c::LPVOID, dwReason: c::DWORD, pv: c::LPVOID) {
281     if !HAS_DTORS.load(Acquire) {
282         return;
283     }
284     if dwReason == c::DLL_THREAD_DETACH || dwReason == c::DLL_PROCESS_DETACH {
285         #[cfg(not(target_thread_local))]
286         run_dtors();
287         #[cfg(target_thread_local)]
288         run_keyless_dtors();
289     }
290 
291     // See comments above for what this is doing. Note that we don't need this
292     // trickery on GNU windows, just on MSVC.
293     reference_tls_used();
294     #[cfg(target_env = "msvc")]
295     unsafe fn reference_tls_used() {
296         extern "C" {
297             static _tls_used: u8;
298         }
299         crate::std::intrinsics::volatile_load(&_tls_used);
300     }
301     #[cfg(not(target_env = "msvc"))]
302     unsafe fn reference_tls_used() {}
303 }
304 
305 #[allow(dead_code)] // actually called below
306 unsafe fn run_dtors() {
307     for _ in 0..5 {
308         let mut any_run = false;
309 
310         // Use acquire ordering to observe key initialization.
311         let mut cur = DTORS.load(Acquire);
312         while !cur.is_null() {
313             let key = (*cur).key.load(Relaxed) - 1;
314             let dtor = (*cur).dtor.unwrap();
315 
316             let ptr = c::TlsGetValue(key);
317             if !ptr.is_null() {
318                 c::TlsSetValue(key, ptr::null_mut());
319                 dtor(ptr as *mut _);
320                 any_run = true;
321             }
322 
323             cur = (*cur).next.load(Relaxed);
324         }
325 
326         if !any_run {
327             break;
328         }
329     }
330 }
331