LLVM OpenMP* Runtime Library
kmp_runtime.cpp
1/*
2 * kmp_runtime.cpp -- KPTS runtime support library
3 */
4
5//===----------------------------------------------------------------------===//
6//
7// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8// See https://llvm.org/LICENSE.txt for license information.
9// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10//
11//===----------------------------------------------------------------------===//
12
13#include "kmp.h"
14#include "kmp_affinity.h"
15#include "kmp_atomic.h"
16#include "kmp_environment.h"
17#include "kmp_error.h"
18#include "kmp_i18n.h"
19#include "kmp_io.h"
20#include "kmp_itt.h"
21#include "kmp_settings.h"
22#include "kmp_stats.h"
23#include "kmp_str.h"
24#include "kmp_wait_release.h"
25#include "kmp_wrapper_getpid.h"
26#include "kmp_dispatch.h"
27#if KMP_USE_HIER_SCHED
28#include "kmp_dispatch_hier.h"
29#endif
30
31#if OMPT_SUPPORT
32#include "ompt-specific.h"
33#endif
34#if OMPD_SUPPORT
35#include "ompd-specific.h"
36#endif
37
38#if OMP_PROFILING_SUPPORT
39#include "llvm/Support/TimeProfiler.h"
40static char *ProfileTraceFile = nullptr;
41#endif
42
43/* these are temporary issues to be dealt with */
44#define KMP_USE_PRCTL 0
45
46#if KMP_OS_WINDOWS
47#include <process.h>
48#endif
49
50#if KMP_OS_WINDOWS
51// Windows* does not need these include files because it does not use shared memory
52#else
53#include <sys/mman.h>
54#include <sys/stat.h>
55#include <fcntl.h>
56#define SHM_SIZE 1024
57#endif
58
59#if defined(KMP_GOMP_COMPAT)
60char const __kmp_version_alt_comp[] =
61 KMP_VERSION_PREFIX "alternative compiler support: yes";
62#endif /* defined(KMP_GOMP_COMPAT) */
63
64char const __kmp_version_omp_api[] =
65 KMP_VERSION_PREFIX "API version: 5.0 (201611)";
66
67#ifdef KMP_DEBUG
68char const __kmp_version_lock[] =
69 KMP_VERSION_PREFIX "lock type: run time selectable";
70#endif /* KMP_DEBUG */
71
72#define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
73
74/* ------------------------------------------------------------------------ */
75
76#if KMP_USE_MONITOR
77kmp_info_t __kmp_monitor;
78#endif
79
80/* Forward declarations */
81
82void __kmp_cleanup(void);
83
84static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
85 int gtid);
86static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
87 kmp_internal_control_t *new_icvs,
88 ident_t *loc);
89#if KMP_AFFINITY_SUPPORTED
90static void __kmp_partition_places(kmp_team_t *team,
91 int update_master_only = 0);
92#endif
93static void __kmp_do_serial_initialize(void);
94void __kmp_fork_barrier(int gtid, int tid);
95void __kmp_join_barrier(int gtid);
96void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
97 kmp_internal_control_t *new_icvs, ident_t *loc);
98
99#ifdef USE_LOAD_BALANCE
100static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
101#endif
102
103static int __kmp_expand_threads(int nNeed);
104#if KMP_OS_WINDOWS
105static int __kmp_unregister_root_other_thread(int gtid);
106#endif
107static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
108kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
109
110/* Calculate the identifier of the current thread */
111/* fast (and somewhat portable) way to get a unique identifier for the
112 executing thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
113int __kmp_get_global_thread_id() {
114 int i;
115 kmp_info_t **other_threads;
116 size_t stack_data;
117 char *stack_addr;
118 size_t stack_size;
119 char *stack_base;
120
121 KA_TRACE(
122 1000,
123 ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
124 __kmp_nth, __kmp_all_nth));
125
126 /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
127 a parallel region, we made this return KMP_GTID_DNE to force the caller to
128 run serial_initialize. Every call site then has to handle KMP_GTID_DNE, or
129 else guarantee __kmp_init_gtid, for this to work. */
130
131 if (!TCR_4(__kmp_init_gtid))
132 return KMP_GTID_DNE;
133
134#ifdef KMP_TDATA_GTID
135 if (TCR_4(__kmp_gtid_mode) >= 3) {
136 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
137 return __kmp_gtid;
138 }
139#endif
140 if (TCR_4(__kmp_gtid_mode) >= 2) {
141 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
142 return __kmp_gtid_get_specific();
143 }
144 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
145
146 stack_addr = (char *)&stack_data;
147 other_threads = __kmp_threads;
148
149 /* ATT: The code below is a source of potential bugs due to unsynchronized
150 access to __kmp_threads array. For example:
151 1. Current thread loads other_threads[i] to thr and checks it, it is
152 non-NULL.
153 2. Current thread is suspended by OS.
154 3. Another thread unregisters and finishes (debug versions of free()
155 may fill memory with something like 0xEF).
156 4. Current thread is resumed.
157 5. Current thread reads junk from *thr.
158 TODO: Fix it. --ln */
159
160 for (i = 0; i < __kmp_threads_capacity; i++) {
161
162 kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
163 if (!thr)
164 continue;
165
166 stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
167 stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
168
169 /* stack grows down -- search through all of the active threads */
170
171 if (stack_addr <= stack_base) {
172 size_t stack_diff = stack_base - stack_addr;
173
174 if (stack_diff <= stack_size) {
175 /* The only way we can be closer than the allocated */
176 /* stack size is if we are running on this thread. */
177 KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
178 return i;
179 }
180 }
181 }
182
183 /* fall back to the thread-specific (TLS) value to try to determine our gtid */
184 KA_TRACE(1000,
185 ("*** __kmp_get_global_thread_id: internal alg. failed to find "
186 "thread, using TLS\n"));
187 i = __kmp_gtid_get_specific();
188
189 /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */
190
191 /* if we haven't been assigned a gtid, then return that code */
192 if (i < 0)
193 return i;
194
195 /* dynamically updated stack window for uber threads to avoid get_specific
196 call */
197 if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
198 KMP_FATAL(StackOverflow, i);
199 }
200
201 stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
202 if (stack_addr > stack_base) {
203 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
204 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
205 other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
206 stack_base);
207 } else {
208 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
209 stack_base - stack_addr);
210 }
211
212 /* Reprint stack bounds for ubermaster since they have been refined */
213 if (__kmp_storage_map) {
214 char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
215 char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
216 __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
217 other_threads[i]->th.th_info.ds.ds_stacksize,
218 "th_%d stack (refinement)", i);
219 }
220 return i;
221}
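
// An illustrative, standalone sketch (not part of the runtime) of the
// stack-window search performed by the loop above: a thread is identified by
// checking whether the address of a local variable falls inside some
// registered stack range [base - size, base]. 'stack_window' and
// 'find_id_by_stack' are hypothetical names; the real code walks
// __kmp_threads[] instead of a flat array.
#if 0
#include <cstddef>

struct stack_window {
  char *base;  // highest address of the stack (stacks grow down)
  size_t size; // number of usable bytes below 'base'
};

static int find_id_by_stack(const stack_window *windows, int n) {
  char local;            // any automatic variable lives on the caller's stack
  char *addr = &local;
  for (int i = 0; i < n; ++i) {
    if (addr <= windows[i].base &&
        (size_t)(windows[i].base - addr) <= windows[i].size)
      return i;          // our stack pointer falls inside window i
  }
  return -1;             // analogous to KMP_GTID_DNE
}
#endif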
222
223int __kmp_get_global_thread_id_reg() {
224 int gtid;
225
226 if (!__kmp_init_serial) {
227 gtid = KMP_GTID_DNE;
228 } else
229#ifdef KMP_TDATA_GTID
230 if (TCR_4(__kmp_gtid_mode) >= 3) {
231 KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
232 gtid = __kmp_gtid;
233 } else
234#endif
235 if (TCR_4(__kmp_gtid_mode) >= 2) {
236 KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
237 gtid = __kmp_gtid_get_specific();
238 } else {
239 KA_TRACE(1000,
240 ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
241 gtid = __kmp_get_global_thread_id();
242 }
243
244 /* we must be a new uber master sibling thread */
245 if (gtid == KMP_GTID_DNE) {
246 KA_TRACE(10,
247 ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
248 "Registering a new gtid.\n"));
249 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
250 if (!__kmp_init_serial) {
251 __kmp_do_serial_initialize();
252 gtid = __kmp_gtid_get_specific();
253 } else {
254 gtid = __kmp_register_root(FALSE);
255 }
256 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
257 /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
258 }
259
260 KMP_DEBUG_ASSERT(gtid >= 0);
261
262 return gtid;
263}
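
// An illustrative sketch (not the runtime's implementation) of the layered
// gtid lookup above: native TLS first, then keyed TLS, then the stack scan.
// 'mode', 'tls_lookup' and 'stack_scan' are hypothetical stand-ins for
// __kmp_gtid_mode, __kmp_gtid_get_specific() and __kmp_get_global_thread_id().
#if 0
static thread_local int tls_gtid = -1; // fast path, analogous to __kmp_gtid

static int get_gtid(int mode, int (*tls_lookup)(), int (*stack_scan)()) {
  if (mode >= 3 && tls_gtid >= 0)
    return tls_gtid;     // native TLS: a single load, no function call
  if (mode >= 2)
    return tls_lookup(); // keyed TLS, e.g. pthread_getspecific()
  return stack_scan();   // slowest: search the registered stack windows
}
#endif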
264
265/* caller must hold forkjoin_lock */
266void __kmp_check_stack_overlap(kmp_info_t *th) {
267 int f;
268 char *stack_beg = NULL;
269 char *stack_end = NULL;
270 int gtid;
271
272 KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
273 if (__kmp_storage_map) {
274 stack_end = (char *)th->th.th_info.ds.ds_stackbase;
275 stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
276
277 gtid = __kmp_gtid_from_thread(th);
278
279 if (gtid == KMP_GTID_MONITOR) {
280 __kmp_print_storage_map_gtid(
281 gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
282 "th_%s stack (%s)", "mon",
283 (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
284 } else {
285 __kmp_print_storage_map_gtid(
286 gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
287 "th_%d stack (%s)", gtid,
288 (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
289 }
290 }
291
292 /* No point in checking ubermaster threads since they use refinement and
293 * cannot overlap */
294 gtid = __kmp_gtid_from_thread(th);
295 if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
296 KA_TRACE(10,
297 ("__kmp_check_stack_overlap: performing extensive checking\n"));
298 if (stack_beg == NULL) {
299 stack_end = (char *)th->th.th_info.ds.ds_stackbase;
300 stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
301 }
302
303 for (f = 0; f < __kmp_threads_capacity; f++) {
304 kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
305
306 if (f_th && f_th != th) {
307 char *other_stack_end =
308 (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
309 char *other_stack_beg =
310 other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
311 if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
312 (stack_end > other_stack_beg && stack_end < other_stack_end)) {
313
314 /* Print the other stack values before the abort */
315 if (__kmp_storage_map)
316 __kmp_print_storage_map_gtid(
317 -1, other_stack_beg, other_stack_end,
318 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
319 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
320
321 __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
322 __kmp_msg_null);
323 }
324 }
325 }
326 }
327 KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
328}
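
// A minimal sketch of the interval test used above: two stacks are reported as
// overlapping when an endpoint of one lies strictly inside the other. Like the
// code above, it does not treat full containment as a separate case. Names are
// illustrative only.
#if 0
static bool stacks_overlap(const char *beg1, const char *end1,
                           const char *beg2, const char *end2) {
  return (beg1 > beg2 && beg1 < end2) || (end1 > beg2 && end1 < end2);
}
#endif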
329
330/* ------------------------------------------------------------------------ */
331
332void __kmp_infinite_loop(void) {
333 static int done = FALSE;
334
335 while (!done) {
336 KMP_YIELD(TRUE);
337 }
338}
339
340#define MAX_MESSAGE 512
341
342void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
343 char const *format, ...) {
344 char buffer[MAX_MESSAGE];
345 va_list ap;
346
347 va_start(ap, format);
348 KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
349 p2, (unsigned long)size, format);
350 __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
351 __kmp_vprintf(kmp_err, buffer, ap);
352#if KMP_PRINT_DATA_PLACEMENT
353 int node;
354 if (gtid >= 0) {
355 if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
356 if (__kmp_storage_map_verbose) {
357 node = __kmp_get_host_node(p1);
358 if (node < 0) /* doesn't work, so don't try this next time */
359 __kmp_storage_map_verbose = FALSE;
360 else {
361 char *last;
362 int lastNode;
363 int localProc = __kmp_get_cpu_from_gtid(gtid);
364
365 const int page_size = KMP_GET_PAGE_SIZE();
366
367 p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
368 p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
369 if (localProc >= 0)
370 __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid,
371 localProc >> 1);
372 else
373 __kmp_printf_no_lock(" GTID %d\n", gtid);
374#if KMP_USE_PRCTL
375 /* The more elaborate format is disabled for now because of the prctl
376 * hanging bug. */
377 do {
378 last = p1;
379 lastNode = node;
380 /* This loop collates adjacent pages with the same host node. */
381 do {
382 (char *)p1 += page_size;
383 } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
384 __kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1,
385 lastNode);
386 } while (p1 <= p2);
387#else
388 __kmp_printf_no_lock(" %p-%p memNode %d\n", p1,
389 (char *)p1 + (page_size - 1),
390 __kmp_get_host_node(p1));
391 if (p1 < p2) {
392 __kmp_printf_no_lock(" %p-%p memNode %d\n", p2,
393 (char *)p2 + (page_size - 1),
394 __kmp_get_host_node(p2));
395 }
396#endif
397 }
398 }
399 } else
400 __kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning));
401 }
402#endif /* KMP_PRINT_DATA_PLACEMENT */
403 __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
404}
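
// The placement report above rounds addresses to page boundaries with
// 'addr & ~(page_size - 1)', which is only valid for power-of-two page sizes.
// A small standalone sketch of that rounding (hypothetical helper name):
#if 0
#include <cstddef>
#include <cstdint>

static void *page_floor(void *p, size_t page_size) {
  // Clear the low-order bits to land on the start of the containing page.
  return (void *)((uintptr_t)p & ~((uintptr_t)page_size - 1));
}
// e.g. with 4096-byte pages, page_floor((void *)0x1234, 4096) == (void *)0x1000
#endif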
405
406void __kmp_warn(char const *format, ...) {
407 char buffer[MAX_MESSAGE];
408 va_list ap;
409
410 if (__kmp_generate_warnings == kmp_warnings_off) {
411 return;
412 }
413
414 va_start(ap, format);
415
416 KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
417 __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
418 __kmp_vprintf(kmp_err, buffer, ap);
419 __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
420
421 va_end(ap);
422}
423
424void __kmp_abort_process() {
425 // Later threads may stall here, but that's ok because abort() will kill them.
426 __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
427
428 if (__kmp_debug_buf) {
429 __kmp_dump_debug_buffer();
430 }
431
432 if (KMP_OS_WINDOWS) {
433 // Let other threads know of abnormal termination and prevent deadlock
434 // if abort happened during library initialization or shutdown
435 __kmp_global.g.g_abort = SIGABRT;
436
437 /* On Windows* OS, abort() by default raises a pop-up error box, which stalls
438 nightly testing. Unfortunately, we cannot reliably suppress pop-up error
439 boxes. _set_abort_behavior() works well, but this function is not
440 available in VS7 (this is not a problem for the DLL, but it is a problem
441 for the static OpenMP RTL). SetErrorMode (and so, the timelimit utility)
442 does not help, at least in some versions of the MS C RTL.
443
444 It seems the following sequence is the only way to simulate abort() and
445 avoid the pop-up error box. */
446 raise(SIGABRT);
447 _exit(3); // Just in case, if signal ignored, exit anyway.
448 } else {
449 __kmp_unregister_library();
450 abort();
451 }
452
453 __kmp_infinite_loop();
454 __kmp_release_bootstrap_lock(&__kmp_exit_lock);
455
456} // __kmp_abort_process
457
458void __kmp_abort_thread(void) {
459 // TODO: Eliminate g_abort global variable and this function.
460 // In case of abort just call abort(), it will kill all the threads.
461 __kmp_infinite_loop();
462} // __kmp_abort_thread
463
464/* Print out the storage map for the major kmp_info_t thread data structures
465 that are allocated together. */
466
467static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
468 __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
469 gtid);
470
471 __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
472 sizeof(kmp_desc_t), "th_%d.th_info", gtid);
473
474 __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
475 sizeof(kmp_local_t), "th_%d.th_local", gtid);
476
477 __kmp_print_storage_map_gtid(
478 gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
479 sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
480
481 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
482 &thr->th.th_bar[bs_plain_barrier + 1],
483 sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
484 gtid);
485
486 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
487 &thr->th.th_bar[bs_forkjoin_barrier + 1],
488 sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
489 gtid);
490
491#if KMP_FAST_REDUCTION_BARRIER
492 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
493 &thr->th.th_bar[bs_reduction_barrier + 1],
494 sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
495 gtid);
496#endif // KMP_FAST_REDUCTION_BARRIER
497}
498
499/* Print out the storage map for the major kmp_team_t team data structures
500 that are allocated together. */
501
502static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
503 int team_id, int num_thr) {
504 int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
505 __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
506 header, team_id);
507
508 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
509 &team->t.t_bar[bs_last_barrier],
510 sizeof(kmp_balign_team_t) * bs_last_barrier,
511 "%s_%d.t_bar", header, team_id);
512
513 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
514 &team->t.t_bar[bs_plain_barrier + 1],
515 sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
516 header, team_id);
517
518 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
519 &team->t.t_bar[bs_forkjoin_barrier + 1],
520 sizeof(kmp_balign_team_t),
521 "%s_%d.t_bar[forkjoin]", header, team_id);
522
523#if KMP_FAST_REDUCTION_BARRIER
524 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
525 &team->t.t_bar[bs_reduction_barrier + 1],
526 sizeof(kmp_balign_team_t),
527 "%s_%d.t_bar[reduction]", header, team_id);
528#endif // KMP_FAST_REDUCTION_BARRIER
529
530 __kmp_print_storage_map_gtid(
531 -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
532 sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
533
534 __kmp_print_storage_map_gtid(
535 -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
536 sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
537
538 __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
539 &team->t.t_disp_buffer[num_disp_buff],
540 sizeof(dispatch_shared_info_t) * num_disp_buff,
541 "%s_%d.t_disp_buffer", header, team_id);
542}
543
544static void __kmp_init_allocator() {
545 __kmp_init_memkind();
546 __kmp_init_target_mem();
547}
548static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
549
550/* ------------------------------------------------------------------------ */
551
552#if KMP_DYNAMIC_LIB
553#if KMP_OS_WINDOWS
554
555BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
556 //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
557
558 switch (fdwReason) {
559
560 case DLL_PROCESS_ATTACH:
561 KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
562
563 return TRUE;
564
565 case DLL_PROCESS_DETACH:
566 KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
567
568 // According to Windows* documentation for DllMain entry point:
569 // for DLL_PROCESS_DETACH, lpReserved is used for telling the difference:
570 // lpReserved == NULL when FreeLibrary() is called,
571 // lpReserved != NULL when the process is terminated.
572 // When FreeLibrary() is called, worker threads remain alive. So the
573 // runtime's state is consistent and executing proper shutdown is OK.
574 // When the process is terminated, worker threads have exited or been
575 // forcefully terminated by the OS and only the shutdown thread remains.
576 // This can leave the runtime in an inconsistent state.
577 // Hence, only attempt proper cleanup when FreeLibrary() is called.
578 // Otherwise, rely on OS to reclaim resources.
579 if (lpReserved == NULL)
580 __kmp_internal_end_library(__kmp_gtid_get_specific());
581
582 return TRUE;
583
584 case DLL_THREAD_ATTACH:
585 KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
586
587 /* if we want to register new sibling threads every time, call
588 * __kmp_get_gtid() here */
589 return TRUE;
590
591 case DLL_THREAD_DETACH:
592 KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
593
594 __kmp_internal_end_thread(__kmp_gtid_get_specific());
595 return TRUE;
596 }
597
598 return TRUE;
599}
600
601#endif /* KMP_OS_WINDOWS */
602#endif /* KMP_DYNAMIC_LIB */
603
604/* __kmp_parallel_deo -- Wait until it's our turn. */
605void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
606 int gtid = *gtid_ref;
607#ifdef BUILD_PARALLEL_ORDERED
608 kmp_team_t *team = __kmp_team_from_gtid(gtid);
609#endif /* BUILD_PARALLEL_ORDERED */
610
611 if (__kmp_env_consistency_check) {
612 if (__kmp_threads[gtid]->th.th_root->r.r_active)
613#if KMP_USE_DYNAMIC_LOCK
614 __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
615#else
616 __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
617#endif
618 }
619#ifdef BUILD_PARALLEL_ORDERED
620 if (!team->t.t_serialized) {
621 KMP_MB();
622 KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
623 NULL);
624 KMP_MB();
625 }
626#endif /* BUILD_PARALLEL_ORDERED */
627}
628
629/* __kmp_parallel_dxo -- Signal the next task. */
630void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
631 int gtid = *gtid_ref;
632#ifdef BUILD_PARALLEL_ORDERED
633 int tid = __kmp_tid_from_gtid(gtid);
634 kmp_team_t *team = __kmp_team_from_gtid(gtid);
635#endif /* BUILD_PARALLEL_ORDERED */
636
637 if (__kmp_env_consistency_check) {
638 if (__kmp_threads[gtid]->th.th_root->r.r_active)
639 __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
640 }
641#ifdef BUILD_PARALLEL_ORDERED
642 if (!team->t.t_serialized) {
643 KMP_MB(); /* Flush all pending memory write invalidates. */
644
645 /* use the tid of the next thread in this team */
646 /* TODO replace with general release procedure */
647 team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
648
649 KMP_MB(); /* Flush all pending memory write invalidates. */
650 }
651#endif /* BUILD_PARALLEL_ORDERED */
652}
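
// __kmp_parallel_deo/__kmp_parallel_dxo implement a simple turn counter: each
// thread waits until t_ordered.dt.t_value equals its own tid, runs its ordered
// chunk, then passes the turn to (tid + 1) % nproc. The sketch below is an
// analogy using std::atomic, not the runtime's KMP_WAIT machinery; all names
// are illustrative.
#if 0
#include <atomic>
#include <thread>

static void ordered_section(std::atomic<int> &turn, int tid, int nproc,
                            void (*body)(int)) {
  while (turn.load(std::memory_order_acquire) != tid)
    std::this_thread::yield();    // deo: wait for our turn
  body(tid);                      // the ordered chunk
  turn.store((tid + 1) % nproc,   // dxo: release the next thread in the team
             std::memory_order_release);
}
#endif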
653
654/* ------------------------------------------------------------------------ */
655/* The BARRIER for a SINGLE process section is always explicit */
656
657int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
658 int status;
659 kmp_info_t *th;
660 kmp_team_t *team;
661
662 if (!TCR_4(__kmp_init_parallel))
663 __kmp_parallel_initialize();
664 __kmp_resume_if_soft_paused();
665
666 th = __kmp_threads[gtid];
667 team = th->th.th_team;
668 status = 0;
669
670 th->th.th_ident = id_ref;
671
672 if (team->t.t_serialized) {
673 status = 1;
674 } else {
675 kmp_int32 old_this = th->th.th_local.this_construct;
676
677 ++th->th.th_local.this_construct;
678 /* try to set team count to thread count--success means thread got the
679 single block */
680 /* TODO: Should this be acquire or release? */
681 if (team->t.t_construct == old_this) {
682 status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
683 th->th.th_local.this_construct);
684 }
685#if USE_ITT_BUILD
686 if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
687 KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
688 team->t.t_active_level == 1) {
689 // Only report metadata by primary thread of active team at level 1
690 __kmp_itt_metadata_single(id_ref);
691 }
692#endif /* USE_ITT_BUILD */
693 }
694
695 if (__kmp_env_consistency_check) {
696 if (status && push_ws) {
697 __kmp_push_workshare(gtid, ct_psingle, id_ref);
698 } else {
699 __kmp_check_workshare(gtid, ct_psingle, id_ref);
700 }
701 }
702#if USE_ITT_BUILD
703 if (status) {
704 __kmp_itt_single_start(gtid);
705 }
706#endif /* USE_ITT_BUILD */
707 return status;
708}
709
710void __kmp_exit_single(int gtid) {
711#if USE_ITT_BUILD
712 __kmp_itt_single_end(gtid);
713#endif /* USE_ITT_BUILD */
714 if (__kmp_env_consistency_check)
715 __kmp_pop_workshare(gtid, ct_psingle, NULL);
716}
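
// __kmp_enter_single elects the winner of a 'single' with an atomic
// compare-and-swap on the team-wide construct counter: the thread whose CAS
// from the old value to its incremented private counter succeeds executes the
// block. A reduced sketch (the pre-check of the shared counter, consistency
// checks and ITT hooks are omitted; names are illustrative):
#if 0
#include <atomic>

static bool enter_single(std::atomic<int> &team_construct, int &my_construct) {
  int old_val = my_construct; // value both counters held before this construct
  ++my_construct;             // advance our private construct counter
  // Whoever advances the shared counter first owns the single block.
  return team_construct.compare_exchange_strong(old_val, my_construct,
                                                std::memory_order_acquire);
}
#endif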
717
718/* Determine whether we can go parallel or must use a serialized parallel
719 * region, and how many threads we can use.
720 * set_nthreads is the number of threads requested for the team.
721 * Returns 1 if we should serialize or only use one thread,
722 * otherwise the number of threads to use.
723 * The forkjoin lock is held by the caller. */
724static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
725 int master_tid, int set_nthreads,
726 int enter_teams) {
727 int capacity;
728 int new_nthreads;
729 KMP_DEBUG_ASSERT(__kmp_init_serial);
730 KMP_DEBUG_ASSERT(root && parent_team);
731 kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
732
733 // If dyn-var is set, dynamically adjust the number of desired threads,
734 // according to the method specified by dynamic_mode.
735 new_nthreads = set_nthreads;
736 if (!get__dynamic_2(parent_team, master_tid)) {
737 ;
738 }
739#ifdef USE_LOAD_BALANCE
740 else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
741 new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
742 if (new_nthreads == 1) {
743 KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
744 "reservation to 1 thread\n",
745 master_tid));
746 return 1;
747 }
748 if (new_nthreads < set_nthreads) {
749 KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
750 "reservation to %d threads\n",
751 master_tid, new_nthreads));
752 }
753 }
754#endif /* USE_LOAD_BALANCE */
755 else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
756 new_nthreads = __kmp_avail_proc - __kmp_nth +
757 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
758 if (new_nthreads <= 1) {
759 KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
760 "reservation to 1 thread\n",
761 master_tid));
762 return 1;
763 }
764 if (new_nthreads < set_nthreads) {
765 KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
766 "reservation to %d threads\n",
767 master_tid, new_nthreads));
768 } else {
769 new_nthreads = set_nthreads;
770 }
771 } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
772 if (set_nthreads > 2) {
773 new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
774 new_nthreads = (new_nthreads % set_nthreads) + 1;
775 if (new_nthreads == 1) {
776 KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
777 "reservation to 1 thread\n",
778 master_tid));
779 return 1;
780 }
781 if (new_nthreads < set_nthreads) {
782 KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
783 "reservation to %d threads\n",
784 master_tid, new_nthreads));
785 }
786 }
787 } else {
788 KMP_ASSERT(0);
789 }
790
791 // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
792 if (__kmp_nth + new_nthreads -
793 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
794 __kmp_max_nth) {
795 int tl_nthreads = __kmp_max_nth - __kmp_nth +
796 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
797 if (tl_nthreads <= 0) {
798 tl_nthreads = 1;
799 }
800
801 // If dyn-var is false, emit a 1-time warning.
802 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
803 __kmp_reserve_warn = 1;
804 __kmp_msg(kmp_ms_warning,
805 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
806 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
807 }
808 if (tl_nthreads == 1) {
809 KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
810 "reduced reservation to 1 thread\n",
811 master_tid));
812 return 1;
813 }
814 KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
815 "reservation to %d threads\n",
816 master_tid, tl_nthreads));
817 new_nthreads = tl_nthreads;
818 }
819
820 // Respect OMP_THREAD_LIMIT
821 int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
822 int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
823 if (cg_nthreads + new_nthreads -
824 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
825 max_cg_threads) {
826 int tl_nthreads = max_cg_threads - cg_nthreads +
827 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
828 if (tl_nthreads <= 0) {
829 tl_nthreads = 1;
830 }
831
832 // If dyn-var is false, emit a 1-time warning.
833 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
834 __kmp_reserve_warn = 1;
835 __kmp_msg(kmp_ms_warning,
836 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
837 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
838 }
839 if (tl_nthreads == 1) {
840 KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
841 "reduced reservation to 1 thread\n",
842 master_tid));
843 return 1;
844 }
845 KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
846 "reservation to %d threads\n",
847 master_tid, tl_nthreads));
848 new_nthreads = tl_nthreads;
849 }
850
851 // Check if the threads array is large enough, or needs expanding.
852 // See comment in __kmp_register_root() about the adjustment if
853 // __kmp_threads[0] == NULL.
854 capacity = __kmp_threads_capacity;
855 if (TCR_PTR(__kmp_threads[0]) == NULL) {
856 --capacity;
857 }
858 // If it is not for initializing the hidden helper team, we need to take
859 // __kmp_hidden_helper_threads_num out of the capacity because it is included
860 // in __kmp_threads_capacity.
861 if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
862 capacity -= __kmp_hidden_helper_threads_num;
863 }
864 if (__kmp_nth + new_nthreads -
865 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
866 capacity) {
867 // Expand the threads array.
868 int slotsRequired = __kmp_nth + new_nthreads -
869 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
870 capacity;
871 int slotsAdded = __kmp_expand_threads(slotsRequired);
872 if (slotsAdded < slotsRequired) {
873 // The threads array was not expanded enough.
874 new_nthreads -= (slotsRequired - slotsAdded);
875 KMP_ASSERT(new_nthreads >= 1);
876
877 // If dyn-var is false, emit a 1-time warning.
878 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
879 __kmp_reserve_warn = 1;
880 if (__kmp_tp_cached) {
881 __kmp_msg(kmp_ms_warning,
882 KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
883 KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
884 KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
885 } else {
886 __kmp_msg(kmp_ms_warning,
887 KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
888 KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
889 }
890 }
891 }
892 }
893
894#ifdef KMP_DEBUG
895 if (new_nthreads == 1) {
896 KC_TRACE(10,
897 ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
898 "dead roots and rechecking; requested %d threads\n",
899 __kmp_get_gtid(), set_nthreads));
900 } else {
901 KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
902 " %d threads\n",
903 __kmp_get_gtid(), new_nthreads, set_nthreads));
904 }
905#endif // KMP_DEBUG
906 return new_nthreads;
907}
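
// Every limit in __kmp_reserve_threads is applied with the same pattern:
// compute how many threads the limit still allows (crediting back the threads
// the master / hot team already accounts for) and clamp the request, never
// going below one thread. A compact sketch of that clamp; 'clamp_to_limit' and
// its parameters are hypothetical, not runtime APIs.
#if 0
static int clamp_to_limit(int requested, int already_running, int credit,
                          int limit) {
  if (already_running + requested - credit <= limit)
    return requested;                 // the request fits under the limit
  int allowed = limit - already_running + credit;
  return allowed > 0 ? allowed : 1;   // never reserve fewer than one thread
}
// e.g. clamp_to_limit(8, 60, 4, 64) == 8, while clamp_to_limit(8, 60, 4, 62) == 6
#endif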
908
909/* Allocate threads from the thread pool and assign them to the new team. We are
910 assured that there are enough threads available, because we checked that
911 earlier while holding the forkjoin lock. */
912static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
913 kmp_info_t *master_th, int master_gtid) {
914 int i;
915 int use_hot_team;
916
917 KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
918 KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
919 KMP_MB();
920
921 /* first, let's setup the primary thread */
922 master_th->th.th_info.ds.ds_tid = 0;
923 master_th->th.th_team = team;
924 master_th->th.th_team_nproc = team->t.t_nproc;
925 master_th->th.th_team_master = master_th;
926 master_th->th.th_team_serialized = FALSE;
927 master_th->th.th_dispatch = &team->t.t_dispatch[0];
928
929/* make sure we are not the optimized hot team */
930#if KMP_NESTED_HOT_TEAMS
931 use_hot_team = 0;
932 kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
933 if (hot_teams) { // hot teams array is not allocated if
934 // KMP_HOT_TEAMS_MAX_LEVEL=0
935 int level = team->t.t_active_level - 1; // index in array of hot teams
936 if (master_th->th.th_teams_microtask) { // are we inside the teams?
937 if (master_th->th.th_teams_size.nteams > 1) {
938 ++level; // level was not increased in teams construct for
939 // team_of_masters
940 }
941 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
942 master_th->th.th_teams_level == team->t.t_level) {
943 ++level; // level was not increased in teams construct for
944 // team_of_workers before the parallel
945 } // team->t.t_level will be increased inside parallel
946 }
947 if (level < __kmp_hot_teams_max_level) {
948 if (hot_teams[level].hot_team) {
949 // hot team has already been allocated for given level
950 KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
951 use_hot_team = 1; // the team is ready to use
952 } else {
953 use_hot_team = 0; // AC: threads are not allocated yet
954 hot_teams[level].hot_team = team; // remember new hot team
955 hot_teams[level].hot_team_nth = team->t.t_nproc;
956 }
957 } else {
958 use_hot_team = 0;
959 }
960 }
961#else
962 use_hot_team = team == root->r.r_hot_team;
963#endif
964 if (!use_hot_team) {
965
966 /* install the primary thread */
967 team->t.t_threads[0] = master_th;
968 __kmp_initialize_info(master_th, team, 0, master_gtid);
969
970 /* now, install the worker threads */
971 for (i = 1; i < team->t.t_nproc; i++) {
972
973 /* fork or reallocate a new thread and install it in team */
974 kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
975 team->t.t_threads[i] = thr;
976 KMP_DEBUG_ASSERT(thr);
977 KMP_DEBUG_ASSERT(thr->th.th_team == team);
978 /* align team and thread arrived states */
979 KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
980 "T#%d(%d:%d) join =%llu, plain=%llu\n",
981 __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
982 __kmp_gtid_from_tid(i, team), team->t.t_id, i,
983 team->t.t_bar[bs_forkjoin_barrier].b_arrived,
984 team->t.t_bar[bs_plain_barrier].b_arrived));
985 thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
986 thr->th.th_teams_level = master_th->th.th_teams_level;
987 thr->th.th_teams_size = master_th->th.th_teams_size;
988 { // Initialize threads' barrier data.
989 int b;
990 kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
991 for (b = 0; b < bs_last_barrier; ++b) {
992 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
993 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
994#if USE_DEBUGGER
995 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
996#endif
997 }
998 }
999 }
1000
1001#if KMP_AFFINITY_SUPPORTED
1002 __kmp_partition_places(team);
1003#endif
1004 }
1005
1006 if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1007 for (i = 0; i < team->t.t_nproc; i++) {
1008 kmp_info_t *thr = team->t.t_threads[i];
1009 if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1010 thr->th.th_prev_level != team->t.t_level) {
1011 team->t.t_display_affinity = 1;
1012 break;
1013 }
1014 }
1015 }
1016
1017 KMP_MB();
1018}
1019
1020#if KMP_ARCH_X86 || KMP_ARCH_X86_64
1021// Propagate any changes to the floating point control registers out to the team
1022// We try to avoid unnecessary writes to the relevant cache line in the team
1023// structure, so we don't make changes unless they are needed.
1024inline static void propagateFPControl(kmp_team_t *team) {
1025 if (__kmp_inherit_fp_control) {
1026 kmp_int16 x87_fpu_control_word;
1027 kmp_uint32 mxcsr;
1028
1029 // Get primary thread's values of FPU control flags (both X87 and vector)
1030 __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1031 __kmp_store_mxcsr(&mxcsr);
1032 mxcsr &= KMP_X86_MXCSR_MASK;
1033
1034 // There is no point looking at t_fp_control_saved here.
1035 // If it is TRUE, we still have to update the values if they are different
1036 // from those we now have. If it is FALSE we didn't save anything yet, but
1037 // our objective is the same. We have to ensure that the values in the team
1038 // are the same as those we have.
1039 // So, this code achieves what we need whether or not t_fp_control_saved is
1040 // true. By checking whether the value needs updating we avoid unnecessary
1041 // writes that would put the cache-line into a written state, causing all
1042 // threads in the team to have to read it again.
1043 KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1044 KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1045 // Although we don't use this value, other code in the runtime wants to know
1046 // whether it should restore them. So we must ensure it is correct.
1047 KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1048 } else {
1049 // Similarly here. Don't write to this cache-line in the team structure
1050 // unless we have to.
1051 KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1052 }
1053}
1054
1055// Do the opposite, setting the hardware registers to the updated values from
1056// the team.
1057inline static void updateHWFPControl(kmp_team_t *team) {
1058 if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1059 // Only reset the fp control regs if they have been changed in the team by
1060 // the parallel region that we are exiting.
1061 kmp_int16 x87_fpu_control_word;
1062 kmp_uint32 mxcsr;
1063 __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1064 __kmp_store_mxcsr(&mxcsr);
1065 mxcsr &= KMP_X86_MXCSR_MASK;
1066
1067 if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1068 __kmp_clear_x87_fpu_status_word();
1069 __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1070 }
1071
1072 if (team->t.t_mxcsr != mxcsr) {
1073 __kmp_load_mxcsr(&team->t.t_mxcsr);
1074 }
1075 }
1076}
1077#else
1078#define propagateFPControl(x) ((void)0)
1079#define updateHWFPControl(x) ((void)0)
1080#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
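
// propagateFPControl/updateHWFPControl capture the primary thread's x87 control
// word and MXCSR and restore them in workers only when they actually differ, so
// the shared team cache line is not dirtied needlessly. A hedged, x86-only
// sketch of the save/compare/restore idea using the SSE intrinsics
// _mm_getcsr/_mm_setcsr from <xmmintrin.h>; the runtime itself uses its own
// __kmp_store_mxcsr/__kmp_load_mxcsr wrappers and also handles the x87 word.
#if 0
#include <xmmintrin.h>

template <class Fn> static void with_saved_mxcsr(Fn &&region) {
  unsigned int saved = _mm_getcsr(); // capture FP control/status state
  region();                          // region may change rounding/FTZ bits
  if (_mm_getcsr() != saved)         // write back only when needed, mirroring
    _mm_setcsr(saved);               // the KMP_CHECK_UPDATE idea above
}
#endif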
1081
1082static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1083 int realloc); // forward declaration
1084
1085/* Run a parallel region that has been serialized, so it runs only in a team of
1086 the single primary thread. */
1087void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1088 kmp_info_t *this_thr;
1089 kmp_team_t *serial_team;
1090
1091 KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1092
1093 /* Skip all this code for autopar serialized loops since it results in
1094 unacceptable overhead */
1095 if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1096 return;
1097
1098 if (!TCR_4(__kmp_init_parallel))
1099 __kmp_parallel_initialize();
1100 __kmp_resume_if_soft_paused();
1101
1102 this_thr = __kmp_threads[global_tid];
1103 serial_team = this_thr->th.th_serial_team;
1104
1105 /* utilize the serialized team held by this thread */
1106 KMP_DEBUG_ASSERT(serial_team);
1107 KMP_MB();
1108
1109 if (__kmp_tasking_mode != tskm_immediate_exec) {
1110 KMP_DEBUG_ASSERT(
1111 this_thr->th.th_task_team ==
1112 this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1113 KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1114 NULL);
1115 KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1116 "team %p, new task_team = NULL\n",
1117 global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1118 this_thr->th.th_task_team = NULL;
1119 }
1120
1121 kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1122 if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1123 proc_bind = proc_bind_false;
1124 } else if (proc_bind == proc_bind_default) {
1125 // No proc_bind clause was specified, so use the current value
1126 // of proc-bind-var for this parallel region.
1127 proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1128 }
1129 // Reset for next parallel region
1130 this_thr->th.th_set_proc_bind = proc_bind_default;
1131
1132#if OMPT_SUPPORT
1133 ompt_data_t ompt_parallel_data = ompt_data_none;
1134 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1135 if (ompt_enabled.enabled &&
1136 this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1137
1138 ompt_task_info_t *parent_task_info;
1139 parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1140
1141 parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1142 if (ompt_enabled.ompt_callback_parallel_begin) {
1143 int team_size = 1;
1144
1145 ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1146 &(parent_task_info->task_data), &(parent_task_info->frame),
1147 &ompt_parallel_data, team_size,
1148 ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
1149 }
1150 }
1151#endif // OMPT_SUPPORT
1152
1153 if (this_thr->th.th_team != serial_team) {
1154 // Nested level will be an index in the nested nthreads array
1155 int level = this_thr->th.th_team->t.t_level;
1156
1157 if (serial_team->t.t_serialized) {
1158 /* this serial team was already used
1159 TODO increase performance by making these locks more specific */
1160 kmp_team_t *new_team;
1161
1162 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1163
1164 new_team =
1165 __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1166#if OMPT_SUPPORT
1167 ompt_parallel_data,
1168#endif
1169 proc_bind, &this_thr->th.th_current_task->td_icvs,
1170 0 USE_NESTED_HOT_ARG(NULL));
1171 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1172 KMP_ASSERT(new_team);
1173
1174 /* setup new serialized team and install it */
1175 new_team->t.t_threads[0] = this_thr;
1176 new_team->t.t_parent = this_thr->th.th_team;
1177 serial_team = new_team;
1178 this_thr->th.th_serial_team = serial_team;
1179
1180 KF_TRACE(
1181 10,
1182 ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1183 global_tid, serial_team));
1184
1185 /* TODO the above breaks the requirement that if we run out of resources,
1186 then we can still guarantee that serialized teams are ok, since we may
1187 need to allocate a new one */
1188 } else {
1189 KF_TRACE(
1190 10,
1191 ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1192 global_tid, serial_team));
1193 }
1194
1195 /* we have to initialize this serial team */
1196 KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1197 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1198 KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1199 serial_team->t.t_ident = loc;
1200 serial_team->t.t_serialized = 1;
1201 serial_team->t.t_nproc = 1;
1202 serial_team->t.t_parent = this_thr->th.th_team;
1203 serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1204 this_thr->th.th_team = serial_team;
1205 serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1206
1207 KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1208 this_thr->th.th_current_task));
1209 KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1210 this_thr->th.th_current_task->td_flags.executing = 0;
1211
1212 __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1213
1214 /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1215 implicit task for each serialized task represented by
1216 team->t.t_serialized? */
1217 copy_icvs(&this_thr->th.th_current_task->td_icvs,
1218 &this_thr->th.th_current_task->td_parent->td_icvs);
1219
1220 // Thread value exists in the nested nthreads array for the next nested
1221 // level
1222 if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1223 this_thr->th.th_current_task->td_icvs.nproc =
1224 __kmp_nested_nth.nth[level + 1];
1225 }
1226
1227 if (__kmp_nested_proc_bind.used &&
1228 (level + 1 < __kmp_nested_proc_bind.used)) {
1229 this_thr->th.th_current_task->td_icvs.proc_bind =
1230 __kmp_nested_proc_bind.bind_types[level + 1];
1231 }
1232
1233#if USE_DEBUGGER
1234 serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1235#endif
1236 this_thr->th.th_info.ds.ds_tid = 0;
1237
1238 /* set thread cache values */
1239 this_thr->th.th_team_nproc = 1;
1240 this_thr->th.th_team_master = this_thr;
1241 this_thr->th.th_team_serialized = 1;
1242
1243 serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1244 serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1245 serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1246
1247 propagateFPControl(serial_team);
1248
1249 /* check if we need to allocate dispatch buffers stack */
1250 KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1251 if (!serial_team->t.t_dispatch->th_disp_buffer) {
1252 serial_team->t.t_dispatch->th_disp_buffer =
1253 (dispatch_private_info_t *)__kmp_allocate(
1254 sizeof(dispatch_private_info_t));
1255 }
1256 this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1257
1258 KMP_MB();
1259
1260 } else {
1261 /* this serialized team is already being used,
1262 * that's fine, just add another nested level */
1263 KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1264 KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1265 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1266 ++serial_team->t.t_serialized;
1267 this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1268
1269 // Nested level will be an index in the nested nthreads array
1270 int level = this_thr->th.th_team->t.t_level;
1271 // Thread value exists in the nested nthreads array for the next nested
1272 // level
1273 if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1274 this_thr->th.th_current_task->td_icvs.nproc =
1275 __kmp_nested_nth.nth[level + 1];
1276 }
1277 serial_team->t.t_level++;
1278 KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1279 "of serial team %p to %d\n",
1280 global_tid, serial_team, serial_team->t.t_level));
1281
1282 /* allocate/push dispatch buffers stack */
1283 KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1284 {
1285 dispatch_private_info_t *disp_buffer =
1286 (dispatch_private_info_t *)__kmp_allocate(
1287 sizeof(dispatch_private_info_t));
1288 disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1289 serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1290 }
1291 this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1292
1293 KMP_MB();
1294 }
1295 KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1296
1297 // Perform the display affinity functionality for
1298 // serialized parallel regions
1299 if (__kmp_display_affinity) {
1300 if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1301 this_thr->th.th_prev_num_threads != 1) {
1302 // NULL means use the affinity-format-var ICV
1303 __kmp_aux_display_affinity(global_tid, NULL);
1304 this_thr->th.th_prev_level = serial_team->t.t_level;
1305 this_thr->th.th_prev_num_threads = 1;
1306 }
1307 }
1308
1309 if (__kmp_env_consistency_check)
1310 __kmp_push_parallel(global_tid, NULL);
1311#if OMPT_SUPPORT
1312 serial_team->t.ompt_team_info.master_return_address = codeptr;
1313 if (ompt_enabled.enabled &&
1314 this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1315 OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1316 OMPT_GET_FRAME_ADDRESS(0);
1317
1318 ompt_lw_taskteam_t lw_taskteam;
1319 __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1320 &ompt_parallel_data, codeptr);
1321
1322 __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1323 // don't use lw_taskteam after linking. content was swapped
1324
1325 /* OMPT implicit task begin */
1326 if (ompt_enabled.ompt_callback_implicit_task) {
1327 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1328 ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1329 OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid),
1330 ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1331 OMPT_CUR_TASK_INFO(this_thr)->thread_num =
1332 __kmp_tid_from_gtid(global_tid);
1333 }
1334
1335 /* OMPT state */
1336 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1337 OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1338 OMPT_GET_FRAME_ADDRESS(0);
1339 }
1340#endif
1341}
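
// Each extra serialized nesting level pushes a freshly allocated
// dispatch_private_info_t onto the serial team's th_disp_buffer list, so the
// inner level gets its own loop-dispatch state and the outer level's state
// reappears on exit. A minimal sketch of that push/pop as an intrusive singly
// linked stack; 'disp_buffer', 'push_level' and 'pop_level' are illustrative
// names, not runtime types.
#if 0
struct disp_buffer {
  disp_buffer *next; // older (outer) level, like th_disp_buffer->next above
  // ... per-level loop dispatch state would live here ...
};

static void push_level(disp_buffer *&top) {
  top = new disp_buffer{top}; // the new inner level links back to the outer one
}

static void pop_level(disp_buffer *&top) {
  disp_buffer *outer = top->next; // leaving the inner level
  delete top;
  top = outer;                    // the outer level becomes current again
}
#endif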
1342
1343/* most of the work for a fork */
1344/* return true if we really went parallel, false if serialized */
1345int __kmp_fork_call(ident_t *loc, int gtid,
1346 enum fork_context_e call_context, // Intel, GNU, ...
1347 kmp_int32 argc, microtask_t microtask, launch_t invoker,
1348 kmp_va_list ap) {
1349 void **argv;
1350 int i;
1351 int master_tid;
1352 int master_this_cons;
1353 kmp_team_t *team;
1354 kmp_team_t *parent_team;
1355 kmp_info_t *master_th;
1356 kmp_root_t *root;
1357 int nthreads;
1358 int master_active;
1359 int master_set_numthreads;
1360 int level;
1361 int active_level;
1362 int teams_level;
1363#if KMP_NESTED_HOT_TEAMS
1364 kmp_hot_team_ptr_t **p_hot_teams;
1365#endif
1366 { // KMP_TIME_BLOCK
1367 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1368 KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1369
1370 KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1371 if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1372 /* Some systems prefer the stack for the root thread(s) to start with */
1373 /* some gap from the parent stack to prevent false sharing. */
1374 void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1375 /* These 2 lines below are so this does not get optimized out */
1376 if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1377 __kmp_stkpadding += (short)((kmp_int64)dummy);
1378 }
1379
1380 /* initialize if needed */
1381 KMP_DEBUG_ASSERT(
1382 __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1383 if (!TCR_4(__kmp_init_parallel))
1384 __kmp_parallel_initialize();
1385 __kmp_resume_if_soft_paused();
1386
1387 /* setup current data */
1388 master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
1389 // shutdown
1390 parent_team = master_th->th.th_team;
1391 master_tid = master_th->th.th_info.ds.ds_tid;
1392 master_this_cons = master_th->th.th_local.this_construct;
1393 root = master_th->th.th_root;
1394 master_active = root->r.r_active;
1395 master_set_numthreads = master_th->th.th_set_nproc;
1396
1397#if OMPT_SUPPORT
1398 ompt_data_t ompt_parallel_data = ompt_data_none;
1399 ompt_data_t *parent_task_data;
1400 ompt_frame_t *ompt_frame;
1401 ompt_data_t *implicit_task_data;
1402 void *return_address = NULL;
1403
1404 if (ompt_enabled.enabled) {
1405 __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1406 NULL, NULL);
1407 return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1408 }
1409#endif
1410
1411 // Assign affinity to root thread if it hasn't happened yet
1412 __kmp_assign_root_init_mask();
1413
1414 // Nested level will be an index in the nested nthreads array
1415 level = parent_team->t.t_level;
1416 // used to launch non-serial teams even if nested is not allowed
1417 active_level = parent_team->t.t_active_level;
1418 // needed to check nesting inside the teams
1419 teams_level = master_th->th.th_teams_level;
1420#if KMP_NESTED_HOT_TEAMS
1421 p_hot_teams = &master_th->th.th_hot_teams;
1422 if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1423 *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1424 sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1425 (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1426 // it is either actual or not needed (when active_level > 0)
1427 (*p_hot_teams)[0].hot_team_nth = 1;
1428 }
1429#endif
1430
1431#if OMPT_SUPPORT
1432 if (ompt_enabled.enabled) {
1433 if (ompt_enabled.ompt_callback_parallel_begin) {
1434 int team_size = master_set_numthreads
1435 ? master_set_numthreads
1436 : get__nproc_2(parent_team, master_tid);
1437 int flags = OMPT_INVOKER(call_context) |
1438 ((microtask == (microtask_t)__kmp_teams_master)
1439 ? ompt_parallel_league
1440 : ompt_parallel_team);
1441 ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1442 parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
1443 return_address);
1444 }
1445 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1446 }
1447#endif
1448
1449 master_th->th.th_ident = loc;
1450
1451 if (master_th->th.th_teams_microtask && ap &&
1452 microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
1453 // AC: This is start of parallel that is nested inside teams construct.
1454 // The team is actual (hot), all workers are ready at the fork barrier.
1455 // No lock needed to initialize the team a bit, then free workers.
1456 parent_team->t.t_ident = loc;
1457 __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1458 parent_team->t.t_argc = argc;
1459 argv = (void **)parent_team->t.t_argv;
1460 for (i = argc - 1; i >= 0; --i)
1461 *argv++ = va_arg(kmp_va_deref(ap), void *);
1462 // Increment our nested depth level, but do not increase the serialization count
1463 if (parent_team == master_th->th.th_serial_team) {
1464 // AC: we are in serialized parallel
1465 __kmpc_serialized_parallel(loc, gtid);
1466 KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1467
1468 if (call_context == fork_context_gnu) {
1469 // AC: need to decrement t_serialized for enquiry functions to work
1470 // correctly, will restore at join time
1471 parent_team->t.t_serialized--;
1472 return TRUE;
1473 }
1474
1475#if OMPD_SUPPORT
1476 parent_team->t.t_pkfn = microtask;
1477#endif
1478
1479#if OMPT_SUPPORT
1480 void *dummy;
1481 void **exit_frame_p;
1482
1483 ompt_lw_taskteam_t lw_taskteam;
1484
1485 if (ompt_enabled.enabled) {
1486 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1487 &ompt_parallel_data, return_address);
1488 exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1489
1490 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1491 // don't use lw_taskteam after linking. content was swapped
1492
1493 /* OMPT implicit task begin */
1494 implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1495 if (ompt_enabled.ompt_callback_implicit_task) {
1496 OMPT_CUR_TASK_INFO(master_th)->thread_num =
1497 __kmp_tid_from_gtid(gtid);
1498 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1499 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1500 implicit_task_data, 1,
1501 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1502 }
1503
1504 /* OMPT state */
1505 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1506 } else {
1507 exit_frame_p = &dummy;
1508 }
1509#endif
1510 // AC: need to decrement t_serialized for enquiry functions to work
1511 // correctly, will restore at join time
1512 parent_team->t.t_serialized--;
1513
1514 {
1515 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1516 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1517 __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1518#if OMPT_SUPPORT
1519 ,
1520 exit_frame_p
1521#endif
1522 );
1523 }
1524
1525#if OMPT_SUPPORT
1526 if (ompt_enabled.enabled) {
1527 *exit_frame_p = NULL;
1528 OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1529 if (ompt_enabled.ompt_callback_implicit_task) {
1530 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1531 ompt_scope_end, NULL, implicit_task_data, 1,
1532 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1533 }
1534 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1535 __ompt_lw_taskteam_unlink(master_th);
1536 if (ompt_enabled.ompt_callback_parallel_end) {
1537 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1538 &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1539 OMPT_INVOKER(call_context) | ompt_parallel_team,
1540 return_address);
1541 }
1542 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1543 }
1544#endif
1545 return TRUE;
1546 }
1547
1548 parent_team->t.t_pkfn = microtask;
1549 parent_team->t.t_invoke = invoker;
1550 KMP_ATOMIC_INC(&root->r.r_in_parallel);
1551 parent_team->t.t_active_level++;
1552 parent_team->t.t_level++;
1553 parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1554
1555#if OMPT_SUPPORT
1556 if (ompt_enabled.enabled) {
1557 ompt_lw_taskteam_t lw_taskteam;
1558 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1559 &ompt_parallel_data, return_address);
1560 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
1561 }
1562#endif
1563
1564 /* Change number of threads in the team if requested */
1565 if (master_set_numthreads) { // The parallel has num_threads clause
1566 if (master_set_numthreads < master_th->th.th_teams_size.nth) {
1567 // AC: we can only reduce the number of threads dynamically, we can't increase it
1568 kmp_info_t **other_threads = parent_team->t.t_threads;
1569 parent_team->t.t_nproc = master_set_numthreads;
1570 for (i = 0; i < master_set_numthreads; ++i) {
1571 other_threads[i]->th.th_team_nproc = master_set_numthreads;
1572 }
1573 // Keep extra threads hot in the team for possible next parallels
1574 }
1575 master_th->th.th_set_nproc = 0;
1576 }
1577
1578#if USE_DEBUGGER
1579 if (__kmp_debugging) { // Let debugger override number of threads.
1580 int nth = __kmp_omp_num_threads(loc);
1581 if (nth > 0) { // 0 means debugger doesn't want to change num threads
1582 master_set_numthreads = nth;
1583 }
1584 }
1585#endif
1586
1587#if USE_ITT_BUILD && USE_ITT_NOTIFY
1588 if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
1589 KMP_ITT_DEBUG) &&
1590 __kmp_forkjoin_frames_mode == 3 &&
1591 parent_team->t.t_active_level == 1 // only report frames at level 1
1592 && master_th->th.th_teams_size.nteams == 1) {
1593 kmp_uint64 tmp_time = __itt_get_timestamp();
1594 master_th->th.th_frame_time = tmp_time;
1595 parent_team->t.t_region_time = tmp_time;
1596 }
1597 if (__itt_stack_caller_create_ptr) {
1598 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
1599 // create new stack stitching id before entering fork barrier
1600 parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
1601 }
1602#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1603
1604 KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
1605 "master_th=%p, gtid=%d\n",
1606 root, parent_team, master_th, gtid));
1607 __kmp_internal_fork(loc, gtid, parent_team);
1608 KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
1609 "master_th=%p, gtid=%d\n",
1610 root, parent_team, master_th, gtid));
1611
1612 if (call_context == fork_context_gnu)
1613 return TRUE;
1614
1615 /* Invoke microtask for PRIMARY thread */
1616 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
1617 parent_team->t.t_id, parent_team->t.t_pkfn));
1618
1619 if (!parent_team->t.t_invoke(gtid)) {
1620 KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
1621 }
1622 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
1623 parent_team->t.t_id, parent_team->t.t_pkfn));
1624 KMP_MB(); /* Flush all pending memory write invalidates. */
1625
1626 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
1627
1628 return TRUE;
1629 } // Parallel closely nested in teams construct
1630
1631#if KMP_DEBUG
1632 if (__kmp_tasking_mode != tskm_immediate_exec) {
1633 KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1634 parent_team->t.t_task_team[master_th->th.th_task_state]);
1635 }
1636#endif
1637
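  // The number of threads for the new team is decided below: a num_threads
  // clause (master_set_numthreads) takes precedence over the nproc ICV of the
  // encountering task; the region is then serialized (nthreads == 1) if the
  // max-active-levels limit has been reached, if the library mode is serial,
  // or if __kmp_reserve_threads() cannot grant more than one thread.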
1638 int enter_teams = 0;
1639 if (parent_team->t.t_active_level >=
1640 master_th->th.th_current_task->td_icvs.max_active_levels) {
1641 nthreads = 1;
1642 } else {
1643 enter_teams = ((ap == NULL && active_level == 0) ||
1644 (ap && teams_level > 0 && teams_level == level));
1645 nthreads =
1646 master_set_numthreads
1647 ? master_set_numthreads
1648 : get__nproc_2(
1649 parent_team,
1650 master_tid); // TODO: get nproc directly from current task
1651
1652      // Check whether we need to take the forkjoin lock (no need for a
1653      // serialized parallel outside of a teams construct). This code was moved
1654      // here from __kmp_reserve_threads() to speed up nested serialized parallels.
1655 if (nthreads > 1) {
1656 if ((get__max_active_levels(master_th) == 1 &&
1657 (root->r.r_in_parallel && !enter_teams)) ||
1658 (__kmp_library == library_serial)) {
1659 KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
1660 " threads\n",
1661 gtid, nthreads));
1662 nthreads = 1;
1663 }
1664 }
1665 if (nthreads > 1) {
1666 /* determine how many new threads we can use */
1667 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1668      /* AC: If we execute teams from a parallel region (on the host), then the
1669         teams should be created, but each can have only 1 thread if nesting is
1670         disabled. If teams is called from a serial region, then the teams and
1671         their threads should be created regardless of the nesting setting. */
1672 nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
1673 nthreads, enter_teams);
1674 if (nthreads == 1) {
1675 // Free lock for single thread execution here; for multi-thread
1676 // execution it will be freed later after team of threads created
1677 // and initialized
1678 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1679 }
1680 }
1681 }
1682 KMP_DEBUG_ASSERT(nthreads > 0);
1683
1684 // If we temporarily changed the set number of threads then restore it now
1685 master_th->th.th_set_nproc = 0;
1686
1687 /* create a serialized parallel region? */
1688 if (nthreads == 1) {
1689/* josh todo: hypothetical question: what do we do for OS X*? */
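/* The temporary argument array below lives on the stack: a variable-length
   array on targets where the build relies on that extension, KMP_ALLOCA
   otherwise. Either way it only needs to survive the serialized invocation
   below. */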
1690#if KMP_OS_LINUX && \
1691 (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1692 void *args[argc];
1693#else
1694 void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1695#endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1696 KMP_ARCH_AARCH64) */
1697
1698 KA_TRACE(20,
1699 ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
1700
1701 __kmpc_serialized_parallel(loc, gtid);
1702
1703#if OMPD_SUPPORT
1704 master_th->th.th_serial_team->t.t_pkfn = microtask;
1705#endif
1706
1707 if (call_context == fork_context_intel) {
1708 /* TODO this sucks, use the compiler itself to pass args! :) */
1709 master_th->th.th_serial_team->t.t_ident = loc;
1710 if (!ap) {
1711 // revert change made in __kmpc_serialized_parallel()
1712 master_th->th.th_serial_team->t.t_level--;
1713 // Get args from parent team for teams construct
1714
1715#if OMPT_SUPPORT
1716 void *dummy;
1717 void **exit_frame_p;
1718 ompt_task_info_t *task_info;
1719
1720 ompt_lw_taskteam_t lw_taskteam;
1721
1722 if (ompt_enabled.enabled) {
1723 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1724 &ompt_parallel_data, return_address);
1725
1726 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1727          // don't use lw_taskteam after linking; its content was swapped
1728
1729 task_info = OMPT_CUR_TASK_INFO(master_th);
1730 exit_frame_p = &(task_info->frame.exit_frame.ptr);
1731 if (ompt_enabled.ompt_callback_implicit_task) {
1732 OMPT_CUR_TASK_INFO(master_th)->thread_num =
1733 __kmp_tid_from_gtid(gtid);
1734 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1735 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1736 &(task_info->task_data), 1,
1737 OMPT_CUR_TASK_INFO(master_th)->thread_num,
1738 ompt_task_implicit);
1739 }
1740
1741 /* OMPT state */
1742 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1743 } else {
1744 exit_frame_p = &dummy;
1745 }
1746#endif
1747
1748 {
1749 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1750 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1751 __kmp_invoke_microtask(microtask, gtid, 0, argc,
1752 parent_team->t.t_argv
1753#if OMPT_SUPPORT
1754 ,
1755 exit_frame_p
1756#endif
1757 );
1758 }
1759
1760#if OMPT_SUPPORT
1761 if (ompt_enabled.enabled) {
1762 *exit_frame_p = NULL;
1763 if (ompt_enabled.ompt_callback_implicit_task) {
1764 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1765 ompt_scope_end, NULL, &(task_info->task_data), 1,
1766 OMPT_CUR_TASK_INFO(master_th)->thread_num,
1767 ompt_task_implicit);
1768 }
1769 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1770 __ompt_lw_taskteam_unlink(master_th);
1771 if (ompt_enabled.ompt_callback_parallel_end) {
1772 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1773 &ompt_parallel_data, parent_task_data,
1774 OMPT_INVOKER(call_context) | ompt_parallel_team,
1775 return_address);
1776 }
1777 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1778 }
1779#endif
1780 } else if (microtask == (microtask_t)__kmp_teams_master) {
1781 KMP_DEBUG_ASSERT(master_th->th.th_team ==
1782 master_th->th.th_serial_team);
1783 team = master_th->th.th_team;
1784 // team->t.t_pkfn = microtask;
1785 team->t.t_invoke = invoker;
1786 __kmp_alloc_argv_entries(argc, team, TRUE);
1787 team->t.t_argc = argc;
1788 argv = (void **)team->t.t_argv;
1789 if (ap) {
1790 for (i = argc - 1; i >= 0; --i)
1791 *argv++ = va_arg(kmp_va_deref(ap), void *);
1792 } else {
1793 for (i = 0; i < argc; ++i)
1794 // Get args from parent team for teams construct
1795 argv[i] = parent_team->t.t_argv[i];
1796 }
1797 // AC: revert change made in __kmpc_serialized_parallel()
1798 // because initial code in teams should have level=0
1799 team->t.t_level--;
1800 // AC: call special invoker for outer "parallel" of teams construct
1801 invoker(gtid);
1802#if OMPT_SUPPORT
1803 if (ompt_enabled.enabled) {
1804 ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1805 if (ompt_enabled.ompt_callback_implicit_task) {
1806 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1807 ompt_scope_end, NULL, &(task_info->task_data), 0,
1808 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1809 }
1810 if (ompt_enabled.ompt_callback_parallel_end) {
1811 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1812 &ompt_parallel_data, parent_task_data,
1813 OMPT_INVOKER(call_context) | ompt_parallel_league,
1814 return_address);
1815 }
1816 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1817 }
1818#endif
1819 } else {
1820 argv = args;
1821 for (i = argc - 1; i >= 0; --i)
1822 *argv++ = va_arg(kmp_va_deref(ap), void *);
1823 KMP_MB();
1824
1825#if OMPT_SUPPORT
1826 void *dummy;
1827 void **exit_frame_p;
1828 ompt_task_info_t *task_info;
1829
1830 ompt_lw_taskteam_t lw_taskteam;
1831
1832 if (ompt_enabled.enabled) {
1833 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1834 &ompt_parallel_data, return_address);
1835 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1836            // don't use lw_taskteam after linking; its content was swapped
1837 task_info = OMPT_CUR_TASK_INFO(master_th);
1838 exit_frame_p = &(task_info->frame.exit_frame.ptr);
1839
1840 /* OMPT implicit task begin */
1841 implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1842 if (ompt_enabled.ompt_callback_implicit_task) {
1843 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1844 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1845 implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1846 ompt_task_implicit);
1847 OMPT_CUR_TASK_INFO(master_th)->thread_num =
1848 __kmp_tid_from_gtid(gtid);
1849 }
1850
1851 /* OMPT state */
1852 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1853 } else {
1854 exit_frame_p = &dummy;
1855 }
1856#endif
1857
1858 {
1859 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1860 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1861 __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1862#if OMPT_SUPPORT
1863 ,
1864 exit_frame_p
1865#endif
1866 );
1867 }
1868
1869#if OMPT_SUPPORT
1870 if (ompt_enabled.enabled) {
1871 *exit_frame_p = NULL;
1872 if (ompt_enabled.ompt_callback_implicit_task) {
1873 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1874 ompt_scope_end, NULL, &(task_info->task_data), 1,
1875 OMPT_CUR_TASK_INFO(master_th)->thread_num,
1876 ompt_task_implicit);
1877 }
1878
1879 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1880 __ompt_lw_taskteam_unlink(master_th);
1881 if (ompt_enabled.ompt_callback_parallel_end) {
1882 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1883 &ompt_parallel_data, parent_task_data,
1884 OMPT_INVOKER(call_context) | ompt_parallel_team,
1885 return_address);
1886 }
1887 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1888 }
1889#endif
1890 }
1891 } else if (call_context == fork_context_gnu) {
1892#if OMPT_SUPPORT
1893 ompt_lw_taskteam_t lwt;
1894 __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
1895 return_address);
1896
1897 lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1898 __ompt_lw_taskteam_link(&lwt, master_th, 1);
1899// don't use lw_taskteam after linking; its content was swapped
1900#endif
1901
1902 // we were called from GNU native code
1903 KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1904 return FALSE;
1905 } else {
1906 KMP_ASSERT2(call_context < fork_context_last,
1907 "__kmp_fork_call: unknown fork_context parameter");
1908 }
1909
1910 KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1911 KMP_MB();
1912 return FALSE;
1913 } // if (nthreads == 1)
1914
1915  // GEH: only modify the executing flag in the non-serialized case; the
1916  // serialized case is handled in __kmpc_serialized_parallel
1917 KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1918 "curtask=%p, curtask_max_aclevel=%d\n",
1919 parent_team->t.t_active_level, master_th,
1920 master_th->th.th_current_task,
1921 master_th->th.th_current_task->td_icvs.max_active_levels));
1922 // TODO: GEH - cannot do this assertion because root thread not set up as
1923 // executing
1924 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1925 master_th->th.th_current_task->td_flags.executing = 0;
1926
1927 if (!master_th->th.th_teams_microtask || level > teams_level) {
1928 /* Increment our nested depth level */
1929 KMP_ATOMIC_INC(&root->r.r_in_parallel);
1930 }
1931
1932 // See if we need to make a copy of the ICVs.
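  // For example (illustrative), with a nested list such as OMP_NUM_THREADS="4,2"
  // a fork from level 0 picks up 2 as the nproc ICV for the new team (when it
  // differs from the current value); otherwise nthreads_icv stays 0 and the
  // parent's value is inherited unchanged.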
1933 int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1934 if ((level + 1 < __kmp_nested_nth.used) &&
1935 (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
1936 nthreads_icv = __kmp_nested_nth.nth[level + 1];
1937 } else {
1938 nthreads_icv = 0; // don't update
1939 }
1940
1941 // Figure out the proc_bind_policy for the new team.
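  // Resolution below: if proc-bind-var is false, binding is disabled for this
  // region regardless of any clause; otherwise a proc_bind clause (already
  // stored in th_set_proc_bind) overrides proc-bind-var for this region only.
  // E.g., with a nested list such as OMP_PROC_BIND="spread,close" the child
  // threads' proc-bind-var becomes 'close' one level further in.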
1942 kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1943 kmp_proc_bind_t proc_bind_icv =
1944 proc_bind_default; // proc_bind_default means don't update
1945 if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1946 proc_bind = proc_bind_false;
1947 } else {
1948 if (proc_bind == proc_bind_default) {
1949 // No proc_bind clause specified; use current proc-bind-var for this
1950 // parallel region
1951 proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1952 }
1953 /* else: The proc_bind policy was specified explicitly on parallel clause.
1954 This overrides proc-bind-var for this parallel region, but does not
1955 change proc-bind-var. */
1956 // Figure the value of proc-bind-var for the child threads.
1957 if ((level + 1 < __kmp_nested_proc_bind.used) &&
1958 (__kmp_nested_proc_bind.bind_types[level + 1] !=
1959 master_th->th.th_current_task->td_icvs.proc_bind)) {
1960 proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1961 }
1962 }
1963
1964 // Reset for next parallel region
1965 master_th->th.th_set_proc_bind = proc_bind_default;
1966
1967 if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
1968 kmp_internal_control_t new_icvs;
1969 copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
1970 new_icvs.next = NULL;
1971 if (nthreads_icv > 0) {
1972 new_icvs.nproc = nthreads_icv;
1973 }
1974 if (proc_bind_icv != proc_bind_default) {
1975 new_icvs.proc_bind = proc_bind_icv;
1976 }
1977
1978 /* allocate a new parallel team */
1979 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
1980 team = __kmp_allocate_team(root, nthreads, nthreads,
1981#if OMPT_SUPPORT
1982 ompt_parallel_data,
1983#endif
1984 proc_bind, &new_icvs,
1985 argc USE_NESTED_HOT_ARG(master_th));
1986 } else {
1987 /* allocate a new parallel team */
1988 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
1989 team = __kmp_allocate_team(root, nthreads, nthreads,
1990#if OMPT_SUPPORT
1991 ompt_parallel_data,
1992#endif
1993 proc_bind,
1994 &master_th->th.th_current_task->td_icvs,
1995 argc USE_NESTED_HOT_ARG(master_th));
1996 }
1997 KF_TRACE(
1998 10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
1999
2000 /* setup the new team */
2001 KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2002 KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2003 KMP_CHECK_UPDATE(team->t.t_ident, loc);
2004 KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2005 KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2006#if OMPT_SUPPORT
2007 KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2008 return_address);
2009#endif
2010 KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2011 // TODO: parent_team->t.t_level == INT_MAX ???
2012 if (!master_th->th.th_teams_microtask || level > teams_level) {
2013 int new_level = parent_team->t.t_level + 1;
2014 KMP_CHECK_UPDATE(team->t.t_level, new_level);
2015 new_level = parent_team->t.t_active_level + 1;
2016 KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2017 } else {
2018 // AC: Do not increase parallel level at start of the teams construct
2019 int new_level = parent_team->t.t_level;
2020 KMP_CHECK_UPDATE(team->t.t_level, new_level);
2021 new_level = parent_team->t.t_active_level;
2022 KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2023 }
2024 kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2025 // set primary thread's schedule as new run-time schedule
2026 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2027
2028 KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2029 KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2030
2031 // Update the floating point rounding in the team if required.
2032 propagateFPControl(team);
2033#if OMPD_SUPPORT
2034 if (ompd_state & OMPD_ENABLE_BP)
2035 ompd_bp_parallel_begin();
2036#endif
2037
2038 if (__kmp_tasking_mode != tskm_immediate_exec) {
2039    // Set the primary thread's task team to the team's task team. Unless this
2040    // is a hot team, it should be NULL.
2041 KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2042 parent_team->t.t_task_team[master_th->th.th_task_state]);
2043 KA_TRACE(20, ("__kmp_fork_call: Primary T#%d pushing task_team %p / team "
2044 "%p, new task_team %p / team %p\n",
2045 __kmp_gtid_from_thread(master_th),
2046 master_th->th.th_task_team, parent_team,
2047 team->t.t_task_team[master_th->th.th_task_state], team));
2048
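    // The primary thread's th_task_state is saved on a small memo stack before
    // it enters the new region and is restored at join (see __kmp_join_call);
    // the stack is grown by doubling whenever the current depth reaches its
    // size.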
2049 if (active_level || master_th->th.th_task_team) {
2050 // Take a memo of primary thread's task_state
2051 KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2052 if (master_th->th.th_task_state_top >=
2053 master_th->th.th_task_state_stack_sz) { // increase size
2054 kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2055 kmp_uint8 *old_stack, *new_stack;
2056 kmp_uint32 i;
2057 new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2058 for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2059 new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2060 }
2061 for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2062 ++i) { // zero-init rest of stack
2063 new_stack[i] = 0;
2064 }
2065 old_stack = master_th->th.th_task_state_memo_stack;
2066 master_th->th.th_task_state_memo_stack = new_stack;
2067 master_th->th.th_task_state_stack_sz = new_size;
2068 __kmp_free(old_stack);
2069 }
2070 // Store primary thread's task_state on stack
2071 master_th->th
2072 .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2073 master_th->th.th_task_state;
2074 master_th->th.th_task_state_top++;
2075#if KMP_NESTED_HOT_TEAMS
2076 if (master_th->th.th_hot_teams &&
2077 active_level < __kmp_hot_teams_max_level &&
2078 team == master_th->th.th_hot_teams[active_level].hot_team) {
2079 // Restore primary thread's nested state if nested hot team
2080 master_th->th.th_task_state =
2081 master_th->th
2082 .th_task_state_memo_stack[master_th->th.th_task_state_top];
2083 } else {
2084#endif
2085 master_th->th.th_task_state = 0;
2086#if KMP_NESTED_HOT_TEAMS
2087 }
2088#endif
2089 }
2090#if !KMP_NESTED_HOT_TEAMS
2091 KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2092 (team == root->r.r_hot_team));
2093#endif
2094 }
2095
2096 KA_TRACE(
2097 20,
2098 ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2099 gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2100 team->t.t_nproc));
2101 KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2102 (team->t.t_master_tid == 0 &&
2103 (team->t.t_parent == root->r.r_root_team ||
2104 team->t.t_parent->t.t_serialized)));
2105 KMP_MB();
2106
2107 /* now, setup the arguments */
2108 argv = (void **)team->t.t_argv;
2109 if (ap) {
2110 for (i = argc - 1; i >= 0; --i) {
2111 void *new_argv = va_arg(kmp_va_deref(ap), void *);
2112 KMP_CHECK_UPDATE(*argv, new_argv);
2113 argv++;
2114 }
2115 } else {
2116 for (i = 0; i < argc; ++i) {
2117 // Get args from parent team for teams construct
2118 KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2119 }
2120 }
2121
2122 /* now actually fork the threads */
2123 KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2124 if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2125 root->r.r_active = TRUE;
2126
2127 __kmp_fork_team_threads(root, team, master_th, gtid);
2128 __kmp_setup_icv_copy(team, nthreads,
2129 &master_th->th.th_current_task->td_icvs, loc);
2130
2131#if OMPT_SUPPORT
2132 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2133#endif
2134
2135 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2136
2137#if USE_ITT_BUILD
2138 if (team->t.t_active_level == 1 // only report frames at level 1
2139 && !master_th->th.th_teams_microtask) { // not in teams construct
2140#if USE_ITT_NOTIFY
2141 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2142 (__kmp_forkjoin_frames_mode == 3 ||
2143 __kmp_forkjoin_frames_mode == 1)) {
2144 kmp_uint64 tmp_time = 0;
2145 if (__itt_get_timestamp_ptr)
2146 tmp_time = __itt_get_timestamp();
2147 // Internal fork - report frame begin
2148 master_th->th.th_frame_time = tmp_time;
2149 if (__kmp_forkjoin_frames_mode == 3)
2150 team->t.t_region_time = tmp_time;
2151 } else
2152// only one notification scheme (either "submit" or "forking/joined", not both)
2153#endif /* USE_ITT_NOTIFY */
2154 if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2155 __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2156 // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2157 __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2158 }
2159 }
2160#endif /* USE_ITT_BUILD */
2161
2162 /* now go on and do the work */
2163 KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2164 KMP_MB();
2165 KF_TRACE(10,
2166 ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2167 root, team, master_th, gtid));
2168
2169#if USE_ITT_BUILD
2170 if (__itt_stack_caller_create_ptr) {
2171 // create new stack stitching id before entering fork barrier
2172 if (!enter_teams) {
2173 KMP_DEBUG_ASSERT(team->t.t_stack_id == NULL);
2174 team->t.t_stack_id = __kmp_itt_stack_caller_create();
2175 } else if (parent_team->t.t_serialized) {
2176 // keep stack stitching id in the serialized parent_team;
2177 // current team will be used for parallel inside the teams;
2178 // if parent_team is active, then it already keeps stack stitching id
2179 // for the league of teams
2180 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
2181 parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
2182 }
2183 }
2184#endif /* USE_ITT_BUILD */
2185
2186 // AC: skip __kmp_internal_fork at teams construct, let only primary
2187 // threads execute
2188 if (ap) {
2189 __kmp_internal_fork(loc, gtid, team);
2190 KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2191 "master_th=%p, gtid=%d\n",
2192 root, team, master_th, gtid));
2193 }
2194
2195 if (call_context == fork_context_gnu) {
2196 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2197 return TRUE;
2198 }
2199
2200 /* Invoke microtask for PRIMARY thread */
2201 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2202 team->t.t_id, team->t.t_pkfn));
2203 } // END of timer KMP_fork_call block
2204
2205#if KMP_STATS_ENABLED
2206 // If beginning a teams construct, then change thread state
2207 stats_state_e previous_state = KMP_GET_THREAD_STATE();
2208 if (!ap) {
2209 KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2210 }
2211#endif
2212
2213 if (!team->t.t_invoke(gtid)) {
2214 KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
2215 }
2216
2217#if KMP_STATS_ENABLED
2218 // If was beginning of a teams construct, then reset thread state
2219 if (!ap) {
2220 KMP_SET_THREAD_STATE(previous_state);
2221 }
2222#endif
2223
2224 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2225 team->t.t_id, team->t.t_pkfn));
2226 KMP_MB(); /* Flush all pending memory write invalidates. */
2227
2228 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2229#if OMPT_SUPPORT
2230 if (ompt_enabled.enabled) {
2231 master_th->th.ompt_thread_info.state = ompt_state_overhead;
2232 }
2233#endif
2234
2235 return TRUE;
2236}
2237
2238#if OMPT_SUPPORT
2239static inline void __kmp_join_restore_state(kmp_info_t *thread,
2240 kmp_team_t *team) {
2241 // restore state outside the region
2242 thread->th.ompt_thread_info.state =
2243 ((team->t.t_serialized) ? ompt_state_work_serial
2244 : ompt_state_work_parallel);
2245}
2246
2247static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2248 kmp_team_t *team, ompt_data_t *parallel_data,
2249 int flags, void *codeptr) {
2250 ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2251 if (ompt_enabled.ompt_callback_parallel_end) {
2252 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2253 parallel_data, &(task_info->task_data), flags, codeptr);
2254 }
2255
2256 task_info->frame.enter_frame = ompt_data_none;
2257 __kmp_join_restore_state(thread, team);
2258}
2259#endif
2260
2261void __kmp_join_call(ident_t *loc, int gtid
2262#if OMPT_SUPPORT
2263 ,
2264 enum fork_context_e fork_context
2265#endif
2266 ,
2267 int exit_teams) {
2268 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2269 kmp_team_t *team;
2270 kmp_team_t *parent_team;
2271 kmp_info_t *master_th;
2272 kmp_root_t *root;
2273 int master_active;
2274
2275 KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2276
2277 /* setup current data */
2278 master_th = __kmp_threads[gtid];
2279 root = master_th->th.th_root;
2280 team = master_th->th.th_team;
2281 parent_team = team->t.t_parent;
2282
2283 master_th->th.th_ident = loc;
2284
2285#if OMPT_SUPPORT
2286 void *team_microtask = (void *)team->t.t_pkfn;
2287  // For the GOMP interface with a serialized parallel, we rely on
2288  // __kmpc_end_serialized_parallel to call the hooks for the OMPT
2289  // end-implicit-task and end-parallel events.
2290 if (ompt_enabled.enabled &&
2291 !(team->t.t_serialized && fork_context == fork_context_gnu)) {
2292 master_th->th.ompt_thread_info.state = ompt_state_overhead;
2293 }
2294#endif
2295
2296#if KMP_DEBUG
2297 if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2298 KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2299 "th_task_team = %p\n",
2300 __kmp_gtid_from_thread(master_th), team,
2301 team->t.t_task_team[master_th->th.th_task_state],
2302 master_th->th.th_task_team));
2303 KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2304 team->t.t_task_team[master_th->th.th_task_state]);
2305 }
2306#endif
2307
2308 if (team->t.t_serialized) {
2309 if (master_th->th.th_teams_microtask) {
2310 // We are in teams construct
2311 int level = team->t.t_level;
2312 int tlevel = master_th->th.th_teams_level;
2313 if (level == tlevel) {
2314 // AC: we haven't incremented it earlier at start of teams construct,
2315 // so do it here - at the end of teams construct
2316 team->t.t_level++;
2317 } else if (level == tlevel + 1) {
2318 // AC: we are exiting parallel inside teams, need to increment
2319 // serialization in order to restore it in the next call to
2320 // __kmpc_end_serialized_parallel
2321 team->t.t_serialized++;
2322 }
2323 }
2324    __kmpc_end_serialized_parallel(loc, gtid);
2325
2326#if OMPT_SUPPORT
2327 if (ompt_enabled.enabled) {
2328 __kmp_join_restore_state(master_th, parent_team);
2329 }
2330#endif
2331
2332 return;
2333 }
2334
2335 master_active = team->t.t_master_active;
2336
2337 if (!exit_teams) {
2338 // AC: No barrier for internal teams at exit from teams construct.
2339 // But there is barrier for external team (league).
2340 __kmp_internal_join(loc, gtid, team);
2341#if USE_ITT_BUILD
2342 if (__itt_stack_caller_create_ptr) {
2343 KMP_DEBUG_ASSERT(team->t.t_stack_id != NULL);
2344 // destroy the stack stitching id after join barrier
2345 __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id);
2346 team->t.t_stack_id = NULL;
2347 }
2348#endif
2349 } else {
2350 master_th->th.th_task_state =
2351 0; // AC: no tasking in teams (out of any parallel)
2352#if USE_ITT_BUILD
2353 if (__itt_stack_caller_create_ptr && parent_team->t.t_serialized) {
2354 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id != NULL);
2355 // destroy the stack stitching id on exit from the teams construct
2356 // if parent_team is active, then the id will be destroyed later on
2357 // by master of the league of teams
2358 __kmp_itt_stack_caller_destroy((__itt_caller)parent_team->t.t_stack_id);
2359 parent_team->t.t_stack_id = NULL;
2360 }
2361#endif
2362 }
2363
2364 KMP_MB();
2365
2366#if OMPT_SUPPORT
2367 ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2368 void *codeptr = team->t.ompt_team_info.master_return_address;
2369#endif
2370
2371#if USE_ITT_BUILD
2372 // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2373 if (team->t.t_active_level == 1 &&
2374 (!master_th->th.th_teams_microtask || /* not in teams construct */
2375 master_th->th.th_teams_size.nteams == 1)) {
2376 master_th->th.th_ident = loc;
2377 // only one notification scheme (either "submit" or "forking/joined", not
2378 // both)
2379 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2380 __kmp_forkjoin_frames_mode == 3)
2381 __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2382 master_th->th.th_frame_time, 0, loc,
2383 master_th->th.th_team_nproc, 1);
2384 else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2385 !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2386 __kmp_itt_region_joined(gtid);
2387 } // active_level == 1
2388#endif /* USE_ITT_BUILD */
2389
2390 if (master_th->th.th_teams_microtask && !exit_teams &&
2391 team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2392 team->t.t_level == master_th->th.th_teams_level + 1) {
2393// AC: We need to leave the team structure intact at the end of a parallel
2394// inside the teams construct, so that the next parallel reuses the same (hot)
2395// team; only the nesting levels are adjusted here.
2396#if OMPT_SUPPORT
2397 ompt_data_t ompt_parallel_data = ompt_data_none;
2398 if (ompt_enabled.enabled) {
2399 ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2400 if (ompt_enabled.ompt_callback_implicit_task) {
2401 int ompt_team_size = team->t.t_nproc;
2402 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2403 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2404 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
2405 }
2406 task_info->frame.exit_frame = ompt_data_none;
2407 task_info->task_data = ompt_data_none;
2408 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
2409 __ompt_lw_taskteam_unlink(master_th);
2410 }
2411#endif
2412 /* Decrement our nested depth level */
2413 team->t.t_level--;
2414 team->t.t_active_level--;
2415 KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2416
2417 // Restore number of threads in the team if needed. This code relies on
2418 // the proper adjustment of th_teams_size.nth after the fork in
2419 // __kmp_teams_master on each teams primary thread in the case that
2420 // __kmp_reserve_threads reduced it.
2421 if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2422 int old_num = master_th->th.th_team_nproc;
2423 int new_num = master_th->th.th_teams_size.nth;
2424 kmp_info_t **other_threads = team->t.t_threads;
2425 team->t.t_nproc = new_num;
2426 for (int i = 0; i < old_num; ++i) {
2427 other_threads[i]->th.th_team_nproc = new_num;
2428 }
2429      // Adjust the states of the currently unused threads of the team
2430 for (int i = old_num; i < new_num; ++i) {
2431 // Re-initialize thread's barrier data.
2432 KMP_DEBUG_ASSERT(other_threads[i]);
2433 kmp_balign_t *balign = other_threads[i]->th.th_bar;
2434 for (int b = 0; b < bs_last_barrier; ++b) {
2435 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2436 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2437#if USE_DEBUGGER
2438 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2439#endif
2440 }
2441 if (__kmp_tasking_mode != tskm_immediate_exec) {
2442 // Synchronize thread's task state
2443 other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2444 }
2445 }
2446 }
2447
2448#if OMPT_SUPPORT
2449 if (ompt_enabled.enabled) {
2450 __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
2451 OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
2452 }
2453#endif
2454
2455 return;
2456 }
2457
2458 /* do cleanup and restore the parent team */
2459 master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2460 master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2461
2462 master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2463
2464 /* jc: The following lock has instructions with REL and ACQ semantics,
2465 separating the parallel user code called in this parallel region
2466 from the serial user code called after this function returns. */
2467 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2468
2469 if (!master_th->th.th_teams_microtask ||
2470 team->t.t_level > master_th->th.th_teams_level) {
2471 /* Decrement our nested depth level */
2472 KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2473 }
2474 KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2475
2476#if OMPT_SUPPORT
2477 if (ompt_enabled.enabled) {
2478 ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2479 if (ompt_enabled.ompt_callback_implicit_task) {
2480 int flags = (team_microtask == (void *)__kmp_teams_master)
2481 ? ompt_task_initial
2482 : ompt_task_implicit;
2483 int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
2484 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2485 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2486 OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
2487 }
2488 task_info->frame.exit_frame = ompt_data_none;
2489 task_info->task_data = ompt_data_none;
2490 }
2491#endif
2492
2493 KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2494 master_th, team));
2495 __kmp_pop_current_task_from_thread(master_th);
2496
2497#if KMP_AFFINITY_SUPPORTED
2498 // Restore master thread's partition.
2499 master_th->th.th_first_place = team->t.t_first_place;
2500 master_th->th.th_last_place = team->t.t_last_place;
2501#endif // KMP_AFFINITY_SUPPORTED
2502 master_th->th.th_def_allocator = team->t.t_def_allocator;
2503
2504#if OMPD_SUPPORT
2505 if (ompd_state & OMPD_ENABLE_BP)
2506 ompd_bp_parallel_end();
2507#endif
2508 updateHWFPControl(team);
2509
2510 if (root->r.r_active != master_active)
2511 root->r.r_active = master_active;
2512
2513 __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2514 master_th)); // this will free worker threads
2515
2516  /* This race was fun to find. Keep the following inside the critical region;
2517     otherwise assertions may occasionally fail because the old team may be
2518     reallocated and the hierarchy then appears inconsistent. Running it outside
2519     the lock is actually safe and won't cause any bugs, only those assertion
2520     failures. It's only one deref&assign, so keep it in the critical region. */
2521 master_th->th.th_team = parent_team;
2522 master_th->th.th_team_nproc = parent_team->t.t_nproc;
2523 master_th->th.th_team_master = parent_team->t.t_threads[0];
2524 master_th->th.th_team_serialized = parent_team->t.t_serialized;
2525
2526 /* restore serialized team, if need be */
2527 if (parent_team->t.t_serialized &&
2528 parent_team != master_th->th.th_serial_team &&
2529 parent_team != root->r.r_root_team) {
2530 __kmp_free_team(root,
2531 master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2532 master_th->th.th_serial_team = parent_team;
2533 }
2534
2535 if (__kmp_tasking_mode != tskm_immediate_exec) {
2536 if (master_th->th.th_task_state_top >
2537 0) { // Restore task state from memo stack
2538 KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2539 // Remember primary thread's state if we re-use this nested hot team
2540 master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2541 master_th->th.th_task_state;
2542 --master_th->th.th_task_state_top; // pop
2543 // Now restore state at this level
2544 master_th->th.th_task_state =
2545 master_th->th
2546 .th_task_state_memo_stack[master_th->th.th_task_state_top];
2547 }
2548 // Copy the task team from the parent team to the primary thread
2549 master_th->th.th_task_team =
2550 parent_team->t.t_task_team[master_th->th.th_task_state];
2551 KA_TRACE(20,
2552 ("__kmp_join_call: Primary T#%d restoring task_team %p, team %p\n",
2553 __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2554 parent_team));
2555 }
2556
2557 // TODO: GEH - cannot do this assertion because root thread not set up as
2558 // executing
2559 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2560 master_th->th.th_current_task->td_flags.executing = 1;
2561
2562 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2563
2564#if OMPT_SUPPORT
2565 int flags =
2566 OMPT_INVOKER(fork_context) |
2567 ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
2568 : ompt_parallel_team);
2569 if (ompt_enabled.enabled) {
2570 __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
2571 codeptr);
2572 }
2573#endif
2574
2575 KMP_MB();
2576 KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2577}
2578
2579/* Check whether we should push an internal control record onto the
2580 serial team stack. If so, do it. */
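/* A record is pushed at most once per serial nesting level (tracked through
   serial_nesting_level == t_serialized), so that ICV changes made inside a
   nested serialized parallel region can be restored when that region ends. */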
2581void __kmp_save_internal_controls(kmp_info_t *thread) {
2582
2583 if (thread->th.th_team != thread->th.th_serial_team) {
2584 return;
2585 }
2586 if (thread->th.th_team->t.t_serialized > 1) {
2587 int push = 0;
2588
2589 if (thread->th.th_team->t.t_control_stack_top == NULL) {
2590 push = 1;
2591 } else {
2592 if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2593 thread->th.th_team->t.t_serialized) {
2594 push = 1;
2595 }
2596 }
2597 if (push) { /* push a record on the serial team's stack */
2598 kmp_internal_control_t *control =
2599 (kmp_internal_control_t *)__kmp_allocate(
2600 sizeof(kmp_internal_control_t));
2601
2602 copy_icvs(control, &thread->th.th_current_task->td_icvs);
2603
2604 control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2605
2606 control->next = thread->th.th_team->t.t_control_stack_top;
2607 thread->th.th_team->t.t_control_stack_top = control;
2608 }
2609 }
2610}
2611
2612/* Changes set_nproc */
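/* Illustrative usage, assuming the usual mapping of the user-facing
   omp_set_num_threads() entry point onto this routine:
     omp_set_num_threads(2);  // hot team is trimmed right away if it is larger
                              // (and no parallel region is currently active)
     #pragma omp parallel     // next region starts with at most 2 threads
   The value is clamped to the range [1, __kmp_max_nth]. */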
2613void __kmp_set_num_threads(int new_nth, int gtid) {
2614 kmp_info_t *thread;
2615 kmp_root_t *root;
2616
2617 KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2618 KMP_DEBUG_ASSERT(__kmp_init_serial);
2619
2620 if (new_nth < 1)
2621 new_nth = 1;
2622 else if (new_nth > __kmp_max_nth)
2623 new_nth = __kmp_max_nth;
2624
2625 KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2626 thread = __kmp_threads[gtid];
2627 if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2628 return; // nothing to do
2629
2630 __kmp_save_internal_controls(thread);
2631
2632 set__nproc(thread, new_nth);
2633
2634 // If this omp_set_num_threads() call will cause the hot team size to be
2635 // reduced (in the absence of a num_threads clause), then reduce it now,
2636 // rather than waiting for the next parallel region.
2637 root = thread->th.th_root;
2638 if (__kmp_init_parallel && (!root->r.r_active) &&
2639 (root->r.r_hot_team->t.t_nproc > new_nth)
2640#if KMP_NESTED_HOT_TEAMS
2641 && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2642#endif
2643 ) {
2644 kmp_team_t *hot_team = root->r.r_hot_team;
2645 int f;
2646
2647 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2648
2649 // Release the extra threads we don't need any more.
2650 for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2651 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2652 if (__kmp_tasking_mode != tskm_immediate_exec) {
2653 // When decreasing team size, threads no longer in the team should unref
2654 // task team.
2655 hot_team->t.t_threads[f]->th.th_task_team = NULL;
2656 }
2657 __kmp_free_thread(hot_team->t.t_threads[f]);
2658 hot_team->t.t_threads[f] = NULL;
2659 }
2660 hot_team->t.t_nproc = new_nth;
2661#if KMP_NESTED_HOT_TEAMS
2662 if (thread->th.th_hot_teams) {
2663 KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2664 thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2665 }
2666#endif
2667
2668 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2669
2670 // Update the t_nproc field in the threads that are still active.
2671 for (f = 0; f < new_nth; f++) {
2672 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2673 hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2674 }
2675    // Special flag marking that the team size was changed by an
2676    // omp_set_num_threads() call
2676 hot_team->t.t_size_changed = -1;
2677 }
2678}
2679
2680/* Changes max_active_levels */
2681void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2682 kmp_info_t *thread;
2683
2684 KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2685 "%d = (%d)\n",
2686 gtid, max_active_levels));
2687 KMP_DEBUG_ASSERT(__kmp_init_serial);
2688
2689 // validate max_active_levels
2690 if (max_active_levels < 0) {
2691 KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2692 // We ignore this call if the user has specified a negative value.
2693 // The current setting won't be changed. The last valid setting will be
2694 // used. A warning will be issued (if warnings are allowed as controlled by
2695 // the KMP_WARNINGS env var).
2696 KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2697 "max_active_levels for thread %d = (%d)\n",
2698 gtid, max_active_levels));
2699 return;
2700 }
2701 if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2702 // it's OK, the max_active_levels is within the valid range: [ 0;
2703 // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2704 // We allow a zero value. (implementation defined behavior)
2705 } else {
2706 KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2707 KMP_MAX_ACTIVE_LEVELS_LIMIT);
2708 max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2709 // Current upper limit is MAX_INT. (implementation defined behavior)
2710 // If the input exceeds the upper limit, we correct the input to be the
2711 // upper limit. (implementation defined behavior)
2712 // Actually, the flow should never get here until we use MAX_INT limit.
2713 }
2714 KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2715 "max_active_levels for thread %d = (%d)\n",
2716 gtid, max_active_levels));
2717
2718 thread = __kmp_threads[gtid];
2719
2720 __kmp_save_internal_controls(thread);
2721
2722 set__max_active_levels(thread, max_active_levels);
2723}
2724
2725/* Gets max_active_levels */
2726int __kmp_get_max_active_levels(int gtid) {
2727 kmp_info_t *thread;
2728
2729 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2730 KMP_DEBUG_ASSERT(__kmp_init_serial);
2731
2732 thread = __kmp_threads[gtid];
2733 KMP_DEBUG_ASSERT(thread->th.th_current_task);
2734 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2735 "curtask_maxaclevel=%d\n",
2736 gtid, thread->th.th_current_task,
2737 thread->th.th_current_task->td_icvs.max_active_levels));
2738 return thread->th.th_current_task->td_icvs.max_active_levels;
2739}
2740
2741// nteams-var per-device ICV
2742void __kmp_set_num_teams(int num_teams) {
2743 if (num_teams > 0)
2744 __kmp_nteams = num_teams;
2745}
2746int __kmp_get_max_teams(void) { return __kmp_nteams; }
2747// teams-thread-limit-var per-device ICV
2748void __kmp_set_teams_thread_limit(int limit) {
2749 if (limit > 0)
2750 __kmp_teams_thread_limit = limit;
2751}
2752int __kmp_get_teams_thread_limit(void) { return __kmp_teams_thread_limit; }
2753
2754KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2755KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2756
2757/* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
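/* For example (illustrative), omp_set_schedule(omp_sched_guided, 4) is
   expected to arrive here as (kmp_sched_guided, 4) and store a guided
   run-time schedule with chunk 4; for kmp_sched_auto, or for chunk < 1, the
   chunk argument is ignored and KMP_DEFAULT_CHUNK is stored instead. */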
2758void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2759 kmp_info_t *thread;
2760 kmp_sched_t orig_kind;
2761 // kmp_team_t *team;
2762
2763 KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2764 gtid, (int)kind, chunk));
2765 KMP_DEBUG_ASSERT(__kmp_init_serial);
2766
2767 // Check if the kind parameter is valid, correct if needed.
2768 // Valid parameters should fit in one of two intervals - standard or extended:
2769 // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2770 // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103
2771 orig_kind = kind;
2772 kind = __kmp_sched_without_mods(kind);
2773
2774 if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2775 (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2776 // TODO: Hint needs attention in case we change the default schedule.
2777 __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2778 KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2779 __kmp_msg_null);
2780 kind = kmp_sched_default;
2781 chunk = 0; // ignore chunk value in case of bad kind
2782 }
2783
2784 thread = __kmp_threads[gtid];
2785
2786 __kmp_save_internal_controls(thread);
2787
2788 if (kind < kmp_sched_upper_std) {
2789 if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2790      // differentiate static chunked vs. unchunked: the chunk should be invalid
2791      // to indicate an unchunked schedule (which is the default)
2792 thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2793 } else {
2794 thread->th.th_current_task->td_icvs.sched.r_sched_type =
2795 __kmp_sch_map[kind - kmp_sched_lower - 1];
2796 }
2797 } else {
2798 // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2799 // kmp_sched_lower - 2 ];
2800 thread->th.th_current_task->td_icvs.sched.r_sched_type =
2801 __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2802 kmp_sched_lower - 2];
2803 }
2804 __kmp_sched_apply_mods_intkind(
2805 orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2806 if (kind == kmp_sched_auto || chunk < 1) {
2807 // ignore parameter chunk for schedule auto
2808 thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2809 } else {
2810 thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2811 }
2812}
2813
2814/* Gets def_sched_var ICV values */
2815void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2816 kmp_info_t *thread;
2817 enum sched_type th_type;
2818
2819 KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2820 KMP_DEBUG_ASSERT(__kmp_init_serial);
2821
2822 thread = __kmp_threads[gtid];
2823
2824 th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2825 switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
2826 case kmp_sch_static:
2827 case kmp_sch_static_greedy:
2828 case kmp_sch_static_balanced:
2829 *kind = kmp_sched_static;
2830 __kmp_sched_apply_mods_stdkind(kind, th_type);
2831 *chunk = 0; // chunk was not set, try to show this fact via zero value
2832 return;
2833 case kmp_sch_static_chunked:
2834 *kind = kmp_sched_static;
2835 break;
2836 case kmp_sch_dynamic_chunked:
2837 *kind = kmp_sched_dynamic;
2838 break;
2839  case kmp_sch_guided_chunked:
2840  case kmp_sch_guided_iterative_chunked:
2841 case kmp_sch_guided_analytical_chunked:
2842 *kind = kmp_sched_guided;
2843 break;
2844 case kmp_sch_auto:
2845 *kind = kmp_sched_auto;
2846 break;
2847 case kmp_sch_trapezoidal:
2848 *kind = kmp_sched_trapezoidal;
2849 break;
2850#if KMP_STATIC_STEAL_ENABLED
2851 case kmp_sch_static_steal:
2852 *kind = kmp_sched_static_steal;
2853 break;
2854#endif
2855 default:
2856 KMP_FATAL(UnknownSchedulingType, th_type);
2857 }
2858
2859 __kmp_sched_apply_mods_stdkind(kind, th_type);
2860 *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2861}
2862
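/* Returns the thread number of the calling thread's ancestor at the given
   nesting level (level 0 is the outermost), or -1 if the level is invalid.
   Serialized levels are walked via t_serialized; inside a teams region the
   level counter is bumped artificially so the league is accounted for (see
   the th_teams_microtask handling below). */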
2863int __kmp_get_ancestor_thread_num(int gtid, int level) {
2864
2865 int ii, dd;
2866 kmp_team_t *team;
2867 kmp_info_t *thr;
2868
2869 KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
2870 KMP_DEBUG_ASSERT(__kmp_init_serial);
2871
2872 // validate level
2873 if (level == 0)
2874 return 0;
2875 if (level < 0)
2876 return -1;
2877 thr = __kmp_threads[gtid];
2878 team = thr->th.th_team;
2879 ii = team->t.t_level;
2880 if (level > ii)
2881 return -1;
2882
2883 if (thr->th.th_teams_microtask) {
2884 // AC: we are in teams region where multiple nested teams have same level
2885 int tlevel = thr->th.th_teams_level; // the level of the teams construct
2886 if (level <=
2887 tlevel) { // otherwise usual algorithm works (will not touch the teams)
2888 KMP_DEBUG_ASSERT(ii >= tlevel);
2889 // AC: As we need to pass by the teams league, we need to artificially
2890 // increase ii
2891 if (ii == tlevel) {
2892 ii += 2; // three teams have same level
2893 } else {
2894 ii++; // two teams have same level
2895 }
2896 }
2897 }
2898
2899 if (ii == level)
2900 return __kmp_tid_from_gtid(gtid);
2901
2902 dd = team->t.t_serialized;
2903 level++;
2904 while (ii > level) {
2905 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2906 }
2907 if ((team->t.t_serialized) && (!dd)) {
2908 team = team->t.t_parent;
2909 continue;
2910 }
2911 if (ii > level) {
2912 team = team->t.t_parent;
2913 dd = team->t.t_serialized;
2914 ii--;
2915 }
2916 }
2917
2918 return (dd > 1) ? (0) : (team->t.t_master_tid);
2919}
2920
2921int __kmp_get_team_size(int gtid, int level) {
2922
2923 int ii, dd;
2924 kmp_team_t *team;
2925 kmp_info_t *thr;
2926
2927 KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
2928 KMP_DEBUG_ASSERT(__kmp_init_serial);
2929
2930 // validate level
2931 if (level == 0)
2932 return 1;
2933 if (level < 0)
2934 return -1;
2935 thr = __kmp_threads[gtid];
2936 team = thr->th.th_team;
2937 ii = team->t.t_level;
2938 if (level > ii)
2939 return -1;
2940
2941 if (thr->th.th_teams_microtask) {
2942 // AC: we are in teams region where multiple nested teams have same level
2943 int tlevel = thr->th.th_teams_level; // the level of the teams construct
2944 if (level <=
2945 tlevel) { // otherwise usual algorithm works (will not touch the teams)
2946 KMP_DEBUG_ASSERT(ii >= tlevel);
2947 // AC: As we need to pass by the teams league, we need to artificially
2948 // increase ii
2949 if (ii == tlevel) {
2950 ii += 2; // three teams have same level
2951 } else {
2952 ii++; // two teams have same level
2953 }
2954 }
2955 }
2956
2957 while (ii > level) {
2958 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2959 }
2960 if (team->t.t_serialized && (!dd)) {
2961 team = team->t.t_parent;
2962 continue;
2963 }
2964 if (ii > level) {
2965 team = team->t.t_parent;
2966 ii--;
2967 }
2968 }
2969
2970 return team->t.t_nproc;
2971}
2972
2973kmp_r_sched_t __kmp_get_schedule_global() {
2974  // This routine was created because the pairs (__kmp_sched, __kmp_chunk) and
2975  // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
2976  // independently, so the updated schedule can be obtained here.
2977
2978 kmp_r_sched_t r_sched;
2979
2980 // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
2981 // __kmp_guided. __kmp_sched should keep original value, so that user can set
2982 // KMP_SCHEDULE multiple times, and thus have different run-time schedules in
2983 // different roots (even in OMP 2.5)
2984 enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
2985 enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
2986 if (s == kmp_sch_static) {
2987 // replace STATIC with more detailed schedule (balanced or greedy)
2988 r_sched.r_sched_type = __kmp_static;
2989 } else if (s == kmp_sch_guided_chunked) {
2990 // replace GUIDED with more detailed schedule (iterative or analytical)
2991 r_sched.r_sched_type = __kmp_guided;
2992 } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
2993 r_sched.r_sched_type = __kmp_sched;
2994 }
2995 SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
2996
2997 if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
2998    // __kmp_chunk may be wrong here (if it was never set)
2999 r_sched.chunk = KMP_DEFAULT_CHUNK;
3000 } else {
3001 r_sched.chunk = __kmp_chunk;
3002 }
3003
3004 return r_sched;
3005}
3006
3007/* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
3008   at least argc entries in t_argv for the requested team. */
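/* If argc fits in KMP_INLINE_ARGV_ENTRIES, the otherwise-unused space in the
   team structure's cache line is reused; larger argument lists are placed on
   the heap, sized to KMP_MIN_MALLOC_ARGV_ENTRIES for small argc and to
   2 * argc otherwise, leaving headroom so that modest growth does not force a
   reallocation for every new region. */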
3009static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
3010
3011 KMP_DEBUG_ASSERT(team);
3012 if (!realloc || argc > team->t.t_max_argc) {
3013
3014 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3015 "current entries=%d\n",
3016 team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3017 /* if previously allocated heap space for args, free them */
3018 if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3019 __kmp_free((void *)team->t.t_argv);
3020
3021 if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3022 /* use unused space in the cache line for arguments */
3023 team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3024 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3025 "argv entries\n",
3026 team->t.t_id, team->t.t_max_argc));
3027 team->t.t_argv = &team->t.t_inline_argv[0];
3028 if (__kmp_storage_map) {
3029 __kmp_print_storage_map_gtid(
3030 -1, &team->t.t_inline_argv[0],
3031 &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3032 (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3033 team->t.t_id);
3034 }
3035 } else {
3036 /* allocate space for arguments in the heap */
3037 team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3038 ? KMP_MIN_MALLOC_ARGV_ENTRIES
3039 : 2 * argc;
3040 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3041 "argv entries\n",
3042 team->t.t_id, team->t.t_max_argc));
3043 team->t.t_argv =
3044 (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3045 if (__kmp_storage_map) {
3046 __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3047 &team->t.t_argv[team->t.t_max_argc],
3048 sizeof(void *) * team->t.t_max_argc,
3049 "team_%d.t_argv", team->t.t_id);
3050 }
3051 }
3052 }
3053}
3054
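/* Allocate the per-team arrays (thread pointers, dispatch buffers, dispatch
   structures, implicit task data) sized for max_nth threads, and initialize
   the dispatch buffer indices. */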
3055static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3056 int i;
3057 int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3058 team->t.t_threads =
3059 (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3060 team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3061 sizeof(dispatch_shared_info_t) * num_disp_buff);
3062 team->t.t_dispatch =
3063 (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3064 team->t.t_implicit_task_taskdata =
3065 (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3066 team->t.t_max_nproc = max_nth;
3067
3068 /* setup dispatch buffers */
3069 for (i = 0; i < num_disp_buff; ++i) {
3070 team->t.t_disp_buffer[i].buffer_index = i;
3071 team->t.t_disp_buffer[i].doacross_buf_idx = i;
3072 }
3073}
3074
3075static void __kmp_free_team_arrays(kmp_team_t *team) {
3076 /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3077 int i;
3078 for (i = 0; i < team->t.t_max_nproc; ++i) {
3079 if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3080 __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3081 team->t.t_dispatch[i].th_disp_buffer = NULL;
3082 }
3083 }
3084#if KMP_USE_HIER_SCHED
3085 __kmp_dispatch_free_hierarchies(team);
3086#endif
3087 __kmp_free(team->t.t_threads);
3088 __kmp_free(team->t.t_disp_buffer);
3089 __kmp_free(team->t.t_dispatch);
3090 __kmp_free(team->t.t_implicit_task_taskdata);
3091 team->t.t_threads = NULL;
3092 team->t.t_disp_buffer = NULL;
3093 team->t.t_dispatch = NULL;
3094 team->t.t_implicit_task_taskdata = 0;
3095}
3096
3097static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3098 kmp_info_t **oldThreads = team->t.t_threads;
3099
3100 __kmp_free(team->t.t_disp_buffer);
3101 __kmp_free(team->t.t_dispatch);
3102 __kmp_free(team->t.t_implicit_task_taskdata);
3103 __kmp_allocate_team_arrays(team, max_nth);
3104
3105 KMP_MEMCPY(team->t.t_threads, oldThreads,
3106 team->t.t_nproc * sizeof(kmp_info_t *));
3107
3108 __kmp_free(oldThreads);
3109}
3110
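/* Build the initial set of internal control variables for a new root thread
   from the current global settings (dynamic adjustment, blocktime, default
   team size, thread limit, max active levels, run-time schedule, proc-bind,
   default device). */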
3111static kmp_internal_control_t __kmp_get_global_icvs(void) {
3112
3113 kmp_r_sched_t r_sched =
3114 __kmp_get_schedule_global(); // get current state of scheduling globals
3115
3116 KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3117
3118 kmp_internal_control_t g_icvs = {
3119 0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3120 (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3121 // adjustment of threads (per thread)
3122 (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3123 // whether blocktime is explicitly set
3124 __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3125#if KMP_USE_MONITOR
3126 __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3127// intervals
3128#endif
3129 __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3130 // next parallel region (per thread)
3131 // (use a max ub on value if __kmp_parallel_initialize not called yet)
3132 __kmp_cg_max_nth, // int thread_limit;
3133 __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3134 // for max_active_levels
3135 r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3136 // {sched,chunk} pair
3137 __kmp_nested_proc_bind.bind_types[0],
3138 __kmp_default_device,
3139 NULL // struct kmp_internal_control *next;
3140 };
3141
3142 return g_icvs;
3143}
3144
3145static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3146
3147 kmp_internal_control_t gx_icvs;
3148 gx_icvs.serial_nesting_level =
3149 0; // probably =team->t.t_serial like in save_inter_controls
3150 copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3151 gx_icvs.next = NULL;
3152
3153 return gx_icvs;
3154}
3155
3156static void __kmp_initialize_root(kmp_root_t *root) {
3157 int f;
3158 kmp_team_t *root_team;
3159 kmp_team_t *hot_team;
3160 int hot_team_max_nth;
3161 kmp_r_sched_t r_sched =
3162 __kmp_get_schedule_global(); // get current state of scheduling globals
3163 kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3164 KMP_DEBUG_ASSERT(root);
3165 KMP_ASSERT(!root->r.r_begin);
3166
3167 /* setup the root state structure */
3168 __kmp_init_lock(&root->r.r_begin_lock);
3169 root->r.r_begin = FALSE;
3170 root->r.r_active = FALSE;
3171 root->r.r_in_parallel = 0;
3172 root->r.r_blocktime = __kmp_dflt_blocktime;
3173#if KMP_AFFINITY_SUPPORTED
3174 root->r.r_affinity_assigned = FALSE;
3175#endif
3176
3177 /* setup the root team for this task */
3178 /* allocate the root team structure */
3179 KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3180
3181 root_team =
3182 __kmp_allocate_team(root,
3183 1, // new_nproc
3184 1, // max_nproc
3185#if OMPT_SUPPORT
3186 ompt_data_none, // root parallel id
3187#endif
3188 __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3189 0 // argc
3190 USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3191 );
3192#if USE_DEBUGGER
3193 // Non-NULL value should be assigned to make the debugger display the root
3194 // team.
3195 TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3196#endif
3197
3198 KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3199
3200 root->r.r_root_team = root_team;
3201 root_team->t.t_control_stack_top = NULL;
3202
3203 /* initialize root team */
3204 root_team->t.t_threads[0] = NULL;
3205 root_team->t.t_nproc = 1;
3206 root_team->t.t_serialized = 1;
3207 // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3208 root_team->t.t_sched.sched = r_sched.sched;
3209 KA_TRACE(
3210 20,
3211 ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3212 root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3213
3214 /* setup the hot team for this task */
3215 /* allocate the hot team structure */
3216 KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3217
3218 hot_team =
3219 __kmp_allocate_team(root,
3220 1, // new_nproc
3221 __kmp_dflt_team_nth_ub * 2, // max_nproc
3222#if OMPT_SUPPORT
3223 ompt_data_none, // root parallel id
3224#endif
3225 __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3226 0 // argc
3227 USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3228 );
3229 KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3230
3231 root->r.r_hot_team = hot_team;
3232 root_team->t.t_control_stack_top = NULL;
3233
3234 /* first-time initialization */
3235 hot_team->t.t_parent = root_team;
3236
3237 /* initialize hot team */
3238 hot_team_max_nth = hot_team->t.t_max_nproc;
3239 for (f = 0; f < hot_team_max_nth; ++f) {
3240 hot_team->t.t_threads[f] = NULL;
3241 }
3242 hot_team->t.t_nproc = 1;
3243 // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3244 hot_team->t.t_sched.sched = r_sched.sched;
3245 hot_team->t.t_size_changed = 0;
3246}
3247
3248#ifdef KMP_DEBUG
3249
3250typedef struct kmp_team_list_item {
3251 kmp_team_p const *entry;
3252 struct kmp_team_list_item *next;
3253} kmp_team_list_item_t;
3254typedef kmp_team_list_item_t *kmp_team_list_t;
3255
3256static void __kmp_print_structure_team_accum( // Add team to list of teams.
3257 kmp_team_list_t list, // List of teams.
3258 kmp_team_p const *team // Team to add.
3259) {
3260
3261 // List must terminate with item where both entry and next are NULL.
3262 // Team is added to the list only once.
3263 // List is sorted in ascending order by team id.
3264 // Team id is *not* a key.
3265
3266 kmp_team_list_t l;
3267
3268 KMP_DEBUG_ASSERT(list != NULL);
3269 if (team == NULL) {
3270 return;
3271 }
3272
3273 __kmp_print_structure_team_accum(list, team->t.t_parent);
3274 __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3275
3276 // Search list for the team.
3277 l = list;
3278 while (l->next != NULL && l->entry != team) {
3279 l = l->next;
3280 }
3281 if (l->next != NULL) {
3282 return; // Team has been added before, exit.
3283 }
3284
3285 // Team is not found. Search list again for insertion point.
3286 l = list;
3287 while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3288 l = l->next;
3289 }
3290
3291 // Insert team.
3292 {
3293 kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3294 sizeof(kmp_team_list_item_t));
3295 *item = *l;
3296 l->entry = team;
3297 l->next = item;
3298 }
3299}
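// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the runtime, not compiled): the accumulator
// above keeps a list terminated by a sentinel item (entry == NULL, next ==
// NULL) and inserts a new element *before* node l without tracking a
// predecessor: copy l into a fresh node, then overwrite l in place. A
// standalone model of that trick with a hypothetical toy_item type.
#if 0
#include <cstdlib>

struct toy_item {
  int value;             // meaningless in the sentinel
  struct toy_item *next; // NULL in the sentinel
};

// Insert `value` before node *pos (which may be the sentinel). If callers pass
// the first node whose value exceeds `value`, the list stays sorted.
static void toy_insert_before(toy_item *pos, int value) {
  toy_item *copy = (toy_item *)std::malloc(sizeof(toy_item));
  *copy = *pos;       // old contents (possibly the sentinel) shift down a slot
  pos->value = value; // the current node now holds the new element
  pos->next = copy;
}
#endif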
3300
3301static void __kmp_print_structure_team(char const *title,
3302                                       kmp_team_p const *team) {
3303
3304 __kmp_printf("%s", title);
3305 if (team != NULL) {
3306 __kmp_printf("%2x %p\n", team->t.t_id, team);
3307 } else {
3308 __kmp_printf(" - (nil)\n");
3309 }
3310}
3311
3312static void __kmp_print_structure_thread(char const *title,
3313 kmp_info_p const *thread) {
3314 __kmp_printf("%s", title);
3315 if (thread != NULL) {
3316 __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3317 } else {
3318 __kmp_printf(" - (nil)\n");
3319 }
3320}
3321
3322void __kmp_print_structure(void) {
3323
3324 kmp_team_list_t list;
3325
3326 // Initialize list of teams.
3327 list =
3328 (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3329 list->entry = NULL;
3330 list->next = NULL;
3331
3332 __kmp_printf("\n------------------------------\nGlobal Thread "
3333 "Table\n------------------------------\n");
3334 {
3335 int gtid;
3336 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3337 __kmp_printf("%2d", gtid);
3338 if (__kmp_threads != NULL) {
3339 __kmp_printf(" %p", __kmp_threads[gtid]);
3340 }
3341 if (__kmp_root != NULL) {
3342 __kmp_printf(" %p", __kmp_root[gtid]);
3343 }
3344 __kmp_printf("\n");
3345 }
3346 }
3347
3348 // Print out __kmp_threads array.
3349 __kmp_printf("\n------------------------------\nThreads\n--------------------"
3350 "----------\n");
3351 if (__kmp_threads != NULL) {
3352 int gtid;
3353 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3354 kmp_info_t const *thread = __kmp_threads[gtid];
3355 if (thread != NULL) {
3356 __kmp_printf("GTID %2d %p:\n", gtid, thread);
3357 __kmp_printf(" Our Root: %p\n", thread->th.th_root);
3358 __kmp_print_structure_team(" Our Team: ", thread->th.th_team);
3359 __kmp_print_structure_team(" Serial Team: ",
3360 thread->th.th_serial_team);
3361 __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc);
3362 __kmp_print_structure_thread(" Primary: ",
3363 thread->th.th_team_master);
3364 __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized);
3365 __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc);
3366 __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3367 __kmp_print_structure_thread(" Next in pool: ",
3368 thread->th.th_next_pool);
3369 __kmp_printf("\n");
3370 __kmp_print_structure_team_accum(list, thread->th.th_team);
3371 __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3372 }
3373 }
3374 } else {
3375 __kmp_printf("Threads array is not allocated.\n");
3376 }
3377
3378 // Print out __kmp_root array.
3379 __kmp_printf("\n------------------------------\nUbers\n----------------------"
3380 "--------\n");
3381 if (__kmp_root != NULL) {
3382 int gtid;
3383 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3384 kmp_root_t const *root = __kmp_root[gtid];
3385 if (root != NULL) {
3386 __kmp_printf("GTID %2d %p:\n", gtid, root);
3387 __kmp_print_structure_team(" Root Team: ", root->r.r_root_team);
3388 __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team);
3389 __kmp_print_structure_thread(" Uber Thread: ",
3390 root->r.r_uber_thread);
3391 __kmp_printf(" Active?: %2d\n", root->r.r_active);
3392 __kmp_printf(" In Parallel: %2d\n",
3393 KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3394 __kmp_printf("\n");
3395 __kmp_print_structure_team_accum(list, root->r.r_root_team);
3396 __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3397 }
3398 }
3399 } else {
3400 __kmp_printf("Ubers array is not allocated.\n");
3401 }
3402
3403 __kmp_printf("\n------------------------------\nTeams\n----------------------"
3404 "--------\n");
3405 while (list->next != NULL) {
3406 kmp_team_p const *team = list->entry;
3407 int i;
3408 __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3409 __kmp_print_structure_team(" Parent Team: ", team->t.t_parent);
3410 __kmp_printf(" Primary TID: %2d\n", team->t.t_master_tid);
3411 __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc);
3412 __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized);
3413 __kmp_printf(" Number threads: %2d\n", team->t.t_nproc);
3414 for (i = 0; i < team->t.t_nproc; ++i) {
3415 __kmp_printf(" Thread %2d: ", i);
3416 __kmp_print_structure_thread("", team->t.t_threads[i]);
3417 }
3418 __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool);
3419 __kmp_printf("\n");
3420 list = list->next;
3421 }
3422
3423 // Print out __kmp_thread_pool and __kmp_team_pool.
3424 __kmp_printf("\n------------------------------\nPools\n----------------------"
3425 "--------\n");
3426 __kmp_print_structure_thread("Thread pool: ",
3427 CCAST(kmp_info_t *, __kmp_thread_pool));
3428 __kmp_print_structure_team("Team pool: ",
3429 CCAST(kmp_team_t *, __kmp_team_pool));
3430 __kmp_printf("\n");
3431
3432 // Free team list.
3433 while (list != NULL) {
3434 kmp_team_list_item_t *item = list;
3435 list = list->next;
3436 KMP_INTERNAL_FREE(item);
3437 }
3438}
3439
3440#endif
3441
3442//---------------------------------------------------------------------------
3443// Stuff for per-thread fast random number generator
3444// Table of primes
3445static const unsigned __kmp_primes[] = {
3446 0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3447 0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3448 0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3449 0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3450 0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3451 0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3452 0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3453 0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3454 0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3455 0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3456 0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3457
3458//---------------------------------------------------------------------------
3459// __kmp_get_random: Get a random number using a linear congruential method.
3460unsigned short __kmp_get_random(kmp_info_t *thread) {
3461 unsigned x = thread->th.th_x;
3462 unsigned short r = (unsigned short)(x >> 16);
3463
3464 thread->th.th_x = x * thread->th.th_a + 1;
3465
3466 KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3467 thread->th.th_info.ds.ds_tid, r));
3468
3469 return r;
3470}
3471//--------------------------------------------------------
3472// __kmp_init_random: Initialize a random number generator
3473void __kmp_init_random(kmp_info_t *thread) {
3474 unsigned seed = thread->th.th_info.ds.ds_tid;
3475
3476 thread->th.th_a =
3477 __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3478 thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3479 KA_TRACE(30,
3480 ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3481}
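// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the runtime, not compiled): each thread
// above runs its own linear congruential generator x = x * a + 1, with the
// multiplier a picked from the prime table by thread id and only the high 16
// bits returned, discarding the weak low-order bits of an LCG. A
// self-contained model with hypothetical toy_* names.
#if 0
struct toy_rng {
  unsigned x; // current state
  unsigned a; // per-thread multiplier, ideally an odd constant such as a prime
};

static void toy_rng_seed(toy_rng *r, unsigned tid, const unsigned *primes,
                         unsigned nprimes) {
  r->a = primes[tid % nprimes];
  r->x = (tid + 1) * r->a + 1; // decorrelate neighboring thread ids
}

static unsigned short toy_rng_next(toy_rng *r) {
  unsigned short out = (unsigned short)(r->x >> 16); // keep only the high bits
  r->x = r->x * r->a + 1;
  return out;
}
#endif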
3482
3483#if KMP_OS_WINDOWS
3484/* reclaim array entries for root threads that are already dead, returns number
3485 * reclaimed */
3486static int __kmp_reclaim_dead_roots(void) {
3487 int i, r = 0;
3488
3489 for (i = 0; i < __kmp_threads_capacity; ++i) {
3490 if (KMP_UBER_GTID(i) &&
3491 !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3492 !__kmp_root[i]
3493 ->r.r_active) { // AC: reclaim only roots died in non-active state
3494 r += __kmp_unregister_root_other_thread(i);
3495 }
3496 }
3497 return r;
3498}
3499#endif
3500
3501/* This function attempts to create free entries in __kmp_threads and
3502 __kmp_root, and returns the number of free entries generated.
3503
3504 For Windows* OS static library, the first mechanism used is to reclaim array
3505 entries for root threads that are already dead.
3506
3507 On all platforms, expansion is attempted on the arrays __kmp_threads and
3508 __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3509 capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3510 threadprivate cache array has been created. Synchronization with
3511 __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3512
3513 After any dead root reclamation, if the clipping value allows array expansion
3514 to result in the generation of a total of nNeed free slots, the function does
3515 that expansion. If not, nothing is done beyond the possible initial root
3516 thread reclamation.
3517
3518 If any argument is negative, the behavior is undefined. */
3519static int __kmp_expand_threads(int nNeed) {
3520 int added = 0;
3521 int minimumRequiredCapacity;
3522 int newCapacity;
3523 kmp_info_t **newThreads;
3524 kmp_root_t **newRoot;
3525
3526 // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3527 // resizing __kmp_threads does not need additional protection if foreign
3528 // threads are present
3529
3530#if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3531 /* only for Windows static library */
3532 /* reclaim array entries for root threads that are already dead */
3533 added = __kmp_reclaim_dead_roots();
3534
3535 if (nNeed) {
3536 nNeed -= added;
3537 if (nNeed < 0)
3538 nNeed = 0;
3539 }
3540#endif
3541 if (nNeed <= 0)
3542 return added;
3543
3544 // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3545 // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3546 // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3547 // > __kmp_max_nth in one of two ways:
3548 //
3549 // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0]
3550 // may not be reused by another thread, so we may need to increase
3551 // __kmp_threads_capacity to __kmp_max_nth + 1.
3552 //
3553 // 2) New foreign root(s) are encountered. We always register new foreign
3554 // roots. This may cause a smaller # of threads to be allocated at
3555 // subsequent parallel regions, but the worker threads hang around (and
3556 // eventually go to sleep) and need slots in the __kmp_threads[] array.
3557 //
3558 // Anyway, that is the reason for moving the check to see if
3559 // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3560 // instead of having it performed here. -BB
3561
3562 KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3563
3564 /* compute expansion headroom to check if we can expand */
3565 if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3566 /* possible expansion too small -- give up */
3567 return added;
3568 }
3569 minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3570
3571 newCapacity = __kmp_threads_capacity;
3572 do {
3573 newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3574 : __kmp_sys_max_nth;
3575 } while (newCapacity < minimumRequiredCapacity);
3576 newThreads = (kmp_info_t **)__kmp_allocate(
3577 (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3578 newRoot =
3579 (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3580 KMP_MEMCPY(newThreads, __kmp_threads,
3581 __kmp_threads_capacity * sizeof(kmp_info_t *));
3582 KMP_MEMCPY(newRoot, __kmp_root,
3583 __kmp_threads_capacity * sizeof(kmp_root_t *));
3584
3585 kmp_info_t **temp_threads = __kmp_threads;
3586 *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3587 *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3588 __kmp_free(temp_threads);
3589 added += newCapacity - __kmp_threads_capacity;
3590 *(volatile int *)&__kmp_threads_capacity = newCapacity;
3591
3592 if (newCapacity > __kmp_tp_capacity) {
3593 __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3594 if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3595 __kmp_threadprivate_resize_cache(newCapacity);
3596 } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3597 *(volatile int *)&__kmp_tp_capacity = newCapacity;
3598 }
3599 __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3600 }
3601
3602 return added;
3603}
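// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the runtime, not compiled): the capacity
// growth above doubles until the requested headroom fits, never exceeding a
// hard system cap, and gives up early if even the cap cannot provide the
// requested number of free slots. A minimal model of that sizing rule; it
// assumes current_cap >= 1, as the runtime does.
#if 0
// Returns the new capacity, or current_cap unchanged if the request cannot fit.
static int toy_grow_capacity(int current_cap, int needed, int sys_max) {
  if (needed <= 0)
    return current_cap;
  if (sys_max - current_cap < needed)
    return current_cap; // even maximal growth cannot satisfy the request
  int new_cap = current_cap;
  do {
    new_cap = (new_cap <= sys_max / 2) ? new_cap * 2 : sys_max;
  } while (new_cap < current_cap + needed);
  return new_cap;
}
#endif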
3604
3605/* Register the current thread as a root thread and obtain our gtid. We must
3606 have the __kmp_initz_lock held at this point. Argument TRUE only if we are
3607 the thread that calls from __kmp_do_serial_initialize() */
3608int __kmp_register_root(int initial_thread) {
3609 kmp_info_t *root_thread;
3610 kmp_root_t *root;
3611 int gtid;
3612 int capacity;
3613 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3614 KA_TRACE(20, ("__kmp_register_root: entered\n"));
3615 KMP_MB();
3616
3617 /* 2007-03-02:
3618 If the initial thread has not invoked the OpenMP RTL yet, and this thread
3619 is not an initial one, the "__kmp_all_nth >= __kmp_threads_capacity"
3620 condition does not work as expected -- it may return false (meaning there
3621 is at least one empty slot in the __kmp_threads array), but it is possible
3622 that the only free slot is #0, which is reserved for the initial thread and
3623 so cannot be used for this one. The following code works around this bug.
3624
3625 However, the right solution seems to be not to reserve slot #0 for the
3626 initial thread, because:
3627 (1) there is no magic in slot #0,
3628 (2) we cannot detect the initial thread reliably (the first thread that
3629 performs serial initialization may not be the real initial thread).
3630 */
3631 capacity = __kmp_threads_capacity;
3632 if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3633 --capacity;
3634 }
3635
3636 // If it is not for initializing the hidden helper team, we need to take
3637 // __kmp_hidden_helper_threads_num out of the capacity because it is included
3638 // in __kmp_threads_capacity.
3639 if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
3640 capacity -= __kmp_hidden_helper_threads_num;
3641 }
3642
3643 /* see if there are too many threads */
3644 if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3645 if (__kmp_tp_cached) {
3646 __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3647 KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3648 KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3649 } else {
3650 __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3651 __kmp_msg_null);
3652 }
3653 }
3654
3655 // When hidden helper task is enabled, __kmp_threads is organized as follows:
3656 // 0: initial thread, also a regular OpenMP thread.
3657 // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads.
3658 // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for
3659 // regular OpenMP threads.
3660 if (TCR_4(__kmp_init_hidden_helper_threads)) {
3661 // Find an available thread slot for hidden helper thread. Slots for hidden
3662 // helper threads start from 1 to __kmp_hidden_helper_threads_num.
3663 for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL &&
3664 gtid <= __kmp_hidden_helper_threads_num;
3665 gtid++)
3666 ;
3667 KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num);
3668 KA_TRACE(1, ("__kmp_register_root: found slot in threads array for "
3669 "hidden helper thread: T#%d\n",
3670 gtid));
3671 } else {
3672 /* find an available thread slot */
3673 // Don't reassign the zero slot since we need that to only be used by
3674 // initial thread. Slots for hidden helper threads should also be skipped.
3675 if (initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3676 gtid = 0;
3677 } else {
3678 for (gtid = __kmp_hidden_helper_threads_num + 1;
3679 TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++)
3680 ;
3681 }
3682 KA_TRACE(
3683 1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3684 KMP_ASSERT(gtid < __kmp_threads_capacity);
3685 }
3686
3687 /* update global accounting */
3688 __kmp_all_nth++;
3689 TCW_4(__kmp_nth, __kmp_nth + 1);
3690
3691 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3692 // numbers of procs, and method #2 (keyed API call) for higher numbers.
3693 if (__kmp_adjust_gtid_mode) {
3694 if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3695 if (TCR_4(__kmp_gtid_mode) != 2) {
3696 TCW_4(__kmp_gtid_mode, 2);
3697 }
3698 } else {
3699 if (TCR_4(__kmp_gtid_mode) != 1) {
3700 TCW_4(__kmp_gtid_mode, 1);
3701 }
3702 }
3703 }
3704
3705#ifdef KMP_ADJUST_BLOCKTIME
3706 /* Adjust blocktime to zero if necessary */
3707 /* Middle initialization might not have occurred yet */
3708 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3709 if (__kmp_nth > __kmp_avail_proc) {
3710 __kmp_zero_bt = TRUE;
3711 }
3712 }
3713#endif /* KMP_ADJUST_BLOCKTIME */
3714
3715 /* setup this new hierarchy */
3716 if (!(root = __kmp_root[gtid])) {
3717 root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3718 KMP_DEBUG_ASSERT(!root->r.r_root_team);
3719 }
3720
3721#if KMP_STATS_ENABLED
3722 // Initialize stats as soon as possible (right after gtid assignment).
3723 __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3724 __kmp_stats_thread_ptr->startLife();
3725 KMP_SET_THREAD_STATE(SERIAL_REGION);
3726 KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3727#endif
3728 __kmp_initialize_root(root);
3729
3730 /* setup new root thread structure */
3731 if (root->r.r_uber_thread) {
3732 root_thread = root->r.r_uber_thread;
3733 } else {
3734 root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3735 if (__kmp_storage_map) {
3736 __kmp_print_thread_storage_map(root_thread, gtid);
3737 }
3738 root_thread->th.th_info.ds.ds_gtid = gtid;
3739#if OMPT_SUPPORT
3740 root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3741#endif
3742 root_thread->th.th_root = root;
3743 if (__kmp_env_consistency_check) {
3744 root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3745 }
3746#if USE_FAST_MEMORY
3747 __kmp_initialize_fast_memory(root_thread);
3748#endif /* USE_FAST_MEMORY */
3749
3750#if KMP_USE_BGET
3751 KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3752 __kmp_initialize_bget(root_thread);
3753#endif
3754 __kmp_init_random(root_thread); // Initialize random number generator
3755 }
3756
3757 /* setup the serial team held in reserve by the root thread */
3758 if (!root_thread->th.th_serial_team) {
3759 kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3760 KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3761 root_thread->th.th_serial_team = __kmp_allocate_team(
3762 root, 1, 1,
3763#if OMPT_SUPPORT
3764 ompt_data_none, // root parallel id
3765#endif
3766 proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3767 }
3768 KMP_ASSERT(root_thread->th.th_serial_team);
3769 KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3770 root_thread->th.th_serial_team));
3771
3772 /* drop root_thread into place */
3773 TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3774
3775 root->r.r_root_team->t.t_threads[0] = root_thread;
3776 root->r.r_hot_team->t.t_threads[0] = root_thread;
3777 root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3778 // AC: the team created in reserve, not for execution (it is unused for now).
3779 root_thread->th.th_serial_team->t.t_serialized = 0;
3780 root->r.r_uber_thread = root_thread;
3781
3782 /* initialize the thread, get it ready to go */
3783 __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3784 TCW_4(__kmp_init_gtid, TRUE);
3785
3786 /* prepare the primary thread for get_gtid() */
3787 __kmp_gtid_set_specific(gtid);
3788
3789#if USE_ITT_BUILD
3790 __kmp_itt_thread_name(gtid);
3791#endif /* USE_ITT_BUILD */
3792
3793#ifdef KMP_TDATA_GTID
3794 __kmp_gtid = gtid;
3795#endif
3796 __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3797 KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3798
3799 KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3800 "plain=%u\n",
3801 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3802 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3803 KMP_INIT_BARRIER_STATE));
3804 { // Initialize barrier data.
3805 int b;
3806 for (b = 0; b < bs_last_barrier; ++b) {
3807 root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3808#if USE_DEBUGGER
3809 root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3810#endif
3811 }
3812 }
3813 KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3814 KMP_INIT_BARRIER_STATE);
3815
3816#if KMP_AFFINITY_SUPPORTED
3817 root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3818 root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3819 root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3820 root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3821#endif /* KMP_AFFINITY_SUPPORTED */
3822 root_thread->th.th_def_allocator = __kmp_def_allocator;
3823 root_thread->th.th_prev_level = 0;
3824 root_thread->th.th_prev_num_threads = 1;
3825
3826 kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
3827 tmp->cg_root = root_thread;
3828 tmp->cg_thread_limit = __kmp_cg_max_nth;
3829 tmp->cg_nthreads = 1;
3830 KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
3831 " cg_nthreads init to 1\n",
3832 root_thread, tmp));
3833 tmp->up = NULL;
3834 root_thread->th.th_cg_roots = tmp;
3835
3836 __kmp_root_counter++;
3837
3838#if OMPT_SUPPORT
3839 if (!initial_thread && ompt_enabled.enabled) {
3840
3841 kmp_info_t *root_thread = ompt_get_thread();
3842
3843 ompt_set_thread_state(root_thread, ompt_state_overhead);
3844
3845 if (ompt_enabled.ompt_callback_thread_begin) {
3846 ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3847 ompt_thread_initial, __ompt_get_thread_data_internal());
3848 }
3849 ompt_data_t *task_data;
3850 ompt_data_t *parallel_data;
3851 __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
3852 NULL);
3853 if (ompt_enabled.ompt_callback_implicit_task) {
3854 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3855 ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
3856 }
3857
3858 ompt_set_thread_state(root_thread, ompt_state_work_serial);
3859 }
3860#endif
3861#if OMPD_SUPPORT
3862 if (ompd_state & OMPD_ENABLE_BP)
3863 ompd_bp_thread_begin();
3864#endif
3865
3866 KMP_MB();
3867 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3868
3869 return gtid;
3870}
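// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the runtime, not compiled): the slot search
// in __kmp_register_root relies on the layout described above -- slot 0 for
// the initial thread, slots 1..__kmp_hidden_helper_threads_num for hidden
// helper threads, and the rest for regular roots -- so the scan starts and
// stops at different offsets. A toy version of that scan over a plain pointer
// array; names are hypothetical.
#if 0
// Returns the first free slot for a new root, or -1 if none is found.
static int toy_find_root_slot(void *const *slots, int capacity,
                              int hidden_helper_num, bool registering_helpers,
                              bool initial_thread) {
  if (initial_thread && slots[0] == nullptr)
    return 0; // slot 0 is reserved for the initial thread
  int start = registering_helpers ? 1 : hidden_helper_num + 1;
  int end = registering_helpers ? hidden_helper_num + 1 : capacity;
  for (int gtid = start; gtid < end; ++gtid)
    if (slots[gtid] == nullptr)
      return gtid;
  return -1;
}
#endif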
3871
3872#if KMP_NESTED_HOT_TEAMS
3873static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3874 const int max_level) {
3875 int i, n, nth;
3876 kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3877 if (!hot_teams || !hot_teams[level].hot_team) {
3878 return 0;
3879 }
3880 KMP_DEBUG_ASSERT(level < max_level);
3881 kmp_team_t *team = hot_teams[level].hot_team;
3882 nth = hot_teams[level].hot_team_nth;
3883 n = nth - 1; // primary thread is not freed
3884 if (level < max_level - 1) {
3885 for (i = 0; i < nth; ++i) {
3886 kmp_info_t *th = team->t.t_threads[i];
3887 n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3888 if (i > 0 && th->th.th_hot_teams) {
3889 __kmp_free(th->th.th_hot_teams);
3890 th->th.th_hot_teams = NULL;
3891 }
3892 }
3893 }
3894 __kmp_free_team(root, team, NULL);
3895 return n;
3896}
3897#endif
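// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the runtime, not compiled): freeing nested
// hot teams walks a small tree -- each worker of a level-N hot team may own a
// level-N+1 hot team -- and reports how many thread entries were released. A
// generic model of that recursion over a hypothetical node type.
#if 0
struct toy_hot_node {
  int nthreads;  // workers owned at this level (the primary is not freed)
  int nchildren; // nested hot teams hanging off this one
  struct toy_hot_node **children;
};

static int toy_free_hot_tree(const toy_hot_node *node) {
  if (node == nullptr)
    return 0;
  int freed = node->nthreads;
  for (int i = 0; i < node->nchildren; ++i)
    freed += toy_free_hot_tree(node->children[i]);
  // the real code also releases per-thread hot-team bookkeeping here
  return freed;
}
#endif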
3898
3899// Resets a root thread and clears its root and hot teams.
3900// Returns the number of __kmp_threads entries directly and indirectly freed.
3901static int __kmp_reset_root(int gtid, kmp_root_t *root) {
3902 kmp_team_t *root_team = root->r.r_root_team;
3903 kmp_team_t *hot_team = root->r.r_hot_team;
3904 int n = hot_team->t.t_nproc;
3905 int i;
3906
3907 KMP_DEBUG_ASSERT(!root->r.r_active);
3908
3909 root->r.r_root_team = NULL;
3910 root->r.r_hot_team = NULL;
3911 // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
3912 // before call to __kmp_free_team().
3913 __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
3914#if KMP_NESTED_HOT_TEAMS
3915 if (__kmp_hot_teams_max_level >
3916 0) { // need to free nested hot teams and their threads if any
3917 for (i = 0; i < hot_team->t.t_nproc; ++i) {
3918 kmp_info_t *th = hot_team->t.t_threads[i];
3919 if (__kmp_hot_teams_max_level > 1) {
3920 n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
3921 }
3922 if (th->th.th_hot_teams) {
3923 __kmp_free(th->th.th_hot_teams);
3924 th->th.th_hot_teams = NULL;
3925 }
3926 }
3927 }
3928#endif
3929 __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
3930
3931 // Before we can reap the thread, we need to make certain that all other
3932 // threads in the teams that had this root as ancestor have stopped trying to
3933 // steal tasks.
3934 if (__kmp_tasking_mode != tskm_immediate_exec) {
3935 __kmp_wait_to_unref_task_teams();
3936 }
3937
3938#if KMP_OS_WINDOWS
3939 /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3940 KA_TRACE(
3941 10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
3942 "\n",
3943 (LPVOID) & (root->r.r_uber_thread->th),
3944 root->r.r_uber_thread->th.th_info.ds.ds_thread));
3945 __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
3946#endif /* KMP_OS_WINDOWS */
3947
3948#if OMPD_SUPPORT
3949 if (ompd_state & OMPD_ENABLE_BP)
3950 ompd_bp_thread_end();
3951#endif
3952
3953#if OMPT_SUPPORT
3954 ompt_data_t *task_data;
3955 ompt_data_t *parallel_data;
3956 __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
3957 NULL);
3958 if (ompt_enabled.ompt_callback_implicit_task) {
3959 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3960 ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
3961 }
3962 if (ompt_enabled.ompt_callback_thread_end) {
3963 ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
3964 &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
3965 }
3966#endif
3967
3968 TCW_4(__kmp_nth,
3969 __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
3970 i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
3971 KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
3972 " to %d\n",
3973 root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
3974 root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
3975 if (i == 1) {
3976 // need to free contention group structure
3977 KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
3978 root->r.r_uber_thread->th.th_cg_roots->cg_root);
3979 KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
3980 __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
3981 root->r.r_uber_thread->th.th_cg_roots = NULL;
3982 }
3983 __kmp_reap_thread(root->r.r_uber_thread, 1);
3984
3985 // We cannot put the root thread into __kmp_thread_pool, so we have to reap
3986 // it instead of freeing it.
3987 root->r.r_uber_thread = NULL;
3988 /* mark root as no longer in use */
3989 root->r.r_begin = FALSE;
3990
3991 return n;
3992}
3993
3994void __kmp_unregister_root_current_thread(int gtid) {
3995 KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
3996 /* this lock should be ok, since unregister_root_current_thread is never
3997 called during an abort, only during a normal close. furthermore, if you
3998 have the forkjoin lock, you should never try to get the initz lock */
3999 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
4000 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
4001 KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
4002 "exiting T#%d\n",
4003 gtid));
4004 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4005 return;
4006 }
4007 kmp_root_t *root = __kmp_root[gtid];
4008
4009 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4010 KMP_ASSERT(KMP_UBER_GTID(gtid));
4011 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4012 KMP_ASSERT(root->r.r_active == FALSE);
4013
4014 KMP_MB();
4015
4016 kmp_info_t *thread = __kmp_threads[gtid];
4017 kmp_team_t *team = thread->th.th_team;
4018 kmp_task_team_t *task_team = thread->th.th_task_team;
4019
4020 // we need to wait for the proxy tasks before finishing the thread
4021 if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) {
4022#if OMPT_SUPPORT
4023 // the runtime is shutting down so we won't report any events
4024 thread->th.ompt_thread_info.state = ompt_state_undefined;
4025#endif
4026 __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
4027 }
4028
4029 __kmp_reset_root(gtid, root);
4030
4031 KMP_MB();
4032 KC_TRACE(10,
4033 ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
4034
4035 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4036}
4037
4038#if KMP_OS_WINDOWS
4039/* __kmp_forkjoin_lock must be already held
4040 Unregisters a root thread that is not the current thread. Returns the number
4041 of __kmp_threads entries freed as a result. */
4042static int __kmp_unregister_root_other_thread(int gtid) {
4043 kmp_root_t *root = __kmp_root[gtid];
4044 int r;
4045
4046 KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4047 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4048 KMP_ASSERT(KMP_UBER_GTID(gtid));
4049 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4050 KMP_ASSERT(root->r.r_active == FALSE);
4051
4052 r = __kmp_reset_root(gtid, root);
4053 KC_TRACE(10,
4054 ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4055 return r;
4056}
4057#endif
4058
4059#if KMP_DEBUG
4060void __kmp_task_info() {
4061
4062 kmp_int32 gtid = __kmp_entry_gtid();
4063 kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4064 kmp_info_t *this_thr = __kmp_threads[gtid];
4065 kmp_team_t *steam = this_thr->th.th_serial_team;
4066 kmp_team_t *team = this_thr->th.th_team;
4067
4068 __kmp_printf(
4069 "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4070 "ptask=%p\n",
4071 gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4072 team->t.t_implicit_task_taskdata[tid].td_parent);
4073}
4074#endif // KMP_DEBUG
4075
4076/* TODO optimize with one big memclr, take out what isn't needed, split
4077 responsibility to workers as much as possible, and delay initialization of
4078 features as much as possible */
4079static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4080 int tid, int gtid) {
4081 /* this_thr->th.th_info.ds.ds_gtid is setup in
4082 kmp_allocate_thread/create_worker.
4083 this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4084 KMP_DEBUG_ASSERT(this_thr != NULL);
4085 KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4086 KMP_DEBUG_ASSERT(team);
4087 KMP_DEBUG_ASSERT(team->t.t_threads);
4088 KMP_DEBUG_ASSERT(team->t.t_dispatch);
4089 kmp_info_t *master = team->t.t_threads[0];
4090 KMP_DEBUG_ASSERT(master);
4091 KMP_DEBUG_ASSERT(master->th.th_root);
4092
4093 KMP_MB();
4094
4095 TCW_SYNC_PTR(this_thr->th.th_team, team);
4096
4097 this_thr->th.th_info.ds.ds_tid = tid;
4098 this_thr->th.th_set_nproc = 0;
4099 if (__kmp_tasking_mode != tskm_immediate_exec)
4100 // When tasking is possible, threads are not safe to reap until they are
4101 // done tasking; this will be set when tasking code is exited in wait
4102 this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4103 else // no tasking --> always safe to reap
4104 this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4105 this_thr->th.th_set_proc_bind = proc_bind_default;
4106#if KMP_AFFINITY_SUPPORTED
4107 this_thr->th.th_new_place = this_thr->th.th_current_place;
4108#endif
4109 this_thr->th.th_root = master->th.th_root;
4110
4111 /* setup the thread's cache of the team structure */
4112 this_thr->th.th_team_nproc = team->t.t_nproc;
4113 this_thr->th.th_team_master = master;
4114 this_thr->th.th_team_serialized = team->t.t_serialized;
4115 TCW_PTR(this_thr->th.th_sleep_loc, NULL);
4116
4117 KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4118
4119 KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4120 tid, gtid, this_thr, this_thr->th.th_current_task));
4121
4122 __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4123 team, tid, TRUE);
4124
4125 KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4126 tid, gtid, this_thr, this_thr->th.th_current_task));
4127 // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4128 // __kmp_initialize_team()?
4129
4130 /* TODO no worksharing in speculative threads */
4131 this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4132
4133 this_thr->th.th_local.this_construct = 0;
4134
4135 if (!this_thr->th.th_pri_common) {
4136 this_thr->th.th_pri_common =
4137 (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4138 if (__kmp_storage_map) {
4139 __kmp_print_storage_map_gtid(
4140 gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4141 sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4142 }
4143 this_thr->th.th_pri_head = NULL;
4144 }
4145
4146 if (this_thr != master && // Primary thread's CG root is initialized elsewhere
4147 this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4148 // Make new thread's CG root same as primary thread's
4149 KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4150 kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4151 if (tmp) {
4152 // worker changes CG, need to check if old CG should be freed
4153 int i = tmp->cg_nthreads--;
4154 KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4155 " on node %p of thread %p to %d\n",
4156 this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4157 if (i == 1) {
4158 __kmp_free(tmp); // last thread left CG --> free it
4159 }
4160 }
4161 this_thr->th.th_cg_roots = master->th.th_cg_roots;
4162 // Increment new thread's CG root's counter to add the new thread
4163 this_thr->th.th_cg_roots->cg_nthreads++;
4164 KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4165 " node %p of thread %p to %d\n",
4166 this_thr, this_thr->th.th_cg_roots,
4167 this_thr->th.th_cg_roots->cg_root,
4168 this_thr->th.th_cg_roots->cg_nthreads));
4169 this_thr->th.th_current_task->td_icvs.thread_limit =
4170 this_thr->th.th_cg_roots->cg_thread_limit;
4171 }
4172
4173 /* Initialize dynamic dispatch */
4174 {
4175 volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4176 // Use team max_nproc since this will never change for the team.
4177 size_t disp_size =
4178 sizeof(dispatch_private_info_t) *
4179 (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4180 KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4181 team->t.t_max_nproc));
4182 KMP_ASSERT(dispatch);
4183 KMP_DEBUG_ASSERT(team->t.t_dispatch);
4184 KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4185
4186 dispatch->th_disp_index = 0;
4187 dispatch->th_doacross_buf_idx = 0;
4188 if (!dispatch->th_disp_buffer) {
4189 dispatch->th_disp_buffer =
4190 (dispatch_private_info_t *)__kmp_allocate(disp_size);
4191
4192 if (__kmp_storage_map) {
4193 __kmp_print_storage_map_gtid(
4194 gtid, &dispatch->th_disp_buffer[0],
4195 &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4196 ? 1
4197 : __kmp_dispatch_num_buffers],
4198 disp_size,
4199 "th_%d.th_dispatch.th_disp_buffer "
4200 "(team_%d.t_dispatch[%d].th_disp_buffer)",
4201 gtid, team->t.t_id, gtid);
4202 }
4203 } else {
4204 memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4205 }
4206
4207 dispatch->th_dispatch_pr_current = 0;
4208 dispatch->th_dispatch_sh_current = 0;
4209
4210 dispatch->th_deo_fcn = 0; /* ORDERED */
4211 dispatch->th_dxo_fcn = 0; /* END ORDERED */
4212 }
4213
4214 this_thr->th.th_next_pool = NULL;
4215
4216 if (!this_thr->th.th_task_state_memo_stack) {
4217 size_t i;
4218 this_thr->th.th_task_state_memo_stack =
4219 (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4220 this_thr->th.th_task_state_top = 0;
4221 this_thr->th.th_task_state_stack_sz = 4;
4222 for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4223 ++i) // zero init the stack
4224 this_thr->th.th_task_state_memo_stack[i] = 0;
4225 }
4226
4227 KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4228 KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4229
4230 KMP_MB();
4231}
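// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the runtime, not compiled): the contention
// group handling above is manual reference counting -- a worker leaving one CG
// decrements its count (freeing the node when it reaches zero) and increments
// the count of the CG it joins, inheriting that CG's thread limit. A minimal
// single-threaded model of the move; in the runtime the counts are protected
// by the fork/join lock.
#if 0
#include <cstdlib>

struct toy_cg {
  int nthreads;     // how many threads currently belong to this group
  int thread_limit; // limit inherited by members of the group
};

static void toy_move_to_cg(toy_cg **current, toy_cg *target, int *my_limit) {
  toy_cg *old_cg = *current;
  if (old_cg == target)
    return; // already in the right group, nothing to do
  if (old_cg != nullptr && --old_cg->nthreads == 0)
    std::free(old_cg); // last member left, the group node can go away
  target->nthreads++;  // join the new group
  *current = target;
  *my_limit = target->thread_limit;
}
#endif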
4232
4233/* Allocate a new thread for the requesting team. This is only called from
4234 within a fork/join critical section. We first try to get an available
4235 thread from the thread pool; if none is available, we fork a new one,
4236 assuming we are able to create one. This should be assured, as the
4237 caller should have checked for that first. */
4238kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4239 int new_tid) {
4240 kmp_team_t *serial_team;
4241 kmp_info_t *new_thr;
4242 int new_gtid;
4243
4244 KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4245 KMP_DEBUG_ASSERT(root && team);
4246#if !KMP_NESTED_HOT_TEAMS
4247 KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4248#endif
4249 KMP_MB();
4250
4251 /* first, try to get one from the thread pool */
4252 if (__kmp_thread_pool) {
4253 new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4254 __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4255 if (new_thr == __kmp_thread_pool_insert_pt) {
4256 __kmp_thread_pool_insert_pt = NULL;
4257 }
4258 TCW_4(new_thr->th.th_in_pool, FALSE);
4259 __kmp_suspend_initialize_thread(new_thr);
4260 __kmp_lock_suspend_mx(new_thr);
4261 if (new_thr->th.th_active_in_pool == TRUE) {
4262 KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4263 KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4264 new_thr->th.th_active_in_pool = FALSE;
4265 }
4266 __kmp_unlock_suspend_mx(new_thr);
4267
4268 KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4269 __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4270 KMP_ASSERT(!new_thr->th.th_team);
4271 KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4272
4273 /* setup the thread structure */
4274 __kmp_initialize_info(new_thr, team, new_tid,
4275 new_thr->th.th_info.ds.ds_gtid);
4276 KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4277
4278 TCW_4(__kmp_nth, __kmp_nth + 1);
4279
4280 new_thr->th.th_task_state = 0;
4281 new_thr->th.th_task_state_top = 0;
4282 new_thr->th.th_task_state_stack_sz = 4;
4283
4284#ifdef KMP_ADJUST_BLOCKTIME
4285 /* Adjust blocktime back to zero if necessary */
4286 /* Middle initialization might not have occurred yet */
4287 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4288 if (__kmp_nth > __kmp_avail_proc) {
4289 __kmp_zero_bt = TRUE;
4290 }
4291 }
4292#endif /* KMP_ADJUST_BLOCKTIME */
4293
4294#if KMP_DEBUG
4295 // If thread entered pool via __kmp_free_thread, wait_flag should !=
4296 // KMP_BARRIER_PARENT_FLAG.
4297 int b;
4298 kmp_balign_t *balign = new_thr->th.th_bar;
4299 for (b = 0; b < bs_last_barrier; ++b)
4300 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4301#endif
4302
4303 KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4304 __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4305
4306 KMP_MB();
4307 return new_thr;
4308 }
4309
4310 /* no, we'll fork a new one */
4311 KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4312 KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4313
4314#if KMP_USE_MONITOR
4315 // If this is the first worker thread the RTL is creating, then also
4316 // launch the monitor thread. We try to do this as early as possible.
4317 if (!TCR_4(__kmp_init_monitor)) {
4318 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4319 if (!TCR_4(__kmp_init_monitor)) {
4320 KF_TRACE(10, ("before __kmp_create_monitor\n"));
4321 TCW_4(__kmp_init_monitor, 1);
4322 __kmp_create_monitor(&__kmp_monitor);
4323 KF_TRACE(10, ("after __kmp_create_monitor\n"));
4324#if KMP_OS_WINDOWS
4325 // AC: wait until monitor has started. This is a fix for CQ232808.
4326 // The reason is that if the library is loaded/unloaded in a loop with
4327 // small (parallel) work in between, then there is a high probability that
4328 // the monitor thread starts only after the library shutdown. At shutdown it
4329 // is too late to cope with the problem, because when the primary thread is
4330 // in DllMain (process detach) the monitor has no chance to start (it is
4331 // blocked), and the primary thread has no means to inform the monitor that
4332 // the library has gone, because all the memory the monitor can access is
4333 // going to be released/reset.
4334 while (TCR_4(__kmp_init_monitor) < 2) {
4335 KMP_YIELD(TRUE);
4336 }
4337 KF_TRACE(10, ("after monitor thread has started\n"));
4338#endif
4339 }
4340 __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4341 }
4342#endif
4343
4344 KMP_MB();
4345
4346 {
4347 int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads)
4348 ? 1
4349 : __kmp_hidden_helper_threads_num + 1;
4350
4351 for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL;
4352 ++new_gtid) {
4353 KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4354 }
4355
4356 if (TCR_4(__kmp_init_hidden_helper_threads)) {
4357 KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num);
4358 }
4359 }
4360
4361 /* allocate space for it. */
4362 new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4363
4364 TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4365
4366#if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
4367 // suppress race conditions detection on synchronization flags in debug mode
4368 // this helps to analyze library internals eliminating false positives
4369 __itt_suppress_mark_range(
4370 __itt_suppress_range, __itt_suppress_threading_errors,
4371 &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc));
4372 __itt_suppress_mark_range(
4373 __itt_suppress_range, __itt_suppress_threading_errors,
4374 &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state));
4375#if KMP_OS_WINDOWS
4376 __itt_suppress_mark_range(
4377 __itt_suppress_range, __itt_suppress_threading_errors,
4378 &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init));
4379#else
4380 __itt_suppress_mark_range(__itt_suppress_range,
4381 __itt_suppress_threading_errors,
4382 &new_thr->th.th_suspend_init_count,
4383 sizeof(new_thr->th.th_suspend_init_count));
4384#endif
4385 // TODO: check if we need to also suppress b_arrived flags
4386 __itt_suppress_mark_range(__itt_suppress_range,
4387 __itt_suppress_threading_errors,
4388 CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go),
4389 sizeof(new_thr->th.th_bar[0].bb.b_go));
4390 __itt_suppress_mark_range(__itt_suppress_range,
4391 __itt_suppress_threading_errors,
4392 CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go),
4393 sizeof(new_thr->th.th_bar[1].bb.b_go));
4394 __itt_suppress_mark_range(__itt_suppress_range,
4395 __itt_suppress_threading_errors,
4396 CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go),
4397 sizeof(new_thr->th.th_bar[2].bb.b_go));
4398#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
4399 if (__kmp_storage_map) {
4400 __kmp_print_thread_storage_map(new_thr, new_gtid);
4401 }
4402
4403 // add the reserve serialized team, initialized from the team's primary thread
4404 {
4405 kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4406 KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4407 new_thr->th.th_serial_team = serial_team =
4408 (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4409#if OMPT_SUPPORT
4410 ompt_data_none, // root parallel id
4411#endif
4412 proc_bind_default, &r_icvs,
4413 0 USE_NESTED_HOT_ARG(NULL));
4414 }
4415 KMP_ASSERT(serial_team);
4416 serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for
4417 // execution (it is unused for now).
4418 serial_team->t.t_threads[0] = new_thr;
4419 KF_TRACE(10,
4420 ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4421 new_thr));
4422
4423 /* setup the thread structures */
4424 __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4425
4426#if USE_FAST_MEMORY
4427 __kmp_initialize_fast_memory(new_thr);
4428#endif /* USE_FAST_MEMORY */
4429
4430#if KMP_USE_BGET
4431 KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4432 __kmp_initialize_bget(new_thr);
4433#endif
4434
4435 __kmp_init_random(new_thr); // Initialize random number generator
4436
4437 /* Initialize these only once when thread is grabbed for a team allocation */
4438 KA_TRACE(20,
4439 ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4440 __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4441
4442 int b;
4443 kmp_balign_t *balign = new_thr->th.th_bar;
4444 for (b = 0; b < bs_last_barrier; ++b) {
4445 balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4446 balign[b].bb.team = NULL;
4447 balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4448 balign[b].bb.use_oncore_barrier = 0;
4449 }
4450
4451 new_thr->th.th_spin_here = FALSE;
4452 new_thr->th.th_next_waiting = 0;
4453#if KMP_OS_UNIX
4454 new_thr->th.th_blocking = false;
4455#endif
4456
4457#if KMP_AFFINITY_SUPPORTED
4458 new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4459 new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4460 new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4461 new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4462#endif
4463 new_thr->th.th_def_allocator = __kmp_def_allocator;
4464 new_thr->th.th_prev_level = 0;
4465 new_thr->th.th_prev_num_threads = 1;
4466
4467 TCW_4(new_thr->th.th_in_pool, FALSE);
4468 new_thr->th.th_active_in_pool = FALSE;
4469 TCW_4(new_thr->th.th_active, TRUE);
4470
4471 /* adjust the global counters */
4472 __kmp_all_nth++;
4473 __kmp_nth++;
4474
4475 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4476 // numbers of procs, and method #2 (keyed API call) for higher numbers.
4477 if (__kmp_adjust_gtid_mode) {
4478 if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4479 if (TCR_4(__kmp_gtid_mode) != 2) {
4480 TCW_4(__kmp_gtid_mode, 2);
4481 }
4482 } else {
4483 if (TCR_4(__kmp_gtid_mode) != 1) {
4484 TCW_4(__kmp_gtid_mode, 1);
4485 }
4486 }
4487 }
4488
4489#ifdef KMP_ADJUST_BLOCKTIME
4490 /* Adjust blocktime back to zero if necessary */
4491 /* Middle initialization might not have occurred yet */
4492 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4493 if (__kmp_nth > __kmp_avail_proc) {
4494 __kmp_zero_bt = TRUE;
4495 }
4496 }
4497#endif /* KMP_ADJUST_BLOCKTIME */
4498
4499 /* actually fork it and create the new worker thread */
4500 KF_TRACE(
4501 10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4502 __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4503 KF_TRACE(10,
4504 ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4505
4506 KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4507 new_gtid));
4508 KMP_MB();
4509 return new_thr;
4510}
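// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the runtime, not compiled): the allocation
// path above is a "pool first, create on miss" scheme -- pop a parked worker
// from a singly linked free list if one exists, otherwise pay the cost of
// forking a fresh one. A toy single-threaded model; the real pool is protected
// by the fork/join lock, and all names here are hypothetical.
#if 0
struct toy_worker {
  struct toy_worker *next_pool; // link used only while parked in the pool
  int gtid;
};

static toy_worker *toy_pool_head = nullptr;

static toy_worker *toy_acquire_worker(toy_worker *(*create_worker)(void)) {
  if (toy_pool_head != nullptr) {
    toy_worker *w = toy_pool_head; // reuse: O(1) pop from the pool
    toy_pool_head = w->next_pool;
    w->next_pool = nullptr;
    return w;
  }
  return create_worker(); // miss: fork a brand new worker
}
#endif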
4511
4512/* Reinitialize team for reuse.
4513 The hot team code calls this routine at every fork barrier, so EPCC barrier
4514 tests are extremely sensitive to changes in it, esp. writes to the team
4515 struct, which cause a cache invalidation in all threads.
4516 IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4517static void __kmp_reinitialize_team(kmp_team_t *team,
4518 kmp_internal_control_t *new_icvs,
4519 ident_t *loc) {
4520 KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4521 team->t.t_threads[0], team));
4522 KMP_DEBUG_ASSERT(team && new_icvs);
4523 KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4524 KMP_CHECK_UPDATE(team->t.t_ident, loc);
4525
4526 KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4527 // Copy ICVs to the primary thread's implicit taskdata
4528 __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4529 copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4530
4531 KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4532 team->t.t_threads[0], team));
4533}
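// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the runtime, not compiled): the
// KMP_CHECK_UPDATE uses above avoid dirtying shared cache lines on the hot
// fork path by writing a team field only when the value actually changes. The
// idea, spelled out as a tiny helper.
#if 0
template <typename T>
static inline void toy_check_update(T &field, const T &value) {
  if (!(field == value)) // cheap read of a (likely cached) shared line
    field = value;       // write only when it would change the line
}
#endif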
4534
4535/* Initialize the team data structure.
4536 This assumes the t_threads and t_max_nproc are already set.
4537 Also, we don't touch the arguments */
4538static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4539 kmp_internal_control_t *new_icvs,
4540 ident_t *loc) {
4541 KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4542
4543 /* verify */
4544 KMP_DEBUG_ASSERT(team);
4545 KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4546 KMP_DEBUG_ASSERT(team->t.t_threads);
4547 KMP_MB();
4548
4549 team->t.t_master_tid = 0; /* not needed */
4550 /* team->t.t_master_bar; not needed */
4551 team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4552 team->t.t_nproc = new_nproc;
4553
4554 /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */
4555 team->t.t_next_pool = NULL;
4556 /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4557 * up hot team */
4558
4559 TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4560 team->t.t_invoke = NULL; /* not needed */
4561
4562 // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4563 team->t.t_sched.sched = new_icvs->sched.sched;
4564
4565#if KMP_ARCH_X86 || KMP_ARCH_X86_64
4566 team->t.t_fp_control_saved = FALSE; /* not needed */
4567 team->t.t_x87_fpu_control_word = 0; /* not needed */
4568 team->t.t_mxcsr = 0; /* not needed */
4569#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4570
4571 team->t.t_construct = 0;
4572
4573 team->t.t_ordered.dt.t_value = 0;
4574 team->t.t_master_active = FALSE;
4575
4576#ifdef KMP_DEBUG
4577 team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4578#endif
4579#if KMP_OS_WINDOWS
4580 team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4581#endif
4582
4583 team->t.t_control_stack_top = NULL;
4584
4585 __kmp_reinitialize_team(team, new_icvs, loc);
4586
4587 KMP_MB();
4588 KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4589}
4590
4591#if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
4592/* Sets the full mask for the thread and saves the old mask in *old_mask; no changes to structures. */
4593static void
4594__kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4595 if (KMP_AFFINITY_CAPABLE()) {
4596 int status;
4597 if (old_mask != NULL) {
4598 status = __kmp_get_system_affinity(old_mask, TRUE);
4599 int error = errno;
4600 if (status != 0) {
4601 __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
4602 __kmp_msg_null);
4603 }
4604 }
4605 __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4606 }
4607}
4608#endif
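// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the runtime, not compiled): the helper
// above temporarily widens the calling thread's affinity to the full mask (so
// a newly created worker inherits an unrestricted mask) and lets the caller
// restore the saved mask afterwards. A Linux-only model using the pthread NP
// calls; error handling is reduced to a return code.
#if 0
#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>

static int toy_widen_affinity(cpu_set_t *saved) {
  if (pthread_getaffinity_np(pthread_self(), sizeof(*saved), saved) != 0)
    return -1; // could not save the current mask
  cpu_set_t full;
  CPU_ZERO(&full);
  for (int cpu = 0; cpu < CPU_SETSIZE; ++cpu)
    CPU_SET(cpu, &full); // request every CPU the mask can express
  return pthread_setaffinity_np(pthread_self(), sizeof(full), &full);
}

// The caller later restores the original binding with:
//   pthread_setaffinity_np(pthread_self(), sizeof(*saved), saved);
#endif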
4609
4610#if KMP_AFFINITY_SUPPORTED
4611
4612// __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4613// It calculates the worker + primary thread's partition based upon the parent
4614// thread's partition, and binds each worker to a place in its partition.
4615// The primary thread's partition should already include its current binding.
4616static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4617 // Do not partition places for the hidden helper team
4618 if (KMP_HIDDEN_HELPER_TEAM(team))
4619 return;
4620 // Copy the primary thread's place partition to the team struct
4621 kmp_info_t *master_th = team->t.t_threads[0];
4622 KMP_DEBUG_ASSERT(master_th != NULL);
4623 kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4624 int first_place = master_th->th.th_first_place;
4625 int last_place = master_th->th.th_last_place;
4626 int masters_place = master_th->th.th_current_place;
4627 team->t.t_first_place = first_place;
4628 team->t.t_last_place = last_place;
4629
4630 KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4631 "bound to place %d partition = [%d,%d]\n",
4632 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4633 team->t.t_id, masters_place, first_place, last_place));
4634
4635 switch (proc_bind) {
4636
4637 case proc_bind_default:
4638 // Serial teams might have the proc_bind policy set to proc_bind_default.
4639 // Not an issue -- we don't rebind primary thread for any proc_bind policy.
4640 KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4641 break;
4642
4643 case proc_bind_primary: {
4644 int f;
4645 int n_th = team->t.t_nproc;
4646 for (f = 1; f < n_th; f++) {
4647 kmp_info_t *th = team->t.t_threads[f];
4648 KMP_DEBUG_ASSERT(th != NULL);
4649 th->th.th_first_place = first_place;
4650 th->th.th_last_place = last_place;
4651 th->th.th_new_place = masters_place;
4652 if (__kmp_display_affinity && masters_place != th->th.th_current_place &&
4653 team->t.t_display_affinity != 1) {
4654 team->t.t_display_affinity = 1;
4655 }
4656
4657 KA_TRACE(100, ("__kmp_partition_places: primary: T#%d(%d:%d) place %d "
4658 "partition = [%d,%d]\n",
4659 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4660 f, masters_place, first_place, last_place));
4661 }
4662 } break;
4663
4664 case proc_bind_close: {
4665 int f;
4666 int n_th = team->t.t_nproc;
4667 int n_places;
4668 if (first_place <= last_place) {
4669 n_places = last_place - first_place + 1;
4670 } else {
4671 n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4672 }
4673 if (n_th <= n_places) {
4674 int place = masters_place;
4675 for (f = 1; f < n_th; f++) {
4676 kmp_info_t *th = team->t.t_threads[f];
4677 KMP_DEBUG_ASSERT(th != NULL);
4678
4679 if (place == last_place) {
4680 place = first_place;
4681 } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4682 place = 0;
4683 } else {
4684 place++;
4685 }
4686 th->th.th_first_place = first_place;
4687 th->th.th_last_place = last_place;
4688 th->th.th_new_place = place;
4689 if (__kmp_display_affinity && place != th->th.th_current_place &&
4690 team->t.t_display_affinity != 1) {
4691 team->t.t_display_affinity = 1;
4692 }
4693
4694 KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4695 "partition = [%d,%d]\n",
4696 __kmp_gtid_from_thread(team->t.t_threads[f]),
4697 team->t.t_id, f, place, first_place, last_place));
4698 }
4699 } else {
4700 int S, rem, gap, s_count;
4701 S = n_th / n_places;
4702 s_count = 0;
4703 rem = n_th - (S * n_places);
4704 gap = rem > 0 ? n_places / rem : n_places;
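// Worked example (illustrative): n_th = 7 threads over n_places = 3 places
// gives S = 2, rem = 1, gap = 3; the loop below packs the places with 3, 2
// and 2 threads starting at the primary thread's place, so the remainder
// thread lands on the first place and 'place' finally wraps back to
// masters_place.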
4705 int place = masters_place;
4706 int gap_ct = gap;
4707 for (f = 0; f < n_th; f++) {
4708 kmp_info_t *th = team->t.t_threads[f];
4709 KMP_DEBUG_ASSERT(th != NULL);
4710
4711 th->th.th_first_place = first_place;
4712 th->th.th_last_place = last_place;
4713 th->th.th_new_place = place;
4714 if (__kmp_display_affinity && place != th->th.th_current_place &&
4715 team->t.t_display_affinity != 1) {
4716 team->t.t_display_affinity = 1;
4717 }
4718 s_count++;
4719
4720 if ((s_count == S) && rem && (gap_ct == gap)) {
4721 // do nothing; an extra thread will be added to this place on the next iteration
4722 } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4723 // we added an extra thread to this place; move to next place
4724 if (place == last_place) {
4725 place = first_place;
4726 } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4727 place = 0;
4728 } else {
4729 place++;
4730 }
4731 s_count = 0;
4732 gap_ct = 1;
4733 rem--;
4734 } else if (s_count == S) { // place full; don't add extra
4735 if (place == last_place) {
4736 place = first_place;
4737 } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4738 place = 0;
4739 } else {
4740 place++;
4741 }
4742 gap_ct++;
4743 s_count = 0;
4744 }
4745
4746 KA_TRACE(100,
4747 ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4748 "partition = [%d,%d]\n",
4749 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4750 th->th.th_new_place, first_place, last_place));
4751 }
4752 KMP_DEBUG_ASSERT(place == masters_place);
4753 }
4754 } break;
4755
4756 case proc_bind_spread: {
4757 int f;
4758 int n_th = team->t.t_nproc;
4759 int n_places;
4760 int thidx;
4761 if (first_place <= last_place) {
4762 n_places = last_place - first_place + 1;
4763 } else {
4764 n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4765 }
4766 if (n_th <= n_places) {
4767 int place = -1;
4768
4769 if (n_places != static_cast<int>(__kmp_affinity_num_masks)) {
4770 int S = n_places / n_th;
4771 int s_count, rem, gap, gap_ct;
4772
4773 place = masters_place;
4774 rem = n_places - n_th * S;
4775 gap = rem ? n_th / rem : 1;
4776 gap_ct = gap;
4777 thidx = n_th;
4778 if (update_master_only == 1)
4779 thidx = 1;
4780 for (f = 0; f < thidx; f++) {
4781 kmp_info_t *th = team->t.t_threads[f];
4782 KMP_DEBUG_ASSERT(th != NULL);
4783
4784 th->th.th_first_place = place;
4785 th->th.th_new_place = place;
4786 if (__kmp_display_affinity && place != th->th.th_current_place &&
4787 team->t.t_display_affinity != 1) {
4788 team->t.t_display_affinity = 1;
4789 }
4790 s_count = 1;
4791 while (s_count < S) {
4792 if (place == last_place) {
4793 place = first_place;
4794 } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4795 place = 0;
4796 } else {
4797 place++;
4798 }
4799 s_count++;
4800 }
4801 if (rem && (gap_ct == gap)) {
4802 if (place == last_place) {
4803 place = first_place;
4804 } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4805 place = 0;
4806 } else {
4807 place++;
4808 }
4809 rem--;
4810 gap_ct = 0;
4811 }
4812 th->th.th_last_place = place;
4813 gap_ct++;
4814
4815 if (place == last_place) {
4816 place = first_place;
4817 } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4818 place = 0;
4819 } else {
4820 place++;
4821 }
4822
4823 KA_TRACE(100,
4824 ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4825 "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
4826 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4827 f, th->th.th_new_place, th->th.th_first_place,
4828 th->th.th_last_place, __kmp_affinity_num_masks));
4829 }
4830 } else {
4831 /* Given a uniform space of available computation places, we can create
4832 T partitions of roughly P/T places each and put each thread into the
4833 first place of its partition. */
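/* Worked example (illustrative): masters_place = 0, n_places =
   __kmp_affinity_num_masks = 8 and n_th = 3 give spacing = 3.0, so the
   threads receive the disjoint sub-partitions [0,2], [3,5] and [6,7],
   each thread bound to the first place of its own partition. */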
4834 double current = static_cast<double>(masters_place);
4835 double spacing =
4836 (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
4837 int first, last;
4838 kmp_info_t *th;
4839
4840 thidx = n_th + 1;
4841 if (update_master_only == 1)
4842 thidx = 1;
4843 for (f = 0; f < thidx; f++) {
4844 first = static_cast<int>(current);
4845 last = static_cast<int>(current + spacing) - 1;
4846 KMP_DEBUG_ASSERT(last >= first);
4847 if (first >= n_places) {
4848 if (masters_place) {
4849 first -= n_places;
4850 last -= n_places;
4851 if (first == (masters_place + 1)) {
4852 KMP_DEBUG_ASSERT(f == n_th);
4853 first--;
4854 }
4855 if (last == masters_place) {
4856 KMP_DEBUG_ASSERT(f == (n_th - 1));
4857 last--;
4858 }
4859 } else {
4860 KMP_DEBUG_ASSERT(f == n_th);
4861 first = 0;
4862 last = 0;
4863 }
4864 }
4865 if (last >= n_places) {
4866 last = (n_places - 1);
4867 }
4868 place = first;
4869 current += spacing;
4870 if (f < n_th) {
4871 KMP_DEBUG_ASSERT(0 <= first);
4872 KMP_DEBUG_ASSERT(n_places > first);
4873 KMP_DEBUG_ASSERT(0 <= last);
4874 KMP_DEBUG_ASSERT(n_places > last);
4875 KMP_DEBUG_ASSERT(last_place >= first_place);
4876 th = team->t.t_threads[f];
4877 KMP_DEBUG_ASSERT(th);
4878 th->th.th_first_place = first;
4879 th->th.th_new_place = place;
4880 th->th.th_last_place = last;
4881 if (__kmp_display_affinity && place != th->th.th_current_place &&
4882 team->t.t_display_affinity != 1) {
4883 team->t.t_display_affinity = 1;
4884 }
4885 KA_TRACE(100,
4886 ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4887 "partition = [%d,%d], spacing = %.4f\n",
4888 __kmp_gtid_from_thread(team->t.t_threads[f]),
4889 team->t.t_id, f, th->th.th_new_place,
4890 th->th.th_first_place, th->th.th_last_place, spacing));
4891 }
4892 }
4893 }
4894 KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4895 } else {
4896 int S, rem, gap, s_count;
4897 S = n_th / n_places;
4898 s_count = 0;
4899 rem = n_th - (S * n_places);
4900 gap = rem > 0 ? n_places / rem : n_places;
4901 int place = masters_place;
4902 int gap_ct = gap;
4903 thidx = n_th;
4904 if (update_master_only == 1)
4905 thidx = 1;
4906 for (f = 0; f < thidx; f++) {
4907 kmp_info_t *th = team->t.t_threads[f];
4908 KMP_DEBUG_ASSERT(th != NULL);
4909
4910 th->th.th_first_place = place;
4911 th->th.th_last_place = place;
4912 th->th.th_new_place = place;
4913 if (__kmp_display_affinity && place != th->th.th_current_place &&
4914 team->t.t_display_affinity != 1) {
4915 team->t.t_display_affinity = 1;
4916 }
4917 s_count++;
4918
4919 if ((s_count == S) && rem && (gap_ct == gap)) {
4920 // do nothing; an extra thread will be added to this place on the next iteration
4921 } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4922 // we added an extra thread to this place; move on to next place
4923 if (place == last_place) {
4924 place = first_place;
4925 } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4926 place = 0;
4927 } else {
4928 place++;
4929 }
4930 s_count = 0;
4931 gap_ct = 1;
4932 rem--;
4933 } else if (s_count == S) { // place is full; don't add extra thread
4934 if (place == last_place) {
4935 place = first_place;
4936 } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4937 place = 0;
4938 } else {
4939 place++;
4940 }
4941 gap_ct++;
4942 s_count = 0;
4943 }
4944
4945 KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4946 "partition = [%d,%d]\n",
4947 __kmp_gtid_from_thread(team->t.t_threads[f]),
4948 team->t.t_id, f, th->th.th_new_place,
4949 th->th.th_first_place, th->th.th_last_place));
4950 }
4951 KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4952 }
4953 } break;
4954
4955 default:
4956 break;
4957 }
4958
4959 KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
4960}
4961
4962#endif // KMP_AFFINITY_SUPPORTED
4963
4964/* allocate a new team data structure to use. take one off of the free pool if
4965 available */
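/* Selection order in the function below, as a simplified sketch (comments
   only, not the exact code):

     if (hot team usable && new_nproc > 1)     // root not active, or a nested
       return resized/reinitialized hot team;  //   hot team exists at this level
     for (team in __kmp_team_pool)
       if (team->t.t_max_nproc >= max_nproc)
         return reinitialized pooled team;     // undersized teams are reaped
     return freshly __kmp_allocate()d team;    // arrays sized for max_nproc */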
4966kmp_team_t *
4967__kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
4968#if OMPT_SUPPORT
4969 ompt_data_t ompt_parallel_data,
4970#endif
4971 kmp_proc_bind_t new_proc_bind,
4972 kmp_internal_control_t *new_icvs,
4973 int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
4974 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
4975 int f;
4976 kmp_team_t *team;
4977 int use_hot_team = !root->r.r_active;
4978 int level = 0;
4979
4980 KA_TRACE(20, ("__kmp_allocate_team: called\n"));
4981 KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
4982 KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
4983 KMP_MB();
4984
4985#if KMP_NESTED_HOT_TEAMS
4986 kmp_hot_team_ptr_t *hot_teams;
4987 if (master) {
4988 team = master->th.th_team;
4989 level = team->t.t_active_level;
4990 if (master->th.th_teams_microtask) { // in teams construct?
4991 if (master->th.th_teams_size.nteams > 1 &&
4992 ( // #teams > 1
4993 team->t.t_pkfn ==
4994 (microtask_t)__kmp_teams_master || // inner fork of the teams
4995 master->th.th_teams_level <
4996 team->t.t_level)) { // or nested parallel inside the teams
4997 ++level; // do not increment if #teams==1 or for the outer fork of the
4998 // teams; increment otherwise
4999 }
5000 }
5001 hot_teams = master->th.th_hot_teams;
5002 if (level < __kmp_hot_teams_max_level && hot_teams &&
5003 hot_teams[level].hot_team) {
5004 // hot team has already been allocated for given level
5005 use_hot_team = 1;
5006 } else {
5007 use_hot_team = 0;
5008 }
5009 } else {
5010 // check we won't access uninitialized hot_teams, just in case
5011 KMP_DEBUG_ASSERT(new_nproc == 1);
5012 }
5013#endif
5014 // Optimization to use a "hot" team
5015 if (use_hot_team && new_nproc > 1) {
5016 KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
5017#if KMP_NESTED_HOT_TEAMS
5018 team = hot_teams[level].hot_team;
5019#else
5020 team = root->r.r_hot_team;
5021#endif
5022#if KMP_DEBUG
5023 if (__kmp_tasking_mode != tskm_immediate_exec) {
5024 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5025 "task_team[1] = %p before reinit\n",
5026 team->t.t_task_team[0], team->t.t_task_team[1]));
5027 }
5028#endif
5029
5030 // Has the number of threads changed?
5031 /* Let's assume the most common case is that the number of threads is
5032 unchanged, and put that case first. */
5033 if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
5034 KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
5035 // This case can mean that omp_set_num_threads() was called and the hot
5036 // team size was already reduced, so we check the special flag
5037 if (team->t.t_size_changed == -1) {
5038 team->t.t_size_changed = 1;
5039 } else {
5040 KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
5041 }
5042
5043 // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5044 kmp_r_sched_t new_sched = new_icvs->sched;
5045 // set primary thread's schedule as new run-time schedule
5046 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
5047
5048 __kmp_reinitialize_team(team, new_icvs,
5049 root->r.r_uber_thread->th.th_ident);
5050
5051 KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
5052 team->t.t_threads[0], team));
5053 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5054
5055#if KMP_AFFINITY_SUPPORTED
5056 if ((team->t.t_size_changed == 0) &&
5057 (team->t.t_proc_bind == new_proc_bind)) {
5058 if (new_proc_bind == proc_bind_spread) {
5059 __kmp_partition_places(
5060 team, 1); // add flag to update only master for spread
5061 }
5062 KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5063 "proc_bind = %d, partition = [%d,%d]\n",
5064 team->t.t_id, new_proc_bind, team->t.t_first_place,
5065 team->t.t_last_place));
5066 } else {
5067 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5068 __kmp_partition_places(team);
5069 }
5070#else
5071 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5072#endif /* KMP_AFFINITY_SUPPORTED */
5073 } else if (team->t.t_nproc > new_nproc) {
5074 KA_TRACE(20,
5075 ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5076 new_nproc));
5077
5078 team->t.t_size_changed = 1;
5079#if KMP_NESTED_HOT_TEAMS
5080 if (__kmp_hot_teams_mode == 0) {
5081 // AC: the saved number of threads should correspond to the team's value in
5082 // this mode; it can be bigger in mode 1, when the hot team keeps threads in reserve
5083 KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5084 hot_teams[level].hot_team_nth = new_nproc;
5085#endif // KMP_NESTED_HOT_TEAMS
5086 /* release the extra threads we don't need any more */
5087 for (f = new_nproc; f < team->t.t_nproc; f++) {
5088 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5089 if (__kmp_tasking_mode != tskm_immediate_exec) {
5090 // When decreasing team size, threads no longer in the team should
5091 // unref task team.
5092 team->t.t_threads[f]->th.th_task_team = NULL;
5093 }
5094 __kmp_free_thread(team->t.t_threads[f]);
5095 team->t.t_threads[f] = NULL;
5096 }
5097#if KMP_NESTED_HOT_TEAMS
5098 } // (__kmp_hot_teams_mode == 0)
5099 else {
5100 // When keeping extra threads in team, switch threads to wait on own
5101 // b_go flag
5102 for (f = new_nproc; f < team->t.t_nproc; ++f) {
5103 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5104 kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5105 for (int b = 0; b < bs_last_barrier; ++b) {
5106 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5107 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5108 }
5109 KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5110 }
5111 }
5112 }
5113#endif // KMP_NESTED_HOT_TEAMS
5114 team->t.t_nproc = new_nproc;
5115 // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5116 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5117 __kmp_reinitialize_team(team, new_icvs,
5118 root->r.r_uber_thread->th.th_ident);
5119
5120 // Update remaining threads
5121 for (f = 0; f < new_nproc; ++f) {
5122 team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5123 }
5124
5125 // restore the current task state of the primary thread: should be the
5126 // implicit task
5127 KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5128 team->t.t_threads[0], team));
5129
5130 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5131
5132#ifdef KMP_DEBUG
5133 for (f = 0; f < team->t.t_nproc; f++) {
5134 KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5135 team->t.t_threads[f]->th.th_team_nproc ==
5136 team->t.t_nproc);
5137 }
5138#endif
5139
5140 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5141#if KMP_AFFINITY_SUPPORTED
5142 __kmp_partition_places(team);
5143#endif
5144 } else { // team->t.t_nproc < new_nproc
5145#if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5146 kmp_affin_mask_t *old_mask;
5147 if (KMP_AFFINITY_CAPABLE()) {
5148 KMP_CPU_ALLOC(old_mask);
5149 }
5150#endif
5151
5152 KA_TRACE(20,
5153 ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5154 new_nproc));
5155
5156 team->t.t_size_changed = 1;
5157
5158#if KMP_NESTED_HOT_TEAMS
5159 int avail_threads = hot_teams[level].hot_team_nth;
5160 if (new_nproc < avail_threads)
5161 avail_threads = new_nproc;
5162 kmp_info_t **other_threads = team->t.t_threads;
5163 for (f = team->t.t_nproc; f < avail_threads; ++f) {
5164 // Adjust barrier data of reserved threads (if any) of the team
5165 // Other data will be set in __kmp_initialize_info() below.
5166 int b;
5167 kmp_balign_t *balign = other_threads[f]->th.th_bar;
5168 for (b = 0; b < bs_last_barrier; ++b) {
5169 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5170 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5171#if USE_DEBUGGER
5172 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5173#endif
5174 }
5175 }
5176 if (hot_teams[level].hot_team_nth >= new_nproc) {
5177 // we have all needed threads in reserve, no need to allocate any
5178 // this is only possible in mode 1; there cannot be reserved threads in mode 0
5179 KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5180 team->t.t_nproc = new_nproc; // just get reserved threads involved
5181 } else {
5182 // we may have some threads in reserve, but not enough
5183 team->t.t_nproc =
5184 hot_teams[level]
5185 .hot_team_nth; // get reserved threads involved if any
5186 hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5187#endif // KMP_NESTED_HOT_TEAMS
5188 if (team->t.t_max_nproc < new_nproc) {
5189 /* reallocate larger arrays */
5190 __kmp_reallocate_team_arrays(team, new_nproc);
5191 __kmp_reinitialize_team(team, new_icvs, NULL);
5192 }
5193
5194#if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5195 /* Temporarily set full mask for primary thread before creation of
5196 workers. The reason is that workers inherit the affinity from the
5197 primary thread, so if a lot of workers are created on the single
5198 core quickly, they don't get a chance to set their own affinity for
5199 a long time. */
5200 __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5201#endif
5202
5203 /* allocate new threads for the hot team */
5204 for (f = team->t.t_nproc; f < new_nproc; f++) {
5205 kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5206 KMP_DEBUG_ASSERT(new_worker);
5207 team->t.t_threads[f] = new_worker;
5208
5209 KA_TRACE(20,
5210 ("__kmp_allocate_team: team %d init T#%d arrived: "
5211 "join=%llu, plain=%llu\n",
5212 team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5213 team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5214 team->t.t_bar[bs_plain_barrier].b_arrived));
5215
5216 { // Initialize barrier data for new threads.
5217 int b;
5218 kmp_balign_t *balign = new_worker->th.th_bar;
5219 for (b = 0; b < bs_last_barrier; ++b) {
5220 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5221 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5222 KMP_BARRIER_PARENT_FLAG);
5223#if USE_DEBUGGER
5224 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5225#endif
5226 }
5227 }
5228 }
5229
5230#if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5231 if (KMP_AFFINITY_CAPABLE()) {
5232 /* Restore initial primary thread's affinity mask */
5233 __kmp_set_system_affinity(old_mask, TRUE);
5234 KMP_CPU_FREE(old_mask);
5235 }
5236#endif
5237#if KMP_NESTED_HOT_TEAMS
5238 } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5239#endif // KMP_NESTED_HOT_TEAMS
5240 /* make sure everyone is synchronized */
5241 int old_nproc = team->t.t_nproc; // save old value and use to update only
5242 // new threads below
5243 __kmp_initialize_team(team, new_nproc, new_icvs,
5244 root->r.r_uber_thread->th.th_ident);
5245
5246 /* reinitialize the threads */
5247 KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5248 for (f = 0; f < team->t.t_nproc; ++f)
5249 __kmp_initialize_info(team->t.t_threads[f], team, f,
5250 __kmp_gtid_from_tid(f, team));
5251
5252 if (level) { // set th_task_state for new threads in nested hot team
5253 // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5254 // only need to set the th_task_state for the new threads. th_task_state
5255 // for primary thread will not be accurate until after this in
5256 // __kmp_fork_call(), so we look to the primary thread's memo_stack to
5257 // get the correct value.
5258 for (f = old_nproc; f < team->t.t_nproc; ++f)
5259 team->t.t_threads[f]->th.th_task_state =
5260 team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5261 } else { // set th_task_state for new threads in non-nested hot team
5262 // copy primary thread's state
5263 kmp_uint8 old_state = team->t.t_threads[0]->th.th_task_state;
5264 for (f = old_nproc; f < team->t.t_nproc; ++f)
5265 team->t.t_threads[f]->th.th_task_state = old_state;
5266 }
5267
5268#ifdef KMP_DEBUG
5269 for (f = 0; f < team->t.t_nproc; ++f) {
5270 KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5271 team->t.t_threads[f]->th.th_team_nproc ==
5272 team->t.t_nproc);
5273 }
5274#endif
5275
5276 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5277#if KMP_AFFINITY_SUPPORTED
5278 __kmp_partition_places(team);
5279#endif
5280 } // Check changes in number of threads
5281
5282 kmp_info_t *master = team->t.t_threads[0];
5283 if (master->th.th_teams_microtask) {
5284 for (f = 1; f < new_nproc; ++f) {
5285 // propagate teams construct specific info to workers
5286 kmp_info_t *thr = team->t.t_threads[f];
5287 thr->th.th_teams_microtask = master->th.th_teams_microtask;
5288 thr->th.th_teams_level = master->th.th_teams_level;
5289 thr->th.th_teams_size = master->th.th_teams_size;
5290 }
5291 }
5292#if KMP_NESTED_HOT_TEAMS
5293 if (level) {
5294 // Sync barrier state for nested hot teams, not needed for outermost hot
5295 // team.
5296 for (f = 1; f < new_nproc; ++f) {
5297 kmp_info_t *thr = team->t.t_threads[f];
5298 int b;
5299 kmp_balign_t *balign = thr->th.th_bar;
5300 for (b = 0; b < bs_last_barrier; ++b) {
5301 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5302 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5303#if USE_DEBUGGER
5304 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5305#endif
5306 }
5307 }
5308 }
5309#endif // KMP_NESTED_HOT_TEAMS
5310
5311 /* reallocate space for arguments if necessary */
5312 __kmp_alloc_argv_entries(argc, team, TRUE);
5313 KMP_CHECK_UPDATE(team->t.t_argc, argc);
5314 // The hot team re-uses the previous task team,
5315 // if untouched during the previous release->gather phase.
5316
5317 KF_TRACE(10, (" hot_team = %p\n", team));
5318
5319#if KMP_DEBUG
5320 if (__kmp_tasking_mode != tskm_immediate_exec) {
5321 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5322 "task_team[1] = %p after reinit\n",
5323 team->t.t_task_team[0], team->t.t_task_team[1]));
5324 }
5325#endif
5326
5327#if OMPT_SUPPORT
5328 __ompt_team_assign_id(team, ompt_parallel_data);
5329#endif
5330
5331 KMP_MB();
5332
5333 return team;
5334 }
5335
5336 /* next, let's try to take one from the team pool */
5337 KMP_MB();
5338 for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5339 /* TODO: consider resizing undersized teams instead of reaping them, now
5340 that we have a resizing mechanism */
5341 if (team->t.t_max_nproc >= max_nproc) {
5342 /* take this team from the team pool */
5343 __kmp_team_pool = team->t.t_next_pool;
5344
5345 /* setup the team for fresh use */
5346 __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5347
5348 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5349 "task_team[1] %p to NULL\n",
5350 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5351 team->t.t_task_team[0] = NULL;
5352 team->t.t_task_team[1] = NULL;
5353
5354 /* reallocate space for arguments if necessary */
5355 __kmp_alloc_argv_entries(argc, team, TRUE);
5356 KMP_CHECK_UPDATE(team->t.t_argc, argc);
5357
5358 KA_TRACE(
5359 20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5360 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5361 { // Initialize barrier data.
5362 int b;
5363 for (b = 0; b < bs_last_barrier; ++b) {
5364 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5365#if USE_DEBUGGER
5366 team->t.t_bar[b].b_master_arrived = 0;
5367 team->t.t_bar[b].b_team_arrived = 0;
5368#endif
5369 }
5370 }
5371
5372 team->t.t_proc_bind = new_proc_bind;
5373
5374 KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5375 team->t.t_id));
5376
5377#if OMPT_SUPPORT
5378 __ompt_team_assign_id(team, ompt_parallel_data);
5379#endif
5380
5381 KMP_MB();
5382
5383 return team;
5384 }
5385
5386 /* reap team if it is too small, then loop back and check the next one */
5387 // not sure if this is wise, but it will be redone during the hot-teams
5388 // rewrite.
5389 /* TODO: Use technique to find the right size hot-team, don't reap them */
5390 team = __kmp_reap_team(team);
5391 __kmp_team_pool = team;
5392 }
5393
5394 /* nothing available in the pool, no matter, make a new team! */
5395 KMP_MB();
5396 team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5397
5398 /* and set it up */
5399 team->t.t_max_nproc = max_nproc;
5400 /* NOTE well, for some reason allocating one big buffer and dividing it up
5401 seems to really hurt performance a lot on the P4, so let's not use this */
5402 __kmp_allocate_team_arrays(team, max_nproc);
5403
5404 KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5405 __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5406
5407 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5408 "%p to NULL\n",
5409 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5410 team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5411 // memory, no need to duplicate
5412 team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5413 // memory, no need to duplicate
5414
5415 if (__kmp_storage_map) {
5416 __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5417 }
5418
5419 /* allocate space for arguments */
5420 __kmp_alloc_argv_entries(argc, team, FALSE);
5421 team->t.t_argc = argc;
5422
5423 KA_TRACE(20,
5424 ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5425 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5426 { // Initialize barrier data.
5427 int b;
5428 for (b = 0; b < bs_last_barrier; ++b) {
5429 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5430#if USE_DEBUGGER
5431 team->t.t_bar[b].b_master_arrived = 0;
5432 team->t.t_bar[b].b_team_arrived = 0;
5433#endif
5434 }
5435 }
5436
5437 team->t.t_proc_bind = new_proc_bind;
5438
5439#if OMPT_SUPPORT
5440 __ompt_team_assign_id(team, ompt_parallel_data);
5441 team->t.ompt_serialized_team_info = NULL;
5442#endif
5443
5444 KMP_MB();
5445
5446 KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5447 team->t.t_id));
5448
5449 return team;
5450}
5451
5452/* TODO implement hot-teams at all levels */
5453/* TODO implement lazy thread release on demand (disband request) */
5454
5455/* free the team. return it to the team pool. release all the threads
5456 * associated with it */
5457void __kmp_free_team(kmp_root_t *root,
5458 kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5459 int f;
5460 KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5461 team->t.t_id));
5462
5463 /* verify state */
5464 KMP_DEBUG_ASSERT(root);
5465 KMP_DEBUG_ASSERT(team);
5466 KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5467 KMP_DEBUG_ASSERT(team->t.t_threads);
5468
5469 int use_hot_team = team == root->r.r_hot_team;
5470#if KMP_NESTED_HOT_TEAMS
5471 int level;
5472 kmp_hot_team_ptr_t *hot_teams;
5473 if (master) {
5474 level = team->t.t_active_level - 1;
5475 if (master->th.th_teams_microtask) { // in teams construct?
5476 if (master->th.th_teams_size.nteams > 1) {
5477 ++level; // level was not increased in teams construct for
5478 // team_of_masters
5479 }
5480 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5481 master->th.th_teams_level == team->t.t_level) {
5482 ++level; // level was not increased in teams construct for
5483 // team_of_workers before the parallel
5484 } // team->t.t_level will be increased inside parallel
5485 }
5486 hot_teams = master->th.th_hot_teams;
5487 if (level < __kmp_hot_teams_max_level) {
5488 KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5489 use_hot_team = 1;
5490 }
5491 }
5492#endif // KMP_NESTED_HOT_TEAMS
5493
5494 /* team is done working */
5495 TCW_SYNC_PTR(team->t.t_pkfn,
5496 NULL); // Important for Debugging Support Library.
5497#if KMP_OS_WINDOWS
5498 team->t.t_copyin_counter = 0; // init counter for possible reuse
5499#endif
5500 // Do not reset pointer to parent team to NULL for hot teams.
5501
5502 /* if this is a non-hot team, release its threads */
5503 if (!use_hot_team) {
5504 if (__kmp_tasking_mode != tskm_immediate_exec) {
5505 // Wait for threads to reach reapable state
5506 for (f = 1; f < team->t.t_nproc; ++f) {
5507 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5508 kmp_info_t *th = team->t.t_threads[f];
5509 volatile kmp_uint32 *state = &th->th.th_reap_state;
5510 while (*state != KMP_SAFE_TO_REAP) {
5511#if KMP_OS_WINDOWS
5512 // On Windows a thread can be killed at any time, check this
5513 DWORD ecode;
5514 if (!__kmp_is_thread_alive(th, &ecode)) {
5515 *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5516 break;
5517 }
5518#endif
5519 // first check if thread is sleeping
5520 kmp_flag_64<> fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5521 if (fl.is_sleeping())
5522 fl.resume(__kmp_gtid_from_thread(th));
5523 KMP_CPU_PAUSE();
5524 }
5525 }
5526
5527 // Delete task teams
5528 int tt_idx;
5529 for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5530 kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5531 if (task_team != NULL) {
5532 for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5533 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5534 team->t.t_threads[f]->th.th_task_team = NULL;
5535 }
5536 KA_TRACE(
5537 20,
5538 ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5539 __kmp_get_gtid(), task_team, team->t.t_id));
5540#if KMP_NESTED_HOT_TEAMS
5541 __kmp_free_task_team(master, task_team);
5542#endif
5543 team->t.t_task_team[tt_idx] = NULL;
5544 }
5545 }
5546 }
5547
5548 // Reset pointer to parent team only for non-hot teams.
5549 team->t.t_parent = NULL;
5550 team->t.t_level = 0;
5551 team->t.t_active_level = 0;
5552
5553 /* free the worker threads */
5554 for (f = 1; f < team->t.t_nproc; ++f) {
5555 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5556 __kmp_free_thread(team->t.t_threads[f]);
5557 team->t.t_threads[f] = NULL;
5558 }
5559
5560 /* put the team back in the team pool */
5561 /* TODO limit size of team pool, call reap_team if pool too large */
5562 team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5563 __kmp_team_pool = (volatile kmp_team_t *)team;
5564 } else { // Check if team was created for primary threads in teams construct
5565 // See if first worker is a CG root
5566 KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5567 team->t.t_threads[1]->th.th_cg_roots);
5568 if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5569 // Clean up the CG root nodes on workers so that this team can be re-used
5570 for (f = 1; f < team->t.t_nproc; ++f) {
5571 kmp_info_t *thr = team->t.t_threads[f];
5572 KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5573 thr->th.th_cg_roots->cg_root == thr);
5574 // Pop current CG root off list
5575 kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5576 thr->th.th_cg_roots = tmp->up;
5577 KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5578 " up to node %p. cg_nthreads was %d\n",
5579 thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5580 int i = tmp->cg_nthreads--;
5581 if (i == 1) {
5582 __kmp_free(tmp); // free CG if we are the last thread in it
5583 }
5584 // Restore current task's thread_limit from CG root
5585 if (thr->th.th_cg_roots)
5586 thr->th.th_current_task->td_icvs.thread_limit =
5587 thr->th.th_cg_roots->cg_thread_limit;
5588 }
5589 }
5590 }
5591
5592 KMP_MB();
5593}
5594
5595/* reap the team. destroy it, reclaim all its resources and free its memory */
5596kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5597 kmp_team_t *next_pool = team->t.t_next_pool;
5598
5599 KMP_DEBUG_ASSERT(team);
5600 KMP_DEBUG_ASSERT(team->t.t_dispatch);
5601 KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5602 KMP_DEBUG_ASSERT(team->t.t_threads);
5603 KMP_DEBUG_ASSERT(team->t.t_argv);
5604
5605 /* TODO clean the threads that are a part of this? */
5606
5607 /* free stuff */
5608 __kmp_free_team_arrays(team);
5609 if (team->t.t_argv != &team->t.t_inline_argv[0])
5610 __kmp_free((void *)team->t.t_argv);
5611 __kmp_free(team);
5612
5613 KMP_MB();
5614 return next_pool;
5615}
5616
5617// Free the thread. Don't reap it, just place it on the pool of available
5618// threads.
5619//
5620// Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5621// binding for the affinity mechanism to be useful.
5622//
5623// Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5624// However, we want to avoid a potential performance problem by always
5625// scanning through the list to find the correct point at which to insert
5626// the thread (potential N**2 behavior). To do this we keep track of the
5627// last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5628// With single-level parallelism, threads will always be added to the tail
5629// of the list, kept track of by __kmp_thread_pool_insert_pt. With nested
5630// parallelism, all bets are off and we may need to scan through the entire
5631// free list.
5632//
5633// This change also has a potentially large performance benefit, for some
5634// applications. Previously, as threads were freed from the hot team, they
5635// would be placed back on the free list in inverse order. If the hot team
5636// grew back to its original size, then the freed threads would be placed
5637// back on the hot team in reverse order. This could cause bad cache
5638// locality problems on programs where the size of the hot team regularly
5639// grew and shrank.
5640//
5641// Now, for single-level parallelism, the OMP tid is always == gtid.
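// Stripped of bookkeeping, the insertion below is an ordinary sorted
// singly-linked-list insert with a cached hint (a sketch, not the exact code;
// 'hint_valid' stands in for the __kmp_thread_pool_insert_pt checks):
//
//   kmp_info_t **scan = hint_valid ? &__kmp_thread_pool_insert_pt->th.th_next_pool
//                                  : CCAST(kmp_info_t **, &__kmp_thread_pool);
//   while (*scan && (*scan)->th.th_info.ds.ds_gtid < gtid)
//     scan = &(*scan)->th.th_next_pool;
//   this_th->th.th_next_pool = *scan; // splice in, keeping the list gtid-sorted
//   __kmp_thread_pool_insert_pt = *scan = this_th; // remember hint for next free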
5642void __kmp_free_thread(kmp_info_t *this_th) {
5643 int gtid;
5644 kmp_info_t **scan;
5645
5646 KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5647 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5648
5649 KMP_DEBUG_ASSERT(this_th);
5650
5651 // When moving a thread to the pool, switch it to wait on its own b_go flag
5652 // and leave it with an uninitialized (NULL) team.
5653 int b;
5654 kmp_balign_t *balign = this_th->th.th_bar;
5655 for (b = 0; b < bs_last_barrier; ++b) {
5656 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5657 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5658 balign[b].bb.team = NULL;
5659 balign[b].bb.leaf_kids = 0;
5660 }
5661 this_th->th.th_task_state = 0;
5662 this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5663
5664 /* put thread back on the free pool */
5665 TCW_PTR(this_th->th.th_team, NULL);
5666 TCW_PTR(this_th->th.th_root, NULL);
5667 TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5668
5669 while (this_th->th.th_cg_roots) {
5670 this_th->th.th_cg_roots->cg_nthreads--;
5671 KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5672 " %p of thread %p to %d\n",
5673 this_th, this_th->th.th_cg_roots,
5674 this_th->th.th_cg_roots->cg_root,
5675 this_th->th.th_cg_roots->cg_nthreads));
5676 kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5677 if (tmp->cg_root == this_th) { // Thread is a cg_root
5678 KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5679 KA_TRACE(
5680 5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5681 this_th->th.th_cg_roots = tmp->up;
5682 __kmp_free(tmp);
5683 } else { // Worker thread
5684 if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5685 __kmp_free(tmp);
5686 }
5687 this_th->th.th_cg_roots = NULL;
5688 break;
5689 }
5690 }
5691
5692 /* If the implicit task assigned to this thread can be used by other threads,
5693 * multiple threads can share the data and try to free the task in
5694 * __kmp_reap_thread at exit. This duplicate use of the task data can happen
5695 * with higher probability when the hot team is disabled, but can occur even
5696 * when the hot team is enabled. */
5697 __kmp_free_implicit_task(this_th);
5698 this_th->th.th_current_task = NULL;
5699
5700 // If the __kmp_thread_pool_insert_pt is already past the new insert
5701 // point, then we need to re-scan the entire list.
5702 gtid = this_th->th.th_info.ds.ds_gtid;
5703 if (__kmp_thread_pool_insert_pt != NULL) {
5704 KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5705 if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5706 __kmp_thread_pool_insert_pt = NULL;
5707 }
5708 }
5709
5710 // Scan down the list to find the place to insert the thread.
5711 // scan is the address of a link in the list, possibly the address of
5712 // __kmp_thread_pool itself.
5713 //
5714 // In the absence of nested parallelism, the for loop will have 0 iterations.
5715 if (__kmp_thread_pool_insert_pt != NULL) {
5716 scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5717 } else {
5718 scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5719 }
5720 for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5721 scan = &((*scan)->th.th_next_pool))
5722 ;
5723
5724 // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5725 // to its address.
5726 TCW_PTR(this_th->th.th_next_pool, *scan);
5727 __kmp_thread_pool_insert_pt = *scan = this_th;
5728 KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5729 (this_th->th.th_info.ds.ds_gtid <
5730 this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5731 TCW_4(this_th->th.th_in_pool, TRUE);
5732 __kmp_suspend_initialize_thread(this_th);
5733 __kmp_lock_suspend_mx(this_th);
5734 if (this_th->th.th_active == TRUE) {
5735 KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5736 this_th->th.th_active_in_pool = TRUE;
5737 }
5738#if KMP_DEBUG
5739 else {
5740 KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5741 }
5742#endif
5743 __kmp_unlock_suspend_mx(this_th);
5744
5745 TCW_4(__kmp_nth, __kmp_nth - 1);
5746
5747#ifdef KMP_ADJUST_BLOCKTIME
5748 /* Adjust blocktime back to user setting or default if necessary */
5749 /* Middle initialization might never have occurred */
5750 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5751 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5752 if (__kmp_nth <= __kmp_avail_proc) {
5753 __kmp_zero_bt = FALSE;
5754 }
5755 }
5756#endif /* KMP_ADJUST_BLOCKTIME */
5757
5758 KMP_MB();
5759}
5760
5761/* ------------------------------------------------------------------------ */
5762
5763void *__kmp_launch_thread(kmp_info_t *this_thr) {
5764#if OMP_PROFILING_SUPPORT
5765 ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE");
5766 // TODO: add a configuration option for time granularity
5767 if (ProfileTraceFile)
5768 llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget");
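  // Usage sketch: setting e.g. LIBOMPTARGET_PROFILE=profile.json before the run
  // enables LLVM's time-trace profiler for this thread; the 500 us granularity
  // above stays fixed until the TODO is addressed.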
5769#endif
5770
5771 int gtid = this_thr->th.th_info.ds.ds_gtid;
5772 /* void *stack_data;*/
5773 kmp_team_t **volatile pteam;
5774
5775 KMP_MB();
5776 KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5777
5778 if (__kmp_env_consistency_check) {
5779 this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5780 }
5781
5782#if OMPD_SUPPORT
5783 if (ompd_state & OMPD_ENABLE_BP)
5784 ompd_bp_thread_begin();
5785#endif
5786
5787#if OMPT_SUPPORT
5788 ompt_data_t *thread_data = nullptr;
5789 if (ompt_enabled.enabled) {
5790 thread_data = &(this_thr->th.ompt_thread_info.thread_data);
5791 *thread_data = ompt_data_none;
5792
5793 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5794 this_thr->th.ompt_thread_info.wait_id = 0;
5795 this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
5796 this_thr->th.ompt_thread_info.parallel_flags = 0;
5797 if (ompt_enabled.ompt_callback_thread_begin) {
5798 ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
5799 ompt_thread_worker, thread_data);
5800 }
5801 this_thr->th.ompt_thread_info.state = ompt_state_idle;
5802 }
5803#endif
5804
5805 /* This is the place where threads wait for work */
5806 while (!TCR_4(__kmp_global.g.g_done)) {
5807 KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
5808 KMP_MB();
5809
5810 /* wait for work to do */
5811 KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
5812
5813 /* No tid yet since not part of a team */
5814 __kmp_fork_barrier(gtid, KMP_GTID_DNE);
5815
5816#if OMPT_SUPPORT
5817 if (ompt_enabled.enabled) {
5818 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5819 }
5820#endif
5821
5822 pteam = &this_thr->th.th_team;
5823
5824 /* have we been allocated? */
5825 if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
5826 /* we were just woken up, so run our new task */
5827 if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
5828 int rc;
5829 KA_TRACE(20,
5830 ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
5831 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5832 (*pteam)->t.t_pkfn));
5833
5834 updateHWFPControl(*pteam);
5835
5836#if OMPT_SUPPORT
5837 if (ompt_enabled.enabled) {
5838 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
5839 }
5840#endif
5841
5842 rc = (*pteam)->t.t_invoke(gtid);
5843 KMP_ASSERT(rc);
5844
5845 KMP_MB();
5846 KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
5847 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5848 (*pteam)->t.t_pkfn));
5849 }
5850#if OMPT_SUPPORT
5851 if (ompt_enabled.enabled) {
5852 /* no frame set while outside task */
5853 __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
5854
5855 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5856 }
5857#endif
5858 /* join barrier after parallel region */
5859 __kmp_join_barrier(gtid);
5860 }
5861 }
5862 TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
5863
5864#if OMPD_SUPPORT
5865 if (ompd_state & OMPD_ENABLE_BP)
5866 ompd_bp_thread_end();
5867#endif
5868
5869#if OMPT_SUPPORT
5870 if (ompt_enabled.ompt_callback_thread_end) {
5871 ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
5872 }
5873#endif
5874
5875 this_thr->th.th_task_team = NULL;
5876 /* run the destructors for the threadprivate data for this thread */
5877 __kmp_common_destroy_gtid(gtid);
5878
5879 KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
5880 KMP_MB();
5881
5882#if OMP_PROFILING_SUPPORT
5883 llvm::timeTraceProfilerFinishThread();
5884#endif
5885 return this_thr;
5886}
5887
5888/* ------------------------------------------------------------------------ */
5889
5890void __kmp_internal_end_dest(void *specific_gtid) {
5891 // Make sure no significant bits are lost
5892 int gtid;
5893 __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, &gtid);
5894
5895 KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
5896 /* NOTE: the gtid is stored as gtid+1 in thread-local storage
5897 * because 0 is reserved for the nothing-stored case */
5898
5899 __kmp_internal_end_thread(gtid);
5900}
5901
5902#if KMP_OS_UNIX && KMP_DYNAMIC_LIB
5903
5904__attribute__((destructor)) void __kmp_internal_end_dtor(void) {
5905 __kmp_internal_end_atexit();
5906}
5907
5908#endif
5909
5910/* [Windows] josh: when the atexit handler is called, there may still be more
5911 than one thread alive */
5912void __kmp_internal_end_atexit(void) {
5913 KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
5914 /* [Windows]
5915 josh: ideally, we want to completely shut down the library in this atexit
5916 handler, but stat code that depends on thread specific data for gtid fails
5917 because that data becomes unavailable at some point during the shutdown, so
5918 we call __kmp_internal_end_thread instead. We should eventually remove the
5919 dependency on __kmp_get_specific_gtid in the stat code and use
5920 __kmp_internal_end_library to cleanly shut down the library.
5921
5922 // TODO: Can some of this comment about GVS be removed?
5923 I suspect that the offending stat code is executed when the calling thread
5924 tries to clean up a dead root thread's data structures, resulting in GVS
5925 code trying to close the GVS structures for that thread, but since the stat
5926 code uses __kmp_get_specific_gtid to get the gtid with the assumption that
5927 the calling thread is cleaning up itself instead of another thread, it gets
5928 confused. This happens because allowing a thread to unregister and clean up
5929 another thread is a recent modification for addressing an issue.
5930 Based on the current design (20050722), a thread may end up
5931 trying to unregister another thread only if thread death does not trigger
5932 the calling of __kmp_internal_end_thread. For Linux* OS, there is the
5933 thread specific data destructor function to detect thread death. For
5934 Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
5935 is nothing. Thus, the workaround is applicable only for the Windows static
5936 stat library. */
5937 __kmp_internal_end_library(-1);
5938#if KMP_OS_WINDOWS
5939 __kmp_close_console();
5940#endif
5941}
5942
5943static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
5944 // It is assumed __kmp_forkjoin_lock is acquired.
5945
5946 int gtid;
5947
5948 KMP_DEBUG_ASSERT(thread != NULL);
5949
5950 gtid = thread->th.th_info.ds.ds_gtid;
5951
5952 if (!is_root) {
5953 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5954 /* Assume the threads are at the fork barrier here */
5955 KA_TRACE(
5956 20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
5957 gtid));
5958 /* Need release fence here to prevent seg faults for tree forkjoin barrier
5959 * (GEH) */
5960 kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
5961 thread);
5962 __kmp_release_64(&flag);
5963 }
5964
5965 // Terminate OS thread.
5966 __kmp_reap_worker(thread);
5967
5968 // The thread was killed asynchronously. If it was actively
5969 // spinning in the thread pool, decrement the global count.
5970 //
5971 // There is a small timing hole here - if the worker thread was just waking
5972 // up after sleeping in the pool, had reset its th_active_in_pool flag but
5973 // not decremented the global counter __kmp_thread_pool_active_nth yet, then
5974 // the global counter might not get updated.
5975 //
5976 // Currently, this can only happen as the library is unloaded,
5977 // so there are no harmful side effects.
5978 if (thread->th.th_active_in_pool) {
5979 thread->th.th_active_in_pool = FALSE;
5980 KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
5981 KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
5982 }
5983 }
5984
5985 __kmp_free_implicit_task(thread);
5986
5987// Free the fast memory for tasking
5988#if USE_FAST_MEMORY
5989 __kmp_free_fast_memory(thread);
5990#endif /* USE_FAST_MEMORY */
5991
5992 __kmp_suspend_uninitialize_thread(thread);
5993
5994 KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
5995 TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
5996
5997 --__kmp_all_nth;
5998 // __kmp_nth was decremented when thread is added to the pool.
5999
6000#ifdef KMP_ADJUST_BLOCKTIME
6001 /* Adjust blocktime back to user setting or default if necessary */
6002 /* Middle initialization might never have occurred */
6003 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6004 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6005 if (__kmp_nth <= __kmp_avail_proc) {
6006 __kmp_zero_bt = FALSE;
6007 }
6008 }
6009#endif /* KMP_ADJUST_BLOCKTIME */
6010
6011 /* free the memory being used */
6012 if (__kmp_env_consistency_check) {
6013 if (thread->th.th_cons) {
6014 __kmp_free_cons_stack(thread->th.th_cons);
6015 thread->th.th_cons = NULL;
6016 }
6017 }
6018
6019 if (thread->th.th_pri_common != NULL) {
6020 __kmp_free(thread->th.th_pri_common);
6021 thread->th.th_pri_common = NULL;
6022 }
6023
6024 if (thread->th.th_task_state_memo_stack != NULL) {
6025 __kmp_free(thread->th.th_task_state_memo_stack);
6026 thread->th.th_task_state_memo_stack = NULL;
6027 }
6028
6029#if KMP_USE_BGET
6030 if (thread->th.th_local.bget_data != NULL) {
6031 __kmp_finalize_bget(thread);
6032 }
6033#endif
6034
6035#if KMP_AFFINITY_SUPPORTED
6036 if (thread->th.th_affin_mask != NULL) {
6037 KMP_CPU_FREE(thread->th.th_affin_mask);
6038 thread->th.th_affin_mask = NULL;
6039 }
6040#endif /* KMP_AFFINITY_SUPPORTED */
6041
6042#if KMP_USE_HIER_SCHED
6043 if (thread->th.th_hier_bar_data != NULL) {
6044 __kmp_free(thread->th.th_hier_bar_data);
6045 thread->th.th_hier_bar_data = NULL;
6046 }
6047#endif
6048
6049 __kmp_reap_team(thread->th.th_serial_team);
6050 thread->th.th_serial_team = NULL;
6051 __kmp_free(thread);
6052
6053 KMP_MB();
6054
6055} // __kmp_reap_thread
6056
6057static void __kmp_internal_end(void) {
6058 int i;
6059
6060 /* First, unregister the library */
6061 __kmp_unregister_library();
6062
6063#if KMP_OS_WINDOWS
6064 /* In Win static library, we can't tell when a root actually dies, so we
6065 reclaim the data structures for any root threads that have died but not
6066 unregistered themselves, in order to shut down cleanly.
6067 In Win dynamic library we also can't tell when a thread dies. */
6068 __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
6069// dead roots
6070#endif
6071
6072 for (i = 0; i < __kmp_threads_capacity; i++)
6073 if (__kmp_root[i])
6074 if (__kmp_root[i]->r.r_active)
6075 break;
6076 KMP_MB(); /* Flush all pending memory write invalidates. */
6077 TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6078
6079 if (i < __kmp_threads_capacity) {
6080#if KMP_USE_MONITOR
6081 // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6082 KMP_MB(); /* Flush all pending memory write invalidates. */
6083
6084 // Need to check that monitor was initialized before reaping it. If we are
6085 // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6086 // __kmp_monitor will appear to contain valid data, but it is only valid in
6087 // the parent process, not the child.
6088 // New behavior (201008): instead of keying off of the flag
6089 // __kmp_init_parallel, the monitor thread creation is keyed off
6090 // of the new flag __kmp_init_monitor.
6091 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6092 if (TCR_4(__kmp_init_monitor)) {
6093 __kmp_reap_monitor(&__kmp_monitor);
6094 TCW_4(__kmp_init_monitor, 0);
6095 }
6096 __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6097 KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6098#endif // KMP_USE_MONITOR
6099 } else {
6100/* TODO move this to cleanup code */
6101#ifdef KMP_DEBUG
6102 /* make sure that everything has properly ended */
6103 for (i = 0; i < __kmp_threads_capacity; i++) {
6104 if (__kmp_root[i]) {
6105 // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC:
6106 // there can be uber threads alive here
6107 KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6108 }
6109 }
6110#endif
6111
6112 KMP_MB();
6113
6114 // Reap the worker threads.
6115 // This is valid for now, but be careful if threads are reaped sooner.
6116 while (__kmp_thread_pool != NULL) { // Loop through all the threads in the pool.
6117 // Get the next thread from the pool.
6118 kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6119 __kmp_thread_pool = thread->th.th_next_pool;
6120 // Reap it.
6121 KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6122 thread->th.th_next_pool = NULL;
6123 thread->th.th_in_pool = FALSE;
6124 __kmp_reap_thread(thread, 0);
6125 }
6126 __kmp_thread_pool_insert_pt = NULL;
6127
6128 // Reap teams.
6129 while (__kmp_team_pool != NULL) { // Loop through all the teams in the pool.
6130 // Get the next team from the pool.
6131 kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6132 __kmp_team_pool = team->t.t_next_pool;
6133 // Reap it.
6134 team->t.t_next_pool = NULL;
6135 __kmp_reap_team(team);
6136 }
6137
6138 __kmp_reap_task_teams();
6139
6140#if KMP_OS_UNIX
6141 // Threads that are not reaped should not access any resources since they
6142 // are going to be deallocated soon, so the shutdown sequence should wait
6143 // until all threads either exit the final spin-waiting loop or begin
6144 // sleeping after the given blocktime.
6145 for (i = 0; i < __kmp_threads_capacity; i++) {
6146 kmp_info_t *thr = __kmp_threads[i];
6147 while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6148 KMP_CPU_PAUSE();
6149 }
6150#endif
6151
6152 for (i = 0; i < __kmp_threads_capacity; ++i) {
6153 // TBD: Add some checking...
6154 // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6155 }
6156
6157 /* Make sure all threadprivate destructors get run by joining with all
6158 worker threads before resetting this flag */
6159 TCW_SYNC_4(__kmp_init_common, FALSE);
6160
6161 KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6162 KMP_MB();
6163
6164#if KMP_USE_MONITOR
6165 // See note above: One of the possible fixes for CQ138434 / CQ140126
6166 //
6167 // FIXME: push both code fragments down and CSE them?
6168 // push them into __kmp_cleanup() ?
6169 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6170 if (TCR_4(__kmp_init_monitor)) {
6171 __kmp_reap_monitor(&__kmp_monitor);
6172 TCW_4(__kmp_init_monitor, 0);
6173 }
6174 __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6175 KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6176#endif
6177 } /* else !__kmp_global.t_active */
6178 TCW_4(__kmp_init_gtid, FALSE);
6179 KMP_MB(); /* Flush all pending memory write invalidates. */
6180
6181 __kmp_cleanup();
6182#if OMPT_SUPPORT
6183 ompt_fini();
6184#endif
6185}
6186
6187void __kmp_internal_end_library(int gtid_req) {
6188 /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6189 /* this shouldn't be a race condition because __kmp_internal_end() is the
6190 only place to clear __kmp_serial_init */
6191 /* we'll check this later too, after we get the lock */
6192 // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6193 // redundant, because the next check will work in any case.
6194 if (__kmp_global.g.g_abort) {
6195 KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6196 /* TODO abort? */
6197 return;
6198 }
6199 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6200 KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6201 return;
6202 }
6203
6204 // If hidden helper team has been initialized, we need to deinit it
6205 if (TCR_4(__kmp_init_hidden_helper) &&
6206 !TCR_4(__kmp_hidden_helper_team_done)) {
6207 TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6208 // First release the main thread to let it continue its work
6209 __kmp_hidden_helper_main_thread_release();
6210 // Wait until the hidden helper team has been destroyed
6211 __kmp_hidden_helper_threads_deinitz_wait();
6212 }
6213
6214 KMP_MB(); /* Flush all pending memory write invalidates. */
6215 /* find out who we are and what we should do */
6216 {
6217 int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6218 KA_TRACE(
6219 10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req));
6220 if (gtid == KMP_GTID_SHUTDOWN) {
6221 KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6222 "already shutdown\n"));
6223 return;
6224 } else if (gtid == KMP_GTID_MONITOR) {
6225 KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6226 "registered, or system shutdown\n"));
6227 return;
6228 } else if (gtid == KMP_GTID_DNE) {
6229 KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6230 "shutdown\n"));
6231 /* we don't know who we are, but we may still shutdown the library */
6232 } else if (KMP_UBER_GTID(gtid)) {
6233 /* unregister ourselves as an uber thread. gtid is no longer valid */
6234 if (__kmp_root[gtid]->r.r_active) {
6235 __kmp_global.g.g_abort = -1;
6236 TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6237 __kmp_unregister_library();
6238 KA_TRACE(10,
6239 ("__kmp_internal_end_library: root still active, abort T#%d\n",
6240 gtid));
6241 return;
6242 } else {
6243 KA_TRACE(
6244 10,
6245 ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6246 __kmp_unregister_root_current_thread(gtid);
6247 }
6248 } else {
6249/* worker threads may call this function through the atexit handler, if they
6250 * call exit() */
6251/* For now, skip the usual subsequent processing and just dump the debug buffer.
6252 TODO: do a thorough shutdown instead */
6253#ifdef DUMP_DEBUG_ON_EXIT
6254 if (__kmp_debug_buf)
6255 __kmp_dump_debug_buffer();
6256#endif
6257 // added an unregister-library call here when we switched to shm on Linux;
6258 // if we don't, lots of files are left behind in /dev/shm
6259 // clean up the shared memory file before exiting.
6260 __kmp_unregister_library();
6261 return;
6262 }
6263 }
6264 /* synchronize the termination process */
6265 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6266
6267 /* have we already finished */
6268 if (__kmp_global.g.g_abort) {
6269 KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6270 /* TODO abort? */
6271 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6272 return;
6273 }
6274 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6275 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6276 return;
6277 }
6278
6279  /* We need this lock to enforce mutual exclusion between this reading of
6280 __kmp_threads_capacity and the writing by __kmp_register_root.
6281 Alternatively, we can use a counter of roots that is atomically updated by
6282 __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6283 __kmp_internal_end_*. */
6284 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6285
6286 /* now we can safely conduct the actual termination */
6287 __kmp_internal_end();
6288
6289 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6290 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6291
6292 KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6293
6294#ifdef DUMP_DEBUG_ON_EXIT
6295 if (__kmp_debug_buf)
6296 __kmp_dump_debug_buffer();
6297#endif
6298
6299#if KMP_OS_WINDOWS
6300 __kmp_close_console();
6301#endif
6302
6303 __kmp_fini_allocator();
6304
6305} // __kmp_internal_end_library
6306
6307void __kmp_internal_end_thread(int gtid_req) {
6308 int i;
6309
6310 /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6311 /* this shouldn't be a race condition because __kmp_internal_end() is the
6312 * only place to clear __kmp_serial_init */
6313 /* we'll check this later too, after we get the lock */
6314 // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6315 // redundant, because the next check will work in any case.
6316 if (__kmp_global.g.g_abort) {
6317 KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6318 /* TODO abort? */
6319 return;
6320 }
6321 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6322 KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6323 return;
6324 }
6325
6326 // If hidden helper team has been initialized, we need to deinit it
6327 if (TCR_4(__kmp_init_hidden_helper) &&
6328 !TCR_4(__kmp_hidden_helper_team_done)) {
6329 TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6330 // First release the main thread to let it continue its work
6331 __kmp_hidden_helper_main_thread_release();
6332 // Wait until the hidden helper team has been destroyed
6333 __kmp_hidden_helper_threads_deinitz_wait();
6334 }
6335
6336 KMP_MB(); /* Flush all pending memory write invalidates. */
6337
6338 /* find out who we are and what we should do */
6339 {
6340 int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6341 KA_TRACE(10,
6342 ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req));
6343 if (gtid == KMP_GTID_SHUTDOWN) {
6344 KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6345 "already shutdown\n"));
6346 return;
6347 } else if (gtid == KMP_GTID_MONITOR) {
6348 KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6349 "registered, or system shutdown\n"));
6350 return;
6351 } else if (gtid == KMP_GTID_DNE) {
6352 KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6353 "shutdown\n"));
6354      /* we don't know who we are */
6355      return;
6356 } else if (KMP_UBER_GTID(gtid)) {
6357 /* unregister ourselves as an uber thread. gtid is no longer valid */
6358 if (__kmp_root[gtid]->r.r_active) {
6359 __kmp_global.g.g_abort = -1;
6360 TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6361 KA_TRACE(10,
6362 ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6363 gtid));
6364 return;
6365 } else {
6366 KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6367 gtid));
6368 __kmp_unregister_root_current_thread(gtid);
6369 }
6370 } else {
6371 /* just a worker thread, let's leave */
6372 KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6373
6374 if (gtid >= 0) {
6375 __kmp_threads[gtid]->th.th_task_team = NULL;
6376 }
6377
6378 KA_TRACE(10,
6379 ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6380 gtid));
6381 return;
6382 }
6383 }
6384#if KMP_DYNAMIC_LIB
6385 if (__kmp_pause_status != kmp_hard_paused)
6386  // AC: let's not shut down the dynamic library at the exit of an uber
6387  // thread; it is better to shut down later, in the library destructor.
6388 {
6389 KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6390 return;
6391 }
6392#endif
6393 /* synchronize the termination process */
6394 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6395
6396 /* have we already finished */
6397 if (__kmp_global.g.g_abort) {
6398 KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6399 /* TODO abort? */
6400 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6401 return;
6402 }
6403 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6404 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6405 return;
6406 }
6407
6408  /* We need this lock to enforce mutual exclusion between this reading of
6409 __kmp_threads_capacity and the writing by __kmp_register_root.
6410 Alternatively, we can use a counter of roots that is atomically updated by
6411 __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6412 __kmp_internal_end_*. */
6413
6414 /* should we finish the run-time? are all siblings done? */
6415 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6416
6417 for (i = 0; i < __kmp_threads_capacity; ++i) {
6418 if (KMP_UBER_GTID(i)) {
6419 KA_TRACE(
6420 10,
6421 ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6422 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6423 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6424 return;
6425 }
6426 }
6427
6428 /* now we can safely conduct the actual termination */
6429
6430 __kmp_internal_end();
6431
6432 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6433 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6434
6435 KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6436
6437#ifdef DUMP_DEBUG_ON_EXIT
6438 if (__kmp_debug_buf)
6439 __kmp_dump_debug_buffer();
6440#endif
6441} // __kmp_internal_end_thread
6442
6443// -----------------------------------------------------------------------------
6444// Library registration stuff.
6445
6446static long __kmp_registration_flag = 0;
6447// Random value used to indicate library initialization.
6448static char *__kmp_registration_str = NULL;
6449// Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6450
6451static inline char *__kmp_reg_status_name() {
6452/* On RHEL 3u5 if linked statically, getpid() returns different values in
6453   each thread. If registration and unregistration happen in different
6454   threads (omp_misc_other_root_exit.cpp test case), the registered_lib_env
6455   env var cannot be found, because its name will contain a different pid. */
6456// macOS* complains about name being too long with additional getuid()
6457#if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB
6458 return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(),
6459 (int)getuid());
6460#else
6461 return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6462#endif
6463} // __kmp_reg_status_name
6464
6465void __kmp_register_library_startup(void) {
6466
6467 char *name = __kmp_reg_status_name(); // Name of the environment variable.
6468 int done = 0;
6469 union {
6470 double dtime;
6471 long ltime;
6472 } time;
6473#if KMP_ARCH_X86 || KMP_ARCH_X86_64
6474 __kmp_initialize_system_tick();
6475#endif
6476 __kmp_read_system_time(&time.dtime);
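  // Build a (very likely) process-unique marker: a fixed 0xCAFE tag in the
  // upper bits combined with the low 16 bits of the current time reading.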
6477 __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6478 __kmp_registration_str =
6479 __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6480 __kmp_registration_flag, KMP_LIBRARY_FILE);
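  // The string encodes "<flag address>-<flag value>-<library file>"; another
  // runtime copy splits it on '-' below to locate and validate our flag.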
6481
6482 KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6483 __kmp_registration_str));
6484
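  // Publish __kmp_registration_str -- via shared memory on dynamically linked
  // Unix builds, via an environment variable otherwise. If another copy of the
  // runtime has already published a value, parse it to decide whether that
  // copy is still alive; a dead neighbor's record is removed and we retry.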
6485 while (!done) {
6486
6487 char *value = NULL; // Actual value of the environment variable.
6488
6489#if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6490 char *shm_name = __kmp_str_format("/%s", name);
6491 int shm_preexist = 0;
6492 char *data1;
6493 int fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666);
6494 if ((fd1 == -1) && (errno == EEXIST)) {
6495 // file didn't open because it already exists.
6496 // try opening existing file
6497 fd1 = shm_open(shm_name, O_RDWR, 0666);
6498 if (fd1 == -1) { // file didn't open
6499 // error out here
6500 __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM"), KMP_ERR(0),
6501 __kmp_msg_null);
6502 } else {
6503 // able to open existing file
6504 shm_preexist = 1;
6505 }
6506    } else if (fd1 == -1) {
6507      // SHM didn't open due to an error other than "already exists";
6508      // error out here.
6509 __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM2"), KMP_ERR(errno),
6510 __kmp_msg_null);
6511 }
6512 if (shm_preexist == 0) {
6513      // We created the SHM; now set its size.
6514      if (ftruncate(fd1, SHM_SIZE) == -1) {
6515        // error occurred setting the size
6516 __kmp_fatal(KMP_MSG(FunctionError, "Can't set size of SHM"),
6517 KMP_ERR(errno), __kmp_msg_null);
6518 }
6519 }
6520 data1 =
6521 (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0);
6522 if (data1 == MAP_FAILED) {
6523 // failed to map shared memory
6524 __kmp_fatal(KMP_MSG(FunctionError, "Can't map SHM"), KMP_ERR(errno),
6525 __kmp_msg_null);
6526 }
6527    if (shm_preexist == 0) { // we created the SHM; write the value into it
6528 KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6529 }
6530 // Read value from either what we just wrote or existing file.
6531 value = __kmp_str_format("%s", data1); // read value from SHM
6532 munmap(data1, SHM_SIZE);
6533 close(fd1);
6534#else // Windows and unix with static library
6535    // Set the environment variable; do not overwrite it if it already exists.
6536 __kmp_env_set(name, __kmp_registration_str, 0);
6537 // read value to see if it got set
6538 value = __kmp_env_get(name);
6539#endif
6540
6541 if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6542 done = 1; // Ok, environment variable set successfully, exit the loop.
6543 } else {
6544 // Oops. Write failed. Another copy of OpenMP RTL is in memory.
6545      // Check whether it is alive or dead.
6546 int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6547 char *tail = value;
6548 char *flag_addr_str = NULL;
6549 char *flag_val_str = NULL;
6550 char const *file_name = NULL;
6551 __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6552 __kmp_str_split(tail, '-', &flag_val_str, &tail);
6553 file_name = tail;
6554 if (tail != NULL) {
6555 unsigned long *flag_addr = 0;
6556 unsigned long flag_val = 0;
6557 KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr));
6558 KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6559 if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6560 // First, check whether environment-encoded address is mapped into
6561 // addr space.
6562 // If so, dereference it to see if it still has the right value.
6563 if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6564 neighbor = 1;
6565 } else {
6566 // If not, then we know the other copy of the library is no longer
6567 // running.
6568 neighbor = 2;
6569 }
6570 }
6571 }
6572 switch (neighbor) {
6573 case 0: // Cannot parse environment variable -- neighbor status unknown.
6574        // Assume it is the incompatible format of a future version of the
6575        // library, and assume the other library is alive.
6576 // WARN( ... ); // TODO: Issue a warning.
6577 file_name = "unknown library";
6578 KMP_FALLTHROUGH();
6579      // Attention! Falling through to the next case is intentional.
6580 case 1: { // Neighbor is alive.
6581 // Check it is allowed.
6582 char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6583 if (!__kmp_str_match_true(duplicate_ok)) {
6584 // That's not allowed. Issue fatal error.
6585 __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6586 KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6587 }
6588 KMP_INTERNAL_FREE(duplicate_ok);
6589 __kmp_duplicate_library_ok = 1;
6590 done = 1; // Exit the loop.
6591 } break;
6592 case 2: { // Neighbor is dead.
6593
6594#if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6595 // close shared memory.
6596 shm_unlink(shm_name); // this removes file in /dev/shm
6597#else
6598 // Clear the variable and try to register library again.
6599 __kmp_env_unset(name);
6600#endif
6601 } break;
6602 default: {
6603 KMP_DEBUG_ASSERT(0);
6604 } break;
6605 }
6606 }
6607 KMP_INTERNAL_FREE((void *)value);
6608#if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6609 KMP_INTERNAL_FREE((void *)shm_name);
6610#endif
6611 } // while
6612 KMP_INTERNAL_FREE((void *)name);
6613
6614} // func __kmp_register_library_startup
6615
6616void __kmp_unregister_library(void) {
6617
6618 char *name = __kmp_reg_status_name();
6619 char *value = NULL;
6620
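  // Read back the currently published registration record (from shared memory
  // or the environment variable) and delete it only if it is the one this copy
  // of the runtime published.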
6621#if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6622 char *shm_name = __kmp_str_format("/%s", name);
6623 int fd1 = shm_open(shm_name, O_RDONLY, 0666);
6624 if (fd1 == -1) {
6625 // file did not open. return.
6626 return;
6627 }
6628 char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6629 if (data1 != MAP_FAILED) {
6630 value = __kmp_str_format("%s", data1); // read value from SHM
6631 munmap(data1, SHM_SIZE);
6632 }
6633 close(fd1);
6634#else
6635 value = __kmp_env_get(name);
6636#endif
6637
6638 KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6639 KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6640 if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6641// Ok, this is our variable. Delete it.
6642#if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6643 shm_unlink(shm_name); // this removes file in /dev/shm
6644#else
6645 __kmp_env_unset(name);
6646#endif
6647 }
6648
6649#if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6650 KMP_INTERNAL_FREE(shm_name);
6651#endif
6652
6653 KMP_INTERNAL_FREE(__kmp_registration_str);
6654 KMP_INTERNAL_FREE(value);
6655 KMP_INTERNAL_FREE(name);
6656
6657 __kmp_registration_flag = 0;
6658 __kmp_registration_str = NULL;
6659
6660} // __kmp_unregister_library
6661
6662// End of Library registration stuff.
6663// -----------------------------------------------------------------------------
6664
6665#if KMP_MIC_SUPPORTED
6666
6667static void __kmp_check_mic_type() {
6668 kmp_cpuid_t cpuid_state = {0};
6669 kmp_cpuid_t *cs_p = &cpuid_state;
6670 __kmp_x86_cpuid(1, 0, cs_p);
6671 // We don't support mic1 at the moment
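  // CPUID leaf 1 returns family/model/stepping in EAX: family 0x0B identifies
  // Knights Corner (mic2); family 6 with (extended) model 0x57 identifies
  // Knights Landing (mic3).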
6672 if ((cs_p->eax & 0xff0) == 0xB10) {
6673 __kmp_mic_type = mic2;
6674 } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6675 __kmp_mic_type = mic3;
6676 } else {
6677 __kmp_mic_type = non_mic;
6678 }
6679}
6680
6681#endif /* KMP_MIC_SUPPORTED */
6682
6683#if KMP_HAVE_UMWAIT
6684static void __kmp_user_level_mwait_init() {
6685 struct kmp_cpuid buf;
6686 __kmp_x86_cpuid(7, 0, &buf);
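  // CPUID leaf 7 (sub-leaf 0): ECX bit 5 is the WAITPKG feature flag
  // (UMONITOR/UMWAIT/TPAUSE), which gates user-level umwait support here.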
6687 __kmp_umwait_enabled = ((buf.ecx >> 5) & 1) && __kmp_user_level_mwait;
6688 KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n",
6689 __kmp_umwait_enabled));
6690}
6691#elif KMP_HAVE_MWAIT
6692#ifndef AT_INTELPHIUSERMWAIT
6693// Spurious, non-existent value that should always fail to return anything.
6694// Will be replaced with the correct value once it is known.
6695#define AT_INTELPHIUSERMWAIT 10000
6696#endif
6697// The getauxval() function is available in RHEL7 and SLES12. If a system with
6698// an earlier OS is used to build the RTL, we'll use the following internal
6699// function when the entry is not found.
6700unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL;
6701unsigned long getauxval(unsigned long) { return 0; }
6702
6703static void __kmp_user_level_mwait_init() {
6704 // When getauxval() and correct value of AT_INTELPHIUSERMWAIT are available
6705 // use them to find if the user-level mwait is enabled. Otherwise, forcibly
6706 // set __kmp_mwait_enabled=TRUE on Intel MIC if the environment variable
6707 // KMP_USER_LEVEL_MWAIT was set to TRUE.
6708 if (__kmp_mic_type == mic3) {
6709 unsigned long res = getauxval(AT_INTELPHIUSERMWAIT);
6710 if ((res & 0x1) || __kmp_user_level_mwait) {
6711 __kmp_mwait_enabled = TRUE;
6712 if (__kmp_user_level_mwait) {
6713 KMP_INFORM(EnvMwaitWarn);
6714 }
6715 } else {
6716 __kmp_mwait_enabled = FALSE;
6717 }
6718 }
6719 KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, "
6720 "__kmp_mwait_enabled = %d\n",
6721 __kmp_mic_type, __kmp_mwait_enabled));
6722}
6723#endif /* KMP_HAVE_UMWAIT */
6724
6725static void __kmp_do_serial_initialize(void) {
6726 int i, gtid;
6727 size_t size;
6728
6729 KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
6730
6731 KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
6732 KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
6733 KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
6734 KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
6735 KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
6736
6737#if OMPT_SUPPORT
6738 ompt_pre_init();
6739#endif
6740#if OMPD_SUPPORT
6741 __kmp_env_dump();
6742 ompd_init();
6743#endif
6744
6745 __kmp_validate_locks();
6746
6747 /* Initialize internal memory allocator */
6748 __kmp_init_allocator();
6749
6750  /* Register the library startup via an environment variable or shared memory
6751     and check whether another copy of the library is already registered. */
6752
6753 __kmp_register_library_startup();
6754
6755 /* TODO reinitialization of library */
6756 if (TCR_4(__kmp_global.g.g_done)) {
6757 KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
6758 }
6759
6760 __kmp_global.g.g_abort = 0;
6761 TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6762
6763/* initialize the locks */
6764#if KMP_USE_ADAPTIVE_LOCKS
6765#if KMP_DEBUG_ADAPTIVE_LOCKS
6766 __kmp_init_speculative_stats();
6767#endif
6768#endif
6769#if KMP_STATS_ENABLED
6770 __kmp_stats_init();
6771#endif
6772 __kmp_init_lock(&__kmp_global_lock);
6773 __kmp_init_queuing_lock(&__kmp_dispatch_lock);
6774 __kmp_init_lock(&__kmp_debug_lock);
6775 __kmp_init_atomic_lock(&__kmp_atomic_lock);
6776 __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
6777 __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
6778 __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
6779 __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
6780 __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
6781 __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
6782 __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
6783 __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
6784 __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
6785 __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
6786 __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
6787 __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
6788 __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
6789 __kmp_init_bootstrap_lock(&__kmp_exit_lock);
6790#if KMP_USE_MONITOR
6791 __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
6792#endif
6793 __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
6794
6795 /* conduct initialization and initial setup of configuration */
6796
6797 __kmp_runtime_initialize();
6798
6799#if KMP_MIC_SUPPORTED
6800 __kmp_check_mic_type();
6801#endif
6802
6803// Some global variable initialization moved here from kmp_env_initialize()
6804#ifdef KMP_DEBUG
6805 kmp_diag = 0;
6806#endif
6807 __kmp_abort_delay = 0;
6808
6809 // From __kmp_init_dflt_team_nth()
6810 /* assume the entire machine will be used */
6811 __kmp_dflt_team_nth_ub = __kmp_xproc;
6812 if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
6813 __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
6814 }
6815 if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
6816 __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
6817 }
6818 __kmp_max_nth = __kmp_sys_max_nth;
6819 __kmp_cg_max_nth = __kmp_sys_max_nth;
6820 __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
6821 if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
6822 __kmp_teams_max_nth = __kmp_sys_max_nth;
6823 }
6824
6825 // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
6826 // part
6827 __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
6828#if KMP_USE_MONITOR
6829 __kmp_monitor_wakeups =
6830 KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6831 __kmp_bt_intervals =
6832 KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6833#endif
6834 // From "KMP_LIBRARY" part of __kmp_env_initialize()
6835 __kmp_library = library_throughput;
6836 // From KMP_SCHEDULE initialization
6837 __kmp_static = kmp_sch_static_balanced;
6838// AC: do not use analytical here, because it is non-monotonic
6839//__kmp_guided = kmp_sch_guided_iterative_chunked;
6840//__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
6841// need to repeat assignment
6842// Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
6843// bit control and barrier method control parts
6844#if KMP_FAST_REDUCTION_BARRIER
6845#define kmp_reduction_barrier_gather_bb ((int)1)
6846#define kmp_reduction_barrier_release_bb ((int)1)
6847#define kmp_reduction_barrier_gather_pat bp_hyper_bar
6848#define kmp_reduction_barrier_release_pat bp_hyper_bar
6849#endif // KMP_FAST_REDUCTION_BARRIER
6850 for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
6851 __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
6852 __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
6853 __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
6854 __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
6855#if KMP_FAST_REDUCTION_BARRIER
6856 if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
6857 // lin_64 ): hyper,1
6858 __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
6859 __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
6860 __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
6861 __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
6862 }
6863#endif // KMP_FAST_REDUCTION_BARRIER
6864 }
6865#if KMP_FAST_REDUCTION_BARRIER
6866#undef kmp_reduction_barrier_release_pat
6867#undef kmp_reduction_barrier_gather_pat
6868#undef kmp_reduction_barrier_release_bb
6869#undef kmp_reduction_barrier_gather_bb
6870#endif // KMP_FAST_REDUCTION_BARRIER
6871#if KMP_MIC_SUPPORTED
6872 if (__kmp_mic_type == mic2) { // KNC
6873 // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
6874 __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
6875 __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
6876 1; // forkjoin release
6877 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6878 __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6879 }
6880#if KMP_FAST_REDUCTION_BARRIER
6881 if (__kmp_mic_type == mic2) { // KNC
6882 __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6883 __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6884 }
6885#endif // KMP_FAST_REDUCTION_BARRIER
6886#endif // KMP_MIC_SUPPORTED
6887
6888// From KMP_CHECKS initialization
6889#ifdef KMP_DEBUG
6890 __kmp_env_checks = TRUE; /* development versions have the extra checks */
6891#else
6892 __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
6893#endif
6894
6895 // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
6896 __kmp_foreign_tp = TRUE;
6897
6898 __kmp_global.g.g_dynamic = FALSE;
6899 __kmp_global.g.g_dynamic_mode = dynamic_default;
6900
6901 __kmp_init_nesting_mode();
6902
6903 __kmp_env_initialize(NULL);
6904
6905#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
6906 __kmp_user_level_mwait_init();
6907#endif
6908// Print all messages in message catalog for testing purposes.
6909#ifdef KMP_DEBUG
6910 char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
6911 if (__kmp_str_match_true(val)) {
6912 kmp_str_buf_t buffer;
6913 __kmp_str_buf_init(&buffer);
6914 __kmp_i18n_dump_catalog(&buffer);
6915 __kmp_printf("%s", buffer.str);
6916 __kmp_str_buf_free(&buffer);
6917 }
6918 __kmp_env_free(&val);
6919#endif
6920
6921 __kmp_threads_capacity =
6922 __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
6923 // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
6924 __kmp_tp_capacity = __kmp_default_tp_capacity(
6925 __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
6926
6927 // If the library is shut down properly, both pools must be NULL. Just in
6928 // case, set them to NULL -- some memory may leak, but subsequent code will
6929 // work even if pools are not freed.
6930 KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
6931 KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
6932 KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
6933 __kmp_thread_pool = NULL;
6934 __kmp_thread_pool_insert_pt = NULL;
6935 __kmp_team_pool = NULL;
6936
6937 /* Allocate all of the variable sized records */
6938 /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
6939 * expandable */
6940 /* Since allocation is cache-aligned, just add extra padding at the end */
6941 size =
6942 (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
6943 CACHE_LINE;
6944 __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
6945 __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
6946 sizeof(kmp_info_t *) * __kmp_threads_capacity);
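  // Note: __kmp_root shares this single allocation with __kmp_threads (it
  // starts right after the last __kmp_threads slot), so only __kmp_threads is
  // freed at shutdown; see __kmp_cleanup().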
6947
6948 /* init thread counts */
6949 KMP_DEBUG_ASSERT(__kmp_all_nth ==
6950 0); // Asserts fail if the library is reinitializing and
6951 KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
6952 __kmp_all_nth = 0;
6953 __kmp_nth = 0;
6954
6955 /* setup the uber master thread and hierarchy */
6956 gtid = __kmp_register_root(TRUE);
6957 KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid));
6958 KMP_ASSERT(KMP_UBER_GTID(gtid));
6959 KMP_ASSERT(KMP_INITIAL_GTID(gtid));
6960
6961 KMP_MB(); /* Flush all pending memory write invalidates. */
6962
6963 __kmp_common_initialize();
6964
6965#if KMP_OS_UNIX
6966 /* invoke the child fork handler */
6967 __kmp_register_atfork();
6968#endif
6969
6970#if !KMP_DYNAMIC_LIB
6971 {
6972 /* Invoke the exit handler when the program finishes, only for static
6973 library. For dynamic library, we already have _fini and DllMain. */
6974 int rc = atexit(__kmp_internal_end_atexit);
6975 if (rc != 0) {
6976 __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
6977 __kmp_msg_null);
6978 }
6979 }
6980#endif
6981
6982#if KMP_HANDLE_SIGNALS
6983#if KMP_OS_UNIX
6984 /* NOTE: make sure that this is called before the user installs their own
6985 signal handlers so that the user handlers are called first. this way they
6986 can return false, not call our handler, avoid terminating the library, and
6987 continue execution where they left off. */
6988 __kmp_install_signals(FALSE);
6989#endif /* KMP_OS_UNIX */
6990#if KMP_OS_WINDOWS
6991 __kmp_install_signals(TRUE);
6992#endif /* KMP_OS_WINDOWS */
6993#endif
6994
6995 /* we have finished the serial initialization */
6996 __kmp_init_counter++;
6997
6998 __kmp_init_serial = TRUE;
6999
7000 if (__kmp_settings) {
7001 __kmp_env_print();
7002 }
7003
7004 if (__kmp_display_env || __kmp_display_env_verbose) {
7005 __kmp_env_print_2();
7006 }
7007
7008#if OMPT_SUPPORT
7009 ompt_post_init();
7010#endif
7011
7012 KMP_MB();
7013
7014 KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
7015}
7016
7017void __kmp_serial_initialize(void) {
7018 if (__kmp_init_serial) {
7019 return;
7020 }
7021 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7022 if (__kmp_init_serial) {
7023 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7024 return;
7025 }
7026 __kmp_do_serial_initialize();
7027 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7028}
7029
7030static void __kmp_do_middle_initialize(void) {
7031 int i, j;
7032 int prev_dflt_team_nth;
7033
7034 if (!__kmp_init_serial) {
7035 __kmp_do_serial_initialize();
7036 }
7037
7038 KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
7039
7040 // Save the previous value for the __kmp_dflt_team_nth so that
7041 // we can avoid some reinitialization if it hasn't changed.
7042 prev_dflt_team_nth = __kmp_dflt_team_nth;
7043
7044#if KMP_AFFINITY_SUPPORTED
7045 // __kmp_affinity_initialize() will try to set __kmp_ncores to the
7046 // number of cores on the machine.
7047 __kmp_affinity_initialize();
7048
7049#endif /* KMP_AFFINITY_SUPPORTED */
7050
7051 KMP_ASSERT(__kmp_xproc > 0);
7052 if (__kmp_avail_proc == 0) {
7053 __kmp_avail_proc = __kmp_xproc;
7054 }
7055
7056 // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
7057 // correct them now
7058 j = 0;
7059 while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
7060 __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
7061 __kmp_avail_proc;
7062 j++;
7063 }
7064
7065 if (__kmp_dflt_team_nth == 0) {
7066#ifdef KMP_DFLT_NTH_CORES
7067 // Default #threads = #cores
7068 __kmp_dflt_team_nth = __kmp_ncores;
7069 KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7070 "__kmp_ncores (%d)\n",
7071 __kmp_dflt_team_nth));
7072#else
7073 // Default #threads = #available OS procs
7074 __kmp_dflt_team_nth = __kmp_avail_proc;
7075 KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7076 "__kmp_avail_proc(%d)\n",
7077 __kmp_dflt_team_nth));
7078#endif /* KMP_DFLT_NTH_CORES */
7079 }
7080
7081 if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
7082 __kmp_dflt_team_nth = KMP_MIN_NTH;
7083 }
7084 if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
7085 __kmp_dflt_team_nth = __kmp_sys_max_nth;
7086 }
7087
7088 if (__kmp_nesting_mode > 0)
7089 __kmp_set_nesting_mode_threads();
7090
7091 // There's no harm in continuing if the following check fails,
7092 // but it indicates an error in the previous logic.
7093 KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
7094
7095 if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
7096 // Run through the __kmp_threads array and set the num threads icv for each
7097 // root thread that is currently registered with the RTL (which has not
7098 // already explicitly set its nthreads-var with a call to
7099 // omp_set_num_threads()).
7100 for (i = 0; i < __kmp_threads_capacity; i++) {
7101 kmp_info_t *thread = __kmp_threads[i];
7102 if (thread == NULL)
7103 continue;
7104 if (thread->th.th_current_task->td_icvs.nproc != 0)
7105 continue;
7106
7107 set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
7108 }
7109 }
7110 KA_TRACE(
7111 20,
7112 ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
7113 __kmp_dflt_team_nth));
7114
7115#ifdef KMP_ADJUST_BLOCKTIME
7116 /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */
7117 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
7118 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
7119 if (__kmp_nth > __kmp_avail_proc) {
7120 __kmp_zero_bt = TRUE;
7121 }
7122 }
7123#endif /* KMP_ADJUST_BLOCKTIME */
7124
7125 /* we have finished middle initialization */
7126 TCW_SYNC_4(__kmp_init_middle, TRUE);
7127
7128 KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
7129}
7130
7131void __kmp_middle_initialize(void) {
7132 if (__kmp_init_middle) {
7133 return;
7134 }
7135 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7136 if (__kmp_init_middle) {
7137 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7138 return;
7139 }
7140 __kmp_do_middle_initialize();
7141 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7142}
7143
7144void __kmp_parallel_initialize(void) {
7145 int gtid = __kmp_entry_gtid(); // this might be a new root
7146
7147 /* synchronize parallel initialization (for sibling) */
7148 if (TCR_4(__kmp_init_parallel))
7149 return;
7150 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7151 if (TCR_4(__kmp_init_parallel)) {
7152 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7153 return;
7154 }
7155
7156 /* TODO reinitialization after we have already shut down */
7157 if (TCR_4(__kmp_global.g.g_done)) {
7158 KA_TRACE(
7159 10,
7160 ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
7161 __kmp_infinite_loop();
7162 }
7163
7164 /* jc: The lock __kmp_initz_lock is already held, so calling
7165 __kmp_serial_initialize would cause a deadlock. So we call
7166 __kmp_do_serial_initialize directly. */
7167 if (!__kmp_init_middle) {
7168 __kmp_do_middle_initialize();
7169 }
7170 __kmp_assign_root_init_mask();
7171 __kmp_resume_if_hard_paused();
7172
7173 /* begin initialization */
7174 KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
7175 KMP_ASSERT(KMP_UBER_GTID(gtid));
7176
7177#if KMP_ARCH_X86 || KMP_ARCH_X86_64
7178 // Save the FP control regs.
7179 // Worker threads will set theirs to these values at thread startup.
7180 __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
7181 __kmp_store_mxcsr(&__kmp_init_mxcsr);
7182 __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
7183#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
7184
7185#if KMP_OS_UNIX
7186#if KMP_HANDLE_SIGNALS
7187 /* must be after __kmp_serial_initialize */
7188 __kmp_install_signals(TRUE);
7189#endif
7190#endif
7191
7192 __kmp_suspend_initialize();
7193
7194#if defined(USE_LOAD_BALANCE)
7195 if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7196 __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
7197 }
7198#else
7199 if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7200 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7201 }
7202#endif
7203
7204 if (__kmp_version) {
7205 __kmp_print_version_2();
7206 }
7207
7208 /* we have finished parallel initialization */
7209 TCW_SYNC_4(__kmp_init_parallel, TRUE);
7210
7211 KMP_MB();
7212 KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
7213
7214 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7215}
7216
7217void __kmp_hidden_helper_initialize() {
7218 if (TCR_4(__kmp_init_hidden_helper))
7219 return;
7220
7221 // __kmp_parallel_initialize is required before we initialize hidden helper
7222 if (!TCR_4(__kmp_init_parallel))
7223 __kmp_parallel_initialize();
7224
7225 // Double check. Note that this double check should not be placed before
7226  // __kmp_parallel_initialize, as that would cause a deadlock.
7227 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7228 if (TCR_4(__kmp_init_hidden_helper)) {
7229 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7230 return;
7231 }
7232
7233 // Set the count of hidden helper tasks to be executed to zero
7234 KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0);
7235
7236 // Set the global variable indicating that we're initializing hidden helper
7237 // team/threads
7238 TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE);
7239
7240 // Platform independent initialization
7241 __kmp_do_initialize_hidden_helper_threads();
7242
7243 // Wait here for the finish of initialization of hidden helper teams
7244 __kmp_hidden_helper_threads_initz_wait();
7245
7246 // We have finished hidden helper initialization
7247 TCW_SYNC_4(__kmp_init_hidden_helper, TRUE);
7248
7249 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7250}
7251
7252/* ------------------------------------------------------------------------ */
7253
7254void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7255 kmp_team_t *team) {
7256 kmp_disp_t *dispatch;
7257
7258 KMP_MB();
7259
7260 /* none of the threads have encountered any constructs, yet. */
7261 this_thr->th.th_local.this_construct = 0;
7262#if KMP_CACHE_MANAGE
7263 KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
7264#endif /* KMP_CACHE_MANAGE */
7265 dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7266 KMP_DEBUG_ASSERT(dispatch);
7267 KMP_DEBUG_ASSERT(team->t.t_dispatch);
7268 // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7269 // this_thr->th.th_info.ds.ds_tid ] );
7270
7271 dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7272 dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
7273 if (__kmp_env_consistency_check)
7274 __kmp_push_parallel(gtid, team->t.t_ident);
7275
7276 KMP_MB(); /* Flush all pending memory write invalidates. */
7277}
7278
7279void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7280 kmp_team_t *team) {
7281 if (__kmp_env_consistency_check)
7282 __kmp_pop_parallel(gtid, team->t.t_ident);
7283
7284 __kmp_finish_implicit_task(this_thr);
7285}
7286
7287int __kmp_invoke_task_func(int gtid) {
7288 int rc;
7289 int tid = __kmp_tid_from_gtid(gtid);
7290 kmp_info_t *this_thr = __kmp_threads[gtid];
7291 kmp_team_t *team = this_thr->th.th_team;
7292
7293 __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7294#if USE_ITT_BUILD
7295 if (__itt_stack_caller_create_ptr) {
7296 // inform ittnotify about entering user's code
7297 if (team->t.t_stack_id != NULL) {
7298 __kmp_itt_stack_callee_enter((__itt_caller)team->t.t_stack_id);
7299 } else {
7300 KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7301 __kmp_itt_stack_callee_enter(
7302 (__itt_caller)team->t.t_parent->t.t_stack_id);
7303 }
7304 }
7305#endif /* USE_ITT_BUILD */
7306#if INCLUDE_SSC_MARKS
7307 SSC_MARK_INVOKING();
7308#endif
7309
7310#if OMPT_SUPPORT
7311 void *dummy;
7312 void **exit_frame_p;
7313 ompt_data_t *my_task_data;
7314 ompt_data_t *my_parallel_data;
7315 int ompt_team_size;
7316
7317 if (ompt_enabled.enabled) {
7318 exit_frame_p = &(team->t.t_implicit_task_taskdata[tid]
7319 .ompt_task_info.frame.exit_frame.ptr);
7320 } else {
7321 exit_frame_p = &dummy;
7322 }
7323
7324 my_task_data =
7325 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7326 my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7327 if (ompt_enabled.ompt_callback_implicit_task) {
7328 ompt_team_size = team->t.t_nproc;
7329 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7330 ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7331 __kmp_tid_from_gtid(gtid), ompt_task_implicit);
7332 OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7333 }
7334#endif
7335
7336#if KMP_STATS_ENABLED
7337 stats_state_e previous_state = KMP_GET_THREAD_STATE();
7338 if (previous_state == stats_state_e::TEAMS_REGION) {
7339 KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
7340 } else {
7341 KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
7342 }
7343 KMP_SET_THREAD_STATE(IMPLICIT_TASK);
7344#endif
7345
7346 rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7347 tid, (int)team->t.t_argc, (void **)team->t.t_argv
7348#if OMPT_SUPPORT
7349 ,
7350 exit_frame_p
7351#endif
7352 );
7353#if OMPT_SUPPORT
7354 *exit_frame_p = NULL;
7355 this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team;
7356#endif
7357
7358#if KMP_STATS_ENABLED
7359 if (previous_state == stats_state_e::TEAMS_REGION) {
7360 KMP_SET_THREAD_STATE(previous_state);
7361 }
7362 KMP_POP_PARTITIONED_TIMER();
7363#endif
7364
7365#if USE_ITT_BUILD
7366 if (__itt_stack_caller_create_ptr) {
7367 // inform ittnotify about leaving user's code
7368 if (team->t.t_stack_id != NULL) {
7369 __kmp_itt_stack_callee_leave((__itt_caller)team->t.t_stack_id);
7370 } else {
7371 KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7372 __kmp_itt_stack_callee_leave(
7373 (__itt_caller)team->t.t_parent->t.t_stack_id);
7374 }
7375 }
7376#endif /* USE_ITT_BUILD */
7377 __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7378
7379 return rc;
7380}
7381
7382void __kmp_teams_master(int gtid) {
7383 // This routine is called by all primary threads in teams construct
7384 kmp_info_t *thr = __kmp_threads[gtid];
7385 kmp_team_t *team = thr->th.th_team;
7386 ident_t *loc = team->t.t_ident;
7387 thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7388 KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7389 KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7390 KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7391 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7392
7393 // This thread is a new CG root. Set up the proper variables.
7394 kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7395 tmp->cg_root = thr; // Make thr the CG root
7396 // Init to thread limit stored when league primary threads were forked
7397 tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7398 tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7399 KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7400 " cg_nthreads to 1\n",
7401 thr, tmp));
7402 tmp->up = thr->th.th_cg_roots;
7403 thr->th.th_cg_roots = tmp;
7404
7405// Launch the league of teams now, but do not let the workers execute yet
7406// (they wait on the fork barrier until the next parallel region).
7407#if INCLUDE_SSC_MARKS
7408 SSC_MARK_FORKING();
7409#endif
7410 __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7411 (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7412 VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7413#if INCLUDE_SSC_MARKS
7414 SSC_MARK_JOINING();
7415#endif
7416 // If the team size was reduced from the limit, set it to the new size
7417 if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7418 thr->th.th_teams_size.nth = thr->th.th_team_nproc;
7419 // AC: last parameter "1" eliminates join barrier which won't work because
7420 // worker threads are in a fork barrier waiting for more parallel regions
7421 __kmp_join_call(loc, gtid
7422#if OMPT_SUPPORT
7423 ,
7424 fork_context_intel
7425#endif
7426 ,
7427 1);
7428}
7429
7430int __kmp_invoke_teams_master(int gtid) {
7431 kmp_info_t *this_thr = __kmp_threads[gtid];
7432 kmp_team_t *team = this_thr->th.th_team;
7433#if KMP_DEBUG
7434 if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7435 KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7436 (void *)__kmp_teams_master);
7437#endif
7438 __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7439#if OMPT_SUPPORT
7440 int tid = __kmp_tid_from_gtid(gtid);
7441 ompt_data_t *task_data =
7442 &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
7443 ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
7444 if (ompt_enabled.ompt_callback_implicit_task) {
7445 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7446 ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
7447 ompt_task_initial);
7448 OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
7449 }
7450#endif
7451 __kmp_teams_master(gtid);
7452#if OMPT_SUPPORT
7453 this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league;
7454#endif
7455 __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7456 return 1;
7457}
7458
7459/* This sets the requested number of threads for the next parallel region
7460   encountered by this team. Since this should be enclosed in the fork/join
7461   critical section, it should avoid race conditions with asymmetric nested
7462   parallelism. */
7463
7464void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7465 kmp_info_t *thr = __kmp_threads[gtid];
7466
7467 if (num_threads > 0)
7468 thr->th.th_set_nproc = num_threads;
7469}
7470
7471static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams,
7472 int num_threads) {
7473 KMP_DEBUG_ASSERT(thr);
7474 // Remember the number of threads for inner parallel regions
7475 if (!TCR_4(__kmp_init_middle))
7476 __kmp_middle_initialize(); // get internal globals calculated
7477 __kmp_assign_root_init_mask();
7478 KMP_DEBUG_ASSERT(__kmp_avail_proc);
7479 KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
7480
7481 if (num_threads == 0) {
7482 if (__kmp_teams_thread_limit > 0) {
7483 num_threads = __kmp_teams_thread_limit;
7484 } else {
7485 num_threads = __kmp_avail_proc / num_teams;
7486 }
7487    // adjust num_threads w/o warning as it is not a user setting
7488 // num_threads = min(num_threads, nthreads-var, thread-limit-var)
7489 // no thread_limit clause specified - do not change thread-limit-var ICV
7490 if (num_threads > __kmp_dflt_team_nth) {
7491 num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7492 }
7493 if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
7494 num_threads = thr->th.th_current_task->td_icvs.thread_limit;
7495    } // prevent team size from exceeding thread-limit-var
7496 if (num_teams * num_threads > __kmp_teams_max_nth) {
7497 num_threads = __kmp_teams_max_nth / num_teams;
7498 }
7499 if (num_threads == 0) {
7500 num_threads = 1;
7501 }
7502 } else {
7503    // This thread will be the primary thread of the league's primary threads.
7504 // Store new thread limit; old limit is saved in th_cg_roots list
7505 thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7506 // num_threads = min(num_threads, nthreads-var)
7507 if (num_threads > __kmp_dflt_team_nth) {
7508 num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7509 }
7510 if (num_teams * num_threads > __kmp_teams_max_nth) {
7511 int new_threads = __kmp_teams_max_nth / num_teams;
7512 if (new_threads == 0) {
7513 new_threads = 1;
7514 }
7515 if (new_threads != num_threads) {
7516 if (!__kmp_reserve_warn) { // user asked for too many threads
7517 __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7518 __kmp_msg(kmp_ms_warning,
7519 KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7520 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7521 }
7522 }
7523 num_threads = new_threads;
7524 }
7525 }
7526 thr->th.th_teams_size.nth = num_threads;
7527}
7528
7529/* This sets the requested number of teams for the teams region and/or
7530 the number of threads for the next parallel region encountered */
7531void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7532 int num_threads) {
7533 kmp_info_t *thr = __kmp_threads[gtid];
7534 KMP_DEBUG_ASSERT(num_teams >= 0);
7535 KMP_DEBUG_ASSERT(num_threads >= 0);
7536
7537 if (num_teams == 0) {
7538 if (__kmp_nteams > 0) {
7539 num_teams = __kmp_nteams;
7540 } else {
7541 num_teams = 1; // default number of teams is 1.
7542 }
7543 }
7544  if (num_teams > __kmp_teams_max_nth) { // too many teams requested?
7545 if (!__kmp_reserve_warn) {
7546 __kmp_reserve_warn = 1;
7547 __kmp_msg(kmp_ms_warning,
7548 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7549 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7550 }
7551 num_teams = __kmp_teams_max_nth;
7552 }
7553 // Set number of teams (number of threads in the outer "parallel" of the
7554 // teams)
7555 thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7556
7557 __kmp_push_thread_limit(thr, num_teams, num_threads);
7558}
7559
7560/* This sets the requested number of teams for the teams region and/or
7561 the number of threads for the next parallel region encountered */
7562void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb,
7563 int num_teams_ub, int num_threads) {
7564 kmp_info_t *thr = __kmp_threads[gtid];
7565 KMP_DEBUG_ASSERT(num_teams_lb >= 0 && num_teams_ub >= 0);
7566 KMP_DEBUG_ASSERT(num_teams_ub >= num_teams_lb);
7567 KMP_DEBUG_ASSERT(num_threads >= 0);
7568
7569 if (num_teams_lb > num_teams_ub) {
7570 __kmp_fatal(KMP_MSG(FailedToCreateTeam, num_teams_lb, num_teams_ub),
7571 KMP_HNT(SetNewBound, __kmp_teams_max_nth), __kmp_msg_null);
7572 }
7573
7574  int num_teams = 1; // default number of teams is 1.
7575
7576 if (num_teams_lb == 0 && num_teams_ub > 0)
7577 num_teams_lb = num_teams_ub;
7578
7579 if (num_teams_lb == 0 && num_teams_ub == 0) { // no num_teams clause
7580 num_teams = (__kmp_nteams > 0) ? __kmp_nteams : num_teams;
7581 if (num_teams > __kmp_teams_max_nth) {
7582 if (!__kmp_reserve_warn) {
7583 __kmp_reserve_warn = 1;
7584 __kmp_msg(kmp_ms_warning,
7585 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7586 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7587 }
7588 num_teams = __kmp_teams_max_nth;
7589 }
7590 } else if (num_teams_lb == num_teams_ub) { // requires exact number of teams
7591 num_teams = num_teams_ub;
7592 } else { // num_teams_lb <= num_teams <= num_teams_ub
7593 if (num_threads == 0) {
7594 if (num_teams_ub > __kmp_teams_max_nth) {
7595 num_teams = num_teams_lb;
7596 } else {
7597 num_teams = num_teams_ub;
7598 }
7599 } else {
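      // Fit as many teams as the global thread budget (__kmp_teams_max_nth)
      // allows for the requested threads-per-team, then clamp the result into
      // [num_teams_lb, num_teams_ub].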
7600 num_teams = (num_threads > __kmp_teams_max_nth)
7601 ? num_teams
7602 : __kmp_teams_max_nth / num_threads;
7603 if (num_teams < num_teams_lb) {
7604 num_teams = num_teams_lb;
7605 } else if (num_teams > num_teams_ub) {
7606 num_teams = num_teams_ub;
7607 }
7608 }
7609 }
7610 // Set number of teams (number of threads in the outer "parallel" of the
7611 // teams)
7612 thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7613
7614 __kmp_push_thread_limit(thr, num_teams, num_threads);
7615}
7616
7617// Set the proc_bind var to use in the following parallel region.
7618void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7619 kmp_info_t *thr = __kmp_threads[gtid];
7620 thr->th.th_set_proc_bind = proc_bind;
7621}
7622
7623/* Launch the worker threads into the microtask. */
7624
7625void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7626 kmp_info_t *this_thr = __kmp_threads[gtid];
7627
7628#ifdef KMP_DEBUG
7629 int f;
7630#endif /* KMP_DEBUG */
7631
7632 KMP_DEBUG_ASSERT(team);
7633 KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7634 KMP_ASSERT(KMP_MASTER_GTID(gtid));
7635 KMP_MB(); /* Flush all pending memory write invalidates. */
7636
7637 team->t.t_construct = 0; /* no single directives seen yet */
7638 team->t.t_ordered.dt.t_value =
7639 0; /* thread 0 enters the ordered section first */
7640
7641 /* Reset the identifiers on the dispatch buffer */
7642 KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
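  // Each dispatch buffer is seeded with its slot number as the initial index
  // so that successive worksharing constructs, which cycle through the buffers
  // modulo __kmp_dispatch_num_buffers, find the index they expect.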
7643 if (team->t.t_max_nproc > 1) {
7644 int i;
7645 for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7646 team->t.t_disp_buffer[i].buffer_index = i;
7647 team->t.t_disp_buffer[i].doacross_buf_idx = i;
7648 }
7649 } else {
7650 team->t.t_disp_buffer[0].buffer_index = 0;
7651 team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7652 }
7653
7654 KMP_MB(); /* Flush all pending memory write invalidates. */
7655 KMP_ASSERT(this_thr->th.th_team == team);
7656
7657#ifdef KMP_DEBUG
7658 for (f = 0; f < team->t.t_nproc; f++) {
7659 KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7660 team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7661 }
7662#endif /* KMP_DEBUG */
7663
7664 /* release the worker threads so they may begin working */
7665 __kmp_fork_barrier(gtid, 0);
7666}
7667
7668void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7669 kmp_info_t *this_thr = __kmp_threads[gtid];
7670
7671 KMP_DEBUG_ASSERT(team);
7672 KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7673 KMP_ASSERT(KMP_MASTER_GTID(gtid));
7674 KMP_MB(); /* Flush all pending memory write invalidates. */
7675
7676 /* Join barrier after fork */
7677
7678#ifdef KMP_DEBUG
7679 if (__kmp_threads[gtid] &&
7680 __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
7681 __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
7682 __kmp_threads[gtid]);
7683 __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
7684 "team->t.t_nproc=%d\n",
7685 gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
7686 team->t.t_nproc);
7687 __kmp_print_structure();
7688 }
7689 KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
7690 __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
7691#endif /* KMP_DEBUG */
7692
7693 __kmp_join_barrier(gtid); /* wait for everyone */
7694#if OMPT_SUPPORT
7695 if (ompt_enabled.enabled &&
7696 this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
7697 int ds_tid = this_thr->th.th_info.ds.ds_tid;
7698 ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
7699 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
7700#if OMPT_OPTIONAL
7701 void *codeptr = NULL;
7702 if (KMP_MASTER_TID(ds_tid) &&
7703 (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
7704 ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
7705 codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
7706
7707 if (ompt_enabled.ompt_callback_sync_region_wait) {
7708 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
7709 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7710 codeptr);
7711 }
7712 if (ompt_enabled.ompt_callback_sync_region) {
7713 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
7714 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7715 codeptr);
7716 }
7717#endif
7718 if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
7719 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7720 ompt_scope_end, NULL, task_data, 0, ds_tid,
7721 ompt_task_implicit); // TODO: Can this be ompt_task_initial?
7722 }
7723 }
7724#endif
7725
7726 KMP_MB(); /* Flush all pending memory write invalidates. */
7727 KMP_ASSERT(this_thr->th.th_team == team);
7728}
7729
7730/* ------------------------------------------------------------------------ */
7731
7732#ifdef USE_LOAD_BALANCE
7733
7734// Return the number of worker threads actively spinning in the hot team, if
7735// we are at the outermost level of parallelism. Otherwise, return 0.
7736static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
7737 int i;
7738 int retval;
7739 kmp_team_t *hot_team;
7740
7741 if (root->r.r_active) {
7742 return 0;
7743 }
7744 hot_team = root->r.r_hot_team;
7745 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
7746 return hot_team->t.t_nproc - 1; // Don't count primary thread
7747 }
7748
7749 // Skip the primary thread - it is accounted for elsewhere.
7750 retval = 0;
7751 for (i = 1; i < hot_team->t.t_nproc; i++) {
7752 if (hot_team->t.t_threads[i]->th.th_active) {
7753 retval++;
7754 }
7755 }
7756 return retval;
7757}
7758
7759// Perform an automatic adjustment to the number of
7760// threads used by the next parallel region.
7761static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
7762 int retval;
7763 int pool_active;
7764 int hot_team_active;
7765 int team_curr_active;
7766 int system_active;
7767
7768 KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
7769 set_nproc));
7770 KMP_DEBUG_ASSERT(root);
7771 KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
7772 ->th.th_current_task->td_icvs.dynamic == TRUE);
7773 KMP_DEBUG_ASSERT(set_nproc > 1);
7774
7775 if (set_nproc == 1) {
7776 KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
7777 return 1;
7778 }
7779
7780 // Threads that are active in the thread pool, active in the hot team for this
7781 // particular root (if we are at the outer par level), and the currently
7782 // executing thread (to become the primary thread) are available to add to the
7783 // new team, but are currently contributing to the system load, and must be
7784 // accounted for.
7785 pool_active = __kmp_thread_pool_active_nth;
7786 hot_team_active = __kmp_active_hot_team_nproc(root);
7787 team_curr_active = pool_active + hot_team_active + 1;
7788
7789 // Check the system load.
7790 system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
7791 KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
7792 "hot team active = %d\n",
7793 system_active, pool_active, hot_team_active));
7794
7795 if (system_active < 0) {
7796 // There was an error reading the necessary info from /proc, so use the
7797 // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
7798 // = dynamic_thread_limit, we shouldn't wind up getting back here.
7799 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7800 KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
7801
7802 // Make this call behave like the thread limit algorithm.
7803 retval = __kmp_avail_proc - __kmp_nth +
7804 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
7805 if (retval > set_nproc) {
7806 retval = set_nproc;
7807 }
7808 if (retval < KMP_MIN_NTH) {
7809 retval = KMP_MIN_NTH;
7810 }
7811
7812 KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
7813 retval));
7814 return retval;
7815 }
7816
7817 // There is a slight delay in the load balance algorithm in detecting new
7818 // running procs. The real system load at this instant should be at least as
7819  // large as the number of active OMP threads available to add to the team.
7820 if (system_active < team_curr_active) {
7821 system_active = team_curr_active;
7822 }
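  // Budget: processors not consumed by other system load, plus the threads we
  // already control (pool + hot team + this thread), clamped below to
  // KMP_MIN_NTH and above to the requested set_nproc.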
7823 retval = __kmp_avail_proc - system_active + team_curr_active;
7824 if (retval > set_nproc) {
7825 retval = set_nproc;
7826 }
7827 if (retval < KMP_MIN_NTH) {
7828 retval = KMP_MIN_NTH;
7829 }
7830
7831 KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
7832 return retval;
7833} // __kmp_load_balance_nproc()
7834
7835#endif /* USE_LOAD_BALANCE */
7836
7837/* ------------------------------------------------------------------------ */
7838
7839/* NOTE: this is called with the __kmp_init_lock held */
7840void __kmp_cleanup(void) {
7841 int f;
7842
7843 KA_TRACE(10, ("__kmp_cleanup: enter\n"));
7844
7845 if (TCR_4(__kmp_init_parallel)) {
7846#if KMP_HANDLE_SIGNALS
7847 __kmp_remove_signals();
7848#endif
7849 TCW_4(__kmp_init_parallel, FALSE);
7850 }
7851
7852 if (TCR_4(__kmp_init_middle)) {
7853#if KMP_AFFINITY_SUPPORTED
7854 __kmp_affinity_uninitialize();
7855#endif /* KMP_AFFINITY_SUPPORTED */
7856 __kmp_cleanup_hierarchy();
7857 TCW_4(__kmp_init_middle, FALSE);
7858 }
7859
7860 KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
7861
7862 if (__kmp_init_serial) {
7863 __kmp_runtime_destroy();
7864 __kmp_init_serial = FALSE;
7865 }
7866
7867 __kmp_cleanup_threadprivate_caches();
7868
7869 for (f = 0; f < __kmp_threads_capacity; f++) {
7870 if (__kmp_root[f] != NULL) {
7871 __kmp_free(__kmp_root[f]);
7872 __kmp_root[f] = NULL;
7873 }
7874 }
7875 __kmp_free(__kmp_threads);
7876 // __kmp_threads and __kmp_root were allocated at once, as a single block, so
7877 // there is no need to free __kmp_root separately.
7878 __kmp_threads = NULL;
7879 __kmp_root = NULL;
7880 __kmp_threads_capacity = 0;
7881
7882#if KMP_USE_DYNAMIC_LOCK
7883 __kmp_cleanup_indirect_user_locks();
7884#else
7885 __kmp_cleanup_user_locks();
7886#endif
7887#if OMPD_SUPPORT
7888 if (ompd_state) {
7889 __kmp_free(ompd_env_block);
7890 ompd_env_block = NULL;
7891 ompd_env_block_size = 0;
7892 }
7893#endif
7894
7895#if KMP_AFFINITY_SUPPORTED
7896 KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
7897 __kmp_cpuinfo_file = NULL;
7898#endif /* KMP_AFFINITY_SUPPORTED */
7899
7900#if KMP_USE_ADAPTIVE_LOCKS
7901#if KMP_DEBUG_ADAPTIVE_LOCKS
7902 __kmp_print_speculative_stats();
7903#endif
7904#endif
7905 KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
7906 __kmp_nested_nth.nth = NULL;
7907 __kmp_nested_nth.size = 0;
7908 __kmp_nested_nth.used = 0;
7909 KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
7910 __kmp_nested_proc_bind.bind_types = NULL;
7911 __kmp_nested_proc_bind.size = 0;
7912 __kmp_nested_proc_bind.used = 0;
7913 if (__kmp_affinity_format) {
7914 KMP_INTERNAL_FREE(__kmp_affinity_format);
7915 __kmp_affinity_format = NULL;
7916 }
7917
7918 __kmp_i18n_catclose();
7919
7920#if KMP_USE_HIER_SCHED
7921 __kmp_hier_scheds.deallocate();
7922#endif
7923
7924#if KMP_STATS_ENABLED
7925 __kmp_stats_fini();
7926#endif
7927
7928 KA_TRACE(10, ("__kmp_cleanup: exit\n"));
7929}
7930
7931/* ------------------------------------------------------------------------ */
7932
7933int __kmp_ignore_mppbeg(void) {
7934 char *env;
7935
7936 if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
7937 if (__kmp_str_match_false(env))
7938 return FALSE;
7939 }
7940 // By default __kmpc_begin() is no-op.
7941 return TRUE;
7942}
7943
7944int __kmp_ignore_mppend(void) {
7945 char *env;
7946
7947 if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
7948 if (__kmp_str_match_false(env))
7949 return FALSE;
7950 }
7951 // By default __kmpc_end() is no-op.
7952 return TRUE;
7953}
7954
7955void __kmp_internal_begin(void) {
7956 int gtid;
7957 kmp_root_t *root;
7958
7959 /* this is a very important step as it will register new sibling threads
7960 and assign these new uber threads new gtids */
7961 gtid = __kmp_entry_gtid();
7962 root = __kmp_threads[gtid]->th.th_root;
7963 KMP_ASSERT(KMP_UBER_GTID(gtid));
7964
7965 if (root->r.r_begin)
7966 return;
7967 __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
7968 if (root->r.r_begin) {
7969 __kmp_release_lock(&root->r.r_begin_lock, gtid);
7970 return;
7971 }
7972
7973 root->r.r_begin = TRUE;
7974
7975 __kmp_release_lock(&root->r.r_begin_lock, gtid);
7976}
7977
7978/* ------------------------------------------------------------------------ */
7979
7980void __kmp_user_set_library(enum library_type arg) {
7981 int gtid;
7982 kmp_root_t *root;
7983 kmp_info_t *thread;
7984
7985 /* first, make sure we are initialized so we can get our gtid */
7986
7987 gtid = __kmp_entry_gtid();
7988 thread = __kmp_threads[gtid];
7989
7990 root = thread->th.th_root;
7991
7992 KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
7993 library_serial));
7994 if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
7995 thread */
7996 KMP_WARNING(SetLibraryIncorrectCall);
7997 return;
7998 }
7999
8000 switch (arg) {
8001 case library_serial:
8002 thread->th.th_set_nproc = 0;
8003 set__nproc(thread, 1);
8004 break;
8005 case library_turnaround:
8006 thread->th.th_set_nproc = 0;
8007 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8008 : __kmp_dflt_team_nth_ub);
8009 break;
8010 case library_throughput:
8011 thread->th.th_set_nproc = 0;
8012 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8013 : __kmp_dflt_team_nth_ub);
8014 break;
8015 default:
8016 KMP_FATAL(UnknownLibraryType, arg);
8017 }
8018
8019 __kmp_aux_set_library(arg);
8020}
8021
8022void __kmp_aux_set_stacksize(size_t arg) {
8023 if (!__kmp_init_serial)
8024 __kmp_serial_initialize();
8025
8026#if KMP_OS_DARWIN
8027 if (arg & (0x1000 - 1)) {
8028 arg &= ~(0x1000 - 1);
8029 if (arg + 0x1000) /* check for overflow if we round up */
8030 arg += 0x1000;
8031 }
8032#endif
8033 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8034
8035 /* only change the default stacksize before the first parallel region */
8036 if (!TCR_4(__kmp_init_parallel)) {
8037 size_t value = arg; /* argument is in bytes */
8038
8039 if (value < __kmp_sys_min_stksize)
8040 value = __kmp_sys_min_stksize;
8041 else if (value > KMP_MAX_STKSIZE)
8042 value = KMP_MAX_STKSIZE;
8043
8044 __kmp_stksize = value;
8045
8046 __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
8047 }
8048
8049 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8050}
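/* A user-side sketch of reaching __kmp_aux_set_stacksize through the KMP
   extension API (assumes omp.h declares kmp_set_stacksize_s, as this runtime's
   omp.h does; the 4 MiB value is only an example):

     #include <omp.h>
     #include <stdio.h>

     int main(void) {
       kmp_set_stacksize_s((size_t)4 * 1024 * 1024); // before 1st parallel
     #pragma omp parallel
       printf("hello from thread %d\n", omp_get_thread_num());
       return 0;
     }

   As the __kmp_init_parallel check above shows, the call only changes the
   default worker stack size if it happens before the first parallel region. */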
8051
8052/* set the behaviour of the runtime library */
8053/* TODO this can cause some odd behaviour with sibling parallelism... */
8054void __kmp_aux_set_library(enum library_type arg) {
8055 __kmp_library = arg;
8056
8057 switch (__kmp_library) {
8058 case library_serial: {
8059 KMP_INFORM(LibraryIsSerial);
8060 } break;
8061 case library_turnaround:
8062 if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
8063 __kmp_use_yield = 2; // only yield when oversubscribed
8064 break;
8065 case library_throughput:
8066 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
8067 __kmp_dflt_blocktime = 200;
8068 break;
8069 default:
8070 KMP_FATAL(UnknownLibraryType, arg);
8071 }
8072}
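/* The same library mode can be requested from user code or the environment
   (hedged sketch; kmp_set_library_throughput() and KMP_LIBRARY are extensions
   provided by this runtime):

     // shell: KMP_LIBRARY=throughput ./a.out
     #include <omp.h>
     int main(void) {
       kmp_set_library_throughput(); // or kmp_set_library_serial(),
                                     //    kmp_set_library_turnaround()
     #pragma omp parallel
       { }
       return 0;
     }
*/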
8073
8074/* Get team information common to all teams-related API */
8075// Returns NULL if not in a teams construct
8076static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
8077 kmp_info_t *thr = __kmp_entry_thread();
8078 teams_serialized = 0;
8079 if (thr->th.th_teams_microtask) {
8080 kmp_team_t *team = thr->th.th_team;
8081 int tlevel = thr->th.th_teams_level; // the level of the teams construct
8082 int ii = team->t.t_level;
8083 teams_serialized = team->t.t_serialized;
8084 int level = tlevel + 1;
8085 KMP_DEBUG_ASSERT(ii >= tlevel);
8086 while (ii > level) {
8087 for (teams_serialized = team->t.t_serialized;
8088 (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
8089 }
8090 if (team->t.t_serialized && (!teams_serialized)) {
8091 team = team->t.t_parent;
8092 continue;
8093 }
8094 if (ii > level) {
8095 team = team->t.t_parent;
8096 ii--;
8097 }
8098 }
8099 return team;
8100 }
8101 return NULL;
8102}
8103
8104int __kmp_aux_get_team_num() {
8105 int serialized;
8106 kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8107 if (team) {
8108 if (serialized > 1) {
8109 return 0; // teams region is serialized ( 1 team of 1 thread ).
8110 } else {
8111 return team->t.t_master_tid;
8112 }
8113 }
8114 return 0;
8115}
8116
8117int __kmp_aux_get_num_teams() {
8118 int serialized;
8119 kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8120 if (team) {
8121 if (serialized > 1) {
8122 return 1;
8123 } else {
8124 return team->t.t_parent->t.t_nproc;
8125 }
8126 }
8127 return 1;
8128}
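/* These helpers appear to back the standard teams queries. A minimal sketch,
   assuming a compiler that supports host teams (OpenMP 5.0); the team count is
   only a request:

     #include <omp.h>
     #include <stdio.h>
     int main(void) {
     #pragma omp teams num_teams(2)
       printf("team %d of %d\n", omp_get_team_num(), omp_get_num_teams());
       return 0;
     }
*/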
8129
8130/* ------------------------------------------------------------------------ */
8131
8132/*
8133 * Affinity Format Parser
8134 *
8135 * Field is in form of: %[[[0].]size]type
8136 * % and type are required (%% means print a literal '%')
8137 * type is either single char or long name surrounded by {},
8138 * e.g., N or {num_threads}
8139 * 0 => leading zeros
8140 * . => right justified when size is specified
8141 * by default output is left justified
8142 * size is the *minimum* field length
8143 * All other characters are printed as is
8144 *
8145 * Available field types:
8146 * L {nesting_level} - omp_get_level()
8147 * n {thread_num} - omp_get_thread_num()
8148 * H {host} - name of host machine
8149 * P {process_id} - process id (integer)
8150 * i {native_thread_id} - native thread identifier (integer)
8151 * N {num_threads} - omp_get_num_threads()
8152 * a {ancestor_tnum} - omp_get_ancestor_thread_num(omp_get_level()-1)
8153 * A {thread_affinity} - comma separated list of integers or integer ranges
8154 * (values of affinity mask)
8155 *
8156 * Implementation-specific field types can be added
8157 * If a type is unknown, print "undefined"
8158 */
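/* A usage sketch of the format described above through the OpenMP 5.0
   display-affinity API (the format string is only an example):

     #include <omp.h>
     #include <stddef.h>
     int main(void) {
       omp_set_affinity_format("host=%H pid=%P tid=%n aff=%{thread_affinity}");
     #pragma omp parallel
       omp_display_affinity(NULL); // NULL -> use affinity-format-var
       return 0;
     }

   Setting OMP_AFFINITY_FORMAT and OMP_DISPLAY_AFFINITY=TRUE in the environment
   has the same effect without code changes. */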
8159
8160// Structure holding the short name, long name, and corresponding data type
8161// for snprintf. A table of these will represent the entire valid keyword
8162// field types.
8163typedef struct kmp_affinity_format_field_t {
8164 char short_name; // from spec e.g., L -> thread level
8165 const char *long_name; // from spec thread_level -> thread level
8166 char field_format; // data type for snprintf (typically 'd' or 's'
8167 // for integer or string)
8168} kmp_affinity_format_field_t;
8169
8170static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
8171#if KMP_AFFINITY_SUPPORTED
8172 {'A', "thread_affinity", 's'},
8173#endif
8174 {'t', "team_num", 'd'},
8175 {'T', "num_teams", 'd'},
8176 {'L', "nesting_level", 'd'},
8177 {'n', "thread_num", 'd'},
8178 {'N', "num_threads", 'd'},
8179 {'a', "ancestor_tnum", 'd'},
8180 {'H', "host", 's'},
8181 {'P', "process_id", 'd'},
8182 {'i', "native_thread_id", 'd'}};
8183
8184// Return the number of characters it takes to hold the field
8185static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
8186 const char **ptr,
8187 kmp_str_buf_t *field_buffer) {
8188 int rc, format_index, field_value;
8189 const char *width_left, *width_right;
8190 bool pad_zeros, right_justify, parse_long_name, found_valid_name;
8191 static const int FORMAT_SIZE = 20;
8192 char format[FORMAT_SIZE] = {0};
8193 char absolute_short_name = 0;
8194
8195 KMP_DEBUG_ASSERT(gtid >= 0);
8196 KMP_DEBUG_ASSERT(th);
8197 KMP_DEBUG_ASSERT(**ptr == '%');
8198 KMP_DEBUG_ASSERT(field_buffer);
8199
8200 __kmp_str_buf_clear(field_buffer);
8201
8202 // Skip the initial %
8203 (*ptr)++;
8204
8205 // Check for %% first
8206 if (**ptr == '%') {
8207 __kmp_str_buf_cat(field_buffer, "%", 1);
8208 (*ptr)++; // skip over the second %
8209 return 1;
8210 }
8211
8212 // Parse field modifiers if they are present
8213 pad_zeros = false;
8214 if (**ptr == '0') {
8215 pad_zeros = true;
8216 (*ptr)++; // skip over 0
8217 }
8218 right_justify = false;
8219 if (**ptr == '.') {
8220 right_justify = true;
8221 (*ptr)++; // skip over .
8222 }
8223 // Parse width of field: [width_left, width_right)
8224 width_left = width_right = NULL;
8225 if (**ptr >= '0' && **ptr <= '9') {
8226 width_left = *ptr;
8227 SKIP_DIGITS(*ptr);
8228 width_right = *ptr;
8229 }
8230
8231 // Create the format for KMP_SNPRINTF based on flags parsed above
8232 format_index = 0;
8233 format[format_index++] = '%';
8234 if (!right_justify)
8235 format[format_index++] = '-';
8236 if (pad_zeros)
8237 format[format_index++] = '0';
8238 if (width_left && width_right) {
8239 int i = 0;
8240 // Only allow 8 digit number widths.
8241 // This also prevents overflowing the format variable
8242 while (i < 8 && width_left < width_right) {
8243 format[format_index++] = *width_left;
8244 width_left++;
8245 i++;
8246 }
8247 }
8248
8249 // Parse a name (long or short)
8250 // Canonicalize the name into absolute_short_name
8251 found_valid_name = false;
8252 parse_long_name = (**ptr == '{');
8253 if (parse_long_name)
8254 (*ptr)++; // skip initial left brace
8255 for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
8256 sizeof(__kmp_affinity_format_table[0]);
8257 ++i) {
8258 char short_name = __kmp_affinity_format_table[i].short_name;
8259 const char *long_name = __kmp_affinity_format_table[i].long_name;
8260 char field_format = __kmp_affinity_format_table[i].field_format;
8261 if (parse_long_name) {
8262 size_t length = KMP_STRLEN(long_name);
8263 if (strncmp(*ptr, long_name, length) == 0) {
8264 found_valid_name = true;
8265 (*ptr) += length; // skip the long name
8266 }
8267 } else if (**ptr == short_name) {
8268 found_valid_name = true;
8269 (*ptr)++; // skip the short name
8270 }
8271 if (found_valid_name) {
8272 format[format_index++] = field_format;
8273 format[format_index++] = '\0';
8274 absolute_short_name = short_name;
8275 break;
8276 }
8277 }
8278 if (parse_long_name) {
8279 if (**ptr != '}') {
8280 absolute_short_name = 0;
8281 } else {
8282 (*ptr)++; // skip over the right brace
8283 }
8284 }
8285
8286 // Attempt to fill the buffer with the requested
8287 // value using snprintf within __kmp_str_buf_print()
8288 switch (absolute_short_name) {
8289 case 't':
8290 rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
8291 break;
8292 case 'T':
8293 rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
8294 break;
8295 case 'L':
8296 rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
8297 break;
8298 case 'n':
8299 rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
8300 break;
8301 case 'H': {
8302 static const int BUFFER_SIZE = 256;
8303 char buf[BUFFER_SIZE];
8304 __kmp_expand_host_name(buf, BUFFER_SIZE);
8305 rc = __kmp_str_buf_print(field_buffer, format, buf);
8306 } break;
8307 case 'P':
8308 rc = __kmp_str_buf_print(field_buffer, format, getpid());
8309 break;
8310 case 'i':
8311 rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
8312 break;
8313 case 'N':
8314 rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
8315 break;
8316 case 'a':
8317 field_value =
8318 __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
8319 rc = __kmp_str_buf_print(field_buffer, format, field_value);
8320 break;
8321#if KMP_AFFINITY_SUPPORTED
8322 case 'A': {
8323 kmp_str_buf_t buf;
8324 __kmp_str_buf_init(&buf);
8325 __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
8326 rc = __kmp_str_buf_print(field_buffer, format, buf.str);
8327 __kmp_str_buf_free(&buf);
8328 } break;
8329#endif
8330 default:
8331 // According to the spec, if an implementation does not have info for a
8332 // field type, then "undefined" is printed
8333 rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
8334 // Skip the field
8335 if (parse_long_name) {
8336 SKIP_TOKEN(*ptr);
8337 if (**ptr == '}')
8338 (*ptr)++;
8339 } else {
8340 (*ptr)++;
8341 }
8342 }
8343
8344 KMP_ASSERT(format_index <= FORMAT_SIZE);
8345 return rc;
8346}
8347
8348/*
8349 * Return number of characters needed to hold the affinity string
8350 * (not including null byte character)
8351 * The resultant string is printed to buffer, which the caller can then
8352 * handle afterwards
8353 */
8354size_t __kmp_aux_capture_affinity(int gtid, const char *format,
8355 kmp_str_buf_t *buffer) {
8356 const char *parse_ptr;
8357 size_t retval;
8358 const kmp_info_t *th;
8359 kmp_str_buf_t field;
8360
8361 KMP_DEBUG_ASSERT(buffer);
8362 KMP_DEBUG_ASSERT(gtid >= 0);
8363
8364 __kmp_str_buf_init(&field);
8365 __kmp_str_buf_clear(buffer);
8366
8367 th = __kmp_threads[gtid];
8368 retval = 0;
8369
8370 // If format is NULL or zero-length string, then we use
8371 // affinity-format-var ICV
8372 parse_ptr = format;
8373 if (parse_ptr == NULL || *parse_ptr == '\0') {
8374 parse_ptr = __kmp_affinity_format;
8375 }
8376 KMP_DEBUG_ASSERT(parse_ptr);
8377
8378 while (*parse_ptr != '\0') {
8379 // Parse a field
8380 if (*parse_ptr == '%') {
8381 // Put field in the buffer
8382 int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
8383 __kmp_str_buf_catbuf(buffer, &field);
8384 retval += rc;
8385 } else {
8386 // Put literal character in buffer
8387 __kmp_str_buf_cat(buffer, parse_ptr, 1);
8388 retval++;
8389 parse_ptr++;
8390 }
8391 }
8392 __kmp_str_buf_free(&field);
8393 return retval;
8394}
8395
8396// Displays the affinity string to stdout
8397void __kmp_aux_display_affinity(int gtid, const char *format) {
8398 kmp_str_buf_t buf;
8399 __kmp_str_buf_init(&buf);
8400 __kmp_aux_capture_affinity(gtid, format, &buf);
8401 __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
8402 __kmp_str_buf_free(&buf);
8403}
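/* The capture path above is reachable via omp_capture_affinity() (OpenMP 5.0).
   A minimal sketch; per the spec the return value is the number of characters
   needed, excluding the terminating null:

     #include <omp.h>
     #include <stdio.h>
     int main(void) {
     #pragma omp parallel
       {
         char buf[256];
         size_t n = omp_capture_affinity(buf, sizeof(buf), "%n of %N on %H");
         if (n < sizeof(buf))
           printf("%s\n", buf);
       }
       return 0;
     }
*/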
8404
8405/* ------------------------------------------------------------------------ */
8406
8407void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
8408 int blocktime = arg; /* argument is in milliseconds */
8409#if KMP_USE_MONITOR
8410 int bt_intervals;
8411#endif
8412 kmp_int8 bt_set;
8413
8414 __kmp_save_internal_controls(thread);
8415
8416 /* Normalize and set blocktime for the teams */
8417 if (blocktime < KMP_MIN_BLOCKTIME)
8418 blocktime = KMP_MIN_BLOCKTIME;
8419 else if (blocktime > KMP_MAX_BLOCKTIME)
8420 blocktime = KMP_MAX_BLOCKTIME;
8421
8422 set__blocktime_team(thread->th.th_team, tid, blocktime);
8423 set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8424
8425#if KMP_USE_MONITOR
8426 /* Calculate and set blocktime intervals for the teams */
8427 bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8428
8429 set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8430 set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8431#endif
8432
8433 /* Record that blocktime has been explicitly set */
8434 bt_set = TRUE;
8435
8436 set__bt_set_team(thread->th.th_team, tid, bt_set);
8437 set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8438#if KMP_USE_MONITOR
8439 KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8440 "bt_intervals=%d, monitor_updates=%d\n",
8441 __kmp_gtid_from_tid(tid, thread->th.th_team),
8442 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8443 __kmp_monitor_wakeups));
8444#else
8445 KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8446 __kmp_gtid_from_tid(tid, thread->th.th_team),
8447 thread->th.th_team->t.t_id, tid, blocktime));
8448#endif
8449}
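/* User-side sketch (kmp_set_blocktime() is a KMP extension declared in omp.h;
   0 ms is only an example, meaning workers yield immediately instead of
   spinning after a parallel region):

     #include <omp.h>
     int main(void) {
       kmp_set_blocktime(0); // per-thread setting, in milliseconds
     #pragma omp parallel
       { }
       return 0;
     }

   KMP_BLOCKTIME=<ms> in the environment sets the initial default instead. */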
8450
8451void __kmp_aux_set_defaults(char const *str, size_t len) {
8452 if (!__kmp_init_serial) {
8453 __kmp_serial_initialize();
8454 }
8455 __kmp_env_initialize(str);
8456
8457 if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
8458 __kmp_env_print();
8459 }
8460} // __kmp_aux_set_defaults
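/* Hedged sketch of the corresponding extension entry point (kmp_set_defaults()
   is declared in this runtime's omp.h; the exact accepted string syntax is an
   assumption here and the value shown is only an example):

     #include <omp.h>
     int main(void) {
       kmp_set_defaults("KMP_STACKSIZE=4m"); // best before 1st parallel region
     #pragma omp parallel
       { }
       return 0;
     }
*/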
8461
8462/* ------------------------------------------------------------------------ */
8463/* internal fast reduction routines */
8464
8465PACKED_REDUCTION_METHOD_T
8466__kmp_determine_reduction_method(
8467 ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8468 void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8469 kmp_critical_name *lck) {
8470
8471 // Default reduction method: critical construct ( lck != NULL, like in current
8472 // PAROPT )
8473 // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
8474 // can be selected by RTL
8475 // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
8476 // can be selected by RTL
8477 // Finally, it's up to the OpenMP RTL to make a decision on which method to
8478 // select among those generated by PAROPT.
8479
8480 PACKED_REDUCTION_METHOD_T retval;
8481
8482 int team_size;
8483
8484 KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
8485 KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8486
8487#define FAST_REDUCTION_ATOMIC_METHOD_GENERATED \
8488 ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
8489#define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8490
8491 retval = critical_reduce_block;
8492
8493 // another way of getting the team size (with 1 dynamic dereference) is slower
8494 team_size = __kmp_get_team_num_threads(global_tid);
8495 if (team_size == 1) {
8496
8497 retval = empty_reduce_block;
8498
8499 } else {
8500
8501 int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8502
8503#if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || \
8504 KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64
8505
8506#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
8507 KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8508
8509 int teamsize_cutoff = 4;
8510
8511#if KMP_MIC_SUPPORTED
8512 if (__kmp_mic_type != non_mic) {
8513 teamsize_cutoff = 8;
8514 }
8515#endif
8516 int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8517 if (tree_available) {
8518 if (team_size <= teamsize_cutoff) {
8519 if (atomic_available) {
8520 retval = atomic_reduce_block;
8521 }
8522 } else {
8523 retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8524 }
8525 } else if (atomic_available) {
8526 retval = atomic_reduce_block;
8527 }
8528#else
8529#error "Unknown or unsupported OS"
8530#endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8531 // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8532
8533#elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
8534
8535#if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD
8536
8537 // basic tuning
8538
8539 if (atomic_available) {
8540 if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8541 retval = atomic_reduce_block;
8542 }
8543 } // otherwise: use critical section
8544
8545#elif KMP_OS_DARWIN
8546
8547 int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8548 if (atomic_available && (num_vars <= 3)) {
8549 retval = atomic_reduce_block;
8550 } else if (tree_available) {
8551 if ((reduce_size > (9 * sizeof(kmp_real64))) &&
8552 (reduce_size < (2000 * sizeof(kmp_real64)))) {
8553 retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
8554 }
8555 } // otherwise: use critical section
8556
8557#else
8558#error "Unknown or unsupported OS"
8559#endif
8560
8561#else
8562#error "Unknown or unsupported architecture"
8563#endif
8564 }
8565
8566 // KMP_FORCE_REDUCTION
8567
8568 // If the team is serialized (team_size == 1), ignore the forced reduction
8569 // method and stay with the unsynchronized method (empty_reduce_block)
8570 if (__kmp_force_reduction_method != reduction_method_not_defined &&
8571 team_size != 1) {
8572
8573 PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
8574
8575 int atomic_available, tree_available;
8576
8577 switch ((forced_retval = __kmp_force_reduction_method)) {
8578 case critical_reduce_block:
8579 KMP_ASSERT(lck); // lck should be != 0
8580 break;
8581
8582 case atomic_reduce_block:
8583 atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8584 if (!atomic_available) {
8585 KMP_WARNING(RedMethodNotSupported, "atomic");
8586 forced_retval = critical_reduce_block;
8587 }
8588 break;
8589
8590 case tree_reduce_block:
8591 tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8592 if (!tree_available) {
8593 KMP_WARNING(RedMethodNotSupported, "tree");
8594 forced_retval = critical_reduce_block;
8595 } else {
8596#if KMP_FAST_REDUCTION_BARRIER
8597 forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8598#endif
8599 }
8600 break;
8601
8602 default:
8603 KMP_ASSERT(0); // "unsupported method specified"
8604 }
8605
8606 retval = forced_retval;
8607 }
8608
8609 KA_TRACE(10, ("reduction method selected=%08x\n", retval));
8610
8611#undef FAST_REDUCTION_TREE_METHOD_GENERATED
8612#undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
8613
8614 return (retval);
8615}
8616// this function is for testing set/get/determine reduce method
8617kmp_int32 __kmp_get_reduce_method(void) {
8618 return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
8619}
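/* For reference, the kind of construct whose reduction method the routine
   above selects (a minimal sketch; whether the atomic, tree, or critical
   variant is used is decided by the heuristics in
   __kmp_determine_reduction_method):

     #include <stdio.h>
     int main(void) {
       long sum = 0;
     #pragma omp parallel for reduction(+ : sum)
       for (int i = 0; i < 1000; ++i)
         sum += i;
       printf("sum = %ld\n", sum); // 499500
       return 0;
     }
*/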
8620
8621// Soft pause sets up threads to ignore blocktime and just go to sleep.
8622// Spin-wait code checks __kmp_pause_status and reacts accordingly.
8623void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
8624
8625// Hard pause shuts down the runtime completely. Resume happens naturally when
8626// OpenMP is used subsequently.
8627void __kmp_hard_pause() {
8628 __kmp_pause_status = kmp_hard_paused;
8629 __kmp_internal_end_thread(-1);
8630}
8631
8632// Soft resume sets __kmp_pause_status, and wakes up all threads.
8633void __kmp_resume_if_soft_paused() {
8634 if (__kmp_pause_status == kmp_soft_paused) {
8635 __kmp_pause_status = kmp_not_paused;
8636
8637 for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
8638 kmp_info_t *thread = __kmp_threads[gtid];
8639 if (thread) { // Wake it if sleeping
8640 kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
8641 thread);
8642 if (fl.is_sleeping())
8643 fl.resume(gtid);
8644 else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
8645 __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
8646 } else { // thread holds the lock and may sleep soon
8647 do { // until either the thread sleeps, or we can get the lock
8648 if (fl.is_sleeping()) {
8649 fl.resume(gtid);
8650 break;
8651 } else if (__kmp_try_suspend_mx(thread)) {
8652 __kmp_unlock_suspend_mx(thread);
8653 break;
8654 }
8655 } while (1);
8656 }
8657 }
8658 }
8659 }
8660}
8661
8662// This function is called via __kmpc_pause_resource. Returns 0 if successful.
8663// TODO: add warning messages
8664int __kmp_pause_resource(kmp_pause_status_t level) {
8665 if (level == kmp_not_paused) { // requesting resume
8666 if (__kmp_pause_status == kmp_not_paused) {
8667 // error message about runtime not being paused, so can't resume
8668 return 1;
8669 } else {
8670 KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
8671 __kmp_pause_status == kmp_hard_paused);
8672 __kmp_pause_status = kmp_not_paused;
8673 return 0;
8674 }
8675 } else if (level == kmp_soft_paused) { // requesting soft pause
8676 if (__kmp_pause_status != kmp_not_paused) {
8677 // error message about already being paused
8678 return 1;
8679 } else {
8680 __kmp_soft_pause();
8681 return 0;
8682 }
8683 } else if (level == kmp_hard_paused) { // requesting hard pause
8684 if (__kmp_pause_status != kmp_not_paused) {
8685 // error message about already being paused
8686 return 1;
8687 } else {
8688 __kmp_hard_pause();
8689 return 0;
8690 }
8691 } else {
8692 // error message about invalid level
8693 return 1;
8694 }
8695}
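/* These pause levels are exposed through the OpenMP 5.0 pause API. A minimal
   sketch:

     #include <omp.h>
     int main(void) {
     #pragma omp parallel
       { }
       // Release runtime resources between phases of the application.
       omp_pause_resource_all(omp_pause_hard); // or omp_pause_soft
       // The runtime re-initializes itself on the next OpenMP use.
     #pragma omp parallel
       { }
       return 0;
     }
*/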
8696
8697void __kmp_omp_display_env(int verbose) {
8698 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8699 if (__kmp_init_serial == 0)
8700 __kmp_do_serial_initialize();
8701 __kmp_display_env_impl(!verbose, verbose);
8702 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8703}
8704
8705// Globals and functions for hidden helper task
8706kmp_info_t **__kmp_hidden_helper_threads;
8707kmp_info_t *__kmp_hidden_helper_main_thread;
8708std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks;
8709#if KMP_OS_LINUX
8710kmp_int32 __kmp_hidden_helper_threads_num = 8;
8711kmp_int32 __kmp_enable_hidden_helper = TRUE;
8712#else
8713kmp_int32 __kmp_hidden_helper_threads_num = 0;
8714kmp_int32 __kmp_enable_hidden_helper = FALSE;
8715#endif
8716
8717namespace {
8718std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num;
8719
8720void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) {
8721 // This is an explicit synchronization on all hidden helper threads in case
8722 // a regular thread pushes a hidden helper task to a hidden helper thread
8723 // that has not yet been woken since the main thread released the helpers
8724 // after creating the team.
8725 KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num);
8726 while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) !=
8727 __kmp_hidden_helper_threads_num)
8728 ;
8729
8730 // If main thread, then wait for signal
8731 if (__kmpc_master(nullptr, *gtid)) {
8732 // First, unset the initial state and release the initial thread
8733 TCW_4(__kmp_init_hidden_helper_threads, FALSE);
8734 __kmp_hidden_helper_initz_release();
8735 __kmp_hidden_helper_main_thread_wait();
8736 // Now wake up all worker threads
8737 for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) {
8738 __kmp_hidden_helper_worker_thread_signal();
8739 }
8740 }
8741}
8742} // namespace
8743
8744void __kmp_hidden_helper_threads_initz_routine() {
8745 // Create a new root for hidden helper team/threads
8746 const int gtid = __kmp_register_root(TRUE);
8747 __kmp_hidden_helper_main_thread = __kmp_threads[gtid];
8748 __kmp_hidden_helper_threads = &__kmp_threads[gtid];
8749 __kmp_hidden_helper_main_thread->th.th_set_nproc =
8750 __kmp_hidden_helper_threads_num;
8751
8752 KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0);
8753
8754 __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn);
8755
8756 // Set the initialization flag to FALSE
8757 TCW_SYNC_4(__kmp_init_hidden_helper, FALSE);
8758
8759 __kmp_hidden_helper_threads_deinitz_release();
8760}
8761
8762/* Nesting Mode:
8763 Set via KMP_NESTING_MODE, which takes an integer.
8764 Note: we skip duplicate topology levels, and skip levels with only
8765 one entity.
8766 KMP_NESTING_MODE=0 is the default, and doesn't use nesting mode.
8767 KMP_NESTING_MODE=1 sets as many nesting levels as there are distinct levels
8768 in the topology, and initializes the number of threads at each of those
8769 levels to the number of entities at each level, respectively, below the
8770 entity at the parent level.
8771 KMP_NESTING_MODE=N, where N>1, attempts to create up to N nesting levels,
8772 but starts with nesting OFF -- max-active-levels-var is 1 -- and requires
8773 the user to turn nesting on explicitly. This is an even more experimental
8774 option to this experimental feature, and may change or go away in the
8775 future.
8776*/
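/* A hedged usage sketch of this experimental control (the values are examples
   only; behavior may change as noted above):

     // shell: KMP_NESTING_MODE=2 ./a.out
     #include <omp.h>
     int main(void) {
       // For KMP_NESTING_MODE > 1 nesting starts OFF, so enable it explicitly:
       omp_set_max_active_levels(2);
     #pragma omp parallel        // outer parallel region
       {
     #pragma omp parallel        // nested parallel region
         { }
       }
       return 0;
     }
*/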
8777
8778// Allocate space to store nesting levels
8779void __kmp_init_nesting_mode() {
8780 int levels = KMP_HW_LAST;
8781 __kmp_nesting_mode_nlevels = levels;
8782 __kmp_nesting_nth_level = (int *)KMP_INTERNAL_MALLOC(levels * sizeof(int));
8783 for (int i = 0; i < levels; ++i)
8784 __kmp_nesting_nth_level[i] = 0;
8785 if (__kmp_nested_nth.size < levels) {
8786 __kmp_nested_nth.nth =
8787 (int *)KMP_INTERNAL_REALLOC(__kmp_nested_nth.nth, levels * sizeof(int));
8788 __kmp_nested_nth.size = levels;
8789 }
8790}
8791
8792// Set # threads for top nesting levels; must be called after the topology is set
8793void __kmp_set_nesting_mode_threads() {
8794 kmp_info_t *thread = __kmp_threads[__kmp_entry_gtid()];
8795
8796 if (__kmp_nesting_mode == 1)
8797 __kmp_nesting_mode_nlevels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
8798 else if (__kmp_nesting_mode > 1)
8799 __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
8800
8801 if (__kmp_topology) { // use topology info
8802 int loc, hw_level;
8803 for (loc = 0, hw_level = 0; hw_level < __kmp_topology->get_depth() &&
8804 loc < __kmp_nesting_mode_nlevels;
8805 loc++, hw_level++) {
8806 __kmp_nesting_nth_level[loc] = __kmp_topology->get_ratio(hw_level);
8807 if (__kmp_nesting_nth_level[loc] == 1)
8808 loc--;
8809 }
8810 // Make sure all cores are used
8811 if (__kmp_nesting_mode > 1 && loc > 1) {
8812 int core_level = __kmp_topology->get_level(KMP_HW_CORE);
8813 int num_cores = __kmp_topology->get_count(core_level);
8814 int upper_levels = 1;
8815 for (int level = 0; level < loc - 1; ++level)
8816 upper_levels *= __kmp_nesting_nth_level[level];
8817 if (upper_levels * __kmp_nesting_nth_level[loc - 1] < num_cores)
8818 __kmp_nesting_nth_level[loc - 1] =
8819 num_cores / __kmp_nesting_nth_level[loc - 2];
8820 }
8821 __kmp_nesting_mode_nlevels = loc;
8822 __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
8823 } else { // no topology info available; provide a reasonable guesstimation
8824 if (__kmp_avail_proc >= 4) {
8825 __kmp_nesting_nth_level[0] = __kmp_avail_proc / 2;
8826 __kmp_nesting_nth_level[1] = 2;
8827 __kmp_nesting_mode_nlevels = 2;
8828 } else {
8829 __kmp_nesting_nth_level[0] = __kmp_avail_proc;
8830 __kmp_nesting_mode_nlevels = 1;
8831 }
8832 __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
8833 }
8834 for (int i = 0; i < __kmp_nesting_mode_nlevels; ++i) {
8835 __kmp_nested_nth.nth[i] = __kmp_nesting_nth_level[i];
8836 }
8837 set__nproc(thread, __kmp_nesting_nth_level[0]);
8838 if (__kmp_nesting_mode > 1 && __kmp_nesting_mode_nlevels > __kmp_nesting_mode)
8839 __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
8840 if (get__max_active_levels(thread) > 1) {
8841 // if max levels was set, set nesting mode levels to same
8842 __kmp_nesting_mode_nlevels = get__max_active_levels(thread);
8843 }
8844 if (__kmp_nesting_mode == 1) // turn on nesting for this case only
8845 set__max_active_levels(thread, __kmp_nesting_mode_nlevels);
8846}