LLVM OpenMP* Runtime Library
kmp_runtime.cpp
1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // The LLVM Compiler Infrastructure
8 //
9 // This file is dual licensed under the MIT and the University of Illinois Open
10 // Source Licenses. See LICENSE.txt for details.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "kmp.h"
15 #include "kmp_affinity.h"
16 #include "kmp_atomic.h"
17 #include "kmp_environment.h"
18 #include "kmp_error.h"
19 #include "kmp_i18n.h"
20 #include "kmp_io.h"
21 #include "kmp_itt.h"
22 #include "kmp_settings.h"
23 #include "kmp_stats.h"
24 #include "kmp_str.h"
25 #include "kmp_wait_release.h"
26 #include "kmp_wrapper_getpid.h"
27 
28 #if OMPT_SUPPORT
29 #include "ompt-specific.h"
30 #endif
31 
32 /* these are temporary issues to be dealt with */
33 #define KMP_USE_PRCTL 0
34 
35 #if KMP_OS_WINDOWS
36 #include <process.h>
37 #endif
38 
39 #include "tsan_annotations.h"
40 
41 #if defined(KMP_GOMP_COMPAT)
42 char const __kmp_version_alt_comp[] =
43  KMP_VERSION_PREFIX "alternative compiler support: yes";
44 #endif /* defined(KMP_GOMP_COMPAT) */
45 
46 char const __kmp_version_omp_api[] = KMP_VERSION_PREFIX "API version: "
47 #if OMP_50_ENABLED
48  "5.0 (201611)";
49 #elif OMP_45_ENABLED
50  "4.5 (201511)";
51 #elif OMP_40_ENABLED
52  "4.0 (201307)";
53 #else
54  "3.1 (201107)";
55 #endif
56 
57 #ifdef KMP_DEBUG
58 char const __kmp_version_lock[] =
59  KMP_VERSION_PREFIX "lock type: run time selectable";
60 #endif /* KMP_DEBUG */
61 
62 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
63 
64 /* ------------------------------------------------------------------------ */
65 
66 kmp_info_t __kmp_monitor;
67 
68 /* Forward declarations */
69 
70 void __kmp_cleanup(void);
71 
72 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
73  int gtid);
74 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
75  kmp_internal_control_t *new_icvs,
76  ident_t *loc);
77 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
78 static void __kmp_partition_places(kmp_team_t *team,
79  int update_master_only = 0);
80 #endif
81 static void __kmp_do_serial_initialize(void);
82 void __kmp_fork_barrier(int gtid, int tid);
83 void __kmp_join_barrier(int gtid);
84 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
85  kmp_internal_control_t *new_icvs, ident_t *loc);
86 
87 #ifdef USE_LOAD_BALANCE
88 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
89 #endif
90 
91 static int __kmp_expand_threads(int nNeed);
92 #if KMP_OS_WINDOWS
93 static int __kmp_unregister_root_other_thread(int gtid);
94 #endif
95 static void __kmp_unregister_library(void); // called by __kmp_internal_end()
96 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
97 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
98 
99 /* Calculate the identifier of the current thread */
100 /* A fast (and somewhat portable) way to get a unique identifier for the
101  executing thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
102 int __kmp_get_global_thread_id() {
103  int i;
104  kmp_info_t **other_threads;
105  size_t stack_data;
106  char *stack_addr;
107  size_t stack_size;
108  char *stack_base;
109 
110  KA_TRACE(
111  1000,
112  ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
113  __kmp_nth, __kmp_all_nth));
114 
115  /* JPH - to handle the case where __kmpc_end(0) is called immediately prior
116  to a parallel region, this returns KMP_GTID_DNE to force serial_initialize
117  by the caller. We had to handle KMP_GTID_DNE at all call sites, or else
118  guarantee __kmp_init_gtid for this to work. */
119 
120  if (!TCR_4(__kmp_init_gtid))
121  return KMP_GTID_DNE;
122 
123 #ifdef KMP_TDATA_GTID
124  if (TCR_4(__kmp_gtid_mode) >= 3) {
125  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
126  return __kmp_gtid;
127  }
128 #endif
129  if (TCR_4(__kmp_gtid_mode) >= 2) {
130  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
131  return __kmp_gtid_get_specific();
132  }
133  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
134 
135  stack_addr = (char *)&stack_data;
136  other_threads = __kmp_threads;
137 
138  /* ATT: The code below is a source of potential bugs due to unsynchronized
139  access to __kmp_threads array. For example:
140  1. Current thread loads other_threads[i] to thr and checks it, it is
141  non-NULL.
142  2. Current thread is suspended by OS.
143  3. Another thread unregisters and finishes (debug versions of free()
144  may fill memory with something like 0xEF).
145  4. Current thread is resumed.
146  5. Current thread reads junk from *thr.
147  TODO: Fix it. --ln */
148 
149  for (i = 0; i < __kmp_threads_capacity; i++) {
150 
151  kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
152  if (!thr)
153  continue;
154 
155  stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
156  stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
157 
158  /* stack grows down -- search through all of the active threads */
159 
160  if (stack_addr <= stack_base) {
161  size_t stack_diff = stack_base - stack_addr;
162 
163  if (stack_diff <= stack_size) {
164  /* The only way we can be closer than the allocated */
165  /* stack size is if we are running on this thread. */
166  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
167  return i;
168  }
169  }
170  }
171 
172  /* get specific to try and determine our gtid */
173  KA_TRACE(1000,
174  ("*** __kmp_get_global_thread_id: internal alg. failed to find "
175  "thread, using TLS\n"));
176  i = __kmp_gtid_get_specific();
177 
178  /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */
179 
180  /* if we haven't been assigned a gtid, then return the code as-is */
181  if (i < 0)
182  return i;
183 
184  /* dynamically updated stack window for uber threads to avoid get_specific
185  call */
186  if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
187  KMP_FATAL(StackOverflow, i);
188  }
189 
190  stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
191  if (stack_addr > stack_base) {
192  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
193  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
194  other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
195  stack_base);
196  } else {
197  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
198  stack_base - stack_addr);
199  }
200 
201  /* Reprint stack bounds for ubermaster since they have been refined */
202  if (__kmp_storage_map) {
203  char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
204  char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
205  __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
206  other_threads[i]->th.th_info.ds.ds_stacksize,
207  "th_%d stack (refinement)", i);
208  }
209  return i;
210 }
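// The loop above identifies the caller by checking whether the address of a
// local variable falls inside some registered thread's stack window
// ([stack_base - stack_size, stack_base]; stacks grow down). A minimal sketch
// of that containment test follows; it is illustrative only and the
// __example_* helper is not part of the runtime.
#if 0
static int __example_addr_in_stack(char *stack_addr, char *stack_base,
                                   size_t stack_size) {
  // Stacks grow down: a live frame sits at or below the recorded base and no
  // farther away than the recorded size.
  return stack_addr <= stack_base &&
         (size_t)(stack_base - stack_addr) <= stack_size;
}
#endif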
211 
212 int __kmp_get_global_thread_id_reg() {
213  int gtid;
214 
215  if (!__kmp_init_serial) {
216  gtid = KMP_GTID_DNE;
217  } else
218 #ifdef KMP_TDATA_GTID
219  if (TCR_4(__kmp_gtid_mode) >= 3) {
220  KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
221  gtid = __kmp_gtid;
222  } else
223 #endif
224  if (TCR_4(__kmp_gtid_mode) >= 2) {
225  KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
226  gtid = __kmp_gtid_get_specific();
227  } else {
228  KA_TRACE(1000,
229  ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
230  gtid = __kmp_get_global_thread_id();
231  }
232 
233  /* we must be a new uber master sibling thread */
234  if (gtid == KMP_GTID_DNE) {
235  KA_TRACE(10,
236  ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
237  "Registering a new gtid.\n"));
238  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
239  if (!__kmp_init_serial) {
240  __kmp_do_serial_initialize();
241  gtid = __kmp_gtid_get_specific();
242  } else {
243  gtid = __kmp_register_root(FALSE);
244  }
245  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
246  /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
247  }
248 
249  KMP_DEBUG_ASSERT(gtid >= 0);
250 
251  return gtid;
252 }
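// Sketch only (not from the original source): how a caller that may be an
// as-yet unregistered root thread could obtain its thread descriptor through
// the registering lookup above. The __example_* helper is hypothetical;
// __kmp_get_global_thread_id_reg() and __kmp_threads are the real entities.
#if 0
static kmp_info_t *__example_current_thread(void) {
  int gtid = __kmp_get_global_thread_id_reg(); // registers a new root if needed
  return __kmp_threads[gtid]; // safe: gtid >= 0 is asserted by the routine
}
#endif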
253 
254 /* caller must hold forkjoin_lock */
255 void __kmp_check_stack_overlap(kmp_info_t *th) {
256  int f;
257  char *stack_beg = NULL;
258  char *stack_end = NULL;
259  int gtid;
260 
261  KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
262  if (__kmp_storage_map) {
263  stack_end = (char *)th->th.th_info.ds.ds_stackbase;
264  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
265 
266  gtid = __kmp_gtid_from_thread(th);
267 
268  if (gtid == KMP_GTID_MONITOR) {
269  __kmp_print_storage_map_gtid(
270  gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
271  "th_%s stack (%s)", "mon",
272  (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
273  } else {
274  __kmp_print_storage_map_gtid(
275  gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
276  "th_%d stack (%s)", gtid,
277  (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
278  }
279  }
280 
281  /* No point in checking ubermaster threads since they use refinement and
282  * cannot overlap */
283  gtid = __kmp_gtid_from_thread(th);
284  if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
285  KA_TRACE(10,
286  ("__kmp_check_stack_overlap: performing extensive checking\n"));
287  if (stack_beg == NULL) {
288  stack_end = (char *)th->th.th_info.ds.ds_stackbase;
289  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
290  }
291 
292  for (f = 0; f < __kmp_threads_capacity; f++) {
293  kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
294 
295  if (f_th && f_th != th) {
296  char *other_stack_end =
297  (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
298  char *other_stack_beg =
299  other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
300  if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
301  (stack_end > other_stack_beg && stack_end < other_stack_end)) {
302 
303  /* Print the other stack values before the abort */
304  if (__kmp_storage_map)
305  __kmp_print_storage_map_gtid(
306  -1, other_stack_beg, other_stack_end,
307  (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
308  "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
309 
310  __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
311  __kmp_msg_null);
312  }
313  }
314  }
315  }
316  KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
317 }
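// The check above flags two stacks as overlapping when either endpoint of the
// current stack lies strictly inside another thread's stack window. A compact
// restatement of that predicate as an illustrative (hypothetical) helper:
#if 0
static int __example_stacks_overlap(char *beg, char *end, char *other_beg,
                                    char *other_end) {
  return (beg > other_beg && beg < other_end) ||
         (end > other_beg && end < other_end);
}
#endif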
318 
319 /* ------------------------------------------------------------------------ */
320 
321 void __kmp_infinite_loop(void) {
322  static int done = FALSE;
323 
324  while (!done) {
325  KMP_YIELD(1);
326  }
327 }
328 
329 #define MAX_MESSAGE 512
330 
331 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
332  char const *format, ...) {
333  char buffer[MAX_MESSAGE];
334  va_list ap;
335 
336  va_start(ap, format);
337  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
338  p2, (unsigned long)size, format);
339  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
340  __kmp_vprintf(kmp_err, buffer, ap);
341 #if KMP_PRINT_DATA_PLACEMENT
342  int node;
343  if (gtid >= 0) {
344  if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
345  if (__kmp_storage_map_verbose) {
346  node = __kmp_get_host_node(p1);
347  if (node < 0) /* doesn't work, so don't try this next time */
348  __kmp_storage_map_verbose = FALSE;
349  else {
350  char *last;
351  int lastNode;
352  int localProc = __kmp_get_cpu_from_gtid(gtid);
353 
354  const int page_size = KMP_GET_PAGE_SIZE();
355 
356  p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
357  p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
358  if (localProc >= 0)
359  __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid,
360  localProc >> 1);
361  else
362  __kmp_printf_no_lock(" GTID %d\n", gtid);
363 #if KMP_USE_PRCTL
364  /* The more elaborate format is disabled for now because of the prctl
365  * hanging bug. */
366  do {
367  last = p1;
368  lastNode = node;
369  /* This loop collates adjacent pages with the same host node. */
370  do {
371  (char *)p1 += page_size;
372  } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
373  __kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1,
374  lastNode);
375  } while (p1 <= p2);
376 #else
377  __kmp_printf_no_lock(" %p-%p memNode %d\n", p1,
378  (char *)p1 + (page_size - 1),
379  __kmp_get_host_node(p1));
380  if (p1 < p2) {
381  __kmp_printf_no_lock(" %p-%p memNode %d\n", p2,
382  (char *)p2 + (page_size - 1),
383  __kmp_get_host_node(p2));
384  }
385 #endif
386  }
387  }
388  } else
389  __kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning));
390  }
391 #endif /* KMP_PRINT_DATA_PLACEMENT */
392  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
393 }
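// Usage sketch, mirroring the calls made elsewhere in this file: the caller
// supplies an address range, its size, and a printf-style format that is
// expanded against the remaining arguments (team-wide maps pass a gtid of -1).
// The __example_* helper below is illustrative, not part of the runtime.
#if 0
static void __example_report_buffer(int gtid, void *buf, size_t bytes) {
  __kmp_print_storage_map_gtid(gtid, buf, (char *)buf + bytes, bytes,
                               "th_%d example buffer", gtid);
}
#endif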
394 
395 void __kmp_warn(char const *format, ...) {
396  char buffer[MAX_MESSAGE];
397  va_list ap;
398 
399  if (__kmp_generate_warnings == kmp_warnings_off) {
400  return;
401  }
402 
403  va_start(ap, format);
404 
405  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
406  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
407  __kmp_vprintf(kmp_err, buffer, ap);
408  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
409 
410  va_end(ap);
411 }
412 
413 void __kmp_abort_process() {
414  // Later threads may stall here, but that's ok because abort() will kill them.
415  __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
416 
417  if (__kmp_debug_buf) {
418  __kmp_dump_debug_buffer();
419  }
420 
421  if (KMP_OS_WINDOWS) {
422  // Let other threads know of abnormal termination and prevent deadlock
423  // if abort happened during library initialization or shutdown
424  __kmp_global.g.g_abort = SIGABRT;
425 
426  /* On Windows* OS, abort() by default raises a pop-up error box, which stalls
427  nightly testing. Unfortunately, we cannot reliably suppress pop-up error
428  boxes. _set_abort_behavior() works well, but this function is not
429  available in VS7 (this is not a problem for the DLL, but it is a problem
430  for the static OpenMP RTL). SetErrorMode (and so the timelimit utility)
431  does not help, at least in some versions of the MS C RTL.
432 
433  It seems the following sequence is the only way to simulate abort() and
434  avoid the pop-up error box. */
435  raise(SIGABRT);
436  _exit(3); // Just in case, if signal ignored, exit anyway.
437  } else {
438  abort();
439  }
440 
441  __kmp_infinite_loop();
442  __kmp_release_bootstrap_lock(&__kmp_exit_lock);
443 
444 } // __kmp_abort_process
445 
446 void __kmp_abort_thread(void) {
447  // TODO: Eliminate g_abort global variable and this function.
448  // In case of abort just call abort(), it will kill all the threads.
449  __kmp_infinite_loop();
450 } // __kmp_abort_thread
451 
452 /* Print out the storage map for the major kmp_info_t thread data structures
453  that are allocated together. */
454 
455 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
456  __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
457  gtid);
458 
459  __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
460  sizeof(kmp_desc_t), "th_%d.th_info", gtid);
461 
462  __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
463  sizeof(kmp_local_t), "th_%d.th_local", gtid);
464 
465  __kmp_print_storage_map_gtid(
466  gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
467  sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
468 
469  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
470  &thr->th.th_bar[bs_plain_barrier + 1],
471  sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
472  gtid);
473 
474  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
475  &thr->th.th_bar[bs_forkjoin_barrier + 1],
476  sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
477  gtid);
478 
479 #if KMP_FAST_REDUCTION_BARRIER
480  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
481  &thr->th.th_bar[bs_reduction_barrier + 1],
482  sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
483  gtid);
484 #endif // KMP_FAST_REDUCTION_BARRIER
485 }
486 
487 /* Print out the storage map for the major kmp_team_t team data structures
488  that are allocated together. */
489 
490 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
491  int team_id, int num_thr) {
492  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
493  __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
494  header, team_id);
495 
496  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
497  &team->t.t_bar[bs_last_barrier],
498  sizeof(kmp_balign_team_t) * bs_last_barrier,
499  "%s_%d.t_bar", header, team_id);
500 
501  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
502  &team->t.t_bar[bs_plain_barrier + 1],
503  sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
504  header, team_id);
505 
506  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
507  &team->t.t_bar[bs_forkjoin_barrier + 1],
508  sizeof(kmp_balign_team_t),
509  "%s_%d.t_bar[forkjoin]", header, team_id);
510 
511 #if KMP_FAST_REDUCTION_BARRIER
512  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
513  &team->t.t_bar[bs_reduction_barrier + 1],
514  sizeof(kmp_balign_team_t),
515  "%s_%d.t_bar[reduction]", header, team_id);
516 #endif // KMP_FAST_REDUCTION_BARRIER
517 
518  __kmp_print_storage_map_gtid(
519  -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
520  sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
521 
522  __kmp_print_storage_map_gtid(
523  -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
524  sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
525 
526  __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
527  &team->t.t_disp_buffer[num_disp_buff],
528  sizeof(dispatch_shared_info_t) * num_disp_buff,
529  "%s_%d.t_disp_buffer", header, team_id);
530 
531  __kmp_print_storage_map_gtid(-1, &team->t.t_taskq, &team->t.t_copypriv_data,
532  sizeof(kmp_taskq_t), "%s_%d.t_taskq", header,
533  team_id);
534 }
535 
536 static void __kmp_init_allocator() {}
537 static void __kmp_fini_allocator() {}
538 
539 /* ------------------------------------------------------------------------ */
540 
541 #ifdef KMP_DYNAMIC_LIB
542 #if KMP_OS_WINDOWS
543 
544 static void __kmp_reset_lock(kmp_bootstrap_lock_t *lck) {
545  // TODO: Change to __kmp_break_bootstrap_lock().
546  __kmp_init_bootstrap_lock(lck); // make the lock released
547 }
548 
549 static void __kmp_reset_locks_on_process_detach(int gtid_req) {
550  int i;
551  int thread_count;
552 
553  // PROCESS_DETACH is expected to be called by a thread that executes
554  // ProcessExit() or FreeLibrary(). The OS terminates the other threads
555  // (except the one calling ProcessExit or FreeLibrary), so it might seem
556  // safe to access __kmp_threads[] without taking the forkjoin_lock. In
557  // fact, however, some threads can still be alive here, although they are
558  // about to be terminated. The threads in the array with ds_thread==0 are
559  // the most suspicious, so it may not be safe to access __kmp_threads[].
560 
561  // TODO: does it make sense to check __kmp_roots[] ?
562 
563  // Let's check that there are no other alive threads registered with the OMP
564  // lib.
565  while (1) {
566  thread_count = 0;
567  for (i = 0; i < __kmp_threads_capacity; ++i) {
568  if (!__kmp_threads)
569  continue;
570  kmp_info_t *th = __kmp_threads[i];
571  if (th == NULL)
572  continue;
573  int gtid = th->th.th_info.ds.ds_gtid;
574  if (gtid == gtid_req)
575  continue;
576  if (gtid < 0)
577  continue;
578  DWORD exit_val;
579  int alive = __kmp_is_thread_alive(th, &exit_val);
580  if (alive) {
581  ++thread_count;
582  }
583  }
584  if (thread_count == 0)
585  break; // success
586  }
587 
588  // Assume that I'm alone. Now it might be safe to check and reset locks.
589  // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset.
590  __kmp_reset_lock(&__kmp_forkjoin_lock);
591 #ifdef KMP_DEBUG
592  __kmp_reset_lock(&__kmp_stdio_lock);
593 #endif // KMP_DEBUG
594 }
595 
596 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
597  //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
598 
599  switch (fdwReason) {
600 
601  case DLL_PROCESS_ATTACH:
602  KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
603 
604  return TRUE;
605 
606  case DLL_PROCESS_DETACH:
607  KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
608 
609  if (lpReserved != NULL) {
610  // lpReserved is used for telling the difference:
611  // lpReserved == NULL when FreeLibrary() was called,
612  // lpReserved != NULL when the process terminates.
613  // When FreeLibrary() is called, worker threads remain alive. So they will
614  // release the forkjoin lock by themselves. When the process terminates,
615  // worker threads disappear triggering the problem of unreleased forkjoin
616  // lock as described below.
617 
618  // A worker thread can take the forkjoin lock. The problem comes up if
619  // that worker thread becomes dead before it releases the forkjoin lock.
620  // The forkjoin lock remains taken, while the thread executing
621  // DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below will try
622  // to take the forkjoin lock and will always fail, so that the application
623  // will never finish [normally]. This scenario is possible if
624  // __kmpc_end() has not been executed. This is not a corner case; it
625  // commonly happens when:
626  // - the main function was compiled by an alternative compiler;
627  // - the main function was compiled by icl but without /Qopenmp
628  // (application with plugins);
629  // - the application terminates by calling C exit(), Fortran CALL EXIT() or
630  // Fortran STOP;
631  // - an alive foreign thread prevented __kmpc_end() from doing cleanup.
632  //
633  // This is a hack to work around the problem.
634  // TODO: !!! figure out something better.
635  __kmp_reset_locks_on_process_detach(__kmp_gtid_get_specific());
636  }
637 
638  __kmp_internal_end_library(__kmp_gtid_get_specific());
639 
640  return TRUE;
641 
642  case DLL_THREAD_ATTACH:
643  KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
644 
645  /* if we want to register new siblings all the time here call
646  * __kmp_get_gtid(); */
647  return TRUE;
648 
649  case DLL_THREAD_DETACH:
650  KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
651 
652  __kmp_internal_end_thread(__kmp_gtid_get_specific());
653  return TRUE;
654  }
655 
656  return TRUE;
657 }
658 
659 #endif /* KMP_OS_WINDOWS */
660 #endif /* KMP_DYNAMIC_LIB */
661 
662 /* Change the library type to "status" and return the old type */
663 /* called from within initialization routines where __kmp_initz_lock is held */
664 int __kmp_change_library(int status) {
665  int old_status;
666 
667  old_status = __kmp_yield_init &
668  1; // check whether KMP_LIBRARY=throughput (even init count)
669 
670  if (status) {
671  __kmp_yield_init |= 1; // throughput => turnaround (odd init count)
672  } else {
673  __kmp_yield_init &= ~1; // turnaround => throughput (even init count)
674  }
675 
676  return old_status; // return previous setting of whether
677  // KMP_LIBRARY=throughput
678 }
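// Sketch of how the parity encoding above can be used: the low bit of
// __kmp_yield_init selects the mode (even = throughput, odd = turnaround), and
// the returned bit can be fed back in to restore the previous setting. The
// __example_* helper is illustrative, not part of the runtime.
#if 0
static void __example_library_mode_scope(void) {
  int old_mode = __kmp_change_library(1); // request turnaround (set low bit)
  // ... region that benefits from busy-waiting ...
  __kmp_change_library(old_mode); // restore the previous mode bit
}
#endif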
679 
680 /* __kmp_parallel_deo -- Wait until it's our turn. */
681 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
682  int gtid = *gtid_ref;
683 #ifdef BUILD_PARALLEL_ORDERED
684  kmp_team_t *team = __kmp_team_from_gtid(gtid);
685 #endif /* BUILD_PARALLEL_ORDERED */
686 
687  if (__kmp_env_consistency_check) {
688  if (__kmp_threads[gtid]->th.th_root->r.r_active)
689 #if KMP_USE_DYNAMIC_LOCK
690  __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
691 #else
692  __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
693 #endif
694  }
695 #ifdef BUILD_PARALLEL_ORDERED
696  if (!team->t.t_serialized) {
697  KMP_MB();
698  KMP_WAIT_YIELD(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid),
699  KMP_EQ, NULL);
700  KMP_MB();
701  }
702 #endif /* BUILD_PARALLEL_ORDERED */
703 }
704 
705 /* __kmp_parallel_dxo -- Signal the next task. */
706 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
707  int gtid = *gtid_ref;
708 #ifdef BUILD_PARALLEL_ORDERED
709  int tid = __kmp_tid_from_gtid(gtid);
710  kmp_team_t *team = __kmp_team_from_gtid(gtid);
711 #endif /* BUILD_PARALLEL_ORDERED */
712 
713  if (__kmp_env_consistency_check) {
714  if (__kmp_threads[gtid]->th.th_root->r.r_active)
715  __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
716  }
717 #ifdef BUILD_PARALLEL_ORDERED
718  if (!team->t.t_serialized) {
719  KMP_MB(); /* Flush all pending memory write invalidates. */
720 
721  /* use the tid of the next thread in this team */
722  /* TODO replace with general release procedure */
723  team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
724 
725  KMP_MB(); /* Flush all pending memory write invalidates. */
726  }
727 #endif /* BUILD_PARALLEL_ORDERED */
728 }
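// Sketch (not from this file) of how the two entry points above bracket an
// ordered region: __kmp_parallel_deo spins until t_ordered.dt.t_value reaches
// this thread's tid, and __kmp_parallel_dxo hands the token to (tid+1) mod
// nproc. The __example_* wrapper is illustrative only.
#if 0
static void __example_ordered_body(int *gtid_ref, int *cid_ref,
                                   ident_t *loc_ref) {
  __kmp_parallel_deo(gtid_ref, cid_ref, loc_ref); // wait for our turn
  // ... user code that must run in iteration order ...
  __kmp_parallel_dxo(gtid_ref, cid_ref, loc_ref); // pass the turn along
}
#endif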
729 
730 /* ------------------------------------------------------------------------ */
731 /* The BARRIER for a SINGLE process section is always explicit */
732 
733 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
734  int status;
735  kmp_info_t *th;
736  kmp_team_t *team;
737 
738  if (!TCR_4(__kmp_init_parallel))
739  __kmp_parallel_initialize();
740 
741  th = __kmp_threads[gtid];
742  team = th->th.th_team;
743  status = 0;
744 
745  th->th.th_ident = id_ref;
746 
747  if (team->t.t_serialized) {
748  status = 1;
749  } else {
750  kmp_int32 old_this = th->th.th_local.this_construct;
751 
752  ++th->th.th_local.this_construct;
753  /* try to set team count to thread count--success means thread got the
754  single block */
755  /* TODO: Should this be acquire or release? */
756  if (team->t.t_construct == old_this) {
757  status = KMP_COMPARE_AND_STORE_ACQ32(&team->t.t_construct, old_this,
758  th->th.th_local.this_construct);
759  }
760 #if USE_ITT_BUILD
761  if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
762  KMP_MASTER_GTID(gtid) &&
763 #if OMP_40_ENABLED
764  th->th.th_teams_microtask == NULL &&
765 #endif
766  team->t.t_active_level ==
767  1) { // Only report metadata by master of active team at level 1
768  __kmp_itt_metadata_single(id_ref);
769  }
770 #endif /* USE_ITT_BUILD */
771  }
772 
773  if (__kmp_env_consistency_check) {
774  if (status && push_ws) {
775  __kmp_push_workshare(gtid, ct_psingle, id_ref);
776  } else {
777  __kmp_check_workshare(gtid, ct_psingle, id_ref);
778  }
779  }
780 #if USE_ITT_BUILD
781  if (status) {
782  __kmp_itt_single_start(gtid);
783  }
784 #endif /* USE_ITT_BUILD */
785  return status;
786 }
787 
788 void __kmp_exit_single(int gtid) {
789 #if USE_ITT_BUILD
790  __kmp_itt_single_end(gtid);
791 #endif /* USE_ITT_BUILD */
792  if (__kmp_env_consistency_check)
793  __kmp_pop_workshare(gtid, ct_psingle, NULL);
794 }
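// Sketch of how a SINGLE construct maps onto the two routines above: at most
// one thread of the team wins the compare-and-swap on t_construct inside
// __kmp_enter_single and executes the body. Illustrative only; any required
// barrier is emitted separately by the compiler.
#if 0
static void __example_single(int gtid, ident_t *loc) {
  if (__kmp_enter_single(gtid, loc, TRUE)) {
    // ... body executed by the single winning thread ...
    __kmp_exit_single(gtid);
  }
}
#endif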
795 
796 /* determine if we can go parallel or must use a serialized parallel region and
797  * how many threads we can use
798  * set_nthreads is the number of threads requested for the team
799  * returns 1 if we should serialize or use only one thread,
800  * otherwise the number of threads to use
801  * The forkjoin lock is held by the caller. */
802 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
803  int master_tid, int set_nthreads
804 #if OMP_40_ENABLED
805  ,
806  int enter_teams
807 #endif /* OMP_40_ENABLED */
808  ) {
809  int capacity;
810  int new_nthreads;
811  KMP_DEBUG_ASSERT(__kmp_init_serial);
812  KMP_DEBUG_ASSERT(root && parent_team);
813 
814  // If dyn-var is set, dynamically adjust the number of desired threads,
815  // according to the method specified by dynamic_mode.
816  new_nthreads = set_nthreads;
817  if (!get__dynamic_2(parent_team, master_tid)) {
818  ;
819  }
820 #ifdef USE_LOAD_BALANCE
821  else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
822  new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
823  if (new_nthreads == 1) {
824  KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
825  "reservation to 1 thread\n",
826  master_tid));
827  return 1;
828  }
829  if (new_nthreads < set_nthreads) {
830  KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
831  "reservation to %d threads\n",
832  master_tid, new_nthreads));
833  }
834  }
835 #endif /* USE_LOAD_BALANCE */
836  else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
837  new_nthreads = __kmp_avail_proc - __kmp_nth +
838  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
839  if (new_nthreads <= 1) {
840  KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
841  "reservation to 1 thread\n",
842  master_tid));
843  return 1;
844  }
845  if (new_nthreads < set_nthreads) {
846  KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
847  "reservation to %d threads\n",
848  master_tid, new_nthreads));
849  } else {
850  new_nthreads = set_nthreads;
851  }
852  } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
853  if (set_nthreads > 2) {
854  new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
855  new_nthreads = (new_nthreads % set_nthreads) + 1;
856  if (new_nthreads == 1) {
857  KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
858  "reservation to 1 thread\n",
859  master_tid));
860  return 1;
861  }
862  if (new_nthreads < set_nthreads) {
863  KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
864  "reservation to %d threads\n",
865  master_tid, new_nthreads));
866  }
867  }
868  } else {
869  KMP_ASSERT(0);
870  }
871 
872  // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
873  if (__kmp_nth + new_nthreads -
874  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
875  __kmp_max_nth) {
876  int tl_nthreads = __kmp_max_nth - __kmp_nth +
877  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
878  if (tl_nthreads <= 0) {
879  tl_nthreads = 1;
880  }
881 
882  // If dyn-var is false, emit a 1-time warning.
883  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
884  __kmp_reserve_warn = 1;
885  __kmp_msg(kmp_ms_warning,
886  KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
887  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
888  }
889  if (tl_nthreads == 1) {
890  KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
891  "reduced reservation to 1 thread\n",
892  master_tid));
893  return 1;
894  }
895  KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
896  "reservation to %d threads\n",
897  master_tid, tl_nthreads));
898  new_nthreads = tl_nthreads;
899  }
900 
901  // Respect OMP_THREAD_LIMIT
902  if (root->r.r_cg_nthreads + new_nthreads -
903  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
904  __kmp_cg_max_nth) {
905  int tl_nthreads = __kmp_cg_max_nth - root->r.r_cg_nthreads +
906  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
907  if (tl_nthreads <= 0) {
908  tl_nthreads = 1;
909  }
910 
911  // If dyn-var is false, emit a 1-time warning.
912  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
913  __kmp_reserve_warn = 1;
914  __kmp_msg(kmp_ms_warning,
915  KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
916  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
917  }
918  if (tl_nthreads == 1) {
919  KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
920  "reduced reservation to 1 thread\n",
921  master_tid));
922  return 1;
923  }
924  KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
925  "reservation to %d threads\n",
926  master_tid, tl_nthreads));
927  new_nthreads = tl_nthreads;
928  }
929 
930  // Check if the threads array is large enough, or needs expanding.
931  // See comment in __kmp_register_root() about the adjustment if
932  // __kmp_threads[0] == NULL.
933  capacity = __kmp_threads_capacity;
934  if (TCR_PTR(__kmp_threads[0]) == NULL) {
935  --capacity;
936  }
937  if (__kmp_nth + new_nthreads -
938  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
939  capacity) {
940  // Expand the threads array.
941  int slotsRequired = __kmp_nth + new_nthreads -
942  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
943  capacity;
944  int slotsAdded = __kmp_expand_threads(slotsRequired);
945  if (slotsAdded < slotsRequired) {
946  // The threads array was not expanded enough.
947  new_nthreads -= (slotsRequired - slotsAdded);
948  KMP_ASSERT(new_nthreads >= 1);
949 
950  // If dyn-var is false, emit a 1-time warning.
951  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
952  __kmp_reserve_warn = 1;
953  if (__kmp_tp_cached) {
954  __kmp_msg(kmp_ms_warning,
955  KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
956  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
957  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
958  } else {
959  __kmp_msg(kmp_ms_warning,
960  KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
961  KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
962  }
963  }
964  }
965  }
966 
967 #ifdef KMP_DEBUG
968  if (new_nthreads == 1) {
969  KC_TRACE(10,
970  ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
971  "dead roots and rechecking; requested %d threads\n",
972  __kmp_get_gtid(), set_nthreads));
973  } else {
974  KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
975  " %d threads\n",
976  __kmp_get_gtid(), new_nthreads, set_nthreads));
977  }
978 #endif // KMP_DEBUG
979  return new_nthreads;
980 }
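// The clipping rule applied twice above (device and OMP thread limits) can be
// summarized as: the reservation may not push the running thread total past
// the limit, where threads of this root that are already counted (the master,
// or the whole hot team when the root is inactive) are credited back. A
// hypothetical helper expressing the same arithmetic:
#if 0
static int __example_clip_to_limit(int running, int hot_team_nproc,
                                   int root_active, int requested, int limit) {
  int reused = root_active ? 1 : hot_team_nproc; // already counted in running
  int allowed = limit - running + reused;
  if (allowed <= 0)
    allowed = 1; // always keep at least the master thread
  return requested > allowed ? allowed : requested;
}
#endif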
981 
982 /* Allocate threads from the thread pool and assign them to the new team. We are
983  assured that there are enough threads available, because we checked on that
984  earlier while holding the forkjoin lock. */
985 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
986  kmp_info_t *master_th, int master_gtid) {
987  int i;
988  int use_hot_team;
989 
990  KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
991  KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
992  KMP_MB();
993 
994  /* first, let's setup the master thread */
995  master_th->th.th_info.ds.ds_tid = 0;
996  master_th->th.th_team = team;
997  master_th->th.th_team_nproc = team->t.t_nproc;
998  master_th->th.th_team_master = master_th;
999  master_th->th.th_team_serialized = FALSE;
1000  master_th->th.th_dispatch = &team->t.t_dispatch[0];
1001 
1002 /* make sure we are not the optimized hot team */
1003 #if KMP_NESTED_HOT_TEAMS
1004  use_hot_team = 0;
1005  kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
1006  if (hot_teams) { // hot teams array is not allocated if
1007  // KMP_HOT_TEAMS_MAX_LEVEL=0
1008  int level = team->t.t_active_level - 1; // index in array of hot teams
1009  if (master_th->th.th_teams_microtask) { // are we inside the teams?
1010  if (master_th->th.th_teams_size.nteams > 1) {
1011  ++level; // level was not increased in teams construct for
1012  // team_of_masters
1013  }
1014  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1015  master_th->th.th_teams_level == team->t.t_level) {
1016  ++level; // level was not increased in teams construct for
1017  // team_of_workers before the parallel
1018  } // team->t.t_level will be increased inside parallel
1019  }
1020  if (level < __kmp_hot_teams_max_level) {
1021  if (hot_teams[level].hot_team) {
1022  // hot team has already been allocated for given level
1023  KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
1024  use_hot_team = 1; // the team is ready to use
1025  } else {
1026  use_hot_team = 0; // AC: threads are not allocated yet
1027  hot_teams[level].hot_team = team; // remember new hot team
1028  hot_teams[level].hot_team_nth = team->t.t_nproc;
1029  }
1030  } else {
1031  use_hot_team = 0;
1032  }
1033  }
1034 #else
1035  use_hot_team = team == root->r.r_hot_team;
1036 #endif
1037  if (!use_hot_team) {
1038 
1039  /* install the master thread */
1040  team->t.t_threads[0] = master_th;
1041  __kmp_initialize_info(master_th, team, 0, master_gtid);
1042 
1043  /* now, install the worker threads */
1044  for (i = 1; i < team->t.t_nproc; i++) {
1045 
1046  /* fork or reallocate a new thread and install it in team */
1047  kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
1048  team->t.t_threads[i] = thr;
1049  KMP_DEBUG_ASSERT(thr);
1050  KMP_DEBUG_ASSERT(thr->th.th_team == team);
1051  /* align team and thread arrived states */
1052  KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
1053  "T#%d(%d:%d) join =%llu, plain=%llu\n",
1054  __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
1055  __kmp_gtid_from_tid(i, team), team->t.t_id, i,
1056  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
1057  team->t.t_bar[bs_plain_barrier].b_arrived));
1058 #if OMP_40_ENABLED
1059  thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1060  thr->th.th_teams_level = master_th->th.th_teams_level;
1061  thr->th.th_teams_size = master_th->th.th_teams_size;
1062 #endif
1063  { // Initialize threads' barrier data.
1064  int b;
1065  kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
1066  for (b = 0; b < bs_last_barrier; ++b) {
1067  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
1068  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1069 #if USE_DEBUGGER
1070  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1071 #endif
1072  }
1073  }
1074  }
1075 
1076 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1077  __kmp_partition_places(team);
1078 #endif
1079  }
1080 
1081  KMP_MB();
1082 }
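// For each freshly installed worker, the loop above copies the team's
// per-barrier arrived counters into the worker's barrier state so that its
// first fork/join barrier counts consistently with the team. A condensed,
// illustrative restatement (the __example_* helper is not part of the
// runtime):
#if 0
static void __example_align_barriers(kmp_team_t *team, int tid) {
  kmp_balign_t *balign = team->t.t_threads[tid]->th.th_bar;
  for (int b = 0; b < bs_last_barrier; ++b)
    balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
}
#endif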
1083 
1084 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1085 // Propagate any changes to the floating point control registers out to the team
1086 // We try to avoid unnecessary writes to the relevant cache line in the team
1087 // structure, so we don't make changes unless they are needed.
1088 inline static void propagateFPControl(kmp_team_t *team) {
1089  if (__kmp_inherit_fp_control) {
1090  kmp_int16 x87_fpu_control_word;
1091  kmp_uint32 mxcsr;
1092 
1093  // Get master values of FPU control flags (both X87 and vector)
1094  __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1095  __kmp_store_mxcsr(&mxcsr);
1096  mxcsr &= KMP_X86_MXCSR_MASK;
1097 
1098  // There is no point looking at t_fp_control_saved here.
1099  // If it is TRUE, we still have to update the values if they are different
1100  // from those we now have. If it is FALSE we didn't save anything yet, but
1101  // our objective is the same. We have to ensure that the values in the team
1102  // are the same as those we have.
1103  // So, this code achieves what we need whether or not t_fp_control_saved is
1104  // true. By checking whether the value needs updating we avoid unnecessary
1105  // writes that would put the cache-line into a written state, causing all
1106  // threads in the team to have to read it again.
1107  KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1108  KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1109  // Although we don't use this value, other code in the runtime wants to know
1110  // whether it should restore them. So we must ensure it is correct.
1111  KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1112  } else {
1113  // Similarly here. Don't write to this cache-line in the team structure
1114  // unless we have to.
1115  KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1116  }
1117 }
1118 
1119 // Do the opposite, setting the hardware registers to the updated values from
1120 // the team.
1121 inline static void updateHWFPControl(kmp_team_t *team) {
1122  if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1123  // Only reset the fp control regs if they have been changed in the team
1124  // during the parallel region that we are exiting.
1125  kmp_int16 x87_fpu_control_word;
1126  kmp_uint32 mxcsr;
1127  __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1128  __kmp_store_mxcsr(&mxcsr);
1129  mxcsr &= KMP_X86_MXCSR_MASK;
1130 
1131  if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1132  __kmp_clear_x87_fpu_status_word();
1133  __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1134  }
1135 
1136  if (team->t.t_mxcsr != mxcsr) {
1137  __kmp_load_mxcsr(&team->t.t_mxcsr);
1138  }
1139  }
1140 }
1141 #else
1142 #define propagateFPControl(x) ((void)0)
1143 #define updateHWFPControl(x) ((void)0)
1144 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
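// The comparison-before-store idea used by propagateFPControl (via
// KMP_CHECK_UPDATE) is worth spelling out: writing an unchanged value would
// still dirty the shared team cache line that every worker reads. An
// illustrative sketch of the pattern, not the actual macro definition:
#if 0
template <typename T>
static inline void __example_check_update(T &dst, T src) {
  if (dst != src) // read-only fast path keeps the cache line shared
    dst = src; // write only when the value actually changed
}
#endif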
1145 
1146 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1147  int realloc); // forward declaration
1148 
1149 /* Run a parallel region that has been serialized, so it runs only in a team
1150  of the single master thread. */
1151 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1152  kmp_info_t *this_thr;
1153  kmp_team_t *serial_team;
1154 
1155  KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1156 
1157  /* Skip all this code for autopar serialized loops since it results in
1158  unacceptable overhead */
1159  if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1160  return;
1161 
1162  if (!TCR_4(__kmp_init_parallel))
1163  __kmp_parallel_initialize();
1164 
1165  this_thr = __kmp_threads[global_tid];
1166  serial_team = this_thr->th.th_serial_team;
1167 
1168  /* utilize the serialized team held by this thread */
1169  KMP_DEBUG_ASSERT(serial_team);
1170  KMP_MB();
1171 
1172  if (__kmp_tasking_mode != tskm_immediate_exec) {
1173  KMP_DEBUG_ASSERT(
1174  this_thr->th.th_task_team ==
1175  this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1176  KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1177  NULL);
1178  KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1179  "team %p, new task_team = NULL\n",
1180  global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1181  this_thr->th.th_task_team = NULL;
1182  }
1183 
1184 #if OMP_40_ENABLED
1185  kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1186  if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1187  proc_bind = proc_bind_false;
1188  } else if (proc_bind == proc_bind_default) {
1189  // No proc_bind clause was specified, so use the current value
1190  // of proc-bind-var for this parallel region.
1191  proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1192  }
1193  // Reset for next parallel region
1194  this_thr->th.th_set_proc_bind = proc_bind_default;
1195 #endif /* OMP_40_ENABLED */
1196 
1197 #if OMPT_SUPPORT
1198  ompt_data_t ompt_parallel_data;
1199  ompt_parallel_data.ptr = NULL;
1200  ompt_data_t *implicit_task_data;
1201  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1202  if (ompt_enabled.enabled &&
1203  this_thr->th.ompt_thread_info.state != omp_state_overhead) {
1204 
1205  ompt_task_info_t *parent_task_info;
1206  parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1207 
1208  parent_task_info->frame.enter_frame = OMPT_GET_FRAME_ADDRESS(1);
1209  if (ompt_enabled.ompt_callback_parallel_begin) {
1210  int team_size = 1;
1211 
1212  ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1213  &(parent_task_info->task_data), &(parent_task_info->frame),
1214  &ompt_parallel_data, team_size, ompt_invoker_program, codeptr);
1215  }
1216  }
1217 #endif // OMPT_SUPPORT
1218 
1219  if (this_thr->th.th_team != serial_team) {
1220  // Nested level will be an index in the nested nthreads array
1221  int level = this_thr->th.th_team->t.t_level;
1222 
1223  if (serial_team->t.t_serialized) {
1224  /* this serial team was already used
1225  TODO: increase performance by making these locks more specific */
1226  kmp_team_t *new_team;
1227 
1228  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1229 
1230  new_team = __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1231 #if OMPT_SUPPORT
1232  ompt_parallel_data,
1233 #endif
1234 #if OMP_40_ENABLED
1235  proc_bind,
1236 #endif
1237  &this_thr->th.th_current_task->td_icvs,
1238  0 USE_NESTED_HOT_ARG(NULL));
1239  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1240  KMP_ASSERT(new_team);
1241 
1242  /* setup new serialized team and install it */
1243  new_team->t.t_threads[0] = this_thr;
1244  new_team->t.t_parent = this_thr->th.th_team;
1245  serial_team = new_team;
1246  this_thr->th.th_serial_team = serial_team;
1247 
1248  KF_TRACE(
1249  10,
1250  ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1251  global_tid, serial_team));
1252 
1253  /* TODO the above breaks the requirement that if we run out of resources,
1254  then we can still guarantee that serialized teams are ok, since we may
1255  need to allocate a new one */
1256  } else {
1257  KF_TRACE(
1258  10,
1259  ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1260  global_tid, serial_team));
1261  }
1262 
1263  /* we have to initialize this serial team */
1264  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1265  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1266  KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1267  serial_team->t.t_ident = loc;
1268  serial_team->t.t_serialized = 1;
1269  serial_team->t.t_nproc = 1;
1270  serial_team->t.t_parent = this_thr->th.th_team;
1271  serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1272  this_thr->th.th_team = serial_team;
1273  serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1274 
1275  KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1276  this_thr->th.th_current_task));
1277  KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1278  this_thr->th.th_current_task->td_flags.executing = 0;
1279 
1280  __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1281 
1282  /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1283  implicit task for each serialized task represented by
1284  team->t.t_serialized? */
1285  copy_icvs(&this_thr->th.th_current_task->td_icvs,
1286  &this_thr->th.th_current_task->td_parent->td_icvs);
1287 
1288  // Thread value exists in the nested nthreads array for the next nested
1289  // level
1290  if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1291  this_thr->th.th_current_task->td_icvs.nproc =
1292  __kmp_nested_nth.nth[level + 1];
1293  }
1294 
1295 #if OMP_40_ENABLED
1296  if (__kmp_nested_proc_bind.used &&
1297  (level + 1 < __kmp_nested_proc_bind.used)) {
1298  this_thr->th.th_current_task->td_icvs.proc_bind =
1299  __kmp_nested_proc_bind.bind_types[level + 1];
1300  }
1301 #endif /* OMP_40_ENABLED */
1302 
1303 #if USE_DEBUGGER
1304  serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1305 #endif
1306  this_thr->th.th_info.ds.ds_tid = 0;
1307 
1308  /* set thread cache values */
1309  this_thr->th.th_team_nproc = 1;
1310  this_thr->th.th_team_master = this_thr;
1311  this_thr->th.th_team_serialized = 1;
1312 
1313  serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1314  serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1315 
1316  propagateFPControl(serial_team);
1317 
1318  /* check if we need to allocate dispatch buffers stack */
1319  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1320  if (!serial_team->t.t_dispatch->th_disp_buffer) {
1321  serial_team->t.t_dispatch->th_disp_buffer =
1322  (dispatch_private_info_t *)__kmp_allocate(
1323  sizeof(dispatch_private_info_t));
1324  }
1325  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1326 
1327  KMP_MB();
1328 
1329  } else {
1330  /* this serialized team is already being used,
1331  * that's fine, just add another nested level */
1332  KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1333  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1334  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1335  ++serial_team->t.t_serialized;
1336  this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1337 
1338  // Nested level will be an index in the nested nthreads array
1339  int level = this_thr->th.th_team->t.t_level;
1340  // Thread value exists in the nested nthreads array for the next nested
1341  // level
1342  if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1343  this_thr->th.th_current_task->td_icvs.nproc =
1344  __kmp_nested_nth.nth[level + 1];
1345  }
1346  serial_team->t.t_level++;
1347  KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1348  "of serial team %p to %d\n",
1349  global_tid, serial_team, serial_team->t.t_level));
1350 
1351  /* allocate/push dispatch buffers stack */
1352  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1353  {
1354  dispatch_private_info_t *disp_buffer =
1355  (dispatch_private_info_t *)__kmp_allocate(
1356  sizeof(dispatch_private_info_t));
1357  disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1358  serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1359  }
1360  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1361 
1362  KMP_MB();
1363  }
1364 #if OMP_40_ENABLED
1365  KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1366 #endif
1367 
1368  if (__kmp_env_consistency_check)
1369  __kmp_push_parallel(global_tid, NULL);
1370 #if OMPT_SUPPORT
1371  serial_team->t.ompt_team_info.master_return_address = codeptr;
1372  if (ompt_enabled.enabled &&
1373  this_thr->th.ompt_thread_info.state != omp_state_overhead) {
1374  OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = OMPT_GET_FRAME_ADDRESS(1);
1375 
1376  ompt_lw_taskteam_t lw_taskteam;
1377  __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1378  &ompt_parallel_data, codeptr);
1379 
1380  __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1381  // don't use lw_taskteam after linking. content was swapped
1382 
1383  /* OMPT implicit task begin */
1384  implicit_task_data = OMPT_CUR_TASK_DATA(this_thr);
1385  if (ompt_enabled.ompt_callback_implicit_task) {
1386  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1387  ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1388  OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid));
1389  OMPT_CUR_TASK_INFO(this_thr)
1390  ->thread_num = __kmp_tid_from_gtid(global_tid);
1391  }
1392 
1393  /* OMPT state */
1394  this_thr->th.ompt_thread_info.state = omp_state_work_parallel;
1395  OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = OMPT_GET_FRAME_ADDRESS(1);
1396  }
1397 #endif
1398 }
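// Bookkeeping sketch for the routine above (illustrative only): each further
// serialized parallel on the same thread reuses th_serial_team, bumping
// t_serialized and pushing one dispatch buffer, and the matching
// __kmpc_end_serialized_parallel calls unwind that nesting in reverse.
#if 0
static void __example_nested_serialized(ident_t *loc, kmp_int32 gtid) {
  __kmp_serialized_parallel(loc, gtid); // outermost: t_serialized becomes 1
  __kmp_serialized_parallel(loc, gtid); // nested: t_serialized becomes 2
  // ... region bodies would run here; the corresponding "end" calls pop the
  // levels in reverse order ...
}
#endif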
1399 
1400 /* most of the work for a fork */
1401 /* return true if we really went parallel, false if serialized */
1402 int __kmp_fork_call(ident_t *loc, int gtid,
1403  enum fork_context_e call_context, // Intel, GNU, ...
1404  kmp_int32 argc, microtask_t microtask, launch_t invoker,
1405 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1406 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1407  va_list *ap
1408 #else
1409  va_list ap
1410 #endif
1411  ) {
1412  void **argv;
1413  int i;
1414  int master_tid;
1415  int master_this_cons;
1416  kmp_team_t *team;
1417  kmp_team_t *parent_team;
1418  kmp_info_t *master_th;
1419  kmp_root_t *root;
1420  int nthreads;
1421  int master_active;
1422  int master_set_numthreads;
1423  int level;
1424 #if OMP_40_ENABLED
1425  int active_level;
1426  int teams_level;
1427 #endif
1428 #if KMP_NESTED_HOT_TEAMS
1429  kmp_hot_team_ptr_t **p_hot_teams;
1430 #endif
1431  { // KMP_TIME_BLOCK
1432  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1433  KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1434 
1435  KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1436  if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1437  /* Some systems prefer the stack for the root thread(s) to start with */
1438  /* some gap from the parent stack to prevent false sharing. */
1439  void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1440  /* These 2 lines below are so this does not get optimized out */
1441  if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1442  __kmp_stkpadding += (short)((kmp_int64)dummy);
1443  }
1444 
1445  /* initialize if needed */
1446  KMP_DEBUG_ASSERT(
1447  __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1448  if (!TCR_4(__kmp_init_parallel))
1449  __kmp_parallel_initialize();
1450 
1451  /* setup current data */
1452  master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
1453  // shutdown
1454  parent_team = master_th->th.th_team;
1455  master_tid = master_th->th.th_info.ds.ds_tid;
1456  master_this_cons = master_th->th.th_local.this_construct;
1457  root = master_th->th.th_root;
1458  master_active = root->r.r_active;
1459  master_set_numthreads = master_th->th.th_set_nproc;
1460 
1461 #if OMPT_SUPPORT
1462  ompt_data_t ompt_parallel_data;
1463  ompt_parallel_data.ptr = NULL;
1464  ompt_data_t *parent_task_data;
1465  omp_frame_t *ompt_frame;
1466  ompt_data_t *implicit_task_data;
1467  void *return_address = NULL;
1468 
1469  if (ompt_enabled.enabled) {
1470  __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1471  NULL, NULL);
1472  return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1473  }
1474 #endif
1475 
1476  // Nested level will be an index in the nested nthreads array
1477  level = parent_team->t.t_level;
1478  // used to launch non-serial teams even if nested is not allowed
1479  active_level = parent_team->t.t_active_level;
1480 #if OMP_40_ENABLED
1481  // needed to check nesting inside the teams
1482  teams_level = master_th->th.th_teams_level;
1483 #endif
1484 #if KMP_NESTED_HOT_TEAMS
1485  p_hot_teams = &master_th->th.th_hot_teams;
1486  if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1487  *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1488  sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1489  (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1490  // it is either actual or not needed (when active_level > 0)
1491  (*p_hot_teams)[0].hot_team_nth = 1;
1492  }
1493 #endif
1494 
1495 #if OMPT_SUPPORT
1496  if (ompt_enabled.enabled) {
1497  if (ompt_enabled.ompt_callback_parallel_begin) {
1498  int team_size = master_set_numthreads
1499  ? master_set_numthreads
1500  : get__nproc_2(parent_team, master_tid);
1501  ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1502  parent_task_data, ompt_frame, &ompt_parallel_data, team_size,
1503  OMPT_INVOKER(call_context), return_address);
1504  }
1505  master_th->th.ompt_thread_info.state = omp_state_overhead;
1506  }
1507 #endif
1508 
1509  master_th->th.th_ident = loc;
1510 
1511 #if OMP_40_ENABLED
1512  if (master_th->th.th_teams_microtask && ap &&
1513  microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
1514  // AC: This is start of parallel that is nested inside teams construct.
1515  // The team is actual (hot), all workers are ready at the fork barrier.
1516  // No lock needed to initialize the team a bit, then free workers.
1517  parent_team->t.t_ident = loc;
1518  __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1519  parent_team->t.t_argc = argc;
1520  argv = (void **)parent_team->t.t_argv;
1521  for (i = argc - 1; i >= 0; --i)
1522 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1523 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1524  *argv++ = va_arg(*ap, void *);
1525 #else
1526  *argv++ = va_arg(ap, void *);
1527 #endif
1528  // Increment our nested depth levels, but do not increase the serialization
1529  if (parent_team == master_th->th.th_serial_team) {
1530  // AC: we are in serialized parallel
1531  __kmpc_serialized_parallel(loc, gtid);
1532  KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1533  // AC: need this in order for enquiry functions to work
1534  // correctly; will restore at join time
1535  parent_team->t.t_serialized--;
1536 #if OMPT_SUPPORT
1537  void *dummy;
1538  void **exit_runtime_p;
1539 
1540  ompt_lw_taskteam_t lw_taskteam;
1541 
1542  if (ompt_enabled.enabled) {
1543  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1544  &ompt_parallel_data, return_address);
1545  exit_runtime_p = &(lw_taskteam.ompt_task_info.frame.exit_frame);
1546 
1547  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1548  // don't use lw_taskteam after linking. content was swapped
1549 
1550  /* OMPT implicit task begin */
1551  implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1552  if (ompt_enabled.ompt_callback_implicit_task) {
1553  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1554  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1555  implicit_task_data, 1, __kmp_tid_from_gtid(gtid));
1556  OMPT_CUR_TASK_INFO(master_th)
1557  ->thread_num = __kmp_tid_from_gtid(gtid);
1558  }
1559 
1560  /* OMPT state */
1561  master_th->th.ompt_thread_info.state = omp_state_work_parallel;
1562  } else {
1563  exit_runtime_p = &dummy;
1564  }
1565 #endif
1566 
1567  {
1568  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1569  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1570  __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1571 #if OMPT_SUPPORT
1572  ,
1573  exit_runtime_p
1574 #endif
1575  );
1576  }
1577 
1578 #if OMPT_SUPPORT
1579  *exit_runtime_p = NULL;
1580  if (ompt_enabled.enabled) {
1581  OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = NULL;
1582  if (ompt_enabled.ompt_callback_implicit_task) {
1583  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1584  ompt_scope_end, NULL, implicit_task_data, 1,
1585  OMPT_CUR_TASK_INFO(master_th)->thread_num);
1586  }
1587  __ompt_lw_taskteam_unlink(master_th);
1588 
1589  if (ompt_enabled.ompt_callback_parallel_end) {
1590  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1591  OMPT_CUR_TEAM_DATA(master_th), OMPT_CUR_TASK_DATA(master_th),
1592  OMPT_INVOKER(call_context), return_address);
1593  }
1594  master_th->th.ompt_thread_info.state = omp_state_overhead;
1595  }
1596 #endif
1597  return TRUE;
1598  }
1599 
1600  parent_team->t.t_pkfn = microtask;
1601  parent_team->t.t_invoke = invoker;
1602  KMP_TEST_THEN_INC32((kmp_int32 *)&root->r.r_in_parallel);
1603  parent_team->t.t_active_level++;
1604  parent_team->t.t_level++;
1605 
1606  /* Change number of threads in the team if requested */
1607  if (master_set_numthreads) { // The parallel has num_threads clause
1608  if (master_set_numthreads < master_th->th.th_teams_size.nth) {
 1609  // AC: can only reduce the number of threads dynamically, cannot increase it
1610  kmp_info_t **other_threads = parent_team->t.t_threads;
1611  parent_team->t.t_nproc = master_set_numthreads;
1612  for (i = 0; i < master_set_numthreads; ++i) {
1613  other_threads[i]->th.th_team_nproc = master_set_numthreads;
1614  }
1615  // Keep extra threads hot in the team for possible next parallels
1616  }
1617  master_th->th.th_set_nproc = 0;
1618  }
1619 
1620 #if USE_DEBUGGER
1621  if (__kmp_debugging) { // Let debugger override number of threads.
1622  int nth = __kmp_omp_num_threads(loc);
1623  if (nth > 0) { // 0 means debugger doesn't want to change num threads
1624  master_set_numthreads = nth;
1625  }
1626  }
1627 #endif
1628 
1629  KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
1630  "master_th=%p, gtid=%d\n",
1631  root, parent_team, master_th, gtid));
1632  __kmp_internal_fork(loc, gtid, parent_team);
1633  KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
1634  "master_th=%p, gtid=%d\n",
1635  root, parent_team, master_th, gtid));
1636 
1637  /* Invoke microtask for MASTER thread */
1638  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
1639  parent_team->t.t_id, parent_team->t.t_pkfn));
1640 
1641  {
1642  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1643  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1644  if (!parent_team->t.t_invoke(gtid)) {
1645  KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
1646  }
1647  }
1648  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
1649  parent_team->t.t_id, parent_team->t.t_pkfn));
1650  KMP_MB(); /* Flush all pending memory write invalidates. */
1651 
1652  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
1653 
1654  return TRUE;
1655  } // Parallel closely nested in teams construct
1656 #endif /* OMP_40_ENABLED */
1657 
1658 #if KMP_DEBUG
1659  if (__kmp_tasking_mode != tskm_immediate_exec) {
1660  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1661  parent_team->t.t_task_team[master_th->th.th_task_state]);
1662  }
1663 #endif
1664 
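       // If the parent team's active level has already reached the max-active-levels
       // ICV, the new region must run serialized on a team of one thread.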
1665  if (parent_team->t.t_active_level >=
1666  master_th->th.th_current_task->td_icvs.max_active_levels) {
1667  nthreads = 1;
1668  } else {
1669 #if OMP_40_ENABLED
1670  int enter_teams = ((ap == NULL && active_level == 0) ||
1671  (ap && teams_level > 0 && teams_level == level));
1672 #endif
1673  nthreads =
1674  master_set_numthreads
1675  ? master_set_numthreads
1676  : get__nproc_2(
1677  parent_team,
1678  master_tid); // TODO: get nproc directly from current task
1679 
 1680  // Check whether we need to take the forkjoin lock (no need for a serialized
 1681  // parallel outside of a teams construct). This code was moved here from
 1682  // __kmp_reserve_threads() to speed up nested serialized parallels.
1683  if (nthreads > 1) {
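       // Serialize when nesting is disabled and we are already inside an active
       // parallel region (unless this fork enters a teams construct), or when the
       // library mode is serial.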
1684  if ((!get__nested(master_th) && (root->r.r_in_parallel
1685 #if OMP_40_ENABLED
1686  && !enter_teams
1687 #endif /* OMP_40_ENABLED */
1688  )) ||
1689  (__kmp_library == library_serial)) {
1690  KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
1691  " threads\n",
1692  gtid, nthreads));
1693  nthreads = 1;
1694  }
1695  }
1696  if (nthreads > 1) {
1697  /* determine how many new threads we can use */
1698  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1699  nthreads = __kmp_reserve_threads(
1700  root, parent_team, master_tid, nthreads
1701 #if OMP_40_ENABLED
1702  /* AC: If we execute teams from parallel region (on host), then
1703  teams should be created but each can only have 1 thread if
1704  nesting is disabled. If teams called from serial region, then
1705  teams and their threads should be created regardless of the
1706  nesting setting. */
1707  ,
1708  enter_teams
1709 #endif /* OMP_40_ENABLED */
1710  );
1711  if (nthreads == 1) {
 1712  // Free the lock for single-threaded execution here; for multi-threaded
 1713  // execution it will be freed later, after the team of threads has been
 1714  // created and initialized
1715  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1716  }
1717  }
1718  }
1719  KMP_DEBUG_ASSERT(nthreads > 0);
1720 
1721  // If we temporarily changed the set number of threads then restore it now
1722  master_th->th.th_set_nproc = 0;
1723 
1724  /* create a serialized parallel region? */
1725  if (nthreads == 1) {
1726 /* josh todo: hypothetical question: what do we do for OS X*? */
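       // Where Linux-style VLAs are available, keep the argument copy in a
       // variable-length array; otherwise fall back to KMP_ALLOCA.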
1727 #if KMP_OS_LINUX && \
1728  (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1729  void *args[argc];
1730 #else
1731  void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1732 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1733  KMP_ARCH_AARCH64) */
1734 
1735  KA_TRACE(20,
1736  ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
1737 
1738  __kmpc_serialized_parallel(loc, gtid);
1739 
1740  if (call_context == fork_context_intel) {
1741  /* TODO this sucks, use the compiler itself to pass args! :) */
1742  master_th->th.th_serial_team->t.t_ident = loc;
1743 #if OMP_40_ENABLED
1744  if (!ap) {
1745  // revert change made in __kmpc_serialized_parallel()
1746  master_th->th.th_serial_team->t.t_level--;
1747 // Get args from parent team for teams construct
1748 
1749 #if OMPT_SUPPORT
1750  void *dummy;
1751  void **exit_runtime_p;
1752  ompt_task_info_t *task_info;
1753 
1754  ompt_lw_taskteam_t lw_taskteam;
1755 
1756  if (ompt_enabled.enabled) {
1757  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1758  &ompt_parallel_data, return_address);
1759 
1760  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
 1761  // don't use lw_taskteam after linking; its content was swapped
1762 
1763  task_info = OMPT_CUR_TASK_INFO(master_th);
1764  exit_runtime_p = &(task_info->frame.exit_frame);
1765  if (ompt_enabled.ompt_callback_implicit_task) {
1766  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1767  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1768  &(task_info->task_data), 1, __kmp_tid_from_gtid(gtid));
1769  OMPT_CUR_TASK_INFO(master_th)
1770  ->thread_num = __kmp_tid_from_gtid(gtid);
1771  }
1772 
1773  /* OMPT state */
1774  master_th->th.ompt_thread_info.state = omp_state_work_parallel;
1775  } else {
1776  exit_runtime_p = &dummy;
1777  }
1778 #endif
1779 
1780  {
1781  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1782  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1783  __kmp_invoke_microtask(microtask, gtid, 0, argc,
1784  parent_team->t.t_argv
1785 #if OMPT_SUPPORT
1786  ,
1787  exit_runtime_p
1788 #endif
1789  );
1790  }
1791 
1792 #if OMPT_SUPPORT
1793  if (ompt_enabled.enabled) {
1794  exit_runtime_p = NULL;
1795  if (ompt_enabled.ompt_callback_implicit_task) {
1796  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1797  ompt_scope_end, NULL, &(task_info->task_data), 1,
1798  OMPT_CUR_TASK_INFO(master_th)->thread_num);
1799  }
1800 
1801  __ompt_lw_taskteam_unlink(master_th);
1802  if (ompt_enabled.ompt_callback_parallel_end) {
1803  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1804  OMPT_CUR_TEAM_DATA(master_th), parent_task_data,
1805  OMPT_INVOKER(call_context), return_address);
1806  }
1807  master_th->th.ompt_thread_info.state = omp_state_overhead;
1808  }
1809 #endif
1810  } else if (microtask == (microtask_t)__kmp_teams_master) {
1811  KMP_DEBUG_ASSERT(master_th->th.th_team ==
1812  master_th->th.th_serial_team);
1813  team = master_th->th.th_team;
1814  // team->t.t_pkfn = microtask;
1815  team->t.t_invoke = invoker;
1816  __kmp_alloc_argv_entries(argc, team, TRUE);
1817  team->t.t_argc = argc;
1818  argv = (void **)team->t.t_argv;
1819  if (ap) {
1820  for (i = argc - 1; i >= 0; --i)
1821 // TODO: revert workaround for Intel(R) 64 tracker #96
1822 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1823  *argv++ = va_arg(*ap, void *);
1824 #else
1825  *argv++ = va_arg(ap, void *);
1826 #endif
1827  } else {
1828  for (i = 0; i < argc; ++i)
1829  // Get args from parent team for teams construct
1830  argv[i] = parent_team->t.t_argv[i];
1831  }
1832  // AC: revert change made in __kmpc_serialized_parallel()
1833  // because initial code in teams should have level=0
1834  team->t.t_level--;
1835  // AC: call special invoker for outer "parallel" of teams construct
1836  {
1837  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1838  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1839  invoker(gtid);
1840  }
1841  } else {
1842 #endif /* OMP_40_ENABLED */
1843  argv = args;
1844  for (i = argc - 1; i >= 0; --i)
1845 // TODO: revert workaround for Intel(R) 64 tracker #96
1846 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1847  *argv++ = va_arg(*ap, void *);
1848 #else
1849  *argv++ = va_arg(ap, void *);
1850 #endif
1851  KMP_MB();
1852 
1853 #if OMPT_SUPPORT
1854  void *dummy;
1855  void **exit_runtime_p;
1856  ompt_task_info_t *task_info;
1857 
1858  ompt_lw_taskteam_t lw_taskteam;
1859 
1860  if (ompt_enabled.enabled) {
1861  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1862  &ompt_parallel_data, return_address);
1863  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
 1864  // don't use lw_taskteam after linking; its content was swapped
1865  task_info = OMPT_CUR_TASK_INFO(master_th);
1866  exit_runtime_p = &(task_info->frame.exit_frame);
1867 
1868  /* OMPT implicit task begin */
1869  implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1870  if (ompt_enabled.ompt_callback_implicit_task) {
1871  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1872  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1873  implicit_task_data, 1, __kmp_tid_from_gtid(gtid));
1874  OMPT_CUR_TASK_INFO(master_th)
1875  ->thread_num = __kmp_tid_from_gtid(gtid);
1876  }
1877 
1878  /* OMPT state */
1879  master_th->th.ompt_thread_info.state = omp_state_work_parallel;
1880  } else {
1881  exit_runtime_p = &dummy;
1882  }
1883 #endif
1884 
1885  {
1886  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1887  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1888  __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1889 #if OMPT_SUPPORT
1890  ,
1891  exit_runtime_p
1892 #endif
1893  );
1894  }
1895 
1896 #if OMPT_SUPPORT
1897  if (ompt_enabled.enabled) {
1898  *exit_runtime_p = NULL;
1899  if (ompt_enabled.ompt_callback_implicit_task) {
1900  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1901  ompt_scope_end, NULL, &(task_info->task_data), 1,
1902  OMPT_CUR_TASK_INFO(master_th)->thread_num);
1903  }
1904 
1905  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1906  __ompt_lw_taskteam_unlink(master_th);
1907  if (ompt_enabled.ompt_callback_parallel_end) {
1908  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1909  &ompt_parallel_data, parent_task_data,
1910  OMPT_INVOKER(call_context), return_address);
1911  }
1912  master_th->th.ompt_thread_info.state = omp_state_overhead;
1913  }
1914 #endif
1915 #if OMP_40_ENABLED
1916  }
1917 #endif /* OMP_40_ENABLED */
1918  } else if (call_context == fork_context_gnu) {
1919 #if OMPT_SUPPORT
1920  ompt_lw_taskteam_t lwt;
1921  __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
1922  return_address);
1923 
1924  lwt.ompt_task_info.frame.exit_frame = NULL;
1925  __ompt_lw_taskteam_link(&lwt, master_th, 1);
1926 // don't use lw_taskteam after linking; its content was swapped
1927 #endif
1928 
1929  // we were called from GNU native code
1930  KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1931  return FALSE;
1932  } else {
1933  KMP_ASSERT2(call_context < fork_context_last,
1934  "__kmp_fork_call: unknown fork_context parameter");
1935  }
1936 
1937  KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1938  KMP_MB();
1939  return FALSE;
1940  }
1941 
 1942  // GEH: only modify the executing flag in the case when not serialized;
 1943  // the serialized case is handled in __kmpc_serialized_parallel
1944  KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1945  "curtask=%p, curtask_max_aclevel=%d\n",
1946  parent_team->t.t_active_level, master_th,
1947  master_th->th.th_current_task,
1948  master_th->th.th_current_task->td_icvs.max_active_levels));
1949  // TODO: GEH - cannot do this assertion because root thread not set up as
1950  // executing
1951  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
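       // The encountering task stops executing for the duration of the parallel
       // region; __kmp_join_call sets the flag back to 1 at join.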
1952  master_th->th.th_current_task->td_flags.executing = 0;
1953 
1954 #if OMP_40_ENABLED
1955  if (!master_th->th.th_teams_microtask || level > teams_level)
1956 #endif /* OMP_40_ENABLED */
1957  {
1958  /* Increment our nested depth level */
1959  KMP_TEST_THEN_INC32((kmp_int32 *)&root->r.r_in_parallel);
1960  }
1961 
1962  // See if we need to make a copy of the ICVs.
1963  int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1964  if ((level + 1 < __kmp_nested_nth.used) &&
1965  (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
1966  nthreads_icv = __kmp_nested_nth.nth[level + 1];
1967  } else {
1968  nthreads_icv = 0; // don't update
1969  }
1970 
1971 #if OMP_40_ENABLED
1972  // Figure out the proc_bind_policy for the new team.
1973  kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1974  kmp_proc_bind_t proc_bind_icv =
1975  proc_bind_default; // proc_bind_default means don't update
1976  if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1977  proc_bind = proc_bind_false;
1978  } else {
1979  if (proc_bind == proc_bind_default) {
1980  // No proc_bind clause specified; use current proc-bind-var for this
1981  // parallel region
1982  proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1983  }
1984  /* else: The proc_bind policy was specified explicitly on parallel clause.
1985  This overrides proc-bind-var for this parallel region, but does not
1986  change proc-bind-var. */
1987  // Figure the value of proc-bind-var for the child threads.
1988  if ((level + 1 < __kmp_nested_proc_bind.used) &&
1989  (__kmp_nested_proc_bind.bind_types[level + 1] !=
1990  master_th->th.th_current_task->td_icvs.proc_bind)) {
1991  proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1992  }
1993  }
1994 
1995  // Reset for next parallel region
1996  master_th->th.th_set_proc_bind = proc_bind_default;
1997 #endif /* OMP_40_ENABLED */
1998 
1999  if ((nthreads_icv > 0)
2000 #if OMP_40_ENABLED
2001  || (proc_bind_icv != proc_bind_default)
2002 #endif /* OMP_40_ENABLED */
2003  ) {
2004  kmp_internal_control_t new_icvs;
2005  copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
2006  new_icvs.next = NULL;
2007  if (nthreads_icv > 0) {
2008  new_icvs.nproc = nthreads_icv;
2009  }
2010 
2011 #if OMP_40_ENABLED
2012  if (proc_bind_icv != proc_bind_default) {
2013  new_icvs.proc_bind = proc_bind_icv;
2014  }
2015 #endif /* OMP_40_ENABLED */
2016 
2017  /* allocate a new parallel team */
2018  KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2019  team = __kmp_allocate_team(root, nthreads, nthreads,
2020 #if OMPT_SUPPORT
2021  ompt_parallel_data,
2022 #endif
2023 #if OMP_40_ENABLED
2024  proc_bind,
2025 #endif
2026  &new_icvs, argc USE_NESTED_HOT_ARG(master_th));
2027  } else {
2028  /* allocate a new parallel team */
2029  KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2030  team = __kmp_allocate_team(root, nthreads, nthreads,
2031 #if OMPT_SUPPORT
2032  ompt_parallel_data,
2033 #endif
2034 #if OMP_40_ENABLED
2035  proc_bind,
2036 #endif
2037  &master_th->th.th_current_task->td_icvs,
2038  argc USE_NESTED_HOT_ARG(master_th));
2039  }
2040  KF_TRACE(
2041  10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2042 
2043  /* setup the new team */
2044  KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2045  KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2046  KMP_CHECK_UPDATE(team->t.t_ident, loc);
2047  KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2048  KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2049 #if OMPT_SUPPORT
2050  KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2051  return_address);
2052 #endif
2053  KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2054 // TODO: parent_team->t.t_level == INT_MAX ???
2055 #if OMP_40_ENABLED
2056  if (!master_th->th.th_teams_microtask || level > teams_level) {
2057 #endif /* OMP_40_ENABLED */
2058  int new_level = parent_team->t.t_level + 1;
2059  KMP_CHECK_UPDATE(team->t.t_level, new_level);
2060  new_level = parent_team->t.t_active_level + 1;
2061  KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2062 #if OMP_40_ENABLED
2063  } else {
2064  // AC: Do not increase parallel level at start of the teams construct
2065  int new_level = parent_team->t.t_level;
2066  KMP_CHECK_UPDATE(team->t.t_level, new_level);
2067  new_level = parent_team->t.t_active_level;
2068  KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2069  }
2070 #endif /* OMP_40_ENABLED */
2071  kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2072  // set master's schedule as new run-time schedule
2073  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2074 
2075 #if OMP_40_ENABLED
2076  KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2077 #endif
2078 
2079  // Update the floating point rounding in the team if required.
2080  propagateFPControl(team);
2081 
2082  if (__kmp_tasking_mode != tskm_immediate_exec) {
 2083  // Set the master's task team to the team's task team. Unless this is a hot
 2084  // team, it should be NULL.
2085  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2086  parent_team->t.t_task_team[master_th->th.th_task_state]);
2087  KA_TRACE(20, ("__kmp_fork_call: Master T#%d pushing task_team %p / team "
2088  "%p, new task_team %p / team %p\n",
2089  __kmp_gtid_from_thread(master_th),
2090  master_th->th.th_task_team, parent_team,
2091  team->t.t_task_team[master_th->th.th_task_state], team));
2092 
2093  if (active_level || master_th->th.th_task_team) {
2094  // Take a memo of master's task_state
2095  KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
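       // Grow the memo stack (doubling its size) if it is full before pushing the
       // current task_state.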
2096  if (master_th->th.th_task_state_top >=
2097  master_th->th.th_task_state_stack_sz) { // increase size
2098  kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2099  kmp_uint8 *old_stack, *new_stack;
2100  kmp_uint32 i;
2101  new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2102  for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2103  new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2104  }
2105  for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2106  ++i) { // zero-init rest of stack
2107  new_stack[i] = 0;
2108  }
2109  old_stack = master_th->th.th_task_state_memo_stack;
2110  master_th->th.th_task_state_memo_stack = new_stack;
2111  master_th->th.th_task_state_stack_sz = new_size;
2112  __kmp_free(old_stack);
2113  }
2114  // Store master's task_state on stack
2115  master_th->th
2116  .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2117  master_th->th.th_task_state;
2118  master_th->th.th_task_state_top++;
2119 #if KMP_NESTED_HOT_TEAMS
2120  if (team == master_th->th.th_hot_teams[active_level].hot_team) {
2121  // Restore master's nested state if nested hot team
2122  master_th->th.th_task_state =
2123  master_th->th
2124  .th_task_state_memo_stack[master_th->th.th_task_state_top];
2125  } else {
2126 #endif
2127  master_th->th.th_task_state = 0;
2128 #if KMP_NESTED_HOT_TEAMS
2129  }
2130 #endif
2131  }
2132 #if !KMP_NESTED_HOT_TEAMS
2133  KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2134  (team == root->r.r_hot_team));
2135 #endif
2136  }
2137 
2138  KA_TRACE(
2139  20,
2140  ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2141  gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2142  team->t.t_nproc));
2143  KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2144  (team->t.t_master_tid == 0 &&
2145  (team->t.t_parent == root->r.r_root_team ||
2146  team->t.t_parent->t.t_serialized)));
2147  KMP_MB();
2148 
2149  /* now, setup the arguments */
2150  argv = (void **)team->t.t_argv;
2151 #if OMP_40_ENABLED
2152  if (ap) {
2153 #endif /* OMP_40_ENABLED */
2154  for (i = argc - 1; i >= 0; --i) {
2155 // TODO: revert workaround for Intel(R) 64 tracker #96
2156 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
2157  void *new_argv = va_arg(*ap, void *);
2158 #else
2159  void *new_argv = va_arg(ap, void *);
2160 #endif
2161  KMP_CHECK_UPDATE(*argv, new_argv);
2162  argv++;
2163  }
2164 #if OMP_40_ENABLED
2165  } else {
2166  for (i = 0; i < argc; ++i) {
2167  // Get args from parent team for teams construct
2168  KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2169  }
2170  }
2171 #endif /* OMP_40_ENABLED */
2172 
2173  /* now actually fork the threads */
2174  KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2175  if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2176  root->r.r_active = TRUE;
2177 
2178  __kmp_fork_team_threads(root, team, master_th, gtid);
2179  __kmp_setup_icv_copy(team, nthreads,
2180  &master_th->th.th_current_task->td_icvs, loc);
2181 
2182 #if OMPT_SUPPORT
2183  master_th->th.ompt_thread_info.state = omp_state_work_parallel;
2184 #endif
2185 
2186  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2187 
2188 #if USE_ITT_BUILD
2189  if (team->t.t_active_level == 1 // only report frames at level 1
2190 #if OMP_40_ENABLED
2191  && !master_th->th.th_teams_microtask // not in teams construct
2192 #endif /* OMP_40_ENABLED */
2193  ) {
2194 #if USE_ITT_NOTIFY
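       // Record the fork timestamp for frame modes 1 and 3; in mode 3 the matching
       // join submits it together with the region time (see __kmp_join_call).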
2195  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2196  (__kmp_forkjoin_frames_mode == 3 ||
2197  __kmp_forkjoin_frames_mode == 1)) {
2198  kmp_uint64 tmp_time = 0;
2199  if (__itt_get_timestamp_ptr)
2200  tmp_time = __itt_get_timestamp();
2201  // Internal fork - report frame begin
2202  master_th->th.th_frame_time = tmp_time;
2203  if (__kmp_forkjoin_frames_mode == 3)
2204  team->t.t_region_time = tmp_time;
2205  } else
2206 // only one notification scheme (either "submit" or "forking/joined", not both)
2207 #endif /* USE_ITT_NOTIFY */
2208  if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2209  __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2210  // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2211  __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2212  }
2213  }
2214 #endif /* USE_ITT_BUILD */
2215 
2216  /* now go on and do the work */
2217  KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2218  KMP_MB();
2219  KF_TRACE(10,
2220  ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2221  root, team, master_th, gtid));
2222 
2223 #if USE_ITT_BUILD
2224  if (__itt_stack_caller_create_ptr) {
2225  team->t.t_stack_id =
2226  __kmp_itt_stack_caller_create(); // create new stack stitching id
2227  // before entering fork barrier
2228  }
2229 #endif /* USE_ITT_BUILD */
2230 
2231 #if OMP_40_ENABLED
 2232  // AC: skip __kmp_internal_fork for the teams construct; let only the master
 2233  // threads execute
2234  if (ap)
2235 #endif /* OMP_40_ENABLED */
2236  {
2237  __kmp_internal_fork(loc, gtid, team);
2238  KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2239  "master_th=%p, gtid=%d\n",
2240  root, team, master_th, gtid));
2241  }
2242 
2243  if (call_context == fork_context_gnu) {
2244  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2245  return TRUE;
2246  }
2247 
2248  /* Invoke microtask for MASTER thread */
2249  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2250  team->t.t_id, team->t.t_pkfn));
2251  } // END of timer KMP_fork_call block
2252 
2253  {
2254  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
2255  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
2256  if (!team->t.t_invoke(gtid)) {
2257  KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
2258  }
2259  }
2260  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2261  team->t.t_id, team->t.t_pkfn));
2262  KMP_MB(); /* Flush all pending memory write invalidates. */
2263 
2264  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2265 
2266 #if OMPT_SUPPORT
2267  if (ompt_enabled.enabled) {
2268  master_th->th.ompt_thread_info.state = omp_state_overhead;
2269  }
2270 #endif
2271 
2272  return TRUE;
2273 }
2274 
2275 #if OMPT_SUPPORT
2276 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2277  kmp_team_t *team) {
2278  // restore state outside the region
2279  thread->th.ompt_thread_info.state =
2280  ((team->t.t_serialized) ? omp_state_work_serial
2281  : omp_state_work_parallel);
2282 }
2283 
2284 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2285  kmp_team_t *team, ompt_data_t *parallel_data,
2286  fork_context_e fork_context, void *codeptr) {
2287  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2288  if (ompt_enabled.ompt_callback_parallel_end) {
2289  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2290  parallel_data, &(task_info->task_data), OMPT_INVOKER(fork_context),
2291  codeptr);
2292  }
2293 
2294  task_info->frame.enter_frame = NULL;
2295  __kmp_join_restore_state(thread, team);
2296 }
2297 #endif
2298 
2299 void __kmp_join_call(ident_t *loc, int gtid
2300 #if OMPT_SUPPORT
2301  ,
2302  enum fork_context_e fork_context
2303 #endif
2304 #if OMP_40_ENABLED
2305  ,
2306  int exit_teams
2307 #endif /* OMP_40_ENABLED */
2308  ) {
2309  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2310  kmp_team_t *team;
2311  kmp_team_t *parent_team;
2312  kmp_info_t *master_th;
2313  kmp_root_t *root;
2314  int master_active;
2315  int i;
2316 
2317  KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2318 
2319  /* setup current data */
2320  master_th = __kmp_threads[gtid];
2321  root = master_th->th.th_root;
2322  team = master_th->th.th_team;
2323  parent_team = team->t.t_parent;
2324 
2325  master_th->th.th_ident = loc;
2326 
2327 #if OMPT_SUPPORT
2328  if (ompt_enabled.enabled) {
2329  master_th->th.ompt_thread_info.state = omp_state_overhead;
2330  }
2331 #endif
2332 
2333 #if KMP_DEBUG
2334  if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2335  KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2336  "th_task_team = %p\n",
2337  __kmp_gtid_from_thread(master_th), team,
2338  team->t.t_task_team[master_th->th.th_task_state],
2339  master_th->th.th_task_team));
2340  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2341  team->t.t_task_team[master_th->th.th_task_state]);
2342  }
2343 #endif
2344 
2345  if (team->t.t_serialized) {
2346 #if OMP_40_ENABLED
2347  if (master_th->th.th_teams_microtask) {
2348  // We are in teams construct
2349  int level = team->t.t_level;
2350  int tlevel = master_th->th.th_teams_level;
2351  if (level == tlevel) {
2352  // AC: we haven't incremented it earlier at start of teams construct,
2353  // so do it here - at the end of teams construct
2354  team->t.t_level++;
2355  } else if (level == tlevel + 1) {
2356  // AC: we are exiting parallel inside teams, need to increment
2357  // serialization in order to restore it in the next call to
2358  // __kmpc_end_serialized_parallel
2359  team->t.t_serialized++;
2360  }
2361  }
2362 #endif /* OMP_40_ENABLED */
2363  __kmpc_end_serialized_parallel(loc, gtid);
2364 
2365 #if OMPT_SUPPORT
2366  if (ompt_enabled.enabled) {
2367  __kmp_join_restore_state(master_th, parent_team);
2368  }
2369 #endif
2370 
2371  return;
2372  }
2373 
2374  master_active = team->t.t_master_active;
2375 
2376 #if OMP_40_ENABLED
2377  if (!exit_teams)
2378 #endif /* OMP_40_ENABLED */
2379  {
 2380  // AC: No barrier for internal teams at exit from the teams construct,
 2381  // but there is a barrier for the external team (league).
2382  __kmp_internal_join(loc, gtid, team);
2383  }
2384 #if OMP_40_ENABLED
2385  else {
2386  master_th->th.th_task_state =
2387  0; // AC: no tasking in teams (out of any parallel)
2388  }
2389 #endif /* OMP_40_ENABLED */
2390 
2391  KMP_MB();
2392 
2393 #if OMPT_SUPPORT
2394  ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2395  void *codeptr = team->t.ompt_team_info.master_return_address;
2396 #endif
2397 
2398 #if USE_ITT_BUILD
2399  if (__itt_stack_caller_create_ptr) {
2400  __kmp_itt_stack_caller_destroy(
2401  (__itt_caller)team->t
2402  .t_stack_id); // destroy the stack stitching id after join barrier
2403  }
2404 
2405  // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2406  if (team->t.t_active_level == 1
2407 #if OMP_40_ENABLED
2408  && !master_th->th.th_teams_microtask /* not in teams construct */
2409 #endif /* OMP_40_ENABLED */
2410  ) {
2411  master_th->th.th_ident = loc;
2412  // only one notification scheme (either "submit" or "forking/joined", not
2413  // both)
2414  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2415  __kmp_forkjoin_frames_mode == 3)
2416  __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2417  master_th->th.th_frame_time, 0, loc,
2418  master_th->th.th_team_nproc, 1);
2419  else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2420  !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2421  __kmp_itt_region_joined(gtid);
2422  } // active_level == 1
2423 #endif /* USE_ITT_BUILD */
2424 
2425 #if OMP_40_ENABLED
2426  if (master_th->th.th_teams_microtask && !exit_teams &&
2427  team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2428  team->t.t_level == master_th->th.th_teams_level + 1) {
 2429  // AC: We need to leave the team structure intact at the end of a parallel
 2430  // inside the teams construct, so that the same (hot) team works at the next
 2431  // parallel; only adjust the nesting levels
2432 
2433  /* Decrement our nested depth level */
2434  team->t.t_level--;
2435  team->t.t_active_level--;
2436  KMP_TEST_THEN_DEC32((kmp_int32 *)&root->r.r_in_parallel);
2437 
2438  /* Restore number of threads in the team if needed */
2439  if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2440  int old_num = master_th->th.th_team_nproc;
2441  int new_num = master_th->th.th_teams_size.nth;
2442  kmp_info_t **other_threads = team->t.t_threads;
2443  team->t.t_nproc = new_num;
2444  for (i = 0; i < old_num; ++i) {
2445  other_threads[i]->th.th_team_nproc = new_num;
2446  }
 2447  // Adjust the states of the previously unused threads of the team
2448  for (i = old_num; i < new_num; ++i) {
2449  // Re-initialize thread's barrier data.
2450  int b;
2451  kmp_balign_t *balign = other_threads[i]->th.th_bar;
2452  for (b = 0; b < bs_last_barrier; ++b) {
2453  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2454  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2455 #if USE_DEBUGGER
2456  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2457 #endif
2458  }
2459  if (__kmp_tasking_mode != tskm_immediate_exec) {
2460  // Synchronize thread's task state
2461  other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2462  }
2463  }
2464  }
2465 
2466 #if OMPT_SUPPORT
2467  if (ompt_enabled.enabled) {
2468  __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, fork_context,
2469  codeptr);
2470  }
2471 #endif
2472 
2473  return;
2474  }
2475 #endif /* OMP_40_ENABLED */
2476 
2477  /* do cleanup and restore the parent team */
2478  master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2479  master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2480 
2481  master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2482 
2483  /* jc: The following lock has instructions with REL and ACQ semantics,
2484  separating the parallel user code called in this parallel region
2485  from the serial user code called after this function returns. */
2486  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2487 
2488 #if OMP_40_ENABLED
2489  if (!master_th->th.th_teams_microtask ||
2490  team->t.t_level > master_th->th.th_teams_level)
2491 #endif /* OMP_40_ENABLED */
2492  {
2493  /* Decrement our nested depth level */
2494  KMP_TEST_THEN_DEC32((kmp_int32 *)&root->r.r_in_parallel);
2495  }
2496  KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2497 
2498 #if OMPT_SUPPORT
2499  if (ompt_enabled.enabled) {
2500  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2501  if (ompt_enabled.ompt_callback_implicit_task) {
2502  int ompt_team_size = team->t.t_nproc;
2503  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2504  ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2505  OMPT_CUR_TASK_INFO(master_th)->thread_num);
2506  }
2507 
2508  task_info->frame.exit_frame = NULL;
2509  task_info->task_data = ompt_data_none;
2510  }
2511 #endif
2512 
2513  KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2514  master_th, team));
2515  __kmp_pop_current_task_from_thread(master_th);
2516 
2517 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
2518  // Restore master thread's partition.
2519  master_th->th.th_first_place = team->t.t_first_place;
2520  master_th->th.th_last_place = team->t.t_last_place;
2521 #endif /* OMP_40_ENABLED */
2522 
2523  updateHWFPControl(team);
2524 
2525  if (root->r.r_active != master_active)
2526  root->r.r_active = master_active;
2527 
2528  __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2529  master_th)); // this will free worker threads
2530 
 2531  /* This race was fun to find. Make sure the following is in the critical
 2532  region; otherwise assertions may fail occasionally since the old team may be
 2533  reallocated and the hierarchy appears inconsistent. It is actually safe to
 2534  run and won't cause any bugs, but it will cause those assertion failures. It's
 2535  only one deref & assign, so we might as well put it in the critical region. */
2536  master_th->th.th_team = parent_team;
2537  master_th->th.th_team_nproc = parent_team->t.t_nproc;
2538  master_th->th.th_team_master = parent_team->t.t_threads[0];
2539  master_th->th.th_team_serialized = parent_team->t.t_serialized;
2540 
2541  /* restore serialized team, if need be */
2542  if (parent_team->t.t_serialized &&
2543  parent_team != master_th->th.th_serial_team &&
2544  parent_team != root->r.r_root_team) {
2545  __kmp_free_team(root,
2546  master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2547  master_th->th.th_serial_team = parent_team;
2548  }
2549 
2550  if (__kmp_tasking_mode != tskm_immediate_exec) {
2551  if (master_th->th.th_task_state_top >
2552  0) { // Restore task state from memo stack
2553  KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2554  // Remember master's state if we re-use this nested hot team
2555  master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2556  master_th->th.th_task_state;
2557  --master_th->th.th_task_state_top; // pop
2558  // Now restore state at this level
2559  master_th->th.th_task_state =
2560  master_th->th
2561  .th_task_state_memo_stack[master_th->th.th_task_state_top];
2562  }
2563  // Copy the task team from the parent team to the master thread
2564  master_th->th.th_task_team =
2565  parent_team->t.t_task_team[master_th->th.th_task_state];
2566  KA_TRACE(20,
2567  ("__kmp_join_call: Master T#%d restoring task_team %p / team %p\n",
2568  __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2569  parent_team));
2570  }
2571 
2572  // TODO: GEH - cannot do this assertion because root thread not set up as
2573  // executing
2574  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2575  master_th->th.th_current_task->td_flags.executing = 1;
2576 
2577  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2578 
2579 #if OMPT_SUPPORT
2580  if (ompt_enabled.enabled) {
2581  __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, fork_context,
2582  codeptr);
2583  }
2584 #endif
2585 
2586  KMP_MB();
2587  KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2588 }
2589 
2590 /* Check whether we should push an internal control record onto the
2591  serial team stack. If so, do it. */
2592 void __kmp_save_internal_controls(kmp_info_t *thread) {
2593 
2594  if (thread->th.th_team != thread->th.th_serial_team) {
2595  return;
2596  }
2597  if (thread->th.th_team->t.t_serialized > 1) {
2598  int push = 0;
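       // Push a new record only once per serialized nesting level: when the stack
       // is empty, or when the top record was pushed at a different nesting level.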
2599 
2600  if (thread->th.th_team->t.t_control_stack_top == NULL) {
2601  push = 1;
2602  } else {
2603  if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2604  thread->th.th_team->t.t_serialized) {
2605  push = 1;
2606  }
2607  }
2608  if (push) { /* push a record on the serial team's stack */
2609  kmp_internal_control_t *control =
2610  (kmp_internal_control_t *)__kmp_allocate(
2611  sizeof(kmp_internal_control_t));
2612 
2613  copy_icvs(control, &thread->th.th_current_task->td_icvs);
2614 
2615  control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2616 
2617  control->next = thread->th.th_team->t.t_control_stack_top;
2618  thread->th.th_team->t.t_control_stack_top = control;
2619  }
2620  }
2621 }
2622 
2623 /* Changes set_nproc */
2624 void __kmp_set_num_threads(int new_nth, int gtid) {
2625  kmp_info_t *thread;
2626  kmp_root_t *root;
2627 
2628  KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2629  KMP_DEBUG_ASSERT(__kmp_init_serial);
2630 
2631  if (new_nth < 1)
2632  new_nth = 1;
2633  else if (new_nth > __kmp_max_nth)
2634  new_nth = __kmp_max_nth;
2635 
2636  KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2637  thread = __kmp_threads[gtid];
2638 
2639  __kmp_save_internal_controls(thread);
2640 
2641  set__nproc(thread, new_nth);
2642 
2643  // If this omp_set_num_threads() call will cause the hot team size to be
2644  // reduced (in the absence of a num_threads clause), then reduce it now,
2645  // rather than waiting for the next parallel region.
2646  root = thread->th.th_root;
2647  if (__kmp_init_parallel && (!root->r.r_active) &&
2648  (root->r.r_hot_team->t.t_nproc > new_nth)
2649 #if KMP_NESTED_HOT_TEAMS
2650  && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2651 #endif
2652  ) {
2653  kmp_team_t *hot_team = root->r.r_hot_team;
2654  int f;
2655 
2656  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2657 
2658  // Release the extra threads we don't need any more.
2659  for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2660  KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2661  if (__kmp_tasking_mode != tskm_immediate_exec) {
 2662  // When decreasing the team size, threads no longer in the team should
 2663  // unreference the task team.
2664  hot_team->t.t_threads[f]->th.th_task_team = NULL;
2665  }
2666  __kmp_free_thread(hot_team->t.t_threads[f]);
2667  hot_team->t.t_threads[f] = NULL;
2668  }
2669  hot_team->t.t_nproc = new_nth;
2670 #if KMP_NESTED_HOT_TEAMS
2671  if (thread->th.th_hot_teams) {
2672  KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2673  thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2674  }
2675 #endif
2676 
2677  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2678 
2679  // Update the t_nproc field in the threads that are still active.
2680  for (f = 0; f < new_nth; f++) {
2681  KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2682  hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2683  }
 2684  // Special flag marking that omp_set_num_threads() changed the team size
2685  hot_team->t.t_size_changed = -1;
2686  }
2687 }
2688 
2689 /* Changes max_active_levels */
2690 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2691  kmp_info_t *thread;
2692 
2693  KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2694  "%d = (%d)\n",
2695  gtid, max_active_levels));
2696  KMP_DEBUG_ASSERT(__kmp_init_serial);
2697 
2698  // validate max_active_levels
2699  if (max_active_levels < 0) {
2700  KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2701  // We ignore this call if the user has specified a negative value.
2702  // The current setting won't be changed. The last valid setting will be
2703  // used. A warning will be issued (if warnings are allowed as controlled by
2704  // the KMP_WARNINGS env var).
2705  KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2706  "max_active_levels for thread %d = (%d)\n",
2707  gtid, max_active_levels));
2708  return;
2709  }
2710  if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
 2711  // It's OK: max_active_levels is within the valid range
 2712  // [0, KMP_MAX_ACTIVE_LEVELS_LIMIT].
 2713  // We allow a zero value (implementation-defined behavior).
2714  } else {
2715  KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2716  KMP_MAX_ACTIVE_LEVELS_LIMIT);
2717  max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
 2718  // The current upper limit is MAX_INT (implementation-defined behavior).
 2719  // If the input exceeds the upper limit, we clamp it to the upper limit
 2720  // (implementation-defined behavior).
 2721  // In practice, the flow should never get here until we use the MAX_INT limit.
2722  }
2723  KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2724  "max_active_levels for thread %d = (%d)\n",
2725  gtid, max_active_levels));
2726 
2727  thread = __kmp_threads[gtid];
2728 
2729  __kmp_save_internal_controls(thread);
2730 
2731  set__max_active_levels(thread, max_active_levels);
2732 }
2733 
2734 /* Gets max_active_levels */
2735 int __kmp_get_max_active_levels(int gtid) {
2736  kmp_info_t *thread;
2737 
2738  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2739  KMP_DEBUG_ASSERT(__kmp_init_serial);
2740 
2741  thread = __kmp_threads[gtid];
2742  KMP_DEBUG_ASSERT(thread->th.th_current_task);
2743  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2744  "curtask_maxaclevel=%d\n",
2745  gtid, thread->th.th_current_task,
2746  thread->th.th_current_task->td_icvs.max_active_levels));
2747  return thread->th.th_current_task->td_icvs.max_active_levels;
2748 }
2749 
2750 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2751 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2752  kmp_info_t *thread;
2753  // kmp_team_t *team;
2754 
2755  KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2756  gtid, (int)kind, chunk));
2757  KMP_DEBUG_ASSERT(__kmp_init_serial);
2758 
2759  // Check if the kind parameter is valid, correct if needed.
2760  // Valid parameters should fit in one of two intervals - standard or extended:
2761  // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2762  // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103
2763  if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2764  (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2765  // TODO: Hint needs attention in case we change the default schedule.
2766  __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2767  KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2768  __kmp_msg_null);
2769  kind = kmp_sched_default;
2770  chunk = 0; // ignore chunk value in case of bad kind
2771  }
2772 
2773  thread = __kmp_threads[gtid];
2774 
2775  __kmp_save_internal_controls(thread);
2776 
2777  if (kind < kmp_sched_upper_std) {
2778  if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
 2779  // distinguish static chunked vs. unchunked: the chunk should be invalid to
 2780  // indicate an unchunked schedule (which is the default)
2781  thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2782  } else {
2783  thread->th.th_current_task->td_icvs.sched.r_sched_type =
2784  __kmp_sch_map[kind - kmp_sched_lower - 1];
2785  }
2786  } else {
2787  // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2788  // kmp_sched_lower - 2 ];
2789  thread->th.th_current_task->td_icvs.sched.r_sched_type =
2790  __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2791  kmp_sched_lower - 2];
2792  }
2793  if (kind == kmp_sched_auto || chunk < 1) {
2794  // ignore parameter chunk for schedule auto
2795  thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2796  } else {
2797  thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2798  }
2799 }
2800 
2801 /* Gets def_sched_var ICV values */
2802 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2803  kmp_info_t *thread;
2804  enum sched_type th_type;
2805 
2806  KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2807  KMP_DEBUG_ASSERT(__kmp_init_serial);
2808 
2809  thread = __kmp_threads[gtid];
2810 
2811  th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2812 
2813  switch (th_type) {
2814  case kmp_sch_static:
2815  case kmp_sch_static_greedy:
2816  case kmp_sch_static_balanced:
2817  *kind = kmp_sched_static;
2818  *chunk = 0; // chunk was not set, try to show this fact via zero value
2819  return;
2820  case kmp_sch_static_chunked:
2821  *kind = kmp_sched_static;
2822  break;
2823  case kmp_sch_dynamic_chunked:
2824  *kind = kmp_sched_dynamic;
2825  break;
2827  case kmp_sch_guided_iterative_chunked:
2828  case kmp_sch_guided_analytical_chunked:
2829  *kind = kmp_sched_guided;
2830  break;
2831  case kmp_sch_auto:
2832  *kind = kmp_sched_auto;
2833  break;
2834  case kmp_sch_trapezoidal:
2835  *kind = kmp_sched_trapezoidal;
2836  break;
2837 #if KMP_STATIC_STEAL_ENABLED
2838  case kmp_sch_static_steal:
2839  *kind = kmp_sched_static_steal;
2840  break;
2841 #endif
2842  default:
2843  KMP_FATAL(UnknownSchedulingType, th_type);
2844  }
2845 
2846  *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2847 }
2848 
2849 int __kmp_get_ancestor_thread_num(int gtid, int level) {
2850 
2851  int ii, dd;
2852  kmp_team_t *team;
2853  kmp_info_t *thr;
2854 
2855  KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
2856  KMP_DEBUG_ASSERT(__kmp_init_serial);
2857 
2858  // validate level
2859  if (level == 0)
2860  return 0;
2861  if (level < 0)
2862  return -1;
2863  thr = __kmp_threads[gtid];
2864  team = thr->th.th_team;
2865  ii = team->t.t_level;
2866  if (level > ii)
2867  return -1;
2868 
2869 #if OMP_40_ENABLED
2870  if (thr->th.th_teams_microtask) {
 2871  // AC: we are in a teams region where multiple nested teams have the same level
2872  int tlevel = thr->th.th_teams_level; // the level of the teams construct
2873  if (level <=
2874  tlevel) { // otherwise usual algorithm works (will not touch the teams)
2875  KMP_DEBUG_ASSERT(ii >= tlevel);
2876  // AC: As we need to pass by the teams league, we need to artificially
2877  // increase ii
2878  if (ii == tlevel) {
2879  ii += 2; // three teams have same level
2880  } else {
2881  ii++; // two teams have same level
2882  }
2883  }
2884  }
2885 #endif
2886 
2887  if (ii == level)
2888  return __kmp_tid_from_gtid(gtid);
2889 
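       // Walk up the team hierarchy toward the requested level, consuming one
       // serialized level (dd) per step and moving to the parent team when the
       // current team's serialized levels are exhausted.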
2890  dd = team->t.t_serialized;
2891  level++;
2892  while (ii > level) {
2893  for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2894  }
2895  if ((team->t.t_serialized) && (!dd)) {
2896  team = team->t.t_parent;
2897  continue;
2898  }
2899  if (ii > level) {
2900  team = team->t.t_parent;
2901  dd = team->t.t_serialized;
2902  ii--;
2903  }
2904  }
2905 
2906  return (dd > 1) ? (0) : (team->t.t_master_tid);
2907 }
2908 
2909 int __kmp_get_team_size(int gtid, int level) {
2910 
2911  int ii, dd;
2912  kmp_team_t *team;
2913  kmp_info_t *thr;
2914 
2915  KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
2916  KMP_DEBUG_ASSERT(__kmp_init_serial);
2917 
2918  // validate level
2919  if (level == 0)
2920  return 1;
2921  if (level < 0)
2922  return -1;
2923  thr = __kmp_threads[gtid];
2924  team = thr->th.th_team;
2925  ii = team->t.t_level;
2926  if (level > ii)
2927  return -1;
2928 
2929 #if OMP_40_ENABLED
2930  if (thr->th.th_teams_microtask) {
 2931  // AC: we are in a teams region where multiple nested teams have the same level
2932  int tlevel = thr->th.th_teams_level; // the level of the teams construct
2933  if (level <=
2934  tlevel) { // otherwise usual algorithm works (will not touch the teams)
2935  KMP_DEBUG_ASSERT(ii >= tlevel);
2936  // AC: As we need to pass by the teams league, we need to artificially
2937  // increase ii
2938  if (ii == tlevel) {
2939  ii += 2; // three teams have same level
2940  } else {
2941  ii++; // two teams have same level
2942  }
2943  }
2944  }
2945 #endif
2946 
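       // Same hierarchy walk as in __kmp_get_ancestor_thread_num: skip serialized
       // levels and move to the parent team until the requested level is reached.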
2947  while (ii > level) {
2948  for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2949  }
2950  if (team->t.t_serialized && (!dd)) {
2951  team = team->t.t_parent;
2952  continue;
2953  }
2954  if (ii > level) {
2955  team = team->t.t_parent;
2956  ii--;
2957  }
2958  }
2959 
2960  return team->t.t_nproc;
2961 }
2962 
2963 kmp_r_sched_t __kmp_get_schedule_global() {
 2964  // This routine was created because the pairs (__kmp_sched, __kmp_chunk) and
 2965  // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
 2966  // independently, so one can get the updated schedule here.
2967 
2968  kmp_r_sched_t r_sched;
2969 
 2970  // Create the schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
 2971  // __kmp_guided. __kmp_sched should keep its original value, so that the user
 2972  // can set KMP_SCHEDULE multiple times and thus have different run-time
 2973  // schedules in different roots (even in OMP 2.5)
2974  if (__kmp_sched == kmp_sch_static) {
2975  // replace STATIC with more detailed schedule (balanced or greedy)
2976  r_sched.r_sched_type = __kmp_static;
2977  } else if (__kmp_sched == kmp_sch_guided_chunked) {
2978  // replace GUIDED with more detailed schedule (iterative or analytical)
2979  r_sched.r_sched_type = __kmp_guided;
2980  } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
2981  r_sched.r_sched_type = __kmp_sched;
2982  }
2983 
2984  if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
 2985  // __kmp_chunk may be wrong here (if it was never set)
2986  r_sched.chunk = KMP_DEFAULT_CHUNK;
2987  } else {
2988  r_sched.chunk = __kmp_chunk;
2989  }
2990 
2991  return r_sched;
2992 }
2993 
2994 /* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
 2995  at least argc *t_argv entries for the requested team. */
2996 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
2997 
2998  KMP_DEBUG_ASSERT(team);
2999  if (!realloc || argc > team->t.t_max_argc) {
3000 
3001  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3002  "current entries=%d\n",
3003  team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3004  /* if previously allocated heap space for args, free them */
3005  if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3006  __kmp_free((void *)team->t.t_argv);
3007 
3008  if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3009  /* use unused space in the cache line for arguments */
3010  team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3011  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3012  "argv entries\n",
3013  team->t.t_id, team->t.t_max_argc));
3014  team->t.t_argv = &team->t.t_inline_argv[0];
3015  if (__kmp_storage_map) {
3016  __kmp_print_storage_map_gtid(
3017  -1, &team->t.t_inline_argv[0],
3018  &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3019  (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3020  team->t.t_id);
3021  }
3022  } else {
3023  /* allocate space for arguments in the heap */
3024  team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3025  ? KMP_MIN_MALLOC_ARGV_ENTRIES
3026  : 2 * argc;
3027  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3028  "argv entries\n",
3029  team->t.t_id, team->t.t_max_argc));
3030  team->t.t_argv =
3031  (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3032  if (__kmp_storage_map) {
3033  __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3034  &team->t.t_argv[team->t.t_max_argc],
3035  sizeof(void *) * team->t.t_max_argc,
3036  "team_%d.t_argv", team->t.t_id);
3037  }
3038  }
3039  }
3040 }
3041 
3042 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3043  int i;
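       // A team with at most one thread needs only 2 dispatch buffers; larger teams
       // get the configured __kmp_dispatch_num_buffers.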
3044  int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3045  team->t.t_threads =
3046  (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3047  team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3048  sizeof(dispatch_shared_info_t) * num_disp_buff);
3049  team->t.t_dispatch =
3050  (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3051  team->t.t_implicit_task_taskdata =
3052  (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3053  team->t.t_max_nproc = max_nth;
3054 
3055  /* setup dispatch buffers */
3056  for (i = 0; i < num_disp_buff; ++i) {
3057  team->t.t_disp_buffer[i].buffer_index = i;
3058 #if OMP_45_ENABLED
3059  team->t.t_disp_buffer[i].doacross_buf_idx = i;
3060 #endif
3061  }
3062 }
3063 
3064 static void __kmp_free_team_arrays(kmp_team_t *team) {
3065  /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3066  int i;
3067  for (i = 0; i < team->t.t_max_nproc; ++i) {
3068  if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3069  __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3070  team->t.t_dispatch[i].th_disp_buffer = NULL;
3071  }
3072  }
3073  __kmp_free(team->t.t_threads);
3074  __kmp_free(team->t.t_disp_buffer);
3075  __kmp_free(team->t.t_dispatch);
3076  __kmp_free(team->t.t_implicit_task_taskdata);
3077  team->t.t_threads = NULL;
3078  team->t.t_disp_buffer = NULL;
3079  team->t.t_dispatch = NULL;
3080  team->t.t_implicit_task_taskdata = 0;
3081 }
3082 
3083 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3084  kmp_info_t **oldThreads = team->t.t_threads;
3085 
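       // Free the old arrays (keeping the saved t_threads pointer), allocate new
       // arrays at the new size, copy the existing thread pointers across, then
       // free the old t_threads array.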
3086  __kmp_free(team->t.t_disp_buffer);
3087  __kmp_free(team->t.t_dispatch);
3088  __kmp_free(team->t.t_implicit_task_taskdata);
3089  __kmp_allocate_team_arrays(team, max_nth);
3090 
3091  KMP_MEMCPY(team->t.t_threads, oldThreads,
3092  team->t.t_nproc * sizeof(kmp_info_t *));
3093 
3094  __kmp_free(oldThreads);
3095 }
3096 
3097 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3098 
3099  kmp_r_sched_t r_sched =
3100  __kmp_get_schedule_global(); // get current state of scheduling globals
3101 
3102 #if OMP_40_ENABLED
3103  KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3104 #endif /* OMP_40_ENABLED */
3105 
3106  kmp_internal_control_t g_icvs = {
3107  0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3108  (kmp_int8)__kmp_dflt_nested, // int nested; //internal control
3109  // for nested parallelism (per thread)
3110  (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3111  // adjustment of threads (per thread)
3112  (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3113  // whether blocktime is explicitly set
3114  __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3115 #if KMP_USE_MONITOR
3116  __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3117 // intervals
3118 #endif
3119  __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3120  // next parallel region (per thread)
3121  // (use a max ub on value if __kmp_parallel_initialize not called yet)
3122  __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3123  // for max_active_levels
3124  r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3125 // {sched,chunk} pair
3126 #if OMP_40_ENABLED
3127  __kmp_nested_proc_bind.bind_types[0],
3128  __kmp_default_device,
3129 #endif /* OMP_40_ENABLED */
3130  NULL // struct kmp_internal_control *next;
3131  };
3132 
3133  return g_icvs;
3134 }
3135 
3136 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3137 
3138  kmp_internal_control_t gx_icvs;
3139  gx_icvs.serial_nesting_level =
3140  0; // probably =team->t.t_serial like in save_inter_controls
3141  copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3142  gx_icvs.next = NULL;
3143 
3144  return gx_icvs;
3145 }
3146 
3147 static void __kmp_initialize_root(kmp_root_t *root) {
3148  int f;
3149  kmp_team_t *root_team;
3150  kmp_team_t *hot_team;
3151  int hot_team_max_nth;
3152  kmp_r_sched_t r_sched =
3153  __kmp_get_schedule_global(); // get current state of scheduling globals
3154  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3155  KMP_DEBUG_ASSERT(root);
3156  KMP_ASSERT(!root->r.r_begin);
3157 
3158  /* setup the root state structure */
3159  __kmp_init_lock(&root->r.r_begin_lock);
3160  root->r.r_begin = FALSE;
3161  root->r.r_active = FALSE;
3162  root->r.r_in_parallel = 0;
3163  root->r.r_blocktime = __kmp_dflt_blocktime;
3164  root->r.r_nested = __kmp_dflt_nested;
3165  root->r.r_cg_nthreads = 1;
3166 
3167  /* setup the root team for this task */
3168  /* allocate the root team structure */
3169  KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3170 
3171  root_team =
3172  __kmp_allocate_team(root,
3173  1, // new_nproc
3174  1, // max_nproc
3175 #if OMPT_SUPPORT
3176  ompt_data_none, // root parallel id
3177 #endif
3178 #if OMP_40_ENABLED
3179  __kmp_nested_proc_bind.bind_types[0],
3180 #endif
3181  &r_icvs,
3182  0 // argc
3183  USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3184  );
3185 #if USE_DEBUGGER
3186  // Non-NULL value should be assigned to make the debugger display the root
3187  // team.
3188  TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3189 #endif
3190 
3191  KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3192 
3193  root->r.r_root_team = root_team;
3194  root_team->t.t_control_stack_top = NULL;
3195 
3196  /* initialize root team */
3197  root_team->t.t_threads[0] = NULL;
3198  root_team->t.t_nproc = 1;
3199  root_team->t.t_serialized = 1;
3200  // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3201  root_team->t.t_sched.sched = r_sched.sched;
3202  KA_TRACE(
3203  20,
3204  ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3205  root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3206 
3207  /* setup the hot team for this task */
3208  /* allocate the hot team structure */
3209  KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3210 
3211  hot_team =
3212  __kmp_allocate_team(root,
3213  1, // new_nproc
3214  __kmp_dflt_team_nth_ub * 2, // max_nproc
3215 #if OMPT_SUPPORT
3216  ompt_data_none, // root parallel id
3217 #endif
3218 #if OMP_40_ENABLED
3219  __kmp_nested_proc_bind.bind_types[0],
3220 #endif
3221  &r_icvs,
3222  0 // argc
3223  USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3224  );
3225  KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3226 
3227  root->r.r_hot_team = hot_team;
3228  root_team->t.t_control_stack_top = NULL;
3229 
3230  /* first-time initialization */
3231  hot_team->t.t_parent = root_team;
3232 
3233  /* initialize hot team */
3234  hot_team_max_nth = hot_team->t.t_max_nproc;
3235  for (f = 0; f < hot_team_max_nth; ++f) {
3236  hot_team->t.t_threads[f] = NULL;
3237  }
3238  hot_team->t.t_nproc = 1;
3239  // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3240  hot_team->t.t_sched.sched = r_sched.sched;
3241  hot_team->t.t_size_changed = 0;
3242 }
3243 
3244 #ifdef KMP_DEBUG
3245 
3246 typedef struct kmp_team_list_item {
3247  kmp_team_p const *entry;
3248  struct kmp_team_list_item *next;
3249 } kmp_team_list_item_t;
3250 typedef kmp_team_list_item_t *kmp_team_list_t;
3251 
3252 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3253  kmp_team_list_t list, // List of teams.
3254  kmp_team_p const *team // Team to add.
3255  ) {
3256 
3257  // List must terminate with item where both entry and next are NULL.
3258  // Team is added to the list only once.
3259  // List is sorted in ascending order by team id.
3260  // Team id is *not* a key.
3261 
3262  kmp_team_list_t l;
3263 
3264  KMP_DEBUG_ASSERT(list != NULL);
3265  if (team == NULL) {
3266  return;
3267  }
3268 
3269  __kmp_print_structure_team_accum(list, team->t.t_parent);
3270  __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3271 
3272  // Search list for the team.
3273  l = list;
3274  while (l->next != NULL && l->entry != team) {
3275  l = l->next;
3276  }
3277  if (l->next != NULL) {
3278  return; // Team has been added before, exit.
3279  }
3280 
3281  // Team is not found. Search list again for insertion point.
3282  l = list;
3283  while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3284  l = l->next;
3285  }
3286 
3287  // Insert team.
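  // Insertion trick: the freshly allocated item receives a copy of node 'l',
  // and 'l' itself is then overwritten with the new team. This inserts the
  // team *before* 'l' without tracking a predecessor pointer and keeps the
  // NULL/NULL sentinel at the tail of the list.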
3288  {
3289  kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3290  sizeof(kmp_team_list_item_t));
3291  *item = *l;
3292  l->entry = team;
3293  l->next = item;
3294  }
3295 }
3296 
3297 static void __kmp_print_structure_team(char const *title,
3298  kmp_team_p const *team) {
3300  __kmp_printf("%s", title);
3301  if (team != NULL) {
3302  __kmp_printf("%2x %p\n", team->t.t_id, team);
3303  } else {
3304  __kmp_printf(" - (nil)\n");
3305  }
3306 }
3307 
3308 static void __kmp_print_structure_thread(char const *title,
3309  kmp_info_p const *thread) {
3310  __kmp_printf("%s", title);
3311  if (thread != NULL) {
3312  __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3313  } else {
3314  __kmp_printf(" - (nil)\n");
3315  }
3316 }
3317 
3318 void __kmp_print_structure(void) {
3319 
3320  kmp_team_list_t list;
3321 
3322  // Initialize list of teams.
3323  list =
3324  (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3325  list->entry = NULL;
3326  list->next = NULL;
3327 
3328  __kmp_printf("\n------------------------------\nGlobal Thread "
3329  "Table\n------------------------------\n");
3330  {
3331  int gtid;
3332  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3333  __kmp_printf("%2d", gtid);
3334  if (__kmp_threads != NULL) {
3335  __kmp_printf(" %p", __kmp_threads[gtid]);
3336  }
3337  if (__kmp_root != NULL) {
3338  __kmp_printf(" %p", __kmp_root[gtid]);
3339  }
3340  __kmp_printf("\n");
3341  }
3342  }
3343 
3344  // Print out __kmp_threads array.
3345  __kmp_printf("\n------------------------------\nThreads\n--------------------"
3346  "----------\n");
3347  if (__kmp_threads != NULL) {
3348  int gtid;
3349  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3350  kmp_info_t const *thread = __kmp_threads[gtid];
3351  if (thread != NULL) {
3352  __kmp_printf("GTID %2d %p:\n", gtid, thread);
3353  __kmp_printf(" Our Root: %p\n", thread->th.th_root);
3354  __kmp_print_structure_team(" Our Team: ", thread->th.th_team);
3355  __kmp_print_structure_team(" Serial Team: ",
3356  thread->th.th_serial_team);
3357  __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc);
3358  __kmp_print_structure_thread(" Master: ",
3359  thread->th.th_team_master);
3360  __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized);
3361  __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc);
3362 #if OMP_40_ENABLED
3363  __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3364 #endif
3365  __kmp_print_structure_thread(" Next in pool: ",
3366  thread->th.th_next_pool);
3367  __kmp_printf("\n");
3368  __kmp_print_structure_team_accum(list, thread->th.th_team);
3369  __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3370  }
3371  }
3372  } else {
3373  __kmp_printf("Threads array is not allocated.\n");
3374  }
3375 
3376  // Print out __kmp_root array.
3377  __kmp_printf("\n------------------------------\nUbers\n----------------------"
3378  "--------\n");
3379  if (__kmp_root != NULL) {
3380  int gtid;
3381  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3382  kmp_root_t const *root = __kmp_root[gtid];
3383  if (root != NULL) {
3384  __kmp_printf("GTID %2d %p:\n", gtid, root);
3385  __kmp_print_structure_team(" Root Team: ", root->r.r_root_team);
3386  __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team);
3387  __kmp_print_structure_thread(" Uber Thread: ",
3388  root->r.r_uber_thread);
3389  __kmp_printf(" Active?: %2d\n", root->r.r_active);
3390  __kmp_printf(" Nested?: %2d\n", root->r.r_nested);
3391  __kmp_printf(" In Parallel: %2d\n", root->r.r_in_parallel);
3392  __kmp_printf("\n");
3393  __kmp_print_structure_team_accum(list, root->r.r_root_team);
3394  __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3395  }
3396  }
3397  } else {
3398  __kmp_printf("Ubers array is not allocated.\n");
3399  }
3400 
3401  __kmp_printf("\n------------------------------\nTeams\n----------------------"
3402  "--------\n");
3403  while (list->next != NULL) {
3404  kmp_team_p const *team = list->entry;
3405  int i;
3406  __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3407  __kmp_print_structure_team(" Parent Team: ", team->t.t_parent);
3408  __kmp_printf(" Master TID: %2d\n", team->t.t_master_tid);
3409  __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc);
3410  __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized);
3411  __kmp_printf(" Number threads: %2d\n", team->t.t_nproc);
3412  for (i = 0; i < team->t.t_nproc; ++i) {
3413  __kmp_printf(" Thread %2d: ", i);
3414  __kmp_print_structure_thread("", team->t.t_threads[i]);
3415  }
3416  __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool);
3417  __kmp_printf("\n");
3418  list = list->next;
3419  }
3420 
3421  // Print out __kmp_thread_pool and __kmp_team_pool.
3422  __kmp_printf("\n------------------------------\nPools\n----------------------"
3423  "--------\n");
3424  __kmp_print_structure_thread("Thread pool: ",
3425  CCAST(kmp_info_t *, __kmp_thread_pool));
3426  __kmp_print_structure_team("Team pool: ",
3427  CCAST(kmp_team_t *, __kmp_team_pool));
3428  __kmp_printf("\n");
3429 
3430  // Free team list.
3431  while (list != NULL) {
3432  kmp_team_list_item_t *item = list;
3433  list = list->next;
3434  KMP_INTERNAL_FREE(item);
3435  }
3436 }
3437 
3438 #endif
3439 
3440 //---------------------------------------------------------------------------
3441 // Stuff for per-thread fast random number generator
3442 // Table of primes
3443 static const unsigned __kmp_primes[] = {
3444  0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3445  0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3446  0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3447  0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3448  0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3449  0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3450  0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3451  0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3452  0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3453  0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3454  0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3455 
3456 //---------------------------------------------------------------------------
3457 // __kmp_get_random: Get a random number using a linear congruential method.
3458 unsigned short __kmp_get_random(kmp_info_t *thread) {
3459  unsigned x = thread->th.th_x;
3460  unsigned short r = x >> 16;
3461 
3462  thread->th.th_x = x * thread->th.th_a + 1;
3463 
3464  KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3465  thread->th.th_info.ds.ds_tid, r));
3466 
3467  return r;
3468 }
3469 //--------------------------------------------------------
3470 // __kmp_init_random: Initialize a random number generator
3471 void __kmp_init_random(kmp_info_t *thread) {
3472  unsigned seed = thread->th.th_info.ds.ds_tid;
3473 
3474  thread->th.th_a =
3475  __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3476  thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3477  KA_TRACE(30,
3478  ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3479 }
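/* Illustrative sketch (not part of the runtime): each thread runs its own
   linear congruential generator, x_{n+1} = a * x_n + 1 (mod 2^32), where the
   multiplier 'a' is taken from __kmp_primes based on the thread id so that
   different threads advance different streams. Only the high 16 bits of the
   state are returned, since the low-order bits of such an LCG are the least
   random. A stand-alone equivalent, with hypothetical names, could look like:

     typedef struct { unsigned a, x; } lcg_t;

     static void lcg_seed(lcg_t *g, unsigned seed, const unsigned *primes, size_t n) {
       g->a = primes[seed % n];      // per-stream multiplier
       g->x = (seed + 1) * g->a + 1; // initial state, mirroring the init above
     }

     static unsigned short lcg_next(lcg_t *g) {
       unsigned short r = (unsigned short)(g->x >> 16); // hand out the high half
       g->x = g->x * g->a + 1;                          // advance the state
       return r;
     }
*/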
3480 
3481 #if KMP_OS_WINDOWS
3482 /* reclaim array entries for root threads that are already dead, returns number
3483  * reclaimed */
3484 static int __kmp_reclaim_dead_roots(void) {
3485  int i, r = 0;
3486 
3487  for (i = 0; i < __kmp_threads_capacity; ++i) {
3488  if (KMP_UBER_GTID(i) &&
3489  !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3490  !__kmp_root[i]
3491  ->r.r_active) { // AC: reclaim only roots died in non-active state
3492  r += __kmp_unregister_root_other_thread(i);
3493  }
3494  }
3495  return r;
3496 }
3497 #endif
3498 
3499 /* This function attempts to create free entries in __kmp_threads and
3500  __kmp_root, and returns the number of free entries generated.
3501 
3502  For Windows* OS static library, the first mechanism used is to reclaim array
3503  entries for root threads that are already dead.
3504 
3505  On all platforms, expansion is attempted on the arrays __kmp_threads and
3506  __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3507  capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3508  threadprivate cache array has been created. Synchronization with
3509  __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3510 
3511  After any dead root reclamation, if the clipping value allows array expansion
3512  to result in the generation of a total of nNeed free slots, the function does
3513  that expansion. If not, nothing is done beyond the possible initial root
3514  thread reclamation.
3515 
3516  If nNeed is negative, the behavior is undefined. */
3517 static int __kmp_expand_threads(int nNeed) {
3518  int added = 0;
3519  int minimumRequiredCapacity;
3520  int newCapacity;
3521  kmp_info_t **newThreads;
3522  kmp_root_t **newRoot;
3523 
3524 // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3525 // resizing __kmp_threads does not need additional protection if foreign
3526 // threads are present
3527 
3528 #if KMP_OS_WINDOWS && !defined KMP_DYNAMIC_LIB
3529  /* only for Windows static library */
3530  /* reclaim array entries for root threads that are already dead */
3531  added = __kmp_reclaim_dead_roots();
3532 
3533  if (nNeed) {
3534  nNeed -= added;
3535  if (nNeed < 0)
3536  nNeed = 0;
3537  }
3538 #endif
3539  if (nNeed <= 0)
3540  return added;
3541 
3542  // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3543  // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3544  // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3545  // > __kmp_max_nth in one of two ways:
3546  //
3547  // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0]
3548  // may not be reused by another thread, so we may need to increase
3549  // __kmp_threads_capacity to __kmp_max_nth + 1.
3550  //
3551  // 2) New foreign root(s) are encountered. We always register new foreign
3552  // roots. This may cause a smaller # of threads to be allocated at
3553  // subsequent parallel regions, but the worker threads hang around (and
3554  // eventually go to sleep) and need slots in the __kmp_threads[] array.
3555  //
3556  // Anyway, that is the reason for moving the check to see if
3557  // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3558  // instead of having it performed here. -BB
3559 
3560  KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3561 
3562  /* compute expansion headroom to check if we can expand */
3563  if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3564  /* possible expansion too small -- give up */
3565  return added;
3566  }
3567  minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3568 
3569  newCapacity = __kmp_threads_capacity;
3570  do {
3571  newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3572  : __kmp_sys_max_nth;
3573  } while (newCapacity < minimumRequiredCapacity);
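  // e.g. with __kmp_threads_capacity == 32 and nNeed == 5 the required
  // capacity is 37, so the loop doubles newCapacity to 64 (always clipped to
  // __kmp_sys_max_nth).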
3574  newThreads = (kmp_info_t **)__kmp_allocate(
3575  (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3576  newRoot =
3577  (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3578  KMP_MEMCPY(newThreads, __kmp_threads,
3579  __kmp_threads_capacity * sizeof(kmp_info_t *));
3580  KMP_MEMCPY(newRoot, __kmp_root,
3581  __kmp_threads_capacity * sizeof(kmp_root_t *));
3582 
3583  kmp_info_t **temp_threads = __kmp_threads;
3584  *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3585  *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3586  __kmp_free(temp_threads);
3587  added += newCapacity - __kmp_threads_capacity;
3588  *(volatile int *)&__kmp_threads_capacity = newCapacity;
3589 
3590  if (newCapacity > __kmp_tp_capacity) {
3591  __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3592  if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3593  __kmp_threadprivate_resize_cache(newCapacity);
3594  } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3595  *(volatile int *)&__kmp_tp_capacity = newCapacity;
3596  }
3597  __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3598  }
3599 
3600  return added;
3601 }
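/* Illustrative sketch (not part of the runtime): taken by itself, the growth
   policy above is "double until big enough, clipped to the system maximum",
   and the caller has already checked that the required capacity fits under
   that maximum. A minimal equivalent with hypothetical names:

     static int grow_capacity(int cap, int need, int max_cap) {
       int required = cap + need; // caller guarantees required <= max_cap
       do {
         cap = (cap <= max_cap / 2) ? cap * 2 : max_cap;
       } while (cap < required);
       return cap; // the old entries are then copied into the larger arrays
     }
*/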
3602 
3603 /* Register the current thread as a root thread and obtain our gtid. We must
3604  have the __kmp_initz_lock held at this point. Argument TRUE only if we are the
3605  thread that calls from __kmp_do_serial_initialize() */
3606 int __kmp_register_root(int initial_thread) {
3607  kmp_info_t *root_thread;
3608  kmp_root_t *root;
3609  int gtid;
3610  int capacity;
3611  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3612  KA_TRACE(20, ("__kmp_register_root: entered\n"));
3613  KMP_MB();
3614 
3615  /* 2007-03-02:
3616  If the initial thread did not invoke the OpenMP RTL yet, and this thread is not
3617  an initial one, the "__kmp_all_nth >= __kmp_threads_capacity" condition does not
3618  work as expected -- it may return false (meaning there is at least one
3619  empty slot in the __kmp_threads array), but it is possible the only free slot
3620  is #0, which is reserved for the initial thread and so cannot be used for this
3621  one. The following code works around this bug.
3622 
3623  However, the right solution seems to be not reserving slot #0 for the initial
3624  thread because:
3625  (1) there is no magic in slot #0,
3626  (2) we cannot detect the initial thread reliably (the first thread that does
3627  serial initialization may not be a real initial thread).
3628  */
3629  capacity = __kmp_threads_capacity;
3630  if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3631  --capacity;
3632  }
3633 
3634  /* see if there are too many threads */
3635  if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3636  if (__kmp_tp_cached) {
3637  __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3638  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3639  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3640  } else {
3641  __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3642  __kmp_msg_null);
3643  }
3644  }
3645 
3646  /* find an available thread slot */
3647  /* Don't reassign the zero slot since we need that to only be used by initial
3648  thread */
3649  for (gtid = (initial_thread ? 0 : 1); TCR_PTR(__kmp_threads[gtid]) != NULL;
3650  gtid++)
3651  ;
3652  KA_TRACE(1,
3653  ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3654  KMP_ASSERT(gtid < __kmp_threads_capacity);
3655 
3656  /* update global accounting */
3657  __kmp_all_nth++;
3658  TCW_4(__kmp_nth, __kmp_nth + 1);
3659 
3660  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3661  // numbers of procs, and method #2 (keyed API call) for higher numbers.
3662  if (__kmp_adjust_gtid_mode) {
3663  if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3664  if (TCR_4(__kmp_gtid_mode) != 2) {
3665  TCW_4(__kmp_gtid_mode, 2);
3666  }
3667  } else {
3668  if (TCR_4(__kmp_gtid_mode) != 1) {
3669  TCW_4(__kmp_gtid_mode, 1);
3670  }
3671  }
3672  }
3673 
3674 #ifdef KMP_ADJUST_BLOCKTIME
3675  /* Adjust blocktime to zero if necessary */
3676  /* Middle initialization might not have occurred yet */
3677  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3678  if (__kmp_nth > __kmp_avail_proc) {
3679  __kmp_zero_bt = TRUE;
3680  }
3681  }
3682 #endif /* KMP_ADJUST_BLOCKTIME */
3683 
3684  /* setup this new hierarchy */
3685  if (!(root = __kmp_root[gtid])) {
3686  root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3687  KMP_DEBUG_ASSERT(!root->r.r_root_team);
3688  }
3689 
3690 #if KMP_STATS_ENABLED
3691  // Initialize stats as soon as possible (right after gtid assignment).
3692  __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3693  KMP_START_EXPLICIT_TIMER(OMP_worker_thread_life);
3694  KMP_SET_THREAD_STATE(SERIAL_REGION);
3695  KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3696 #endif
3697  __kmp_initialize_root(root);
3698 
3699  /* setup new root thread structure */
3700  if (root->r.r_uber_thread) {
3701  root_thread = root->r.r_uber_thread;
3702  } else {
3703  root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3704  if (__kmp_storage_map) {
3705  __kmp_print_thread_storage_map(root_thread, gtid);
3706  }
3707  root_thread->th.th_info.ds.ds_gtid = gtid;
3708 #if OMPT_SUPPORT
3709  root_thread->th.ompt_thread_info.thread_data.ptr = NULL;
3710 #endif
3711  root_thread->th.th_root = root;
3712  if (__kmp_env_consistency_check) {
3713  root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3714  }
3715 #if USE_FAST_MEMORY
3716  __kmp_initialize_fast_memory(root_thread);
3717 #endif /* USE_FAST_MEMORY */
3718 
3719 #if KMP_USE_BGET
3720  KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3721  __kmp_initialize_bget(root_thread);
3722 #endif
3723  __kmp_init_random(root_thread); // Initialize random number generator
3724  }
3725 
3726  /* setup the serial team held in reserve by the root thread */
3727  if (!root_thread->th.th_serial_team) {
3728  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3729  KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3730  root_thread->th.th_serial_team =
3731  __kmp_allocate_team(root, 1, 1,
3732 #if OMPT_SUPPORT
3733  ompt_data_none, // root parallel id
3734 #endif
3735 #if OMP_40_ENABLED
3736  proc_bind_default,
3737 #endif
3738  &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3739  }
3740  KMP_ASSERT(root_thread->th.th_serial_team);
3741  KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3742  root_thread->th.th_serial_team));
3743 
3744  /* drop root_thread into place */
3745  TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3746 
3747  root->r.r_root_team->t.t_threads[0] = root_thread;
3748  root->r.r_hot_team->t.t_threads[0] = root_thread;
3749  root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3750  // AC: the team created in reserve, not for execution (it is unused for now).
3751  root_thread->th.th_serial_team->t.t_serialized = 0;
3752  root->r.r_uber_thread = root_thread;
3753 
3754  /* initialize the thread, get it ready to go */
3755  __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3756  TCW_4(__kmp_init_gtid, TRUE);
3757 
3758  /* prepare the master thread for get_gtid() */
3759  __kmp_gtid_set_specific(gtid);
3760 
3761 #if USE_ITT_BUILD
3762  __kmp_itt_thread_name(gtid);
3763 #endif /* USE_ITT_BUILD */
3764 
3765 #ifdef KMP_TDATA_GTID
3766  __kmp_gtid = gtid;
3767 #endif
3768  __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3769  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3770 
3771  KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3772  "plain=%u\n",
3773  gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3774  root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3775  KMP_INIT_BARRIER_STATE));
3776  { // Initialize barrier data.
3777  int b;
3778  for (b = 0; b < bs_last_barrier; ++b) {
3779  root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3780 #if USE_DEBUGGER
3781  root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3782 #endif
3783  }
3784  }
3785  KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3786  KMP_INIT_BARRIER_STATE);
3787 
3788 #if KMP_AFFINITY_SUPPORTED
3789 #if OMP_40_ENABLED
3790  root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3791  root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3792  root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3793  root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3794 #endif
3795 
3796  if (TCR_4(__kmp_init_middle)) {
3797  __kmp_affinity_set_init_mask(gtid, TRUE);
3798  }
3799 #endif /* KMP_AFFINITY_SUPPORTED */
3800 
3801  __kmp_root_counter++;
3802 
3803 #if OMPT_SUPPORT
3804  if (!initial_thread && ompt_enabled.enabled) {
3805 
3806  ompt_thread_t *root_thread = ompt_get_thread();
3807 
3808  ompt_set_thread_state(root_thread, omp_state_overhead);
3809 
3810  if (ompt_enabled.ompt_callback_thread_begin) {
3811  ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3812  ompt_thread_initial, __ompt_get_thread_data_internal());
3813  }
3814  ompt_data_t *task_data;
3815  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL, NULL);
3816  if (ompt_enabled.ompt_callback_task_create) {
3817  ompt_callbacks.ompt_callback(ompt_callback_task_create)(
3818  NULL, NULL, task_data, ompt_task_initial, 0, NULL);
3819  // initial task has nothing to return to
3820  }
3821 
3822  ompt_set_thread_state(root_thread, omp_state_work_serial);
3823  }
3824 #endif
3825 
3826  KMP_MB();
3827  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3828 
3829  return gtid;
3830 }
3831 
3832 #if KMP_NESTED_HOT_TEAMS
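// Recursively frees the nested hot teams kept by thread 'thr' at nesting
// level 'level' and deeper, and returns the number of worker threads released
// (each team's master thread is not freed here).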
3833 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3834  const int max_level) {
3835  int i, n, nth;
3836  kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3837  if (!hot_teams || !hot_teams[level].hot_team) {
3838  return 0;
3839  }
3840  KMP_DEBUG_ASSERT(level < max_level);
3841  kmp_team_t *team = hot_teams[level].hot_team;
3842  nth = hot_teams[level].hot_team_nth;
3843  n = nth - 1; // master is not freed
3844  if (level < max_level - 1) {
3845  for (i = 0; i < nth; ++i) {
3846  kmp_info_t *th = team->t.t_threads[i];
3847  n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3848  if (i > 0 && th->th.th_hot_teams) {
3849  __kmp_free(th->th.th_hot_teams);
3850  th->th.th_hot_teams = NULL;
3851  }
3852  }
3853  }
3854  __kmp_free_team(root, team, NULL);
3855  return n;
3856 }
3857 #endif
3858 
3859 // Resets a root thread and clears its root and hot teams.
3860 // Returns the number of __kmp_threads entries directly and indirectly freed.
3861 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
3862  kmp_team_t *root_team = root->r.r_root_team;
3863  kmp_team_t *hot_team = root->r.r_hot_team;
3864  int n = hot_team->t.t_nproc;
3865  int i;
3866 
3867  KMP_DEBUG_ASSERT(!root->r.r_active);
3868 
3869  root->r.r_root_team = NULL;
3870  root->r.r_hot_team = NULL;
3871  // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
3872  // before call to __kmp_free_team().
3873  __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
3874 #if KMP_NESTED_HOT_TEAMS
3875  if (__kmp_hot_teams_max_level >
3876  0) { // need to free nested hot teams and their threads if any
3877  for (i = 0; i < hot_team->t.t_nproc; ++i) {
3878  kmp_info_t *th = hot_team->t.t_threads[i];
3879  if (__kmp_hot_teams_max_level > 1) {
3880  n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
3881  }
3882  if (th->th.th_hot_teams) {
3883  __kmp_free(th->th.th_hot_teams);
3884  th->th.th_hot_teams = NULL;
3885  }
3886  }
3887  }
3888 #endif
3889  __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
3890 
3891  // Before we can reap the thread, we need to make certain that all other
3892  // threads in the teams that had this root as ancestor have stopped trying to
3893  // steal tasks.
3894  if (__kmp_tasking_mode != tskm_immediate_exec) {
3895  __kmp_wait_to_unref_task_teams();
3896  }
3897 
3898 #if KMP_OS_WINDOWS
3899  /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3900  KA_TRACE(
3901  10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
3902  "\n",
3903  (LPVOID) & (root->r.r_uber_thread->th),
3904  root->r.r_uber_thread->th.th_info.ds.ds_thread));
3905  __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
3906 #endif /* KMP_OS_WINDOWS */
3907 
3908 #if OMPT_SUPPORT
3909  if (ompt_enabled.ompt_callback_thread_end) {
3910  ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
3911  &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
3912  }
3913 #endif
3914 
3915  TCW_4(__kmp_nth,
3916  __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
3917  root->r.r_cg_nthreads--;
3918 
3919  __kmp_reap_thread(root->r.r_uber_thread, 1);
3920 
3921  // We cannot put the root thread into __kmp_thread_pool, so we have to reap it
3922  // instead of freeing it.
3923  root->r.r_uber_thread = NULL;
3924  /* mark root as no longer in use */
3925  root->r.r_begin = FALSE;
3926 
3927  return n;
3928 }
3929 
3930 void __kmp_unregister_root_current_thread(int gtid) {
3931  KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
3932  /* this lock should be ok, since unregister_root_current_thread is never
3933  called during an abort, only during a normal close. furthermore, if you
3934  have the forkjoin lock, you should never try to get the initz lock */
3935  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3936  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
3937  KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
3938  "exiting T#%d\n",
3939  gtid));
3940  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3941  return;
3942  }
3943  kmp_root_t *root = __kmp_root[gtid];
3944 
3945  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
3946  KMP_ASSERT(KMP_UBER_GTID(gtid));
3947  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
3948  KMP_ASSERT(root->r.r_active == FALSE);
3949 
3950  KMP_MB();
3951 
3952 #if OMP_45_ENABLED
3953  kmp_info_t *thread = __kmp_threads[gtid];
3954  kmp_team_t *team = thread->th.th_team;
3955  kmp_task_team_t *task_team = thread->th.th_task_team;
3956 
3957  // we need to wait for the proxy tasks before finishing the thread
3958  if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) {
3959 #if OMPT_SUPPORT
3960  // the runtime is shutting down so we won't report any events
3961  thread->th.ompt_thread_info.state = omp_state_undefined;
3962 #endif
3963  __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
3964  }
3965 #endif
3966 
3967  __kmp_reset_root(gtid, root);
3968 
3969  /* free up this thread slot */
3970  __kmp_gtid_set_specific(KMP_GTID_DNE);
3971 #ifdef KMP_TDATA_GTID
3972  __kmp_gtid = KMP_GTID_DNE;
3973 #endif
3974 
3975  KMP_MB();
3976  KC_TRACE(10,
3977  ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
3978 
3979  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3980 }
3981 
3982 #if KMP_OS_WINDOWS
3983 /* __kmp_forkjoin_lock must be already held
3984  Unregisters a root thread that is not the current thread. Returns the number
3985  of __kmp_threads entries freed as a result. */
3986 static int __kmp_unregister_root_other_thread(int gtid) {
3987  kmp_root_t *root = __kmp_root[gtid];
3988  int r;
3989 
3990  KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
3991  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
3992  KMP_ASSERT(KMP_UBER_GTID(gtid));
3993  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
3994  KMP_ASSERT(root->r.r_active == FALSE);
3995 
3996  r = __kmp_reset_root(gtid, root);
3997  KC_TRACE(10,
3998  ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
3999  return r;
4000 }
4001 #endif
4002 
4003 #if KMP_DEBUG
4004 void __kmp_task_info() {
4005 
4006  kmp_int32 gtid = __kmp_entry_gtid();
4007  kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4008  kmp_info_t *this_thr = __kmp_threads[gtid];
4009  kmp_team_t *steam = this_thr->th.th_serial_team;
4010  kmp_team_t *team = this_thr->th.th_team;
4011 
4012  __kmp_printf("__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p curtask=%p "
4013  "ptask=%p\n",
4014  gtid, tid, this_thr, team, this_thr->th.th_current_task,
4015  team->t.t_implicit_task_taskdata[tid].td_parent);
4016 }
4017 #endif // KMP_DEBUG
4018 
4019 /* TODO optimize with one big memclr, take out what isn't needed, split
4020  responsibility to workers as much as possible, and delay initialization of
4021  features as much as possible */
4022 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4023  int tid, int gtid) {
4024  /* this_thr->th.th_info.ds.ds_gtid is setup in
4025  kmp_allocate_thread/create_worker.
4026  this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4027  kmp_info_t *master = team->t.t_threads[0];
4028  KMP_DEBUG_ASSERT(this_thr != NULL);
4029  KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4030  KMP_DEBUG_ASSERT(team);
4031  KMP_DEBUG_ASSERT(team->t.t_threads);
4032  KMP_DEBUG_ASSERT(team->t.t_dispatch);
4033  KMP_DEBUG_ASSERT(master);
4034  KMP_DEBUG_ASSERT(master->th.th_root);
4035 
4036  KMP_MB();
4037 
4038  TCW_SYNC_PTR(this_thr->th.th_team, team);
4039 
4040  this_thr->th.th_info.ds.ds_tid = tid;
4041  this_thr->th.th_set_nproc = 0;
4042  if (__kmp_tasking_mode != tskm_immediate_exec)
4043  // When tasking is possible, threads are not safe to reap until they are
4044  // done tasking; this will be set when tasking code is exited in wait
4045  this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4046  else // no tasking --> always safe to reap
4047  this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4048 #if OMP_40_ENABLED
4049  this_thr->th.th_set_proc_bind = proc_bind_default;
4050 #if KMP_AFFINITY_SUPPORTED
4051  this_thr->th.th_new_place = this_thr->th.th_current_place;
4052 #endif
4053 #endif
4054  this_thr->th.th_root = master->th.th_root;
4055 
4056  /* setup the thread's cache of the team structure */
4057  this_thr->th.th_team_nproc = team->t.t_nproc;
4058  this_thr->th.th_team_master = master;
4059  this_thr->th.th_team_serialized = team->t.t_serialized;
4060  TCW_PTR(this_thr->th.th_sleep_loc, NULL);
4061 
4062  KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4063 
4064  KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4065  tid, gtid, this_thr, this_thr->th.th_current_task));
4066 
4067  __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4068  team, tid, TRUE);
4069 
4070  KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4071  tid, gtid, this_thr, this_thr->th.th_current_task));
4072  // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4073  // __kmp_initialize_team()?
4074 
4075  /* TODO no worksharing in speculative threads */
4076  this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4077 
4078  this_thr->th.th_local.this_construct = 0;
4079 
4080  if (!this_thr->th.th_pri_common) {
4081  this_thr->th.th_pri_common =
4082  (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4083  if (__kmp_storage_map) {
4084  __kmp_print_storage_map_gtid(
4085  gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4086  sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4087  }
4088  this_thr->th.th_pri_head = NULL;
4089  }
4090 
4091  /* Initialize dynamic dispatch */
4092  {
4093  volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4094  // Use team max_nproc since this will never change for the team.
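    // A serialized team (t_max_nproc == 1) never needs more than one private
    // dispatch buffer; otherwise allocate __kmp_dispatch_num_buffers of them
    // to match the team's shared buffers.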
4095  size_t disp_size =
4096  sizeof(dispatch_private_info_t) *
4097  (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4098  KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4099  team->t.t_max_nproc));
4100  KMP_ASSERT(dispatch);
4101  KMP_DEBUG_ASSERT(team->t.t_dispatch);
4102  KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4103 
4104  dispatch->th_disp_index = 0;
4105 #if OMP_45_ENABLED
4106  dispatch->th_doacross_buf_idx = 0;
4107 #endif
4108  if (!dispatch->th_disp_buffer) {
4109  dispatch->th_disp_buffer =
4110  (dispatch_private_info_t *)__kmp_allocate(disp_size);
4111 
4112  if (__kmp_storage_map) {
4113  __kmp_print_storage_map_gtid(
4114  gtid, &dispatch->th_disp_buffer[0],
4115  &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4116  ? 1
4117  : __kmp_dispatch_num_buffers],
4118  disp_size, "th_%d.th_dispatch.th_disp_buffer "
4119  "(team_%d.t_dispatch[%d].th_disp_buffer)",
4120  gtid, team->t.t_id, gtid);
4121  }
4122  } else {
4123  memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4124  }
4125 
4126  dispatch->th_dispatch_pr_current = 0;
4127  dispatch->th_dispatch_sh_current = 0;
4128 
4129  dispatch->th_deo_fcn = 0; /* ORDERED */
4130  dispatch->th_dxo_fcn = 0; /* END ORDERED */
4131  }
4132 
4133  this_thr->th.th_next_pool = NULL;
4134 
4135  if (!this_thr->th.th_task_state_memo_stack) {
4136  size_t i;
4137  this_thr->th.th_task_state_memo_stack =
4138  (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4139  this_thr->th.th_task_state_top = 0;
4140  this_thr->th.th_task_state_stack_sz = 4;
4141  for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4142  ++i) // zero init the stack
4143  this_thr->th.th_task_state_memo_stack[i] = 0;
4144  }
4145 
4146  KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4147  KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4148 
4149  KMP_MB();
4150 }
4151 
4152 /* allocate a new thread for the requesting team. this is only called from
4153  within a forkjoin critical section. we will first try to get an available
4154  thread from the thread pool. if none is available, we will fork a new one
4155  assuming we are able to create a new one. this should be assured, as the
4156  caller should check on this first. */
4157 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4158  int new_tid) {
4159  kmp_team_t *serial_team;
4160  kmp_info_t *new_thr;
4161  int new_gtid;
4162 
4163  KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4164  KMP_DEBUG_ASSERT(root && team);
4165 #if !KMP_NESTED_HOT_TEAMS
4166  KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4167 #endif
4168  KMP_MB();
4169 
4170  /* first, try to get one from the thread pool */
4171  if (__kmp_thread_pool) {
4172 
4173  new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4174  __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4175  if (new_thr == __kmp_thread_pool_insert_pt) {
4176  __kmp_thread_pool_insert_pt = NULL;
4177  }
4178  TCW_4(new_thr->th.th_in_pool, FALSE);
4179  // Don't touch th_active_in_pool or th_active.
4180  // The worker thread adjusts those flags as it sleeps/awakens.
4181  __kmp_thread_pool_nth--;
4182 
4183  KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4184  __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4185  KMP_ASSERT(!new_thr->th.th_team);
4186  KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4187  KMP_DEBUG_ASSERT(__kmp_thread_pool_nth >= 0);
4188 
4189  /* setup the thread structure */
4190  __kmp_initialize_info(new_thr, team, new_tid,
4191  new_thr->th.th_info.ds.ds_gtid);
4192  KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4193 
4194  TCW_4(__kmp_nth, __kmp_nth + 1);
4195  root->r.r_cg_nthreads++;
4196 
4197  new_thr->th.th_task_state = 0;
4198  new_thr->th.th_task_state_top = 0;
4199  new_thr->th.th_task_state_stack_sz = 4;
4200 
4201 #ifdef KMP_ADJUST_BLOCKTIME
4202  /* Adjust blocktime back to zero if necessary */
4203  /* Middle initialization might not have occurred yet */
4204  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4205  if (__kmp_nth > __kmp_avail_proc) {
4206  __kmp_zero_bt = TRUE;
4207  }
4208  }
4209 #endif /* KMP_ADJUST_BLOCKTIME */
4210 
4211 #if KMP_DEBUG
4212  // If the thread entered the pool via __kmp_free_thread, wait_flag should not
4213  // equal KMP_BARRIER_PARENT_FLAG.
4214  int b;
4215  kmp_balign_t *balign = new_thr->th.th_bar;
4216  for (b = 0; b < bs_last_barrier; ++b)
4217  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4218 #endif
4219 
4220  KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4221  __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4222 
4223  KMP_MB();
4224  return new_thr;
4225  }
4226 
4227  /* no, we'll fork a new one */
4228  KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4229  KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4230 
4231 #if KMP_USE_MONITOR
4232  // If this is the first worker thread the RTL is creating, then also
4233  // launch the monitor thread. We try to do this as early as possible.
4234  if (!TCR_4(__kmp_init_monitor)) {
4235  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4236  if (!TCR_4(__kmp_init_monitor)) {
4237  KF_TRACE(10, ("before __kmp_create_monitor\n"));
4238  TCW_4(__kmp_init_monitor, 1);
4239  __kmp_create_monitor(&__kmp_monitor);
4240  KF_TRACE(10, ("after __kmp_create_monitor\n"));
4241 #if KMP_OS_WINDOWS
4242  // AC: wait until monitor has started. This is a fix for CQ232808.
4243  // The reason is that if the library is loaded/unloaded in a loop with
4244  // small (parallel) work in between, then there is high probability that
4245  // monitor thread started after the library shutdown. At shutdown it is
4246  // too late to cope with the problem, because when the master is in
4247  // DllMain (process detach) the monitor has no chances to start (it is
4248  // blocked), and master has no means to inform the monitor that the
4249  // library has gone, because all the memory which the monitor can access
4250  // is going to be released/reset.
4251  while (TCR_4(__kmp_init_monitor) < 2) {
4252  KMP_YIELD(TRUE);
4253  }
4254  KF_TRACE(10, ("after monitor thread has started\n"));
4255 #endif
4256  }
4257  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4258  }
4259 #endif
4260 
4261  KMP_MB();
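  // Find the first free gtid; slot 0 is reserved for the initial thread, so
  // worker gtids start at 1.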
4262  for (new_gtid = 1; TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid) {
4263  KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4264  }
4265 
4266  /* allocate space for it. */
4267  new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4268 
4269  TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4270 
4271  if (__kmp_storage_map) {
4272  __kmp_print_thread_storage_map(new_thr, new_gtid);
4273  }
4274 
4275  // add the reserve serialized team, initialized from the team's master thread
4276  {
4277  kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4278  KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4279  new_thr->th.th_serial_team = serial_team =
4280  (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4281 #if OMPT_SUPPORT
4282  ompt_data_none, // root parallel id
4283 #endif
4284 #if OMP_40_ENABLED
4285  proc_bind_default,
4286 #endif
4287  &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
4288  }
4289  KMP_ASSERT(serial_team);
4290  serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for
4291  // execution (it is unused for now).
4292  serial_team->t.t_threads[0] = new_thr;
4293  KF_TRACE(10,
4294  ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4295  new_thr));
4296 
4297  /* setup the thread structures */
4298  __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4299 
4300 #if USE_FAST_MEMORY
4301  __kmp_initialize_fast_memory(new_thr);
4302 #endif /* USE_FAST_MEMORY */
4303 
4304 #if KMP_USE_BGET
4305  KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4306  __kmp_initialize_bget(new_thr);
4307 #endif
4308 
4309  __kmp_init_random(new_thr); // Initialize random number generator
4310 
4311  /* Initialize these only once when thread is grabbed for a team allocation */
4312  KA_TRACE(20,
4313  ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4314  __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4315 
4316  int b;
4317  kmp_balign_t *balign = new_thr->th.th_bar;
4318  for (b = 0; b < bs_last_barrier; ++b) {
4319  balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4320  balign[b].bb.team = NULL;
4321  balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4322  balign[b].bb.use_oncore_barrier = 0;
4323  }
4324 
4325  new_thr->th.th_spin_here = FALSE;
4326  new_thr->th.th_next_waiting = 0;
4327 
4328 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
4329  new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4330  new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4331  new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4332  new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4333 #endif
4334 
4335  TCW_4(new_thr->th.th_in_pool, FALSE);
4336  new_thr->th.th_active_in_pool = FALSE;
4337  TCW_4(new_thr->th.th_active, TRUE);
4338 
4339  /* adjust the global counters */
4340  __kmp_all_nth++;
4341  __kmp_nth++;
4342 
4343  root->r.r_cg_nthreads++;
4344 
4345  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4346  // numbers of procs, and method #2 (keyed API call) for higher numbers.
4347  if (__kmp_adjust_gtid_mode) {
4348  if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4349  if (TCR_4(__kmp_gtid_mode) != 2) {
4350  TCW_4(__kmp_gtid_mode, 2);
4351  }
4352  } else {
4353  if (TCR_4(__kmp_gtid_mode) != 1) {
4354  TCW_4(__kmp_gtid_mode, 1);
4355  }
4356  }
4357  }
4358 
4359 #ifdef KMP_ADJUST_BLOCKTIME
4360  /* Adjust blocktime back to zero if necessary */
4361  /* Middle initialization might not have occurred yet */
4362  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4363  if (__kmp_nth > __kmp_avail_proc) {
4364  __kmp_zero_bt = TRUE;
4365  }
4366  }
4367 #endif /* KMP_ADJUST_BLOCKTIME */
4368 
4369  /* actually fork it and create the new worker thread */
4370  KF_TRACE(
4371  10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4372  __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4373  KF_TRACE(10,
4374  ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4375 
4376  KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4377  new_gtid));
4378  KMP_MB();
4379  return new_thr;
4380 }
4381 
4382 /* Reinitialize team for reuse.
4383  The hot team code calls this at every fork barrier, so the EPCC barrier
4384  tests are extremely sensitive to changes in it, esp. writes to the team
4385  struct, which cause a cache invalidation in all threads.
4386  IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4387 static void __kmp_reinitialize_team(kmp_team_t *team,
4388  kmp_internal_control_t *new_icvs,
4389  ident_t *loc) {
4390  KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4391  team->t.t_threads[0], team));
4392  KMP_DEBUG_ASSERT(team && new_icvs);
4393  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4394  KMP_CHECK_UPDATE(team->t.t_ident, loc);
4395 
4396  KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4397  // Copy ICVs to the master thread's implicit taskdata
4398  __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4399  copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4400 
4401  KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4402  team->t.t_threads[0], team));
4403 }
4404 
4405 /* Initialize the team data structure.
4406  This assumes the t_threads and t_max_nproc are already set.
4407  Also, we don't touch the arguments */
4408 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4409  kmp_internal_control_t *new_icvs,
4410  ident_t *loc) {
4411  KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4412 
4413  /* verify */
4414  KMP_DEBUG_ASSERT(team);
4415  KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4416  KMP_DEBUG_ASSERT(team->t.t_threads);
4417  KMP_MB();
4418 
4419  team->t.t_master_tid = 0; /* not needed */
4420  /* team->t.t_master_bar; not needed */
4421  team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4422  team->t.t_nproc = new_nproc;
4423 
4424  /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */
4425  team->t.t_next_pool = NULL;
4426  /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4427  * up hot team */
4428 
4429  TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4430  team->t.t_invoke = NULL; /* not needed */
4431 
4432  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4433  team->t.t_sched.sched = new_icvs->sched.sched;
4434 
4435 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4436  team->t.t_fp_control_saved = FALSE; /* not needed */
4437  team->t.t_x87_fpu_control_word = 0; /* not needed */
4438  team->t.t_mxcsr = 0; /* not needed */
4439 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4440 
4441  team->t.t_construct = 0;
4442 
4443  team->t.t_ordered.dt.t_value = 0;
4444  team->t.t_master_active = FALSE;
4445 
4446  memset(&team->t.t_taskq, '\0', sizeof(kmp_taskq_t));
4447 
4448 #ifdef KMP_DEBUG
4449  team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4450 #endif
4451  team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4452 
4453  team->t.t_control_stack_top = NULL;
4454 
4455  __kmp_reinitialize_team(team, new_icvs, loc);
4456 
4457  KMP_MB();
4458  KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4459 }
4460 
4461 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4462 /* Sets full mask for thread and returns old mask, no changes to structures. */
4463 static void
4464 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4465  if (KMP_AFFINITY_CAPABLE()) {
4466  int status;
4467  if (old_mask != NULL) {
4468  status = __kmp_get_system_affinity(old_mask, TRUE);
4469  int error = errno;
4470  if (status != 0) {
4471  __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
4472  __kmp_msg_null);
4473  }
4474  }
4475  __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4476  }
4477 }
4478 #endif
4479 
4480 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
4481 
4482 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4483 // It calculates the worker + master thread's partition based upon the parent
4484 // thread's partition, and binds each worker to a place in its partition.
4485 // The master thread's partition should already include its current binding.
4486 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4487  // Copy the master thread's place partition to the team struct
4488  kmp_info_t *master_th = team->t.t_threads[0];
4489  KMP_DEBUG_ASSERT(master_th != NULL);
4490  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4491  int first_place = master_th->th.th_first_place;
4492  int last_place = master_th->th.th_last_place;
4493  int masters_place = master_th->th.th_current_place;
4494  team->t.t_first_place = first_place;
4495  team->t.t_last_place = last_place;
4496 
4497  KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4498  "bound to place %d partition = [%d,%d]\n",
4499  proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4500  team->t.t_id, masters_place, first_place, last_place));
4501 
4502  switch (proc_bind) {
4503 
4504  case proc_bind_default:
4505  // serial teams might have the proc_bind policy set to proc_bind_default. It
4506  // doesn't matter, as we don't rebind the master thread for any proc_bind policy
4507  KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4508  break;
4509 
4510  case proc_bind_master: {
4511  int f;
4512  int n_th = team->t.t_nproc;
4513  for (f = 1; f < n_th; f++) {
4514  kmp_info_t *th = team->t.t_threads[f];
4515  KMP_DEBUG_ASSERT(th != NULL);
4516  th->th.th_first_place = first_place;
4517  th->th.th_last_place = last_place;
4518  th->th.th_new_place = masters_place;
4519 
4520  KA_TRACE(100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d "
4521  "partition = [%d,%d]\n",
4522  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4523  f, masters_place, first_place, last_place));
4524  }
4525  } break;
4526 
4527  case proc_bind_close: {
4528  int f;
4529  int n_th = team->t.t_nproc;
4530  int n_places;
4531  if (first_place <= last_place) {
4532  n_places = last_place - first_place + 1;
4533  } else {
4534  n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4535  }
4536  if (n_th <= n_places) {
4537  int place = masters_place;
4538  for (f = 1; f < n_th; f++) {
4539  kmp_info_t *th = team->t.t_threads[f];
4540  KMP_DEBUG_ASSERT(th != NULL);
4541 
4542  if (place == last_place) {
4543  place = first_place;
4544  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4545  place = 0;
4546  } else {
4547  place++;
4548  }
4549  th->th.th_first_place = first_place;
4550  th->th.th_last_place = last_place;
4551  th->th.th_new_place = place;
4552 
4553  KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4554  "partition = [%d,%d]\n",
4555  __kmp_gtid_from_thread(team->t.t_threads[f]),
4556  team->t.t_id, f, place, first_place, last_place));
4557  }
4558  } else {
4559  int S, rem, gap, s_count;
4560  S = n_th / n_places;
4561  s_count = 0;
4562  rem = n_th - (S * n_places);
4563  gap = rem > 0 ? n_places / rem : n_places;
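  // e.g. 10 threads over 4 places gives S = 2, rem = 2, gap = 2: every second
  // place receives one extra (S + 1) thread until the remainder is used up.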
4564  int place = masters_place;
4565  int gap_ct = gap;
4566  for (f = 0; f < n_th; f++) {
4567  kmp_info_t *th = team->t.t_threads[f];
4568  KMP_DEBUG_ASSERT(th != NULL);
4569 
4570  th->th.th_first_place = first_place;
4571  th->th.th_last_place = last_place;
4572  th->th.th_new_place = place;
4573  s_count++;
4574 
4575  if ((s_count == S) && rem && (gap_ct == gap)) {
4576  // do nothing; an extra thread will be added to this place on the next iteration
4577  } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4578  // we added an extra thread to this place; move to next place
4579  if (place == last_place) {
4580  place = first_place;
4581  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4582  place = 0;
4583  } else {
4584  place++;
4585  }
4586  s_count = 0;
4587  gap_ct = 1;
4588  rem--;
4589  } else if (s_count == S) { // place full; don't add extra
4590  if (place == last_place) {
4591  place = first_place;
4592  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4593  place = 0;
4594  } else {
4595  place++;
4596  }
4597  gap_ct++;
4598  s_count = 0;
4599  }
4600 
4601  KA_TRACE(100,
4602  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4603  "partition = [%d,%d]\n",
4604  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4605  th->th.th_new_place, first_place, last_place));
4606  }
4607  KMP_DEBUG_ASSERT(place == masters_place);
4608  }
4609  } break;
4610 
4611  case proc_bind_spread: {
4612  int f;
4613  int n_th = team->t.t_nproc;
4614  int n_places;
4615  int thidx;
4616  if (first_place <= last_place) {
4617  n_places = last_place - first_place + 1;
4618  } else {
4619  n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4620  }
4621  if (n_th <= n_places) {
4622  int place = -1;
4623 
4624  if (n_places != static_cast<int>(__kmp_affinity_num_masks)) {
4625  int S = n_places / n_th;
4626  int s_count, rem, gap, gap_ct;
4627 
4628  place = masters_place;
4629  rem = n_places - n_th * S;
4630  gap = rem ? n_th / rem : 1;
4631  gap_ct = gap;
4632  thidx = n_th;
4633  if (update_master_only == 1)
4634  thidx = 1;
4635  for (f = 0; f < thidx; f++) {
4636  kmp_info_t *th = team->t.t_threads[f];
4637  KMP_DEBUG_ASSERT(th != NULL);
4638 
4639  th->th.th_first_place = place;
4640  th->th.th_new_place = place;
4641  s_count = 1;
4642  while (s_count < S) {
4643  if (place == last_place) {
4644  place = first_place;
4645  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4646  place = 0;
4647  } else {
4648  place++;
4649  }
4650  s_count++;
4651  }
4652  if (rem && (gap_ct == gap)) {
4653  if (place == last_place) {
4654  place = first_place;
4655  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4656  place = 0;
4657  } else {
4658  place++;
4659  }
4660  rem--;
4661  gap_ct = 0;
4662  }
4663  th->th.th_last_place = place;
4664  gap_ct++;
4665 
4666  if (place == last_place) {
4667  place = first_place;
4668  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4669  place = 0;
4670  } else {
4671  place++;
4672  }
4673 
4674  KA_TRACE(100,
4675  ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4676  "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
4677  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4678  f, th->th.th_new_place, th->th.th_first_place,
4679  th->th.th_last_place, __kmp_affinity_num_masks));
4680  }
4681  } else {
4682  /* With a uniform space of available computation places, we can create
4683  T partitions of roughly P/T places each and put each thread into the
4684  first place of its partition. */
4685  double current = static_cast<double>(masters_place);
4686  double spacing =
4687  (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
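  // Worked example for illustration only (assumes masters_place == 0): with
  // n_places = 10 and n_th = 4, spacing = 11/4 = 2.75, and the loop below
  // produces the partitions [0,1], [2,4], [5,7], [8,9], each thread placed
  // on the first place of its partition.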
4688  int first, last;
4689  kmp_info_t *th;
4690 
4691  thidx = n_th + 1;
4692  if (update_master_only == 1)
4693  thidx = 1;
4694  for (f = 0; f < thidx; f++) {
4695  first = static_cast<int>(current);
4696  last = static_cast<int>(current + spacing) - 1;
4697  KMP_DEBUG_ASSERT(last >= first);
4698  if (first >= n_places) {
4699  if (masters_place) {
4700  first -= n_places;
4701  last -= n_places;
4702  if (first == (masters_place + 1)) {
4703  KMP_DEBUG_ASSERT(f == n_th);
4704  first--;
4705  }
4706  if (last == masters_place) {
4707  KMP_DEBUG_ASSERT(f == (n_th - 1));
4708  last--;
4709  }
4710  } else {
4711  KMP_DEBUG_ASSERT(f == n_th);
4712  first = 0;
4713  last = 0;
4714  }
4715  }
4716  if (last >= n_places) {
4717  last = (n_places - 1);
4718  }
4719  place = first;
4720  current += spacing;
4721  if (f < n_th) {
4722  KMP_DEBUG_ASSERT(0 <= first);
4723  KMP_DEBUG_ASSERT(n_places > first);
4724  KMP_DEBUG_ASSERT(0 <= last);
4725  KMP_DEBUG_ASSERT(n_places > last);
4726  KMP_DEBUG_ASSERT(last_place >= first_place);
4727  th = team->t.t_threads[f];
4728  KMP_DEBUG_ASSERT(th);
4729  th->th.th_first_place = first;
4730  th->th.th_new_place = place;
4731  th->th.th_last_place = last;
4732 
4733  KA_TRACE(100,
4734  ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4735  "partition = [%d,%d], spacing = %.4f\n",
4736  __kmp_gtid_from_thread(team->t.t_threads[f]),
4737  team->t.t_id, f, th->th.th_new_place,
4738  th->th.th_first_place, th->th.th_last_place, spacing));
4739  }
4740  }
4741  }
4742  KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4743  } else {
4744  int S, rem, gap, s_count;
4745  S = n_th / n_places;
4746  s_count = 0;
4747  rem = n_th - (S * n_places);
4748  gap = rem > 0 ? n_places / rem : n_places;
4749  int place = masters_place;
4750  int gap_ct = gap;
4751  thidx = n_th;
4752  if (update_master_only == 1)
4753  thidx = 1;
4754  for (f = 0; f < thidx; f++) {
4755  kmp_info_t *th = team->t.t_threads[f];
4756  KMP_DEBUG_ASSERT(th != NULL);
4757 
4758  th->th.th_first_place = place;
4759  th->th.th_last_place = place;
4760  th->th.th_new_place = place;
4761  s_count++;
4762 
4763  if ((s_count == S) && rem && (gap_ct == gap)) {
4764  // do nothing, add an extra thread to place on next iteration
4765  } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4766  // we added an extra thread to this place; move on to next place
4767  if (place == last_place) {
4768  place = first_place;
4769  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4770  place = 0;
4771  } else {
4772  place++;
4773  }
4774  s_count = 0;
4775  gap_ct = 1;
4776  rem--;
4777  } else if (s_count == S) { // place is full; don't add extra thread
4778  if (place == last_place) {
4779  place = first_place;
4780  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4781  place = 0;
4782  } else {
4783  place++;
4784  }
4785  gap_ct++;
4786  s_count = 0;
4787  }
4788 
4789  KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4790  "partition = [%d,%d]\n",
4791  __kmp_gtid_from_thread(team->t.t_threads[f]),
4792  team->t.t_id, f, th->th.th_new_place,
4793  th->th.th_first_place, th->th.th_last_place));
4794  }
4795  KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4796  }
4797  } break;
4798 
4799  default:
4800  break;
4801  }
4802 
4803  KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
4804 }
4805 
4806 #endif /* OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED */
4807 
4808 /* allocate a new team data structure to use. take one off of the free pool if
4809  available */
4810 kmp_team_t *
4811 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
4812 #if OMPT_SUPPORT
4813  ompt_data_t ompt_parallel_data,
4814 #endif
4815 #if OMP_40_ENABLED
4816  kmp_proc_bind_t new_proc_bind,
4817 #endif
4818  kmp_internal_control_t *new_icvs,
4819  int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
4820  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
4821  int f;
4822  kmp_team_t *team;
4823  int use_hot_team = !root->r.r_active;
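  // A hot team is the persistent team cached on the root between parallel
  // regions; it can only be reused when the root is not already active in a
  // parallel region (the KMP_NESTED_HOT_TEAMS block below may refine this
  // decision for nested levels).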
4824  int level = 0;
4825 
4826  KA_TRACE(20, ("__kmp_allocate_team: called\n"));
4827  KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
4828  KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
4829  KMP_MB();
4830 
4831 #if KMP_NESTED_HOT_TEAMS
4832  kmp_hot_team_ptr_t *hot_teams;
4833  if (master) {
4834  team = master->th.th_team;
4835  level = team->t.t_active_level;
4836  if (master->th.th_teams_microtask) { // in teams construct?
4837  if (master->th.th_teams_size.nteams > 1 &&
4838  ( // #teams > 1
4839  team->t.t_pkfn ==
4840  (microtask_t)__kmp_teams_master || // inner fork of the teams
4841  master->th.th_teams_level <
4842  team->t.t_level)) { // or nested parallel inside the teams
4843  ++level; // do not increment if #teams==1 or for the outer fork of the
4844  // teams; increment otherwise
4845  }
4846  }
4847  hot_teams = master->th.th_hot_teams;
4848  if (level < __kmp_hot_teams_max_level && hot_teams &&
4849  hot_teams[level]
4850  .hot_team) { // hot team has already been allocated for given level
4851  use_hot_team = 1;
4852  } else {
4853  use_hot_team = 0;
4854  }
4855  }
4856 #endif
4857  // Optimization to use a "hot" team
4858  if (use_hot_team && new_nproc > 1) {
4859  KMP_DEBUG_ASSERT(new_nproc == max_nproc);
4860 #if KMP_NESTED_HOT_TEAMS
4861  team = hot_teams[level].hot_team;
4862 #else
4863  team = root->r.r_hot_team;
4864 #endif
4865 #if KMP_DEBUG
4866  if (__kmp_tasking_mode != tskm_immediate_exec) {
4867  KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
4868  "task_team[1] = %p before reinit\n",
4869  team->t.t_task_team[0], team->t.t_task_team[1]));
4870  }
4871 #endif
4872 
4873  // Has the number of threads changed?
4874  /* Let's assume the most common case is that the number of threads is
4875  unchanged, and put that case first. */
4876  if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
4877  KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
4878  // This case can mean that omp_set_num_threads() was called and the hot
4879  // team size was already reduced, so we check the special flag
4880  if (team->t.t_size_changed == -1) {
4881  team->t.t_size_changed = 1;
4882  } else {
4883  KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
4884  }
4885 
4886  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4887  kmp_r_sched_t new_sched = new_icvs->sched;
4888  // set master's schedule as new run-time schedule
4889  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
4890 
4891  __kmp_reinitialize_team(team, new_icvs,
4892  root->r.r_uber_thread->th.th_ident);
4893 
4894  KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
4895  team->t.t_threads[0], team));
4896  __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
4897 
4898 #if OMP_40_ENABLED
4899 #if KMP_AFFINITY_SUPPORTED
4900  if ((team->t.t_size_changed == 0) &&
4901  (team->t.t_proc_bind == new_proc_bind)) {
4902  if (new_proc_bind == proc_bind_spread) {
4903  __kmp_partition_places(
4904  team, 1); // add flag to update only master for spread
4905  }
4906  KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
4907  "proc_bind = %d, partition = [%d,%d]\n",
4908  team->t.t_id, new_proc_bind, team->t.t_first_place,
4909  team->t.t_last_place));
4910  } else {
4911  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4912  __kmp_partition_places(team);
4913  }
4914 #else
4915  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4916 #endif /* KMP_AFFINITY_SUPPORTED */
4917 #endif /* OMP_40_ENABLED */
4918  } else if (team->t.t_nproc > new_nproc) {
4919  KA_TRACE(20,
4920  ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
4921  new_nproc));
4922 
4923  team->t.t_size_changed = 1;
4924 #if KMP_NESTED_HOT_TEAMS
4925  if (__kmp_hot_teams_mode == 0) {
4926  // AC: in this mode the saved number of threads should match the team's
4927  // value; it can be bigger in mode 1, when the hot team keeps threads in reserve
4928  KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
4929  hot_teams[level].hot_team_nth = new_nproc;
4930 #endif // KMP_NESTED_HOT_TEAMS
4931  /* release the extra threads we don't need any more */
4932  for (f = new_nproc; f < team->t.t_nproc; f++) {
4933  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
4934  if (__kmp_tasking_mode != tskm_immediate_exec) {
4935  // When decreasing team size, threads no longer in the team should
4936  // unref task team.
4937  team->t.t_threads[f]->th.th_task_team = NULL;
4938  }
4939  __kmp_free_thread(team->t.t_threads[f]);
4940  team->t.t_threads[f] = NULL;
4941  }
4942 #if KMP_NESTED_HOT_TEAMS
4943  } // (__kmp_hot_teams_mode == 0)
4944  else {
4945  // When keeping extra threads in the team, switch them to wait on their own
4946  // b_go flag
4947  for (f = new_nproc; f < team->t.t_nproc; ++f) {
4948  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
4949  kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
4950  for (int b = 0; b < bs_last_barrier; ++b) {
4951  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
4952  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
4953  }
4954  KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
4955  }
4956  }
4957  }
4958 #endif // KMP_NESTED_HOT_TEAMS
4959  team->t.t_nproc = new_nproc;
4960  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4961  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
4962  __kmp_reinitialize_team(team, new_icvs,
4963  root->r.r_uber_thread->th.th_ident);
4964 
4965  /* update the remaining threads */
4966  for (f = 0; f < new_nproc; ++f) {
4967  team->t.t_threads[f]->th.th_team_nproc = new_nproc;
4968  }
4969  // restore the current task state of the master thread: should be the
4970  // implicit task
4971  KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
4972  team->t.t_threads[0], team));
4973 
4974  __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
4975 
4976 #ifdef KMP_DEBUG
4977  for (f = 0; f < team->t.t_nproc; f++) {
4978  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
4979  team->t.t_threads[f]->th.th_team_nproc ==
4980  team->t.t_nproc);
4981  }
4982 #endif
4983 
4984 #if OMP_40_ENABLED
4985  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4986 #if KMP_AFFINITY_SUPPORTED
4987  __kmp_partition_places(team);
4988 #endif
4989 #endif
4990  } else { // team->t.t_nproc < new_nproc
4991 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4992  kmp_affin_mask_t *old_mask;
4993  if (KMP_AFFINITY_CAPABLE()) {
4994  KMP_CPU_ALLOC(old_mask);
4995  }
4996 #endif
4997 
4998  KA_TRACE(20,
4999  ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5000  new_nproc));
5001 
5002  team->t.t_size_changed = 1;
5003 
5004 #if KMP_NESTED_HOT_TEAMS
5005  int avail_threads = hot_teams[level].hot_team_nth;
5006  if (new_nproc < avail_threads)
5007  avail_threads = new_nproc;
5008  kmp_info_t **other_threads = team->t.t_threads;
5009  for (f = team->t.t_nproc; f < avail_threads; ++f) {
5010  // Adjust barrier data of reserved threads (if any) of the team
5011  // Other data will be set in __kmp_initialize_info() below.
5012  int b;
5013  kmp_balign_t *balign = other_threads[f]->th.th_bar;
5014  for (b = 0; b < bs_last_barrier; ++b) {
5015  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5016  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5017 #if USE_DEBUGGER
5018  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5019 #endif
5020  }
5021  }
5022  if (hot_teams[level].hot_team_nth >= new_nproc) {
5023  // we have all needed threads in reserve, no need to allocate any
5024  // this is only possible in mode 1; there can be no reserved threads in mode 0
5025  KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5026  team->t.t_nproc = new_nproc; // just get reserved threads involved
5027  } else {
5028  // we may have some threads in reserve, but not enough
5029  team->t.t_nproc =
5030  hot_teams[level]
5031  .hot_team_nth; // get reserved threads involved if any
5032  hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5033 #endif // KMP_NESTED_HOT_TEAMS
5034  if (team->t.t_max_nproc < new_nproc) {
5035  /* reallocate larger arrays */
5036  __kmp_reallocate_team_arrays(team, new_nproc);
5037  __kmp_reinitialize_team(team, new_icvs, NULL);
5038  }
5039 
5040 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
5041  /* Temporarily set full mask for master thread before creation of
5042  workers. The reason is that workers inherit the affinity from master,
5043  so if a lot of workers are created on a single core quickly, they
5044  don't get a chance to set their own affinity for a long time. */
5045  __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5046 #endif
5047 
5048  /* allocate new threads for the hot team */
5049  for (f = team->t.t_nproc; f < new_nproc; f++) {
5050  kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5051  KMP_DEBUG_ASSERT(new_worker);
5052  team->t.t_threads[f] = new_worker;
5053 
5054  KA_TRACE(20,
5055  ("__kmp_allocate_team: team %d init T#%d arrived: "
5056  "join=%llu, plain=%llu\n",
5057  team->t.t_id, __kmp_gtid_from_tid(f, team),
5058  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5059  team->t.t_bar[bs_plain_barrier].b_arrived));
5060 
5061  { // Initialize barrier data for new threads.
5062  int b;
5063  kmp_balign_t *balign = new_worker->th.th_bar;
5064  for (b = 0; b < bs_last_barrier; ++b) {
5065  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5066  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5067  KMP_BARRIER_PARENT_FLAG);
5068 #if USE_DEBUGGER
5069  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5070 #endif
5071  }
5072  }
5073  }
5074 
5075 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
5076  if (KMP_AFFINITY_CAPABLE()) {
5077  /* Restore initial master thread's affinity mask */
5078  __kmp_set_system_affinity(old_mask, TRUE);
5079  KMP_CPU_FREE(old_mask);
5080  }
5081 #endif
5082 #if KMP_NESTED_HOT_TEAMS
5083  } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5084 #endif // KMP_NESTED_HOT_TEAMS
5085  /* make sure everyone is synchronized */
5086  int old_nproc = team->t.t_nproc; // save old value and use to update only
5087  // new threads below
5088  __kmp_initialize_team(team, new_nproc, new_icvs,
5089  root->r.r_uber_thread->th.th_ident);
5090 
5091  /* reinitialize the threads */
5092  KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5093  for (f = 0; f < team->t.t_nproc; ++f)
5094  __kmp_initialize_info(team->t.t_threads[f], team, f,
5095  __kmp_gtid_from_tid(f, team));
5096  if (level) { // set th_task_state for new threads in nested hot team
5097  // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5098  // only need to set the th_task_state for the new threads. th_task_state
5099  // for master thread will not be accurate until after this in
5100  // __kmp_fork_call(), so we look to the master's memo_stack to get the
5101  // correct value.
5102  for (f = old_nproc; f < team->t.t_nproc; ++f)
5103  team->t.t_threads[f]->th.th_task_state =
5104  team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5105  } else { // set th_task_state for new threads in non-nested hot team
5106  int old_state =
5107  team->t.t_threads[0]->th.th_task_state; // copy master's state
5108  for (f = old_nproc; f < team->t.t_nproc; ++f)
5109  team->t.t_threads[f]->th.th_task_state = old_state;
5110  }
5111 
5112 #ifdef KMP_DEBUG
5113  for (f = 0; f < team->t.t_nproc; ++f) {
5114  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5115  team->t.t_threads[f]->th.th_team_nproc ==
5116  team->t.t_nproc);
5117  }
5118 #endif
5119 
5120 #if OMP_40_ENABLED
5121  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5122 #if KMP_AFFINITY_SUPPORTED
5123  __kmp_partition_places(team);
5124 #endif
5125 #endif
5126  } // Check changes in number of threads
5127 
5128 #if OMP_40_ENABLED
5129  kmp_info_t *master = team->t.t_threads[0];
5130  if (master->th.th_teams_microtask) {
5131  for (f = 1; f < new_nproc; ++f) {
5132  // propagate teams construct specific info to workers
5133  kmp_info_t *thr = team->t.t_threads[f];
5134  thr->th.th_teams_microtask = master->th.th_teams_microtask;
5135  thr->th.th_teams_level = master->th.th_teams_level;
5136  thr->th.th_teams_size = master->th.th_teams_size;
5137  }
5138  }
5139 #endif /* OMP_40_ENABLED */
5140 #if KMP_NESTED_HOT_TEAMS
5141  if (level) {
5142  // Sync barrier state for nested hot teams, not needed for outermost hot
5143  // team.
5144  for (f = 1; f < new_nproc; ++f) {
5145  kmp_info_t *thr = team->t.t_threads[f];
5146  int b;
5147  kmp_balign_t *balign = thr->th.th_bar;
5148  for (b = 0; b < bs_last_barrier; ++b) {
5149  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5150  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5151 #if USE_DEBUGGER
5152  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5153 #endif
5154  }
5155  }
5156  }
5157 #endif // KMP_NESTED_HOT_TEAMS
5158 
5159  /* reallocate space for arguments if necessary */
5160  __kmp_alloc_argv_entries(argc, team, TRUE);
5161  KMP_CHECK_UPDATE(team->t.t_argc, argc);
5162  // The hot team re-uses the previous task team,
5163  // if untouched during the previous release->gather phase.
5164 
5165  KF_TRACE(10, (" hot_team = %p\n", team));
5166 
5167 #if KMP_DEBUG
5168  if (__kmp_tasking_mode != tskm_immediate_exec) {
5169  KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5170  "task_team[1] = %p after reinit\n",
5171  team->t.t_task_team[0], team->t.t_task_team[1]));
5172  }
5173 #endif
5174 
5175 #if OMPT_SUPPORT
5176  __ompt_team_assign_id(team, ompt_parallel_data);
5177 #endif
5178 
5179  KMP_MB();
5180 
5181  return team;
5182  }
5183 
5184  /* next, let's try to take one from the team pool */
5185  KMP_MB();
5186  for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5187  /* TODO: consider resizing undersized teams instead of reaping them, now
5188  that we have a resizing mechanism */
5189  if (team->t.t_max_nproc >= max_nproc) {
5190  /* take this team from the team pool */
5191  __kmp_team_pool = team->t.t_next_pool;
5192 
5193  /* setup the team for fresh use */
5194  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5195 
5196  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5197  "task_team[1] %p to NULL\n",
5198  &team->t.t_task_team[0], &team->t.t_task_team[1]));
5199  team->t.t_task_team[0] = NULL;
5200  team->t.t_task_team[1] = NULL;
5201 
5202  /* reallocate space for arguments if necessary */
5203  __kmp_alloc_argv_entries(argc, team, TRUE);
5204  KMP_CHECK_UPDATE(team->t.t_argc, argc);
5205 
5206  KA_TRACE(
5207  20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5208  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5209  { // Initialize barrier data.
5210  int b;
5211  for (b = 0; b < bs_last_barrier; ++b) {
5212  team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5213 #if USE_DEBUGGER
5214  team->t.t_bar[b].b_master_arrived = 0;
5215  team->t.t_bar[b].b_team_arrived = 0;
5216 #endif
5217  }
5218  }
5219 
5220 #if OMP_40_ENABLED
5221  team->t.t_proc_bind = new_proc_bind;
5222 #endif
5223 
5224  KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5225  team->t.t_id));
5226 
5227 #if OMPT_SUPPORT
5228  __ompt_team_assign_id(team, ompt_parallel_data);
5229 #endif
5230 
5231  KMP_MB();
5232 
5233  return team;
5234  }
5235 
5236  /* reap team if it is too small, then loop back and check the next one */
5237  // not sure if this is wise, but it will be redone during the hot-teams
5238  // rewrite.
5239  /* TODO: Use technique to find the right size hot-team, don't reap them */
5240  team = __kmp_reap_team(team);
5241  __kmp_team_pool = team;
5242  }
5243 
5244  /* nothing available in the pool, no matter, make a new team! */
5245  KMP_MB();
5246  team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5247 
5248  /* and set it up */
5249  team->t.t_max_nproc = max_nproc;
5250  /* NOTE well, for some reason allocating one big buffer and dividing it up
5251  seems to really hurt performance a lot on the P4, so, let's not use this */
5252  __kmp_allocate_team_arrays(team, max_nproc);
5253 
5254  KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5255  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5256 
5257  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5258  "%p to NULL\n",
5259  &team->t.t_task_team[0], &team->t.t_task_team[1]));
5260  team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5261  // memory, no need to duplicate
5262  team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5263  // memory, no need to duplicate
5264 
5265  if (__kmp_storage_map) {
5266  __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5267  }
5268 
5269  /* allocate space for arguments */
5270  __kmp_alloc_argv_entries(argc, team, FALSE);
5271  team->t.t_argc = argc;
5272 
5273  KA_TRACE(20,
5274  ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5275  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5276  { // Initialize barrier data.
5277  int b;
5278  for (b = 0; b < bs_last_barrier; ++b) {
5279  team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5280 #if USE_DEBUGGER
5281  team->t.t_bar[b].b_master_arrived = 0;
5282  team->t.t_bar[b].b_team_arrived = 0;
5283 #endif
5284  }
5285  }
5286 
5287 #if OMP_40_ENABLED
5288  team->t.t_proc_bind = new_proc_bind;
5289 #endif
5290 
5291 #if OMPT_SUPPORT
5292  __ompt_team_assign_id(team, ompt_parallel_data);
5293  team->t.ompt_serialized_team_info = NULL;
5294 #endif
5295 
5296  KMP_MB();
5297 
5298  KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5299  team->t.t_id));
5300 
5301  return team;
5302 }
5303 
5304 /* TODO implement hot-teams at all levels */
5305 /* TODO implement lazy thread release on demand (disband request) */
5306 
5307 /* free the team. return it to the team pool. release all the threads
5308  * associated with it */
5309 void __kmp_free_team(kmp_root_t *root,
5310  kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5311  int f;
5312  KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5313  team->t.t_id));
5314 
5315  /* verify state */
5316  KMP_DEBUG_ASSERT(root);
5317  KMP_DEBUG_ASSERT(team);
5318  KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5319  KMP_DEBUG_ASSERT(team->t.t_threads);
5320 
5321  int use_hot_team = team == root->r.r_hot_team;
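  // Here the team counts as "hot" if it is the root's persistent hot team or,
  // with nested hot teams enabled, one of the per-level cached hot teams
  // checked below; hot teams are kept alive instead of being disbanded.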
5322 #if KMP_NESTED_HOT_TEAMS
5323  int level;
5324  kmp_hot_team_ptr_t *hot_teams;
5325  if (master) {
5326  level = team->t.t_active_level - 1;
5327  if (master->th.th_teams_microtask) { // in teams construct?
5328  if (master->th.th_teams_size.nteams > 1) {
5329  ++level; // level was not increased in teams construct for
5330  // team_of_masters
5331  }
5332  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5333  master->th.th_teams_level == team->t.t_level) {
5334  ++level; // level was not increased in teams construct for
5335  // team_of_workers before the parallel
5336  } // team->t.t_level will be increased inside parallel
5337  }
5338  hot_teams = master->th.th_hot_teams;
5339  if (level < __kmp_hot_teams_max_level) {
5340  KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5341  use_hot_team = 1;
5342  }
5343  }
5344 #endif // KMP_NESTED_HOT_TEAMS
5345 
5346  /* team is done working */
5347  TCW_SYNC_PTR(team->t.t_pkfn,
5348  NULL); // Important for Debugging Support Library.
5349  team->t.t_copyin_counter = 0; // init counter for possible reuse
5350  // Do not reset pointer to parent team to NULL for hot teams.
5351 
5352  /* if this is a non-hot team, release our threads */
5353  if (!use_hot_team) {
5354  if (__kmp_tasking_mode != tskm_immediate_exec) {
5355  // Wait for threads to reach reapable state
5356  for (f = 1; f < team->t.t_nproc; ++f) {
5357  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5358  kmp_info_t *th = team->t.t_threads[f];
5359  volatile kmp_uint32 *state = &th->th.th_reap_state;
5360  while (*state != KMP_SAFE_TO_REAP) {
5361 #if KMP_OS_WINDOWS
5362  // On Windows a thread can be killed at any time, check this
5363  DWORD ecode;
5364  if (!__kmp_is_thread_alive(th, &ecode)) {
5365  *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5366  break;
5367  }
5368 #endif
5369  // first check if thread is sleeping
5370  kmp_flag_64 fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5371  if (fl.is_sleeping())
5372  fl.resume(__kmp_gtid_from_thread(th));
5373  KMP_CPU_PAUSE();
5374  }
5375  }
5376 
5377  // Delete task teams
5378  int tt_idx;
5379  for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5380  kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5381  if (task_team != NULL) {
5382  for (f = 0; f < team->t.t_nproc;
5383  ++f) { // Have all threads unref task teams
5384  team->t.t_threads[f]->th.th_task_team = NULL;
5385  }
5386  KA_TRACE(
5387  20,
5388  ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5389  __kmp_get_gtid(), task_team, team->t.t_id));
5390 #if KMP_NESTED_HOT_TEAMS
5391  __kmp_free_task_team(master, task_team);
5392 #endif
5393  team->t.t_task_team[tt_idx] = NULL;
5394  }
5395  }
5396  }
5397 
5398  // Reset pointer to parent team only for non-hot teams.
5399  team->t.t_parent = NULL;
5400  team->t.t_level = 0;
5401  team->t.t_active_level = 0;
5402 
5403  /* free the worker threads */
5404  for (f = 1; f < team->t.t_nproc; ++f) {
5405  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5406  __kmp_free_thread(team->t.t_threads[f]);
5407  team->t.t_threads[f] = NULL;
5408  }
5409 
5410  /* put the team back in the team pool */
5411  /* TODO limit size of team pool, call reap_team if pool too large */
5412  team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5413  __kmp_team_pool = (volatile kmp_team_t *)team;
5414  }
5415 
5416  KMP_MB();
5417 }
5418 
5419 /* reap the team. destroy it, reclaim all its resources and free its memory */
5420 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5421  kmp_team_t *next_pool = team->t.t_next_pool;
5422 
5423  KMP_DEBUG_ASSERT(team);
5424  KMP_DEBUG_ASSERT(team->t.t_dispatch);
5425  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5426  KMP_DEBUG_ASSERT(team->t.t_threads);
5427  KMP_DEBUG_ASSERT(team->t.t_argv);
5428 
5429  /* TODO clean the threads that are a part of this? */
5430 
5431  /* free stuff */
5432  __kmp_free_team_arrays(team);
5433  if (team->t.t_argv != &team->t.t_inline_argv[0])
5434  __kmp_free((void *)team->t.t_argv);
5435  __kmp_free(team);
5436 
5437  KMP_MB();
5438  return next_pool;
5439 }
5440 
5441 // Free the thread. Don't reap it, just place it on the pool of available
5442 // threads.
5443 //
5444 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5445 // binding for the affinity mechanism to be useful.
5446 //
5447 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5448  // However, we want to avoid the potential performance problem of always
5449  // scanning through the list to find the correct point at which to insert
5450  // the thread (potential N**2 behavior). To do this we keep track of the
5451 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5452 // With single-level parallelism, threads will always be added to the tail
5453 // of the list, kept track of by __kmp_thread_pool_insert_pt. With nested
5454 // parallelism, all bets are off and we may need to scan through the entire
5455 // free list.
5456 //
5457 // This change also has a potentially large performance benefit, for some
5458 // applications. Previously, as threads were freed from the hot team, they
5459 // would be placed back on the free list in inverse order. If the hot team
5460  // grew back to its original size, then the freed threads would be placed
5461  // back on the hot team in reverse order. This could cause bad cache
5462  // locality problems in programs where the size of the hot team regularly
5463  // grew and shrank.
5464 //
5465  // Now, for single-level parallelism, the OMP tid is always == gtid.
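//
// A minimal sketch (not part of the runtime; node_t, head, key and new_node
// are illustrative names) of the sorted-insert pattern used below, assuming a
// singly linked list kept ordered by an integer key:
//
//   node_t **scan = &head;               // or a cached insertion point
//   while (*scan && (*scan)->key < key)  // advance to first node with key >= new key
//     scan = &(*scan)->next;
//   new_node->next = *scan;              // splice the new node in;
//   *scan = new_node;                    // the list stays sorted by key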
5466 void __kmp_free_thread(kmp_info_t *this_th) {
5467  int gtid;
5468  kmp_info_t **scan;
5469  kmp_root_t *root = this_th->th.th_root;
5470 
5471  KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5472  __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5473 
5474  KMP_DEBUG_ASSERT(this_th);
5475 
5476  // When moving the thread to the pool, switch it to wait on its own b_go flag
5477  // and clear its barrier team pointers (NULL team).
5478  int b;
5479  kmp_balign_t *balign = this_th->th.th_bar;
5480  for (b = 0; b < bs_last_barrier; ++b) {
5481  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5482  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5483  balign[b].bb.team = NULL;
5484  balign[b].bb.leaf_kids = 0;
5485  }
5486  this_th->th.th_task_state = 0;
5487  this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5488 
5489  /* put thread back on the free pool */
5490  TCW_PTR(this_th->th.th_team, NULL);
5491  TCW_PTR(this_th->th.th_root, NULL);
5492  TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5493 
5494  /* If the implicit task assigned to this thread can be used by other threads,
5495  * multiple threads can share the task data and try to free it in
5496  * __kmp_reap_thread at exit. This duplicate use of the task data is more likely
5497  * when the hot team is disabled, but it can occur even when
5498  * the hot team is enabled */
5499  __kmp_free_implicit_task(this_th);
5500  this_th->th.th_current_task = NULL;
5501 
5502  // If the __kmp_thread_pool_insert_pt is already past the new insert
5503  // point, then we need to re-scan the entire list.
5504  gtid = this_th->th.th_info.ds.ds_gtid;
5505  if (__kmp_thread_pool_insert_pt != NULL) {
5506  KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5507  if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5508  __kmp_thread_pool_insert_pt = NULL;
5509  }
5510  }
5511 
5512  // Scan down the list to find the place to insert the thread.
5513  // scan is the address of a link in the list, possibly the address of
5514  // __kmp_thread_pool itself.
5515  //
5516  // In the absence of nested parallelism, the for loop will have 0 iterations.
5517  if (__kmp_thread_pool_insert_pt != NULL) {
5518  scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5519  } else {
5520  scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5521  }
5522  for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5523  scan = &((*scan)->th.th_next_pool))
5524  ;
5525 
5526  // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5527  // to its address.
5528  TCW_PTR(this_th->th.th_next_pool, *scan);
5529  __kmp_thread_pool_insert_pt = *scan = this_th;
5530  KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5531  (this_th->th.th_info.ds.ds_gtid <
5532  this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5533  TCW_4(this_th->th.th_in_pool, TRUE);
5534  __kmp_thread_pool_nth++;
5535 
5536  TCW_4(__kmp_nth, __kmp_nth - 1);
5537  root->r.r_cg_nthreads--;
5538 
5539 #ifdef KMP_ADJUST_BLOCKTIME
5540  /* Adjust blocktime back to user setting or default if necessary */
5541  /* Middle initialization might never have occurred */
5542  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5543  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5544  if (__kmp_nth <= __kmp_avail_proc) {
5545  __kmp_zero_bt = FALSE;
5546  }
5547  }
5548 #endif /* KMP_ADJUST_BLOCKTIME */
5549 
5550  KMP_MB();
5551 }
5552 
5553 /* ------------------------------------------------------------------------ */
5554 
5555 void *__kmp_launch_thread(kmp_info_t *this_thr) {
5556  int gtid = this_thr->th.th_info.ds.ds_gtid;
5557  /* void *stack_data;*/
5558  kmp_team_t *(*volatile pteam);
5559 
5560  KMP_MB();
5561  KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5562 
5563  if (__kmp_env_consistency_check) {
5564  this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5565  }
5566 
5567 #if OMPT_SUPPORT
5568  ompt_data_t *thread_data;
5569  if (ompt_enabled.enabled) {
5570  thread_data = &(this_thr->th.ompt_thread_info.thread_data);
5571  thread_data->ptr = NULL;
5572 
5573  this_thr->th.ompt_thread_info.state = omp_state_overhead;
5574  this_thr->th.ompt_thread_info.wait_id = 0;
5575  this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
5576  if (ompt_enabled.ompt_callback_thread_begin) {
5577  ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
5578  ompt_thread_worker, thread_data);
5579  }
5580  }
5581 #endif
5582 
5583 #if OMPT_SUPPORT
5584  if (ompt_enabled.enabled) {
5585  this_thr->th.ompt_thread_info.state = omp_state_idle;
5586  }
5587 #endif
5588  /* This is the place where threads wait for work */
5589  while (!TCR_4(__kmp_global.g.g_done)) {
5590  KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
5591  KMP_MB();
5592 
5593  /* wait for work to do */
5594  KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
5595 
5596  /* No tid yet since not part of a team */
5597  __kmp_fork_barrier(gtid, KMP_GTID_DNE);
5598 
5599 #if OMPT_SUPPORT
5600  if (ompt_enabled.enabled) {
5601  this_thr->th.ompt_thread_info.state = omp_state_overhead;
5602  }
5603 #endif
5604 
5605  pteam = (kmp_team_t * (*))(&this_thr->th.th_team);
5606 
5607  /* have we been allocated? */
5608  if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
5609  /* we were just woken up, so run our new task */
5610  if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
5611  int rc;
5612  KA_TRACE(20,
5613  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
5614  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5615  (*pteam)->t.t_pkfn));
5616 
5617  updateHWFPControl(*pteam);
5618 
5619 #if OMPT_SUPPORT
5620  if (ompt_enabled.enabled) {
5621  this_thr->th.ompt_thread_info.state = omp_state_work_parallel;
5622  }
5623 #endif
5624 
5625  {
5626  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
5627  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
5628  rc = (*pteam)->t.t_invoke(gtid);
5629  }
5630  KMP_ASSERT(rc);
5631 
5632  KMP_MB();
5633  KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
5634  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5635  (*pteam)->t.t_pkfn));
5636  }
5637 #if OMPT_SUPPORT
5638  if (ompt_enabled.enabled) {
5639  /* no frame set while outside task */
5640  __ompt_get_task_info_object(0)->frame.exit_frame = NULL;
5641 
5642  this_thr->th.ompt_thread_info.state = omp_state_overhead;
5643  }
5644 #endif
5645  /* join barrier after parallel region */
5646  __kmp_join_barrier(gtid);
5647  }
5648  }
5649  TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
5650 
5651 #if OMPT_SUPPORT
5652  if (ompt_enabled.ompt_callback_thread_end) {
5653  ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
5654  }
5655 #endif
5656 
5657  this_thr->th.th_task_team = NULL;
5658  /* run the destructors for the threadprivate data for this thread */
5659  __kmp_common_destroy_gtid(gtid);
5660 
5661  KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
5662  KMP_MB();
5663  return this_thr;
5664 }
5665 
5666 /* ------------------------------------------------------------------------ */
5667 
5668 void __kmp_internal_end_dest(void *specific_gtid) {
5669 #if KMP_COMPILER_ICC
5670 #pragma warning(push)
5671 #pragma warning(disable : 810) // conversion from "void *" to "int" may lose
5672 // significant bits
5673 #endif
5674  // Make sure no significant bits are lost
5675  int gtid = (kmp_intptr_t)specific_gtid - 1;
5676 #if KMP_COMPILER_ICC
5677 #pragma warning(pop)
5678 #endif
5679 
5680  KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
5681  /* NOTE: the gtid is stored as gtid+1 in the thread-local-storage
5682  * this is because 0 is reserved for the nothing-stored case */
5683 
5684  /* josh: One reason for setting the gtid specific data even when it is being
5685  destroyed by pthread is to allow gtid lookup through thread specific data
5686  (__kmp_gtid_get_specific). Some of the code, especially stat code,
5687  that gets executed in the call to __kmp_internal_end_thread, actually
5688  gets the gtid through the thread specific data. Setting it here seems
5689  rather inelegant and perhaps wrong, but allows __kmp_internal_end_thread
5690  to run smoothly.
5691  todo: get rid of this after we remove the dependence on
5692  __kmp_gtid_get_specific */
5693  if (gtid >= 0 && KMP_UBER_GTID(gtid))
5694  __kmp_gtid_set_specific(gtid);
5695 #ifdef KMP_TDATA_GTID
5696  __kmp_gtid = gtid;
5697 #endif
5698  __kmp_internal_end_thread(gtid);
5699 }
5700 
5701 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
5702 
5703 // 2009-09-08 (lev): It looks like the destructor does not work. In simple test cases
5704 // destructors work perfectly, but in real libomp.so I have no evidence it is
5705 // ever called. However, -fini linker option in makefile.mk works fine.
5706 
5707 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
5708  __kmp_internal_end_atexit();
5709 }
5710 
5711 void __kmp_internal_end_fini(void) { __kmp_internal_end_atexit(); }
5712 
5713 #endif
5714 
5715 /* [Windows] josh: when the atexit handler is called, there may still be more
5716  than one thread alive */
5717 void __kmp_internal_end_atexit(void) {
5718  KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
5719  /* [Windows]
5720  josh: ideally, we want to completely shutdown the library in this atexit
5721  handler, but stat code that depends on thread specific data for gtid fails
5722  because that data becomes unavailable at some point during the shutdown, so
5723  we call __kmp_internal_end_thread instead. We should eventually remove the
5724  dependency on __kmp_get_specific_gtid in the stat code and use
5725  __kmp_internal_end_library to cleanly shutdown the library.
5726 
5727  // TODO: Can some of this comment about GVS be removed?
5728  I suspect that the offending stat code is executed when the calling thread
5729  tries to clean up a dead root thread's data structures, resulting in GVS
5730  code trying to close the GVS structures for that thread, but since the stat
5731  code uses __kmp_get_specific_gtid to get the gtid with the assumption that
5732  the calling thread is cleaning up itself instead of another thread, it gets
5733  confused. This happens because allowing a thread to unregister and cleanup
5734  another thread is a recent modification for addressing an issue.
5735  Based on the current design (20050722), a thread may end up
5736  trying to unregister another thread only if thread death does not trigger
5737  the calling of __kmp_internal_end_thread. For Linux* OS, there is the
5738  thread specific data destructor function to detect thread death. For
5739  Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
5740  is nothing. Thus, the workaround is applicable only for Windows static
5741  stat library. */
5742  __kmp_internal_end_library(-1);
5743 #if KMP_OS_WINDOWS
5744  __kmp_close_console();
5745 #endif
5746 }
5747 
5748 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
5749  // It is assumed __kmp_forkjoin_lock is acquired.
5750 
5751  int gtid;
5752 
5753  KMP_DEBUG_ASSERT(thread != NULL);
5754 
5755  gtid = thread->th.th_info.ds.ds_gtid;
5756 
5757  if (!is_root) {
5758 
5759  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5760  /* Assume the threads are at the fork barrier here */
5761  KA_TRACE(
5762  20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
5763  gtid));
5764  /* Need release fence here to prevent seg faults for tree forkjoin barrier
5765  * (GEH) */
5766  ANNOTATE_HAPPENS_BEFORE(thread);
5767  kmp_flag_64 flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread);
5768  __kmp_release_64(&flag);
5769  }
5770 
5771  // Terminate OS thread.
5772  __kmp_reap_worker(thread);
5773 
5774  // The thread was killed asynchronously. If it was actively
5775  // spinning in the thread pool, decrement the global count.
5776  //
5777  // There is a small timing hole here - if the worker thread was just waking
5778  // up after sleeping in the pool, had reset its th_active_in_pool flag but
5779  // not decremented the global counter __kmp_thread_pool_active_nth yet, then
5780  // the global counter might not get updated.
5781  //
5782  // Currently, this can only happen as the library is unloaded,
5783  // so there are no harmful side effects.
5784  if (thread->th.th_active_in_pool) {
5785  thread->th.th_active_in_pool = FALSE;
5786  KMP_TEST_THEN_DEC32(&__kmp_thread_pool_active_nth);
5787  KMP_DEBUG_ASSERT(TCR_4(__kmp_thread_pool_active_nth) >= 0);
5788  }
5789 
5790  // Decrement # of [worker] threads in the pool.
5791  KMP_DEBUG_ASSERT(__kmp_thread_pool_nth > 0);
5792  --__kmp_thread_pool_nth;
5793  }
5794 
5795  __kmp_free_implicit_task(thread);
5796 
5797 // Free the fast memory for tasking
5798 #if USE_FAST_MEMORY
5799  __kmp_free_fast_memory(thread);
5800 #endif /* USE_FAST_MEMORY */
5801 
5802  __kmp_suspend_uninitialize_thread(thread);
5803 
5804  KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
5805  TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
5806 
5807  --__kmp_all_nth;
5808 // __kmp_nth was decremented when thread is added to the pool.
5809 
5810 #ifdef KMP_ADJUST_BLOCKTIME
5811  /* Adjust blocktime back to user setting or default if necessary */
5812  /* Middle initialization might never have occurred */
5813  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5814  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5815  if (__kmp_nth <= __kmp_avail_proc) {
5816  __kmp_zero_bt = FALSE;
5817  }
5818  }
5819 #endif /* KMP_ADJUST_BLOCKTIME */
5820 
5821  /* free the memory being used */
5822  if (__kmp_env_consistency_check) {
5823  if (thread->th.th_cons) {
5824  __kmp_free_cons_stack(thread->th.th_cons);
5825  thread->th.th_cons = NULL;
5826  }
5827  }
5828 
5829  if (thread->th.th_pri_common != NULL) {
5830  __kmp_free(thread->th.th_pri_common);
5831  thread->th.th_pri_common = NULL;
5832  }
5833 
5834  if (thread->th.th_task_state_memo_stack != NULL) {
5835  __kmp_free(thread->th.th_task_state_memo_stack);
5836  thread->th.th_task_state_memo_stack = NULL;
5837  }
5838 
5839 #if KMP_USE_BGET
5840  if (thread->th.th_local.bget_data != NULL) {
5841  __kmp_finalize_bget(thread);
5842  }
5843 #endif
5844 
5845 #if KMP_AFFINITY_SUPPORTED
5846  if (thread->th.th_affin_mask != NULL) {
5847  KMP_CPU_FREE(thread->th.th_affin_mask);
5848  thread->th.th_affin_mask = NULL;
5849  }
5850 #endif /* KMP_AFFINITY_SUPPORTED */
5851 
5852  __kmp_reap_team(thread->th.th_serial_team);
5853  thread->th.th_serial_team = NULL;
5854  __kmp_free(thread);
5855 
5856  KMP_MB();
5857 
5858 } // __kmp_reap_thread
5859 
5860 static void __kmp_internal_end(void) {
5861  int i;
5862 
5863  /* First, unregister the library */
5864  __kmp_unregister_library();
5865 
5866 #if KMP_OS_WINDOWS
5867  /* In Win static library, we can't tell when a root actually dies, so we
5868  reclaim the data structures for any root threads that have died but not
5869  unregistered themselves, in order to shut down cleanly.
5870  In Win dynamic library we also can't tell when a thread dies. */
5871  __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
5872 // dead roots
5873 #endif
5874 
5875  for (i = 0; i < __kmp_threads_capacity; i++)
5876  if (__kmp_root[i])
5877  if (__kmp_root[i]->r.r_active)
5878  break;
5879  KMP_MB(); /* Flush all pending memory write invalidates. */
5880  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
5881 
5882  if (i < __kmp_threads_capacity) {
5883 #if KMP_USE_MONITOR
5884  // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
5885  KMP_MB(); /* Flush all pending memory write invalidates. */
5886 
5887  // Need to check that monitor was initialized before reaping it. If we are
5888  // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
5889  // __kmp_monitor will appear to contain valid data, but it is only valid in
5890  // the parent process, not the child.
5891  // New behavior (201008): instead of keying off of the flag
5892  // __kmp_init_parallel, the monitor thread creation is keyed off
5893  // of the new flag __kmp_init_monitor.
5894  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
5895  if (TCR_4(__kmp_init_monitor)) {
5896  __kmp_reap_monitor(&__kmp_monitor);
5897  TCW_4(__kmp_init_monitor, 0);
5898  }
5899  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
5900  KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
5901 #endif // KMP_USE_MONITOR
5902  } else {
5903 /* TODO move this to cleanup code */
5904 #ifdef KMP_DEBUG
5905  /* make sure that everything has properly ended */
5906  for (i = 0; i < __kmp_threads_capacity; i++) {
5907  if (__kmp_root[i]) {
5908  // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC:
5909  // there can be uber threads alive here
5910  KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
5911  }
5912  }
5913 #endif
5914 
5915  KMP_MB();
5916 
5917  // Reap the worker threads.
5918  // This is valid for now, but be careful if threads are reaped sooner.
5919  while (__kmp_thread_pool != NULL) { // Loop thru all the threads in the pool.
5920  // Get the next thread from the pool.
5921  kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
5922  __kmp_thread_pool = thread->th.th_next_pool;
5923  // Reap it.
5924  KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
5925  thread->th.th_next_pool = NULL;
5926  thread->th.th_in_pool = FALSE;
5927  __kmp_reap_thread(thread, 0);
5928  }
5929  __kmp_thread_pool_insert_pt = NULL;
5930 
5931  // Reap teams.
5932  while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool.
5933  // Get the next team from the pool.
5934  kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
5935  __kmp_team_pool = team->t.t_next_pool;
5936  // Reap it.
5937  team->t.t_next_pool = NULL;
5938  __kmp_reap_team(team);
5939  }
5940 
5941  __kmp_reap_task_teams();
5942 
5943  for (i = 0; i < __kmp_threads_capacity; ++i) {
5944  // TBD: Add some checking...
5945  // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
5946  }
5947 
5948  /* Make sure all threadprivate destructors get run by joining with all
5949  worker threads before resetting this flag */
5950  TCW_SYNC_4(__kmp_init_common, FALSE);
5951 
5952  KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
5953  KMP_MB();
5954 
5955 #if KMP_USE_MONITOR
5956  // See note above: One of the possible fixes for CQ138434 / CQ140126
5957  //
5958  // FIXME: push both code fragments down and CSE them?
5959  // push them into __kmp_cleanup() ?
5960  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
5961  if (TCR_4(__kmp_init_monitor)) {
5962  __kmp_reap_monitor(&__kmp_monitor);
5963  TCW_4(__kmp_init_monitor, 0);
5964  }
5965  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
5966  KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
5967 #endif
5968  } /* else !__kmp_global.t_active */
5969  TCW_4(__kmp_init_gtid, FALSE);
5970  KMP_MB(); /* Flush all pending memory write invalidates. */
5971 
5972  __kmp_cleanup();
5973 #if OMPT_SUPPORT
5974  ompt_fini();
5975 #endif
5976 }
5977 
5978 void __kmp_internal_end_library(int gtid_req) {
5979  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
5980  /* this shouldn't be a race condition because __kmp_internal_end() is the
5981  only place to clear __kmp_serial_init */
5982  /* we'll check this later too, after we get the lock */
5983  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
5984  // redundant, because the next check will work in any case.
5985  if (__kmp_global.g.g_abort) {
5986  KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
5987  /* TODO abort? */
5988  return;
5989  }
5990  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
5991  KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
5992  return;
5993  }
5994 
5995  KMP_MB(); /* Flush all pending memory write invalidates. */
5996 
5997  /* find out who we are and what we should do */
5998  {
5999  int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6000  KA_TRACE(
6001  10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req));
6002  if (gtid == KMP_GTID_SHUTDOWN) {
6003  KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6004  "already shutdown\n"));
6005  return;
6006  } else if (gtid == KMP_GTID_MONITOR) {
6007  KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6008  "registered, or system shutdown\n"));
6009  return;
6010  } else if (gtid == KMP_GTID_DNE) {
6011  KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6012  "shutdown\n"));
6013  /* we don't know who we are, but we may still shut down the library */
6014  } else if (KMP_UBER_GTID(gtid)) {
6015  /* unregister ourselves as an uber thread. gtid is no longer valid */
6016  if (__kmp_root[gtid]->r.r_active) {
6017  __kmp_global.g.g_abort = -1;
6018  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6019  KA_TRACE(10,
6020  ("__kmp_internal_end_library: root still active, abort T#%d\n",
6021  gtid));
6022  return;
6023  } else {
6024  KA_TRACE(
6025  10,
6026  ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6027  __kmp_unregister_root_current_thread(gtid);
6028  }
6029  } else {
6030 /* worker threads may call this function through the atexit handler, if they
6031  * call exit() */
6032 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6033  TODO: do a thorough shutdown instead */
6034 #ifdef DUMP_DEBUG_ON_EXIT
6035  if (__kmp_debug_buf)
6036  __kmp_dump_debug_buffer();
6037 #endif
6038  return;
6039  }
6040  }
6041  /* synchronize the termination process */
6042  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6043 
6044  /* have we already finished */
6045  if (__kmp_global.g.g_abort) {
6046  KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6047  /* TODO abort? */
6048  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6049  return;
6050  }
6051  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6052  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6053  return;
6054  }
6055 
6056  /* We need this lock to enforce mutex between this reading of
6057  __kmp_threads_capacity and the writing by __kmp_register_root.
6058  Alternatively, we can use a counter of roots that is atomically updated by
6059  __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6060  __kmp_internal_end_*. */
6061  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6062 
6063  /* now we can safely conduct the actual termination */
6064  __kmp_internal_end();
6065 
6066  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6067  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6068 
6069  KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6070 
6071 #ifdef DUMP_DEBUG_ON_EXIT
6072  if (__kmp_debug_buf)
6073  __kmp_dump_debug_buffer();
6074 #endif
6075 
6076 #if KMP_OS_WINDOWS
6077  __kmp_close_console();
6078 #endif
6079 
6080  __kmp_fini_allocator();
6081 
6082 } // __kmp_internal_end_library
6083 
6084 void __kmp_internal_end_thread(int gtid_req) {
6085  int i;
6086 
6087  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6088  /* this shouldn't be a race condition because __kmp_internal_end() is the
6089  * only place to clear __kmp_serial_init */
6090  /* we'll check this later too, after we get the lock */
6091  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6092  // redundant, because the next check will work in any case.
6093  if (__kmp_global.g.g_abort) {
6094  KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6095  /* TODO abort? */
6096  return;
6097  }
6098  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6099  KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6100  return;
6101  }
6102 
6103  KMP_MB(); /* Flush all pending memory write invalidates. */
6104 
6105  /* find out who we are and what we should do */
6106  {
6107  int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6108  KA_TRACE(10,
6109  ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req));
6110  if (gtid == KMP_GTID_SHUTDOWN) {
6111  KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6112  "already shutdown\n"));
6113  return;
6114  } else if (gtid == KMP_GTID_MONITOR) {
6115  KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6116  "registered, or system shutdown\n"));
6117  return;
6118  } else if (gtid == KMP_GTID_DNE) {
6119  KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6120  "shutdown\n"));
6121  return;
6122  /* we don't know who we are */
6123  } else if (KMP_UBER_GTID(gtid)) {
6124  /* unregister ourselves as an uber thread. gtid is no longer valid */
6125  if (__kmp_root[gtid]->r.r_active) {
6126  __kmp_global.g.g_abort = -1;
6127  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6128  KA_TRACE(10,
6129  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6130  gtid));
6131  return;
6132  } else {
6133  KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6134  gtid));
6135  __kmp_unregister_root_current_thread(gtid);
6136  }
6137  } else {
6138  /* just a worker thread, let's leave */
6139  KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6140 
6141  if (gtid >= 0) {
6142  __kmp_threads[gtid]->th.th_task_team = NULL;
6143  }
6144 
6145  KA_TRACE(10,
6146  ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6147  gtid));
6148  return;
6149  }
6150  }
6151 #if defined KMP_DYNAMIC_LIB
6152  // AC: let's not shut down the Linux* OS dynamic library at the exit of an uber
6153  // thread, because it is better to shut down later in the library destructor.
6154  // The reason for this change is a performance problem when a non-OpenMP thread
6155  // in a loop forks and joins many OpenMP threads. We can save a lot of time
6156  // keeping worker threads alive until program shutdown.
6157  // OM: Removed Linux* OS restriction to fix the crash on OS X* (DPD200239966)
6158  // and Windows(DPD200287443) that occurs when using critical sections from
6159  // foreign threads.
6160  KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6161  return;
6162 #endif
6163  /* synchronize the termination process */
6164  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6165 
6166  /* have we already finished */
6167  if (__kmp_global.g.g_abort) {
6168  KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6169  /* TODO abort? */
6170  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6171  return;
6172  }
6173  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6174  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6175  return;
6176  }
6177 
6178  /* We need this lock to enforce mutex between this reading of
6179  __kmp_threads_capacity and the writing by __kmp_register_root.
6180  Alternatively, we can use a counter of roots that is atomically updated by
6181  __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6182  __kmp_internal_end_*. */
6183 
6184  /* should we finish the run-time? are all siblings done? */
6185  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6186 
6187  for (i = 0; i < __kmp_threads_capacity; ++i) {
6188  if (KMP_UBER_GTID(i)) {
6189  KA_TRACE(
6190  10,
6191  ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6192  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6193  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6194  return;
6195  }
6196  }
6197 
6198  /* now we can safely conduct the actual termination */
6199 
6200  __kmp_internal_end();
6201 
6202  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6203  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6204 
6205  KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6206 
6207 #ifdef DUMP_DEBUG_ON_EXIT
6208  if (__kmp_debug_buf)
6209  __kmp_dump_debug_buffer();
6210 #endif
6211 } // __kmp_internal_end_thread
6212 
6213 // -----------------------------------------------------------------------------
6214 // Library registration stuff.
6215 
6216 static long __kmp_registration_flag = 0;
6217 // Random value used to indicate library initialization.
6218 static char *__kmp_registration_str = NULL;
6219 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6220 
6221 static inline char *__kmp_reg_status_name() {
6222  /* On RHEL 3u5, if linked statically, getpid() returns different values in
6223  each thread. If registration and unregistration happen in different threads
6224  (omp_misc_other_root_exit.cpp test case), the registered_lib_env env var
6225  cannot be found, because its name will contain a different pid. */
6226  return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6227 } // __kmp_reg_status_name
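/* Illustrative sketch (not part of the runtime): for a process with pid 12345,
   the registration variable produced here and filled in by
   __kmp_register_library_startup() below might look like

       __KMP_REGISTERED_LIB_12345=0x7f8e2c0011c0-cafe1a2b-libomp.so

   i.e. "<address of __kmp_registration_flag>-<flag value in hex>-<library
   file>". The pid, address, flag value and file name above are made up for
   the example; the real value comes from the "%p-%lx-%s" format below. */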
6228 
6229 void __kmp_register_library_startup(void) {
6230 
6231  char *name = __kmp_reg_status_name(); // Name of the environment variable.
6232  int done = 0;
6233  union {
6234  double dtime;
6235  long ltime;
6236  } time;
6237 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6238  __kmp_initialize_system_tick();
6239 #endif
6240  __kmp_read_system_time(&time.dtime);
6241  __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6242  __kmp_registration_str =
6243  __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6244  __kmp_registration_flag, KMP_LIBRARY_FILE);
6245 
6246  KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6247  __kmp_registration_str));
6248 
6249  while (!done) {
6250 
6251  char *value = NULL; // Actual value of the environment variable.
6252 
6253  // Set the environment variable, but do not overwrite it if it already exists.
6254  __kmp_env_set(name, __kmp_registration_str, 0);
6255  // Check that the variable was actually written.
6256  value = __kmp_env_get(name);
6257  if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6258 
6259  done = 1; // Ok, environment variable set successfully, exit the loop.
6260 
6261  } else {
6262 
6263  // Oops. The write failed. Another copy of the OpenMP RTL is in memory.
6264  // Check whether it is alive or dead.
6265  int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6266  char *tail = value;
6267  char *flag_addr_str = NULL;
6268  char *flag_val_str = NULL;
6269  char const *file_name = NULL;
6270  __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6271  __kmp_str_split(tail, '-', &flag_val_str, &tail);
6272  file_name = tail;
6273  if (tail != NULL) {
6274  long *flag_addr = 0;
6275  long flag_val = 0;
6276  KMP_SSCANF(flag_addr_str, "%p", &flag_addr);
6277  KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6278  if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6279  // First, check whether environment-encoded address is mapped into
6280  // addr space.
6281  // If so, dereference it to see if it still has the right value.
6282  if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6283  neighbor = 1;
6284  } else {
6285  // If not, then we know the other copy of the library is no longer
6286  // running.
6287  neighbor = 2;
6288  }
6289  }
6290  }
6291  switch (neighbor) {
6292  case 0: // Cannot parse environment variable -- neighbor status unknown.
6293  // Assume it is the incompatible format of a future version of the
6294  // library. Assume the other library is alive.
6295  // WARN( ... ); // TODO: Issue a warning.
6296  file_name = "unknown library";
6297  // Attention! Falling through to the next case. That's intentional.
6298  case 1: { // Neighbor is alive.
6299  // Check whether duplicates are allowed.
6300  char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6301  if (!__kmp_str_match_true(duplicate_ok)) {
6302  // That's not allowed. Issue fatal error.
6303  __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6304  KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6305  }
6306  KMP_INTERNAL_FREE(duplicate_ok);
6307  __kmp_duplicate_library_ok = 1;
6308  done = 1; // Exit the loop.
6309  } break;
6310  case 2: { // Neighbor is dead.
6311  // Clear the variable and try to register library again.
6312  __kmp_env_unset(name);
6313  } break;
6314  default: { KMP_DEBUG_ASSERT(0); } break;
6315  }
6316  }
6317  KMP_INTERNAL_FREE((void *)value);
6318  }
6319  KMP_INTERNAL_FREE((void *)name);
6320 
6321 } // func __kmp_register_library_startup
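/* A minimal sketch of the liveness test performed above when the variable is
   already set by another copy of the runtime: the first copy published the
   address and expected contents of its __kmp_registration_flag, so a
   newcomer only needs to check that the address is still mapped and still
   holds that value. The helper name is hypothetical; the two calls mirror
   the code above.

       static int __other_copy_alive(long *flag_addr, long flag_val) {
         if (!__kmp_is_address_mapped(flag_addr))
           return 0;                      // address gone => previous owner is dead
         return *flag_addr == flag_val;   // same value => alive, stale value => dead
       }
*/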
6322 
6323 void __kmp_unregister_library(void) {
6324 
6325  char *name = __kmp_reg_status_name();
6326  char *value = __kmp_env_get(name);
6327 
6328  KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6329  KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6330  if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6331  // Ok, this is our variable. Delete it.
6332  __kmp_env_unset(name);
6333  }
6334 
6335  KMP_INTERNAL_FREE(__kmp_registration_str);
6336  KMP_INTERNAL_FREE(value);
6337  KMP_INTERNAL_FREE(name);
6338 
6339  __kmp_registration_flag = 0;
6340  __kmp_registration_str = NULL;
6341 
6342 } // __kmp_unregister_library
6343 
6344 // End of Library registration stuff.
6345 // -----------------------------------------------------------------------------
6346 
6347 #if KMP_MIC_SUPPORTED
6348 
6349 static void __kmp_check_mic_type() {
6350  kmp_cpuid_t cpuid_state = {0};
6351  kmp_cpuid_t *cs_p = &cpuid_state;
6352  __kmp_x86_cpuid(1, 0, cs_p);
6353  // We don't support mic1 at the moment
6354  if ((cs_p->eax & 0xff0) == 0xB10) {
6355  __kmp_mic_type = mic2;
6356  } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6357  __kmp_mic_type = mic3;
6358  } else {
6359  __kmp_mic_type = non_mic;
6360  }
6361 }
6362 
6363 #endif /* KMP_MIC_SUPPORTED */
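/* The masks used in __kmp_check_mic_type() select the family/model fields of
   CPUID leaf 1 EAX (bits 11:8 = family, 7:4 = model, 19:16 = extended
   model). As a rough, illustrative decode -- taken from public CPUID
   documentation, not from this file:

       family 0x0B, model 0x1                  -> Knights Corner  (mic2)
       family 0x06, ext. model 0x5, model 0x7  -> Knights Landing (mic3)

   anything else is classified as non_mic. */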
6364 
6365 static void __kmp_do_serial_initialize(void) {
6366  int i, gtid;
6367  int size;
6368 
6369  KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
6370 
6371  KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
6372  KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
6373  KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
6374  KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
6375  KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
6376 
6377 #if OMPT_SUPPORT
6378  ompt_pre_init();
6379 #endif
6380 
6381  __kmp_validate_locks();
6382 
6383  /* Initialize internal memory allocator */
6384  __kmp_init_allocator();
6385 
6386  /* Register the library startup via an environment variable and check to see
6387  whether another copy of the library is already registered. */
6388 
6389  __kmp_register_library_startup();
6390 
6391  /* TODO reinitialization of library */
6392  if (TCR_4(__kmp_global.g.g_done)) {
6393  KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
6394  }
6395 
6396  __kmp_global.g.g_abort = 0;
6397  TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6398 
6399 /* initialize the locks */
6400 #if KMP_USE_ADAPTIVE_LOCKS
6401 #if KMP_DEBUG_ADAPTIVE_LOCKS
6402  __kmp_init_speculative_stats();
6403 #endif
6404 #endif
6405 #if KMP_STATS_ENABLED
6406  __kmp_stats_init();
6407 #endif
6408  __kmp_init_lock(&__kmp_global_lock);
6409  __kmp_init_queuing_lock(&__kmp_dispatch_lock);
6410  __kmp_init_lock(&__kmp_debug_lock);
6411  __kmp_init_atomic_lock(&__kmp_atomic_lock);
6412  __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
6413  __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
6414  __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
6415  __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
6416  __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
6417  __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
6418  __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
6419  __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
6420  __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
6421  __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
6422  __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
6423  __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
6424  __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
6425  __kmp_init_bootstrap_lock(&__kmp_exit_lock);
6426 #if KMP_USE_MONITOR
6427  __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
6428 #endif
6429  __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
6430 
6431  /* conduct initialization and initial setup of configuration */
6432 
6433  __kmp_runtime_initialize();
6434 
6435 #if KMP_MIC_SUPPORTED
6436  __kmp_check_mic_type();
6437 #endif
6438 
6439 // Some global variable initialization moved here from kmp_env_initialize()
6440 #ifdef KMP_DEBUG
6441  kmp_diag = 0;
6442 #endif
6443  __kmp_abort_delay = 0;
6444 
6445  // From __kmp_init_dflt_team_nth()
6446  /* assume the entire machine will be used */
6447  __kmp_dflt_team_nth_ub = __kmp_xproc;
6448  if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
6449  __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
6450  }
6451  if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
6452  __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
6453  }
6454  __kmp_max_nth = __kmp_sys_max_nth;
6455  __kmp_cg_max_nth = __kmp_sys_max_nth;
6456  __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
6457  if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
6458  __kmp_teams_max_nth = __kmp_sys_max_nth;
6459  }
6460 
6461  // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
6462  // part
6463  __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
6464 #if KMP_USE_MONITOR
6465  __kmp_monitor_wakeups =
6466  KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6467  __kmp_bt_intervals =
6468  KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6469 #endif
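/* Rough relationship between the values initialized above (the real macros
   live in kmp.h; the numbers here are only an example, not a quote):

       blocktime       = 200;  // ms a worker keeps spinning before it sleeps
       monitor_wakeups = ...;  // how many times per second the monitor polls
       bt_intervals    = blocktime * monitor_wakeups / 1000;  // polls per window

   e.g. with a 200 ms blocktime and a monitor waking 10 times per second, a
   worker is put to sleep after roughly 2 idle monitor intervals. */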
6470  // From "KMP_LIBRARY" part of __kmp_env_initialize()
6471  __kmp_library = library_throughput;
6472  // From KMP_SCHEDULE initialization
6473  __kmp_static = kmp_sch_static_balanced;
6474 // AC: do not use analytical here, because it is non-monotonic
6475 //__kmp_guided = kmp_sch_guided_iterative_chunked;
6476 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
6477 // need to repeat assignment
6478 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
6479 // bit control and barrier method control parts
6480 #if KMP_FAST_REDUCTION_BARRIER
6481 #define kmp_reduction_barrier_gather_bb ((int)1)
6482 #define kmp_reduction_barrier_release_bb ((int)1)
6483 #define kmp_reduction_barrier_gather_pat bp_hyper_bar
6484 #define kmp_reduction_barrier_release_pat bp_hyper_bar
6485 #endif // KMP_FAST_REDUCTION_BARRIER
6486  for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
6487  __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
6488  __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
6489  __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
6490  __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
6491 #if KMP_FAST_REDUCTION_BARRIER
6492  if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
6493  // lin_64 ): hyper,1
6494  __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
6495  __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
6496  __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
6497  __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
6498  }
6499 #endif // KMP_FAST_REDUCTION_BARRIER
6500  }
6501 #if KMP_FAST_REDUCTION_BARRIER
6502 #undef kmp_reduction_barrier_release_pat
6503 #undef kmp_reduction_barrier_gather_pat
6504 #undef kmp_reduction_barrier_release_bb
6505 #undef kmp_reduction_barrier_gather_bb
6506 #endif // KMP_FAST_REDUCTION_BARRIER
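/* The branch bits set in the loop above control the fan-out of the barrier
   trees: a barrier configured with branch_bits == b uses a branching factor
   of (1 << b) per tree level (this is how the barrier code consumes the
   values; the line below is an illustration, not a quote from it):

       int branch_factor = 1 << __kmp_barrier_gather_branch_bits[bs_plain_barrier];

   so the reduction barrier forced to "hyper,1" above gathers and releases
   over a binary tree. */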
6507 #if KMP_MIC_SUPPORTED
6508  if (__kmp_mic_type == mic2) { // KNC
6509  // AC: plain=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
6510  __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
6511  __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
6512  1; // forkjoin release
6513  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6514  __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6515  }
6516 #if KMP_FAST_REDUCTION_BARRIER
6517  if (__kmp_mic_type == mic2) { // KNC
6518  __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6519  __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6520  }
6521 #endif // KMP_FAST_REDUCTION_BARRIER
6522 #endif // KMP_MIC_SUPPORTED
6523 
6524 // From KMP_CHECKS initialization
6525 #ifdef KMP_DEBUG
6526  __kmp_env_checks = TRUE; /* development versions have the extra checks */
6527 #else
6528  __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
6529 #endif
6530 
6531  // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
6532  __kmp_foreign_tp = TRUE;
6533 
6534  __kmp_global.g.g_dynamic = FALSE;
6535  __kmp_global.g.g_dynamic_mode = dynamic_default;
6536 
6537  __kmp_env_initialize(NULL);
6538 
6539 // Print all messages in message catalog for testing purposes.
6540 #ifdef KMP_DEBUG
6541  char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
6542  if (__kmp_str_match_true(val)) {
6543  kmp_str_buf_t buffer;
6544  __kmp_str_buf_init(&buffer);
6545  __kmp_i18n_dump_catalog(&buffer);
6546  __kmp_printf("%s", buffer.str);
6547  __kmp_str_buf_free(&buffer);
6548  }
6549  __kmp_env_free(&val);
6550 #endif
6551 
6552  __kmp_threads_capacity =
6553  __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
6554  // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
6555  __kmp_tp_capacity = __kmp_default_tp_capacity(
6556  __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
6557 
6558  // If the library is shut down properly, both pools must be NULL. Just in
6559  // case, set them to NULL -- some memory may leak, but subsequent code will
6560  // work even if pools are not freed.
6561  KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
6562  KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
6563  KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
6564  __kmp_thread_pool = NULL;
6565  __kmp_thread_pool_insert_pt = NULL;
6566  __kmp_team_pool = NULL;
6567 
6568  /* Allocate all of the variable sized records */
6569  /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
6570  * expandable */
6571  /* Since allocation is cache-aligned, just add extra padding at the end */
6572  size =
6573  (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
6574  CACHE_LINE;
6575  __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
6576  __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
6577  sizeof(kmp_info_t *) * __kmp_threads_capacity);
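/* Layout of the single cache-aligned block allocated above (illustrative):

       +---------------------------------+---------------------------------+---------+
       | kmp_info_t * [threads_capacity] | kmp_root_t * [threads_capacity] | padding |
       +---------------------------------+---------------------------------+---------+
       ^ __kmp_threads                   ^ __kmp_root                        CACHE_LINE

   __kmp_root is not a separate allocation, which is why __kmp_cleanup()
   frees only __kmp_threads. */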
6578 
6579  /* init thread counts */
6580  KMP_DEBUG_ASSERT(__kmp_all_nth ==
6581  0); // Asserts fail if the library is reinitializing and
6582  KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
6583  __kmp_all_nth = 0;
6584  __kmp_nth = 0;
6585 
6586  /* setup the uber master thread and hierarchy */
6587  gtid = __kmp_register_root(TRUE);
6588  KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid));
6589  KMP_ASSERT(KMP_UBER_GTID(gtid));
6590  KMP_ASSERT(KMP_INITIAL_GTID(gtid));
6591 
6592  KMP_MB(); /* Flush all pending memory write invalidates. */
6593 
6594  __kmp_common_initialize();
6595 
6596 #if KMP_OS_UNIX
6597  /* invoke the child fork handler */
6598  __kmp_register_atfork();
6599 #endif
6600 
6601 #if !defined KMP_DYNAMIC_LIB
6602  {
6603  /* Invoke the exit handler when the program finishes, only for static
6604  library. For dynamic library, we already have _fini and DllMain. */
6605  int rc = atexit(__kmp_internal_end_atexit);
6606  if (rc != 0) {
6607  __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
6608  __kmp_msg_null);
6609  }
6610  }
6611 #endif
6612 
6613 #if KMP_HANDLE_SIGNALS
6614 #if KMP_OS_UNIX
6615  /* NOTE: make sure that this is called before the user installs their own
6616  signal handlers so that the user handlers are called first. This way they
6617  can return false, not call our handler, avoid terminating the library, and
6618  continue execution where they left off. */
6619  __kmp_install_signals(FALSE);
6620 #endif /* KMP_OS_UNIX */
6621 #if KMP_OS_WINDOWS
6622  __kmp_install_signals(TRUE);
6623 #endif /* KMP_OS_WINDOWS */
6624 #endif
6625 
6626  /* we have finished the serial initialization */
6627  __kmp_init_counter++;
6628 
6629  __kmp_init_serial = TRUE;
6630 
6631  if (__kmp_settings) {
6632  __kmp_env_print();
6633  }
6634 
6635 #if OMP_40_ENABLED
6636  if (__kmp_display_env || __kmp_display_env_verbose) {
6637  __kmp_env_print_2();
6638  }
6639 #endif // OMP_40_ENABLED
6640 
6641 #if OMPT_SUPPORT
6642  ompt_post_init();
6643 #endif
6644 
6645  KMP_MB();
6646 
6647  KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
6648 }
6649 
6650 void __kmp_serial_initialize(void) {
6651  if (__kmp_init_serial) {
6652  return;
6653  }
6654  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6655  if (__kmp_init_serial) {
6656  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6657  return;
6658  }
6659  __kmp_do_serial_initialize();
6660  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6661 }
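/* __kmp_serial_initialize(), __kmp_middle_initialize() and
   __kmp_parallel_initialize() all follow the same check / lock / re-check
   shape, sketched here with placeholder names (initialized_flag and
   do_the_initialization are stand-ins, not real symbols):

       if (initialized_flag)                       // fast path, no lock
         return;
       __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
       if (initialized_flag) {                     // somebody else won the race
         __kmp_release_bootstrap_lock(&__kmp_initz_lock);
         return;
       }
       do_the_initialization();                    // runs exactly once
       __kmp_release_bootstrap_lock(&__kmp_initz_lock);
*/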
6662 
6663 static void __kmp_do_middle_initialize(void) {
6664  int i, j;
6665  int prev_dflt_team_nth;
6666 
6667  if (!__kmp_init_serial) {
6668  __kmp_do_serial_initialize();
6669  }
6670 
6671  KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
6672 
6673  // Save the previous value for the __kmp_dflt_team_nth so that
6674  // we can avoid some reinitialization if it hasn't changed.
6675  prev_dflt_team_nth = __kmp_dflt_team_nth;
6676 
6677 #if KMP_AFFINITY_SUPPORTED
6678  // __kmp_affinity_initialize() will try to set __kmp_ncores to the
6679  // number of cores on the machine.
6680  __kmp_affinity_initialize();
6681 
6682  // Run through the __kmp_threads array and set the affinity mask
6683  // for each root thread that is currently registered with the RTL.
6684  for (i = 0; i < __kmp_threads_capacity; i++) {
6685  if (TCR_PTR(__kmp_threads[i]) != NULL) {
6686  __kmp_affinity_set_init_mask(i, TRUE);
6687  }
6688  }
6689 #endif /* KMP_AFFINITY_SUPPORTED */
6690 
6691  KMP_ASSERT(__kmp_xproc > 0);
6692  if (__kmp_avail_proc == 0) {
6693  __kmp_avail_proc = __kmp_xproc;
6694  }
6695 
6696  // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
6697  // correct them now
6698  j = 0;
6699  while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
6700  __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
6701  __kmp_avail_proc;
6702  j++;
6703  }
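/* Illustrative effect of the loop above: with OMP_NUM_THREADS=",,2,3" on a
   machine where __kmp_avail_proc == 8, the parsed list {0, 0, 2, 3} becomes
   {8, 8, 2, 3}, and __kmp_dflt_team_nth / __kmp_dflt_team_nth_ub are set to
   8 as a side effect (the numbers are an example, not from a real run). */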
6704 
6705  if (__kmp_dflt_team_nth == 0) {
6706 #ifdef KMP_DFLT_NTH_CORES
6707  // Default #threads = #cores
6708  __kmp_dflt_team_nth = __kmp_ncores;
6709  KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6710  "__kmp_ncores (%d)\n",
6711  __kmp_dflt_team_nth));
6712 #else
6713  // Default #threads = #available OS procs
6714  __kmp_dflt_team_nth = __kmp_avail_proc;
6715  KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6716  "__kmp_avail_proc(%d)\n",
6717  __kmp_dflt_team_nth));
6718 #endif /* KMP_DFLT_NTH_CORES */
6719  }
6720 
6721  if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
6722  __kmp_dflt_team_nth = KMP_MIN_NTH;
6723  }
6724  if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
6725  __kmp_dflt_team_nth = __kmp_sys_max_nth;
6726  }
6727 
6728  // There's no harm in continuing if the following check fails,
6729  // but it indicates an error in the previous logic.
6730  KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
6731 
6732  if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
6733  // Run through the __kmp_threads array and set the num threads icv for each
6734  // root thread that is currently registered with the RTL (which has not
6735  // already explicitly set its nthreads-var with a call to
6736  // omp_set_num_threads()).
6737  for (i = 0; i < __kmp_threads_capacity; i++) {
6738  kmp_info_t *thread = __kmp_threads[i];
6739  if (thread == NULL)
6740  continue;
6741  if (thread->th.th_current_task->td_icvs.nproc != 0)
6742  continue;
6743 
6744  set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
6745  }
6746  }
6747  KA_TRACE(
6748  20,
6749  ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
6750  __kmp_dflt_team_nth));
6751 
6752 #ifdef KMP_ADJUST_BLOCKTIME
6753  /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */
6754  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6755  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6756  if (__kmp_nth > __kmp_avail_proc) {
6757  __kmp_zero_bt = TRUE;
6758  }
6759  }
6760 #endif /* KMP_ADJUST_BLOCKTIME */
6761 
6762  /* we have finished middle initialization */
6763  TCW_SYNC_4(__kmp_init_middle, TRUE);
6764 
6765  KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
6766 }
6767 
6768 void __kmp_middle_initialize(void) {
6769  if (__kmp_init_middle) {
6770  return;
6771  }
6772  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6773  if (__kmp_init_middle) {
6774  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6775  return;
6776  }
6777  __kmp_do_middle_initialize();
6778  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6779 }
6780 
6781 void __kmp_parallel_initialize(void) {
6782  int gtid = __kmp_entry_gtid(); // this might be a new root
6783 
6784  /* synchronize parallel initialization (for sibling) */
6785  if (TCR_4(__kmp_init_parallel))
6786  return;
6787  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6788  if (TCR_4(__kmp_init_parallel)) {
6789  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6790  return;
6791  }
6792 
6793  /* TODO reinitialization after we have already shut down */
6794  if (TCR_4(__kmp_global.g.g_done)) {
6795  KA_TRACE(
6796  10,
6797  ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
6798  __kmp_infinite_loop();
6799  }
6800 
6801  /* jc: The lock __kmp_initz_lock is already held, so calling
6802  __kmp_serial_initialize would cause a deadlock. So we call
6803  __kmp_do_serial_initialize directly. */
6804  if (!__kmp_init_middle) {
6805  __kmp_do_middle_initialize();
6806  }
6807 
6808  /* begin initialization */
6809  KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
6810  KMP_ASSERT(KMP_UBER_GTID(gtid));
6811 
6812 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6813  // Save the FP control regs.
6814  // Worker threads will set theirs to these values at thread startup.
6815  __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
6816  __kmp_store_mxcsr(&__kmp_init_mxcsr);
6817  __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
6818 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
6819 
6820 #if KMP_OS_UNIX
6821 #if KMP_HANDLE_SIGNALS
6822  /* must be after __kmp_serial_initialize */
6823  __kmp_install_signals(TRUE);
6824 #endif
6825 #endif
6826 
6827  __kmp_suspend_initialize();
6828 
6829 #if defined(USE_LOAD_BALANCE)
6830  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
6831  __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
6832  }
6833 #else
6834  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
6835  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
6836  }
6837 #endif
6838 
6839  if (__kmp_version) {
6840  __kmp_print_version_2();
6841  }
6842 
6843  /* we have finished parallel initialization */
6844  TCW_SYNC_4(__kmp_init_parallel, TRUE);
6845 
6846  KMP_MB();
6847  KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
6848 
6849  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6850 }
6851 
6852 /* ------------------------------------------------------------------------ */
6853 
6854 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
6855  kmp_team_t *team) {
6856  kmp_disp_t *dispatch;
6857 
6858  KMP_MB();
6859 
6860  /* none of the threads have encountered any constructs, yet. */
6861  this_thr->th.th_local.this_construct = 0;
6862 #if KMP_CACHE_MANAGE
6863  KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
6864 #endif /* KMP_CACHE_MANAGE */
6865  dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
6866  KMP_DEBUG_ASSERT(dispatch);
6867  KMP_DEBUG_ASSERT(team->t.t_dispatch);
6868  // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
6869  // this_thr->th.th_info.ds.ds_tid ] );
6870 
6871  dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
6872 #if OMP_45_ENABLED
6873  dispatch->th_doacross_buf_idx =
6874  0; /* reset the doacross dispatch buffer counter */
6875 #endif
6876  if (__kmp_env_consistency_check)
6877  __kmp_push_parallel(gtid, team->t.t_ident);
6878 
6879  KMP_MB(); /* Flush all pending memory write invalidates. */
6880 }
6881 
6882 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
6883  kmp_team_t *team) {
6884  if (__kmp_env_consistency_check)
6885  __kmp_pop_parallel(gtid, team->t.t_ident);
6886 
6887  __kmp_finish_implicit_task(this_thr);
6888 }
6889 
6890 int __kmp_invoke_task_func(int gtid) {
6891  int rc;
6892  int tid = __kmp_tid_from_gtid(gtid);
6893  kmp_info_t *this_thr = __kmp_threads[gtid];
6894  kmp_team_t *team = this_thr->th.th_team;
6895 
6896  __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
6897 #if USE_ITT_BUILD
6898  if (__itt_stack_caller_create_ptr) {
6899  __kmp_itt_stack_callee_enter(
6900  (__itt_caller)
6901  team->t.t_stack_id); // inform ittnotify about entering user's code
6902  }
6903 #endif /* USE_ITT_BUILD */
6904 #if INCLUDE_SSC_MARKS
6905  SSC_MARK_INVOKING();
6906 #endif
6907 
6908 #if OMPT_SUPPORT
6909  void *dummy;
6910  void **exit_runtime_p;
6911  ompt_data_t *my_task_data;
6912  ompt_data_t *my_parallel_data;
6913  int ompt_team_size;
6914 
6915  if (ompt_enabled.enabled) {
6916  exit_runtime_p = &(
6917  team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame.exit_frame);
6918  } else {
6919  exit_runtime_p = &dummy;
6920  }
6921 
6922  my_task_data =
6923  &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
6924  my_parallel_data = &(team->t.ompt_team_info.parallel_data);
6925  if (ompt_enabled.ompt_callback_implicit_task) {
6926  ompt_team_size = team->t.t_nproc;
6927  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
6928  ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
6929  __kmp_tid_from_gtid(gtid));
6930  OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
6931  }
6932 #endif
6933 
6934  {
6935  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
6936  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
6937  rc =
6938  __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
6939  tid, (int)team->t.t_argc, (void **)team->t.t_argv
6940 #if OMPT_SUPPORT
6941  ,
6942  exit_runtime_p
6943 #endif
6944  );
6945 #if OMPT_SUPPORT
6946  *exit_runtime_p = NULL;
6947 #endif
6948  }
6949 
6950 #if USE_ITT_BUILD
6951  if (__itt_stack_caller_create_ptr) {
6952  __kmp_itt_stack_callee_leave(
6953  (__itt_caller)
6954  team->t.t_stack_id); // inform ittnotify about leaving user's code
6955  }
6956 #endif /* USE_ITT_BUILD */
6957  __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
6958 
6959  return rc;
6960 }
6961 
6962 #if OMP_40_ENABLED
6963 void __kmp_teams_master(int gtid) {
6964  // This routine is called by all master threads in teams construct
6965  kmp_info_t *thr = __kmp_threads[gtid];
6966  kmp_team_t *team = thr->th.th_team;
6967  ident_t *loc = team->t.t_ident;
6968  thr->th.th_set_nproc = thr->th.th_teams_size.nth;
6969  KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
6970  KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
6971  KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
6972  __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
6973 // Launch the league of teams now, but do not let the workers execute
6974 // (they hang on the fork barrier until the next parallel region)
6975 #if INCLUDE_SSC_MARKS
6976  SSC_MARK_FORKING();
6977 #endif
6978  __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
6979  (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
6980  VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
6981 #if INCLUDE_SSC_MARKS
6982  SSC_MARK_JOINING();
6983 #endif
6984 
6985  // AC: last parameter "1" eliminates the join barrier, which won't work because
6986  // worker threads are in a fork barrier, waiting for more parallel regions
6987  __kmp_join_call(loc, gtid
6988 #if OMPT_SUPPORT
6989  ,
6990  fork_context_intel
6991 #endif
6992  ,
6993  1);
6994 }
6995 
6996 int __kmp_invoke_teams_master(int gtid) {
6997  kmp_info_t *this_thr = __kmp_threads[gtid];
6998  kmp_team_t *team = this_thr->th.th_team;
6999 #if KMP_DEBUG
7000  if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7001  KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7002  (void *)__kmp_teams_master);
7003 #endif
7004  __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7005  __kmp_teams_master(gtid);
7006  __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7007  return 1;
7008 }
7009 #endif /* OMP_40_ENABLED */
7010 
7011 /* this sets the requested number of threads for the next parallel region
7012  encountered by this team. Since this should be enclosed in the forkjoin
7013  critical section, it should avoid race conditions with asymmetrical nested
7014  parallelism */
7015 
7016 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7017  kmp_info_t *thr = __kmp_threads[gtid];
7018 
7019  if (num_threads > 0)
7020  thr->th.th_set_nproc = num_threads;
7021 }
7022 
7023 #if OMP_40_ENABLED
7024 
7025 /* this sets the requested number of teams for the teams region and/or
7026  the number of threads for the next parallel region encountered */
7027 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7028  int num_threads) {
7029  kmp_info_t *thr = __kmp_threads[gtid];
7030  KMP_DEBUG_ASSERT(num_teams >= 0);
7031  KMP_DEBUG_ASSERT(num_threads >= 0);
7032 
7033  if (num_teams == 0)
7034  num_teams = 1; // default number of teams is 1.
7035  if (num_teams > __kmp_teams_max_nth) { // if too many teams requested?
7036  if (!__kmp_reserve_warn) {
7037  __kmp_reserve_warn = 1;
7038  __kmp_msg(kmp_ms_warning,
7039  KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7040  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7041  }
7042  num_teams = __kmp_teams_max_nth;
7043  }
7044  // Set number of teams (number of threads in the outer "parallel" of the
7045  // teams)
7046  thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7047 
7048  // Remember the number of threads for inner parallel regions
7049  if (num_threads == 0) {
7050  if (!TCR_4(__kmp_init_middle))
7051  __kmp_middle_initialize(); // get __kmp_avail_proc calculated
7052  num_threads = __kmp_avail_proc / num_teams;
7053  if (num_teams * num_threads > __kmp_teams_max_nth) {
7054  // adjust num_threads w/o warning as it is not a user setting
7055  num_threads = __kmp_teams_max_nth / num_teams;
7056  }
7057  } else {
7058  if (num_teams * num_threads > __kmp_teams_max_nth) {
7059  int new_threads = __kmp_teams_max_nth / num_teams;
7060  if (!__kmp_reserve_warn) { // user asked for too many threads
7061  __kmp_reserve_warn = 1; // that conflicts with KMP_TEAMS_THREAD_LIMIT
7062  __kmp_msg(kmp_ms_warning,
7063  KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7064  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7065  }
7066  num_threads = new_threads;
7067  }
7068  }
7069  thr->th.th_teams_size.nth = num_threads;
7070 }
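/* Worked example of the defaulting above (numbers are illustrative): with
   __kmp_avail_proc == 64, __kmp_teams_max_nth == 64 and a teams construct
   requesting num_teams(8) with no num_threads clause (num_threads == 0):

       num_threads = 64 / 8;      // = 8, available procs split across teams
       8 * 8 <= 64                // within the teams limit, so keep it

   so each of the 8 teams runs its inner parallel regions with 8 threads. */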
7071 
7072 // Set the proc_bind var to use in the following parallel region.
7073 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7074  kmp_info_t *thr = __kmp_threads[gtid];
7075  thr->th.th_set_proc_bind = proc_bind;
7076 }
7077 
7078 #endif /* OMP_40_ENABLED */
7079 
7080 /* Launch the worker threads into the microtask. */
7081 
7082 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7083  kmp_info_t *this_thr = __kmp_threads[gtid];
7084 
7085 #ifdef KMP_DEBUG
7086  int f;
7087 #endif /* KMP_DEBUG */
7088 
7089  KMP_DEBUG_ASSERT(team);
7090  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7091  KMP_ASSERT(KMP_MASTER_GTID(gtid));
7092  KMP_MB(); /* Flush all pending memory write invalidates. */
7093 
7094  team->t.t_construct = 0; /* no single directives seen yet */
7095  team->t.t_ordered.dt.t_value =
7096  0; /* thread 0 enters the ordered section first */
7097 
7098  /* Reset the identifiers on the dispatch buffer */
7099  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7100  if (team->t.t_max_nproc > 1) {
7101  int i;
7102  for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7103  team->t.t_disp_buffer[i].buffer_index = i;
7104 #if OMP_45_ENABLED
7105  team->t.t_disp_buffer[i].doacross_buf_idx = i;
7106 #endif
7107  }
7108  } else {
7109  team->t.t_disp_buffer[0].buffer_index = 0;
7110 #if OMP_45_ENABLED
7111  team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7112 #endif
7113  }
7114 
7115  KMP_MB(); /* Flush all pending memory write invalidates. */
7116  KMP_ASSERT(this_thr->th.th_team == team);
7117 
7118 #ifdef KMP_DEBUG
7119  for (f = 0; f < team->t.t_nproc; f++) {
7120  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7121  team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7122  }
7123 #endif /* KMP_DEBUG */
7124 
7125  /* release the worker threads so they may begin working */
7126  __kmp_fork_barrier(gtid, 0);
7127 }
7128 
7129 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7130  kmp_info_t *this_thr = __kmp_threads[gtid];
7131 
7132  KMP_DEBUG_ASSERT(team);
7133  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7134  KMP_ASSERT(KMP_MASTER_GTID(gtid));
7135  KMP_MB(); /* Flush all pending memory write invalidates. */
7136 
7137 /* Join barrier after fork */
7138 
7139 #ifdef KMP_DEBUG
7140  if (__kmp_threads[gtid] &&
7141  __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
7142  __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
7143  __kmp_threads[gtid]);
7144  __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
7145  "team->t.t_nproc=%d\n",
7146  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
7147  team->t.t_nproc);
7148  __kmp_print_structure();
7149  }
7150  KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
7151  __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
7152 #endif /* KMP_DEBUG */
7153 
7154  __kmp_join_barrier(gtid); /* wait for everyone */
7155 #if OMPT_SUPPORT
7156  if (ompt_enabled.enabled &&
7157  this_thr->th.ompt_thread_info.state == omp_state_wait_barrier_implicit) {
7158  int ds_tid = this_thr->th.th_info.ds.ds_tid;
7159  ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
7160  this_thr->th.ompt_thread_info.state = omp_state_overhead;
7161 #if OMPT_OPTIONAL
7162  void *codeptr = NULL;
7163  if (KMP_MASTER_TID(ds_tid) &&
7164  (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
7165  ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
7166  codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
7167 
7168  if (ompt_enabled.ompt_callback_sync_region_wait) {
7169  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
7170  ompt_sync_region_barrier, ompt_scope_end, NULL, task_data, codeptr);
7171  }
7172  if (ompt_enabled.ompt_callback_sync_region) {
7173  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
7174  ompt_sync_region_barrier, ompt_scope_end, NULL, task_data, codeptr);
7175  }
7176 #endif
7177  if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
7178  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7179  ompt_scope_end, NULL, task_data, 0, ds_tid);
7180  }
7181  }
7182 #endif
7183 
7184  KMP_MB(); /* Flush all pending memory write invalidates. */
7185  KMP_ASSERT(this_thr->th.th_team == team);
7186 }
7187 
7188 /* ------------------------------------------------------------------------ */
7189 
7190 #ifdef USE_LOAD_BALANCE
7191 
7192 // Return the number of worker threads actively spinning in the hot team,
7193 // if we are at the outermost level of parallelism. Otherwise, return 0.
7194 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
7195  int i;
7196  int retval;
7197  kmp_team_t *hot_team;
7198 
7199  if (root->r.r_active) {
7200  return 0;
7201  }
7202  hot_team = root->r.r_hot_team;
7203  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
7204  return hot_team->t.t_nproc - 1; // Don't count master thread
7205  }
7206 
7207  // Skip the master thread - it is accounted for elsewhere.
7208  retval = 0;
7209  for (i = 1; i < hot_team->t.t_nproc; i++) {
7210  if (hot_team->t.t_threads[i]->th.th_active) {
7211  retval++;
7212  }
7213  }
7214  return retval;
7215 }
7216 
7217 // Perform an automatic adjustment to the number of
7218 // threads used by the next parallel region.
7219 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
7220  int retval;
7221  int pool_active;
7222  int hot_team_active;
7223  int team_curr_active;
7224  int system_active;
7225 
7226  KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
7227  set_nproc));
7228  KMP_DEBUG_ASSERT(root);
7229  KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
7230  ->th.th_current_task->td_icvs.dynamic == TRUE);
7231  KMP_DEBUG_ASSERT(set_nproc > 1);
7232 
7233  if (set_nproc == 1) {
7234  KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
7235  return 1;
7236  }
7237 
7238  // Threads that are active in the thread pool, active in the hot team for this
7239  // particular root (if we are at the outer par level), and the currently
7240  // executing thread (to become the master) are available to add to the new
7241  // team, but are currently contributing to the system load, and must be
7242  // accounted for.
7243  pool_active = TCR_4(__kmp_thread_pool_active_nth);
7244  hot_team_active = __kmp_active_hot_team_nproc(root);
7245  team_curr_active = pool_active + hot_team_active + 1;
7246 
7247  // Check the system load.
7248  system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
7249  KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
7250  "hot team active = %d\n",
7251  system_active, pool_active, hot_team_active));
7252 
7253  if (system_active < 0) {
7254  // There was an error reading the necessary info from /proc, so use the
7255  // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
7256  // = dynamic_thread_limit, we shouldn't wind up getting back here.
7257  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7258  KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
7259 
7260  // Make this call behave like the thread limit algorithm.
7261  retval = __kmp_avail_proc - __kmp_nth +
7262  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
7263  if (retval > set_nproc) {
7264  retval = set_nproc;
7265  }
7266  if (retval < KMP_MIN_NTH) {
7267  retval = KMP_MIN_NTH;
7268  }
7269 
7270  KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
7271  retval));
7272  return retval;
7273  }
7274 
7275  // There is a slight delay in the load balance algorithm in detecting new
7276  // running procs. The real system load at this instant should be at least as
7277  // large as the #active OMP threads that are available to add to the team.
7278  if (system_active < team_curr_active) {
7279  system_active = team_curr_active;
7280  }
7281  retval = __kmp_avail_proc - system_active + team_curr_active;
7282  if (retval > set_nproc) {
7283  retval = set_nproc;
7284  }
7285  if (retval < KMP_MIN_NTH) {
7286  retval = KMP_MIN_NTH;
7287  }
7288 
7289  KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
7290  return retval;
7291 } // __kmp_load_balance_nproc()
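/* Worked example for the formula above (all numbers illustrative): on a
   16-proc machine (__kmp_avail_proc == 16) with system_active == 10 and
   team_curr_active == 3 (pool + hot team + master), the next team is sized

       retval = 16 - 10 + 3;      // = 9

   and then clamped to the range [KMP_MIN_NTH, set_nproc]. If reading the
   load from /proc failed (system_active < 0), the thread-limit fallback
   above is used instead. */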
7292 
7293 #endif /* USE_LOAD_BALANCE */
7294 
7295 /* ------------------------------------------------------------------------ */
7296 
7297 /* NOTE: this is called with the __kmp_init_lock held */
7298 void __kmp_cleanup(void) {
7299  int f;
7300 
7301  KA_TRACE(10, ("__kmp_cleanup: enter\n"));
7302 
7303  if (TCR_4(__kmp_init_parallel)) {
7304 #if KMP_HANDLE_SIGNALS
7305  __kmp_remove_signals();
7306 #endif
7307  TCW_4(__kmp_init_parallel, FALSE);
7308  }
7309 
7310  if (TCR_4(__kmp_init_middle)) {
7311 #if KMP_AFFINITY_SUPPORTED
7312  __kmp_affinity_uninitialize();
7313 #endif /* KMP_AFFINITY_SUPPORTED */
7314  __kmp_cleanup_hierarchy();
7315  TCW_4(__kmp_init_middle, FALSE);
7316  }
7317 
7318  KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
7319 
7320  if (__kmp_init_serial) {
7321  __kmp_runtime_destroy();
7322  __kmp_init_serial = FALSE;
7323  }
7324 
7325  __kmp_cleanup_threadprivate_caches();
7326 
7327  for (f = 0; f < __kmp_threads_capacity; f++) {
7328  if (__kmp_root[f] != NULL) {
7329  __kmp_free(__kmp_root[f]);
7330  __kmp_root[f] = NULL;
7331  }
7332  }
7333  __kmp_free(__kmp_threads);
7334  // __kmp_threads and __kmp_root were allocated at once, as a single block, so
7335  // there is no need to free __kmp_root separately.
7336  __kmp_threads = NULL;
7337  __kmp_root = NULL;
7338  __kmp_threads_capacity = 0;
7339 
7340 #if KMP_USE_DYNAMIC_LOCK
7341  __kmp_cleanup_indirect_user_locks();
7342 #else
7343  __kmp_cleanup_user_locks();
7344 #endif
7345 
7346 #if KMP_AFFINITY_SUPPORTED
7347  KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
7348  __kmp_cpuinfo_file = NULL;
7349 #endif /* KMP_AFFINITY_SUPPORTED */
7350 
7351 #if KMP_USE_ADAPTIVE_LOCKS
7352 #if KMP_DEBUG_ADAPTIVE_LOCKS
7353  __kmp_print_speculative_stats();
7354 #endif
7355 #endif
7356  KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
7357  __kmp_nested_nth.nth = NULL;
7358  __kmp_nested_nth.size = 0;
7359  __kmp_nested_nth.used = 0;
7360  KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
7361  __kmp_nested_proc_bind.bind_types = NULL;
7362  __kmp_nested_proc_bind.size = 0;
7363  __kmp_nested_proc_bind.used = 0;
7364 
7365  __kmp_i18n_catclose();
7366 
7367 #if KMP_STATS_ENABLED
7368  __kmp_stats_fini();
7369 #endif
7370 
7371  KA_TRACE(10, ("__kmp_cleanup: exit\n"));
7372 }
7373 
7374 /* ------------------------------------------------------------------------ */
7375 
7376 int __kmp_ignore_mppbeg(void) {
7377  char *env;
7378 
7379  if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
7380  if (__kmp_str_match_false(env))
7381  return FALSE;
7382  }
7383  // By default, __kmpc_begin() is a no-op.
7384  return TRUE;
7385 }
7386 
7387 int __kmp_ignore_mppend(void) {
7388  char *env;
7389 
7390  if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
7391  if (__kmp_str_match_false(env))
7392  return FALSE;
7393  }
7394  // By default, __kmpc_end() is a no-op.
7395  return TRUE;
7396 }
7397 
7398 void __kmp_internal_begin(void) {
7399  int gtid;
7400  kmp_root_t *root;
7401 
7402  /* this is a very important step as it will register new sibling threads
7403  and assign these new uber threads a new gtid */
7404  gtid = __kmp_entry_gtid();
7405  root = __kmp_threads[gtid]->th.th_root;
7406  KMP_ASSERT(KMP_UBER_GTID(gtid));
7407 
7408  if (root->r.r_begin)
7409  return;
7410  __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
7411  if (root->r.r_begin) {
7412  __kmp_release_lock(&root->r.r_begin_lock, gtid);
7413  return;
7414  }
7415 
7416  root->r.r_begin = TRUE;
7417 
7418  __kmp_release_lock(&root->r.r_begin_lock, gtid);
7419 }
7420 
7421 /* ------------------------------------------------------------------------ */
7422 
7423 void __kmp_user_set_library(enum library_type arg) {
7424  int gtid;
7425  kmp_root_t *root;
7426  kmp_info_t *thread;
7427 
7428  /* first, make sure we are initialized so we can get our gtid */
7429 
7430  gtid = __kmp_entry_gtid();
7431  thread = __kmp_threads[gtid];
7432 
7433  root = thread->th.th_root;
7434 
7435  KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
7436  library_serial));
7437  if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
7438  thread */
7439  KMP_WARNING(SetLibraryIncorrectCall);
7440  return;
7441  }
7442 
7443  switch (arg) {
7444  case library_serial:
7445  thread->th.th_set_nproc = 0;
7446  set__nproc(thread, 1);
7447  break;
7448  case library_turnaround:
7449  thread->th.th_set_nproc = 0;
7450  set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7451  : __kmp_dflt_team_nth_ub);
7452  break;
7453  case library_throughput:
7454  thread->th.th_set_nproc = 0;
7455  set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7456  : __kmp_dflt_team_nth_ub);
7457  break;
7458  default:
7459  KMP_FATAL(UnknownLibraryType, arg);
7460  }
7461 
7462  __kmp_aux_set_library(arg);
7463 }
7464 
7465 void __kmp_aux_set_stacksize(size_t arg) {
7466  if (!__kmp_init_serial)
7467  __kmp_serial_initialize();
7468 
7469 #if KMP_OS_DARWIN
7470  if (arg & (0x1000 - 1)) {
7471  arg &= ~(0x1000 - 1);
7472  if (arg + 0x1000) /* check for overflow if we round up */
7473  arg += 0x1000;
7474  }
7475 #endif
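/* Illustrative effect of the rounding above: 0x1000 is the 4 KiB page size,
   so an unaligned request is rounded up to the next page boundary, e.g.

       arg = 0x12345;            // not page aligned
       arg &= ~(0x1000 - 1);     // -> 0x12000
       arg += 0x1000;            // -> 0x13000

   the "arg + 0x1000" test only skips the bump if the addition would wrap. */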
7476  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7477 
7478  /* only change the default stacksize before the first parallel region */
7479  if (!TCR_4(__kmp_init_parallel)) {
7480  size_t value = arg; /* argument is in bytes */
7481 
7482  if (value < __kmp_sys_min_stksize)
7483  value = __kmp_sys_min_stksize;
7484  else if (value > KMP_MAX_STKSIZE)
7485  value = KMP_MAX_STKSIZE;
7486 
7487  __kmp_stksize = value;
7488 
7489  __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
7490  }
7491 
7492  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7493 }
7494 
7495 /* set the behaviour of the runtime library */
7496 /* TODO this can cause some odd behaviour with sibling parallelism... */
7497 void __kmp_aux_set_library(enum library_type arg) {
7498  __kmp_library = arg;
7499 
7500  switch (__kmp_library) {
7501  case library_serial: {
7502  KMP_INFORM(LibraryIsSerial);
7503  (void)__kmp_change_library(TRUE);
7504  } break;
7505  case library_turnaround:
7506  (void)__kmp_change_library(TRUE);
7507  break;
7508  case library_throughput:
7509  (void)__kmp_change_library(FALSE);
7510  break;
7511  default:
7512  KMP_FATAL(UnknownLibraryType, arg);
7513  }
7514 }
7515 
7516 /* ------------------------------------------------------------------------ */
7517 
7518 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
7519  int blocktime = arg; /* argument is in milliseconds */
7520 #if KMP_USE_MONITOR
7521  int bt_intervals;
7522 #endif
7523  int bt_set;
7524 
7525  __kmp_save_internal_controls(thread);
7526 
7527  /* Normalize and set blocktime for the teams */
7528  if (blocktime < KMP_MIN_BLOCKTIME)
7529  blocktime = KMP_MIN_BLOCKTIME;
7530  else if (blocktime > KMP_MAX_BLOCKTIME)
7531  blocktime = KMP_MAX_BLOCKTIME;
7532 
7533  set__blocktime_team(thread->th.th_team, tid, blocktime);
7534  set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
7535 
7536 #if KMP_USE_MONITOR
7537  /* Calculate and set blocktime intervals for the teams */
7538  bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
7539 
7540  set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
7541  set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
7542 #endif
7543 
7544  /* Record that blocktime has been explicitly set */
7545  bt_set = TRUE;
7546 
7547  set__bt_set_team(thread->th.th_team, tid, bt_set);
7548  set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
7549 #if KMP_USE_MONITOR
7550  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
7551  "bt_intervals=%d, monitor_updates=%d\n",
7552  __kmp_gtid_from_tid(tid, thread->th.th_team),
7553  thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
7554  __kmp_monitor_wakeups));
7555 #else
7556  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
7557  __kmp_gtid_from_tid(tid, thread->th.th_team),
7558  thread->th.th_team->t.t_id, tid, blocktime));
7559 #endif
7560 }
7561 
7562 void __kmp_aux_set_defaults(char const *str, int len) {
7563  if (!__kmp_init_serial) {
7564  __kmp_serial_initialize();
7565  }
7566  __kmp_env_initialize(str);
7567 
7568  if (__kmp_settings
7569 #if OMP_40_ENABLED
7570  || __kmp_display_env || __kmp_display_env_verbose
7571 #endif // OMP_40_ENABLED
7572  ) {
7573  __kmp_env_print();
7574  }
7575 } // __kmp_aux_set_defaults
7576 
7577 /* ------------------------------------------------------------------------ */
7578 /* internal fast reduction routines */
7579 
7580 PACKED_REDUCTION_METHOD_T
7581 __kmp_determine_reduction_method(
7582  ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
7583  void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
7584  kmp_critical_name *lck) {
7585 
7586  // Default reduction method: critical construct ( lck != NULL, like in current
7587  // PAROPT )
7588  // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
7589  // can be selected by RTL
7590  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
7591  // can be selected by RTL
7592  // Finally, it's up to the OpenMP RTL to decide which method to select
7593  // among those generated by PAROPT.
7594 
7595  PACKED_REDUCTION_METHOD_T retval;
7596 
7597  int team_size;
7598 
7599  KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
7600  KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
7601 
7602 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED \
7603  ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
7604 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
7605 
7606  retval = critical_reduce_block;
7607 
7608  // another way of getting the team size (with 1 dynamic dereference) is slower
7609  team_size = __kmp_get_team_num_threads(global_tid);
7610  if (team_size == 1) {
7611 
7612  retval = empty_reduce_block;
7613 
7614  } else {
7615 
7616  int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
7617  int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
7618 
7619 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64
7620 
7621 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_WINDOWS || \
7622  KMP_OS_DARWIN
7623 
7624  int teamsize_cutoff = 4;
7625 
7626 #if KMP_MIC_SUPPORTED
7627  if (__kmp_mic_type != non_mic) {
7628  teamsize_cutoff = 8;
7629  }
7630 #endif
7631  if (tree_available) {
7632  if (team_size <= teamsize_cutoff) {
7633  if (atomic_available) {
7634  retval = atomic_reduce_block;
7635  }
7636  } else {
7637  retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
7638  }
7639  } else if (atomic_available) {
7640  retval = atomic_reduce_block;
7641  }
7642 #else
7643 #error "Unknown or unsupported OS"
7644 #endif // KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_WINDOWS ||
7645 // KMP_OS_DARWIN
7646 
7647 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
7648 
7649 #if KMP_OS_LINUX || KMP_OS_WINDOWS
7650 
7651  // basic tuning
7652 
7653  if (atomic_available) {
7654  if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
7655  retval = atomic_reduce_block;
7656  }
7657  } // otherwise: use critical section
7658 
7659 #elif KMP_OS_DARWIN
7660 
7661  if (atomic_available && (num_vars <= 3)) {
7662  retval = atomic_reduce_block;
7663  } else if (tree_available) {
7664  if ((reduce_size > (9 * sizeof(kmp_real64))) &&
7665  (reduce_size < (2000 * sizeof(kmp_real64)))) {
7666  retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
7667  }
7668  } // otherwise: use critical section
7669 
7670 #else
7671 #error "Unknown or unsupported OS"
7672 #endif
7673 
7674 #else
7675 #error "Unknown or unsupported architecture"
7676 #endif
7677  }
7678 
7679  // KMP_FORCE_REDUCTION
7680 
7681  // If the team is serialized (team_size == 1), ignore the forced reduction
7682  // method and stay with the unsynchronized method (empty_reduce_block)
7683  if (__kmp_force_reduction_method != reduction_method_not_defined &&
7684  team_size != 1) {
7685 
7686  PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
7687 
7688  int atomic_available, tree_available;
7689 
7690  switch ((forced_retval = __kmp_force_reduction_method)) {
7691  case critical_reduce_block:
7692  KMP_ASSERT(lck); // lck should be != 0
7693  break;
7694 
7695  case atomic_reduce_block:
7696  atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
7697  if (!atomic_available) {
7698  KMP_WARNING(RedMethodNotSupported, "atomic");
7699  forced_retval = critical_reduce_block;
7700  }
7701  break;
7702 
7703  case tree_reduce_block:
7704  tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
7705  if (!tree_available) {
7706  KMP_WARNING(RedMethodNotSupported, "tree");
7707  forced_retval = critical_reduce_block;
7708  } else {
7709 #if KMP_FAST_REDUCTION_BARRIER
7710  forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
7711 #endif
7712  }
7713  break;
7714 
7715  default:
7716  KMP_ASSERT(0); // "unsupported method specified"
7717  }
7718 
7719  retval = forced_retval;
7720  }
7721 
7722  KA_TRACE(10, ("reduction method selected=%08x\n", retval));
7723 
7724 #undef FAST_REDUCTION_TREE_METHOD_GENERATED
7725 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
7726 
7727  return (retval);
7728 }
7729 
7730 // this function is for testing set/get/determine reduce method
7731 kmp_int32 __kmp_get_reduce_method(void) {
7732  return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
7733 }
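/* A minimal sketch of why the ">> 8" above recovers the method: the packed
   value keeps the barrier kind in the low byte and the reduction method in
   the bits above it (the exact packing macros live in kmp.h; this layout is
   stated here as an assumption for illustration):

       packed = (method << 8) | barrier_type;
       method = packed >> 8;     // what __kmp_get_reduce_method() returns

   so two tree reductions that differ only in barrier type report the same
   method. */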