1 /*
2  * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // The LLVM Compiler Infrastructure
8 //
9 // This file is dual licensed under the MIT and the University of Illinois Open
10 // Source Licenses. See LICENSE.txt for details.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 /* Dynamic scheduling initialization and dispatch.
15  *
16  * NOTE: __kmp_nth is a constant inside of any dispatch loop, however
17  * it may change values between parallel regions. __kmp_max_nth
18  * is the largest value __kmp_nth may take, 1 is the smallest.
19  */
20 
21 // Need to raise Win version from XP to Vista here for support of
22 // InterlockedExchange64
23 #if defined(_WIN32_WINNT) && defined(_M_IX86)
24 #undef _WIN32_WINNT
25 #define _WIN32_WINNT 0x0502
26 #endif
27 
28 #include "kmp.h"
29 #include "kmp_error.h"
30 #include "kmp_i18n.h"
31 #include "kmp_itt.h"
32 #include "kmp_stats.h"
33 #include "kmp_str.h"
34 #if KMP_OS_WINDOWS && KMP_ARCH_X86
35 #include <float.h>
36 #endif
37 
38 #if OMPT_SUPPORT
39 #include "ompt-specific.h"
40 #endif
41 
42 /* ------------------------------------------------------------------------ */
43 
44 #if KMP_STATIC_STEAL_ENABLED
45 
46 // replaces dispatch_private_info{32,64} structures and
47 // dispatch_private_info{32,64}_t types
48 template <typename T> struct dispatch_private_infoXX_template {
49  typedef typename traits_t<T>::unsigned_t UT;
50  typedef typename traits_t<T>::signed_t ST;
51  UT count; // unsigned
52  T ub;
53  /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
54  T lb;
55  ST st; // signed
56  UT tc; // unsigned
57  T static_steal_counter; // for static_steal only; maybe better to put after ub
58 
59  /* parm[1-4] are used in different ways by different scheduling algorithms */
60 
61  // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
62  // a) parm3 is properly aligned and
63  // b) all parm1-4 are in the same cache line.
64  // Because parm1-4 are used together, performance seems to be better
65  // if they are in the same line (not measured though).
66 
67  struct KMP_ALIGN(32) { // compiler does not accept sizeof(T)*4
68  T parm1;
69  T parm2;
70  T parm3;
71  T parm4;
72  };
73 
74  UT ordered_lower; // unsigned
75  UT ordered_upper; // unsigned
76 #if KMP_OS_WINDOWS
77  T last_upper;
78 #endif /* KMP_OS_WINDOWS */
79 };
80 
81 #else /* KMP_STATIC_STEAL_ENABLED */
82 
83 // replaces dispatch_private_info{32,64} structures and
84 // dispatch_private_info{32,64}_t types
85 template <typename T> struct dispatch_private_infoXX_template {
86  typedef typename traits_t<T>::unsigned_t UT;
87  typedef typename traits_t<T>::signed_t ST;
88  T lb;
89  T ub;
90  ST st; // signed
91  UT tc; // unsigned
92 
93  T parm1;
94  T parm2;
95  T parm3;
96  T parm4;
97 
98  UT count; // unsigned
99 
100  UT ordered_lower; // unsigned
101  UT ordered_upper; // unsigned
102 #if KMP_OS_WINDOWS
103  T last_upper;
104 #endif /* KMP_OS_WINDOWS */
105 };
106 
107 #endif /* KMP_STATIC_STEAL_ENABLED */
108 
109 // replaces dispatch_private_info structure and dispatch_private_info_t type
110 template <typename T> struct KMP_ALIGN_CACHE dispatch_private_info_template {
111  // duplicate alignment here, otherwise size of structure is not correct in our
112  // compiler
113  union KMP_ALIGN_CACHE private_info_tmpl {
114  dispatch_private_infoXX_template<T> p;
115  dispatch_private_info64_t p64;
116  } u;
117  enum sched_type schedule; /* scheduling algorithm */
118  kmp_uint32 ordered; /* ordered clause specified */
119  kmp_uint32 ordered_bumped;
120  // To retain the structure size after making ordered_iteration scalar
121  kmp_int32 ordered_dummy[KMP_MAX_ORDERED - 3];
122  dispatch_private_info *next; /* stack of buffers for nest of serial regions */
123  kmp_uint32 nomerge; /* don't merge iters if serialized */
124  kmp_uint32 type_size;
125  enum cons_type pushed_ws;
126 };
127 
128 // replaces dispatch_shared_info{32,64} structures and
129 // dispatch_shared_info{32,64}_t types
130 template <typename UT> struct dispatch_shared_infoXX_template {
131  /* chunk index under dynamic, number of idle threads under static-steal;
132  iteration index otherwise */
133  volatile UT iteration;
134  volatile UT num_done;
135  volatile UT ordered_iteration;
136  // to retain the structure size making ordered_iteration scalar
137  UT ordered_dummy[KMP_MAX_ORDERED - 3];
138 };
139 
140 // replaces dispatch_shared_info structure and dispatch_shared_info_t type
141 template <typename UT> struct dispatch_shared_info_template {
142  // we need union here to keep the structure size
143  union shared_info_tmpl {
144  dispatch_shared_infoXX_template<UT> s;
145  dispatch_shared_info64_t s64;
146  } u;
147  volatile kmp_uint32 buffer_index;
148 #if OMP_45_ENABLED
149  volatile kmp_int32 doacross_buf_idx; // teamwise index
150  kmp_uint32 *doacross_flags; // array of iteration flags (0/1)
151  kmp_int32 doacross_num_done; // count finished threads
152 #endif
153 #if KMP_USE_HWLOC
154  // When linking with libhwloc, the ORDERED EPCC test slows down on big
155  // machines (> 48 cores). Performance analysis showed that a cache thrash
156  // was occurring and this padding helps alleviate the problem.
157  char padding[64];
158 #endif
159 };
160 
161 /* ------------------------------------------------------------------------ */
162 
163 #undef USE_TEST_LOCKS
164 
165 // test_then_add template (general template should NOT be used)
166 template <typename T> static __forceinline T test_then_add(volatile T *p, T d);
167 
168 template <>
169 __forceinline kmp_int32 test_then_add<kmp_int32>(volatile kmp_int32 *p,
170  kmp_int32 d) {
171  kmp_int32 r;
172  r = KMP_TEST_THEN_ADD32(p, d);
173  return r;
174 }
175 
176 template <>
177 __forceinline kmp_int64 test_then_add<kmp_int64>(volatile kmp_int64 *p,
178  kmp_int64 d) {
179  kmp_int64 r;
180  r = KMP_TEST_THEN_ADD64(p, d);
181  return r;
182 }
183 
184 // test_then_inc_acq template (general template should NOT be used)
185 template <typename T> static __forceinline T test_then_inc_acq(volatile T *p);
186 
187 template <>
188 __forceinline kmp_int32 test_then_inc_acq<kmp_int32>(volatile kmp_int32 *p) {
189  kmp_int32 r;
190  r = KMP_TEST_THEN_INC_ACQ32(p);
191  return r;
192 }
193 
194 template <>
195 __forceinline kmp_int64 test_then_inc_acq<kmp_int64>(volatile kmp_int64 *p) {
196  kmp_int64 r;
197  r = KMP_TEST_THEN_INC_ACQ64(p);
198  return r;
199 }
200 
201 // test_then_inc template (general template should NOT be used)
202 template <typename T> static __forceinline T test_then_inc(volatile T *p);
203 
204 template <>
205 __forceinline kmp_int32 test_then_inc<kmp_int32>(volatile kmp_int32 *p) {
206  kmp_int32 r;
207  r = KMP_TEST_THEN_INC32(p);
208  return r;
209 }
210 
211 template <>
212 __forceinline kmp_int64 test_then_inc<kmp_int64>(volatile kmp_int64 *p) {
213  kmp_int64 r;
214  r = KMP_TEST_THEN_INC64(p);
215  return r;
216 }
217 
218 // compare_and_swap template (general template should NOT be used)
219 template <typename T>
220 static __forceinline kmp_int32 compare_and_swap(volatile T *p, T c, T s);
221 
222 template <>
223 __forceinline kmp_int32 compare_and_swap<kmp_int32>(volatile kmp_int32 *p,
224  kmp_int32 c, kmp_int32 s) {
225  return KMP_COMPARE_AND_STORE_REL32(p, c, s);
226 }
227 
228 template <>
229 __forceinline kmp_int32 compare_and_swap<kmp_int64>(volatile kmp_int64 *p,
230  kmp_int64 c, kmp_int64 s) {
231  return KMP_COMPARE_AND_STORE_REL64(p, c, s);
232 }
233 
234 /* Spin wait loop that first does pause, then yield.
235  Waits until function returns non-zero when called with *spinner and check.
236  Does NOT put threads to sleep.
237  Arguments:
238  UT is unsigned 4- or 8-byte type
239  spinner - memory location to check value
240  checker - value to which *spinner is compared using >, <, ==, etc.
241  pred - predicate function to perform binary comparison of some sort
242 #if USE_ITT_BUILD
243  obj -- is higher-level synchronization object to report to ittnotify.
244  It is used to report locks consistently. For example, if lock is
245  acquired immediately, its address is reported to ittnotify via
246  KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired immediately
247  and the lock routine calls KMP_WAIT_YIELD(), the latter should report the
248  same address, not the address of the low-level spinner.
249 #endif // USE_ITT_BUILD
250  TODO: make inline function (move to header file for icl)
251 */
252 template <typename UT>
253 static UT __kmp_wait_yield(volatile UT *spinner, UT checker,
254  kmp_uint32 (*pred)(UT, UT)
255  USE_ITT_BUILD_ARG(void *obj)) {
256  // note: we may not belong to a team at this point
257  volatile UT *spin = spinner;
258  UT check = checker;
259  kmp_uint32 spins;
260  kmp_uint32 (*f)(UT, UT) = pred;
261  UT r;
262 
263  KMP_FSYNC_SPIN_INIT(obj, CCAST(UT *, spin));
264  KMP_INIT_YIELD(spins);
265  // main wait spin loop
266  while (!f(r = *spin, check)) {
267  KMP_FSYNC_SPIN_PREPARE(obj);
268  /* GEH - remove this since it was accidentally introduced when kmp_wait was
269  split. It causes problems with infinite recursion because of exit lock */
270  /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
271  __kmp_abort_thread(); */
272 
273  // If we are oversubscribed, or have waited a bit (and
274  // KMP_LIBRARY=throughput), then yield. The pause is in the following code.
275  KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
276  KMP_YIELD_SPIN(spins);
277  }
278  KMP_FSYNC_SPIN_ACQUIRED(obj);
279  return r;
280 }
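// A typical use in this file waits until the shared ordered-iteration counter
// reaches this thread's lower bound, for example:
//   __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
//                        __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));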
281 
282 template <typename UT> static kmp_uint32 __kmp_eq(UT value, UT checker) {
283  return value == checker;
284 }
285 
286 template <typename UT> static kmp_uint32 __kmp_neq(UT value, UT checker) {
287  return value != checker;
288 }
289 
290 template <typename UT> static kmp_uint32 __kmp_lt(UT value, UT checker) {
291  return value < checker;
292 }
293 
294 template <typename UT> static kmp_uint32 __kmp_ge(UT value, UT checker) {
295  return value >= checker;
296 }
297 
298 template <typename UT> static kmp_uint32 __kmp_le(UT value, UT checker) {
299  return value <= checker;
300 }
301 
302 /* ------------------------------------------------------------------------ */
303 
304 static void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref,
305  ident_t *loc_ref) {
306  kmp_info_t *th;
307 
308  KMP_DEBUG_ASSERT(gtid_ref);
309 
310  if (__kmp_env_consistency_check) {
311  th = __kmp_threads[*gtid_ref];
312  if (th->th.th_root->r.r_active &&
313  (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
314 #if KMP_USE_DYNAMIC_LOCK
315  __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
316 #else
317  __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
318 #endif
319  }
320  }
321 }
322 
323 template <typename UT>
324 static void __kmp_dispatch_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
325  typedef typename traits_t<UT>::signed_t ST;
326  dispatch_private_info_template<UT> *pr;
327 
328  int gtid = *gtid_ref;
329  // int cid = *cid_ref;
330  kmp_info_t *th = __kmp_threads[gtid];
331  KMP_DEBUG_ASSERT(th->th.th_dispatch);
332 
333  KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid));
334  if (__kmp_env_consistency_check) {
335  pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
336  th->th.th_dispatch->th_dispatch_pr_current);
337  if (pr->pushed_ws != ct_none) {
338 #if KMP_USE_DYNAMIC_LOCK
339  __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL, 0);
340 #else
341  __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL);
342 #endif
343  }
344  }
345 
346  if (!th->th.th_team->t.t_serialized) {
347  dispatch_shared_info_template<UT> *sh =
348  reinterpret_cast<dispatch_shared_info_template<UT> *>(
349  th->th.th_dispatch->th_dispatch_sh_current);
350  UT lower;
351 
352  if (!__kmp_env_consistency_check) {
353  pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
354  th->th.th_dispatch->th_dispatch_pr_current);
355  }
356  lower = pr->u.p.ordered_lower;
357 
358 #if !defined(KMP_GOMP_COMPAT)
359  if (__kmp_env_consistency_check) {
360  if (pr->ordered_bumped) {
361  struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
362  __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
363  ct_ordered_in_pdo, loc_ref,
364  &p->stack_data[p->w_top]);
365  }
366  }
367 #endif /* !defined(KMP_GOMP_COMPAT) */
368 
369  KMP_MB();
370 #ifdef KMP_DEBUG
371  {
372  char *buff;
373  // create format specifiers before the debug output
374  buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d before wait: "
375  "ordered_iter:%%%s lower:%%%s\n",
376  traits_t<UT>::spec, traits_t<UT>::spec);
377  KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
378  __kmp_str_free(&buff);
379  }
380 #endif
381 
382  __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
383  __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
384  KMP_MB(); /* is this necessary? */
385 #ifdef KMP_DEBUG
386  {
387  char *buff;
388  // create format specifiers before the debug output
389  buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d after wait: "
390  "ordered_iter:%%%s lower:%%%s\n",
391  traits_t<UT>::spec, traits_t<UT>::spec);
392  KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
393  __kmp_str_free(&buff);
394  }
395 #endif
396  }
397  KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid));
398 }
399 
400 static void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref,
401  ident_t *loc_ref) {
402  kmp_info_t *th;
403 
404  if (__kmp_env_consistency_check) {
405  th = __kmp_threads[*gtid_ref];
406  if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
407  __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
408  }
409  }
410 }
411 
412 template <typename UT>
413 static void __kmp_dispatch_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
414  typedef typename traits_t<UT>::signed_t ST;
415  dispatch_private_info_template<UT> *pr;
416 
417  int gtid = *gtid_ref;
418  // int cid = *cid_ref;
419  kmp_info_t *th = __kmp_threads[gtid];
420  KMP_DEBUG_ASSERT(th->th.th_dispatch);
421 
422  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid));
423  if (__kmp_env_consistency_check) {
424  pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
425  th->th.th_dispatch->th_dispatch_pr_current);
426  if (pr->pushed_ws != ct_none) {
427  __kmp_pop_sync(gtid, ct_ordered_in_pdo, loc_ref);
428  }
429  }
430 
431  if (!th->th.th_team->t.t_serialized) {
432  dispatch_shared_info_template<UT> *sh =
433  reinterpret_cast<dispatch_shared_info_template<UT> *>(
434  th->th.th_dispatch->th_dispatch_sh_current);
435 
436  if (!__kmp_env_consistency_check) {
437  pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
438  th->th.th_dispatch->th_dispatch_pr_current);
439  }
440 
441  KMP_FSYNC_RELEASING(CCAST(UT *, &sh->u.s.ordered_iteration));
442 #if !defined(KMP_GOMP_COMPAT)
443  if (__kmp_env_consistency_check) {
444  if (pr->ordered_bumped != 0) {
445  struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
446  /* How to test it? - OM */
447  __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
448  ct_ordered_in_pdo, loc_ref,
449  &p->stack_data[p->w_top]);
450  }
451  }
452 #endif /* !defined(KMP_GOMP_COMPAT) */
453 
454  KMP_MB(); /* Flush all pending memory write invalidates. */
455 
456  pr->ordered_bumped += 1;
457 
458  KD_TRACE(1000,
459  ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
460  gtid, pr->ordered_bumped));
461 
462  KMP_MB(); /* Flush all pending memory write invalidates. */
463 
464  /* TODO use general release procedure? */
465  test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
466 
467  KMP_MB(); /* Flush all pending memory write invalidates. */
468  }
469  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid));
470 }
471 
472 // Computes and returns x to the power of y, where y must be a non-negative integer
473 template <typename UT>
474 static __forceinline long double __kmp_pow(long double x, UT y) {
475  long double s = 1.0L;
476 
477  KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
478  // KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
479  while (y) {
480  if (y & 1)
481  s *= x;
482  x *= x;
483  y >>= 1;
484  }
485  return s;
486 }
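// __kmp_pow uses binary (square-and-multiply) exponentiation, so it needs only
// O(log y) multiplications; e.g. x^5 = x * (x^2)^2. Callers pass the guided
// scheduling base x = 1 - 1/(2*nproc), which is why the assert restricts x to
// (0, 1).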
487 
488 /* Computes and returns the number of unassigned iterations after idx chunks
489  have been assigned (the total number of unassigned iterations in chunks with
490  index greater than or equal to idx). __forceinline seems to be broken here:
491  if this function is marked __forceinline, the behavior is wrong
492  (one of the unit tests, sch_guided_analytical_basic.cpp, fails). */
493 template <typename T>
494 static __inline typename traits_t<T>::unsigned_t
495 __kmp_dispatch_guided_remaining(T tc, typename traits_t<T>::floating_t base,
496  typename traits_t<T>::unsigned_t idx) {
497  /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at least for
498  ICL 8.1, long double arithmetic may not really have long double precision,
499  even with /Qlong_double. Currently, we workaround that in the caller code,
500  by manipulating the FPCW for Windows* OS on IA-32 architecture. The lack
501  of precision is not expected to be a correctness issue, though. */
502  typedef typename traits_t<T>::unsigned_t UT;
503 
504  long double x = tc * __kmp_pow<UT>(base, idx);
505  UT r = (UT)x;
506  if (x == r)
507  return r;
508  return r + 1;
509 }
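// For illustration (hypothetical values): with tc = 1000, nproc = 4 (so
// base = 1 - 1/(2*4) = 0.875) and idx = 3, the estimate is
// 1000 * 0.875^3 = 669.92..., so 670 iterations are still unassigned.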
510 
511 // Parameters of the guided-iterative algorithm:
512 // p2 = n * nproc * ( chunk + 1 ) // point of switching to dynamic
513 // p3 = 1 / ( n * nproc ) // remaining iterations multiplier
514 // By default n = 2. For example, with n = 3 the chunk distribution will be
515 // flatter.
516 // With n = 1 the first chunk is the same as for a static schedule, i.e. trip / nproc.
517 static int guided_int_param = 2;
518 static double guided_flt_param = 0.5; // = 1.0 / guided_int_param;
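// For illustration (hypothetical values): with the default n = 2, nproc = 8 and
// chunk = 3, p2 = 2 * 8 * (3 + 1) = 64 and p3 = 1 / (2 * 8) = 0.0625, i.e. each
// grab takes roughly remaining/16 iterations until fewer than 64 iterations
// remain, after which the schedule degenerates to dynamic with the given chunk.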
519 
520 // UT - unsigned flavor of T, ST - signed flavor of T,
521 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
522 template <typename T>
523 static void
524 __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
525  T ub, typename traits_t<T>::signed_t st,
526  typename traits_t<T>::signed_t chunk, int push_ws) {
527  typedef typename traits_t<T>::unsigned_t UT;
528  typedef typename traits_t<T>::signed_t ST;
529  typedef typename traits_t<T>::floating_t DBL;
530 
531  int active;
532  T tc;
533  kmp_info_t *th;
534  kmp_team_t *team;
535  kmp_uint32 my_buffer_index;
536  dispatch_private_info_template<T> *pr;
537  dispatch_shared_info_template<UT> volatile *sh;
538 
539  KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
540  sizeof(dispatch_private_info));
541  KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
542  sizeof(dispatch_shared_info));
543 
544  if (!TCR_4(__kmp_init_parallel))
545  __kmp_parallel_initialize();
546 
547 #if INCLUDE_SSC_MARKS
548  SSC_MARK_DISPATCH_INIT();
549 #endif
550 #ifdef KMP_DEBUG
551  {
552  char *buff;
553  // create format specifiers before the debug output
554  buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
555  "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
556  traits_t<ST>::spec, traits_t<T>::spec,
557  traits_t<T>::spec, traits_t<ST>::spec);
558  KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
559  __kmp_str_free(&buff);
560  }
561 #endif
562  /* setup data */
563  th = __kmp_threads[gtid];
564  team = th->th.th_team;
565  active = !team->t.t_serialized;
566  th->th.th_ident = loc;
567 
568 #if USE_ITT_BUILD
569  kmp_uint64 cur_chunk = chunk;
570  int itt_need_metadata_reporting = __itt_metadata_add_ptr &&
571  __kmp_forkjoin_frames_mode == 3 &&
572  KMP_MASTER_GTID(gtid) &&
573 #if OMP_40_ENABLED
574  th->th.th_teams_microtask == NULL &&
575 #endif
576  team->t.t_active_level == 1;
577 #endif
578  if (!active) {
579  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
580  th->th.th_dispatch->th_disp_buffer); /* top of the stack */
581  } else {
582  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
583  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
584 
585  my_buffer_index = th->th.th_dispatch->th_disp_index++;
586 
587  /* What happens when the number of threads changes? Do we need to resize the buffer? */
588  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
589  &th->th.th_dispatch
590  ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
591  sh = reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
592  &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
593  }
594 
595 #if (KMP_STATIC_STEAL_ENABLED)
596  if (SCHEDULE_HAS_NONMONOTONIC(schedule))
597  // AC: we now have only one implementation of stealing, so use it
598  schedule = kmp_sch_static_steal;
599  else
600 #endif
601  schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
602 
603  /* Pick up the nomerge/ordered bits from the scheduling type */
604  if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
605  pr->nomerge = TRUE;
606  schedule =
607  (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
608  } else {
609  pr->nomerge = FALSE;
610  }
611  pr->type_size = traits_t<T>::type_size; // remember the size of variables
612  if (kmp_ord_lower & schedule) {
613  pr->ordered = TRUE;
614  schedule =
615  (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
616  } else {
617  pr->ordered = FALSE;
618  }
619 
620  if (schedule == kmp_sch_static) {
621  schedule = __kmp_static;
622  } else {
623  if (schedule == kmp_sch_runtime) {
624  // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
625  // not specified)
626  schedule = team->t.t_sched.r_sched_type;
627  // Detail the schedule if needed (global controls are differentiated
628  // appropriately)
629  if (schedule == kmp_sch_guided_chunked) {
630  schedule = __kmp_guided;
631  } else if (schedule == kmp_sch_static) {
632  schedule = __kmp_static;
633  }
634  // Use the chunk size specified by OMP_SCHEDULE (or default if not
635  // specified)
636  chunk = team->t.t_sched.chunk;
637 #if USE_ITT_BUILD
638  cur_chunk = chunk;
639 #endif
640 #ifdef KMP_DEBUG
641  {
642  char *buff;
643  // create format specifiers before the debug output
644  buff = __kmp_str_format(
645  "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n",
646  traits_t<ST>::spec);
647  KD_TRACE(10, (buff, gtid, schedule, chunk));
648  __kmp_str_free(&buff);
649  }
650 #endif
651  } else {
652  if (schedule == kmp_sch_guided_chunked) {
653  schedule = __kmp_guided;
654  }
655  if (chunk <= 0) {
656  chunk = KMP_DEFAULT_CHUNK;
657  }
658  }
659 
660  if (schedule == kmp_sch_auto) {
661  // mapping and differentiation are done in __kmp_do_serial_initialize()
662  schedule = __kmp_auto;
663 #ifdef KMP_DEBUG
664  {
665  char *buff;
666  // create format specifiers before the debug output
667  buff = __kmp_str_format("__kmp_dispatch_init: kmp_sch_auto: T#%%d new: "
668  "schedule:%%d chunk:%%%s\n",
669  traits_t<ST>::spec);
670  KD_TRACE(10, (buff, gtid, schedule, chunk));
671  __kmp_str_free(&buff);
672  }
673 #endif
674  }
675 
676  /* guided analytical not safe for too many threads */
677  if (schedule == kmp_sch_guided_analytical_chunked &&
678  th->th.th_team_nproc > 1 << 20) {
679  schedule = kmp_sch_guided_iterative_chunked;
680  KMP_WARNING(DispatchManyThreads);
681  }
682  if (schedule == kmp_sch_runtime_simd) {
683  // compiler provides simd_width in the chunk parameter
684  schedule = team->t.t_sched.r_sched_type;
685  // Detail the schedule if needed (global controls are differentiated
686  // appropriately)
687  if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
688  schedule == __kmp_static) {
689  schedule = kmp_sch_static_balanced_chunked;
690  } else {
691  if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
692  schedule = kmp_sch_guided_simd;
693  }
694  chunk = team->t.t_sched.chunk * chunk;
695  }
696 #if USE_ITT_BUILD
697  cur_chunk = chunk;
698 #endif
699 #ifdef KMP_DEBUG
700  {
701  char *buff;
702  // create format specifiers before the debug output
703  buff = __kmp_str_format("__kmp_dispatch_init: T#%%d new: schedule:%%d"
704  " chunk:%%%s\n",
705  traits_t<ST>::spec);
706  KD_TRACE(10, (buff, gtid, schedule, chunk));
707  __kmp_str_free(&buff);
708  }
709 #endif
710  }
711  pr->u.p.parm1 = chunk;
712  }
713  KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
714  "unknown scheduling type");
715 
716  pr->u.p.count = 0;
717 
718  if (__kmp_env_consistency_check) {
719  if (st == 0) {
720  __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
721  (pr->ordered ? ct_pdo_ordered : ct_pdo), loc);
722  }
723  }
724  // compute trip count
725  if (st == 1) { // most common case
726  if (ub >= lb) {
727  tc = ub - lb + 1;
728  } else { // ub < lb
729  tc = 0; // zero-trip
730  }
731  } else if (st < 0) {
732  if (lb >= ub) {
733  // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
734  // where the division needs to be unsigned regardless of the result type
735  tc = (UT)(lb - ub) / (-st) + 1;
736  } else { // lb < ub
737  tc = 0; // zero-trip
738  }
739  } else { // st > 0
740  if (ub >= lb) {
741  // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
742  // where the division needs to be unsigned regardless of the result type
743  tc = (UT)(ub - lb) / st + 1;
744  } else { // ub < lb
745  tc = 0; // zero-trip
746  }
747  }
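  // For illustration (hypothetical values): lb = 2, ub = 10, st = 3 gives
  // tc = (10 - 2) / 3 + 1 = 3 (iterations 2, 5, 8); lb = 10, ub = 1, st = -2
  // gives tc = (10 - 1) / 2 + 1 = 5 (iterations 10, 8, 6, 4, 2).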
748 
749  // Any half-decent optimizer will remove this test when the blocks are empty
750  // since the macros expand to nothing when statistics are disabled.
751  if (schedule == __kmp_static) {
752  KMP_COUNT_BLOCK(OMP_FOR_static);
753  KMP_COUNT_VALUE(FOR_static_iterations, tc);
754  } else {
755  KMP_COUNT_BLOCK(OMP_FOR_dynamic);
756  KMP_COUNT_VALUE(FOR_dynamic_iterations, tc);
757  }
758 
759  pr->u.p.lb = lb;
760  pr->u.p.ub = ub;
761  pr->u.p.st = st;
762  pr->u.p.tc = tc;
763 
764 #if KMP_OS_WINDOWS
765  pr->u.p.last_upper = ub + st;
766 #endif /* KMP_OS_WINDOWS */
767 
768  /* NOTE: only the active parallel region(s) have active ordered sections */
769 
770  if (active) {
771  if (pr->ordered == 0) {
772  th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
773  th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
774  } else {
775  pr->ordered_bumped = 0;
776 
777  pr->u.p.ordered_lower = 1;
778  pr->u.p.ordered_upper = 0;
779 
780  th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
781  th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
782  }
783  }
784 
785  if (__kmp_env_consistency_check) {
786  enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo;
787  if (push_ws) {
788  __kmp_push_workshare(gtid, ws, loc);
789  pr->pushed_ws = ws;
790  } else {
791  __kmp_check_workshare(gtid, ws, loc);
792  pr->pushed_ws = ct_none;
793  }
794  }
795 
796  switch (schedule) {
797 #if (KMP_STATIC_STEAL_ENABLED)
798  case kmp_sch_static_steal: {
799  T nproc = th->th.th_team_nproc;
800  T ntc, init;
801 
802  KD_TRACE(100,
803  ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid));
804 
805  ntc = (tc % chunk ? 1 : 0) + tc / chunk;
806  if (nproc > 1 && ntc >= nproc) {
807  KMP_COUNT_BLOCK(OMP_FOR_static_steal);
808  T id = __kmp_tid_from_gtid(gtid);
809  T small_chunk, extras;
810 
811  small_chunk = ntc / nproc;
812  extras = ntc % nproc;
813 
814  init = id * small_chunk + (id < extras ? id : extras);
815  pr->u.p.count = init;
816  pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
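  // For illustration (hypothetical values): with ntc = 10 chunks and nproc = 4,
  // small_chunk = 2 and extras = 2, so threads 0..3 initially own the chunk
  // index ranges [0,3), [3,6), [6,8) and [8,10) (count = first chunk to take,
  // ub = exclusive upper chunk index).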
817 
818  pr->u.p.parm2 = lb;
819  // pr->pfields.parm3 = 0; // it's not used in static_steal
820  pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
821  pr->u.p.st = st;
822  if (traits_t<T>::type_size > 4) {
823  // AC: TODO: check if 16-byte CAS available and use it to
824  // improve performance (probably wait for explicit request
825  // before spending time on this).
826  // For now use dynamically allocated per-thread lock,
827  // free memory in __kmp_dispatch_next when status==0.
828  KMP_DEBUG_ASSERT(th->th.th_dispatch->th_steal_lock == NULL);
829  th->th.th_dispatch->th_steal_lock =
830  (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
831  __kmp_init_lock(th->th.th_dispatch->th_steal_lock);
832  }
833  break;
834  } else {
835  KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to "
836  "kmp_sch_static_balanced\n",
837  gtid));
838  schedule = kmp_sch_static_balanced;
839  /* too few iterations: fall-through to kmp_sch_static_balanced */
840  } // if
841  /* FALL-THROUGH to static balanced */
842  } // case
843 #endif
844  case kmp_sch_static_balanced: {
845  T nproc = th->th.th_team_nproc;
846  T init, limit;
847 
848  KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n",
849  gtid));
850 
851  if (nproc > 1) {
852  T id = __kmp_tid_from_gtid(gtid);
853 
854  if (tc < nproc) {
855  if (id < tc) {
856  init = id;
857  limit = id;
858  pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
859  } else {
860  pr->u.p.count = 1; /* means no more chunks to execute */
861  pr->u.p.parm1 = FALSE;
862  break;
863  }
864  } else {
865  T small_chunk = tc / nproc;
866  T extras = tc % nproc;
867  init = id * small_chunk + (id < extras ? id : extras);
868  limit = init + small_chunk - (id < extras ? 0 : 1);
869  pr->u.p.parm1 = (id == nproc - 1);
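  // For illustration (hypothetical values): with tc = 10 and nproc = 4,
  // small_chunk = 2 and extras = 2, so threads 0..3 get iteration offsets
  // [0,2], [3,5], [6,7] and [8,9]; only the last thread sets parm1 (the
  // lastprivate flag).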
870  }
871  } else {
872  if (tc > 0) {
873  init = 0;
874  limit = tc - 1;
875  pr->u.p.parm1 = TRUE;
876  } else { // zero trip count
877  pr->u.p.count = 1; /* means no more chunks to execute */
878  pr->u.p.parm1 = FALSE;
879  break;
880  }
881  }
882 #if USE_ITT_BUILD
883  // Calculate chunk for metadata report
884  if (itt_need_metadata_reporting)
885  cur_chunk = limit - init + 1;
886 #endif
887  if (st == 1) {
888  pr->u.p.lb = lb + init;
889  pr->u.p.ub = lb + limit;
890  } else {
891  // calculated upper bound, "ub" is user-defined upper bound
892  T ub_tmp = lb + limit * st;
893  pr->u.p.lb = lb + init * st;
894  // adjust upper bound to "ub" if needed, so that MS lastprivate will match
895  // it exactly
896  if (st > 0) {
897  pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
898  } else {
899  pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
900  }
901  }
902  if (pr->ordered) {
903  pr->u.p.ordered_lower = init;
904  pr->u.p.ordered_upper = limit;
905  }
906  break;
907  } // case
908  case kmp_sch_static_balanced_chunked: {
909  // similar to balanced, but chunk adjusted to multiple of simd width
910  T nth = th->th.th_team_nproc;
911  KD_TRACE(100, ("__kmp_dispatch_init: T#%d runtime(simd:static)"
912  " -> falling-through to static_greedy\n",
913  gtid));
914  schedule = kmp_sch_static_greedy;
915  if (nth > 1)
916  pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
917  else
918  pr->u.p.parm1 = tc;
919  break;
920  } // case
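  // Note on kmp_sch_static_balanced_chunked above: the mask arithmetic rounds
  // each thread's share up to a multiple of chunk and assumes chunk (the simd
  // width supplied by the compiler) is a power of two. For illustration
  // (hypothetical values): tc = 100, nth = 3, chunk = 8 gives
  // ceil(100 / 3) = 34, rounded up to parm1 = 40.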
921  case kmp_sch_guided_iterative_chunked:
922  case kmp_sch_guided_simd: {
923  T nproc = th->th.th_team_nproc;
924  KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked"
925  " case\n",
926  gtid));
927 
928  if (nproc > 1) {
929  if ((2L * chunk + 1) * nproc >= tc) {
930  /* chunk size too large, switch to dynamic */
931  schedule = kmp_sch_dynamic_chunked;
932  } else {
933  // when remaining iters become less than parm2 - switch to dynamic
934  pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
935  *(double *)&pr->u.p.parm3 =
936  guided_flt_param / nproc; // may occupy parm3 and parm4
937  }
938  } else {
939  KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to "
940  "kmp_sch_static_greedy\n",
941  gtid));
942  schedule = kmp_sch_static_greedy;
943  /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
944  KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",
945  gtid));
946  pr->u.p.parm1 = tc;
947  } // if
948  } // case
949  break;
950  case kmp_sch_guided_analytical_chunked: {
951  T nproc = th->th.th_team_nproc;
952  KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked"
953  " case\n",
954  gtid));
955  if (nproc > 1) {
956  if ((2L * chunk + 1) * nproc >= tc) {
957  /* chunk size too large, switch to dynamic */
958  schedule = kmp_sch_dynamic_chunked;
959  } else {
960  /* commonly used term: (2 nproc - 1)/(2 nproc) */
961  DBL x;
962 
963 #if KMP_OS_WINDOWS && KMP_ARCH_X86
964  /* Linux* OS already has 64-bit computation by default for long double,
965  and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
966  Windows* OS on IA-32 architecture, we need to set precision to 64-bit
967  instead of the default 53-bit. Even though long double doesn't work
968  on Windows* OS on Intel(R) 64, the resulting lack of precision is not
969  expected to impact the correctness of the algorithm, but this has not
970  been mathematically proven. */
971  // save original FPCW and set precision to 64-bit, as
972  // Windows* OS on IA-32 architecture defaults to 53-bit
973  unsigned int oldFpcw = _control87(0, 0);
974  _control87(_PC_64, _MCW_PC); // 0,0x30000
975 #endif
976  /* value used for comparison in solver for cross-over point */
977  long double target = ((long double)chunk * 2 + 1) * nproc / tc;
978 
979  /* crossover point--chunk indexes equal to or greater than
980  this point switch to dynamic-style scheduling */
981  UT cross;
982 
983  /* commonly used term: (2 nproc - 1)/(2 nproc) */
984  x = (long double)1.0 - (long double)0.5 / nproc;
985 
986 #ifdef KMP_DEBUG
987  { // test natural alignment
988  struct _test_a {
989  char a;
990  union {
991  char b;
992  DBL d;
993  };
994  } t;
995  ptrdiff_t natural_alignment =
996  (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
997  //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
998  // long)natural_alignment );
999  KMP_DEBUG_ASSERT(
1000  (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
1001  }
1002 #endif // KMP_DEBUG
1003 
1004  /* save the term in thread private dispatch structure */
1005  *(DBL *)&pr->u.p.parm3 = x;
1006 
1007  /* solve for the crossover point to the nearest integer i for which C_i
1008  <= chunk */
1009  {
1010  UT left, right, mid;
1011  long double p;
1012 
1013  /* estimate initial upper and lower bound */
1014 
1015  /* doesn't matter what value right is as long as it is positive, but
1016  it affects performance of the solver */
1017  right = 229;
1018  p = __kmp_pow<UT>(x, right);
1019  if (p > target) {
1020  do {
1021  p *= p;
1022  right <<= 1;
1023  } while (p > target && right < (1 << 27));
1024  /* lower bound is previous (failed) estimate of upper bound */
1025  left = right >> 1;
1026  } else {
1027  left = 0;
1028  }
1029 
1030  /* bisection root-finding method */
1031  while (left + 1 < right) {
1032  mid = (left + right) / 2;
1033  if (__kmp_pow<UT>(x, mid) > target) {
1034  left = mid;
1035  } else {
1036  right = mid;
1037  }
1038  } // while
1039  cross = right;
1040  }
1041  /* assert sanity of computed crossover point */
1042  KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
1043  __kmp_pow<UT>(x, cross) <= target);
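          // For illustration (hypothetical values): nproc = 4, chunk = 10 and
          // tc = 10000 give x = 0.875 and target = 84 / 10000 = 0.0084; the
          // bisection finds cross = 36, since 0.875^36 <= 0.0084 < 0.875^35.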
1044 
1045  /* save the crossover point in thread private dispatch structure */
1046  pr->u.p.parm2 = cross;
1047 
1048 // C75803
1049 #if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
1050 #define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
1051 #else
1052 #define GUIDED_ANALYTICAL_WORKAROUND (x)
1053 #endif
1054  /* dynamic-style scheduling offset */
1055  pr->u.p.count = tc - __kmp_dispatch_guided_remaining(
1056  tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
1057  cross * chunk;
1058 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1059  // restore FPCW
1060  _control87(oldFpcw, _MCW_PC);
1061 #endif
1062  } // if
1063  } else {
1064  KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to "
1065  "kmp_sch_static_greedy\n",
1066  gtid));
1067  schedule = kmp_sch_static_greedy;
1068  /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
1069  pr->u.p.parm1 = tc;
1070  } // if
1071  } // case
1072  break;
1073  case kmp_sch_static_greedy:
1074  KD_TRACE(100,
1075  ("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n", gtid));
1076  pr->u.p.parm1 = (th->th.th_team_nproc > 1)
1077  ? (tc + th->th.th_team_nproc - 1) / th->th.th_team_nproc
1078  : tc;
1079  break;
1080  case kmp_sch_static_chunked:
1081  case kmp_sch_dynamic_chunked:
1082  if (pr->u.p.parm1 <= 0) {
1083  pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
1084  }
1085  KD_TRACE(100, ("__kmp_dispatch_init: T#%d "
1086  "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
1087  gtid));
1088  break;
1089  case kmp_sch_trapezoidal: {
1090  /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
1091 
1092  T parm1, parm2, parm3, parm4;
1093  KD_TRACE(100,
1094  ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid));
1095 
1096  parm1 = chunk;
1097 
1098  /* F : size of the first cycle */
1099  parm2 = (tc / (2 * th->th.th_team_nproc));
1100 
1101  if (parm2 < 1) {
1102  parm2 = 1;
1103  }
1104 
1105  /* L : size of the last cycle. Make sure the last cycle is not larger
1106  than the first cycle. */
1107  if (parm1 < 1) {
1108  parm1 = 1;
1109  } else if (parm1 > parm2) {
1110  parm1 = parm2;
1111  }
1112 
1113  /* N : number of cycles */
1114  parm3 = (parm2 + parm1);
1115  parm3 = (2 * tc + parm3 - 1) / parm3;
1116 
1117  if (parm3 < 2) {
1118  parm3 = 2;
1119  }
1120 
1121  /* sigma : decreasing incr of the trapezoid */
1122  parm4 = (parm3 - 1);
1123  parm4 = (parm2 - parm1) / parm4;
1124 
1125  // pointless check, because parm4 >= 0 always
1126  // if ( parm4 < 0 ) {
1127  // parm4 = 0;
1128  //}
1129 
1130  pr->u.p.parm1 = parm1;
1131  pr->u.p.parm2 = parm2;
1132  pr->u.p.parm3 = parm3;
1133  pr->u.p.parm4 = parm4;
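  // For illustration (hypothetical values): tc = 1000, nproc = 4, chunk = 10
  // give parm2 = 125 (first chunk), parm1 = 10 (minimum/last chunk),
  // parm3 = 15 chunks in total and parm4 = 8, so successive chunk sizes are
  // 125, 117, 109, ... decreasing by 8 toward parm1.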
1134  } // case
1135  break;
1136 
1137  default: {
1138  __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
1139  KMP_HNT(GetNewerLibrary), // Hint
1140  __kmp_msg_null // Variadic argument list terminator
1141  );
1142  } break;
1143  } // switch
1144  pr->schedule = schedule;
1145  if (active) {
1146  /* This buffer can be used only when sh->buffer_index reaches
1147  * my_buffer_index, i.e. when it is free; wait for that below. */
1148 
1149  KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d "
1150  "sh->buffer_index:%d\n",
1151  gtid, my_buffer_index, sh->buffer_index));
1152  __kmp_wait_yield<kmp_uint32>(&sh->buffer_index, my_buffer_index,
1153  __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
1154  // Note: KMP_WAIT_YIELD() cannot be used there: buffer index and
1155  // my_buffer_index are *always* 32-bit integers.
1156  KMP_MB(); /* is this necessary? */
1157  KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
1158  "sh->buffer_index:%d\n",
1159  gtid, my_buffer_index, sh->buffer_index));
1160 
1161  th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
1162  th->th.th_dispatch->th_dispatch_sh_current =
1163  CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
1164 #if USE_ITT_BUILD
1165  if (pr->ordered) {
1166  __kmp_itt_ordered_init(gtid);
1167  }
1168  // Report loop metadata
1169  if (itt_need_metadata_reporting) {
1170  // Only report metadata by master of active team at level 1
1171  kmp_uint64 schedtype = 0;
1172  switch (schedule) {
1173  case kmp_sch_static_chunked:
1174  case kmp_sch_static_balanced: // Chunk is calculated in the switch above
1175  break;
1176  case kmp_sch_static_greedy:
1177  cur_chunk = pr->u.p.parm1;
1178  break;
1179  case kmp_sch_dynamic_chunked:
1180  schedtype = 1;
1181  break;
1182  case kmp_sch_guided_iterative_chunked:
1183  case kmp_sch_guided_analytical_chunked:
1184  case kmp_sch_guided_simd:
1185  schedtype = 2;
1186  break;
1187  default:
1188  // Should we put this case under "static"?
1189  // case kmp_sch_static_steal:
1190  schedtype = 3;
1191  break;
1192  }
1193  __kmp_itt_metadata_loop(loc, schedtype, tc, cur_chunk);
1194  }
1195 #endif /* USE_ITT_BUILD */
1196  }
1197 
1198 #ifdef KMP_DEBUG
1199  {
1200  char *buff;
1201  // create format specifiers before the debug output
1202  buff = __kmp_str_format(
1203  "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
1204  "lb:%%%s ub:%%%s"
1205  " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
1206  " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
1207  traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
1208  traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
1209  traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
1210  traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
1211  KD_TRACE(10, (buff, gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub,
1212  pr->u.p.st, pr->u.p.tc, pr->u.p.count, pr->u.p.ordered_lower,
1213  pr->u.p.ordered_upper, pr->u.p.parm1, pr->u.p.parm2,
1214  pr->u.p.parm3, pr->u.p.parm4));
1215  __kmp_str_free(&buff);
1216  }
1217 #endif
1218 #if (KMP_STATIC_STEAL_ENABLED)
1219 // It cannot be guaranteed that after execution of a loop with some other
1220 // schedule kind all the parm3 variables will contain the same value. Even if
1221 // all parm3 values were the same, there would still be a bad case, such as
1222 // using 0 and 1 rather than a program-lifetime increment. So a dedicated
1223 // variable is required; 'static_steal_counter' is used.
1224  if (schedule == kmp_sch_static_steal) {
1225  // Other threads will inspect this variable when searching for a victim.
1226  // This is a flag showing that other threads may steal from this thread
1227  // from now on.
1228  volatile T *p = &pr->u.p.static_steal_counter;
1229  *p = *p + 1;
1230  }
1231 #endif // ( KMP_STATIC_STEAL_ENABLED )
1232 
1233 #if OMPT_SUPPORT && OMPT_OPTIONAL
1234  if (ompt_enabled.ompt_callback_work) {
1235  ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1236  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
1237  kmp_info_t *thr = __kmp_threads[gtid];
1238  ompt_callbacks.ompt_callback(ompt_callback_work)(
1239  ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
1240  &(task_info->task_data), tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
1241  }
1242 #endif
1243 }
1244 
1245 /* For ordered loops, either __kmp_dispatch_finish() should be called after
1246  * every iteration, or __kmp_dispatch_finish_chunk() should be called after
1247  * every chunk of iterations. If the ordered section(s) were not executed
1248  * for this iteration (or every iteration in this chunk), we need to set the
1249  * ordered iteration counters so that the next thread can proceed. */
1250 template <typename UT>
1251 static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
1252  typedef typename traits_t<UT>::signed_t ST;
1253  kmp_info_t *th = __kmp_threads[gtid];
1254 
1255  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
1256  if (!th->th.th_team->t.t_serialized) {
1257 
1258  dispatch_private_info_template<UT> *pr =
1259  reinterpret_cast<dispatch_private_info_template<UT> *>(
1260  th->th.th_dispatch->th_dispatch_pr_current);
1261  dispatch_shared_info_template<UT> volatile *sh =
1262  reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1263  th->th.th_dispatch->th_dispatch_sh_current);
1264  KMP_DEBUG_ASSERT(pr);
1265  KMP_DEBUG_ASSERT(sh);
1266  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1267  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1268 
1269  if (pr->ordered_bumped) {
1270  KD_TRACE(
1271  1000,
1272  ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1273  gtid));
1274  pr->ordered_bumped = 0;
1275  } else {
1276  UT lower = pr->u.p.ordered_lower;
1277 
1278 #ifdef KMP_DEBUG
1279  {
1280  char *buff;
1281  // create format specifiers before the debug output
1282  buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
1283  "ordered_iteration:%%%s lower:%%%s\n",
1284  traits_t<UT>::spec, traits_t<UT>::spec);
1285  KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1286  __kmp_str_free(&buff);
1287  }
1288 #endif
1289 
1290  __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
1291  __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1292  KMP_MB(); /* is this necessary? */
1293 #ifdef KMP_DEBUG
1294  {
1295  char *buff;
1296  // create format specifiers before the debug output
1297  buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
1298  "ordered_iteration:%%%s lower:%%%s\n",
1299  traits_t<UT>::spec, traits_t<UT>::spec);
1300  KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1301  __kmp_str_free(&buff);
1302  }
1303 #endif
1304 
1305  test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
1306  } // if
1307  } // if
1308  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
1309 }
1310 
1311 #ifdef KMP_GOMP_COMPAT
1312 
1313 template <typename UT>
1314 static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
1315  typedef typename traits_t<UT>::signed_t ST;
1316  kmp_info_t *th = __kmp_threads[gtid];
1317 
1318  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
1319  if (!th->th.th_team->t.t_serialized) {
1320  // int cid;
1321  dispatch_private_info_template<UT> *pr =
1322  reinterpret_cast<dispatch_private_info_template<UT> *>(
1323  th->th.th_dispatch->th_dispatch_pr_current);
1324  dispatch_shared_info_template<UT> volatile *sh =
1325  reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1326  th->th.th_dispatch->th_dispatch_sh_current);
1327  KMP_DEBUG_ASSERT(pr);
1328  KMP_DEBUG_ASSERT(sh);
1329  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1330  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1331 
1332  // for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1333  UT lower = pr->u.p.ordered_lower;
1334  UT upper = pr->u.p.ordered_upper;
1335  UT inc = upper - lower + 1;
1336 
1337  if (pr->ordered_bumped == inc) {
1338  KD_TRACE(
1339  1000,
1340  ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1341  gtid));
1342  pr->ordered_bumped = 0;
1343  } else {
1344  inc -= pr->ordered_bumped;
1345 
1346 #ifdef KMP_DEBUG
1347  {
1348  char *buff;
1349  // create format specifiers before the debug output
1350  buff = __kmp_str_format(
1351  "__kmp_dispatch_finish_chunk: T#%%d before wait: "
1352  "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1353  traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
1354  KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
1355  __kmp_str_free(&buff);
1356  }
1357 #endif
1358 
1359  __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
1360  __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1361 
1362  KMP_MB(); /* is this necessary? */
1363  KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
1364  "ordered_bumped to zero\n",
1365  gtid));
1366  pr->ordered_bumped = 0;
1367 
1368 #ifdef KMP_DEBUG
1369  {
1370  char *buff;
1371  // create format specifiers before the debug output
1372  buff = __kmp_str_format(
1373  "__kmp_dispatch_finish_chunk: T#%%d after wait: "
1374  "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1375  traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
1376  traits_t<UT>::spec);
1377  KD_TRACE(1000,
1378  (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
1379  __kmp_str_free(&buff);
1380  }
1381 #endif
1382 
1383  test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
1384  }
1385  // }
1386  }
1387  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
1388 }
1389 
1390 #endif /* KMP_GOMP_COMPAT */
1391 
1392 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
1393  work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini()
1394  is not called. */
1395 #if OMPT_SUPPORT && OMPT_OPTIONAL
1396 #define OMPT_LOOP_END \
1397  if (status == 0) { \
1398  if (ompt_enabled.ompt_callback_work) { \
1399  ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \
1400  ompt_task_info_t *task_info = __ompt_get_task_info_object(0); \
1401  ompt_callbacks.ompt_callback(ompt_callback_work)( \
1402  ompt_work_loop, ompt_scope_end, &(team_info->parallel_data), \
1403  &(task_info->task_data), 0, codeptr); \
1404  } \
1405  }
1406 // TODO: implement count
1407 #else
1408 #define OMPT_LOOP_END // no-op
1409 #endif
1410 
1411 template <typename T>
1412 static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
1413  T *p_lb, T *p_ub,
1414  typename traits_t<T>::signed_t *p_st
1415 #if OMPT_SUPPORT && OMPT_OPTIONAL
1416  ,
1417  void *codeptr
1418 #endif
1419  ) {
1420 
1421  typedef typename traits_t<T>::unsigned_t UT;
1422  typedef typename traits_t<T>::signed_t ST;
1423  typedef typename traits_t<T>::floating_t DBL;
1424 
1425  // This is potentially slightly misleading: schedule(runtime) will appear here
1426  // even if the actual runtime schedule is static. (Which points out a
1427  // disadvantage of schedule(runtime): even when static scheduling is used, it
1428  // costs more than a compile-time choice to use static scheduling would.)
1429  KMP_TIME_PARTITIONED_BLOCK(FOR_dynamic_scheduling);
1430 
1431  int status;
1432  dispatch_private_info_template<T> *pr;
1433  kmp_info_t *th = __kmp_threads[gtid];
1434  kmp_team_t *team = th->th.th_team;
1435 
1436  KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
1437 #ifdef KMP_DEBUG
1438  {
1439  char *buff;
1440  // create format specifiers before the debug output
1441  buff = __kmp_str_format("__kmp_dispatch_next: T#%%d called p_lb:%%%s "
1442  "p_ub:%%%s p_st:%%%s p_last: %%p\n",
1443  traits_t<T>::spec, traits_t<T>::spec,
1444  traits_t<ST>::spec);
1445  KD_TRACE(1000, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last));
1446  __kmp_str_free(&buff);
1447  }
1448 #endif
1449 
1450  if (team->t.t_serialized) {
1451  /* NOTE: serialize this dispatch because we are not at the active level */
1452  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1453  th->th.th_dispatch->th_disp_buffer); /* top of the stack */
1454  KMP_DEBUG_ASSERT(pr);
1455 
1456  if ((status = (pr->u.p.tc != 0)) == 0) {
1457  *p_lb = 0;
1458  *p_ub = 0;
1459  // if ( p_last != NULL )
1460  // *p_last = 0;
1461  if (p_st != NULL)
1462  *p_st = 0;
1463  if (__kmp_env_consistency_check) {
1464  if (pr->pushed_ws != ct_none) {
1465  pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1466  }
1467  }
1468  } else if (pr->nomerge) {
1469  kmp_int32 last;
1470  T start;
1471  UT limit, trip, init;
1472  ST incr;
1473  T chunk = pr->u.p.parm1;
1474 
1475  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1476  gtid));
1477 
1478  init = chunk * pr->u.p.count++;
1479  trip = pr->u.p.tc - 1;
1480 
1481  if ((status = (init <= trip)) == 0) {
1482  *p_lb = 0;
1483  *p_ub = 0;
1484  // if ( p_last != NULL )
1485  // *p_last = 0;
1486  if (p_st != NULL)
1487  *p_st = 0;
1488  if (__kmp_env_consistency_check) {
1489  if (pr->pushed_ws != ct_none) {
1490  pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1491  }
1492  }
1493  } else {
1494  start = pr->u.p.lb;
1495  limit = chunk + init - 1;
1496  incr = pr->u.p.st;
1497 
1498  if ((last = (limit >= trip)) != 0) {
1499  limit = trip;
1500 #if KMP_OS_WINDOWS
1501  pr->u.p.last_upper = pr->u.p.ub;
1502 #endif /* KMP_OS_WINDOWS */
1503  }
1504  if (p_last != NULL)
1505  *p_last = last;
1506  if (p_st != NULL)
1507  *p_st = incr;
1508  if (incr == 1) {
1509  *p_lb = start + init;
1510  *p_ub = start + limit;
1511  } else {
1512  *p_lb = start + init * incr;
1513  *p_ub = start + limit * incr;
1514  }
1515 
1516  if (pr->ordered) {
1517  pr->u.p.ordered_lower = init;
1518  pr->u.p.ordered_upper = limit;
1519 #ifdef KMP_DEBUG
1520  {
1521  char *buff;
1522  // create format specifiers before the debug output
1523  buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1524  "ordered_lower:%%%s ordered_upper:%%%s\n",
1525  traits_t<UT>::spec, traits_t<UT>::spec);
1526  KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1527  pr->u.p.ordered_upper));
1528  __kmp_str_free(&buff);
1529  }
1530 #endif
1531  } // if
1532  } // if
1533  } else {
1534  pr->u.p.tc = 0;
1535  *p_lb = pr->u.p.lb;
1536  *p_ub = pr->u.p.ub;
1537 #if KMP_OS_WINDOWS
1538  pr->u.p.last_upper = *p_ub;
1539 #endif /* KMP_OS_WINDOWS */
1540  if (p_last != NULL)
1541  *p_last = TRUE;
1542  if (p_st != NULL)
1543  *p_st = pr->u.p.st;
1544  } // if
1545 #ifdef KMP_DEBUG
1546  {
1547  char *buff;
1548  // create format specifiers before the debug output
1549  buff = __kmp_str_format(
1550  "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
1551  "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n",
1552  traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
1553  KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status));
1554  __kmp_str_free(&buff);
1555  }
1556 #endif
1557 #if INCLUDE_SSC_MARKS
1558  SSC_MARK_DISPATCH_NEXT();
1559 #endif
1560  OMPT_LOOP_END;
1561  return status;
1562  } else {
1563  kmp_int32 last = 0;
1564  dispatch_shared_info_template<UT> *sh;
1565  T start;
1566  ST incr;
1567  UT limit, trip, init;
1568 
1569  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1570  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1571 
1572  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1573  th->th.th_dispatch->th_dispatch_pr_current);
1574  KMP_DEBUG_ASSERT(pr);
1575  sh = reinterpret_cast<dispatch_shared_info_template<UT> *>(
1576  th->th.th_dispatch->th_dispatch_sh_current);
1577  KMP_DEBUG_ASSERT(sh);
1578 
1579  if (pr->u.p.tc == 0) {
1580  // zero trip count
1581  status = 0;
1582  } else {
1583  switch (pr->schedule) {
1584 #if (KMP_STATIC_STEAL_ENABLED)
1585  case kmp_sch_static_steal: {
1586  T chunk = pr->u.p.parm1;
1587  int nproc = th->th.th_team_nproc;
1588 
1589  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n",
1590  gtid));
1591 
1592  trip = pr->u.p.tc - 1;
1593 
1594  if (traits_t<T>::type_size > 4) {
1595  // use lock for 8-byte and CAS for 4-byte induction
1596  // variable. TODO (optional): check and use 16-byte CAS
1597  kmp_lock_t *lck = th->th.th_dispatch->th_steal_lock;
1598  KMP_DEBUG_ASSERT(lck != NULL);
1599  if (pr->u.p.count < (UT)pr->u.p.ub) {
1600  __kmp_acquire_lock(lck, gtid);
1601  // try to get own chunk of iterations
1602  init = (pr->u.p.count)++;
1603  status = (init < (UT)pr->u.p.ub);
1604  __kmp_release_lock(lck, gtid);
1605  } else {
1606  status = 0; // no own chunks
1607  }
1608  if (!status) { // try to steal
1609  kmp_info_t **other_threads = team->t.t_threads;
1610  int while_limit = nproc; // nproc attempts to find a victim
1611  int while_index = 0;
1612  // TODO: algorithm of searching for a victim
1613  // should be cleaned up and measured
1614  while ((!status) && (while_limit != ++while_index)) {
1615  T remaining;
1616  T victimIdx = pr->u.p.parm4;
1617  T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1618  dispatch_private_info_template<T> *victim =
1619  reinterpret_cast<dispatch_private_info_template<T> *>(
1620  other_threads[victimIdx]
1621  ->th.th_dispatch->th_dispatch_pr_current);
1622  while ((victim == NULL || victim == pr ||
1623  (*(volatile T *)&victim->u.p.static_steal_counter !=
1624  *(volatile T *)&pr->u.p.static_steal_counter)) &&
1625  oldVictimIdx != victimIdx) {
1626  victimIdx = (victimIdx + 1) % nproc;
1627  victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1628  other_threads[victimIdx]
1629  ->th.th_dispatch->th_dispatch_pr_current);
1630  }
1631  if (!victim ||
1632  (*(volatile T *)&victim->u.p.static_steal_counter !=
1633  *(volatile T *)&pr->u.p.static_steal_counter)) {
1634  continue; // try once more (nproc attempts in total)
1635  // no victim is ready yet to participate in stealing
1636  // because all victims are still in kmp_init_dispatch
1637  }
1638  if (victim->u.p.count + 2 > (UT)victim->u.p.ub) {
1639  pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
1640  continue; // not enough chunks to steal, goto next victim
1641  }
1642 
1643  lck = other_threads[victimIdx]->th.th_dispatch->th_steal_lock;
1644  KMP_ASSERT(lck != NULL);
1645  __kmp_acquire_lock(lck, gtid);
1646  limit = victim->u.p.ub; // keep initial ub
1647  if (victim->u.p.count >= limit ||
1648  (remaining = limit - victim->u.p.count) < 2) {
1649  __kmp_release_lock(lck, gtid);
1650  pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim
1651  continue; // not enough chunks to steal
1652  }
1653  // stealing succeeded, reduce victim's ub by 1/4 of undone chunks
1654  // or by 1
1655  if (remaining > 3) {
1656  KMP_COUNT_VALUE(FOR_static_steal_stolen, remaining >> 2);
1657  init = (victim->u.p.ub -=
1658  (remaining >> 2)); // steal 1/4 of remaining
1659  } else {
1660  KMP_COUNT_VALUE(FOR_static_steal_stolen, 1);
1661  init =
1662  (victim->u.p.ub -= 1); // steal 1 chunk of 2 or 3 remaining
1663  }
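          // A worked example of the steal amount above (hypothetical values):
          // with limit = 20 and victim->u.p.count = 8, remaining = 12, so the
          // thief lowers victim->u.p.ub by 12 >> 2 = 3 and receives init = 17.
          // The victim keeps chunk indices 8..16; the thief runs chunk 17 now
          // and records chunks 18..19 as its own (count = 18, ub = 20 below).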
1664  __kmp_release_lock(lck, gtid);
1665 
1666  KMP_DEBUG_ASSERT(init + 1 <= limit);
1667  pr->u.p.parm4 = victimIdx; // remember victim to steal from
1668  status = 1;
1669  while_index = 0;
1670  // now update own count and ub with the stolen range, excluding the init chunk
1671  __kmp_acquire_lock(th->th.th_dispatch->th_steal_lock, gtid);
1672  pr->u.p.count = init + 1;
1673  pr->u.p.ub = limit;
1674  __kmp_release_lock(th->th.th_dispatch->th_steal_lock, gtid);
1675  } // while (search for victim)
1676  } // if (try to find victim and steal)
1677  } else {
1678  // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
1679  typedef union {
1680  struct {
1681  UT count;
1682  T ub;
1683  } p;
1684  kmp_int64 b;
1685  } union_i4;
1686  // All operations on 'count' or 'ub' must be combined atomically
1687  // together.
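        // Illustrative note: union_i4 overlays the 32-bit (count, ub) pair on
        // a single kmp_int64 so that claiming a chunk (count++) and stealing
        // (ub -= n) are both done with one 64-bit CAS. For example, if a thief
        // lowers ub between this thread's read of vold and its CAS, the CAS
        // fails and the claim is retried against the new packed value.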
1688  {
1689  union_i4 vold, vnew;
1690  vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1691  vnew = vold;
1692  vnew.p.count++;
1693  while (!KMP_COMPARE_AND_STORE_ACQ64(
1694  (volatile kmp_int64 *)&pr->u.p.count,
1695  *VOLATILE_CAST(kmp_int64 *) & vold.b,
1696  *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1697  KMP_CPU_PAUSE();
1698  vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1699  vnew = vold;
1700  vnew.p.count++;
1701  }
1702  vnew = vold;
1703  init = vnew.p.count;
1704  status = (init < (UT)vnew.p.ub);
1705  }
1706 
1707  if (!status) {
1708  kmp_info_t **other_threads = team->t.t_threads;
1709  int while_limit = nproc; // nproc attempts to find a victim
1710  int while_index = 0;
1711 
1712  // TODO: the victim-search algorithm should be cleaned up
1713  // and measured
1714  while ((!status) && (while_limit != ++while_index)) {
1715  union_i4 vold, vnew;
1716  kmp_int32 remaining;
1717  T victimIdx = pr->u.p.parm4;
1718  T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1719  dispatch_private_info_template<T> *victim =
1720  reinterpret_cast<dispatch_private_info_template<T> *>(
1721  other_threads[victimIdx]
1722  ->th.th_dispatch->th_dispatch_pr_current);
1723  while ((victim == NULL || victim == pr ||
1724  (*(volatile T *)&victim->u.p.static_steal_counter !=
1725  *(volatile T *)&pr->u.p.static_steal_counter)) &&
1726  oldVictimIdx != victimIdx) {
1727  victimIdx = (victimIdx + 1) % nproc;
1728  victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1729  other_threads[victimIdx]
1730  ->th.th_dispatch->th_dispatch_pr_current);
1731  }
1732  if (!victim ||
1733  (*(volatile T *)&victim->u.p.static_steal_counter !=
1734  *(volatile T *)&pr->u.p.static_steal_counter)) {
1735  continue; // try once more (nproc attempts in total)
1736  // no victim is ready yet to participate in stealing
1737  // because all victims are still in kmp_init_dispatch
1738  }
1739  pr->u.p.parm4 = victimIdx; // new victim found
1740  while (1) { // CAS loop if victim has enough chunks to steal
1741  vold.b = *(volatile kmp_int64 *)(&victim->u.p.count);
1742  vnew = vold;
1743 
1744  KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1745  if (vnew.p.count >= (UT)vnew.p.ub ||
1746  (remaining = vnew.p.ub - vnew.p.count) < 2) {
1747  pr->u.p.parm4 =
1748  (victimIdx + 1) % nproc; // shift start victim id
1749  break; // not enough chunks to steal, goto next victim
1750  }
1751  if (remaining > 3) {
1752  vnew.p.ub -= (remaining >> 2); // try to steal 1/4 remaining
1753  } else {
1754  vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining
1755  }
1756  KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1757  // TODO: Should this be acquire or release?
1758  if (KMP_COMPARE_AND_STORE_ACQ64(
1759  (volatile kmp_int64 *)&victim->u.p.count,
1760  *VOLATILE_CAST(kmp_int64 *) & vold.b,
1761  *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1762  // stealing succeeded
1763  KMP_COUNT_VALUE(FOR_static_steal_stolen,
1764  vold.p.ub - vnew.p.ub);
1765  status = 1;
1766  while_index = 0;
1767  // now update own count and ub
1768  init = vnew.p.ub;
1769  vold.p.count = init + 1;
1770 #if KMP_ARCH_X86
1771  KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count),
1772  vold.b);
1773 #else
1774  *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
1775 #endif
1776  break;
1777  } // if (check CAS result)
1778  KMP_CPU_PAUSE(); // CAS failed, repeat attempt
1779  } // while (try to steal from particular victim)
1780  } // while (search for victim)
1781  } // if (try to find victim and steal)
1782  } // if (4-byte induction variable)
1783  if (!status) {
1784  *p_lb = 0;
1785  *p_ub = 0;
1786  if (p_st != NULL)
1787  *p_st = 0;
1788  } else {
1789  start = pr->u.p.parm2;
1790  init *= chunk;
1791  limit = chunk + init - 1;
1792  incr = pr->u.p.st;
1793  KMP_COUNT_VALUE(FOR_static_steal_chunks, 1);
1794 
1795  KMP_DEBUG_ASSERT(init <= trip);
1796  if ((last = (limit >= trip)) != 0)
1797  limit = trip;
1798  if (p_st != NULL)
1799  *p_st = incr;
1800 
1801  if (incr == 1) {
1802  *p_lb = start + init;
1803  *p_ub = start + limit;
1804  } else {
1805  *p_lb = start + init * incr;
1806  *p_ub = start + limit * incr;
1807  }
1808 
1809  if (pr->ordered) {
1810  pr->u.p.ordered_lower = init;
1811  pr->u.p.ordered_upper = limit;
1812 #ifdef KMP_DEBUG
1813  {
1814  char *buff;
1815  // create format specifiers before the debug output
1816  buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1817  "ordered_lower:%%%s ordered_upper:%%%s\n",
1818  traits_t<UT>::spec, traits_t<UT>::spec);
1819  KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1820  pr->u.p.ordered_upper));
1821  __kmp_str_free(&buff);
1822  }
1823 #endif
1824  } // if
1825  } // if
1826  break;
1827  } // case
1828 #endif // ( KMP_STATIC_STEAL_ENABLED )
1829  case kmp_sch_static_balanced: {
1830  KD_TRACE(
1831  100,
1832  ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid));
1833  if ((status = !pr->u.p.count) !=
1834  0) { /* check if thread has any iteration to do */
1835  pr->u.p.count = 1;
1836  *p_lb = pr->u.p.lb;
1837  *p_ub = pr->u.p.ub;
1838  last = pr->u.p.parm1;
1839  if (p_st != NULL)
1840  *p_st = pr->u.p.st;
1841  } else { /* no iterations to do */
1842  pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1843  }
1844  if (pr->ordered) {
1845 #ifdef KMP_DEBUG
1846  {
1847  char *buff;
1848  // create format specifiers before the debug output
1849  buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1850  "ordered_lower:%%%s ordered_upper:%%%s\n",
1851  traits_t<UT>::spec, traits_t<UT>::spec);
1852  KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1853  pr->u.p.ordered_upper));
1854  __kmp_str_free(&buff);
1855  }
1856 #endif
1857  } // if
1858  } // case
1859  break;
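    // Note: the balanced schedule hands each thread at most one precomputed
    // [lb, ub] block; u.p.count doubles as a "done" flag and u.p.parm1 records
    // whether this thread owns the last iteration (copied into 'last' above).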
1860  case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
1861  merged here */
1862  case kmp_sch_static_chunked: {
1863  T parm1;
1864 
1865  KD_TRACE(100, ("__kmp_dispatch_next: T#%d "
1866  "kmp_sch_static_[affinity|chunked] case\n",
1867  gtid));
1868  parm1 = pr->u.p.parm1;
1869 
1870  trip = pr->u.p.tc - 1;
1871  init = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid));
1872 
1873  if ((status = (init <= trip)) != 0) {
1874  start = pr->u.p.lb;
1875  incr = pr->u.p.st;
1876  limit = parm1 + init - 1;
1877 
1878  if ((last = (limit >= trip)) != 0)
1879  limit = trip;
1880 
1881  if (p_st != NULL)
1882  *p_st = incr;
1883 
1884  pr->u.p.count += th->th.th_team_nproc;
1885 
1886  if (incr == 1) {
1887  *p_lb = start + init;
1888  *p_ub = start + limit;
1889  } else {
1890  *p_lb = start + init * incr;
1891  *p_ub = start + limit * incr;
1892  }
1893 
1894  if (pr->ordered) {
1895  pr->u.p.ordered_lower = init;
1896  pr->u.p.ordered_upper = limit;
1897 #ifdef KMP_DEBUG
1898  {
1899  char *buff;
1900  // create format specifiers before the debug output
1901  buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1902  "ordered_lower:%%%s ordered_upper:%%%s\n",
1903  traits_t<UT>::spec, traits_t<UT>::spec);
1904  KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1905  pr->u.p.ordered_upper));
1906  __kmp_str_free(&buff);
1907  }
1908 #endif
1909  } // if
1910  } // if
1911  } // case
1912  break;
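    // A worked example of the chunk indexing above (hypothetical values): with
    // parm1 = 5 (chunk), nproc = 4, tid = 1, lb = 0 and unit stride, the first
    // call computes init = 5 * (0 + 1) = 5, i.e. iterations 5..9; count then
    // advances by nproc, so the next call yields init = 5 * (4 + 1) = 25,
    // i.e. iterations 25..29.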
1913 
1914  case kmp_sch_dynamic_chunked: {
1915  T chunk = pr->u.p.parm1;
1916 
1917  KD_TRACE(
1918  100,
1919  ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid));
1920 
1921  init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1922  trip = pr->u.p.tc - 1;
1923 
1924  if ((status = (init <= trip)) == 0) {
1925  *p_lb = 0;
1926  *p_ub = 0;
1927  if (p_st != NULL)
1928  *p_st = 0;
1929  } else {
1930  start = pr->u.p.lb;
1931  limit = chunk + init - 1;
1932  incr = pr->u.p.st;
1933 
1934  if ((last = (limit >= trip)) != 0)
1935  limit = trip;
1936 
1937  if (p_st != NULL)
1938  *p_st = incr;
1939 
1940  if (incr == 1) {
1941  *p_lb = start + init;
1942  *p_ub = start + limit;
1943  } else {
1944  *p_lb = start + init * incr;
1945  *p_ub = start + limit * incr;
1946  }
1947 
1948  if (pr->ordered) {
1949  pr->u.p.ordered_lower = init;
1950  pr->u.p.ordered_upper = limit;
1951 #ifdef KMP_DEBUG
1952  {
1953  char *buff;
1954  // create format specifiers before the debug output
1955  buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1956  "ordered_lower:%%%s ordered_upper:%%%s\n",
1957  traits_t<UT>::spec, traits_t<UT>::spec);
1958  KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1959  pr->u.p.ordered_upper));
1960  __kmp_str_free(&buff);
1961  }
1962 #endif
1963  } // if
1964  } // if
1965  } // case
1966  break;
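    // A worked example (hypothetical values, lb = 0, unit stride): with
    // chunk = 4 and tc = 10 (trip = 9), successive atomic increments of
    // sh->u.s.iteration return 0, 1, 2, ..., so callers get init = 0, 4, 8;
    // the third range 8..11 is clipped to trip and flagged last, and the next
    // caller sees init = 12 > trip and returns status 0.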
1967 
1968  case kmp_sch_guided_iterative_chunked: {
1969  T chunkspec = pr->u.p.parm1;
1970  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked "
1971  "iterative case\n",
1972  gtid));
1973  trip = pr->u.p.tc;
1974  // Start atomic part of calculations
1975  while (1) {
1976  ST remaining; // signed, because can be < 0
1977  init = sh->u.s.iteration; // shared value
1978  remaining = trip - init;
1979  if (remaining <= 0) { // AC: need to compare with 0 first
1980  // nothing to do, don't try atomic op
1981  status = 0;
1982  break;
1983  }
1984  if ((T)remaining <
1985  pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
1986  // use dynamic-style schedule
1987  // atomically increment iterations, get old value
1988  init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1989  (ST)chunkspec);
1990  remaining = trip - init;
1991  if (remaining <= 0) {
1992  status = 0; // all iterations taken by other threads
1993  } else { // got some iterations to work on
1994  status = 1;
1995  if ((T)remaining > chunkspec) {
1996  limit = init + chunkspec - 1;
1997  } else {
1998  last = 1; // the last chunk
1999  limit = init + remaining - 1;
2000  } // if
2001  } // if
2002  break;
2003  } // if
2004  limit = init + (UT)(remaining *
2005  *(double *)&pr->u.p.parm3); // divide by K*nproc
2006  if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
2007  (ST)init, (ST)limit)) {
2008  // CAS was successful, chunk obtained
2009  status = 1;
2010  --limit;
2011  break;
2012  } // if
2013  } // while
2014  if (status != 0) {
2015  start = pr->u.p.lb;
2016  incr = pr->u.p.st;
2017  if (p_st != NULL)
2018  *p_st = incr;
2019  *p_lb = start + init * incr;
2020  *p_ub = start + limit * incr;
2021  if (pr->ordered) {
2022  pr->u.p.ordered_lower = init;
2023  pr->u.p.ordered_upper = limit;
2024 #ifdef KMP_DEBUG
2025  {
2026  char *buff;
2027  // create format specifiers before the debug output
2028  buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
2029  "ordered_lower:%%%s ordered_upper:%%%s\n",
2030  traits_t<UT>::spec, traits_t<UT>::spec);
2031  KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2032  pr->u.p.ordered_upper));
2033  __kmp_str_free(&buff);
2034  }
2035 #endif
2036  } // if
2037  } else {
2038  *p_lb = 0;
2039  *p_ub = 0;
2040  if (p_st != NULL)
2041  *p_st = 0;
2042  } // if
2043  } // case
2044  break;
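    // Rough numeric sketch (taking the in-code comments at face value, i.e.
    // parm2 ~ K*nproc*(chunk+1) and *(double *)&parm3 ~ 1/(K*nproc), K = 2):
    // with nproc = 4, chunkspec = 1 and remaining = 800, the CAS path claims
    // about 800/8 = 100 iterations in one shot; once remaining drops below
    // parm2 (~16 here), the code falls back to plain dynamic increments of
    // chunkspec.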
2045 
2046  case kmp_sch_guided_simd: {
2047  // same as iterative but curr-chunk adjusted to be multiple of given
2048  // chunk
2049  T chunk = pr->u.p.parm1;
2050  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_simd case\n",
2051  gtid));
2052  trip = pr->u.p.tc;
2053  // Start atomic part of calculations
2054  while (1) {
2055  ST remaining; // signed, because can be < 0
2056  init = sh->u.s.iteration; // shared value
2057  remaining = trip - init;
2058  if (remaining <= 0) { // AC: need to compare with 0 first
2059  status = 0; // nothing to do, don't try atomic op
2060  break;
2061  }
2062  KMP_DEBUG_ASSERT(init % chunk == 0);
2063  // compare with K*nproc*(chunk+1), K=2 by default
2064  if ((T)remaining < pr->u.p.parm2) {
2065  // use dynamic-style schedule
2066  // atomically increment iterations, get old value
2067  init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
2068  (ST)chunk);
2069  remaining = trip - init;
2070  if (remaining <= 0) {
2071  status = 0; // all iterations taken by other threads
2072  } else {
2073  // got some iterations to work on
2074  status = 1;
2075  if ((T)remaining > chunk) {
2076  limit = init + chunk - 1;
2077  } else {
2078  last = 1; // the last chunk
2079  limit = init + remaining - 1;
2080  } // if
2081  } // if
2082  break;
2083  } // if
2084  // divide by K*nproc
2085  UT span = remaining * (*(double *)&pr->u.p.parm3);
2086  UT rem = span % chunk;
2087  if (rem) // adjust so that span%chunk == 0
2088  span += chunk - rem;
2089  limit = init + span;
2090  if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
2091  (ST)init, (ST)limit)) {
2092  // CAS was successful, chunk obtained
2093  status = 1;
2094  --limit;
2095  break;
2096  } // if
2097  } // while
2098  if (status != 0) {
2099  start = pr->u.p.lb;
2100  incr = pr->u.p.st;
2101  if (p_st != NULL)
2102  *p_st = incr;
2103  *p_lb = start + init * incr;
2104  *p_ub = start + limit * incr;
2105  if (pr->ordered) {
2106  pr->u.p.ordered_lower = init;
2107  pr->u.p.ordered_upper = limit;
2108 #ifdef KMP_DEBUG
2109  {
2110  char *buff;
2111  // create format specifiers before the debug output
2112  buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
2113  "ordered_lower:%%%s ordered_upper:%%%s\n",
2114  traits_t<UT>::spec, traits_t<UT>::spec);
2115  KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2116  pr->u.p.ordered_upper));
2117  __kmp_str_free(&buff);
2118  }
2119 #endif
2120  } // if
2121  } else {
2122  *p_lb = 0;
2123  *p_ub = 0;
2124  if (p_st != NULL)
2125  *p_st = 0;
2126  } // if
2127  } // case
2128  break;
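    // Example of the rounding above (hypothetical values): with chunk = 8 and
    // a raw span of 21, rem = 21 % 8 = 5, so span is padded to 24. Because
    // init is asserted to be a multiple of chunk, every claimed block stays
    // chunk-aligned, which is the point of the SIMD variant.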
2129 
2130  case kmp_sch_guided_analytical_chunked: {
2131  T chunkspec = pr->u.p.parm1;
2132  UT chunkIdx;
2133 #if KMP_OS_WINDOWS && KMP_ARCH_X86
2134  /* for storing original FPCW value for Windows* OS on
2135  IA-32 architecture 8-byte version */
2136  unsigned int oldFpcw;
2137  unsigned int fpcwSet = 0;
2138 #endif
2139  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked "
2140  "analytical case\n",
2141  gtid));
2142 
2143  trip = pr->u.p.tc;
2144 
2145  KMP_DEBUG_ASSERT(th->th.th_team_nproc > 1);
2146  KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)th->th.th_team_nproc <
2147  trip);
2148 
2149  while (1) { /* this while loop is a safeguard against unexpected zero
2150  chunk sizes */
2151  chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
2152  if (chunkIdx >= (UT)pr->u.p.parm2) {
2153  --trip;
2154  /* use dynamic-style scheduling */
2155  init = chunkIdx * chunkspec + pr->u.p.count;
2156  /* need to verify init > 0 in case of overflow in the above
2157  * calculation */
2158  if ((status = (init > 0 && init <= trip)) != 0) {
2159  limit = init + chunkspec - 1;
2160 
2161  if ((last = (limit >= trip)) != 0)
2162  limit = trip;
2163  }
2164  break;
2165  } else {
2166 /* use exponential-style scheduling */
2167 /* The following check works around the lack of long double precision on
2168  Windows* OS.
2169  Without it, init may come out non-zero for chunkIdx == 0.
2170  */
2171 #if KMP_OS_WINDOWS && KMP_ARCH_X86
2172  /* If we haven't already done so, save original FPCW and set
2173  precision to 64-bit, as Windows* OS on IA-32 architecture
2174  defaults to 53-bit */
2175  if (!fpcwSet) {
2176  oldFpcw = _control87(0, 0);
2177  _control87(_PC_64, _MCW_PC);
2178  fpcwSet = 0x30000;
2179  }
2180 #endif
2181  if (chunkIdx) {
2182  init = __kmp_dispatch_guided_remaining<T>(
2183  trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
2184  KMP_DEBUG_ASSERT(init);
2185  init = trip - init;
2186  } else
2187  init = 0;
2188  limit = trip - __kmp_dispatch_guided_remaining<T>(
2189  trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
2190  KMP_ASSERT(init <= limit);
2191  if (init < limit) {
2192  KMP_DEBUG_ASSERT(limit <= trip);
2193  --limit;
2194  status = 1;
2195  break;
2196  } // if
2197  } // if
2198  } // while (1)
2199 #if KMP_OS_WINDOWS && KMP_ARCH_X86
2200  /* restore FPCW if necessary
2201  AC: check fpcwSet flag first because oldFpcw can be uninitialized
2202  here */
2203  if (fpcwSet && (oldFpcw & fpcwSet))
2204  _control87(oldFpcw, _MCW_PC);
2205 #endif
2206  if (status != 0) {
2207  start = pr->u.p.lb;
2208  incr = pr->u.p.st;
2209  if (p_st != NULL)
2210  *p_st = incr;
2211  *p_lb = start + init * incr;
2212  *p_ub = start + limit * incr;
2213  if (pr->ordered) {
2214  pr->u.p.ordered_lower = init;
2215  pr->u.p.ordered_upper = limit;
2216 #ifdef KMP_DEBUG
2217  {
2218  char *buff;
2219  // create format specifiers before the debug output
2220  buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
2221  "ordered_lower:%%%s ordered_upper:%%%s\n",
2222  traits_t<UT>::spec, traits_t<UT>::spec);
2223  KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2224  pr->u.p.ordered_upper));
2225  __kmp_str_free(&buff);
2226  }
2227 #endif
2228  }
2229  } else {
2230  *p_lb = 0;
2231  *p_ub = 0;
2232  if (p_st != NULL)
2233  *p_st = 0;
2234  }
2235  } // case
2236  break;
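    // Note: in the exponential branch, __kmp_dispatch_guided_remaining(trip,
    // base, k) is used as "iterations still left after k chunks", so chunk k
    // spans [trip - remaining(k), trip - remaining(k+1) - 1]. The FPCW code
    // above temporarily raises x87 precision to 64 bits on 32-bit Windows* OS
    // so this long-double computation matches other platforms, then restores
    // the caller's control word.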
2237 
2238  case kmp_sch_trapezoidal: {
2239  UT index;
2240  T parm2 = pr->u.p.parm2;
2241  T parm3 = pr->u.p.parm3;
2242  T parm4 = pr->u.p.parm4;
2243  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n",
2244  gtid));
2245 
2246  index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
2247 
2248  init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
2249  trip = pr->u.p.tc - 1;
2250 
2251  if ((status = ((T)index < parm3 && init <= trip)) == 0) {
2252  *p_lb = 0;
2253  *p_ub = 0;
2254  if (p_st != NULL)
2255  *p_st = 0;
2256  } else {
2257  start = pr->u.p.lb;
2258  limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
2259  incr = pr->u.p.st;
2260 
2261  if ((last = (limit >= trip)) != 0)
2262  limit = trip;
2263 
2264  if (p_st != NULL)
2265  *p_st = incr;
2266 
2267  if (incr == 1) {
2268  *p_lb = start + init;
2269  *p_ub = start + limit;
2270  } else {
2271  *p_lb = start + init * incr;
2272  *p_ub = start + limit * incr;
2273  }
2274 
2275  if (pr->ordered) {
2276  pr->u.p.ordered_lower = init;
2277  pr->u.p.ordered_upper = limit;
2278 #ifdef KMP_DEBUG
2279  {
2280  char *buff;
2281  // create format specifiers before the debug output
2282  buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
2283  "ordered_lower:%%%s ordered_upper:%%%s\n",
2284  traits_t<UT>::spec, traits_t<UT>::spec);
2285  KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2286  pr->u.p.ordered_upper));
2287  __kmp_str_free(&buff);
2288  }
2289 #endif
2290  } // if
2291  } // if
2292  } // case
2293  break;
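    // Reading the formulas above, parm2 acts as the initial chunk size, parm4
    // as the per-chunk decrement and parm3 as the chunk count; init is the
    // partial sum of that arithmetic series. Hypothetical example: parm2 = 10,
    // parm4 = 2 gives chunk sizes 10, 8, 6, ...; index = 2 yields
    // init = 2*(20 - 2)/2 = 18 and limit = 3*(20 - 4)/2 - 1 = 23, i.e. the
    // six iterations 18..23 (for lb = 0, unit stride).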
2294  default: {
2295  status = 0; // to avoid complaints on uninitialized variable use
2296  __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
2297  KMP_HNT(GetNewerLibrary), // Hint
2298  __kmp_msg_null // Variadic argument list terminator
2299  );
2300  } break;
2301  } // switch
2302  } // if tc == 0;
2303 
2304  if (status == 0) {
2305  UT num_done;
2306 
2307  num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done);
2308 #ifdef KMP_DEBUG
2309  {
2310  char *buff;
2311  // create format specifiers before the debug output
2312  buff = __kmp_str_format(
2313  "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2314  traits_t<UT>::spec);
2315  KD_TRACE(100, (buff, gtid, sh->u.s.num_done));
2316  __kmp_str_free(&buff);
2317  }
2318 #endif
2319 
2320  if ((ST)num_done == th->th.th_team_nproc - 1) {
2321 #if (KMP_STATIC_STEAL_ENABLED)
2322  if (pr->schedule == kmp_sch_static_steal &&
2323  traits_t<T>::type_size > 4) {
2324  int i;
2325  kmp_info_t **other_threads = team->t.t_threads;
2326  // loop complete, safe to destroy locks used for stealing
2327  for (i = 0; i < th->th.th_team_nproc; ++i) {
2328  kmp_lock_t *lck = other_threads[i]->th.th_dispatch->th_steal_lock;
2329  KMP_ASSERT(lck != NULL);
2330  __kmp_destroy_lock(lck);
2331  __kmp_free(lck);
2332  other_threads[i]->th.th_dispatch->th_steal_lock = NULL;
2333  }
2334  }
2335 #endif
2336  /* NOTE: release this buffer to be reused */
2337 
2338  KMP_MB(); /* Flush all pending memory write invalidates. */
2339 
2340  sh->u.s.num_done = 0;
2341  sh->u.s.iteration = 0;
2342 
2343  /* TODO replace with general release procedure? */
2344  if (pr->ordered) {
2345  sh->u.s.ordered_iteration = 0;
2346  }
2347 
2348  KMP_MB(); /* Flush all pending memory write invalidates. */
2349 
2350  sh->buffer_index += __kmp_dispatch_num_buffers;
2351  KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2352  gtid, sh->buffer_index));
2353 
2354  KMP_MB(); /* Flush all pending memory write invalidates. */
2355 
2356  } // if
2357  if (__kmp_env_consistency_check) {
2358  if (pr->pushed_ws != ct_none) {
2359  pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2360  }
2361  }
2362 
2363  th->th.th_dispatch->th_deo_fcn = NULL;
2364  th->th.th_dispatch->th_dxo_fcn = NULL;
2365  th->th.th_dispatch->th_dispatch_sh_current = NULL;
2366  th->th.th_dispatch->th_dispatch_pr_current = NULL;
2367  } // if (status == 0)
2368 #if KMP_OS_WINDOWS
2369  else if (last) {
2370  pr->u.p.last_upper = pr->u.p.ub;
2371  }
2372 #endif /* KMP_OS_WINDOWS */
2373  if (p_last != NULL && status != 0)
2374  *p_last = last;
2375  } // if
2376 
2377 #ifdef KMP_DEBUG
2378  {
2379  char *buff;
2380  // create format specifiers before the debug output
2381  buff = __kmp_str_format(
2382  "__kmp_dispatch_next: T#%%d normal case: "
2383  "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p returning:%%d\n",
2384  traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2385  KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status));
2386  __kmp_str_free(&buff);
2387  }
2388 #endif
2389 #if INCLUDE_SSC_MARKS
2390  SSC_MARK_DISPATCH_NEXT();
2391 #endif
2392  OMPT_LOOP_END;
2393  return status;
2394 }
2395 
2396 template <typename T>
2397 static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
2398  kmp_int32 *plastiter, T *plower, T *pupper,
2399  typename traits_t<T>::signed_t incr) {
2400  typedef typename traits_t<T>::unsigned_t UT;
2401  typedef typename traits_t<T>::signed_t ST;
2402  kmp_uint32 team_id;
2403  kmp_uint32 nteams;
2404  UT trip_count;
2405  kmp_team_t *team;
2406  kmp_info_t *th;
2407 
2408  KMP_DEBUG_ASSERT(plastiter && plower && pupper);
2409  KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2410 #ifdef KMP_DEBUG
2411  {
2412  char *buff;
2413  // create format specifiers before the debug output
2414  buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
2415  "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2416  traits_t<T>::spec, traits_t<T>::spec,
2417  traits_t<ST>::spec, traits_t<T>::spec);
2418  KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
2419  __kmp_str_free(&buff);
2420  }
2421 #endif
2422 
2423  if (__kmp_env_consistency_check) {
2424  if (incr == 0) {
2425  __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
2426  loc);
2427  }
2428  if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
2429  // The loop is illegal.
2430  // Some zero-trip loops are maintained by the compiler, e.g.:
2431  // for(i=10;i<0;++i) // lower >= upper - run-time check
2432  // for(i=0;i>10;--i) // lower <= upper - run-time check
2433  // for(i=0;i>10;++i) // incr > 0 - compile-time check
2434  // for(i=10;i<0;--i) // incr < 0 - compile-time check
2435  // Compiler does not check the following illegal loops:
2436  // for(i=0;i<10;i+=incr) // where incr<0
2437  // for(i=10;i>0;i-=incr) // where incr<0
2438  __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
2439  }
2440  }
2441  th = __kmp_threads[gtid];
2442  team = th->th.th_team;
2443 #if OMP_40_ENABLED
2444  KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2445  nteams = th->th.th_teams_size.nteams;
2446 #endif
2447  team_id = team->t.t_master_tid;
2448  KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc);
2449 
2450  // compute global trip count
2451  if (incr == 1) {
2452  trip_count = *pupper - *plower + 1;
2453  } else if (incr == -1) {
2454  trip_count = *plower - *pupper + 1;
2455  } else if (incr > 0) {
2456  // upper-lower can exceed the limit of signed type
2457  trip_count = (UT)(*pupper - *plower) / incr + 1;
2458  } else {
2459  trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
2460  }
2461 
2462  if (trip_count <= nteams) {
2463  KMP_DEBUG_ASSERT(
2464  __kmp_static == kmp_sch_static_greedy ||
2465  __kmp_static ==
2466  kmp_sch_static_balanced); // Unknown static scheduling type.
2467  // only some teams get a single iteration, others get nothing
2468  if (team_id < trip_count) {
2469  *pupper = *plower = *plower + team_id * incr;
2470  } else {
2471  *plower = *pupper + incr; // zero-trip loop
2472  }
2473  if (plastiter != NULL)
2474  *plastiter = (team_id == trip_count - 1);
2475  } else {
2476  if (__kmp_static == kmp_sch_static_balanced) {
2477  UT chunk = trip_count / nteams;
2478  UT extras = trip_count % nteams;
2479  *plower +=
2480  incr * (team_id * chunk + (team_id < extras ? team_id : extras));
2481  *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
2482  if (plastiter != NULL)
2483  *plastiter = (team_id == nteams - 1);
2484  } else {
2485  T chunk_inc_count =
2486  (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
2487  T upper = *pupper;
2488  KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
2489  // Unknown static scheduling type.
2490  *plower += team_id * chunk_inc_count;
2491  *pupper = *plower + chunk_inc_count - incr;
2492  // Check/correct bounds if needed
2493  if (incr > 0) {
2494  if (*pupper < *plower)
2495  *pupper = traits_t<T>::max_value;
2496  if (plastiter != NULL)
2497  *plastiter = *plower <= upper && *pupper > upper - incr;
2498  if (*pupper > upper)
2499  *pupper = upper; // tracker C73258
2500  } else {
2501  if (*pupper > *plower)
2502  *pupper = traits_t<T>::min_value;
2503  if (plastiter != NULL)
2504  *plastiter = *plower >= upper && *pupper < upper - incr;
2505  if (*pupper < upper)
2506  *pupper = upper; // tracker C73258
2507  }
2508  }
2509  }
2510 }
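// A worked example of the split above (hypothetical values, __kmp_static ==
// kmp_sch_static_balanced): for trip_count = 10, nteams = 4, incr = 1 and
// *plower = 0, chunk = 2 and extras = 2, so teams 0 and 1 receive 3 iterations
// each (0..2, 3..5) and teams 2 and 3 receive 2 each (6..7, 8..9); only
// team 3 sets *plastiter.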
2511 
2512 //-----------------------------------------------------------------------------
2513 // Dispatch routines
2514 // Transfer call to template< type T >
2515 // __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2516 // T lb, T ub, ST st, ST chunk )
2517 extern "C" {
2518 
2535 void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2536  enum sched_type schedule, kmp_int32 lb,
2537  kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
2538  KMP_DEBUG_ASSERT(__kmp_init_serial);
2539 #if OMPT_SUPPORT && OMPT_OPTIONAL
2540  OMPT_STORE_RETURN_ADDRESS(gtid);
2541 #endif
2542  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2543 }
2547 void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2548  enum sched_type schedule, kmp_uint32 lb,
2549  kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
2550  KMP_DEBUG_ASSERT(__kmp_init_serial);
2551 #if OMPT_SUPPORT && OMPT_OPTIONAL
2552  OMPT_STORE_RETURN_ADDRESS(gtid);
2553 #endif
2554  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2555 }
2556 
2560 void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2561  enum sched_type schedule, kmp_int64 lb,
2562  kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
2563  KMP_DEBUG_ASSERT(__kmp_init_serial);
2564 #if OMPT_SUPPORT && OMPT_OPTIONAL
2565  OMPT_STORE_RETURN_ADDRESS(gtid);
2566 #endif
2567  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2568 }
2569 
2573 void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2574  enum sched_type schedule, kmp_uint64 lb,
2575  kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
2576  KMP_DEBUG_ASSERT(__kmp_init_serial);
2577 #if OMPT_SUPPORT && OMPT_OPTIONAL
2578  OMPT_STORE_RETURN_ADDRESS(gtid);
2579 #endif
2580  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2581 }
2582 
2592 void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2593  enum sched_type schedule, kmp_int32 *p_last,
2594  kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2595  kmp_int32 chunk) {
2596  KMP_DEBUG_ASSERT(__kmp_init_serial);
2597 #if OMPT_SUPPORT && OMPT_OPTIONAL
2598  OMPT_STORE_RETURN_ADDRESS(gtid);
2599 #endif
2600  __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
2601  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2602 }
2603 
2604 void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2605  enum sched_type schedule, kmp_int32 *p_last,
2606  kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2607  kmp_int32 chunk) {
2608  KMP_DEBUG_ASSERT(__kmp_init_serial);
2609 #if OMPT_SUPPORT && OMPT_OPTIONAL
2610  OMPT_STORE_RETURN_ADDRESS(gtid);
2611 #endif
2612  __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
2613  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2614 }
2615 
2616 void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2617  enum sched_type schedule, kmp_int32 *p_last,
2618  kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2619  kmp_int64 chunk) {
2620  KMP_DEBUG_ASSERT(__kmp_init_serial);
2621 #if OMPT_SUPPORT && OMPT_OPTIONAL
2622  OMPT_STORE_RETURN_ADDRESS(gtid);
2623 #endif
2624  __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
2625  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2626 }
2627 
2628 void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2629  enum sched_type schedule, kmp_int32 *p_last,
2630  kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2631  kmp_int64 chunk) {
2632  KMP_DEBUG_ASSERT(__kmp_init_serial);
2633 #if OMPT_SUPPORT && OMPT_OPTIONAL
2634  OMPT_STORE_RETURN_ADDRESS(gtid);
2635 #endif
2636  __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
2637  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2638 }
2639 
2653 int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2654  kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
2655 #if OMPT_SUPPORT && OMPT_OPTIONAL
2656  OMPT_STORE_RETURN_ADDRESS(gtid);
2657 #endif
2658  return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st
2659 #if OMPT_SUPPORT && OMPT_OPTIONAL
2660  ,
2661  OMPT_LOAD_RETURN_ADDRESS(gtid)
2662 #endif
2663  );
2664 }
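/* A minimal sketch (editorial illustration only; the exact sequence emitted by
   a compiler may differ) of how these entry points are typically driven for a
   worksharing loop such as "#pragma omp for schedule(dynamic, 4)" over
   i = 0..99, assuming loc, gtid and body() are available in the caller:

     kmp_int32 lb = 0, ub = 99, st = 1, last = 0;
     __kmpc_dispatch_init_4(loc, gtid, kmp_sch_dynamic_chunked, 0, 99, 1, 4);
     while (__kmpc_dispatch_next_4(loc, gtid, &last, &lb, &ub, &st)) {
       for (kmp_int32 i = lb; i <= ub; i += st)
         body(i); // hypothetical loop body
     }
*/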
2665 
2669 int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2670  kmp_uint32 *p_lb, kmp_uint32 *p_ub,
2671  kmp_int32 *p_st) {
2672 #if OMPT_SUPPORT && OMPT_OPTIONAL
2673  OMPT_STORE_RETURN_ADDRESS(gtid);
2674 #endif
2675  return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st
2676 #if OMPT_SUPPORT && OMPT_OPTIONAL
2677  ,
2678  OMPT_LOAD_RETURN_ADDRESS(gtid)
2679 #endif
2680  );
2681 }
2682 
2686 int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2687  kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
2688 #if OMPT_SUPPORT && OMPT_OPTIONAL
2689  OMPT_STORE_RETURN_ADDRESS(gtid);
2690 #endif
2691  return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st
2692 #if OMPT_SUPPORT && OMPT_OPTIONAL
2693  ,
2694  OMPT_LOAD_RETURN_ADDRESS(gtid)
2695 #endif
2696  );
2697 }
2698 
2702 int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2703  kmp_uint64 *p_lb, kmp_uint64 *p_ub,
2704  kmp_int64 *p_st) {
2705 #if OMPT_SUPPORT && OMPT_OPTIONAL
2706  OMPT_STORE_RETURN_ADDRESS(gtid);
2707 #endif
2708  return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st
2709 #if OMPT_SUPPORT && OMPT_OPTIONAL
2710  ,
2711  OMPT_LOAD_RETURN_ADDRESS(gtid)
2712 #endif
2713  );
2714 }
2715 
2722 void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
2723  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2724 }
2725 
2729 void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
2730  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2731 }
2732 
2736 void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
2737  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2738 }
2739 
2743 void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
2744  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2745 }
2748 //-----------------------------------------------------------------------------
2749 // Non-template routines from kmp_dispatch.cpp used in other sources
2750 
2751 kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
2752  return value == checker;
2753 }
2754 
2755 kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
2756  return value != checker;
2757 }
2758 
2759 kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
2760  return value < checker;
2761 }
2762 
2763 kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
2764  return value >= checker;
2765 }
2766 
2767 kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
2768  return value <= checker;
2769 }
2770 
2771 kmp_uint32
2772 __kmp_wait_yield_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
2773  kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
2774  void *obj // Higher-level synchronization object, or NULL.
2775  ) {
2776  // note: we may not belong to a team at this point
2777  volatile kmp_uint32 *spin = spinner;
2778  kmp_uint32 check = checker;
2779  kmp_uint32 spins;
2780  kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
2781  kmp_uint32 r;
2782 
2783  KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
2784  KMP_INIT_YIELD(spins);
2785  // main wait spin loop
2786  while (!f(r = TCR_4(*spin), check)) {
2787  KMP_FSYNC_SPIN_PREPARE(obj);
2788  /* GEH - remove this since it was accidentally introduced when kmp_wait was
2789  split. It causes problems with infinite recursion because of exit lock */
2790  /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2791  __kmp_abort_thread(); */
2792 
2793  /* if we have waited a bit, or are oversubscribed, yield */
2794  /* pause is in the following code */
2795  KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
2796  KMP_YIELD_SPIN(spins);
2797  }
2798  KMP_FSYNC_SPIN_ACQUIRED(obj);
2799  return r;
2800 }
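// Usage sketch (illustrative): __kmp_wait_yield_4(&flag, 1, __kmp_eq_4, NULL)
// spins, yielding when oversubscribed, until flag becomes 1 and returns the
// value it observed.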
2801 
2802 void __kmp_wait_yield_4_ptr(
2803  void *spinner, kmp_uint32 checker, kmp_uint32 (*pred)(void *, kmp_uint32),
2804  void *obj // Higher-level synchronization object, or NULL.
2805  ) {
2806  // note: we may not belong to a team at this point
2807  void *spin = spinner;
2808  kmp_uint32 check = checker;
2809  kmp_uint32 spins;
2810  kmp_uint32 (*f)(void *, kmp_uint32) = pred;
2811 
2812  KMP_FSYNC_SPIN_INIT(obj, spin);
2813  KMP_INIT_YIELD(spins);
2814  // main wait spin loop
2815  while (!f(spin, check)) {
2816  KMP_FSYNC_SPIN_PREPARE(obj);
2817  /* if we have waited a bit, or are oversubscribed, yield */
2818  /* pause is in the following code */
2819  KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
2820  KMP_YIELD_SPIN(spins);
2821  }
2822  KMP_FSYNC_SPIN_ACQUIRED(obj);
2823 }
2824 
2825 } // extern "C"
2826 
2827 #ifdef KMP_GOMP_COMPAT
2828 
2829 void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2830  enum sched_type schedule, kmp_int32 lb,
2831  kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
2832  int push_ws) {
2833  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
2834  push_ws);
2835 }
2836 
2837 void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2838  enum sched_type schedule, kmp_uint32 lb,
2839  kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
2840  int push_ws) {
2841  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
2842  push_ws);
2843 }
2844 
2845 void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2846  enum sched_type schedule, kmp_int64 lb,
2847  kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
2848  int push_ws) {
2849  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
2850  push_ws);
2851 }
2852 
2853 void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2854  enum sched_type schedule, kmp_uint64 lb,
2855  kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
2856  int push_ws) {
2857  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
2858  push_ws);
2859 }
2860 
2861 void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
2862  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2863 }
2864 
2865 void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
2866  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2867 }
2868 
2869 void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
2870  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2871 }
2872 
2873 void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
2874  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2875 }
2876 
2877 #endif /* KMP_GOMP_COMPAT */
2878 
2879 /* ------------------------------------------------------------------------ */