/* -----------------------------------------------------------------------------
 *
 * (c) The University of Glasgow 2004
 *
 * Application-related bits.
 *
 * This file is written in a subset of C--, extended with various
 * features specific to GHC.  It is compiled by GHC directly.  For the
 * syntax of .cmm files, see the parser in ghc/compiler/GHC/Cmm/Parser.y.
 *
 * -------------------------------------------------------------------------- */

#include "Cmm.h"

/* ----------------------------------------------------------------------------
 * Evaluate a closure and return it.
 *
 * There isn't an info table / return address version of stg_ap_0, because
 * everything being returned is guaranteed evaluated, so it would be a no-op.
 */

STRING(stg_ap_0_ret_str,"stg_ap_0_ret... ")

stg_ap_0_fast ( P_ fun )
{
    IF_DEBUG(apply,
        ccall debugBelch(stg_ap_0_ret_str);
        ccall printClosure(R1 "ptr"));

    IF_DEBUG(sanity,
        ccall checkStackFrame(Sp "ptr"));

#if !defined(PROFILING)

    ENTER(fun);

#else

/* Note [Evaluating functions with profiling]

   If we evaluate something like

        let f = {-# SCC "f" #-} g

   where g is a function, then updating the thunk for f to point to g
   would be incorrect: we've lost the SCC annotation.  In general, when
   we evaluate a function and the current CCS is different from the one
   stored in the function, we need to return a function with the
   correct CCS in it.

   The mechanism we use to wrap the function is to create a
   zero-argument PAP as a proxy object to hold the new CCS, and return
   that.

   If the closure we evaluated is itself a PAP, we cannot make a nested
   PAP, so we copy the original PAP and set the CCS in the new PAP to
   enterFunCCS(pap->header.prof.ccs).
*/

again:
    W_ info;
    P_ untaggedfun;
    W_ arity;
    // We must obey the correct heap object observation pattern in
    // Note [Heap memory barriers] in SMP.h.
    untaggedfun = UNTAG(fun);
    info = %INFO_PTR(untaggedfun);

    switch [INVALID_OBJECT .. N_CLOSURE_TYPES]
        (TO_W_( %INFO_TYPE(%STD_INFO(info)) )) {
    case IND, IND_STATIC:
    {
        fun = StgInd_indirectee(fun);
        goto again;
    }
    case BCO:
    {
        arity = TO_W_(StgBCO_arity(untaggedfun));
        goto dofun;
    }
    case FUN, FUN_1_0, FUN_0_1, FUN_2_0, FUN_1_1, FUN_0_2, FUN_STATIC:
    {
        arity = TO_W_(StgFunInfoExtra_arity(%FUN_INFO(info)));
    dofun:
        if (CCCS == StgHeader_ccs(untaggedfun)) {
            return (fun);
        } else {
            // We're going to build a new PAP, with zero extra
            // arguments and therefore the same arity as the original
            // function.  In other words, we're using a zero-argument
            // PAP as an indirection to the function, so that we can
            // attach a different CCS to it.
            HP_CHK_GEN(SIZEOF_StgPAP);
            TICK_ALLOC_PAP(SIZEOF_StgPAP, 0);
            // attribute this allocation to the "overhead of profiling"
            CCS_ALLOC(BYTES_TO_WDS(SIZEOF_StgPAP), CCS_OVERHEAD);
            P_ pap;
            pap = Hp - SIZEOF_StgPAP + WDS(1);
            SET_HDR(pap, stg_PAP_info, CCCS);
            StgPAP_arity(pap) = arity;
            if (arity <= TAG_MASK) {
                // TODO: Shouldn't this already be tagged? If not why did we
                // untag it at the beginning of this function?
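                // (Re-)tag the function pointer before storing it in the
                // PAP: for functions whose arity fits in the pointer tag
                // bits, the convention is that the tag holds the arity, so
                // adding 'arity' to the untagged pointer restores the
                // expected tag whether or not 'fun' arrived here tagged.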
                fun = untaggedfun + arity;
            }
            StgPAP_fun(pap) = fun;
            StgPAP_n_args(pap) = 0;
            return (pap);
        }
    }
    case PAP:
    {
        if (CCCS == StgHeader_ccs(untaggedfun)) {
            return (fun);
        } else {
            // We're going to copy this PAP, and put the new CCS in it
            W_ size;
            size = SIZEOF_StgPAP + WDS(TO_W_(StgPAP_n_args(untaggedfun)));
            HP_CHK_GEN(size);
            TICK_ALLOC_PAP(size, 0);
            // attribute this allocation to the "overhead of profiling"
            CCS_ALLOC(BYTES_TO_WDS(SIZEOF_StgPAP), CCS_OVERHEAD);
            P_ pap;
            pap = Hp - size + WDS(1);
            // We'll lose the original PAP, so we should enter its CCS
            ccall enterFunCCS(BaseReg "ptr", StgHeader_ccs(untaggedfun) "ptr");
            SET_HDR(pap, stg_PAP_info, CCCS);
            StgPAP_arity(pap) = StgPAP_arity(untaggedfun);
            StgPAP_n_args(pap) = StgPAP_n_args(untaggedfun);
            StgPAP_fun(pap) = StgPAP_fun(fun);
            W_ i;
            i = TO_W_(StgPAP_n_args(untaggedfun));
        loop:
            if (i == 0) {
                return (pap);
            }
            i = i - 1;
            StgPAP_payload(pap,i) = StgPAP_payload(fun,i);
            goto loop;
        }
    }
    case AP, AP_STACK, BLACKHOLE, WHITEHOLE, THUNK, THUNK_1_0, THUNK_0_1,
         THUNK_2_0, THUNK_1_1, THUNK_0_2, THUNK_STATIC, THUNK_SELECTOR:
    {
        // We have a thunk of some kind, so evaluate it.
        // The thunk might evaluate to a function, so we have to
        // come back here again to adjust its CCS if necessary.
        // Therefore we need to push a stack frame to look at the
        // function that gets returned (a stg_restore_cccs_eval
        // frame), and therefore we need a stack check.
        STK_CHK_GEN();

        // We can't use the value of 'info' any more, because if
        // STK_CHK_GEN() did a GC then the closure we're looking at
        // may have changed, e.g. a THUNK_SELECTOR may have been
        // evaluated by the GC.  So we reload the info pointer now.
        untaggedfun = UNTAG(fun);
        info = %INFO_PTR(untaggedfun);

        jump %ENTRY_CODE(info) (stg_restore_cccs_eval_info, CCCS) (untaggedfun);
    }
    default:
    {
        jump %ENTRY_CODE(info) (UNTAG(fun));
    }
    }
#endif
}

/* -----------------------------------------------------------------------------
   Entry Code for a PAP.

   This entry code is *only* called by one of the stg_ap functions.
   On entry: Sp points to the remaining arguments on the stack.  If
   the stack check fails, we can just push the PAP on the stack and
   return to the scheduler.

   On entry: R1 points to the PAP.  The rest of the function's
   arguments (apart from those that are already in the PAP) are on the
   stack, starting at Sp(0).  R2 contains an info table which
   describes these arguments, which is used in the event that the
   stack check in the entry code below fails.  The info table is
   currently one of the stg_ap_*_ret family, as this code is always
   entered from those functions.

   The idea is to copy the chunk of stack from the PAP object onto the
   stack / into registers, and enter the function.
   -------------------------------------------------------------------------- */

INFO_TABLE(stg_PAP,/*special layout*/0,0,PAP,"PAP","PAP")
{  ccall barf("PAP object (%p) entered!", R1) never returns; }

stg_PAP_apply /* no args => explicit stack */
{
  W_ Words;
  W_ pap;

  pap = R1;

  Words = TO_W_(StgPAP_n_args(pap));

  //
  // Check for stack overflow and bump the stack pointer.
  // We have a hand-rolled stack check fragment here, because none of
  // the canned ones suit this situation.
  //
  if (Sp - (WDS(Words) + 2/* see ARG_BCO below */) < SpLim) {
      // There is a return address in R2 in the event of a stack
      // check failure.  The various stg_ap_* functions arrange this
      // before calling stg_PAP_apply.
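      // (That return address is one of the stg_ap_*_info tables, which
      //  describes the pending arguments above it; pushing it here keeps
      //  the stack walkable by the GC while we are away in the scheduler.)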
      Sp_adj(-1);
      Sp(0) = R2;
      jump stg_gc_unpt_r1 [R1];
  }

  Sp_adj(-Words);

  // profiling
  TICK_ENT_PAP();
  LDV_ENTER(pap);
#if defined(PROFILING)
  ccall enterFunCCS(BaseReg "ptr", StgHeader_ccs(pap) "ptr");
#endif

  // Reload the stack
  W_ i;
  W_ p;
  p = pap + SIZEOF_StgHeader + OFFSET_StgPAP_payload;
  i = 0;
for:
  if (i < Words) {
      Sp(i) = W_[p];
      p = p + WDS(1);
      i = i + 1;
      goto for;
  }

  R1 = StgPAP_fun(pap);

/* DEBUGGING CODE, ensures that arity 1 and 2 functions are entered tagged
  if (TO_W_(StgFunInfoExtra_arity(%FUN_INFO(%INFO_PTR(UNTAG(R1))))) == 1 ) {
      if (GETTAG(R1)!=1) {
          W_[0]=1;
      }
  }

  if (TO_W_(StgFunInfoExtra_arity(%FUN_INFO(%INFO_PTR(UNTAG(R1))))) == 2 ) {
      if (GETTAG(R1)!=2) {
          W_[0]=1;
      }
  }
*/

  // Off we go!
  TICK_ENT_VIA_NODE();

#if defined(NO_ARG_REGS)
  jump %GET_ENTRY(UNTAG(R1)) [R1];
#else
      W_ info;
      info = %GET_FUN_INFO(UNTAG(R1));
      W_ type;
      type = TO_W_(StgFunInfoExtra_fun_type(info));
      if (type == ARG_GEN) {
          jump StgFunInfoExtra_slow_apply(info) [R1];
      }
      if (type == ARG_GEN_BIG) {
          jump StgFunInfoExtra_slow_apply(info) [R1];
      }
      if (type == ARG_BCO) {
          Sp_adj(-2);
          Sp(1) = R1;
          Sp(0) = stg_apply_interp_info;
          jump stg_yield_to_interpreter [];
      }
      jump W_[stg_ap_stack_entries + WDS(TO_W_(StgFunInfoExtra_fun_type(info)))] [R1];
#endif
}

/* -----------------------------------------------------------------------------
   Entry Code for an AP (a PAP with arity zero).

   The entry code is very similar to a PAP, except there are no further
   arguments on the stack to worry about, so the stack check is simpler.
   We must also push an update frame on the stack before applying the
   function.
   -------------------------------------------------------------------------- */

INFO_TABLE(stg_AP,/*special layout*/0,0,AP,"AP","AP")
   /* no args => explicit stack */
{
  W_ Words;
  W_ ap;

  ap = R1;

  Words = TO_W_(StgAP_n_args(ap));

  /*
   * Check for stack overflow.  IMPORTANT: use a _ENTER check here,
   * because if the check fails, we might end up blackholing this very
   * closure, in which case we must enter the blackhole on return rather
   * than continuing to evaluate the now-defunct closure.
   */
  STK_CHK_ENTER(WDS(Words) + SIZEOF_StgUpdateFrame + 2/* see ARG_BCO below */, R1);

  PUSH_UPD_FRAME(Sp - SIZEOF_StgUpdateFrame, R1);
  Sp = Sp - SIZEOF_StgUpdateFrame - WDS(Words);

  TICK_ENT_AP();
  LDV_ENTER(ap);
  ENTER_CCS_THUNK(ap);

  // Reload the stack
  W_ i;
  W_ p;
  p = ap + SIZEOF_StgHeader + OFFSET_StgAP_payload;
  i = 0;
for:
  if (i < Words) {
      Sp(i) = W_[p];
      p = p + WDS(1);
      i = i + 1;
      goto for;
  }

  R1 = StgAP_fun(ap);

  // Off we go!
  TICK_ENT_VIA_NODE();

#if defined(NO_ARG_REGS)
  jump %GET_ENTRY(UNTAG(R1)) [R1];
#else
      W_ info;
      info = %GET_FUN_INFO(UNTAG(R1));
      W_ type;
      type = TO_W_(StgFunInfoExtra_fun_type(info));
      if (type == ARG_GEN) {
          jump StgFunInfoExtra_slow_apply(info) [R1];
      }
      if (type == ARG_GEN_BIG) {
          jump StgFunInfoExtra_slow_apply(info) [R1];
      }
      if (type == ARG_BCO) {
          Sp_adj(-2);
          Sp(1) = R1;
          Sp(0) = stg_apply_interp_info;
          jump stg_yield_to_interpreter [];
      }
      jump W_[stg_ap_stack_entries + WDS(TO_W_(StgFunInfoExtra_fun_type(info)))] [R1];
#endif
}

/* AP_NOUPD is exactly like AP, except that no update frame is pushed.
   Use for thunks that are guaranteed to be entered once only, such as
   those generated by the byte-code compiler for inserting breakpoints. */

INFO_TABLE(stg_AP_NOUPD,/*special layout*/0,0,AP,"AP_NOUPD","AP_NOUPD")
   /* no args => explicit stack */
{
  W_ Words;
  W_ ap;

  ap = R1;

  Words = TO_W_(StgAP_n_args(ap));

  /*
   * Check for stack overflow.
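   * The check covers the arguments we are about to copy out of the AP
   * plus two words of headroom for the ARG_BCO case below (no update
   * frame is pushed here).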
   * IMPORTANT: use a _ENTER check here,
   * because if the check fails, we might end up blackholing this very
   * closure, in which case we must enter the blackhole on return rather
   * than continuing to evaluate the now-defunct closure.
   */
  STK_CHK_ENTER(WDS(Words) + 2/* see ARG_BCO below */, R1);

  Sp = Sp - WDS(Words);

  TICK_ENT_AP();
  LDV_ENTER(ap);
  ENTER_CCS_THUNK(ap);

  // Reload the stack
  W_ i;
  W_ p;
  p = ap + SIZEOF_StgHeader + OFFSET_StgAP_payload;
  i = 0;
for:
  if (i < Words) {
      Sp(i) = W_[p];
      p = p + WDS(1);
      i = i + 1;
      goto for;
  }

  R1 = StgAP_fun(ap);

  // Off we go!
  TICK_ENT_VIA_NODE();

#if defined(NO_ARG_REGS)
  jump %GET_ENTRY(UNTAG(R1)) [R1];
#else
      W_ info;
      info = %GET_FUN_INFO(UNTAG(R1));
      W_ type;
      type = TO_W_(StgFunInfoExtra_fun_type(info));
      if (type == ARG_GEN) {
          jump StgFunInfoExtra_slow_apply(info) [R1];
      }
      if (type == ARG_GEN_BIG) {
          jump StgFunInfoExtra_slow_apply(info) [R1];
      }
      if (type == ARG_BCO) {
          Sp_adj(-2);
          Sp(1) = R1;
          Sp(0) = stg_apply_interp_info;
          jump stg_yield_to_interpreter [];
      }
      jump W_[stg_ap_stack_entries + WDS(TO_W_(StgFunInfoExtra_fun_type(info)))] [R1];
#endif
}

/* -----------------------------------------------------------------------------
   Entry Code for an AP_STACK.

   Very similar to a PAP and AP.  The layout is the same as PAP and AP,
   except that the payload is a chunk of stack instead of being
   described by the function's info table.  Like an AP, there are no
   further arguments on the stack to worry about.  However, the
   function closure (ap->fun) does not necessarily point directly to a
   function, so we have to enter it using stg_ap_0.
   -------------------------------------------------------------------------- */

/* Note [AP_STACKs must be eagerly blackholed]
   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   #13615 describes a nasty concurrency issue where we can enter into the
   middle of an ST action multiple times, resulting in duplication of
   effects.  In short, the construction of an AP_STACK allows us to suspend
   a computation which should not be duplicated.  When running with lazy
   blackholing, we can then enter this AP_STACK multiple times, duplicating
   the computation with potentially disastrous consequences.

   For instance, consider the case of a simple ST program which computes a
   sum using in-place mutation,

     inplaceSum :: Num a => [a] -> a
     inplaceSum xs0 = runST $ do
       y <- newSTRef 0
       let go []       = readSTRef y
           go (x : xs) = do
             modifySTRef y (+x)
             go xs
       go xs0

   Of course, it is fine if we enter an inplaceSum thunk more than once: the
   two threads will inhabit different worlds with different STRefs.  However,
   if we suspend some part of inplaceSum (for instance, due to the heap check
   at the beginning of go) and then multiple threads resume that suspension
   (as is safe in pure computation) we will have multiple threads
   concurrently mutating the same STRef.  Disaster!

   Let's consider how this might happen.  Consider this situation,

      TSO 1's stack:
        return to 'go'
        UPDATE_FRAME  --updatee-->  THUNK A
        ...
        UPDATE_FRAME  --updatee-->  THUNK B
        etc.

      TSO 2's stack:
        return to 'fun'
        UPDATE_FRAME  --updatee-->  THUNK B
        etc.

   Here we have two threads (TSO 1 and TSO 2) which are currently pausing
   (e.g. in threadPaused).
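   (threadPaused() is run when a thread is about to return to the scheduler,
   e.g. at the end of its time slice or when it blocks; among other things it
   performs lazy blackholing and the duplicate-computation check discussed
   below.)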
   Since they are pausing, their stacks are headed by a pointer to the
   continuation code which we will run on resumption (go and fun,
   respectively).

   We also see that there are two thunks on the heap: THUNK A and THUNK B,
   where THUNK B depends upon THUNK A (as in, evaluation of B will force A).
   We see that thread 1 has THUNK A under evaluation, and both threads have
   THUNK B under evaluation.

   As each thread enters threadPaused, threadPaused will walk its stack
   looking for duplicate computation (see Note [suspend duplicate work],
   although there is some background described below as well).  Let's
   consider what this check does:

   Say that TSO 2 begins this check first.  The check will walk TSO 2's
   stack, until it finds the first update frame, which updates THUNK B.
   Upon finding this frame, it will try to lock THUNK B, replacing it with a
   BLACKHOLE owned by its TSO.  We now have,

      TSO 1's stack:
        return to 'go'
        UPDATE_FRAME  --updatee-->  THUNK A
        ...
        UPDATE_FRAME  --updatee-->  BLACKHOLE B
        etc.

      TSO 2's stack:
        return to 'fun'
        UPDATE_FRAME  --updatee-->  BLACKHOLE B
        etc.

   where BLACKHOLE B is the former THUNK B, now owned by TSO 2.

   Now consider what happens when TSO 1 runs its duplicate-computation check.
   Again, we start walking the stack from the top, where we find the update
   frame updating THUNK A.  We will lock this thunk, replacing it with a
   BLACKHOLE owned by its TSO.  We now have,

      TSO 1's stack:
        return to 'go'
        UPDATE_FRAME  --updatee-->  BLACKHOLE A
        ...
        UPDATE_FRAME  --updatee-->  BLACKHOLE B
        etc.

      TSO 2's stack:
        return to 'fun'
        UPDATE_FRAME  --updatee-->  BLACKHOLE B
        etc.

   where BLACKHOLE A is the former THUNK A, owned by TSO 1.

   Now we will continue walking down TSO 1's stack, next coming across the
   second update frame, pointing to the now-BLACKHOLE'd THUNK B.  At this
   point threadPaused will correctly conclude that TSO 1 is duplicating a
   computation being carried out by TSO 2 and attempt to suspend it.

   The suspension process proceeds by invoking raiseAsync, which walks the
   stack from the top looking for update frames.  For each update frame we
   take any stack frames preceding it and construct an AP_STACK heap object
   from them.  We then replace the updatee of the frame with an indirection
   pointing to the AP_STACK.  So, after suspending the first update frame we
   have,

      TSO 1's stack:
        UPDATE_FRAME  --updatee-->  BLACKHOLE A
        ...
        UPDATE_FRAME  --updatee-->  BLACKHOLE B
        etc.

      TSO 2's stack:
        return to 'fun'
        UPDATE_FRAME  --updatee-->  BLACKHOLE B
        etc.

      heap:
        AP_STACK     captures the frames that preceded the update frame
                     (in particular the return to 'go')
        BLACKHOLE A  owner = TSO 1, indirectee = the AP_STACK
        BLACKHOLE B  owner = TSO 2
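   (The AP_STACK now holds the suspended stack chunk -- the frames belonging
   to the evaluation of THUNK A, with 'go' as the return address -- and
   BLACKHOLE A is an indirection to it, so any closure that still references
   the former THUNK A can reach, and potentially enter, the AP_STACK.)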
   Finally, we will replace the second update frame with a blackhole so that
   TSO 1 will block on TSO 2's computation of THUNK B,

      TSO 1's stack:
        UPDATE_FRAME  --updatee-->  BLACKHOLE A
        ...
        (reference to BLACKHOLE B, replacing the second update frame, so
         that TSO 1 enters and blocks on it when resumed)
        etc.

      TSO 2's stack:
        return to 'fun'
        UPDATE_FRAME  --updatee-->  BLACKHOLE B
        etc.

      heap:
        AP_STACK     captures the suspended frames (returning to 'go')
        BLACKHOLE A  owner = TSO 1, indirectee = the AP_STACK
        BLACKHOLE B  owner = TSO 2

   At first glance there's still nothing terribly alarming here.  However,
   consider what would happen if some other closure held a reference to
   THUNK A.  We would now have leaked an AP_STACK capturing the state of a
   potentially non-duplicatable computation to the heap.  Even worse, if two
   threads had references to THUNK A and both attempted to enter it at the
   same time, they would both succeed if we allowed AP_STACKs to be lazily
   blackholed.  This is the reason why we must be very careful when entering
   AP_STACKs: they introduce the possibility that we duplicate a computation
   which could never otherwise be duplicated.

   For this reason we employ an atomic blackholing strategy when entering
   AP_STACK closures.
 */

INFO_TABLE(stg_AP_STACK,/*special layout*/0,0,AP_STACK,"AP_STACK","AP_STACK")
  /* no args => explicit stack */
{
  W_ Words;
  W_ ap;

  ap = R1;

  Words = StgAP_STACK_size(ap);

  /*
   * Check for stack overflow.  IMPORTANT: use a _ENTER check here,
   * because if the check fails, we might end up blackholing this very
   * closure, in which case we must enter the blackhole on return rather
   * than continuing to evaluate the now-defunct closure.
   */
  STK_CHK_ENTER(WDS(Words) + SIZEOF_StgUpdateFrame + WDS(AP_STACK_SPLIM), R1);

  /*
   * It is imperative that we blackhole here, lest we duplicate computation
   * which must not be duplicated.  See Note [AP_STACKs must be eagerly
   * blackholed].
   */
  W_ old_info;
  (old_info) = prim %cmpxchgW(ap, stg_AP_STACK_info, stg_WHITEHOLE_info);
  if (old_info != stg_AP_STACK_info) {
      /* someone else beat us to it */
      jump ENTRY_LBL(stg_WHITEHOLE) (ap);
  }

  // Can't add StgInd_indirectee(ap) to UpdRemSet here because the old value
  // is not reachable.
  StgInd_indirectee(ap) = CurrentTSO;
  prim_write_barrier;
  SET_INFO(ap, __stg_EAGER_BLACKHOLE_info);

  /* ensure there is at least AP_STACK_SPLIM words of headroom available
   * after unpacking the AP_STACK. See bug #1466 */
  PUSH_UPD_FRAME(Sp - SIZEOF_StgUpdateFrame, R1);
  Sp = Sp - SIZEOF_StgUpdateFrame - WDS(Words);

  TICK_ENT_AP();
  LDV_ENTER(ap);
  ENTER_CCS_THUNK(ap);

  // Reload the stack
  W_ i;
  W_ p;
  p = ap + SIZEOF_StgHeader + OFFSET_StgAP_STACK_payload;
  i = 0;
for:
  if (i < Words) {
      Sp(i) = W_[p];
      p = p + WDS(1);
      i = i + 1;
      goto for;
  }

  // Off we go!
  TICK_ENT_VIA_NODE();

  R1 = StgAP_STACK_fun(ap);

  // Because of eager blackholing the closure no longer has correct size so
  // threadPaused() can't correctly zero the slop, so we do it here.  See
  // #15571 and Note [zeroing slop when overwriting closures].
  OVERWRITING_CLOSURE_SIZE(ap, BYTES_TO_WDS(SIZEOF_StgThunkHeader) + 2 + Words);

  ENTER_R1();
}

/* -----------------------------------------------------------------------------
   AP_STACK_NOUPD - exactly like AP_STACK, but doesn't push an update frame.
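   Note that, unlike stg_AP_STACK above, this code does no eager blackholing:
   there is no update frame, and an AP_STACK_NOUPD is presumably only ever
   entered once, so the concern of Note [AP_STACKs must be eagerly
   blackholed] does not arise here.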
   -------------------------------------------------------------------------- */

INFO_TABLE(stg_AP_STACK_NOUPD,/*special layout*/0,0,AP_STACK,
                                        "AP_STACK_NOUPD","AP_STACK_NOUPD")
   /* no args => explicit stack */
{
  W_ Words;
  W_ ap;

  ap = R1;

  Words = StgAP_STACK_size(ap);

  /*
   * Check for stack overflow.  IMPORTANT: use a _ENTER check here,
   * because if the check fails, we might end up blackholing this very
   * closure, in which case we must enter the blackhole on return rather
   * than continuing to evaluate the now-defunct closure.
   */
  STK_CHK_ENTER(WDS(Words) + WDS(AP_STACK_SPLIM), R1);
  /* ensure there is at least AP_STACK_SPLIM words of headroom available
   * after unpacking the AP_STACK. See bug #1466 */

  Sp = Sp - WDS(Words);

  TICK_ENT_AP();
  LDV_ENTER(ap);
  ENTER_CCS_THUNK(ap);

  // Reload the stack
  W_ i;
  W_ p;
  p = ap + SIZEOF_StgHeader + OFFSET_StgAP_STACK_payload;
  i = 0;
for:
  if (i < Words) {
      Sp(i) = W_[p];
      p = p + WDS(1);
      i = i + 1;
      goto for;
  }

  // Off we go!
  TICK_ENT_VIA_NODE();

  R1 = StgAP_STACK_fun(ap);

  ENTER_R1();
}
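
/* -----------------------------------------------------------------------------
   Addendum to Note [AP_STACKs must be eagerly blackholed]: the sketch below
   (which is not part of the RTS, and is not a reliable reproduction of
   #13615) shows the general shape of program that the eager blackholing in
   stg_AP_STACK protects: a single thunk wrapping a runST computation,
   shared by two threads.  It reuses the Note's inplaceSum; the names are
   illustrative only.  Compile with -threaded and run with +RTS -N2.

     import Control.Concurrent
     import Control.Exception (evaluate)
     import Control.Monad.ST
     import Data.STRef

     inplaceSum :: Num a => [a] -> a
     inplaceSum xs0 = runST $ do
       y <- newSTRef 0
       let go []       = readSTRef y
           go (x : xs) = do
             modifySTRef y (+x)
             go xs
       go xs0

     main :: IO ()
     main = do
       let s = inplaceSum [1 .. 100000 :: Int]   -- a single shared thunk
       v <- newEmptyMVar
       _ <- forkIO (evaluate s >>= putMVar v)    -- both threads force 's'
       a <- evaluate s
       b <- takeMVar v
       print (a == b)

   If the suspended middle of inplaceSum's 'go' loop could be resumed by
   both threads (as would be possible with lazily blackholed AP_STACKs), the
   two forces of 's' could mutate the same STRef concurrently.
   -------------------------------------------------------------------------- */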