Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix jl_active_task_stack to return correct values #56477

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/gc-stock.c
Original file line number Diff line number Diff line change
Expand Up @@ -3414,6 +3414,7 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection)
}
jl_gc_debug_print();

ct->ctx.activefp = (char*)jl_get_frame_addr();
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, that one seems simple enough :-)

int8_t old_state = jl_atomic_load_relaxed(&ptls->gc_state);
jl_atomic_store_release(&ptls->gc_state, JL_GC_STATE_WAITING);
// `jl_safepoint_start_gc()` makes sure only one thread can run the GC.
Expand Down
1 change: 0 additions & 1 deletion src/jl_exported_funcs.inc
Original file line number Diff line number Diff line change
Expand Up @@ -449,7 +449,6 @@
XX(jl_tagged_gensym) \
XX(jl_take_buffer) \
XX(jl_task_get_next) \
XX(jl_task_stack_buffer) \
XX(jl_termios_size) \
XX(jl_test_cpu_feature) \
XX(jl_threadid) \
Expand Down
85 changes: 8 additions & 77 deletions src/julia.h
Original file line number Diff line number Diff line change
Expand Up @@ -72,21 +72,20 @@ typedef struct _jl_tls_states_t *jl_ptls_t;
#endif
#include "gc-interface.h"
#include "julia_atomics.h"
#include "julia_threads.h"
#include "julia_assert.h"

// the common fields are hidden before the pointer, but the following macro is
// used to indicate which types below are subtypes of jl_value_t
#define JL_DATA_TYPE
typedef struct _jl_value_t jl_value_t;
#include "julia_threads.h"

#ifdef __cplusplus
extern "C" {
#endif

// core data types ------------------------------------------------------------

// the common fields are hidden before the pointer, but the following macro is
// used to indicate which types below are subtypes of jl_value_t
#define JL_DATA_TYPE

typedef struct _jl_value_t jl_value_t;

struct _jl_taggedvalue_bits {
uintptr_t gc:2;
uintptr_t in_image:1;
Expand Down Expand Up @@ -484,9 +483,6 @@ typedef struct _jl_abi_override_t {
jl_method_instance_t *def;
} jl_abi_override_t;

// all values are callable as Functions
typedef jl_value_t jl_function_t;

typedef struct {
JL_DATA_TYPE
jl_sym_t *name;
Expand Down Expand Up @@ -2263,12 +2259,8 @@ JL_DLLEXPORT void jl_sigatomic_end(void);

// tasks and exceptions -------------------------------------------------------

typedef struct _jl_timing_block_t jl_timing_block_t;
typedef struct _jl_timing_event_t jl_timing_event_t;
typedef struct _jl_excstack_t jl_excstack_t;

// info describing an exception handler
typedef struct _jl_handler_t {
struct _jl_handler_t {
jl_jmp_buf eh_ctx;
jl_gcframe_t *gcstack;
jl_value_t *scope;
Expand All @@ -2278,68 +2270,7 @@ typedef struct _jl_handler_t {
sig_atomic_t defer_signal;
jl_timing_block_t *timing_stack;
size_t world_age;
} jl_handler_t;

#define JL_RNG_SIZE 5 // xoshiro 4 + splitmix 1

typedef struct _jl_task_t {
JL_DATA_TYPE
jl_value_t *next; // invasive linked list for scheduler
jl_value_t *queue; // invasive linked list for scheduler
jl_value_t *tls;
jl_value_t *donenotify;
jl_value_t *result;
jl_value_t *scope;
jl_function_t *start;
_Atomic(uint8_t) _state;
uint8_t sticky; // record whether this Task can be migrated to a new thread
uint16_t priority;
_Atomic(uint8_t) _isexception; // set if `result` is an exception to throw or that we exited with
uint8_t pad0[3];
// === 64 bytes (cache line)
uint64_t rngState[JL_RNG_SIZE];
// flag indicating whether or not to record timing metrics for this task
uint8_t metrics_enabled;
uint8_t pad1[3];
// timestamp this task first entered the run queue
_Atomic(uint64_t) first_enqueued_at;
// timestamp this task was most recently scheduled to run
_Atomic(uint64_t) last_started_running_at;
// time this task has spent running; updated when it yields or finishes.
_Atomic(uint64_t) running_time_ns;
// === 64 bytes (cache line)
// timestamp this task finished (i.e. entered state DONE or FAILED).
_Atomic(uint64_t) finished_at;

// hidden state:

// id of owning thread - does not need to be defined until the task runs
_Atomic(int16_t) tid;
// threadpool id
int8_t threadpoolid;
// Reentrancy bits
// Bit 0: 1 if we are currently running inference/codegen
// Bit 1-2: 0-3 counter of how many times we've reentered inference
// Bit 3: 1 if we are writing the image and inference is illegal
uint8_t reentrant_timing;
// 2 bytes of padding on 32-bit, 6 bytes on 64-bit
// uint16_t padding2_32;
// uint48_t padding2_64;
// saved gc stack top for context switches
jl_gcframe_t *gcstack;
size_t world_age;
// quick lookup for current ptls
jl_ptls_t ptls; // == jl_all_tls_states[tid]
#ifdef USE_TRACY
const char *name;
#endif
// saved exception stack
jl_excstack_t *excstack;
// current exception handler
jl_handler_t *eh;
// saved thread state
jl_ucontext_t ctx; // pointer into stkbuf, if suspended
} jl_task_t;
};

#define JL_TASK_STATE_RUNNABLE 0
#define JL_TASK_STATE_DONE 1
Expand Down
12 changes: 3 additions & 9 deletions src/julia_gcext.h
Original file line number Diff line number Diff line change
Expand Up @@ -135,19 +135,13 @@ JL_DLLEXPORT int jl_gc_conservative_gc_support_enabled(void);
// NOTE: Only valid to call from within a GC context.
JL_DLLEXPORT jl_value_t *jl_gc_internal_obj_base_ptr(void *p) JL_NOTSAFEPOINT;

// Return a non-null pointer to the start of the stack area if the task
// has an associated stack buffer. In that case, *size will also contain
// the size of that stack buffer upon return. Also, if task is a thread's
// current task, that thread's id will be stored in *tid; otherwise,
// *tid will be set to -1.
//
// DEPRECATED: use jl_active_task_stack() instead.
JL_DLLEXPORT void *jl_task_stack_buffer(jl_task_t *task, size_t *size, int *tid);

// Query the active and total stack range for the given task, and set
// *active_start and *active_end respectively *total_start and *total_end
// accordingly. The range for the active part is a best-effort approximation
// and may not be tight.
//
// NOTE: Only valid to call from within a GC context. May return incorrect
// values and segfault otherwise.
JL_DLLEXPORT void jl_active_task_stack(jl_task_t *task,
char **active_start, char **active_end,
char **total_start, char **total_end) JL_NOTSAFEPOINT;
Expand Down
12 changes: 0 additions & 12 deletions src/julia_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -1765,18 +1765,6 @@ JL_DLLEXPORT unsigned jl_special_vector_alignment(size_t nfields, jl_value_t *fi
void register_eh_frames(uint8_t *Addr, size_t Size) JL_NOTSAFEPOINT;
void deregister_eh_frames(uint8_t *Addr, size_t Size) JL_NOTSAFEPOINT;

STATIC_INLINE void *jl_get_frame_addr(void) JL_NOTSAFEPOINT
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Moved to julia_threads.h so it can be called there, OK

{
#ifdef __GNUC__
return __builtin_frame_address(0);
#else
void *dummy = NULL;
// The mask is to suppress the compiler warning about returning
// address of local variable
return (void*)((uintptr_t)&dummy & ~(uintptr_t)15);
#endif
}

// Log `msg` to the current logger by calling CoreLogging.logmsg_shim() on the
// julia side. If any of module, group, id, file or line are NULL, these will
// be passed to the julia side as `nothing`. If `kwargs` is NULL an empty set
Expand Down
86 changes: 86 additions & 0 deletions src/julia_threads.h
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ typedef struct {
_jl_ucontext_t *ctx;
jl_stack_context_t *copy_ctx;
};
void *activefp;
void *stkbuf; // malloc'd memory (either copybuf or stack)
size_t bufsz; // actual sizeof stkbuf
unsigned int copy_stack:31; // sizeof stack for copybuf
Expand Down Expand Up @@ -218,6 +219,76 @@ typedef struct _jl_tls_states_t {
#endif
} jl_tls_states_t;

#define JL_RNG_SIZE 5 // xoshiro 4 + splitmix 1

// all values are callable as Functions
typedef jl_value_t jl_function_t;

typedef struct _jl_timing_block_t jl_timing_block_t;
typedef struct _jl_timing_event_t jl_timing_event_t;
typedef struct _jl_excstack_t jl_excstack_t;

typedef struct _jl_handler_t jl_handler_t;

typedef struct _jl_task_t {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A major chunk of this PR seems to be about moving _jl_task_t from julia.h to here. Is there a functional reason why this is necessary for this PR? Perhaps something got added to _jl_task_t -- but it's difficult to say because everything is in a single commit

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Made PR #57117 to take care of this

JL_DATA_TYPE
jl_value_t *next; // invasive linked list for scheduler
jl_value_t *queue; // invasive linked list for scheduler
jl_value_t *tls;
jl_value_t *donenotify;
jl_value_t *result;
jl_value_t *scope;
jl_function_t *start;
_Atomic(uint8_t) _state;
uint8_t sticky; // record whether this Task can be migrated to a new thread
uint16_t priority;
_Atomic(uint8_t) _isexception; // set if `result` is an exception to throw or that we exited with
uint8_t pad0[3];
// === 64 bytes (cache line)
uint64_t rngState[JL_RNG_SIZE];
// flag indicating whether or not to record timing metrics for this task
uint8_t metrics_enabled;
uint8_t pad1[3];
// timestamp this task first entered the run queue
_Atomic(uint64_t) first_enqueued_at;
// timestamp this task was most recently scheduled to run
_Atomic(uint64_t) last_started_running_at;
// time this task has spent running; updated when it yields or finishes.
_Atomic(uint64_t) running_time_ns;
// === 64 bytes (cache line)
// timestamp this task finished (i.e. entered state DONE or FAILED).
_Atomic(uint64_t) finished_at;

// hidden state:

// id of owning thread - does not need to be defined until the task runs
_Atomic(int16_t) tid;
// threadpool id
int8_t threadpoolid;
// Reentrancy bits
// Bit 0: 1 if we are currently running inference/codegen
// Bit 1-2: 0-3 counter of how many times we've reentered inference
// Bit 3: 1 if we are writing the image and inference is illegal
uint8_t reentrant_timing;
// 2 bytes of padding on 32-bit, 6 bytes on 64-bit
// uint16_t padding2_32;
// uint48_t padding2_64;
// saved gc stack top for context switches
jl_gcframe_t *gcstack;
size_t world_age;
// quick lookup for current ptls
jl_ptls_t ptls; // == jl_all_tls_states[tid]
#ifdef USE_TRACY
const char *name;
#endif
// saved exception stack
jl_excstack_t *excstack;
// current exception handler
jl_handler_t *eh;
// saved thread state
jl_ucontext_t ctx; // pointer into stkbuf, if suspended
} jl_task_t;

JL_DLLEXPORT void *jl_get_ptls_states(void);

// Update codegen version in `ccall.cpp` after changing either `pause` or `wake`
Expand Down Expand Up @@ -247,6 +318,18 @@ JL_DLLEXPORT void (jl_cpu_pause)(void);
JL_DLLEXPORT void (jl_cpu_suspend)(void);
JL_DLLEXPORT void (jl_cpu_wake)(void);

STATIC_INLINE void *jl_get_frame_addr(void) JL_NOTSAFEPOINT
{
#ifdef __GNUC__
return __builtin_frame_address(0);
#else
void *dummy = NULL;
// The mask is to suppress the compiler warning about returning
// address of local variable
return (void*)((uintptr_t)&dummy & ~(uintptr_t)15);
#endif
}

#ifdef __clang_gcanalyzer__
// Note that the sigint safepoint can also trigger GC, albeit less likely
void jl_gc_safepoint_(jl_ptls_t tls);
Expand Down Expand Up @@ -274,6 +357,9 @@ STATIC_INLINE int8_t jl_gc_state_set(jl_ptls_t ptls, int8_t state,
{
assert(old_state != JL_GC_PARALLEL_COLLECTOR_THREAD);
assert(old_state != JL_GC_CONCURRENT_COLLECTOR_THREAD);
if (__builtin_constant_p(state) ? state : // required if !old_state && state, otherwise optional
__builtin_constant_p(old_state) ? !old_state : 1)
jl_atomic_load_relaxed(&ptls->current_task)->ctx.activefp = (char*)jl_get_frame_addr();
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have no idea what the semantics for jl_gc_state_set are, when it is supposed to be called, etc., so I can't judge whether this is sensible or effective.

This seems to store the current frame pointer into the task. Presumably that means this function is being called from the task as it is about to switch out? At least that would make sense.

My personal dream is that the wonderful people adding MMTk and doing other GC restructuring work, like @gbaraldi @d-netto @udesou), add some more comments describing such things as they figure this stuff out for themselves ;-).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure if we should be updating the activefp in jl_gc_state_set?

Makes more sense to me to update this in the segv_handler or jl_set_gc_and_wait.

Also, I believe we should fetch the frame pointer from the context the thread was running with before it got trapped in the signal handler -- note that we run the signal handlers with a different stack.

This is more or less what I started doing in #55003.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure if we should be updating the activefp in jl_gc_state_set? ... Also, I believe we should fetch the frame pointer from the context the thread was running with before it got trapped in the signal handler -- note that we run the signal handlers with a different stack.

It ends up being mandatory, since we don't stop all threads when reading their pgcstack, although that does mean it may be often unreliable / unsafe to conservatively scan the stack of the other threads. The signal handler issue is a good point. We would need to pass through the ucontext_t to jl_set_gc_and_wait, like we do on Apple, to get a stack pointer from there (plus redzone).

Even with those changed though, we know now that in v1.11+ the llvm optimizations have reached a point where we know many cases that will necessarily cause the GC to segfault when doing conservative stack scanning, so there isn't any safe way to do this. I am going to go ahead and close out this PR, though someone could revive it under a build flag if they really care about it, and don't care if using conservative GC makes their code segfault sometimes or are willing to run with -O0.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well, we already do use conservative GC in Julia 1.11 and master... though we do it by catching the "segfaults", though really the only ones we had so far were due to incorrect values for the stack start being reported causing us to read into guard pages.

Though perhaps we'd be crashing more if we used tasks extensively? I don't know enough about what the problem is right now / how "wrong" the currently reported values are? I thought they are just "too big" as in the stack is overestimated? But maybe that's not true anymore?

Copy link
Member Author

@vtjnash vtjnash Jan 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The observation now is that values won't be allocated correctly once they are provably garbage (during optimization), which may lead to the GC trying to access memory that is not valid to access. So you need to either turn off optimizations (-O0) or use a precise stack scan, or just hope to be lucky that it doesn't crash often.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I still don't quite understand what that means. Anyway based on the bits I do understand I hope we won't be affected because we don't scan for general Julia objects, just for those of a specific "foreign" type. Since these are allocated via the C ABI from C code I don't think the LLVM optimizer has any chance of optimizing them away in any way...

jl_atomic_store_release(&ptls->gc_state, state);
if (state == JL_GC_STATE_UNSAFE || old_state == JL_GC_STATE_UNSAFE)
jl_gc_safepoint_(ptls);
Expand Down
29 changes: 18 additions & 11 deletions src/llvm-codegen-shared.h
Original file line number Diff line number Diff line change
Expand Up @@ -237,16 +237,24 @@ static inline void emit_gc_safepoint(llvm::IRBuilder<> &builder, llvm::Type *T_s
emit_signal_fence(builder);
}

static inline llvm::Value *emit_gc_state_set(llvm::IRBuilder<> &builder, llvm::Type *T_size, llvm::Value *ptls, llvm::Value *state, llvm::Value *old_state, bool final)
static inline llvm::Value *emit_gc_state_set(llvm::IRBuilder<> &builder, llvm::Type *T_size, llvm::Value *task, llvm::MDNode *tbaa, llvm::Value *state, llvm::Value *old_state, bool final)
{
using namespace llvm;
Value *ptls = get_current_ptls_from_task(builder, task, tbaa);
Type *T_int8 = state->getType();
unsigned offset = offsetof(jl_tls_states_t, gc_state);
Value *gc_state = builder.CreateConstInBoundsGEP1_32(T_int8, ptls, offset, "gc_state");
if (old_state == nullptr) {
old_state = builder.CreateLoad(T_int8, gc_state);
cast<LoadInst>(old_state)->setOrdering(AtomicOrdering::Monotonic);
}
if (isa<Constant>(state) ? state == builder.getInt8(0) :
isa<Constant>(old_state) ? old_state == builder.getInt8(0) : true) {
unsigned offset = offsetof(jl_task_t, ctx.activefp);
Value *currentfp = builder.CreateConstInBoundsGEP1_32(T_int8, task, offset, "gc_state");
Value *fp = builder.CreateIntrinsic(Intrinsic::frameaddress, {builder.getPtrTy()}, {builder.getInt32(0)});
builder.CreateAlignedStore(fp, currentfp, Align(sizeof(void*)));
}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have no idea how this code is supposed to work, so I can't review it other than on a superficial level: My guess would be that this function is a code-emitting counterpart to jl_gc_state_set, and so a similar modification as there is applied here, to update activefp to contain the current framepointer. The inserted code looks plausibly like it might be doing that

builder.CreateAlignedStore(state, gc_state, Align(sizeof(void*)))->setOrdering(AtomicOrdering::Release);
if (auto *C = dyn_cast<ConstantInt>(old_state))
if (C->isZero())
Expand All @@ -261,39 +269,38 @@ static inline llvm::Value *emit_gc_state_set(llvm::IRBuilder<> &builder, llvm::T
builder.CreateICmpEQ(state, zero8)),
passBB, exitBB);
builder.SetInsertPoint(passBB);
MDNode *tbaa = get_tbaa_const(builder.getContext());
emit_gc_safepoint(builder, T_size, ptls, tbaa, final);
emit_gc_safepoint(builder, T_size, ptls, get_tbaa_const(builder.getContext()), final);
builder.CreateBr(exitBB);
builder.SetInsertPoint(exitBB);
return old_state;
}

static inline llvm::Value *emit_gc_unsafe_enter(llvm::IRBuilder<> &builder, llvm::Type *T_size, llvm::Value *ptls, bool final)
static inline llvm::Value *emit_gc_unsafe_enter(llvm::IRBuilder<> &builder, llvm::Type *T_size, llvm::Value *task, llvm::MDNode *tbaa, bool final)
{
using namespace llvm;
Value *state = builder.getInt8(0);
return emit_gc_state_set(builder, T_size, ptls, state, nullptr, final);
return emit_gc_state_set(builder, T_size, task, tbaa, state, nullptr, final);
}

static inline llvm::Value *emit_gc_unsafe_leave(llvm::IRBuilder<> &builder, llvm::Type *T_size, llvm::Value *ptls, llvm::Value *state, bool final)
static inline llvm::Value *emit_gc_unsafe_leave(llvm::IRBuilder<> &builder, llvm::Type *T_size, llvm::Value *task, llvm::MDNode *tbaa, llvm::Value *state, bool final)
{
using namespace llvm;
Value *old_state = builder.getInt8(JL_GC_STATE_UNSAFE);
return emit_gc_state_set(builder, T_size, ptls, state, old_state, final);
return emit_gc_state_set(builder, T_size, task, tbaa, state, old_state, final);
}

static inline llvm::Value *emit_gc_safe_enter(llvm::IRBuilder<> &builder, llvm::Type *T_size, llvm::Value *ptls, bool final)
static inline llvm::Value *emit_gc_safe_enter(llvm::IRBuilder<> &builder, llvm::Type *T_size, llvm::Value *task, llvm::MDNode *tbaa, bool final)
{
using namespace llvm;
Value *state = builder.getInt8(JL_GC_STATE_SAFE);
return emit_gc_state_set(builder, T_size, ptls, state, nullptr, final);
return emit_gc_state_set(builder, T_size, task, tbaa, state, nullptr, final);
}

static inline llvm::Value *emit_gc_safe_leave(llvm::IRBuilder<> &builder, llvm::Type *T_size, llvm::Value *ptls, llvm::Value *state, bool final)
static inline llvm::Value *emit_gc_safe_leave(llvm::IRBuilder<> &builder, llvm::Type *T_size, llvm::Value *task, llvm::MDNode *tbaa, llvm::Value *state, bool final)
{
using namespace llvm;
Value *old_state = builder.getInt8(JL_GC_STATE_SAFE);
return emit_gc_state_set(builder, T_size, ptls, state, old_state, final);
return emit_gc_state_set(builder, T_size, task, tbaa, state, old_state, final);
}

// Compatibility shims for LLVM attribute APIs that were renamed in LLVM 14.
Expand Down
4 changes: 2 additions & 2 deletions src/llvm-ptls.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ void LowerPTLS::fix_pgcstack_use(CallInst *pgcstack, Function *pgcstack_getter,
builder.SetInsertPoint(fastTerm->getParent());
fastTerm->removeFromParent();
MDNode *tbaa = tbaa_gcframe;
Value *prior = emit_gc_unsafe_enter(builder, T_size, get_current_ptls_from_task(builder, get_current_task_from_pgcstack(builder, pgcstack), tbaa), true);
Value *prior = emit_gc_unsafe_enter(builder, T_size, get_current_task_from_pgcstack(builder, pgcstack), tbaa, true);
builder.Insert(fastTerm);
phi->addIncoming(pgcstack, fastTerm->getParent());
// emit pre-return cleanup
Expand All @@ -197,7 +197,7 @@ void LowerPTLS::fix_pgcstack_use(CallInst *pgcstack, Function *pgcstack_getter,
for (auto &BB : *pgcstack->getParent()->getParent()) {
if (isa<ReturnInst>(BB.getTerminator())) {
builder.SetInsertPoint(BB.getTerminator());
emit_gc_unsafe_leave(builder, T_size, get_current_ptls_from_task(builder, get_current_task_from_pgcstack(builder, phi), tbaa), last_gc_state, true);
emit_gc_unsafe_leave(builder, T_size, get_current_task_from_pgcstack(builder, phi), tbaa, last_gc_state, true);
}
}
}
Expand Down
Loading
Loading