diff --git a/runtime/src/kmp_barrier.cpp b/runtime/src/kmp_barrier.cpp
index 8fd4e8cfa..07d051faf 100644
--- a/runtime/src/kmp_barrier.cpp
+++ b/runtime/src/kmp_barrier.cpp
@@ -1726,7 +1726,10 @@ void __kmp_join_barrier(int gtid) {
     if (!KMP_MASTER_TID(ds_tid))
       this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr);
 #endif
-    this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier_implicit;
+    // vi3-ompt_state_wait_barrier_implicit_parallel:
+    // old
+    // this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier_implicit;
+    this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier_implicit_parallel;
   }
 #endif
@@ -1983,8 +1986,12 @@ void __kmp_fork_barrier(int gtid, int tid) {
   }

 #if OMPT_SUPPORT
+  // vi3-ompt_state_wait_barrier_implicit_parallel
+  // old
+  // if (ompt_enabled.enabled &&
+  //     this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
   if (ompt_enabled.enabled &&
-      this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
+      this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit_parallel) {
     int ds_tid = this_thr->th.th_info.ds.ds_tid;
     ompt_data_t *task_data = (team)
                                  ? OMPT_CUR_TASK_DATA(this_thr)
diff --git a/runtime/src/kmp_runtime.cpp b/runtime/src/kmp_runtime.cpp
index e66f37529..f99c3e844 100644
--- a/runtime/src/kmp_runtime.cpp
+++ b/runtime/src/kmp_runtime.cpp
@@ -967,8 +967,23 @@ static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
   KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
   KMP_MB();

+  // The ds_tid is set again inside __kmp_initialize_info.
+  // The ds_tid was set to zero, and the team of the innermost region was
+  // formed and set as the current team of the thread. However, the
+  // corresponding implicit task hasn't been set as th_current_task yet.
+  // At this moment, th_current_task still points to the innermost task being
+  // executed inside the outer region.
+  // Since __ompt_get_task_info_internal uses th_current_task, the previous
+  // "early" update of the ds_tid loses the information about the thread_num
+  // in th_current_task->team.
+  // The thread would need to iterate over the th_current_task->team->t_threads
+  // array in order to find the right thread_num.
+  // This can be avoided if ds_tid is updated after the implicit task
+  // (of the newly formed innermost region) is pushed to the thread
+  // (set as the new th_current_task).
+  // FIXME VI3: check if this is safe to do for hot teams!
   /* first, let's setup the master thread */
-  master_th->th.th_info.ds.ds_tid = 0;
+  // master_th->th.th_info.ds.ds_tid = 0;
   master_th->th.th_team = team;
   master_th->th.th_team_nproc = team->t.t_nproc;
   master_th->th.th_team_master = master_th;
@@ -1052,6 +1067,9 @@ static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
 #endif
   }

+  // Update ds_tid after updating the th_current_task.
+  master_th->th.th_info.ds.ds_tid = 0;
+
   if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
     for (i = 0; i < team->t.t_nproc; i++) {
       kmp_info_t *thr = team->t.t_threads[i];
@@ -2350,6 +2368,12 @@ void __kmp_join_call(ident_t *loc, int gtid
 #if OMPT_SUPPORT
   ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
   void *codeptr = team->t.ompt_team_info.master_return_address;
+
+  // FIXME vi3: Store the old value of parallel_data before freeing the team.
+  // Since ompt_callback_parallel_end is called after the team has been freed,
+  // it is possible that the parallel_data pointer points to the parallel_data
+  // of a new (recycled) team.
+  ompt_data_t old_parallel_data = *parallel_data;
 #endif

 #if USE_ITT_BUILD
@@ -2421,8 +2445,12 @@ void __kmp_join_call(ident_t *loc, int gtid

 #if OMPT_SUPPORT
   if (ompt_enabled.enabled) {
-    __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, fork_context,
+    // __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, fork_context,
+    //                 codeptr);
+    // FIXME vi3: I think this is not necessary, since this call happens
+    // before the team is freed.
+    __kmp_join_ompt(gtid, master_th, parent_team, &old_parallel_data, fork_context,
                     codeptr);
+  }
 #endif
@@ -2535,8 +2563,29 @@ void __kmp_join_call(ident_t *loc, int gtid

 #if OMPT_SUPPORT
   if (ompt_enabled.enabled) {
-    __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, fork_context,
+    // __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, fork_context,
+    //                 codeptr);
+    // FIXME vi3: parallel_data is a pointer to an ompt_data_t structure
+    // stored inside the team descriptor (data structure).
+    // A data race may happen when passing parallel_data to
+    // ompt_callback_parallel_end after the team has been freed.
+    // Consider the following example:
+    // 1. join has been called for parallel region A.
+    //    parallel_data points to a field in A's descriptor.
+    // 2. fork has been called for region B.
+    // 3. A's team and its descriptor are freed. parallel_data now points to
+    //    a field of a descriptor sitting on the freelist.
+    // 4. A new team descriptor is allocated for region B by reusing the
+    //    previously freed descriptor. parallel_data now points to a field
+    //    stored in B's descriptor.
+    // 5. ompt_callback_parallel_end is invoked with parallel_data.
+    //    The callback thinks that region B is ending.
+    // To avoid this data race, follow these steps:
+    // 1. Before freeing A's team, read the field that parallel_data points
+    //    to and save its content to the local variable old_parallel_data.
+    // 2. Pass the address of old_parallel_data to ompt_callback_parallel_end.
+    __kmp_join_ompt(gtid, master_th, parent_team, &old_parallel_data, fork_context,
                     codeptr);
+  }
 #endif
@@ -4014,8 +4063,15 @@ static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
   KMP_MB();

   TCW_SYNC_PTR(this_thr->th.th_team, team);
-
-  this_thr->th.th_info.ds.ds_tid = tid;
+  // FIXME VI3: Hasn't this already been done inside __kmp_fork_team_threads
+  // for the thread with tid == 0?
+  // The problem that may appear for the master thread of the corresponding
+  // region (which is soon to be fully formed) if the ds_tid is updated before
+  // th_current_task is the following:
+  // - ds_tid is set to zero
+  // - th_current_task corresponds to the outer region, so the ds_tid doesn't
+  //   match the thread_num in that team.
+  // this_thr->th.th_info.ds.ds_tid = tid;
   this_thr->th.th_set_nproc = 0;
   if (__kmp_tasking_mode != tskm_immediate_exec)
     // When tasking is possible, threads are not safe to reap until they are
@@ -4043,6 +4099,9 @@ static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
   __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
                            team, tid, TRUE);

+  // Update ds_tid now.
+  this_thr->th.th_info.ds.ds_tid = tid;
+
   KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
                 tid, gtid, this_thr, this_thr->th.th_current_task));
   // TODO: Initialize ICVs from parent; GEH - isn't that already done in
@@ -5413,8 +5472,17 @@ void __kmp_free_team(kmp_root_t *root,
     }
   }

+  // FIXME vi3: This team is still active.
+  // If someone asks for parallel_data at level 0 (while this function is
+  // still executing), the parallel_data stored in this team's descriptor
+  // will be returned.
+  // On the other hand, if someone tries to access the parent's parallel_data
+  // (while this function is still executing), NULL will be returned, because
+  // there is no way to reach the parent's descriptor (the team's connection
+  // to its parent team has been severed by invalidating the t_parent
+  // pointer).
+  // Reset the pointer to the parent team only for non-hot teams.
-  team->t.t_parent = NULL;
+  // team->t.t_parent = NULL;
   team->t.t_level = 0;
   team->t.t_active_level = 0;
@@ -7260,8 +7328,12 @@ void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
   __kmp_join_barrier(gtid); /* wait for everyone */

 #if OMPT_SUPPORT
+  // vi3-ompt_state_wait_barrier_implicit_parallel
+  // old
+  // if (ompt_enabled.enabled &&
+  //     this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
   if (ompt_enabled.enabled &&
-      this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
+      this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit_parallel) {
     int ds_tid = this_thr->th.th_info.ds.ds_tid;
     ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
diff --git a/runtime/src/kmp_tasking.cpp b/runtime/src/kmp_tasking.cpp
index 1fd68f003..8d956fa33 100644
--- a/runtime/src/kmp_tasking.cpp
+++ b/runtime/src/kmp_tasking.cpp
@@ -512,6 +512,14 @@ static void __kmp_task_start(kmp_int32 gtid, kmp_task_t *task,
   }
 #endif /* BUILD_TIED_TASK_STACK */

+#if OMPT_SUPPORT
+  // FIXME: Set the scheduling parent first.
+  // Otherwise, if someone calls ompt_get_task_info(1) immediately after
+  // taskdata is set as th_current_task (below), the function will use
+  // th_current_task->ompt_task_info.scheduling_parent, which may be stale
+  // if taskdata has been recycled.
+  taskdata->ompt_task_info.scheduling_parent = current_task;
+#endif
   // mark starting task as executing and as current task
   thread->th.th_current_task = taskdata;
@@ -568,7 +576,12 @@ static inline void __ompt_task_start(kmp_task_t *task,
         &(current_task->ompt_task_info.task_data), status,
         &(taskdata->ompt_task_info.task_data));
   }
-  taskdata->ompt_task_info.scheduling_parent = current_task;
+  // FIXME vi3: If taskdata is set as thr->th.th_current_task before
+  // current_task is set as taskdata's scheduling_parent, and someone calls
+  // ompt_get_task_info(1) in between, the function will use taskdata's
+  // potentially stale scheduling_parent.
+  // This happens if taskdata has been recycled.
+  // taskdata->ompt_task_info.scheduling_parent = current_task;
 }

 // __ompt_task_finish:
@@ -1818,6 +1831,8 @@ template <bool ompt>
 static kmp_int32 __kmpc_omp_taskwait_template(ident_t *loc_ref, kmp_int32 gtid,
                                               void *frame_address,
                                               void *return_address) {
+  // TODO: consider removing the return_address parameter
+  return_address = NULL;
   kmp_taskdata_t *taskdata;
   kmp_info_t *thread;
   int thread_finished = FALSE;
@@ -1840,12 +1855,18 @@ static kmp_int32 __kmpc_omp_taskwait_template(ident_t *loc_ref, kmp_int32 gtid,
       taskdata->ompt_task_info.frame.enter_frame.ptr = frame_address;

       if (ompt_enabled.ompt_callback_sync_region) {
+        if (!return_address)
+          return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
+
         ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
             ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data,
             my_task_data, return_address);
       }

       if (ompt_enabled.ompt_callback_sync_region_wait) {
+        if (!return_address)
+          return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
+
         ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
             ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data,
             my_task_data, return_address);
@@ -1935,7 +1956,7 @@ kmp_int32 __kmpc_omp_taskwait(ident_t *loc_ref, kmp_int32 gtid) {
   if (UNLIKELY(ompt_enabled.enabled)) {
     OMPT_STORE_RETURN_ADDRESS(gtid);
     return __kmpc_omp_taskwait_ompt(loc_ref, gtid, OMPT_GET_FRAME_ADDRESS(0),
-                                    OMPT_LOAD_RETURN_ADDRESS(gtid));
+                                    NULL);
   }
 #endif
   return __kmpc_omp_taskwait_template(loc_ref, gtid, NULL, NULL);
diff --git a/runtime/src/kmp_wait_release.h b/runtime/src/kmp_wait_release.h
index df4a9de8e..0f3c1f481 100644
--- a/runtime/src/kmp_wait_release.h
+++ b/runtime/src/kmp_wait_release.h
@@ -123,7 +123,10 @@ static void __ompt_implicit_task_end(kmp_info_t *this_thr,
                                      ompt_state_t ompt_state,
                                      ompt_data_t *tId) {
   int ds_tid = this_thr->th.th_info.ds.ds_tid;
-  if (ompt_state == ompt_state_wait_barrier_implicit) {
+  // vi3-ompt_state_wait_barrier_implicit_parallel
+  // old
+  // if (ompt_state == ompt_state_wait_barrier_implicit) {
+  if (ompt_state == ompt_state_wait_barrier_implicit_parallel) {
     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
 #if OMPT_OPTIONAL
     void *codeptr = NULL;
@@ -267,7 +270,10 @@ final_spin=FALSE)
     ompt_data_t *tId;
     if (ompt_enabled.enabled) {
       ompt_entry_state = this_thr->th.ompt_thread_info.state;
-      if (!final_spin || ompt_entry_state != ompt_state_wait_barrier_implicit ||
+      // vi3-ompt_state_wait_barrier_implicit_parallel
+      // old
+      // if (!final_spin || ompt_entry_state != ompt_state_wait_barrier_implicit ||
+      if (!final_spin || ompt_entry_state != ompt_state_wait_barrier_implicit_parallel ||
           KMP_MASTER_TID(this_thr->th.th_info.ds.ds_tid)) {
         ompt_lw_taskteam_t *team =
             this_thr->th.th_team->t.ompt_serialized_team_info;
diff --git a/runtime/src/ompt-specific.cpp b/runtime/src/ompt-specific.cpp
index e3fa9166c..3d01f8da9 100644
--- a/runtime/src/ompt-specific.cpp
+++ b/runtime/src/ompt-specific.cpp
@@ -358,16 +358,29 @@ int __ompt_get_task_info_internal(int ancestor_level, int *type,
     kmp_taskdata_t *taskdata = thr->th.th_current_task;
     if (taskdata == NULL)
       return 0;
-    kmp_team *team = thr->th.th_team, *prev_team = NULL;
+    kmp_team *team = taskdata->td_team, *prev_team = NULL;
+    if (team == NULL)
+      return 0;
     ompt_lw_taskteam_t *lwt = NULL,
                        *next_lwt = LWT_FROM_TEAM(taskdata->td_team),
                        *prev_lwt = NULL;
+    // NOTE: It is possible that taskdata->td_team and thr->th.th_team don't
+    // match in one of the following cases:
+    // - the team has been formed, but execution of the implicit task hasn't
+    //   started yet, so taskdata still points to the
+    //   outer task instead of the implicit task of the newly formed team
+    // - the implicit task of the innermost region has finished and has been
+    //   revoked as th_current_task, but the team is still the active team.
+    // Since the tool is interested in information about the tasks, shouldn't
+    // we just use the information stored inside the taskdata structures?
+
     while (ancestor_level > 0) {
       // needed for thread_num
-      prev_team = team;
+      // FIXME Test prev_lwt?
       prev_lwt = lwt;
       // next lightweight team (if any)
       if (lwt)
@@ -387,6 +400,9 @@ int __ompt_get_task_info_internal(int ancestor_level, int *type,
         taskdata = taskdata->td_parent;
         if (team == NULL)
           return 0;
+        // Store the current team as the previous team only when an implicit
+        // task is encountered.
+        prev_team = team;
         team = team->t.t_parent;
         if (taskdata) {
           next_lwt = LWT_FROM_TEAM(taskdata->td_team);
@@ -415,23 +431,76 @@ int __ompt_get_task_info_internal(int ancestor_level, int *type,
         }
       }
     }
+    // If the info doesn't exist, then there is no task at the specified
+    // ancestor level. Return 0 and don't try to fill in any of the passed
+    // arguments. Do the same if the team doesn't exist at this level.
+    if (!info || !team) {
+      return 0;
+    }
+
+    // FIXME VI3: Even if it is determined that the info exists, I don't think
+    // it is guaranteed that it is still valid and not reclaimed memory.
+    // Consider the case when the thread is waiting on the last implicit
+    // barrier.
     if (task_data) {
-      *task_data = info ? &info->task_data : NULL;
+      *task_data = &info->task_data;
     }
     if (task_frame) {
       // OpenMP spec asks for the scheduling task to be returned.
-      *task_frame = info ? &info->frame : NULL;
+      *task_frame = &info->frame;
     }
     if (parallel_data) {
       *parallel_data = team_info ? &(team_info->parallel_data) : NULL;
     }
     if (thread_num) {
-      if (level == 0)
-        *thread_num = __kmp_get_tid();
+      if (level == 0 || !prev_team) {
+        // prev_team == NULL if the task at ancestor_level is an implicit
+        // task of the innermost region or an explicit task that belongs to
+        // that region.
+        int tnum = __kmp_get_tid();
+        // NOTE: It is possible that the master of the outer region is in the
+        // middle of creating/destroying the inner region.
+        // Even though the thread has finished updating/invalidating
+        // th_current_task (the implicit task that corresponds to the
+        // innermost region), the ds_tid may not be updated yet. Since it
+        // remains zero for both the inner and the outer region, it is safe
+        // to return zero as the thread_num.
+        // However, this is not the case for workers of the outer regions.
+        // Handle this carefully.
+        if (team->t.t_threads[tnum] != thr) {
+          // The information stored inside th.th_info.ds.ds_tid doesn't match
+          // the thread_num inside th_current_task->team.
+          // Either the thread changed the ds_tid before invalidating
+          // th_current_task, or it set the newly formed implicit task as
+          // th_current_task but hasn't updated the ds_tid to zero yet.
+          // The team variable corresponds to the just finished/created
+          // implicit task; ds_tid matches the thread_num inside
+          // team->t.t_parent, and 0 is the thread_num of the thread inside
+          // team.
+          kmp_team_t *parent_team = team->t.t_parent;
+          assert(parent_team && parent_team->t.t_threads[tnum] == thr);
+          tnum = 0;
+        }
+        // store thread_num
+        *thread_num = tnum;
+        assert(team->t.t_threads[*thread_num] == thr);
+        // FIXME VI3 ASSERT THIS.
+      } else if (prev_lwt)
         *thread_num = 0;
-      else
-        *thread_num = prev_team->t.t_master_tid;
+      else {
+        // FIXME VI3 ASSERT THIS.
+        // Need to be careful in this case.
+        // It is possible that the thread is not part of the team, but of
+        // one of the nested teams instead.
+        // Consider the case when a worker of the region at level 2 calls
+        // this function with ancestor_level 1.
+        // If the thread is part of the team, then it is the master of
+        // prev_team, so use prev_team->t.t_master_tid.
+        // Otherwise, I think some special value should be returned as the
+        // thread_num. This case is not clarified in the OMPT 5.0
+        // specification.
+        int prev_team_master_id = prev_team->t.t_master_tid;
+        *thread_num = (team->t.t_threads[prev_team_master_id] == thr)
+                          ? prev_team_master_id
+                          : -1;
+      }
       // *thread_num = team->t.t_master_tid;
     }
     return info ? 2 : 0;
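
Reviewer sketch for the old_parallel_data copy in __kmp_join_call above: a minimal, self-contained illustration of the descriptor-recycling hazard the copy guards against. team_desc, alloc_team, and pool_slot are hypothetical stand-ins for the kmp_team_t freelist machinery; this is not runtime code.

// Build with: c++ -std=c++11 recycle_sketch.cpp
#include <cassert>
#include <cstdint>

struct ompt_data { uint64_t value; };
struct team_desc { ompt_data parallel_data; };

static team_desc pool_slot; // stands in for a single freelist slot

// "Allocation" always reuses the same slot, mimicking descriptor recycling.
team_desc *alloc_team(uint64_t id) {
  pool_slot.parallel_data.value = id;
  return &pool_slot;
}

int main() {
  team_desc *team_a = alloc_team(0xA);
  ompt_data *parallel_data = &team_a->parallel_data; // raw pointer into A

  // 1. Copy the field *before* the descriptor can be recycled ...
  ompt_data old_parallel_data = *parallel_data;

  // 2. ... because forking region B may reuse A's freed descriptor,
  //    silently redirecting the raw pointer to B's data.
  alloc_team(0xB);

  assert(parallel_data->value == 0xB);    // stale pointer now sees region B
  assert(old_parallel_data.value == 0xA); // the copy still identifies A
  return 0;
}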
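The scheduling_parent move in __kmp_task_start and the deferred ds_tid update both follow the same publish-after-initialize pattern: write a descriptor's fields before installing it where a reader can find it. A minimal sketch of that pattern follows, with illustrative types rather than the kmp_* structures; the release/acquire pairing on the published pointer is an assumption of this sketch, not something the patch itself adds.

#include <atomic>

struct task_desc {
  task_desc *scheduling_parent; // may hold a stale value if recycled
};

std::atomic<task_desc *> th_current_task{nullptr};

void start_task(task_desc *taskdata, task_desc *current_task) {
  // Write the field first ...
  taskdata->scheduling_parent = current_task;
  // ... then publish the descriptor (release pairs with readers' acquire).
  th_current_task.store(taskdata, std::memory_order_release);
}

task_desc *get_parent_at_level_1() {
  task_desc *cur = th_current_task.load(std::memory_order_acquire);
  // Safe: scheduling_parent was written before cur became visible, so a
  // recycled descriptor's stale parent can never be observed here.
  return cur ? cur->scheduling_parent : nullptr;
}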
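For the thread_num changes in __ompt_get_task_info_internal, here is a tool-side sketch of how the result would be consumed through the standard OMPT 5.0 entry point ompt_get_task_info (obtained via the lookup callback in ompt_initialize). The -1 branch reflects the sentinel this patch returns when the caller is not a member of the queried team; treating a return code of 2 as "task exists and full information is available" follows the OpenMP 5.0 specification.

#include <omp-tools.h>
#include <cstdio>

// Assumed to be filled in during ompt_initialize via
// lookup("ompt_get_task_info").
static ompt_get_task_info_t my_get_task_info;

static void print_task_context(int ancestor_level) {
  int flags = 0, thread_num = 0;
  ompt_data_t *task_data = nullptr, *parallel_data = nullptr;
  ompt_frame_t *frame = nullptr;
  int rc = my_get_task_info(ancestor_level, &flags, &task_data, &frame,
                            &parallel_data, &thread_num);
  if (rc != 2) // 0: no task at that level; 1: exists but info unavailable
    return;
  if (thread_num == -1)
    printf("level %d: caller is not a member of this team\n", ancestor_level);
  else
    printf("level %d: thread_num %d\n", ancestor_level, thread_num);
}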