diff --git a/ggml.c b/ggml.c index 06db6d58e2a77..dfb9109cd0da2 100644 --- a/ggml.c +++ b/ggml.c @@ -17234,8 +17234,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { if (GGML_OP_HAS_FINALIZE[node->op]) { params.type = GGML_TASK_FINALIZE; ggml_compute_forward(&params, node); - ggml_graph_compute_perf_stats_node(node, state->shared); - } + } + ggml_graph_compute_perf_stats_node(node, state->shared); } else { break; } @@ -17269,10 +17269,11 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { if (state->ith < node->n_tasks) { ggml_compute_forward(&params, node); } - } + ggml_graph_compute_perf_stats_node(node, state->shared); + } - return 0; - } + return 0; + } void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) { const int n_threads = cgraph->n_threads; @@ -18246,6 +18247,8 @@ void ggml_graph_print_impl(const struct ggml_cgraph * cgraph, bool print_nodes, GGML_PRINT("n_nodes = %d\n", cgraph->n_nodes); for (int i = 0; i < cgraph->n_nodes; i++) { struct ggml_tensor * node = cgraph->nodes[i]; + node->perf_time_us = MAX(node->perf_time_us, 1); // should not happen anymore + node->perf_runs = MAX(node->perf_runs, 1); perf_total_per_op_us[node->op] += MAX(1, node->perf_time_us); perf_array[i] = node->perf_time_us / node->perf_runs; diff --git a/libfalcon.cpp b/libfalcon.cpp index 5ddecfd98bff0..c64af8e1145e7 100644 --- a/libfalcon.cpp +++ b/libfalcon.cpp @@ -3793,7 +3793,7 @@ struct falcon_context * falcon_init_from_file( } }; } - + int64_t t_start_us = ggml_time_us(); ggml_type memory_type = params.f16_kv ? 
GGML_TYPE_F16 : GGML_TYPE_F32; falcon_model *model = falcon_model_load(path_model, params.n_ctx, params.n_batch, params.n_gpu_layers, params.main_gpu, memory_type, params.use_mmap, params.use_mlock, @@ -3808,7 +3808,8 @@ struct falcon_context * falcon_init_from_file( params.i_gpu_start = model->i_gpu_start; // first layer that's GPU accelerated params.i_gpu_last = model->i_gpu_last; // last layer that's GPU accelerated falcon_context * f_ctx = falcon_context_prepare(params, model, "falcon_main",true); - + f_ctx->t_load_us = ggml_time_us() - t_start_us; + f_ctx->t_start_us = t_start_us; //falcon_context_set_buffers(f_ctx,params.n_batch,params.n_ctx); //const size_t memory_size = ggml_nbytes(model->kv_self.k) + ggml_nbytes(model->kv_self.v); //fprintf(stderr, "%s: RAM buffers - key_val = %7.2f MB, Compute = %7.2f MB, Scratch 0 = %7.2f MB, Scratch 1 = %7.2f MB \n", __func__, memory_size / 1024.0 / 1024.0, f_ctx->buf_compute.size /1024.0/1024.0, (f_ctx->buf_scratch[0].size)/1024.0/1024.0, (f_ctx->buf_scratch[1].size)/1024.0/1024.0);