
Commit 9383922

Author: John
Message: ggml perf bugfix
Parent: 1d6e234

File tree: 2 files changed (+11, -7 lines)


ggml.c
8 additions & 5 deletions
@@ -17234,8 +17234,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
                 if (GGML_OP_HAS_FINALIZE[node->op]) {
                     params.type = GGML_TASK_FINALIZE;
                     ggml_compute_forward(&params, node);
-                    ggml_graph_compute_perf_stats_node(node, state->shared);
-                }
+                }
+                ggml_graph_compute_perf_stats_node(node, state->shared);
             } else {
                 break;
             }
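The moved call is the actual bugfix: before this change, ggml_graph_compute_perf_stats_node() ran only for ops that have a finalize pass, so nodes without one never accumulated any timing data. Hoisting it out of the if block records stats for every completed node. For reference, in upstream ggml of this era the helper looks roughly like the sketch below (a paraphrase, not part of this diff; it accumulates the elapsed cycles and microseconds since the node started computing):

static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node,
                                               const struct ggml_compute_state_shared * st) {
    // elapsed cycles/time since this node began executing
    int64_t cycles_cur  = ggml_perf_cycles()  - st->perf_node_start_cycles;
    int64_t time_us_cur = ggml_perf_time_us() - st->perf_node_start_time_us;

    node->perf_runs++;                 // a node with 0 runs would break later averages
    node->perf_cycles  += cycles_cur;
    node->perf_time_us += time_us_cur;
}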
@@ -17269,10 +17269,11 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
         if (state->ith < node->n_tasks) {
             ggml_compute_forward(&params, node);
         }
-    }
+        ggml_graph_compute_perf_stats_node(node, state->shared);
+    }

-    return 0;
-}
+    return 0;
+}

 void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) {
     const int n_threads = cgraph->n_threads;
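This second hunk applies the same fix on the other execution path of ggml_graph_compute_thread, so nodes finished by worker threads are counted as well. The return 0; and closing brace appear re-emitted unchanged, which suggests only indentation moved with the brace restructuring.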
@@ -18246,6 +18247,8 @@ void ggml_graph_print_impl(const struct ggml_cgraph * cgraph, bool print_nodes,
     GGML_PRINT("n_nodes = %d\n", cgraph->n_nodes);
     for (int i = 0; i < cgraph->n_nodes; i++) {
         struct ggml_tensor * node = cgraph->nodes[i];
+        node->perf_time_us = MAX(node->perf_time_us, 1); // should not happen anymore
+        node->perf_runs = MAX(node->perf_runs, 1);

         perf_total_per_op_us[node->op] += MAX(1, node->perf_time_us);
         perf_array[i] = node->perf_time_us / node->perf_runs;
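The two MAX() clamps are a belt-and-braces guard for the printing path: perf_runs is forced to at least 1 so the per-run average below cannot divide by zero, and perf_time_us to at least 1 us so a timed op never reports as free. A minimal standalone illustration of the guard pattern (MAX here is a local stand-in for ggml's internal macro; the zeroed values are invented):

#include <inttypes.h>
#include <stdio.h>

#define MAX(a, b) ((a) > (b) ? (a) : (b))

int main(void) {
    int64_t perf_time_us = 0;   // pretend this node was never timed
    int64_t perf_runs    = 0;

    perf_time_us = MAX(perf_time_us, 1);
    perf_runs    = MAX(perf_runs, 1);

    // safe: the clamp above guarantees the divisor is nonzero
    printf("avg = %" PRId64 " us/run\n", perf_time_us / perf_runs);
    return 0;
}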

libfalcon.cpp
3 additions & 2 deletions
@@ -3793,7 +3793,7 @@ struct falcon_context * falcon_init_from_file(
         }
     };
     }
-
+    int64_t t_start_us = ggml_time_us();
     ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
     falcon_model *model = falcon_model_load(path_model, params.n_ctx, params.n_batch, params.n_gpu_layers,
                                             params.main_gpu, memory_type, params.use_mmap, params.use_mlock,
@@ -3808,7 +3808,8 @@ struct falcon_context * falcon_init_from_file(
     params.i_gpu_start = model->i_gpu_start; // first layer that's GPU accelerated
     params.i_gpu_last = model->i_gpu_last; // last layer that's GPU accelerated
     falcon_context * f_ctx = falcon_context_prepare(params, model, "falcon_main",true);
-
+    f_ctx->t_load_us = ggml_time_us() - t_start_us;
+    f_ctx->t_start_us = t_start_us;
     //falcon_context_set_buffers(f_ctx,params.n_batch,params.n_ctx);
     //const size_t memory_size = ggml_nbytes(model->kv_self.k) + ggml_nbytes(model->kv_self.v);
     //fprintf(stderr, "%s: RAM buffers - key_val = %7.2f MB, Compute = %7.2f MB, Scratch 0 = %7.2f MB, Scratch 1 = %7.2f MB \n", __func__, memory_size / 1024.0 / 1024.0, f_ctx->buf_compute.size /1024.0/1024.0, (f_ctx->buf_scratch[0].size)/1024.0/1024.0, (f_ctx->buf_scratch[1].size)/1024.0/1024.0);
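The libfalcon.cpp change simply brackets model loading with wall-clock timestamps and stores them on the context so load time can be reported later. A minimal sketch of the same pattern, assuming only ggml's public timing helpers (timed_load_example and the load_model placeholder are hypothetical names, not from this diff):

#include <stdio.h>
#include "ggml.h"

void timed_load_example(void) {
    ggml_time_init();                             // initialize ggml's timer once per process
    const int64_t t_start_us = ggml_time_us();    // timestamp before loading

    // ... load_model(); stand-in for falcon_model_load() ...

    const int64_t t_load_us = ggml_time_us() - t_start_us;
    fprintf(stderr, "model load took %.2f ms\n", t_load_us / 1000.0);
}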
