diff --git a/common/arg.cpp b/common/arg.cpp index 117b6a9a7cbec..60e37a89a68e8 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -691,7 +691,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, [](gpt_params & params) { params.ctx_shift = false; } - ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER})); + ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(llama_arg( {"--chunks"}, "N", format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks), diff --git a/examples/main/README.md b/examples/main/README.md index 8b233fd057374..6730effdf2d66 100644 --- a/examples/main/README.md +++ b/examples/main/README.md @@ -161,7 +161,7 @@ A value of -1 will enable infinite text generation, even though we have a finite If the pause is undesirable, a value of -2 will stop generation immediately when the context is filled. -The `--no-context-shift` options allows you to stop the inifinite text generation once the finite context window is full. +The `--no-context-shift` option allows you to stop the infinite text generation once the finite context window is full. It is important to note that the generated text may be shorter than the specified number of tokens if an End-of-Sequence (EOS) token or a reverse prompt is encountered. In interactive mode, text generation will pause and control will be returned to the user. In non-interactive mode, the program will end. In both cases, the text generation may stop before reaching the specified `--predict` value. If you want the model to keep going without ever producing End-of-Sequence on its own, you can use the `--ignore-eos` parameter. 
diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 3122b27a0933a..27623e4a70285 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1815,13 +1815,6 @@ struct server_context { for (server_slot & slot : slots) { if (slot.ga_n == 1) { if (slot.is_processing() && (int) system_tokens.size() + slot.n_past >= slot.n_ctx - 1) { - if (!params.ctx_shift){ - slot.release(); - slot.print_timings(); - send_final_response(slot); - metrics.on_prediction(slot); - continue; - } // Shift context const int n_keep = slot.params.n_keep + add_bos_token; const int n_left = (int) system_tokens.size() + slot.n_past - n_keep;