[ Test ][ PR4 ] Splitting & Refactoring Common.py #1722

Status: Open. Wants to merge 31 commits into base: master.

Commits (31)
fb2c332  Generation config separation (iefode, Feb 7, 2025)
f1b0237  Hugging face (iefode, Feb 7, 2025)
3a42c1c  Test (iefode, Feb 10, 2025)
f13dcf7  Merge remote-tracking branch 'upstream/master' into hugging_face_utils (iefode, Feb 10, 2025)
d2fc50c  comparation (iefode, Feb 10, 2025)
4e03adc  move get_image from utils (iefode, Feb 10, 2025)
2ade2dc  Tokenizer config (iefode, Feb 10, 2025)
ef8283f  longbench (iefode, Feb 10, 2025)
940ac3e  comp (iefode, Feb 10, 2025)
333845f  remove extra init (iefode, Feb 10, 2025)
34b7303  remove extra (iefode, Feb 10, 2025)
ee15e37  Models (iefode, Feb 10, 2025)
82ebb74  upper case (iefode, Feb 10, 2025)
713332d  Merge remote-tracking branch 'upstream/master' into hugging_face_utils (iefode, Feb 10, 2025)
dba2a87  fix tests (iefode, Feb 10, 2025)
fa1c7ea  Merge branch 'hugging_face_utils' into pr_2 (iefode, Feb 10, 2025)
5e22dda  Merge branch 'pr_2' into pr_3 (iefode, Feb 10, 2025)
7893a9b  init (iefode, Feb 10, 2025)
c97d4a0  merge read_model_with_gethf (iefode, Feb 11, 2025)
5b4589b  ov_pipe (iefode, Feb 11, 2025)
25716a0  Refactor a bit (iefode, Feb 11, 2025)
eff21ec  init (iefode, Feb 12, 2025)
85371e1  test merge of run_llm_pipe (iefode, Feb 12, 2025)
a206fd2  Merge remote-tracking branch 'upstream/master' into pr_4 (iefode, Feb 12, 2025)
76b3d44  fix merge (iefode, Feb 12, 2025)
4c9f407  ci (iefode, Feb 12, 2025)
40397e3  Merge remote-tracking branch 'upstream/master' into pr_4 (iefode, Feb 12, 2025)
88fab3c  Fix python stubs (iefode, Feb 12, 2025)
08ac428  check (iefode, Feb 12, 2025)
e5a120b  Merge remote-tracking branch 'upstream/master' into pr_4 (iefode, Feb 13, 2025)
4de5533  reove extra (iefode, Feb 13, 2025)
3 changes: 3 additions & 0 deletions src/python/openvino_genai/py_openvino_genai.pyi
@@ -392,6 +392,9 @@ class ContinuousBatchingPipeline:
@typing.overload
def generate(self, prompts: list[str], generation_config: list[GenerationConfig], streamer: typing.Callable[[str], int | None] | StreamerBase | None = None) -> list[GenerationResult]:
...
@typing.overload
def generate(self, prompt: str, generation_config: GenerationConfig, streamer: typing.Callable[[str], int | None] | StreamerBase | None = None) -> list[GenerationResult]:
...
def get_config(self) -> GenerationConfig:
...
def get_metrics(self) -> PipelineMetrics:
16 changes: 16 additions & 0 deletions src/python/py_continuous_batching_pipeline.cpp
@@ -308,5 +308,21 @@ void init_continuous_batching_pipeline(py::module_& m) {
py::arg("prompts"),
py::arg("generation_config"),
py::arg("streamer") = std::monostate{}
)

.def(
"generate",
[](ContinuousBatchingPipeline& pipe,
const std::string& prompt,
const ov::genai::GenerationConfig& generation_config,
const pyutils::PyBindStreamerVariant& streamer
) -> py::typing::Union<std::vector<ov::genai::GenerationResult>> {
std::vector<std::string> prompts = { prompt };
std::vector<ov::genai::GenerationConfig> generation_configs = { generation_config };
return __call_cb_generate(pipe, prompts, generation_configs, streamer);
},
py::arg("prompt"),
py::arg("generation_config"),
py::arg("streamer") = std::monostate{}
);
}
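
The binding wraps the single prompt and its config into one-element vectors and forwards them to the existing __call_cb_generate helper, so both overloads share one code path and both return a list of GenerationResult objects. A minimal usage sketch of the resulting Python API follows; the model directory and generation settings are placeholders, not taken from this PR:

# Illustrative sketch only: the model directory below is a placeholder; the
# calls mirror the overloads declared in py_openvino_genai.pyi above.
import openvino_genai as ov_genai

pipe = ov_genai.ContinuousBatchingPipeline(
    "path/to/converted/model",                      # placeholder path
    scheduler_config=ov_genai.SchedulerConfig(),
    device="CPU")

config = ov_genai.GenerationConfig()
config.max_new_tokens = 30

# Existing overload: lists of prompts and generation configs.
batch_results = pipe.generate(["What is OpenVINO?"], [config])

# New overload added by this PR: a single prompt and a single config,
# still returning a list of GenerationResult objects.
single_results = pipe.generate("What is OpenVINO?", config)

# With the default greedy config both calls should produce the same text.
print(single_results[0].m_generation_ids[0])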
145 changes: 21 additions & 124 deletions tests/python_tests/common.py
@@ -15,8 +15,9 @@

from utils.generation_config import get_greedy, get_beam_search
from utils.constants import get_default_llm_properties
from utils.hugging_face import convert_models, get_hugging_face_models, run_hugging_face
from utils.hugging_face import download_and_convert_model, run_hugging_face
from utils.comparation import compare_generation_results
from utils.ov_genai_pipelines import dict_to_scheduler_config, run_ov_pipeline, StreamerWithResults, PipelineType

TESTS_ROOT = Path(__file__).parent

Expand All @@ -36,28 +37,6 @@ def get_test_dataset() -> Tuple[List[str], List[GenerationConfig]]:
return (prompts, generation_configs)


def get_scheduler_config(scheduler_params: dict = None) -> SchedulerConfig:
scheduler_config = SchedulerConfig()
if scheduler_params is None:
scheduler_config.dynamic_split_fuse = True
# vLLM specific
scheduler_config.max_num_batched_tokens = 256
scheduler_config.max_num_seqs = 256

# Expedited number of blocks = text_blocks_n * G * n_prompts, where
# text_blocks_n - number of blocks required for storing prompt and generated text,
# currently it is 1 block for prompt (31 token with block_size 32) + 1 block for generated text (max length of generated text - 30 tokens);
# G - number of sequences in a sequence group, for beam search it is 2(group_size) * 3 (num_groups);
# n_prompts - number of prompts.
# For current parameters in tests expedited number of blocks is approximately 48.
scheduler_config.num_kv_blocks = 60
else:
for param, value in scheduler_params.items():
setattr(scheduler_config, param, value)

return scheduler_config


def run_continuous_batching(
models_path : Path,
scheduler_config : SchedulerConfig,
@@ -66,46 +45,13 @@
) -> List[GenerationResult]:
if type(generation_configs) is not list:
generation_configs = [generation_configs] * len(prompts)

cb_pipe = ContinuousBatchingPipeline(models_path, scheduler_config=scheduler_config, device='CPU', tokenizer_properties={}, properties=get_default_llm_properties())
output = cb_pipe.generate(prompts, generation_configs)

del cb_pipe
shutil.rmtree(models_path)

return output


def get_models_list_from_path(file_name: str):
models = []
with open(file_name) as f:
for model_name in f:
model_name = model_name.strip()
# skip comment in model scope file
if model_name.startswith('#'):
continue
models.append(model_name)
return models


class StreamerWithResults:
# Return a streamer which accumulates results in order to compare with results returned from generate.
results: List[str] = []
def __init__(self):
self.results = []

def accumulate(self, subword) -> bool:
self.results.append(subword)
return False

def get_results(self) -> List[GenerationResult]:
streaming_result = GenerationResult()
streaming_result.m_generation_ids = [''.join(self.results)]
return [streaming_result]

def reset(self):
self.results = []

return run_ov_pipeline(models_path=models_path,
prompt=prompts,
generation_config=generation_configs,
pipeline_type=PipelineType.CONTINIOUS_BATCHING,
scheduler_config=scheduler_config,
ov_config=get_default_llm_properties())


def run_llm_pipeline(
@@ -116,44 +62,12 @@
streamer: StreamerWithResults | Callable | StreamerBase = None
) -> List[GenerationResult]:
properties = get_default_llm_properties()
if use_cb:
properties['scheduler_config'] = SchedulerConfig()
ov_pipe = LLMPipeline(models_path, device='CPU', **properties)

if streamer is None and not (generation_config.is_beam_search() or generation_config.num_return_sequences > 1) and len(prompts) == 1:
# We can use streamer only if we have a single prompt and not beam search.
streamer = StreamerWithResults()
if isinstance(streamer, StreamerWithResults):
# Clear the accumulated strings to avoid side effects
streamer.reset()

generate_outputs : DecodedResults = ov_pipe.generate(
inputs=prompts,
generation_config=generation_config,
streamer=streamer.accumulate if isinstance(streamer, StreamerWithResults) else streamer
)

index = 0
generation_results = []

for _ in prompts:
generation_result = GenerationResult()

generation_result.m_generation_ids = generate_outputs.texts[index : index + generation_config.num_return_sequences]
# sequences_scores are available only for beam search case
if generation_config.is_beam_search():
generation_result.m_scores = generate_outputs.scores[index : index + generation_config.num_return_sequences]
generation_results.append(generation_result)

index += generation_config.num_return_sequences

del ov_pipe
shutil.rmtree(models_path)

if isinstance(streamer, StreamerWithResults):
compare_generation_results(prompts, generation_results, streamer.get_results(), generation_config)

return generation_results
return run_ov_pipeline(models_path=models_path,
prompt=prompts,
generation_config=generation_config,
pipeline_type=(PipelineType.STATELESS if use_cb else PipelineType.STATEFUL),
streamer=streamer,
ov_config=properties)


def run_llm_pipeline_with_ref(model_id: str,
@@ -162,34 +76,31 @@
tmp_path: Path,
use_cb : bool = False,
streamer: StreamerWithResults | Callable | StreamerBase = None):
models_path : Path = tmp_path / model_id
opt_model, hf_tokenizer = get_hugging_face_models(model_id)

if type(generation_config) is dict:
generation_config = GenerationConfig(**generation_config)

convert_models(opt_model, hf_tokenizer, models_path)
opt_model, hf_tokenizer, models_path = download_and_convert_model(model_id, tmp_path)

ov_results = run_llm_pipeline(models_path, prompts, generation_config, use_cb, streamer=streamer.accumulate if isinstance(streamer, StreamerWithResults) else streamer)
hf_results = run_hugging_face(opt_model, hf_tokenizer, prompts, generation_config)

compare_generation_results(prompts, hf_results, ov_results, generation_config)


def run_cb_pipeline_with_ref(tmp_path: str, model_id: str, scheduler_params: dict = {}, generation_config : GenerationConfig | dict = None):
def run_cb_pipeline_with_ref(tmp_path: str,
model_id: str,
scheduler_params: dict = {},
generation_config : GenerationConfig | dict = None):
prompts, generation_configs = get_test_dataset()
scheduler_config = get_scheduler_config(scheduler_params)
scheduler_config = dict_to_scheduler_config(scheduler_params)

# override dataset's generation config
if generation_config is not None:
if type(generation_config) is dict:
generation_config = GenerationConfig(**generation_config)
generation_configs = [generation_config] * len(prompts)

models_path : Path = tmp_path / model_id
opt_model, hf_tokenizer = get_hugging_face_models(model_id)

convert_models(opt_model, hf_tokenizer, models_path)
opt_model, hf_tokenizer, models_path = download_and_convert_model(model_id, tmp_path)

hf_results = run_hugging_face(opt_model, hf_tokenizer, prompts, generation_configs)
ov_results = run_continuous_batching(models_path, scheduler_config, prompts, generation_configs)
@@ -211,17 +122,3 @@ def generate_and_compare_with_reference_text(models_path: Path, prompts: List[str]
for ref_text, ov_text in zip(ref_texts_for_this_prompt, ov_result.m_generation_ids):
assert ref_text == ov_text

"""rt_info has the highest priority. Delete it to respect configs."""
def delete_rt_info(configs: List[Tuple], temp_path):
core = openvino.Core()
core.set_property({'ENABLE_MMAP': False})
for model_path in temp_path / "openvino_tokenizer.xml", temp_path / "openvino_detokenizer.xml":
tokenizer = core.read_model(model_path)
rt_info = tokenizer.get_rt_info()
for config, _ in configs:
for key in config.keys():
try:
del rt_info[key]
except KeyError:
pass
openvino.save_model(tokenizer, model_path)
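
With the HuggingFace download/convert and result-comparison logic moved into the shared utils, a typical correctness test now reduces to a single call to run_llm_pipeline_with_ref. A minimal sketch of such a test follows; the prompt and generation settings are placeholders, tmp_path is the standard pytest fixture, and the model id comes from the precommit list in data/models.py below:

# Hypothetical test built on the refactored helpers in common.py; the prompt
# and generation settings are illustrative placeholders.
from pathlib import Path

from openvino_genai import GenerationConfig

from common import run_llm_pipeline_with_ref


def test_greedy_matches_hugging_face(tmp_path: Path):
    config = GenerationConfig()
    config.max_new_tokens = 20

    # Downloads and converts the model, runs both the HuggingFace reference
    # and the OpenVINO GenAI pipeline, and compares the generated texts.
    run_llm_pipeline_with_ref(model_id="katuni4ka/tiny-random-phi3",
                              prompts=["What is OpenVINO?"],
                              generation_config=config,
                              tmp_path=tmp_path)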
71 changes: 71 additions & 0 deletions tests/python_tests/data/models.py
@@ -0,0 +1,71 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import pathlib
import os
import pytest


def get_models_list():
precommit_models = [
"katuni4ka/tiny-random-phi3",
]

nightly_models = [
"TinyLlama/TinyLlama-1.1B-Chat-v1.0",
"facebook/opt-125m",
"microsoft/phi-1_5",
"microsoft/phi-2",
"THUDM/chatglm3-6b",
"Qwen/Qwen2-0.5B-Instruct",
"Qwen/Qwen-7B-Chat",
"Qwen/Qwen1.5-7B-Chat",
"argilla/notus-7b-v1",
"HuggingFaceH4/zephyr-7b-beta",
"ikala/redpajama-3b-chat",
"mistralai/Mistral-7B-v0.1",

# "meta-llama/Llama-2-7b-chat-hf", # Cannot be downloaded without access token
# "google/gemma-2b-it", # Cannot be downloaded without access token.
# "google/gemma-7b-it", # Cannot be downloaded without access token.
"meta-llama/Llama-2-13b-chat-hf",
"meta-llama/Meta-Llama-3-8B-Instruct",
"openlm-research/open_llama_3b",
"openlm-research/open_llama_3b_v2",
"openlm-research/open_llama_7b",
"databricks/dolly-v2-12b",
"databricks/dolly-v2-3b",
]

if pytest.run_marker == "precommit":
model_ids = precommit_models
else:
model_ids = nightly_models

if pytest.selected_model_ids:
model_ids = [model_id for model_id in model_ids if model_id in pytest.selected_model_ids.split(' ')]

prefix = pathlib.Path(os.getenv('GENAI_MODELS_PATH_PREFIX', ''))
return [ (model_id, prefix) for model_id in model_ids ]


def get_chat_models_list():
precommit_models = [
"Qwen/Qwen2-0.5B-Instruct",
]

nightly_models = [
"TinyLlama/TinyLlama-1.1B-Chat-v1.0",
"meta-llama/Meta-Llama-3-8B-Instruct",
"meta-llama/Llama-2-7b-chat-hf",
# "google/gemma-2b-it", # Cannot be downloaded without access token
# "google/gemma-7b-it", # Cannot be downloaded without access token
]

if pytest.run_marker == "precommit":
model_ids = precommit_models
else:
model_ids = nightly_models

prefix = pathlib.Path(os.getenv('GENAI_MODELS_PATH_PREFIX', ''))
return [(model_id, prefix / model_id.split('/')[1]) for model_id in model_ids]
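
The new data/models.py keeps the model scope in Python next to the tests, replacing the plain-text model lists previously read by get_models_list_from_path in common.py. The sketch below shows how these lists are typically consumed in a parametrized test; the conftest.py hooks that populate pytest.run_marker and pytest.selected_model_ids are assumed and not shown here:

# Hypothetical parametrized test consuming get_models_list(); it assumes the
# conftest.py hooks that set pytest.run_marker / pytest.selected_model_ids.
import pytest

from data.models import get_models_list
from utils.hugging_face import download_and_convert_model


@pytest.mark.parametrize("model_id, path_prefix", get_models_list())
def test_model_downloads_and_converts(model_id, path_prefix, tmp_path):
    # download_and_convert_model comes from the shared utils used throughout
    # this PR series and returns (opt_model, hf_tokenizer, models_path).
    opt_model, hf_tokenizer, models_path = download_and_convert_model(model_id, tmp_path)
    assert models_path.exists()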