[ Test ][ PR4 ] Splitting & Refactoring Common.py #1722

Status: Open. Wants to merge 31 commits into base: master.

Commits (31)
fb2c332  Generation config separation (iefode, Feb 7, 2025)
f1b0237  Hugging face (iefode, Feb 7, 2025)
3a42c1c  Test (iefode, Feb 10, 2025)
f13dcf7  Merge remote-tracking branch 'upstream/master' into hugging_face_utils (iefode, Feb 10, 2025)
d2fc50c  comparation (iefode, Feb 10, 2025)
4e03adc  move get_image from utils (iefode, Feb 10, 2025)
2ade2dc  Tokenizer config (iefode, Feb 10, 2025)
ef8283f  longbench (iefode, Feb 10, 2025)
940ac3e  comp (iefode, Feb 10, 2025)
333845f  remove extra init (iefode, Feb 10, 2025)
34b7303  remove extra (iefode, Feb 10, 2025)
ee15e37  Models (iefode, Feb 10, 2025)
82ebb74  upper case (iefode, Feb 10, 2025)
713332d  Merge remote-tracking branch 'upstream/master' into hugging_face_utils (iefode, Feb 10, 2025)
dba2a87  fix tests (iefode, Feb 10, 2025)
fa1c7ea  Merge branch 'hugging_face_utils' into pr_2 (iefode, Feb 10, 2025)
5e22dda  Merge branch 'pr_2' into pr_3 (iefode, Feb 10, 2025)
7893a9b  init (iefode, Feb 10, 2025)
c97d4a0  merge read_model_with_gethf (iefode, Feb 11, 2025)
5b4589b  ov_pipe (iefode, Feb 11, 2025)
25716a0  Refactor a bit (iefode, Feb 11, 2025)
eff21ec  init (iefode, Feb 12, 2025)
85371e1  test merge of run_llm_pipe (iefode, Feb 12, 2025)
a206fd2  Merge remote-tracking branch 'upstream/master' into pr_4 (iefode, Feb 12, 2025)
76b3d44  fix merge (iefode, Feb 12, 2025)
4c9f407  ci (iefode, Feb 12, 2025)
40397e3  Merge remote-tracking branch 'upstream/master' into pr_4 (iefode, Feb 12, 2025)
88fab3c  Fix python stubs (iefode, Feb 12, 2025)
08ac428  check (iefode, Feb 12, 2025)
e5a120b  Merge remote-tracking branch 'upstream/master' into pr_4 (iefode, Feb 13, 2025)
4de5533  reove extra (iefode, Feb 13, 2025)
3 changes: 3 additions & 0 deletions src/python/openvino_genai/py_openvino_genai.pyi
@@ -392,6 +392,9 @@ class ContinuousBatchingPipeline:
@typing.overload
def generate(self, prompts: list[str], generation_config: list[GenerationConfig], streamer: typing.Callable[[str], int | None] | StreamerBase | None = None) -> list[GenerationResult]:
...
@typing.overload
def generate(self, prompt: str, generation_config: GenerationConfig, streamer: typing.Callable[[str], int | None] | StreamerBase | None = None) -> list[GenerationResult]:
...
def get_config(self) -> GenerationConfig:
...
def get_metrics(self) -> PipelineMetrics:
16 changes: 16 additions & 0 deletions src/python/py_continuous_batching_pipeline.cpp
@@ -308,5 +308,21 @@ void init_continuous_batching_pipeline(py::module_& m) {
py::arg("prompts"),
py::arg("generation_config"),
py::arg("streamer") = std::monostate{}
)

.def(
"generate",
[](ContinuousBatchingPipeline& pipe,
const std::string& prompt,
const ov::genai::GenerationConfig& generation_config,
const pyutils::PyBindStreamerVariant& streamer
) -> py::typing::Union<std::vector<ov::genai::GenerationResult>> {
std::vector<std::string> prompts = { prompt };
std::vector<ov::genai::GenerationConfig> generation_configs = { generation_config };
return __call_cb_generate(pipe, prompts, generation_configs, streamer);
},
py::arg("prompt"),
py::arg("generation_config"),
py::arg("streamer") = std::monostate{}
);
}
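
The binding wraps the single prompt and its config into one-element vectors and forwards them to the existing __call_cb_generate helper, so both overloads share one code path and both return a list of GenerationResult objects. A minimal usage sketch of the resulting Python API follows; the model directory and generation settings are placeholders, not taken from this PR:

# Illustrative sketch only: the model directory below is a placeholder; the
# calls mirror the overloads declared in py_openvino_genai.pyi above.
import openvino_genai as ov_genai

pipe = ov_genai.ContinuousBatchingPipeline(
    "path/to/converted/model",                      # placeholder path
    scheduler_config=ov_genai.SchedulerConfig(),
    device="CPU")

config = ov_genai.GenerationConfig()
config.max_new_tokens = 30

# Existing overload: lists of prompts and generation configs.
batch_results = pipe.generate(["What is OpenVINO?"], [config])

# New overload added by this PR: a single prompt and a single config,
# still returning a list of GenerationResult objects.
single_results = pipe.generate("What is OpenVINO?", config)

# With the default greedy config both calls should produce the same text.
print(single_results[0].m_generation_ids[0])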
145 changes: 21 additions & 124 deletions tests/python_tests/common.py
@@ -15,8 +15,9 @@

from utils.generation_config import get_greedy, get_beam_search
from utils.constants import get_default_llm_properties
from utils.hugging_face import convert_models, get_hugging_face_models, run_hugging_face
from utils.hugging_face import download_and_convert_model, run_hugging_face
from utils.comparation import compare_generation_results
from utils.ov_genai_pipelines import dict_to_scheduler_config, run_ov_pipeline, StreamerWithResults, PipelineType

TESTS_ROOT = Path(__file__).parent

Expand All @@ -36,28 +37,6 @@ def get_test_dataset() -> Tuple[List[str], List[GenerationConfig]]:
return (prompts, generation_configs)


def get_scheduler_config(scheduler_params: dict = None) -> SchedulerConfig:
scheduler_config = SchedulerConfig()
if scheduler_params is None:
scheduler_config.dynamic_split_fuse = True
# vLLM specific
scheduler_config.max_num_batched_tokens = 256
scheduler_config.max_num_seqs = 256

# Expedited number of blocks = text_blocks_n * G * n_prompts, where
# text_blocks_n - number of blocks required for storing prompt and generated text,
# currently it is 1 block for prompt (31 token with block_size 32) + 1 block for generated text (max length of generated text - 30 tokens);
# G - number of sequences in a sequence group, for beam search it is 2(group_size) * 3 (num_groups);
# n_prompts - number of prompts.
# For current parameters in tests expedited number of blocks is approximately 48.
scheduler_config.num_kv_blocks = 60
else:
for param, value in scheduler_params.items():
setattr(scheduler_config, param, value)

return scheduler_config


def run_continuous_batching(
models_path : Path,
scheduler_config : SchedulerConfig,
@@ -66,46 +45,13 @@
) -> List[GenerationResult]:
if type(generation_configs) is not list:
generation_configs = [generation_configs] * len(prompts)

cb_pipe = ContinuousBatchingPipeline(models_path, scheduler_config=scheduler_config, device='CPU', tokenizer_properties={}, properties=get_default_llm_properties())
output = cb_pipe.generate(prompts, generation_configs)

del cb_pipe
shutil.rmtree(models_path)

return output


def get_models_list_from_path(file_name: str):
models = []
with open(file_name) as f:
for model_name in f:
model_name = model_name.strip()
# skip comment in model scope file
if model_name.startswith('#'):
continue
models.append(model_name)
return models


class StreamerWithResults:
# Return a streamer which accumulates results in order to compare with results returned from generate.
results: List[str] = []
def __init__(self):
self.results = []

def accumulate(self, subword) -> bool:
self.results.append(subword)
return False

def get_results(self) -> List[GenerationResult]:
streaming_result = GenerationResult()
streaming_result.m_generation_ids = [''.join(self.results)]
return [streaming_result]

def reset(self):
self.results = []

return run_ov_pipeline(models_path=models_path,
prompt=prompts,
generation_config=generation_configs,
pipeline_type=PipelineType.CONTINIOUS_BATCHING,
scheduler_config=scheduler_config,
ov_config=get_default_llm_properties())


def run_llm_pipeline(
@@ -116,44 +62,12 @@
streamer: StreamerWithResults | Callable | StreamerBase = None
) -> List[GenerationResult]:
properties = get_default_llm_properties()
if use_cb:
properties['scheduler_config'] = SchedulerConfig()
ov_pipe = LLMPipeline(models_path, device='CPU', **properties)

if streamer is None and not (generation_config.is_beam_search() or generation_config.num_return_sequences > 1) and len(prompts) == 1:
# We can use streamer only if we have a single prompt and not beam search.
streamer = StreamerWithResults()
if isinstance(streamer, StreamerWithResults):
# Clear the accumulated strings to avoid side effects
streamer.reset()

generate_outputs : DecodedResults = ov_pipe.generate(
inputs=prompts,
generation_config=generation_config,
streamer=streamer.accumulate if isinstance(streamer, StreamerWithResults) else streamer
)

index = 0
generation_results = []

for _ in prompts:
generation_result = GenerationResult()

generation_result.m_generation_ids = generate_outputs.texts[index : index + generation_config.num_return_sequences]
# sequences_scores are available only for beam search case
if generation_config.is_beam_search():
generation_result.m_scores = generate_outputs.scores[index : index + generation_config.num_return_sequences]
generation_results.append(generation_result)

index += generation_config.num_return_sequences

del ov_pipe
shutil.rmtree(models_path)

if isinstance(streamer, StreamerWithResults):
compare_generation_results(prompts, generation_results, streamer.get_results(), generation_config)

return generation_results
return run_ov_pipeline(models_path=models_path,
prompt=prompts,
generation_config=generation_config,
pipeline_type=(PipelineType.STATELESS if use_cb else PipelineType.STATEFUL),
streamer=streamer,
ov_config=properties)


def run_llm_pipeline_with_ref(model_id: str,
@@ -162,34 +76,31 @@
tmp_path: Path,
use_cb : bool = False,
streamer: StreamerWithResults | Callable | StreamerBase = None):
models_path : Path = tmp_path / model_id
opt_model, hf_tokenizer = get_hugging_face_models(model_id)

if type(generation_config) is dict:
generation_config = GenerationConfig(**generation_config)

convert_models(opt_model, hf_tokenizer, models_path)
opt_model, hf_tokenizer, models_path = download_and_convert_model(model_id, tmp_path)

ov_results = run_llm_pipeline(models_path, prompts, generation_config, use_cb, streamer=streamer.accumulate if isinstance(streamer, StreamerWithResults) else streamer)
hf_results = run_hugging_face(opt_model, hf_tokenizer, prompts, generation_config)

compare_generation_results(prompts, hf_results, ov_results, generation_config)


def run_cb_pipeline_with_ref(tmp_path: str, model_id: str, scheduler_params: dict = {}, generation_config : GenerationConfig | dict = None):
def run_cb_pipeline_with_ref(tmp_path: str,
model_id: str,
scheduler_params: dict = {},
generation_config : GenerationConfig | dict = None):
prompts, generation_configs = get_test_dataset()
scheduler_config = get_scheduler_config(scheduler_params)
scheduler_config = dict_to_scheduler_config(scheduler_params)

# override dataset's generation config
if generation_config is not None:
if type(generation_config) is dict:
generation_config = GenerationConfig(**generation_config)
generation_configs = [generation_config] * len(prompts)

models_path : Path = tmp_path / model_id
opt_model, hf_tokenizer = get_hugging_face_models(model_id)

convert_models(opt_model, hf_tokenizer, models_path)
opt_model, hf_tokenizer, models_path = download_and_convert_model(model_id, tmp_path)

hf_results = run_hugging_face(opt_model, hf_tokenizer, prompts, generation_configs)
ov_results = run_continuous_batching(models_path, scheduler_config, prompts, generation_configs)
@@ -211,17 +122,3 @@ def generate_and_compare_with_reference_text(models_path: Path, prompts: List[str]
for ref_text, ov_text in zip(ref_texts_for_this_prompt, ov_result.m_generation_ids):
assert ref_text == ov_text

"""rt_info has the highest priority. Delete it to respect configs."""
def delete_rt_info(configs: List[Tuple], temp_path):
core = openvino.Core()
core.set_property({'ENABLE_MMAP': False})
for model_path in temp_path / "openvino_tokenizer.xml", temp_path / "openvino_detokenizer.xml":
tokenizer = core.read_model(model_path)
rt_info = tokenizer.get_rt_info()
for config, _ in configs:
for key in config.keys():
try:
del rt_info[key]
except KeyError:
pass
openvino.save_model(tokenizer, model_path)
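
With the HuggingFace download/convert and result-comparison logic moved into the shared utils, a typical correctness test now reduces to a single call to run_llm_pipeline_with_ref. A minimal sketch of such a test follows; the prompt and generation settings are placeholders, tmp_path is the standard pytest fixture, and the model id comes from the precommit list in data/models.py below:

# Hypothetical test built on the refactored helpers in common.py; the prompt
# and generation settings are illustrative placeholders.
from pathlib import Path

from openvino_genai import GenerationConfig

from common import run_llm_pipeline_with_ref


def test_greedy_matches_hugging_face(tmp_path: Path):
    config = GenerationConfig()
    config.max_new_tokens = 20

    # Downloads and converts the model, runs both the HuggingFace reference
    # and the OpenVINO GenAI pipeline, and compares the generated texts.
    run_llm_pipeline_with_ref(model_id="katuni4ka/tiny-random-phi3",
                              prompts=["What is OpenVINO?"],
                              generation_config=config,
                              tmp_path=tmp_path)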
71 changes: 71 additions & 0 deletions tests/python_tests/data/models.py
@@ -0,0 +1,71 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import pathlib
import os
import pytest


def get_models_list():
precommit_models = [
"katuni4ka/tiny-random-phi3",
]

nightly_models = [
"TinyLlama/TinyLlama-1.1B-Chat-v1.0",
"facebook/opt-125m",
"microsoft/phi-1_5",
"microsoft/phi-2",
"THUDM/chatglm3-6b",
"Qwen/Qwen2-0.5B-Instruct",
"Qwen/Qwen-7B-Chat",
"Qwen/Qwen1.5-7B-Chat",
"argilla/notus-7b-v1",
"HuggingFaceH4/zephyr-7b-beta",
"ikala/redpajama-3b-chat",
"mistralai/Mistral-7B-v0.1",

# "meta-llama/Llama-2-7b-chat-hf", # Cannot be downloaded without access token
# "google/gemma-2b-it", # Cannot be downloaded without access token.
# "google/gemma-7b-it", # Cannot be downloaded without access token.
"meta-llama/Llama-2-13b-chat-hf",
"meta-llama/Meta-Llama-3-8B-Instruct",
"openlm-research/open_llama_3b",
"openlm-research/open_llama_3b_v2",
"openlm-research/open_llama_7b",
"databricks/dolly-v2-12b",
"databricks/dolly-v2-3b",
]

if pytest.run_marker == "precommit":
model_ids = precommit_models
else:
model_ids = nightly_models

if pytest.selected_model_ids:
model_ids = [model_id for model_id in model_ids if model_id in pytest.selected_model_ids.split(' ')]

prefix = pathlib.Path(os.getenv('GENAI_MODELS_PATH_PREFIX', ''))
return [ (model_id, prefix) for model_id in model_ids ]


def get_chat_models_list():
precommit_models = [
"Qwen/Qwen2-0.5B-Instruct",
]

nightly_models = [
"TinyLlama/TinyLlama-1.1B-Chat-v1.0",
"meta-llama/Meta-Llama-3-8B-Instruct",
"meta-llama/Llama-2-7b-chat-hf",
# "google/gemma-2b-it", # Cannot be downloaded without access token
# "google/gemma-7b-it", # Cannot be downloaded without access token
]

if pytest.run_marker == "precommit":
model_ids = precommit_models
else:
model_ids = nightly_models

prefix = pathlib.Path(os.getenv('GENAI_MODELS_PATH_PREFIX', ''))
return [(model_id, prefix / model_id.split('/')[1]) for model_id in model_ids]
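
The new data/models.py keeps the model scope in Python next to the tests, replacing the plain-text model lists previously read by get_models_list_from_path in common.py. The sketch below shows how these lists are typically consumed in a parametrized test; the conftest.py hooks that populate pytest.run_marker and pytest.selected_model_ids are assumed and not shown here:

# Hypothetical parametrized test consuming get_models_list(); it assumes the
# conftest.py hooks that set pytest.run_marker / pytest.selected_model_ids.
import pytest

from data.models import get_models_list
from utils.hugging_face import download_and_convert_model


@pytest.mark.parametrize("model_id, path_prefix", get_models_list())
def test_model_downloads_and_converts(model_id, path_prefix, tmp_path):
    # download_and_convert_model comes from the shared utils used throughout
    # this PR series and returns (opt_model, hf_tokenizer, models_path).
    opt_model, hf_tokenizer, models_path = download_and_convert_model(model_id, tmp_path)
    assert models_path.exists()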